Exemplo n.º 1
0
 def handle_result(self, response, result_dict):
     """Assemble a ``ParsedItem`` from one scraped result.

     :param response: response the result came from; ``response.url`` is
         passed to ``calc_str_md5`` to form the item id.
     :param result_dict: raw result mapping, passed through ``map_field``.
     :return: the populated ``ParsedItem``.
     """
     record = ParsedItem()
     self.common_item_assembler(response, record)

     mapped_fields = map_field(result_dict)
     record['_parsed_data'] = mapped_fields

     url_digest = calc_str_md5(response.url)
     record['_id'] = url_digest

     record['bbd_html'] = ''
     record['bbd_type'] = "credit_bj"

     # gen_rowkey is asked to use the 'do_time' and 'bbd_type' keys, so
     # 'bbd_type' has to be stored on the item before this call.
     rowkey_fields = ('do_time', 'bbd_type')
     record['rowkey'] = gen_rowkey(record, keys=rowkey_fields)

     record['bbd_params'] = ''
     return record
Exemplo n.º 2
0
    def handle_result(self, response, result_dict):
        """Assemble a ``ParsedItem`` from one scraped result.

        For spiders whose name contains ``'xzcf'`` a ``license_status`` entry
        in the mapped data is renamed to ``punish_status``.

        :param response: response the result came from; ``response.url`` is
            passed to ``calc_str_md5`` to form the item id.
        :param result_dict: raw result mapping, passed through ``map_field``.
        :return: the populated ``ParsedItem``.
        """
        mapped = map_field(result_dict)

        rename_status = 'xzcf' in self.name and 'license_status' in mapped
        if rename_status:
            status = mapped.pop('license_status', '')
            mapped['punish_status'] = status

        record = ParsedItem()
        self.common_item_assembler(response, record)
        record['_parsed_data'] = mapped
        record['_id'] = calc_str_md5(response.url)
        record['bbd_html'] = ''
        record['bbd_params'] = ''

        # bbd_type is the last '__'-separated part of the spider name with
        # its final seven characters cut off.
        last_segment = self.name.split('__')[-1]
        record['bbd_type'] = last_segment[:-7]

        rowkey_fields = ('do_time', 'bbd_type')
        record['rowkey'] = gen_rowkey(record, keys=rowkey_fields)
        return record
    def parse(self, source, *args, **kwargs):
        """Parse one stored JSON detail record into a flat result dict.

        The JSON text is taken from ``source['bbd_html']`` (which is removed
        from ``source``), its fields are read through the accessor returned by
        ``self.get_value``, and the remaining ``source`` entries plus
        ``self.base_dict`` are merged on top.

        :param source: mapping holding the raw record; ``bbd_html`` is the
            JSON text and ``bbd_url`` (optional) the detail URL.
        :param args: unused.
        :param kwargs: unused.
        :return: the parsed dict, or ``None`` if anything fails (the failure
            is logged instead of raised).
        """
        # Bound up front so the error handler can always report a URL, even
        # when ``source`` has no 'bbd_url' entry.
        detail_url = ''
        try:
            detail_html = source.pop('bbd_html', '')
            detail_url = source.get('bbd_url', '')
            self.logger.info('开始解析:{} {}'.format(self.parser_info, detail_url))
            json_data = json.loads(detail_html)
            get_func = self.get_value(json_data)
            res_dict = {
                "company_name": get_func("xkXdr"),
                "credit_code": get_func("xkXdrShxym"),
                "case_name": get_func("xkXmmc"),
                "license_code": get_func("xkWsh"),
                "approval_category": get_func("xkSplb"),
                "license_content": get_func("xkNr"),
                "license_org": get_func("xkXzjg"),
                "license_start_date": get_func("xkSxq"),
                "license_end_date": get_func("xkJzq"),
                "license_status": get_func("xkZt"),
                "administrative_code": get_func("dfbm"),
                "data_source": get_func("depName"),
                "pubdate": get_func("publishDate"),
                "id_number": get_func("xkXdrSfz"),
            }
            # URLs containing "PDetial" keep the id number and drop the
            # credit code; every other URL does the opposite.
            # NOTE(review): presumably "PDetial" marks person detail pages --
            # confirm against the site.
            if "PDetial" in detail_url:
                res_dict.pop("credit_code", "")
            else:
                res_dict.pop("id_number", "")
            res_dict.update(source)
            res_dict.update(self.base_dict)
            res_dict["_id"] = "{}".format(uuid.uuid4())
            res_dict["rowkey"] = gen_rowkey(res_dict)
            res_dict["bbd_html"] = ""
            self.logger.info("save {} to mongo".format(
                res_dict["company_name"]))
            return res_dict
        except Exception:
            # Use the locally captured URL: indexing source["bbd_url"] here
            # would raise KeyError and hide the original failure.
            msg = "{} parse error url {}! msg:{}".format(
                self.parser_info, detail_url, traceback.format_exc())
            self.logger.error(msg)
            return None
Exemplo n.º 4
0
 def parse(self, source, *args, **kwargs):
     """Parse one stored JSON punishment record into a flat result dict.

     The JSON text is taken from ``source['bbd_html']`` (which is removed
     from ``source``). Fields are read from ``json_data["data"]`` and from
     its ``"dataContentJson"`` member through accessors returned by
     ``self.get_value``; the remaining ``source`` entries and
     ``self.base_dict`` are merged on top.

     :param source: mapping holding the raw record; ``bbd_html`` is the
         JSON text and ``bbd_url`` (optional) the detail URL.
     :param args: unused.
     :param kwargs: unused.
     :return: the parsed dict, or ``None`` if anything fails (the failure
         is logged instead of raised).
     """
     # Bound up front so the error handler can always report a URL, even
     # when ``source`` has no 'bbd_url' entry.
     detail_url = ''
     try:
         detail_html = source.pop('bbd_html', '')
         detail_url = source.get('bbd_url', '')
         self.logger.info('开始解析:{} {}'.format(self.parser_info, detail_url))
         json_data = json.loads(detail_html)
         data_content = json_data["data"]["dataContentJson"]
         data_func = self.get_value(json_data["data"])
         detail_func = self.get_value(data_content)
         # "空" (Chinese for "empty") is treated as a placeholder: fall back
         # to the outer record's "uscCode" in that case.
         id_number = detail_func("id_number")
         credit_code = id_number if id_number != "空" else data_func(
             "uscCode")
         res_dict = {
             "company_name": detail_func("org_name"),
             "credit_code": credit_code,
             "regno": data_func("regCode"),
             "case_name": detail_func("punish_name"),
             "punish_code": detail_func("decide_docno"),
             "punish_category": detail_func("punish_type1"),
             "punish_type": detail_func("reason"),
             "punish_content": detail_func("punish_ret"),
             "punish_basis": detail_func("gist"),
             "punish_org": detail_func("organization"),
             "punish_date": detail_func("dt_penalty"),
             "punish_status": detail_func("cur_status"),
         }
         res_dict.update(source)
         res_dict.update(self.base_dict)
         res_dict["_id"] = "{}".format(uuid.uuid4())
         res_dict["rowkey"] = gen_rowkey(res_dict)
         res_dict["bbd_html"] = ""
         self.logger.info("save {} to mongo".format(
             res_dict["company_name"]))
         return res_dict
     except Exception:
         # Use the locally captured URL: indexing source["bbd_url"] here
         # would raise KeyError and hide the original failure.
         msg = "{} parse error url {}! msg:{}".format(
             self.parser_info, detail_url, traceback.format_exc())
         self.logger.error(msg)
         return None
    def parse(self, source, *args, **kwargs):
        """Parse one stored JSON punishment record into a flat result dict.

        The JSON text is taken from ``source['bbd_html']`` (which is removed
        from ``source``), its fields are read through the accessor returned by
        ``self.get_value``, and the remaining ``source`` entries plus
        ``self.base_dict`` are merged on top.

        :param source: mapping holding the raw record; ``bbd_html`` is the
            JSON text and ``bbd_url`` (optional) the detail URL.
        :param args: unused.
        :param kwargs: unused.
        :return: the parsed dict, or ``None`` if anything fails (the failure
            is logged instead of raised).
        """
        # Bound up front so the error handler can always report a URL, even
        # when ``source`` has no 'bbd_url' entry.
        detail_url = ''
        try:
            detail_html = source.pop('bbd_html', '')
            detail_url = source.get('bbd_url', '')
            self.logger.info('开始解析:{} {}'.format(self.parser_info, detail_url))
            json_data = json.loads(detail_html)
            get_func = self.get_value(json_data)
            res_dict = {
                "punish_code": get_func("cfWsh"),
                "case_name": get_func("cfAjmc"),
                "punish_category_one": get_func("cfCflb"),
                "punish_type": get_func("cfSy"),
                "punish_basis": get_func("cfYj"),
                "company_name": get_func("cfXdrMc"),
                "credit_code": get_func("cfXdrShxym"),
                "punish_content": get_func("cfJg"),
                "punish_date": get_func("cfSxq"),
                "punish_org": get_func("cfXzjg"),
                "punish_status": get_func("cfZt"),
                "administrative_code": get_func("dfbm"),
                "data_source": get_func("depName"),
                "pubdate": get_func("publishDate"),
            }
            res_dict.update(source)
            res_dict.update(self.base_dict)
            res_dict["_id"] = "{}".format(uuid.uuid4())
            res_dict["rowkey"] = gen_rowkey(res_dict)
            res_dict["bbd_html"] = ""
            self.logger.info("save {} to mongo".format(res_dict["company_name"]))
            return res_dict
        except Exception:
            # Use the locally captured URL: indexing source["bbd_url"] here
            # would raise KeyError and hide the original failure.
            msg = "{} parse error url {}! msg:{}".format(
                self.parser_info, detail_url, traceback.format_exc())
            self.logger.error(msg)
            return None
Exemplo n.º 6
0
 def parse_detail(self, response):
     """Parse a detail page and yield one ``ParsedItem`` for it.

     The first and the last ``<td>`` of the table rows are paired up as
     title/value, mapped through ``map_field`` and stored on the item as
     ``_parsed_data``. For spiders whose name contains ``"xzcf"`` a
     ``license_status`` entry is renamed to ``punish_status``.

     :param response: page response; ``response.meta["key"]`` becomes the
         prefix of the item id.
     :return: generator yielding at most one item. Ordinary exceptions are
         logged through ``self.logger1`` and not raised.
     """
     try:
         key = response.meta["key"]
         titles_tds = response.xpath("//table//tr//td[1]")
         values_tds = response.xpath("//table//tr//td[last()]")
         # NOTE(review): clean_all_space receives the extracted *list* here,
         # while the values below are joined into a string first -- confirm
         # the helper accepts a list.
         titles = [
             clean_all_space(td.xpath("string(.)").extract())
             for td in titles_tds
         ]
         values = [
             "".join(td.xpath("string(.)").extract()).strip()
             for td in values_tds
         ]
         if len(titles) != len(values):
             raise Exception(
                 "the length of titles and values are not equal, url {}".
                 format(response.url))
         tmp_dict = dict(zip(titles, values))
         res_dict = map_field(tmp_dict)
         if "xzcf" in self.name and "license_status" in res_dict:
             res_dict["punish_status"] = res_dict.pop("license_status", "")
         item = ParsedItem()
         self.common_item_assembler(response, item)
         item["_id"] = "{}_{}".format(key, uuid.uuid4())
         item["bbd_html"] = ""
         item["_parsed_data"] = res_dict
         item["rowkey"] = gen_rowkey(item, keys=('do_time', 'bbd_type'))
         yield item
         self.logger1.info("one data {} save to mongodb".format(key))
     except Exception:
         # Deliberately not a bare ``except``: GeneratorExit (raised at the
         # ``yield`` when the consumer closes the generator), KeyboardInterrupt
         # and SystemExit must propagate instead of being logged as errors.
         err_msg = traceback.format_exc()
         self.logger1.error(
             "Exception on detail {url}, error:{err_msg}".format(
                 url=response.url, err_msg=err_msg))
 def parse(self, source, *args, **kwargs):
     """Parse one stored HTML detail record into a flat result dict.

     ``source['bbd_html']`` is removed from ``source``, run through
     ``clean_html`` and loaded into a ``Selector``. The ``<th>`` and
     ``<td>`` texts of every table row after the first are zipped into
     title/value pairs and mapped through ``map_field``; the remaining
     ``source`` entries and ``self.base_dict`` are merged on top.

     :param source: mapping holding the raw record; ``bbd_html`` is the
         page markup and ``bbd_url`` (optional) the detail URL.
     :param args: unused.
     :param kwargs: unused.
     :return: the parsed dict, or ``None`` if anything fails (the failure
         is logged as a warning instead of raised).
     """
     try:
         detail_html = clean_html(source.pop('bbd_html', ''))
         detail_url = source.get('bbd_url', '')
         start_msg = '开始解析:{} {}'.format(self.parser_info, detail_url)
         self.logger.info(start_msg)
         response = Selector(text=detail_html)

         # NOTE(review): both alternatives of the pattern are the same
         # character as written -- confirm whether one was meant to be a
         # full-width colon.
         header_texts = response.xpath(
             '//table//tr[position()>1]//th').xpath('string(.)').extract()
         titles = []
         for header in header_texts:
             without_colon = re.sub(r':|:', r'', header.strip())
             titles.append(clean_all_space(without_colon))

         cell_texts = response.xpath(
             '//table//tr[position()>1]//td').xpath('string(.)').extract()
         values = [cell.strip() for cell in cell_texts]

         res_dict = map_field(dict(zip(titles, values)))
         res_dict["bbd_seed"] = ""
         res_dict["_id"] = "{}".format(uuid.uuid4())
         res_dict["bbd_html"] = ""
         # Entries from ``source`` and then ``self.base_dict`` override the
         # values set above.
         for overrides in (source, self.base_dict):
             res_dict.update(overrides)
         res_dict["rowkey"] = gen_rowkey(res_dict)
         return res_dict
     except Exception:
         self.logger.warning('{} parse error! msg:{}'.format(
             self.parser_info, traceback.format_exc()))
Exemplo n.º 8
0
 def parse_detail(self, response):
     """Parse a detail page and yield one ``ParsedItem`` for it.

     Every row of the detail table contributes one title/value pair taken
     from its ``<th>`` and ``<td>`` text. The pairs are passed through
     ``map_field`` and ``self.convert_time`` and stored on the item as
     ``_parsed_data``. For spiders whose name contains ``"xzcf"`` a
     ``license_status`` entry is renamed to ``punish_status``.

     :param response: page response to parse.
     :return: generator yielding at most one item; exceptions are logged
         as warnings through ``self.logger1`` and not raised.
     """
     try:
         self.logger1.info('start to parse {}'.format(response.url))
         rows = response.xpath(
             "//div[@class='fl ml20 mb10 mt10 f_yh']//table//tr")
         fields = {}
         for row in rows:
             # NOTE(review): the two replace() calls strip the same
             # character as written -- confirm whether one was meant to be
             # a full-width colon.
             header_text = ''.join(
                 row.xpath(".//th").xpath("string(.)").extract())
             header_text = header_text.replace(":", "").replace(":", "")
             title = clean_all_space(header_text)
             cell_text = ''.join(
                 row.xpath(".//td").xpath("string(.)").extract())
             fields[title] = clean_all_space(cell_text)

         record = ParsedItem()
         self.common_item_assembler(response, record)
         record["_id"] = "{}_{}".format(response.url, uuid.uuid4())
         record['bbd_html'] = ''
         record['bbd_type'] = "credit_jx"
         rowkey_fields = ('do_time', 'bbd_type')
         record['rowkey'] = gen_rowkey(record, keys=rowkey_fields)

         parsed = self.convert_time(map_field(fields))
         if "xzcf" in self.name and "license_status" in parsed:
             status = parsed.pop("license_status", "")
             parsed["punish_status"] = status
         record['_parsed_data'] = parsed

         yield record
         self.logger1.info('{} save successfully'.format(response.url))
     except Exception as exc:
         warning_text = "Exception on save detail page {} {} {}".format(
             response.url, traceback.format_exc(), exc)
         self.logger1.warning(warning_text)
 def parse(self, source, *args, **kwargs):
     """Parse one stored JSON list page into a list of flat result dicts.

     The JSON text is taken from ``source['bbd_html']`` (which is removed
     from ``source``). Every entry of its ``"results"`` list becomes one
     dict; the remaining ``source`` entries and ``self.base_dict`` are
     merged on top, and ``bbd_url`` is rebuilt from the entry's ``"ID"``
     through ``self.real_url_format``.

     :param source: mapping holding the raw record; ``bbd_html`` is the
         JSON text and ``bbd_url`` (optional) the list URL.
     :param args: unused.
     :param kwargs: unused.
     :return: list of parsed dicts, or ``None`` if anything fails (the
         failure is logged instead of raised; no partial list is returned).
     """
     # Bound up front so the error handler can always report a URL, even
     # when ``source`` has no 'bbd_url' entry.
     detail_url = ''
     try:
         detail_html = source.pop('bbd_html', '')
         detail_url = source.get('bbd_url', '')
         self.logger.info('开始解析:{} {}'.format(self.parser_info, detail_url))
         json_data = json.loads(detail_html)
         res_list = []
         for data in json_data["results"]:
             license_start_date = self.date_convert(
                 data.get("DETERMINEDATE"))
             license_end_date = self.date_convert(data.get("TERMINALDATE"))
             # ``or ""`` turns missing keys and falsy values (e.g. JSON
             # null) into an empty string.
             res_dict = {
                 "company_name": data.get("LEGALPERSON") or "",
                 "license_org": data.get("ORGNAME") or "",
                 "license_code": data.get("NO") or "",
                 "case_name": data.get("PROJECTNAME") or "",
                 "approval_category": data.get("AUDITTYPE") or "",
                 "license_content": data.get("NOTE") or "",
                 "credit_code": data.get("CREDITCODE") or "",
                 "organization_code": data.get("ORGNO") or "",
                 "regno": data.get("ICREGCODE") or "",
                 "tax_code": data.get("TAXCODE") or "",
                 "id_number": data.get("REPRESENTATIVEID") or "",
                 "frname": data.get("REPRESENTATIVE") or "",
                 "license_start_date": license_start_date,
                 "license_end_date": license_end_date,
                 "remark": data.get("REMARK") or "",
             }
             res_dict.update(source)
             res_dict.update(self.base_dict)
             res_dict["_id"] = "{}".format(uuid.uuid4())
             res_dict["rowkey"] = gen_rowkey(res_dict)
             res_dict["bbd_html"] = ""
             # Replace the list-page URL merged in from ``source`` with the
             # URL built from this entry's ID.
             res_dict["bbd_url"] = self.real_url_format.format(data["ID"])
             res_list.append(res_dict)
         return res_list
     except Exception:
         # Use the locally captured URL: indexing source["bbd_url"] here
         # would raise KeyError and hide the original failure.
         msg = "{} parse error url {}! msg:{}".format(
             self.parser_info, detail_url, traceback.format_exc())
         self.logger.error(msg)
         return None
 def parse(self, source, *args, **kwargs):
     """Parse one stored JSON punishment record into a flat result dict.

     The JSON text is taken from ``source['bbd_html']`` (which is removed
     from ``source``). Each field is read from the decoded ``"results"``
     member via ``self.get_value(results, key)``; the remaining ``source``
     entries and ``self.base_dict`` are merged on top.

     :param source: mapping holding the raw record; ``bbd_html`` is the
         JSON text and ``bbd_url`` (optional) the detail URL.
     :param args: unused.
     :param kwargs: unused.
     :return: the parsed dict, or ``None`` if anything fails (the failure
         is logged instead of raised).
     """
     # Bound up front so the error handler can always report a URL, even
     # when ``source`` has no 'bbd_url' entry.
     detail_url = ''
     try:
         detail_html = source.pop('bbd_html', '')
         detail_url = source.get('bbd_url', '')
         self.logger.info('开始解析:{} {}'.format(self.parser_info, detail_url))
         json_data = json.loads(detail_html)["results"]
         res_dict = {
             "company_name": self.get_value(json_data, "LEGALPERSON"),
             "punish_org": self.get_value(json_data, "ORGNAME"),
             "case_name": self.get_value(json_data, "PUNISHNAME"),
             "punish_code": self.get_value(json_data, "NO"),
             "punish_category_one": self.get_value(json_data, "AUDITTYPE"),
             "punish_type": self.get_value(json_data, "REASON"),
             "punish_basis": self.get_value(json_data, "ACCORDING"),
             "credit_code": self.get_value(json_data, "CREDITCODE"),
             "organization_code": self.get_value(json_data, "ORGNO"),
             "regno": self.get_value(json_data, "ICREGCODE"),
             "tax_code": self.get_value(json_data, "TAXCODE"),
             "id_number": self.get_value(json_data, "REPRESENTATIVEID"),
             "frname": self.get_value(json_data, "REPRESENTATIVE"),
             "punish_content": self.get_value(json_data, "NOTE"),
             "punish_date": self.date_convert(
                 self.get_value(json_data, "PUNISHDATE")),
             "remark": self.get_value(json_data, "REMARK"),
         }
         res_dict.update(source)
         res_dict.update(self.base_dict)
         res_dict["_id"] = "{}".format(uuid.uuid4())
         res_dict["rowkey"] = gen_rowkey(res_dict)
         res_dict["bbd_html"] = ""
         self.logger.info("save {} to mongo".format(
             res_dict["company_name"]))
         return res_dict
     except Exception:
         # Use the locally captured URL: indexing source["bbd_url"] here
         # would raise KeyError and hide the original failure.
         msg = "{} parse error url {}! msg:{}".format(
             self.parser_info, detail_url, traceback.format_exc())
         self.logger.error(msg)
         return None