def get_contributive_info(self, session, contributive_info_url, pripid, data):
    """Fetch the contributive-info list page and every investor detail page.

    :param session: requests session used for all HTTP calls
    :param contributive_info_url: URL of the contributive-info list page
    :param pripid: enterprise primary id posted with each detail request
    :param data: web-page model dict that collected pages are appended to
    """
    con_info = self.task_request(session, session.get, contributive_info_url)
    if con_info is None:
        self.append_model(data, Model.contributive_info, contributive_info_url, '', status=self.STATUS_FAIL)
        return
    self.append_model(data, Model.contributive_info, contributive_info_url, con_info.text)
    # The page embeds the investor list in a JS variable; recover the JSON text.
    array_pattern = '.*?global_gdJosnData=(.*?)}];'
    search_list = re.findall(array_pattern, con_info.text)
    if len(search_list) <= 0:
        return
    # The capture group stops before the closing tokens; restore them.
    search_list[0] += '}]'
    url = 'http://{host}/saicpub/entPublicitySC/entPublicityDC/getGsgsTzrxxPojoList.action'.format(
        host=self.host)
    array_data = util.json_loads(search_list[0])
    if array_data is None:
        # Fix: a malformed JS blob previously made the loop below raise
        # TypeError by iterating None.
        return
    for item in array_data:
        invid = item.get('invid', None)
        if invid is None:
            continue
        post_data = {
            'pripid': pripid,
            'invid': invid,
        }
        r = self.task_request(session, session.post, url, data=post_data)
        if r is None:
            continue
        self.append_model(data, Model.contributive_info, url, r.text, classify=Model.type_detail)
def get_change_info(self, change_info):
    """Parse the single change-record page into the gs schema dict."""
    result = {}
    records = []
    page_text = self.get_crawl_page(change_info)
    if page_text is None or page_text == u'':
        return {}
    native_json = util.json_loads(page_text)
    if native_json is None:
        return {}
    entries = native_json.get('data', [])
    if entries is None:
        return {}
    for entry in entries:
        record = replace_none({
            GsModel.ChangeRecords.CHANGE_ITEM: entry.get('bcsxmc', ''),
            # strip redundant wording
            GsModel.ChangeRecords.BEFORE_CONTENT: util.format_content(entry.get('bcnr', '')),
            GsModel.ChangeRecords.AFTER_CONTENT: util.format_content(entry.get('bghnr', '')),
            # normalize the date
            GsModel.ChangeRecords.CHANGE_DATE: entry.get('hzrq', ''),
        })
        records.append(record)
    result[GsModel.CHANGERECORDS] = records
    return result
def get_change_info(self, change_info):
    """Build the change-records schema dict from all crawled list pages."""
    records = []
    for page in self.get_crawl_page(change_info, True):
        parsed = util.json_loads(page.get('text'))
        if parsed is None:
            continue
        entries = parsed.get('data', [])
        if entries is None:
            continue
        for entry in entries:
            records.append(replace_none({
                GsModel.ChangeRecords.CHANGE_ITEM: entry.get('altItem_CN'),
                # strip redundant wording
                GsModel.ChangeRecords.BEFORE_CONTENT: util.format_content(entry.get('altBe')),
                GsModel.ChangeRecords.AFTER_CONTENT: util.format_content(entry.get('altAf')),
                # normalize the date
                GsModel.ChangeRecords.CHANGE_DATE: entry.get('altDate'),
            }))
    return {GsModel.CHANGERECORDS: records}
def get_change_info(self, change_info):
    """Parse change records; the first page's JSON holds every record."""
    records = []
    for page in self.get_crawl_page(change_info, True):
        raw = page.get(u'text', u'{}')
        parsed = util.json_loads(raw)
        if parsed is None:
            self.log.error(
                'json转换失败: res_text = {text}'.format(text=raw))
            continue
        obj_list = parsed.get(u'list', {}).get(u'list', [])
        if obj_list is None:
            self.log.info('没有 变更信息..')
            continue
        for obj in obj_list:
            records.append(self._get_change_info_2_model(obj))
        # one page already contains all the data, stop after the first parse
        break
    return {GsModel.CHANGERECORDS: records}
def get_key_person_info(self, key_person_info):
    """Extract key-person entries from the crawled list pages.

    :param key_person_info: page-store dict holding ``list``/``detail`` page
        lists (at least one is always present); key persons come from the
        ``list`` pages since they have no per-item detail pages. See the
        page-store docs in ``common/global_field.py`` (Model).
    :return: gs schema dict, empty when nothing could be parsed
    """
    result = {}
    page_list = self.get_crawl_page(key_person_info)
    entries = util.json_loads(page_list)
    if entries is None:
        return result
    persons = []
    for entry in entries:
        person = bu_ding({
            GsModel.KeyPerson.KEY_PERSON_NAME: entry.get('PERSON_NAME'),
            GsModel.KeyPerson.KEY_PERSON_POSITION: entry.get('POSITION_NAME'),
        })
        persons.append(person)
    if len(persons) > 0:
        result[GsModel.KEY_PERSON] = persons
    return result
def get_key_person_info(self, key_person_info):
    """Parse the key-person list page into the gs schema dict.

    :param key_person_info: page-store dict with ``list``/``detail`` page
        lists (at least one is always present); key persons come from the
        ``list`` pages since they have no per-item detail pages. See the
        page-store docs in ``common/global_field.py`` (Model).
    :return: gs schema dict
    """
    entries = util.json_loads(self.get_crawl_page(key_person_info))
    if entries is None:
        return {}
    persons = []
    for entry in entries:
        persons.append(replace_none({
            GsModel.KeyPerson.KEY_PERSON_NAME: entry.get('NAME'),
            GsModel.KeyPerson.KEY_PERSON_POSITION: entry.get('POSITION_CN'),
        }))
    return {GsModel.KEY_PERSON: persons}
def get_contributive_info_detail(self, session, data, page_text):
    """Fetch a detail page for every record listed under ``czxx``."""
    parsed = util.json_loads(page_text)
    if parsed is None:
        return
    records = parsed.get('czxx', None)
    if records is None:
        return
    for record in records:
        recid = record.get('recid', None)
        if recid is None:
            continue
        url = 'http://{host}/pub/czxx/{recid}'.format(host=self.host, recid=recid)
        resp = self.task_request(session, session.post, url=url)
        if resp is None:
            self.append_model(data, Model.contributive_info, url, '',
                              status=self.STATUS_FAIL, classify=Model.type_detail)
        else:
            self.append_model(data, Model.contributive_info, url, resp.text,
                              classify=Model.type_detail)
def get_base_info(self, base_info):
    """Map the raw base-info JSON page onto the gs base schema."""
    page = self.get_crawl_page(base_info)
    json_data = util.json_loads(page)
    if json_data is None:
        return {}
    # label -> JSON field; the credit-code label itself comes from the page
    raw = {unicode(json_data.get('uniscIdName', '')): json_data.get('uniscId')}
    field_pairs = [
        (u'企业名称', 'entName'),
        (u'类型', 'entType_CN'),
        (u'法定代表人', 'leRep'),
        (u'注册资本', 'regCap'),
        (u'成立日期', 'estDate'),
        (u'经营期限自', 'opFrom'),
        (u'经营期限至', 'opTo'),
        (u'登记机关', 'regOrg_CN'),
        (u'核准日期', 'apprDate'),
        (u'登记状态', 'regState_CN'),
        (u'住所', 'dom'),
        (u'经营范围', 'opScope'),
    ]
    for label, field in field_pairs:
        raw[label] = json_data.get(field)
    if 'compForm_CN' in json_data.keys():
        raw[u'组成形式'] = json_data.get('compForm_CN')
    schema_dict = {}
    for label, value in raw.items():
        schema_dict[GsModel.format_base_model(label)] = '' if value is None else value
    # combine the from/to dates into a single business-period string
    schema_dict[GsModel.PERIOD] = u"{0}至{1}". \
        format(schema_dict.get(GsModel.PERIOD_FROM), schema_dict.get(GsModel.PERIOD_TO))
    return schema_dict
def get_change_shareholding_info(self, change_shareholding_info):
    """Parse shareholding-change records into the gs schema dict."""
    page = self.get_crawl_page(change_shareholding_info)
    if page is None or page == u'':
        return {}
    parsed = util.json_loads(page)
    if parsed is None:
        return {}
    entries = parsed.get('data', [])
    if entries is None:
        return {}
    records = [
        {
            GsModel.ChangeShareholding.SHAREHOLDER: entry.get('inv', ''),
            GsModel.ChangeShareholding.CHANGE_BEFORE: entry.get('transAmPrBf', ''),
            GsModel.ChangeShareholding.CHANGE_AFTER:
                util.from_13stamp_to_time(entry.get('transAmPrAf', '')),
            GsModel.ChangeShareholding.CHANGE_DATE: entry.get('altDate', ''),
            GsModel.ChangeShareholding.PUBLIC_DATE:
                util.from_13stamp_to_time(entry.get('publicDate', '')),
        }
        for entry in entries
    ]
    if records:
        return {GsModel.CHANGE_SHAREHOLDING: records}
    return {}
def get_branch_info(self, branch_info):
    """Parse branch entries from the crawled list pages.

    :param branch_info: page-store dict with ``list``/``detail`` page lists
        (at least one is always present); branches come from the ``list``
        pages since they have no per-item detail pages. See the page-store
        docs in ``common/global_field.py`` (Model).
    :return: gs schema dict
    """
    entries = util.json_loads(self.get_crawl_page(branch_info))
    if entries is None:
        return {}
    branches = []
    for entry in entries:
        branches.append(replace_none({
            GsModel.Branch.COMPAY_NAME: entry.get('BRNAME'),
            GsModel.Branch.CODE: entry.get('REGNO'),
            GsModel.Branch.REGISTERED_ADDRESS: entry.get('REGORG_CN'),
        }))
    return {GsModel.BRANCH: branches}
def get_annual_info(self, session, i_d, data):
    """List annual reports for an enterprise and fetch each year's detail.

    :param session: requests session used for all HTTP calls
    :param i_d: enterprise id used in the listing URL
    :param data: web-page model dict that collected pages are appended to
    """
    url = 'http://{host}/api/PubAnnualInfo/Annuals/{id}?_={rand}'.format(
        host=self.host, id=i_d, rand=util.get_time_stamp())
    r = self.task_request(session, session.get, url)
    if r is None:
        return
    json_data = util.json_loads(r.text)
    if json_data is None:
        return
    data_list = json_data.get('data', None)
    if data_list is None:
        return
    for item in data_list:
        anche_id = item.get('ancheId', None)
        year_info = item.get('year', None)
        if anche_id is None or year_info is None:
            continue
        # Extract the first run of digits from the year label.
        # Fix: raw string avoids the invalid '\d' escape deprecation.
        year_list = re.findall(r'(\d+)', year_info)
        if len(year_list) <= 0:
            continue
        year = year_list[0]
        # fetch the detailed annual report for that year
        self.get_annual_detail_info(session, i_d, anche_id, year, data)
def dispatch(self):
    """Parse the first crawled annual-report page into ``annual_info_dict``.

    :raises IndexError: when no page was crawled or the crawl failed
    :return: ``{}`` when there is nothing to parse; otherwise ``None`` after
        filling ``self.annual_info_dict`` in place
    """
    if self.annual_item_list is None:
        raise IndexError("未抓取到相关网页,或者抓取网页失败")
    if len(self.annual_item_list) <= 0:
        return {}
    # Fix: dropped the redundant None re-check (already raised above) and
    # corrected the message typo 为 -> 未.
    if self.annual_item_list[0].get(u'status', u'fail') != u'success':
        raise IndexError("未抓取到相关网页,或者抓取网页失败")
    page = self.annual_item_list[0].get(u'text', u'')
    json_all = util.json_loads(page)
    if json_all is None:
        # Fix: previously an unparsable page crashed with AttributeError
        # on json_all.get(...).
        return {}
    self.annual_info_dict.update(self.get_annual_base_info(json_all))
    # enterprise asset status
    self.annual_info_dict[AnnualReports.ENTERPRISE_ASSET_STATUS_INFORMATION] = \
        self.get_annual_asset_info(json_all.get('zczk', u''))
    # outbound investment
    self.annual_info_dict[AnnualReports.INVESTED_COMPANIES] = \
        self.get_annual_inv_info(json_all.get('dwtzs', u''))
    # shareholder contribution
    self.annual_info_dict[AnnualReports.SHAREHOLDER_INFORMATION] = \
        self.get_annual_share_hold_info(json_all.get('czxxs', ''))
    # outbound guarantees
    self.annual_info_dict[AnnualReports.OUT_GUARANTEE_INFO] = \
        self.get_annual_out_guarantee_info(json_all.get('dwdbs', ''))
    # websites / web shops
    self.annual_info_dict[AnnualReports.WEBSITES] = \
        self.get_annual_out_website(json_all.get('wdxxs', ''))
    # edit history
    self.annual_info_dict[AnnualReports.EDIT_CHANGE_INFOS] = \
        self.get_annual_edit_change(json_all.get('alterHis', ''))
    # shareholding changes
    self.annual_info_dict[AnnualReports.EDIT_SHAREHOLDING_CHANGE_INFOS] = \
        self.get_annual_edit_shareholding_change(json_all.get('gqbgs', ''))
def _get_annual_out_guarantee_info(self, json_str):
    """Parse the outbound-guarantee JSON list into schema dicts.

    :param json_str: JSON text of the guarantee list; may be None or empty
    :return: list of guarantee models (empty on missing or bad input)
    """
    lst = []
    if json_str is None or json_str == '':
        return lst
    js_obj = util.json_loads(json_str)
    if js_obj is None:
        # Fix: iterating None raised TypeError when the JSON failed to parse.
        return lst
    for js_item in js_obj:
        perfrom = js_item.get(u'pefPerForm', 0)
        perto = js_item.get(u'pefPerTo', 0)
        share_model = {
            AnnualReports.OutGuaranteeInfo.CREDITOR:
                js_item.get(u'more', u''),
            AnnualReports.OutGuaranteeInfo.OBLIGOR:
                js_item.get(u'mortgagor', u''),
            AnnualReports.OutGuaranteeInfo.DEBT_TYPE:
                js_item.get(u'priClaSecKindName', u''),
            AnnualReports.OutGuaranteeInfo.DEBT_AMOUNT:
                util.get_amount_with_unit(js_item.get(u'priClaSecAm', u'')),
            AnnualReports.OutGuaranteeInfo.PERFORMANCE_PERIOD:
                u"{0}-{1}".format(perfrom, perto),
            AnnualReports.OutGuaranteeInfo.GUARANTEE_PERIOD:
                js_item.get(u'guaranPeriodName', u''),
            AnnualReports.OutGuaranteeInfo.GUARANTEE_TYPE:
                js_item.get(u'gaTypeName', u''),
        }
        lst.append(share_model)
    return lst
def _get_annual_sharehold_info(self, json_str):
    """Parse the shareholder-contribution JSON list into schema dicts.

    :param json_str: JSON text of the contribution list; may be None or empty
    :return: list of shareholder models (empty on missing or bad input)
    """
    lst = []
    if json_str is None or json_str == '':
        return lst
    js_obj = util.json_loads(json_str)
    if js_obj is None:
        # Fix: iterating None raised TypeError when the JSON failed to parse.
        return lst
    for js_item in js_obj:
        share_model = {
            AnnualReports.ShareholderInformation.SHAREHOLDER_NAME:
                js_item.get(u'inv', u''),
            AnnualReports.ShareholderInformation.SUBSCRIPTION_AMOUNT:
                util.get_amount_with_unit(js_item.get(u'subConAm', u'')),
            AnnualReports.ShareholderInformation.SUBSCRIPTION_TIME:
                str(js_item.get(u'conDate', 0)),
            AnnualReports.ShareholderInformation.SUBSCRIPTION_TYPE:
                js_item.get(u'conFormName', u''),
            AnnualReports.ShareholderInformation.PAIED_AMOUNT:
                util.get_amount_with_unit(js_item.get(u'acConAm', u'')),
            AnnualReports.ShareholderInformation.PAIED_TIME:
                str(js_item.get(u'realConDate', 0)),
            AnnualReports.ShareholderInformation.PAIED_TYPE:
                js_item.get(u'realConFormName', u''),
        }
        lst.append(share_model)
    return lst
def get_shareholder_info(self, session, pri_pid, data):
    """Page through shareholder info and append every page to ``data``.

    :param session: requests session used for all HTTP calls
    :param pri_pid: enterprise primary id
    :param data: web-page model dict that collected pages are appended to
    """
    page = 1
    total_page = 1
    while page <= total_page:
        # Fix: '&currentPage' had been mangled into the HTML entity form
        # ('&curren;' renders as ¤, yielding '¤tPage'), which broke the
        # paging query parameter.
        url = 'http://{host}/ansubcapital/queryAnsubcapitaltrue.do' \
              '?pripid={pripid}&randommath={randommath}&currentPage={page}' \
            .format(host=self.host, pripid=pri_pid,
                    randommath=util.get_random_num(), page=page)
        r = self.task_request(session, session.get, url)
        if r is None:
            self.append_model(data, Model.shareholder_info, url, '', status=self.STATUS_FAIL)
            return
        json_data = util.json_loads(r.text)
        if json_data is None:
            self.append_model(data, Model.shareholder_info, url, r.text, status=self.STATUS_FAIL)
            return
        page_info = json_data.get('page', None)
        if page_info is None:
            self.append_model(data, Model.shareholder_info, url, r.text, status=self.STATUS_FAIL)
            return
        total_page = page_info.get('totalPage', None)
        if total_page is None:
            self.append_model(data, Model.shareholder_info, url, r.text, status=self.STATUS_FAIL)
            return
        total_page = int(total_page)
        if total_page == 0:
            # an empty listing still counts as one (already fetched) page
            total_page = 1
        self.append_model(data, Model.shareholder_info, url, r.text)
        page += 1
def get_change_info(self, change_info):
    """Collect change records from every crawled list page."""
    records = []
    for page in self.get_crawl_page(change_info, True):
        raw = page.get('text')
        if raw is None:
            continue
        parsed = util.json_loads(raw)
        if parsed is None:
            continue
        entries = parsed.get('data', [])
        if entries is None:
            continue
        for entry in entries:
            records.append(self.replace_none({
                GsModel.ChangeRecords.CHANGE_ITEM: entry.get('altContent'),
                # strip redundant wording
                GsModel.ChangeRecords.BEFORE_CONTENT: util.format_content(entry.get('altBeContent')),
                GsModel.ChangeRecords.AFTER_CONTENT: util.format_content(entry.get('altAfContent')),
                GsModel.ChangeRecords.CHANGE_DATE: entry.get('altDate'),
            }))
    return {GsModel.CHANGERECORDS: records}
def get_branch_info(self, branch_info):
    """Parse branch entries from the crawled list pages.

    :param branch_info: page-store dict with ``list``/``detail`` page lists
        (at least one is always present); branches come from the ``list``
        pages since they have no per-item detail pages. See the page-store
        docs in ``common/global_field.py`` (Model).
    :return: gs schema dict
    """
    result = {}
    entries = util.json_loads(self.get_crawl_page(branch_info))
    if entries is None:
        return result
    branches = []
    for entry in entries:
        branches.append(bu_ding({
            GsModel.Branch.COMPAY_NAME: entry.get('DIST_NAME'),
            GsModel.Branch.CODE: entry.get('DIST_REG_NO'),
            GsModel.Branch.REGISTERED_ADDRESS: entry.get('DIST_BELONG_ORG'),
        }))
    if len(branches) > 0:
        result[GsModel.BRANCH] = branches
    return result
def get_annual_info(self, session, param_dict, data):
    """Enumerate annual-report years and crawl every sub-section per year.

    :param session: requests session used for all HTTP calls
    :param param_dict: seed dict carrying 'org', 'id', 'seqId', 'nb_reg_no'
        and 'uniScid' extracted from the search result
    :param data: web-page model dict that collected pages are appended to
    """
    post_data = {
        'org': param_dict['org'],
        'id': param_dict['id'],
        'seqId': param_dict['seqId'],
        'abnormal': '',
        'activeTabId': '',
        'tmp': 53,
        'regNo': param_dict['nb_reg_no'],
        'admitMain': '08',
        'uniScid': param_dict['uniScid'],
        'econKind': 51,
        'pageSize': 100,
        'curPage': 1,
        'sortName': '',
        'sortOrder': '',
    }
    url = 'http://{host}/ecipplatform/publicInfoQueryServlet.json?queryQynbxxYears=true'.format(
        host=self.host)
    r = self.task_request(session, session.post, url, data=post_data)
    if r is None:
        return
    result = util.json_loads(r.text)
    if result is None:
        return
    nb_data = result.get('data')
    if nb_data is None:
        # Fix: iterating None raised TypeError when 'data' was absent.
        return
    for nb in nb_data:
        try:
            year = nb.get('REPORT_YEAR')
            nb_id = nb.get('ID')
        except AttributeError:
            # Fix: narrowed a bare except that hid unrelated errors;
            # only non-dict entries are skipped now.
            continue
        # annual report: base info
        self.get_annual_base_info(session, nb_id, data, year)
        # annual report: websites / web shops
        self.get_annual_website_info(session, nb_id, data, year)
        # annual report: shareholders
        self.get_annual_shareholder_info(session, nb_id, data, year)
        # annual report: outbound investment
        self.get_annual_investment_info(session, nb_id, data, year)
        # annual report: guarantees provided to others
        self.get_annual_assurance_info(session, nb_id, data, year)
        # annual report: shareholding changes
        self.get_annual_change_info(session, nb_id, data, year)
        # annual report: amendments
        self.get_annual_amendant_info(session, nb_id, data, year)
def get_change_shareholding_info(self, session, data, base_text):
    """Page through the shareholding-change listing and store each page."""
    sub_url = util.get_match_value('insAlterstockinfoUrl = "', '"', base_text)
    url = "http://{0}{1}".format(self.host, sub_url)
    cur_page, total_page = 1, 1
    while cur_page <= total_page:
        post_data = {'draw': 1, 'start': (cur_page - 1) * 5, 'length': 5}
        resp = self.task_request(session, session.post, url, data=post_data)
        if resp is None:
            self.append_model(data, Model.change_shareholding_info, url, '',
                              status=self.STATUS_FAIL)
            return
        self.append_model(data, Model.change_shareholding_info, url, resp.text,
                          post_data=post_data)
        parsed = util.json_loads(resp.text)
        if parsed is None:
            return
        # learn the real page count from the first successful response
        if total_page <= 1:
            total_page = parsed.get('totalPage', 1)
        cur_page += 1
def get_change_info(self, change_info):
    """Parse change-record list pages into the gs schema dict.

    :param change_info: page-store dict with ``list``/``detail`` page lists
        (at least one is always present); change records usually come from
        the ``list`` pages, though some provinces (e.g. Beijing) also ship
        detail pages. See the page-store docs in ``common/global_field.py``.
    :return: gs schema dict
    """
    change_info_dict = {}
    pages = self.get_crawl_page(change_info, True)
    lst_change_records = []
    for page in pages:
        text = page.get('text')
        json_data = util.json_loads(text)
        if json_data is None:
            # Fix: .get() was previously called on None and raised
            # AttributeError for unparsable pages; skip them instead.
            continue
        data_arr = json_data.get('data', [])
        if data_arr is None:
            return {}
        for data in data_arr:
            change_model = {
                GsModel.ChangeRecords.CHANGE_ITEM: data.get('ALTITEM_CN'),
                # strip redundant wording
                GsModel.ChangeRecords.BEFORE_CONTENT: util.format_content(data.get('ALTBE')),
                GsModel.ChangeRecords.AFTER_CONTENT: util.format_content(data.get('ALTAF')),
                # normalize the date
                GsModel.ChangeRecords.CHANGE_DATE: data.get('ALTDATE')
            }
            change_model = replace_none(change_model)
            lst_change_records.append(change_model)
    change_info_dict[GsModel.CHANGERECORDS] = lst_change_records
    return change_info_dict
def scraping_IP_int(head, raw_data):
    """Scrape Interpark international fare rows out of crawled JSON.

    :param head: not used here; kept for a uniform scraper signature
    :param raw_data: raw JSON text of the crawl response
    :return: list of fare rows, or None when parsing fails or no data exists
    """
    log('start scraping Interpark crawled data')
    raw_json = json_loads(raw_data)
    if raw_json is None:  # parse error
        return None
    goods_list = raw_json['Responses']['GoodsList']
    # the API returns a plain string instead of a dict when there is no data
    if isinstance(goods_list, str):  # Fix: isinstance over type() == str
        return None
    fare_goods = goods_list['Goods']
    # a single record arrives as a dict rather than a list — normalize
    if isinstance(fare_goods, dict):
        fare_goods = [fare_goods]
    rows = []
    for fare_set in fare_goods:
        air_itns = fare_set['AirAvail']['StartAvail']['AirItn']
        # same single-record normalization for itineraries
        if isinstance(air_itns, dict):
            air_itns = [air_itns]
        for air_itn in air_itns:
            seg = air_itn['seg_detail_t']
            rows.append([
                seg['car_code'], seg['main_flt'], fare_set['StartDT'],
                seg['dep_city'], seg['arr_city'],
                # [8:] drops the date part, keeping only the time portion
                seg['dep_date_time'][8:], seg['arr_date_time'][8:],
                fare_set['SaleFare'], fare_set['Qcharge'], fare_set['Tax'],
                seg['no_of_avail_seat']
            ])
    log('end scraping Interpark crawled data')
    return rows
def get_change_info(self, change_info):
    """Parse the first change-record page into the gs schema dict."""
    change_info_dict = {}
    change_info_list = []
    pages = self.get_crawl_page(change_info, True)
    if pages is None or len(pages) <= 0:
        return change_info_dict
    page = pages[0]
    page_text = page.get(u'text', u'')
    json_data = util.json_loads(page_text)
    if json_data is None:
        return change_info_dict
    data_arr = json_data.get('data', [])
    if data_arr is None:
        return change_info_dict
    for data in data_arr:
        # the change item arrives as an HTML fragment; strip the markup
        html_alt_item = data.get('altItem_CN', '')
        alt_item = ''
        if html_alt_item != '':
            alt_item = PyQuery(html_alt_item, parser='html').remove('div').remove("span") \
                .text().replace(' ', '').strip()
        change_model = {
            GsModel.ChangeRecords.CHANGE_ITEM: alt_item,
            # Fix: BEFORE/AFTER were swapped — 'altBe' is the before value
            # and 'altAf' the after value, matching the sibling parsers.
            GsModel.ChangeRecords.BEFORE_CONTENT: data.get('altBe', u''),
            GsModel.ChangeRecords.AFTER_CONTENT: data.get('altAf', u''),
            GsModel.ChangeRecords.CHANGE_DATE: data.get('altDate', u'')
        }
        change_info_list.append(change_model)
    if len(change_info_list) > 0:
        change_info_dict[GsModel.CHANGERECORDS] = change_info_list
    return change_info_dict
def get_annual_detail(self, session, text, data):
    """Fetch the annual-report detail page for each listed report year."""
    items = util.json_loads(text)
    if items is None:
        return
    for item in items:
        report_id = item.get('ancheid', None)
        report_year = item.get('ancheyear', None)
        if report_id is None or report_year is None:
            continue
        url = 'http://{host}/pub/annual/qy/detail/{ancheid}'.format(
            host=self.host, ancheid=report_id)
        resp = self.task_request(session, session.post, url=url)
        if resp is None:
            self.append_model(data, Model.annual_info, url, '',
                              year=report_year, status=self.STATUS_FAIL,
                              classify=Model.type_detail)
            continue
        self.append_model(data, Model.annual_info, url, resp.text,
                          year=report_year, classify=Model.type_detail)
def get_search_list_html(self, keyword, session):
    """Search companies by keyword.

    :return: tuple of (seed param list, search status code)
    """
    results = []
    try:
        url = ("http://{host}/saicpub/entPublicitySC/entPublicityDC/"
               "lngsSearchFpc!searchSolr.action").format(host=self.host)
        query = {
            'solrCondition': keyword,
            'authCode': 'finish',
            'currentPage': 1,
            'pageSize': 10,
        }
        # retry-on-captcha behaviour lives inside task_request
        r = self.task_request(session, session.post, url, data=query)
        if r is None:
            return results, self.SEARCH_ERROR
        json_data = util.json_loads(r.text)
        if json_data is None:
            return results, self.SEARCH_ERROR
        json_array = json_data.get('jsonArray', None)
        if json_array is None:
            return results, self.SEARCH_ERROR
        if len(json_array) <= 0:
            return results, self.SEARCH_NOTHING_FIND
        for item in json_array:
            if item is None:
                continue
            pri_pid = item.get('pripid')
            ent_name = item.get('realEntName')
            ent_type = item.get('enttype')
            seed_code = item.get('realRegNo')
            # all three identifiers are mandatory for a usable seed
            if pri_pid is None or pri_pid == '':
                continue
            if ent_name is None or ent_name == '':
                continue
            if ent_type is None or ent_type == '':
                continue
            param = {
                'pripid': pri_pid,
                'entname': ent_name,
                'enttype': ent_type,
                'search_name': ent_name,
            }
            if seed_code is not None:
                param['unified_social_credit_code'] = seed_code
            results.append(param)
        status = self.SEARCH_SUCCESS if len(results) > 0 else self.SEARCH_NOTHING_FIND
        return results, status
    except Exception as e:
        self.log.exception(e)
        return results, self.SEARCH_ERROR
def get_contributive_info(self, session, data, base_text):
    """Page through the shareholder listing and fetch each detail page."""
    sub_url = util.get_match_value('var shareholderUrl = "', '";', base_text)
    url = "http://{0}{1}".format(self.host, sub_url)
    cur_page, total_page = 1, 1
    while cur_page <= total_page:
        post_data = {'draw': 1, 'start': (cur_page - 1) * 5, 'length': 5}
        resp = self.task_request(session, session.post, url, data=post_data)
        if resp is None:
            self.append_model(data, Model.contributive_info, url, '',
                              status=self.STATUS_FAIL)
            return
        self.append_model(data, Model.contributive_info, url, resp.text,
                          post_data=post_data)
        # pull the per-shareholder detail pages referenced by the list page
        parsed = util.json_loads(resp.text)
        if parsed is None:
            return
        for entry in parsed.get('data', []):
            inv_id = entry.get('invId', '')
            detail_url = "http://{0}/corp-query-entprise-info-shareholderDetail-{1}.html".format(
                self.host, inv_id)
            detail_r = self.task_request(session, session.get, detail_url)
            if detail_r is None:
                self.append_model(data, Model.contributive_info, detail_url, '',
                                  status=self.STATUS_FAIL,
                                  classify=Model.type_detail)
                continue
            self.append_model(data, Model.contributive_info, detail_url,
                              detail_r.text, classify=Model.type_detail)
        # learn the real page count from the first successful response
        if total_page <= 1:
            total_page = parsed.get('totalPage', 1)
        cur_page += 1
def get_shareholder_info(self, shareholder_info):
    """Parse shareholder list pages into the gs shareholder schema."""
    shareholders = []
    for page in self.get_crawl_page(shareholder_info, True):
        parsed = util.json_loads(page.get('text'))
        if parsed is None:
            continue
        entries = parsed.get('data', [])
        if entries is None:
            continue
        for entry in entries:
            subscription = replace_none({
                GsModel.ShareholderInformation.SUBSCRIPTION_TYPE: entry.get('subConForm_CN'),
                GsModel.ShareholderInformation.SUBSCRIPTION_AMOUNT: entry.get('subConAm'),
                # NOTE(review): 'currency' is mapped into SUBSCRIPTION_TIME —
                # looks suspicious, confirm against the site's JSON schema.
                GsModel.ShareholderInformation.SUBSCRIPTION_TIME: entry.get('currency'),
                GsModel.ShareholderInformation.SUBSCRIPTION_PUBLISH_TIME: entry.get('shouldPublicDate'),
            })
            paid = replace_none({
                GsModel.ShareholderInformation.PAIED_TYPE: entry.get('acConForm_CN'),
                GsModel.ShareholderInformation.PAIED_AMOUNT: entry.get('acConAm'),
                GsModel.ShareholderInformation.PAIED_TIME: entry.get('conDate'),
                GsModel.ShareholderInformation.PAIED_PUBLISH_TIME: entry.get('factPublicDate'),
            })
            shareholders.append(replace_none({
                GsModel.ShareholderInformation.SHAREHOLDER_NAME: entry.get('inv'),
                GsModel.ShareholderInformation.SUBSCRIPTION_AMOUNT: entry.get('totalSubConAm'),
                GsModel.ShareholderInformation.PAIED_AMOUNT: entry.get('totalAcConAm'),
                GsModel.ShareholderInformation.SUBSCRIPTION_DETAIL: [subscription],
                GsModel.ShareholderInformation.PAIED_DETAIL: [paid],
            }))
    return {GsModel.SHAREHOLDER_INFORMATION: shareholders}
def __get_company_name(text):
    """Extract and trim the company name from a JSON page, or None."""
    parsed = util.json_loads(text)
    if parsed is None:
        return None
    name = parsed.get('entName', None)
    return None if name is None else name.strip()
def __get_company_name(text):
    """Extract and trim the company name from a JSON page, or None."""
    parsed = util.json_loads(text)
    if parsed is None:
        return None
    name = parsed.get('ENTNAME', None)
    if name is None:
        return None
    return name.strip()
def __get_company_name(self, text):
    """Return the company name from the JSON page, or None on any failure."""
    try:
        parsed = util.json_loads(text)
        return None if parsed is None else parsed.get('CORP_NAME', None)
    except Exception as e:
        self.log.exception(e)
        return None
def get_keyword_info(base_text):
    """Extract registration identifiers from the base-info JSON text.

    Example payload: "REG_NO":"913202052500830484",
    "UNI_SCID":"97F2A486B9901B971CCF8E9FA95B14A3"

    :param base_text: raw JSON text of the base-info page
    :return: tuple (reg_no, uni_scid, nb_reg_no); each element may be None
    """
    json_data = util.json_loads(base_text)
    if json_data is None:
        # Fix: this path previously returned a 2-tuple while the success
        # path returns a 3-tuple, breaking 3-way unpacking at call sites.
        return None, None, None
    reg_no = json_data.get('REG_NO', None)
    uni_scid = json_data.get('UNI_SCID', None)
    nb_reg_no = json_data.get('REG_NO_EN', None)
    return reg_no, uni_scid, nb_reg_no