def get_change_info(self, change_info):
    """Build the change-records section from JSON list pages.

    Every crawled page's ``text`` is decoded with ``json.loads`` and is
    treated as a list of objects whose ``list`` entry holds the change rows
    (row keys: ``altitem_cn``, ``altbe``, ``altaf``, ``altdate``).

    Pages without text, ``null`` payloads and ``null``/missing ``list``
    entries are skipped instead of raising. Malformed JSON still raises
    ``ValueError`` from ``json.loads``.

    :param change_info: page-store entry handed to ``self.get_crawl_page``.
    :return: ``{GsModel.CHANGERECORDS: [...]}`` when at least one row was
        found, otherwise an empty dict.
    """
    change_info_dict = {}
    page_items = self.get_crawl_page(change_info, True)
    if not page_items:
        # Nothing crawled (None or empty): there is nothing to report.
        return change_info_dict

    change_records_list = []
    for page in page_items:
        text = page.get('text')
        if not text:
            # json.loads would raise on None / empty text.
            continue
        for page_list in json.loads(text) or []:
            # ``"list": null`` yields None even with a default, hence ``or``.
            for pl in page_list.get('list') or []:
                change_model = {
                    GsModel.ChangeRecords.CHANGE_ITEM: pl.get('altitem_cn'),
                    # format_content is meant to strip superfluous characters
                    GsModel.ChangeRecords.BEFORE_CONTENT: util.format_content(pl.get('altbe')),
                    GsModel.ChangeRecords.AFTER_CONTENT: util.format_content(pl.get('altaf')),
                    # date is stored exactly as delivered by the source
                    GsModel.ChangeRecords.CHANGE_DATE: pl.get('altdate'),
                }
                change_records_list.append(self.replace_none(change_model))

    if change_records_list:
        change_info_dict[GsModel.CHANGERECORDS] = change_records_list
    return change_info_dict
def get_change_info(self, page):
    """Parse change records out of the ``#table_bgxx`` HTML table.

    :param page: HTML text of the page; ``None`` or a dict yields ``{}``.
    :return: ``{}`` for unusable input, otherwise a dict whose
        ``GsModel.CHANGERECORDS`` entry is the list of parsed rows, or
        ``None`` when no row qualified.
    """
    if page is None or isinstance(page, dict):
        return {}

    rows = py(page, parser='html').find('#table_bgxx').find('tr').items()

    records = []
    for row in rows:
        cells = row.find('td')
        # Rows with fewer than two cells are skipped.
        if cells is None or len(cells) < 2:
            continue
        records.append({
            GsModel.ChangeRecords.CHANGE_ITEM: cells.eq(1).text(),
            # format_content is meant to strip superfluous characters
            GsModel.ChangeRecords.BEFORE_CONTENT: util.format_content(cells.eq(2).text()),
            GsModel.ChangeRecords.AFTER_CONTENT: util.format_content(cells.eq(3).text()),
            # date text is taken verbatim from the fifth cell
            GsModel.ChangeRecords.CHANGE_DATE: cells.eq(4).text()
        })

    # The key is always present; an empty result is reported as None.
    return {GsModel.CHANGERECORDS: records if records else None}
def get_inline_change_info(page):
    """Parse change records out of the ``#changeTab`` HTML table.

    :param page: HTML text handed to ``PyQuery``.
    :return: ``{GsModel.CHANGERECORDS: [...]}`` when at least one row was
        parsed, otherwise an empty dict.
    """
    document = PyQuery(page, parser='html')

    records = []
    for row in document.find('#changeTab').find('tr').items():
        cells = row.find('td')
        # Rows with fewer than two cells are skipped.
        if len(cells) < 2:
            continue
        records.append({
            GsModel.ChangeRecords.CHANGE_ITEM: cells.eq(1).text(),
            # format_content is meant to strip superfluous characters
            GsModel.ChangeRecords.BEFORE_CONTENT: util.format_content(cells.eq(2).text()),
            GsModel.ChangeRecords.AFTER_CONTENT: util.format_content(cells.eq(3).text()),
            # date text is taken verbatim from the fifth cell
            GsModel.ChangeRecords.CHANGE_DATE: cells.eq(4).text()
        })

    if not records:
        return {}
    return {GsModel.CHANGERECORDS: records}
def get_change_info(self, change_info):
    """Build the change-records section from paged JSON responses.

    :param change_info: page-store dict containing two lists, ``list`` and
        ``detail``, which hold crawled page data. At least one of the two is
        always present, otherwise the packet is considered invalid. ``list``
        generally stores the paginated list pages and ``detail`` stores the
        detail pages of list items. For the exact structure see the mongodb
        page store or the Model definition comments in
        common/global_field.py.
        Change info normally only has the ``list`` part, but detail pages
        can occur in special cases (this has been observed for Beijing).
    :return: schema dict. ``{}`` when nothing was crawled, otherwise
        ``{GsModel.CHANGERECORDS: [...]}`` (the list may be empty).
    """
    change_info_dict = {}
    pages = self.get_crawl_page(change_info, True)
    if pages is None:
        return change_info_dict

    lst_change_records = []
    for page in pages:
        json_data = util.json_loads(page.get('text'))
        if json_data is None:
            # Unparseable page: skip it rather than fail on ``.get``.
            continue
        data_arr = json_data.get('data', [])
        if data_arr is None:
            # A page with ``"data": null`` must not discard the records
            # already collected from earlier pages.
            continue
        for data in data_arr:
            change_model = {
                GsModel.ChangeRecords.CHANGE_ITEM: data.get('ALTITEM_CN'),
                # format_content is meant to strip superfluous characters
                GsModel.ChangeRecords.BEFORE_CONTENT: util.format_content(data.get('ALTBE')),
                GsModel.ChangeRecords.AFTER_CONTENT: util.format_content(data.get('ALTAF')),
                # date is stored exactly as delivered by the source
                GsModel.ChangeRecords.CHANGE_DATE: data.get('ALTDATE'),
            }
            lst_change_records.append(replace_none(change_model))

    change_info_dict[GsModel.CHANGERECORDS] = lst_change_records
    return change_info_dict
def get_change_info(self, change_info):
    """Parse change records out of ``.partner_com`` HTML tables.

    :param change_info: page-store entry handed to ``self.get_crawl_page``.
    :return: ``{}`` when no pages were crawled, otherwise
        ``{GsModel.CHANGERECORDS: [...]}`` (the list may be empty).
    """
    pages = self.get_crawl_page(change_info, True)
    if pages is None:
        return {}

    records = []
    for page in pages:
        document = py(page.get(u'text', u''), parser='html')
        # Rows carrying the ``partner_com_top`` class are excluded.
        rows = document.find('.partner_com').find('tr').not_('.partner_com_top')
        for row in rows.items():
            cells = row.find('td')
            records.append({
                GsModel.ChangeRecords.CHANGE_ITEM: cells.eq(1).text(),
                # format_content is meant to strip superfluous characters
                GsModel.ChangeRecords.BEFORE_CONTENT: util.format_content(cells.eq(2).text()),
                GsModel.ChangeRecords.AFTER_CONTENT: util.format_content(cells.eq(3).text()),
                # date text is taken verbatim from the fifth cell
                GsModel.ChangeRecords.CHANGE_DATE: cells.eq(4).text()
            })

    return {GsModel.CHANGERECORDS: records}
def get_change_info(self, change_info):
    """Build the change-records section from paged JSON responses.

    Each page's ``text`` is parsed with ``util.json_loads``; rows are read
    from its ``data`` list (row keys: ``altItem_CN``, ``altBe``, ``altAf``,
    ``altDate``). Pages that fail to parse or carry ``"data": null`` are
    skipped.

    :param change_info: page-store entry handed to ``self.get_crawl_page``.
    :return: ``{}`` when nothing was crawled, otherwise
        ``{GsModel.CHANGERECORDS: [...]}`` (the list may be empty).
    """
    change_info_dict = {}
    pages = self.get_crawl_page(change_info, True)
    if pages is None:
        # Other parsers in this file treat a missing crawl result as "no data".
        return change_info_dict

    lst_change_records = []
    for page in pages:
        json_data = util.json_loads(page.get('text'))
        if json_data is None:
            continue
        data_list = json_data.get('data', [])
        if data_list is None:
            continue
        for data in data_list:
            change_model = {
                GsModel.ChangeRecords.CHANGE_ITEM: data.get('altItem_CN'),
                # format_content is meant to strip superfluous characters
                GsModel.ChangeRecords.BEFORE_CONTENT: util.format_content(data.get('altBe')),
                GsModel.ChangeRecords.AFTER_CONTENT: util.format_content(data.get('altAf')),
                # date is stored exactly as delivered by the source
                GsModel.ChangeRecords.CHANGE_DATE: data.get('altDate'),
            }
            lst_change_records.append(replace_none(change_model))

    change_info_dict[GsModel.CHANGERECORDS] = lst_change_records
    return change_info_dict
def _get_change_info_td_text(self, tds, start_index=1): if tds is None: return {} length = len(tds) # todo 这个是一个暂时的方案 ,北京-中信银行特殊结构 is_special = False sepcial_field = u'详细' if sepcial_field in tds.text(): is_special = True if length <= start_index + 2: return {} else: if length <= start_index + 3: return {} change_model = { GsModel.ChangeRecords.CHANGE_ITEM: tds.eq(start_index).text(), # 去除多余的字 GsModel.ChangeRecords.BEFORE_CONTENT: util.format_content( tds.eq(start_index + 1).text()) if not is_special else sepcial_field, GsModel.ChangeRecords.AFTER_CONTENT: util.format_content( tds.eq(start_index + 2).text()) if not is_special else sepcial_field, # 日期格式化 GsModel.ChangeRecords.CHANGE_DATE: tds.eq(start_index + 3).text() if not is_special else tds.eq(start_index + 2).text() } return change_model
def get_change_info(self, change_info):
    """Build the change-records section from paged JSON responses.

    Each page's ``text`` is parsed with ``util.json_loads``; rows are read
    from its ``data`` list (row keys: ``altContent``, ``altBeContent``,
    ``altAfContent``, ``altDate``). Pages with no text, unparseable text or
    ``"data": null`` are skipped.

    :param change_info: page-store entry handed to ``self.get_crawl_page``.
    :return: ``{}`` when nothing was crawled, otherwise
        ``{GsModel.CHANGERECORDS: [...]}`` (the list may be empty).
    """
    change_info_dict = {}
    pages = self.get_crawl_page(change_info, True)
    if pages is None:
        # Other parsers in this file treat a missing crawl result as "no data".
        return change_info_dict

    lst_change_records = []
    for page in pages:
        text = page.get('text')
        if text is None:
            continue
        native_json = util.json_loads(text)
        if native_json is None:
            continue
        data_json_arr = native_json.get('data', [])
        if data_json_arr is None:
            continue
        for data in data_json_arr:
            change_model = {
                GsModel.ChangeRecords.CHANGE_ITEM: data.get('altContent'),
                # format_content is meant to strip superfluous characters
                GsModel.ChangeRecords.BEFORE_CONTENT: util.format_content(data.get('altBeContent')),
                GsModel.ChangeRecords.AFTER_CONTENT: util.format_content(data.get('altAfContent')),
                GsModel.ChangeRecords.CHANGE_DATE: data.get('altDate'),
            }
            lst_change_records.append(self.replace_none(change_model))

    change_info_dict[GsModel.CHANGERECORDS] = lst_change_records
    return change_info_dict
def get_change_info(self, change_info):
    """Build the change-records section from a single JSON page.

    Rows are read from the ``data`` list of the parsed page (row keys:
    ``bcsxmc``, ``bcnr``, ``bghnr``, ``hzrq``).

    :param change_info: page-store entry handed to ``self.get_crawl_page``.
    :return: ``{}`` when the page text is missing/empty, cannot be parsed or
        has ``"data": null``; otherwise ``{GsModel.CHANGERECORDS: [...]}``.
    """
    page_text = self.get_crawl_page(change_info)
    if page_text is None or page_text == u'':
        return {}

    native_json = util.json_loads(page_text)
    if native_json is None:
        return {}

    rows = native_json.get('data', [])
    if rows is None:
        return {}

    records = [
        replace_none({
            GsModel.ChangeRecords.CHANGE_ITEM: row.get('bcsxmc', ''),
            # format_content is meant to strip superfluous characters
            GsModel.ChangeRecords.BEFORE_CONTENT: util.format_content(row.get('bcnr', '')),
            GsModel.ChangeRecords.AFTER_CONTENT: util.format_content(row.get('bghnr', '')),
            # date is stored exactly as delivered by the source
            GsModel.ChangeRecords.CHANGE_DATE: row.get('hzrq', '')
        })
        for row in rows
    ]
    return {GsModel.CHANGERECORDS: records}
def _get_change_info_2_model(self, obj):
    """Map one raw change row onto the change-record schema.

    :param obj: dict-like row providing ``altFiledName``, ``altBe``,
        ``altAf`` and ``altDate``.
    :return: dict keyed by the ``GsModel.ChangeRecords`` field names.
    """
    item_name = obj.get(u'altFiledName', u'')
    # format_content is meant to strip superfluous characters
    before_content = util.format_content(obj.get(u'altBe', u''))
    after_content = util.format_content(obj.get(u'altAf', u''))
    # A missing date falls back to 0; the value is not reformatted here.
    change_date = obj.get(u'altDate', 0)

    return {
        GsModel.ChangeRecords.CHANGE_ITEM: item_name,
        GsModel.ChangeRecords.BEFORE_CONTENT: before_content,
        GsModel.ChangeRecords.AFTER_CONTENT: after_content,
        GsModel.ChangeRecords.CHANGE_DATE: change_date
    }
def get_change_info(self, change_info):
    """Build the change-records section from paged JSON responses.

    :param change_info: page-store dict containing two lists, ``list`` and
        ``detail``, which hold crawled page data. At least one of the two is
        always present, otherwise the packet is considered invalid. ``list``
        generally stores the paginated list pages and ``detail`` stores the
        detail pages of list items. For the exact structure see the mongodb
        page store or the Model definition comments in
        common/global_field.py.
        Change info normally only has the ``list`` part, but detail pages
        can occur in special cases (this has been observed for Beijing).
    :return: schema dict: ``{GsModel.CHANGERECORDS: [...]}`` when at least
        one row was found, otherwise an empty dict.
    """
    change_info_dict = {}
    page_list = self.get_crawl_page(change_info, True)
    if page_list is None:
        return change_info_dict

    change_record_list = []
    for page in page_list:
        native_json = util.json_loads(page.get('text'))
        if native_json is None:
            continue
        # ``"data": null`` makes .get return None despite the default, so
        # fall back to an empty list instead of failing in the loop.
        for data in native_json.get('data') or []:
            change_model = {
                GsModel.ChangeRecords.CHANGE_ITEM: data.get('CHANGE_ITEM_NAME'),
                # format_content is meant to strip superfluous characters
                GsModel.ChangeRecords.BEFORE_CONTENT: util.format_content(data.get('OLD_CONTENT')),
                GsModel.ChangeRecords.AFTER_CONTENT: util.format_content(data.get('NEW_CONTENT')),
                # date is stored exactly as delivered by the source
                GsModel.ChangeRecords.CHANGE_DATE: data.get('CHANGE_DATE'),
            }
            change_record_list.append(bu_ding(change_model))

    if change_record_list:
        change_info_dict[GsModel.CHANGERECORDS] = change_record_list
    return change_info_dict
def get_change_info_page(self, tds, start_index=1, detail=None):
    """Turn the cells of one table row into a change-record dict.

    When the row contains a link with a non-blank ``onclick`` attribute, the
    before/after contents are obtained from
    ``self.get_change_info_detail``; otherwise they are read from the cells
    directly.

    :param tds: the row's ``td`` selection; ``None`` yields ``{}``.
    :param start_index: index of the cell holding the change item.
    :param detail: passed through to ``self.get_change_info_detail``.
    :return: the change-record dict, or ``{}`` when ``tds`` is ``None``.
    """
    if tds is None:
        return {}

    onclick = tds.find('a').attr('onclick')
    has_detail_link = onclick is not None and onclick.strip() != ''

    if has_detail_link:
        before, after = self.get_change_info_detail(onclick.strip(), detail)
        # Rows with a detail link carry the date one cell earlier.
        date_cell = tds.eq(start_index + 2)
    else:
        before = util.format_content(tds.eq(start_index + 1).text())
        after = util.format_content(tds.eq(start_index + 2).text())
        date_cell = tds.eq(start_index + 3)

    return {
        GsModel.ChangeRecords.CHANGE_ITEM: tds.eq(start_index).text(),
        GsModel.ChangeRecords.BEFORE_CONTENT: before,
        GsModel.ChangeRecords.AFTER_CONTENT: after,
        GsModel.ChangeRecords.CHANGE_DATE: date_cell.text(),
    }
def get_change_info(self, page):
    """Build the change-records section from a JSON page.

    Rows are read from the ``bgsx`` entry of the parsed page (row keys:
    ``altitem``, ``altbe``, ``altaf``, ``altdate``).

    :param page: JSON text handed to ``util.json_loads``.
    :return: ``{}`` when the page cannot be parsed or ``bgsx`` is ``null``;
        otherwise ``{GsModel.CHANGERECORDS: [...]}`` (the list may be empty).
    """
    json_data = util.json_loads(page)
    if json_data is None:
        # Unparseable page: calling ``.get`` on None would raise.
        return {}

    json_data_arr = json_data.get('bgsx', [])
    if json_data_arr is None:
        return {}

    lst_change_records = []
    for json_data_item in json_data_arr:
        change_model = {
            GsModel.ChangeRecords.CHANGE_ITEM: json_data_item.get('altitem'),
            GsModel.ChangeRecords.BEFORE_CONTENT: util.format_content(json_data_item.get('altbe')),
            GsModel.ChangeRecords.AFTER_CONTENT: util.format_content(json_data_item.get('altaf')),
            GsModel.ChangeRecords.CHANGE_DATE: json_data_item.get('altdate'),
        }
        lst_change_records.append(replace_none(change_model))

    return {GsModel.CHANGERECORDS: lst_change_records}