def get_change_info(self, change_info):
    """Collect change records from every crawled JSON page.

    The 'text' of each page is decoded with json.loads and iterated as a
    sequence of dicts; the 'list' entry of each dict supplies the raw rows.

    :param change_info: handed unchanged to self.get_crawl_page(change_info, True)
    :return: dict with the records stored under GsModel.CHANGERECORDS, or an
             empty dict when no record was collected
    """
    records = []
    for page in self.get_crawl_page(change_info, True):
        for group in json.loads(page.get('text')):
            for row in group.get('list', []):
                item = row.get('altitem_cn')
                # Original note: format_content removes superfluous characters.
                before = util.format_content(row.get('altbe'))
                after = util.format_content(row.get('altaf'))
                # The date is stored exactly as received; nothing reformats it here.
                date = row.get('altdate')
                record = self.replace_none({
                    GsModel.ChangeRecords.CHANGE_ITEM: item,
                    GsModel.ChangeRecords.BEFORE_CONTENT: before,
                    GsModel.ChangeRecords.AFTER_CONTENT: after,
                    GsModel.ChangeRecords.CHANGE_DATE: date,
                })
                records.append(record)

    if not records:
        return {}
    return {GsModel.CHANGERECORDS: records}
示例#2
0
    def get_change_info(self, page):
        """Extract change records from the '#table_bgxx' table of an HTML page.

        :param page: HTML text; a dict or None short-circuits to an empty dict
        :return: dict mapping GsModel.CHANGERECORDS to the list of records,
                 or to None when no qualifying row was found
        """
        if page is None or isinstance(page, dict):
            return {}

        records = []
        rows = py(page, parser='html').find('#table_bgxx').find('tr')
        for row in rows.items():
            cells = row.find('td')
            # Only rows carrying at least two cells are turned into records.
            if cells is not None and len(cells) >= 2:
                records.append({
                    GsModel.ChangeRecords.CHANGE_ITEM: cells.eq(1).text(),
                    # Original note: format_content removes superfluous characters.
                    GsModel.ChangeRecords.BEFORE_CONTENT:
                        util.format_content(cells.eq(2).text()),
                    GsModel.ChangeRecords.AFTER_CONTENT:
                        util.format_content(cells.eq(3).text()),
                    # The date cell text is stored as-is; no reformatting here.
                    GsModel.ChangeRecords.CHANGE_DATE: cells.eq(4).text(),
                })

        return {GsModel.CHANGERECORDS: records if records else None}
示例#3
0
    def get_inline_change_info(page):
        """Extract change records from the '#changeTab' table of an HTML page.

        :param page: HTML text handed to PyQuery with the 'html' parser
        :return: dict with the records stored under GsModel.CHANGERECORDS, or
                 an empty dict when no qualifying row was found
        """
        records = []
        table_rows = PyQuery(page, parser='html').find('#changeTab').find('tr')
        for row in table_rows.items():
            cells = row.find('td')
            # Only rows carrying at least two cells are turned into records.
            if len(cells) >= 2:
                item = cells.eq(1).text()
                # Original note: format_content removes superfluous characters.
                before = util.format_content(cells.eq(2).text())
                after = util.format_content(cells.eq(3).text())
                # The date cell text is stored as-is; no reformatting here.
                date = cells.eq(4).text()
                records.append({
                    GsModel.ChangeRecords.CHANGE_ITEM: item,
                    GsModel.ChangeRecords.BEFORE_CONTENT: before,
                    GsModel.ChangeRecords.AFTER_CONTENT: after,
                    GsModel.ChangeRecords.CHANGE_DATE: date,
                })

        if not records:
            return {}
        return {GsModel.CHANGERECORDS: records}
    def get_change_info(self, change_info):
        """Build the change-record section from the crawled JSON pages.

        :param change_info: page-store dict containing two lists, 'list' and
            'detail', whose items are crawled page data. At least one of the
            two lists is always present, otherwise the packet is considered
            invalid. 'list' generally holds the paginated list pages and
            'detail' holds the detail pages of list items. For the exact
            structure see the mongodb page store or the Model comments in
            common/global_field.py. Change info normally only has the 'list'
            list, but a 'detail' list can occur in special cases (it has been
            observed for the Beijing province).
        :return: business-registration schema dict with the collected records
            under GsModel.CHANGERECORDS; an empty dict when no page list is
            available
        """
        change_info_dict = {}
        pages = self.get_crawl_page(change_info, True)
        if pages is None:
            return change_info_dict

        lst_change_records = []
        for page in pages:
            json_data = util.json_loads(page.get('text'))
            if json_data is None:
                # Unparseable page: skip it instead of failing on .get().
                continue

            data_arr = json_data.get('data', [])
            if data_arr is None:
                # A page without data must not discard the records already
                # gathered from the other pages, so skip it rather than return.
                continue

            for data in data_arr:
                change_model = {
                    GsModel.ChangeRecords.CHANGE_ITEM:
                    data.get('ALTITEM_CN'),
                    # Original note: format_content removes superfluous characters.
                    GsModel.ChangeRecords.BEFORE_CONTENT:
                    util.format_content(data.get('ALTBE')),
                    GsModel.ChangeRecords.AFTER_CONTENT:
                    util.format_content(data.get('ALTAF')),
                    # The date is stored as received; nothing reformats it here.
                    GsModel.ChangeRecords.CHANGE_DATE:
                    data.get('ALTDATE')
                }
                change_model = replace_none(change_model)
                lst_change_records.append(change_model)
        change_info_dict[GsModel.CHANGERECORDS] = lst_change_records
        return change_info_dict
    def get_change_info(self, change_info):
        """Extract change records from the '.partner_com' tables of HTML pages.

        Rows carrying the 'partner_com_top' class are excluded; every other
        row produces one record.

        :param change_info: handed unchanged to self.get_crawl_page(change_info, True)
        :return: dict with the record list under GsModel.CHANGERECORDS, or an
                 empty dict when get_crawl_page returns None
        """
        pages = self.get_crawl_page(change_info, True)
        if pages is None:
            return {}

        records = []
        for page in pages:
            html = page.get(u'text', u'')
            table = py(html, parser='html').find('.partner_com')
            body_rows = table.find('tr').not_('.partner_com_top')
            for row in body_rows.items():
                cells = row.find('td')
                item = cells.eq(1).text()
                # Original note: format_content removes superfluous characters.
                before = util.format_content(cells.eq(2).text())
                after = util.format_content(cells.eq(3).text())
                # The date cell text is stored as-is; no reformatting here.
                date = cells.eq(4).text()
                records.append({
                    GsModel.ChangeRecords.CHANGE_ITEM: item,
                    GsModel.ChangeRecords.BEFORE_CONTENT: before,
                    GsModel.ChangeRecords.AFTER_CONTENT: after,
                    GsModel.ChangeRecords.CHANGE_DATE: date,
                })

        return {GsModel.CHANGERECORDS: records}
示例#6
0
    def get_change_info(self, change_info):
        """Collect change records from the 'data' array of each JSON page.

        Pages whose text does not parse (util.json_loads gives None) or whose
        'data' entry is None contribute nothing.

        :param change_info: handed unchanged to self.get_crawl_page(change_info, True)
        :return: dict with the (possibly empty) record list under
                 GsModel.CHANGERECORDS
        """
        records = []
        for page in self.get_crawl_page(change_info, True):
            parsed = util.json_loads(page.get('text'))
            if parsed is None:
                continue

            rows = parsed.get('data', [])
            if rows is None:
                continue

            for row in rows:
                item = row.get('altItem_CN')
                # Original note: format_content removes superfluous characters.
                before = util.format_content(row.get('altBe'))
                after = util.format_content(row.get('altAf'))
                # The date is stored as received; nothing reformats it here.
                date = row.get('altDate')
                records.append(replace_none({
                    GsModel.ChangeRecords.CHANGE_ITEM: item,
                    GsModel.ChangeRecords.BEFORE_CONTENT: before,
                    GsModel.ChangeRecords.AFTER_CONTENT: after,
                    GsModel.ChangeRecords.CHANGE_DATE: date,
                }))

        return {GsModel.CHANGERECORDS: records}
    def _get_change_info_td_text(self, tds, start_index=1):
        """Turn the cells of one table row into a change-record dict.

        :param tds: the row's cell collection; None yields an empty dict
        :param start_index: index of the cell holding the change item
        :return: the change-record dict, or an empty dict when the row has too
                 few cells for its layout
        """
        if tds is None:
            return {}

        # TODO: temporary workaround for the special row structure seen for
        # Beijing - CITIC Bank: such rows contain this marker text, show no
        # before/after columns, and keep the date one cell earlier.
        marker = u'详细'
        has_marker = marker in tds.text()

        # Marker rows need cells up to start_index + 2, regular rows up to
        # start_index + 3.
        required_cells = start_index + (3 if has_marker else 4)
        if len(tds) < required_cells:
            return {}

        item = tds.eq(start_index).text()
        if has_marker:
            before = marker
            after = marker
            date = tds.eq(start_index + 2).text()
        else:
            # Original note: format_content removes superfluous characters.
            before = util.format_content(tds.eq(start_index + 1).text())
            after = util.format_content(tds.eq(start_index + 2).text())
            date = tds.eq(start_index + 3).text()

        return {
            GsModel.ChangeRecords.CHANGE_ITEM: item,
            GsModel.ChangeRecords.BEFORE_CONTENT: before,
            GsModel.ChangeRecords.AFTER_CONTENT: after,
            # The date cell text is stored as-is; no reformatting here.
            GsModel.ChangeRecords.CHANGE_DATE: date,
        }
    def get_change_info(self, change_info):
        """Collect change records from the 'data' array of each JSON page.

        A page is skipped when its text is None, when util.json_loads gives
        None for it, or when its 'data' entry is None.

        :param change_info: handed unchanged to self.get_crawl_page(change_info, True)
        :return: dict with the (possibly empty) record list under
                 GsModel.CHANGERECORDS
        """
        records = []
        for page in self.get_crawl_page(change_info, True):
            text = page.get('text')
            parsed = util.json_loads(text) if text is not None else None
            rows = parsed.get('data', []) if parsed is not None else None
            if rows is None:
                continue

            for row in rows:
                record = {
                    GsModel.ChangeRecords.CHANGE_ITEM: row.get('altContent'),
                    # Original note: format_content removes superfluous characters.
                    GsModel.ChangeRecords.BEFORE_CONTENT:
                        util.format_content(row.get('altBeContent')),
                    GsModel.ChangeRecords.AFTER_CONTENT:
                        util.format_content(row.get('altAfContent')),
                    GsModel.ChangeRecords.CHANGE_DATE: row.get('altDate'),
                }
                records.append(self.replace_none(record))

        return {GsModel.CHANGERECORDS: records}
    def get_change_info(self, change_info):
        """Collect change records from the 'data' array of a single JSON page.

        :param change_info: handed unchanged to self.get_crawl_page(change_info)
        :return: dict with the record list under GsModel.CHANGERECORDS; an
                 empty dict when the page text is None or empty, does not
                 parse (util.json_loads gives None), or has 'data' set to None
        """
        page_text = self.get_crawl_page(change_info)
        if page_text is None or page_text == u'':
            return {}

        parsed = util.json_loads(page_text)
        if parsed is None:
            return {}

        rows = parsed.get('data', [])
        if rows is None:
            return {}

        records = []
        for row in rows:
            item = row.get('bcsxmc', '')
            # Original note: format_content removes superfluous characters.
            before = util.format_content(row.get('bcnr', ''))
            after = util.format_content(row.get('bghnr', ''))
            # The date is stored as received; nothing reformats it here.
            date = row.get('hzrq', '')
            records.append(replace_none({
                GsModel.ChangeRecords.CHANGE_ITEM: item,
                GsModel.ChangeRecords.BEFORE_CONTENT: before,
                GsModel.ChangeRecords.AFTER_CONTENT: after,
                GsModel.ChangeRecords.CHANGE_DATE: date,
            }))

        return {GsModel.CHANGERECORDS: records}
示例#10
0
 def _get_change_info_2_model(self, obj):
     """Map one raw change entry onto the change-record schema.

     :param obj: mapping read through .get() for the keys 'altFiledName',
         'altBe', 'altAf' and 'altDate'
     :return: change-record dict keyed by the GsModel.ChangeRecords fields
     """
     item = obj.get(u'altFiledName', u'')
     # Original note: format_content removes superfluous characters.
     before = util.format_content(obj.get(u'altBe', u''))
     after = util.format_content(obj.get(u'altAf', u''))
     # Falls back to 0 when the entry carries no date; no reformatting here.
     date = obj.get(u'altDate', 0)
     return {
         GsModel.ChangeRecords.CHANGE_ITEM: item,
         GsModel.ChangeRecords.BEFORE_CONTENT: before,
         GsModel.ChangeRecords.AFTER_CONTENT: after,
         GsModel.ChangeRecords.CHANGE_DATE: date,
     }
    def get_change_info(self, change_info):
        """Build the change-record section from the crawled JSON pages.

        :param change_info: page-store dict containing two lists, 'list' and
            'detail', whose items are crawled page data. At least one of the
            two lists is always present, otherwise the packet is considered
            invalid. 'list' generally holds the paginated list pages and
            'detail' holds the detail pages of list items. For the exact
            structure see the mongodb page store or the Model comments in
            common/global_field.py. Change info normally only has the 'list'
            list, but a 'detail' list can occur in special cases (it has been
            observed for the Beijing province).
        :return: business-registration schema dict; the records are stored
            under GsModel.CHANGERECORDS only when at least one was collected
        """
        change_info_dict = {}
        page_list = self.get_crawl_page(change_info, True)
        if page_list is None:
            return change_info_dict

        change_record_list = []
        for page in page_list:
            text = page.get('text')
            native_json = util.json_loads(text)
            if native_json is None:
                continue

            json_data_arr = native_json.get('data', [])
            if json_data_arr is None:
                # 'data' present but null: nothing to iterate on this page.
                continue

            for data in json_data_arr:
                change_model = {
                    GsModel.ChangeRecords.CHANGE_ITEM:
                    data.get('CHANGE_ITEM_NAME'),
                    # Original note: format_content removes superfluous characters.
                    GsModel.ChangeRecords.BEFORE_CONTENT:
                    util.format_content(data.get('OLD_CONTENT')),
                    GsModel.ChangeRecords.AFTER_CONTENT:
                    util.format_content(data.get('NEW_CONTENT')),
                    # The date is stored as received; nothing reformats it here.
                    GsModel.ChangeRecords.CHANGE_DATE:
                    data.get('CHANGE_DATE')
                }
                # NOTE(review): bu_ding is an external helper applied to every
                # record; its effect is not visible here - confirm.
                change_model = bu_ding(change_model)
                change_record_list.append(change_model)

        if len(change_record_list) > 0:
            change_info_dict[GsModel.CHANGERECORDS] = change_record_list

        return change_info_dict
    def get_change_info_page(self, tds, start_index=1, detail=None):
        """Turn the cells of one table row into a change-record dict.

        When a link inside the row carries a non-blank 'onclick' attribute,
        the before/after contents come from self.get_change_info_detail and
        the date is read one cell earlier; otherwise all values are read from
        the row's own cells.

        :param tds: the row's cell collection; None yields an empty dict
        :param start_index: index of the cell holding the change item
        :param detail: forwarded to self.get_change_info_detail
        :return: change-record dict keyed by the GsModel.ChangeRecords fields
        """
        if tds is None:
            return {}

        onclick = tds.find('a').attr('onclick')
        has_detail_link = onclick is not None and onclick.strip() != ''

        if has_detail_link:
            before, after = self.get_change_info_detail(onclick.strip(), detail)
            date_index = start_index + 2
        else:
            before = util.format_content(tds.eq(start_index + 1).text())
            after = util.format_content(tds.eq(start_index + 2).text())
            date_index = start_index + 3

        return {
            GsModel.ChangeRecords.CHANGE_ITEM: tds.eq(start_index).text(),
            GsModel.ChangeRecords.BEFORE_CONTENT: before,
            GsModel.ChangeRecords.AFTER_CONTENT: after,
            GsModel.ChangeRecords.CHANGE_DATE: tds.eq(date_index).text(),
        }
    def get_change_info(self, page):
        """Collect change records from the 'bgsx' array of a JSON page.

        :param page: JSON text handed to util.json_loads
        :return: dict with the (possibly empty) record list under
                 GsModel.CHANGERECORDS; an empty dict when the page does not
                 parse or its 'bgsx' entry is None
        """
        change_info_dict = {}
        lst_change_records = []

        json_data = util.json_loads(page)
        if json_data is None:
            # Unparseable page: bail out instead of failing on .get().
            return {}

        # A missing key means "no records", so default to an empty list.
        json_data_arr = json_data.get('bgsx', [])
        if json_data_arr is None:
            return {}

        for json_data_item in json_data_arr:
            change_model = {
                GsModel.ChangeRecords.CHANGE_ITEM:
                json_data_item.get('altitem'),
                GsModel.ChangeRecords.BEFORE_CONTENT:
                util.format_content(json_data_item.get('altbe')),
                GsModel.ChangeRecords.AFTER_CONTENT:
                util.format_content(json_data_item.get('altaf')),
                GsModel.ChangeRecords.CHANGE_DATE:
                json_data_item.get('altdate')
            }
            change_model = replace_none(change_model)
            lst_change_records.append(change_model)
        change_info_dict[GsModel.CHANGERECORDS] = lst_change_records
        return change_info_dict