Example #1
    def __init__(self, conf):
        self.conf = conf
        self.log = log
        self.parser_tool = parser_tool
        self.route = EntityExtractorRoute(conf)
        self.topic_manager = TopicManager(conf)
        self.validate_manager = ValidateManager(self.topic_manager, conf,
                                                'all')
        self.count = 0
Example #2
if __name__ == '__main__':
    import pytoml
    import sys

    sys.path.append('../../')
    from conf import get_config

    with open('../../entity.toml', 'rb') as config:
        config = pytoml.load(config)
    conf = get_config(config)
    import common

    topic_id = 32
    from entity_extractor_route import EntityExtractorRoute

    route = EntityExtractorRoute(conf)
    topic_info = route.all_topics.get(topic_id, None)

    obj = ListingEventsExtractor(topic_info, common.log)
    extract_data = {
        "_site_record_id": "http://www.pedata.cn/ipo/321436101.html",
        "accounting_firm": "大华会所",
        "enterprise_full_name": "广东芳源环保股份有限公司",
        "equity": "31,000,000",
        "exchanges": "全国中小企业股份转让系统(新三板)",
        "industry": "互联网 电商",
        "law_firm": "广东华商律所",
        "lead_underwriter": "华创证券",
        "market_date": "2016-10-21",
        "site_url": "http://www.pedata.cn/ipo/321436101.html",
        "source_site": "私募通",
Example #3
if __name__ == '__main__':
    import pytoml
    import sys

    sys.path.append('../../')
    from conf import get_config
    from bdp.i_crawler.i_extractor.ttypes import BaseInfo, CrawlInfo, ExtractInfo, PageParseInfo

    with open('../../entity.toml', 'rb') as config:
        config = pytoml.load(config)
    conf = get_config(config)
    import common

    topic_id = 102
    from entity_extractor_route import EntityExtractorRoute

    route = EntityExtractorRoute(conf)
    topic_info = route.all_topics.get(topic_id, None)
    obj = SsggCaibaoExtractor(topic_info, common.log)
    src_url = ""
    extract_data = {
        "code":
        "szcn300599",
        "info": [{
            "key": "公司全称",
            "value": "广东雄塑科技集团股份有限公司"
        }, {
            "key": "英文名称",
            "value": "Guangdong Xiongsu Technology Group Co., Ltd."
        }, {
            "key": "注册地址",
            "value": "广东省佛山市南海区九江镇龙高路敦根路段雄塑工业园"
Example #4
            "sex_female": female_rate,
            "province_rank": extract_data.get("province_rank"),
        }

        return entity_data


if __name__ == "__main__":
    import conf
    import json
    from bdp.i_crawler.i_extractor.ttypes import BaseInfo, CrawlInfo, ExtractInfo, PageParseInfo
    from common_parser_lib.parser_tool import ParserTool
    from entity_extractor_route import EntityExtractorRoute

    topic_id = 56
    parser_tool = ParserTool(conf)
    route = EntityExtractorRoute(conf, parser_tool)
    topic_info = route.all_topics.get(topic_id, None)
    obj = BaiDuIndexExtractor(conf.log, topic_info, parser_tool)
    extract_data = {}
    src_url = "www.baidu.com"
    data = json.dumps({})
    extract_info = ExtractInfo(ex_status=2, extract_data=data)
    base_info = BaseInfo(url=src_url)
    parser_info = PageParseInfo(base_info=base_info, extract_info=extract_info)
    data = obj.entity_extract(parser_info, extract_data)
    print src_url

    for key, value in data.items():
        if isinstance(value, list):
            for i in value:
Example #5
class EntityExtractor(object):

    validator_used = ['pk', 'required_attr', 'jsonschema']
    all_validators = ['meta', 'pk', 'jsonschema']

    def __init__(self, conf):
        self.conf = conf
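        # NOTE: `log` and `parser_tool` are not constructor arguments; they appear to be
        # module-level globals in this file (an assumption based on this fragment)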
        self.log = log
        self.parser_tool = parser_tool
        self.route = EntityExtractorRoute(conf)
        self.topic_manager = TopicManager(conf)
        self.validate_manager = ValidateManager(self.topic_manager, conf,
                                                'all')
        self.count = 0

    def reload(self):
        self.topic_manager.reload(-1)
        return True

    # TODO: change the extractor route to use the common topic manager; this is temporary code
    def add_topic(self, topic_info):
        resp = self.route.add_topic(topic_info)
        self.topic_manager.reload(-1)
        return resp

    def insert_extractor(self, extractor_info):
        resp = self.route.insert_extractor(extractor_info)
        self.topic_manager.reload(-1)
        return resp

    def process_json(self, j, topic_id):
        """
        消息队列处理
        :param j:  json data
        :param topic_id: topic_id
        :return: [{topic_id:主题ID(int), data:{解析结果(json)}}]
        """
        result_list = []
        try:
            extractor = self.route.get_extractor(topic_id)
            formatted_json = extractor.process_json(j, topic_id)
            if formatted_json is None:
                return None

            def process_single_result(single_result):
                after_process_json = extractor.after_process(single_result)
                if after_process_json is None:
                    return None
                else:
                    result_list.append({
                        "topic_id": topic_id,
                        "data": after_process_json
                    })

            if isinstance(formatted_json, list):
                for entity_data in formatted_json:
                    process_single_result(entity_data)
            else:
                process_single_result(formatted_json)

        except Exception:
            self.log.error("extract_error\tmsg:%s" % (traceback.format_exc()))
        return result_list
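
For context, here is a minimal usage sketch of `EntityExtractor.process_json`, modeled on the `__main__` blocks in the examples above. The `entity.toml` path and `topic_id` mirror those examples; the input message `msg` is a hypothetical placeholder, since the real queue message schema is not shown in these fragments, and the module-level `log` and `parser_tool` used by `__init__` are assumed to be set up elsewhere in the module.

if __name__ == '__main__':
    import pytoml
    import sys

    sys.path.append('../../')
    from conf import get_config

    with open('../../entity.toml', 'rb') as config:
        config = pytoml.load(config)
    conf = get_config(config)

    extractor = EntityExtractor(conf)

    topic_id = 56
    msg = {}  # hypothetical queue message (JSON-decoded dict); real schema not shown above
    results = extractor.process_json(msg, topic_id)

    # process_json returns [{"topic_id": <int>, "data": <parsed entity json>}],
    # or None when the topic's extractor produces nothing for this message
    if results:
        for result in results:
            print result["topic_id"], result["data"]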