Example #1
def main():
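    """Exercise the Thrift entity-extraction service end to end.

    Registers a topic (id 155) and an extractor, reloads the topic so the
    extractor is picked up, sends one PageParseInfo request and prints the
    returned entities. json and the Thrift structs (ThriftEntityExtractor,
    BaseInfo, CrawlInfo, ExtractInfo, PageParseInfo) are assumed to be
    imported at module level; those imports are omitted in this snippet.
    """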
    try:
        client = ThriftEntityExtractor(port=12500)

        with open('extractors/owx/宝应.txt', 'r') as f:
            html_table = f.read()

        print "client - start"
        extractor_info = {"topic_id": 155, "target_dir_name": "test", "extractor_name": "测试"}
        primary_keys = json.dumps([["title"]])
        schema = json.dumps({"type": "object", "title": "百度搜索", "description": "百度搜索",
                             "properties": {"keyword": {"type": "string", "title": "搜索公司"},
                                            "title": {"type": "string", "title": "标题"},
                                            "url": {"type": "string", "title": "来源url"},
                                            "abstract": {"type": "string", "title": "抽象"}}})
        topic_info = {"id": 155, "name": "测试动态加载解析器", "schema": schema, "primary_keys": primary_keys, "table_name":"test"}
        topic_info = json.dumps(topic_info)
        # Register the topic and its extractor, then reload topic 155 so the
        # newly added extractor is picked up
        resp = client.add_topic(topic_info)
        # print resp.msg
        extractor_info = json.dumps(extractor_info)
        resp = client.add_extractor(extractor_info)
        resp = client.reload(155)

        # Empty extract_data payload for this test request
        extract_data = json.dumps({})
        base_info = BaseInfo(url="", site_id=1)
        extract_info = ExtractInfo(ex_status=2, extract_data=extract_data, topic_id=155)

        crawl_info = CrawlInfo(download_time=1474547589)
        req = PageParseInfo(base_info=base_info, crawl_info=crawl_info, extract_info=extract_info, scheduler="a",
                            parse_extends="b", data_extends="c")


        resp = client.entity_extract(req)
        print resp.entity_data_list


    # Catch Thrift transport/application errors
    except Thrift.TException, ex:
        print "%s" % ex.message
Example #2
def main(obj):
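    """Replay documents from the baidu_news MongoDB collection through obj.do_merge.

    Each document is wrapped in a PageParseInfo (its '_src' url becomes the
    base_info url), merged via obj.do_merge, and the merged result is printed
    field by field. Stops after 100 documents.
    """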
    import json
    import traceback
    from pymongo import MongoClient
    host = '101.201.102.37'
    port = 28019
    database = 'final_data'
    coll = 'baidu_news'
    client = MongoClient(host, port)
    db = client[database][coll]
    cursor = db.find()
    num = 0
    for item in cursor:
        try:
            num += 1
            item.pop('_id')

            src_url = item.get('_src')[0]['url']

            extract_data = item
            data = json.dumps(extract_data)
            extract_info = ExtractInfo(ex_status=2, extract_data=data)
            base_info = BaseInfo(url=src_url)
            crawl_info = CrawlInfo(download_time=1474547589)
            parser_info = PageParseInfo(base_info=base_info,
                                        extract_info=extract_info,
                                        crawl_info=crawl_info)
            data = obj.do_merge(parser_info, item)
            print src_url

            for key, value in data.items():
                if isinstance(value, list):
                    for i in value:
                        print key, ":", i
                elif isinstance(value, dict):
                    for key2, value2 in value.items():
                        print key2, ":", value2
                else:
                    print key, ":", value
            if num % 100 == 0:
                break
        except Exception:
            print traceback.format_exc()
Example #3
    def get_entity_extractor_info(self, company, base_info_url, in_time, model, topic, year=None):
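        """Wrap an extracted company record into a PageParseInfo.

        Cleans the model, stamps provenance ('_src') and a stable
        '_site_record_id' derived from the company name (and year, if given),
        and returns a PageParseInfo whose extract_info carries the record as
        JSON under the given topic id.
        """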

        # Strip keys whose value is None from the model
        store_model = util.del_none(model)

        base_url = base_info_url.encode('utf-8')
        # Normalize parenthesis style in the company name so the md5 record id
        # is stable. The original half-width/full-width literals were lost in
        # extraction; the full-width -> half-width direction here is an assumption.
        replace_company = company.replace(u'（', u'(').replace(u'）', u')')
        if year is None:
            record = '|' + replace_company
        else:
            record = '|' + replace_company + '|' + str(year)
        _site_record_id = tools.get_md5(record)

        if year is None:
            self.log.info('company = {company} record_id = {record_id} url = {url}'.
                          format(company=company, record_id=_site_record_id,
                                 url=base_url))
        else:
            self.log.info('company = {company} year = {year} record_id = {record_id} url = {url}'.
                          format(company=company, record_id=_site_record_id,
                                 url=base_url, year=year))

        store_model['_src'] = []
        store_model['_src'].append({'url': base_url, 'site': self.host, 'download_time': in_time})
        store_model['_site_record_id'] = _site_record_id

        extract_info = ExtractInfo()
        extract_info.ex_status = ExStatus.kEsSuccess
        extract_info.extract_data = json.dumps(store_model)
        extract_info.topic_id = topic

        crawl_info = CrawlInfo()
        crawl_info.content = ""
        crawl_info.download_time = in_time

        url_info = get_url_info(base_url)

        base_info = BaseInfo()
        base_info.site = url_info.get('site', '')
        base_info.url = url_info.get('url', '')
        base_info.site_id = url_info.get('site_id', 0)
        base_info.url_id = url_info.get('url_id', 0)

        return PageParseInfo(extract_info=extract_info, crawl_info=crawl_info, base_info=base_info)
Example #4
            "key": "主承销商",
            "value": "广发证券股份有限公司"
        }, {
            "key": "上市推荐人",
            "value": ""
        }, {
            "key": "保荐机构",
            "value": "广发证券股份有限公司"
        }, {
            "key": "股票代码",
            "value": "300599"
        }]
    }
    data = json.dumps(extract_data)
    extract_info = ExtractInfo(ex_status=2, extract_data=data)
    base_info = BaseInfo(url=src_url)
    parser_info = PageParseInfo(base_info=base_info, extract_info=extract_info)
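    # Extract entities from the parse info, then apply the post-extraction hook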
    entity_data = obj.entity_extract(parser_info, extract_data)

    entity_data = obj.after_extract(base_info.url, entity_data, extract_data)

    for key, value in entity_data.items():
        if isinstance(value, list):
            for i in value:
                print key, ":", i
        elif isinstance(value, dict):
            for key2, value2 in value.items():
                print key2, ":", value2
        else:
            print key, ":", value
Example #5
    def get_page_parseinfo(self):
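        """Build a fully populated PageParseInfo fixture for tests.

        Fills BaseInfo, CrawlInfoOld and ExtractInfo with placeholder values
        (including a few sample outlinks) and returns the assembled
        PageParseInfo.
        """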
        url = 'http://www.baidu.com'
        url_id = 0
        site = 'www.baidu.com'
        site_id = 0
        domain = None
        domain_id = 0
        segment_id = 0
        src_type = 'test src_type'
        base_info = BaseInfo(url=url,
                             url_id=url_id,
                             site=site,
                             site_id=site_id,
                             domain=domain,
                             domain_id=domain_id,
                             segment_id=segment_id,
                             src_type=src_type)

        status_code = 0
        http_code = 0
        download_time = 0
        redirect_url = 'test redirect_url'
        elapsed = 0
        content_type = 'test content_type'
        content = 'test content1'
        page_size = 0

        crawl_info = CrawlInfoOld(status_code=status_code,
                                  http_code=http_code,
                                  download_time=download_time,
                                  redirect_url=redirect_url,
                                  elapsed=elapsed,
                                  content_type=content_type,
                                  content=content,
                                  page_size=page_size)

        ex_status = ExStatus.kEsSuccess
        extract_error = ExFailErrorCode.KExFailPageTranscoding
        redirect_url = 'test redirect_url'
        next_page_type = True
        struct_type = 0
        compose_type = 0
        content_type = 0
        topic_id = 0
        extracted_body_time = 0
        content_time = 0
        html_tag_title = 'test html_tag_title'
        analyse_title = 'test analyse_title3'
        zone = 'test zone'
        page_text = 'test page_text'
        content_language = 'test content_language'
        second_navigate = 'test second_navigate'
        valid_pic_url = 'test valid_pic_url'
        digest = 'test digest'
        finger_feature = 'test finger_feature'
        content_finger = 0
        simhash_finger = 0
        link_finger = 0

        link1 = Link(url='http://www.baidu.com/url3', type=0)
        link2 = Link(url='http://www.baidu.com/url2')
        link3 = Link(url='http://www.baidusdf.com/url5', type=0)
        link4 = Link(url='http://www.baidusdf.com/url6', type=2)
        links = [link1, link2, link3, link4]
        extract_data = 'test extract_data'

        extract_info = ExtractInfo(ex_status=ex_status,
                                   extract_error=extract_error,
                                   redirect_url=redirect_url,
                                   next_page_type=next_page_type,
                                   struct_type=struct_type,
                                   compose_type=compose_type,
                                   content_type=content_type,
                                   topic_id=topic_id,
                                   extracted_body_time=extracted_body_time,
                                   content_time=content_time,
                                   html_tag_title=html_tag_title,
                                   analyse_title=analyse_title,
                                   zone=zone,
                                   page_text=page_text,
                                   content_language=content_language,
                                   second_navigate=second_navigate,
                                   valid_pic_url=valid_pic_url,
                                   digest=digest,
                                   finger_feature=finger_feature,
                                   content_finger=content_finger,
                                   simhash_finger=simhash_finger,
                                   link_finger=link_finger,
                                   links=links,
                                   extract_data=extract_data)

        parse_extends = 'b'
        data_extends = 'c'
        scheduler = 'd'

        page_parseinfo = PageParseInfo(base_info=base_info,
                                       crawl_info=crawl_info,
                                       extract_info=extract_info,
                                       parse_extends=parse_extends,
                                       data_extends=data_extends,
                                       scheduler=scheduler)
        return page_parseinfo