Example #1
import json
import time

# DownLoadReq, SessionCommit and index_queue come from the surrounding
# crawler codebase (Thrift-generated request types and a priority queue).
def construct_downloader_req(urls):
    i = 0

    for url in urls:
        i = i + 1
        print i
        # print url
        download_req = DownLoadReq()
        download_req.method = 'get'
        download_req.url = url['url']
        download_req.http_header = {}
        download_req.session_commit = SessionCommit()
        download_req.session_commit.refer_url = ""
        download_req.session_commit.identifying_code_url = ""
        download_req.session_commit.identifying_code_check_url = ""
        download_req.session_commit.check_body = ""
        download_req.session_commit.check_body_not = ""
        download_req.session_commit.session_msg = {}
        download_req.session_commit.need_identifying = False
        scheduler_info = {}
        download_req.scheduler = json.dumps(scheduler_info)
        download_req.use_proxy = False
        download_req.src_type = "seed"
        if download_req.url is not None:
            download_req.download_type = 'simple'
            priority_key = str(time.time())
            index_queue.put((priority_key, download_req))
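A minimal sketch of driving this function, assuming index_queue is a Queue.PriorityQueue built elsewhere in the crawler (only its put() call appears above):

import Queue  # Python 2 standard library

index_queue = Queue.PriorityQueue()  # assumption: the snippet only shows put()
construct_downloader_req([{'url': 'http://example.com/a'},
                          {'url': 'http://example.com/b'}])
priority_key, download_req = index_queue.get()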
Example #2
# Method of a downloader client wrapper: self.transport and self.client are
# Thrift transport and service-client objects (see the sketch below).
def download(self, url, req=None):
    rsp = None
    self.transport.open()
    try:
        if req is None:
            req = DownLoadReq()
        if url is not None:
            req.url = url
        rsp = self.client.download(req)
    finally:
        self.transport.close()
    return rsp
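Example #2 does not show how self.transport and self.client are created. A minimal sketch of the usual Thrift wiring, assuming the generated service module is called DownloadService (a hypothetical name, not taken from the source):

from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
from download_service import DownloadService  # hypothetical generated module


class DownloaderClient(object):
    def __init__(self, host='127.0.0.1', port=8088):
        socket = TSocket.TSocket(host, port)
        self.transport = TTransport.TBufferedTransport(socket)
        protocol = TBinaryProtocol.TBinaryProtocol(self.transport)
        self.client = DownloadService.Client(protocol)

    # the download() method from Example #2 would live here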
Example #3
import json
import time

def create_download_req(url,
                        method='simple',
                        parser_id="-1",
                        http_method="get"):
    download_req = DownLoadReq()
    download_req.url = url
    download_req.post_data = {}
    download_req.src_type = 'linkbase'
    download_req.download_type = method
    download_req.parse_extends = json.dumps({"parser_id": parser_id})
    download_req.method = http_method
    scheduler_info = {}
    scheduler_info["schedule_time"] = time.time()
    download_req.scheduler = json.dumps(scheduler_info)
    return download_req
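A request built this way can be handed straight to a downloader client such as the one in Example #2; a minimal usage sketch, where downloader_client is a hypothetical instance of that wrapper:

req = create_download_req('http://example.com/page', parser_id='42')
rsp = downloader_client.download(req.url, req)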
Example #4
    # Fragment of a standalone test script: host, http_header, client and
    # transport are defined earlier in the original file, inside the try
    # block whose except clause appears at the end.
    port = 8088
    user = '******'
    password = '******'
    proxy = Proxy(host=host, port=port, user=user, password=password)
    kw = {
        'refer_url': 'http://wsgs.fjaic.gov.cn/creditpub/home',
        'session_msg': {
            'session.token': 'session.token'
        },
    }
    url = 'http://wsgs.fjaic.gov.cn/creditpub/search/ent_info_list'
    post_data = {'searchType': '1', 'captcha': ''}
    #    session_commit=SessionCommit(**kw)
    req = DownLoadReq(url=url, method='get', download_type='simple')
    #req.proxy=proxy
    req.priority = 0
    req.time_out = 30
    req.http_header = http_header
    req.retry_times = 1
    req.post_data = post_data
    #    req.session_commit=session_commit
    for i in range(1, 199):
        req.url = 'http://data.eastmoney.com/Notice_n/Noticelist.aspx?type=&market=hk&code=01224&date=&page=%s' % (
            str(i))
        res = client.download(req)
        time.sleep(2)
        print res
    transport.close()
# iutl changed the log; i-config changed the mysql password; phantomjs changed the log
except Thrift.TException, tx:
    print '%s' % (tx.message)
Example #5
# Flask view that tests a parser configuration end to end: download the page,
# run the extractor, the entity extractor and the schema check. request,
# jsonify and current_app come from Flask; DownLoadReq, DownLoadRsp and the
# helper functions belong to the surrounding crawler service.
def test_parser_config():
    if not request.json:
        return jsonify({'status': 'failed', 'data': 'request error'})
    req_datas = request.json
    download_req = DownLoadReq()
    download_req.method = req_datas.get('request_method', 'get')
    download_req.url = req_datas.get('request_url')
    download_req.download_type = req_datas.get('download_type')
    download_req.post_data = {}
    download_req.http_header = {}
    try:
        download_req.http_header = json.loads(req_datas.get('headers'))
    except Exception as e:
        download_req.http_header = None
    post_data = None
    try:
        post_data = json.loads(req_datas.get('request_params'))
    except Exception as e:
        pass
    parser_id = req_datas.get('parser_id', "-1")
    page_source = req_datas.get('page_source', '').strip()

    if page_source not in ['cache', 'downloader', 'pagedb', 'input']:
        page_source = 'cache'
    hz_url = download_req.url
    if post_data and download_req.method == "post":
        hz_url = build_hzpost_url(download_req.url, post_data)
        download_req.url = hz_url
    spend_time = {}
    try:
        page_id = get_md5_i64(hz_url)
        download_rsp = None
        stime = time.time()

        if page_source == 'pagedb':
            download_rsp = current_app.config['crawler_merge'].select_one(
                hz_url)
            if download_rsp.status == 1:
                download_rsp = None
        elif page_source == 'cache':
            download_rsp = get_page_cache(page_id)
        elif page_source == 'input':
            download_rsp = DownLoadRsp()
            download_rsp.url = hz_url
            download_rsp.status = 0
            download_rsp.content_type = "text"
            download_rsp.http_code = 200
            download_rsp.download_time = 0
            download_rsp.content = req_datas.get('input_page',
                                                 "").encode('utf8')
            download_rsp.src_type = "input"
            download_rsp.elapsed = 50
        if not download_rsp:
            downloader = current_app.config['downloader']
            download_rsp = downloader.download(hz_url, download_req)
            download_rsp.url = hz_url
        spend_time['download_spend'] = (time.time() - stime) * 1000
        set_page_cache(page_id, download_rsp)
        is_save = req_datas.get('is_save', 'false')
        if is_save == "true":
            download_rsp.parse_extends = json.dumps({'parser_id': parser_id})
            download_rsp_tube = current_app.config[
                'put_beanstald_server'].get_tube_by_name('download_rsp_tube')
            if download_rsp_tube:
                current_app.config['put_beanstald_server'].save_record({
                    'tube_name': download_rsp_tube,
                    'obj': download_rsp
                })
        # Copy download_rsp so concurrent threads cannot modify the shared object
        download_rsp = deepcopy(download_rsp)
        download_rsp.parse_extends = json.dumps({
            "parser_id": parser_id,
            "debug": True
        })
        extractor = current_app.config['extractor']
        stime = time.time()
        extract_rsp = extractor.extract(download_rsp)
        spend_time['extract_spend'] = (time.time() - stime) * 1000
        # entity extraction result list
        entity_datas = None
        # schema check result
        schema_check_result = None
        entity_rsps = None
        cur_datetime = str(datetime.datetime.now())
        try:
            stime = time.time()
            extract_data = extract_rsp.extract_info.extract_data
            if extract_data:
                extract_data_dict = json.loads(extract_data)
                _src = {
                    "url": extract_rsp.base_info.url,
                    "site_id": extract_rsp.base_info.site_id,
                    "site": extract_rsp.base_info.site
                }
                if "datas" in extract_data_dict:
                    datas = extract_data_dict['datas']
                    tmp_datas = []
                    for d in datas:
                        d['_src'] = [_src]
                        tmp_datas.append(d)
                    extract_data_dict['datas'] = tmp_datas
                else:
                    extract_data_dict['_src'] = [_src]
                extract_rsp.extract_info.extract_data = json.dumps(
                    extract_data_dict)
                entity_rsps = current_app.config[
                    'entity_extractor'].entity_extract(extract_rsp)
                spend_time['entity_spend'] = (time.time() - stime) * 1000
                entity_datas = []
                for data in entity_rsps.entity_data_list:
                    if data:
                        entity_datas.append(json.loads(data.entity_data))
                    else:
                        entity_datas.append(None)
        except Exception as e:
            if entity_rsps:
                entity_datas = {
                    'sys_error': e.message,
                    'error_message': entity_rsps.msg
                }
            else:
                entity_datas = {'sys_error': e.message}
        final_data = {}
        try:
            if entity_rsps.entity_data_list:
                entity_json = {
                    "topic_id": entity_rsps.entity_data_list[0].topic_id,
                    "data":
                    json.loads(entity_rsps.entity_data_list[0].entity_data)
                }
                datasaver_resp = current_app.config['data_saver'].check_data(
                    json.dumps(entity_json))
                final_data = json.loads(datasaver_resp.data)
        except Exception as e:
            final_data = {'sys_error': e.message}
        return jsonify({
            'status': True,
            'data': build_test_parser_config_rsp(extract_rsp, entity_datas,
                                                 final_data, spend_time)
        })
    except Exception as e:
        current_app.config['logger'].error(hz_url)
        current_app.config['logger'].info(traceback.format_exc())
        return jsonify({'status': False, 'data': e.message})
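A sketch of exercising this view with the fields it reads from request.json; the route path and port are assumptions, since the @app.route decorator is not part of the snippet:

import requests

payload = {
    'request_method': 'get',
    'request_url': 'http://example.com/list',
    'download_type': 'simple',
    'headers': '{"User-Agent": "Mozilla/5.0"}',  # parsed via json.loads above
    'request_params': '{}',                      # parsed via json.loads above
    'parser_id': '42',
    'page_source': 'downloader',  # one of: cache, downloader, pagedb, input
    'is_save': 'false',
}
resp = requests.post('http://localhost:5000/test_parser_config', json=payload)
print resp.json()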