Example No. 1
def get_points():

    index_num = 0
    global train_stations
    url = "https://apis.map.qq.com/jsapi?qt=poi&wd="
    try:
        for station in train_stations:
            # '火车站' ("railway station") is appended to the station name for the POI lookup
            text = crawl(url + station.name + '火车站')
            json_train_station_msg = json.loads(text)
            if 'pois' in json_train_station_msg['detail']:
                point = json_train_station_msg['detail']['pois'][0]
            else:
                # no POI found: drop this station from the global list and move on
                train_stations = train_stations[index_num + 1:]
                continue
            if index_num == 42:  # debugging hook
                print("")
            point_info = {
                'point_x': point['pointx'],
                'point_y': point['pointy'],
                'station_id': station.id,
                'station_name': station.name
            }
            res = create_point_msg_info(**point_info)
            index_num += 1
            print(index_num)
            if not res['success']:
                logger.warning(res['msg'])
            else:
                logger.critical('Saved record #' + str(res['point'].id))

    except Exception as e:
        # trim the global list so a later retry resumes from the failing station
        train_stations = train_stations[index_num:]
        logger.warning(e)
        logger.warning("Exception; last station crawled: " + str(station.name))
        time.sleep(10 * CRAWL_INTERVAL)
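
All of the examples on this page call a shared crawl() helper that is not shown here. A minimal sketch of what it might look like, assuming it simply fetches the URL with requests and returns the response text (or an empty string on failure); the real helper may add headers, retries, or proxies:

import requests

def crawl(url, timeout=10):
    # Hypothetical helper: fetch the page and return its text, '' on any error.
    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding  # guard against mis-detected encodings on Chinese pages
        return resp.text
    except requests.RequestException:
        return ''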
Example No. 2
def gen_train_num(num_static, count_num_static):
    url = "https://search.12306.cn/search/v1/h5/search?callback=jQuery19108124885820364023_1567759292307&keyword="
    tran_num = TRAIN_NUM_HEAD
    # tran_num = "K"
    global sign
    global count_num_static1
    global num_static1
    num = num_static
    count_num = count_num_static
    if num >= 10000:
        sign = "station_num_relation"
    while num < 10000:
        try:
            tran_num_u = tran_num + str(num)

            text = crawl(url + tran_num_u)

            if not text:
                count_num_static1 = count_num
                num_static1 = num
                logger.info("中断 已爬取到车次:" + str(tran_num_u) + "数据主键已经到" + str(count_num))
                break
            # text = crawl("https://search.12306.cn/search/v1/h5/search?callback=jQuery110201481886827579022_1567752183819&keyword=" + tran_num_u + "&suorce=&action=&_=1567752183845")
            # strip the jQuery JSONP wrapper callback(...) and parse the JSON payload
            json_train = json.loads(text[text.find("(") + 1:text.find(")")])
            # print(json_train)

            i = 0
            if json_train['data'] is not None:
                # walk the result list looking for an exact train-code match
                while i < len(json_train['data']):
                    if json_train['data'][i]['params']['station_train_code'] == tran_num_u:
                        info = json_train['data'][i]['params']
                        tran_num_info = {
                            'id': count_num,
                            'total_station_num': info['total_num'],
                            'useful': 'T',
                            'train_no': info['train_no'],
                            'train_code': info['station_train_code'],
                            'from_station': info['from_station'],
                            'to_station': info['to_station']
                        }
                        res = create_train_num(**tran_num_info)
                        logger.critical("已保存成功:" + str(count_num) + "条" + res['msg'])
                        count_num += 1
                    i += 1
            num += 1
        except Exception as e:
            count_num_static1 = count_num - 1
            num_static1 = num
            logger.warning(e)
            logger.warning("异常 已爬取到车次:" + str(tran_num_u) + "数据主键已经到" + str(count_num))
            time.sleep(60 * CRAWL_INTERVAL)
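
The slice in json.loads(text[text.find("(") + 1:text.find(")")]) stops at the first ')' in the response, which only works while the JSON payload itself contains no closing parenthesis. A small helper (not part of the original code) doing the same unwrapping with a regex, for illustration:

import json
import re

def parse_jsonp(text):
    # Strip a jQuery-style JSONP wrapper such as jQuery123(...); and parse the JSON inside.
    match = re.search(r"^[^(]*\((.*)\)\s*;?\s*$", text, re.S)
    return json.loads(match.group(1)) if match else None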
Example No. 3
def extract(w_id):
    try:
        w = get_website(w_id)
        # log(NOTICE, "开始 #{id} {name} {site} ".format(id=w.id, name=w.company.name_cn, site=w.url))

        new_html_content = crawl(w.url)
        if not new_html_content:
            log(NOTICE, "#{id} {name} {site} 抓到更新 0 条".format(id=w.company.id, name=w.company.name_cn, site=w.url))
            return

        if w.html_content:
            old_html_content = w.html_content.content
        else:
            save_html_content(w.id, new_html_content)
            log(NOTICE, "#{id} {name} {site} 抓到更新 0 条".format(id=w.company.id, name=w.company.name_cn, site=w.url))
            return

        diff_text = diff_file(old_html_content, new_html_content)
        if not diff_text:
            log(NOTICE, "#{id} {name} {site} 抓到更新 0 条".format(id=w.company.id, name=w.company.name_cn, site=w.url))
            return

        save_html_content(w.id, new_html_content)

        soup = BeautifulSoup(diff_text, 'lxml')
        items = soup.find_all('a')
        COUNT = 0
        if items:
            for a in items:
                if a.string:
                    url, text = a.get('href'), a.string
                    check_pass = check_content(url, text)
                    if check_pass:
                        url = complement_url(url, w.url)
                        if url:
                            result = save_info_feed(url, text, w.id, w.company.id)
                            if result:
                                COUNT += 1
                            # log(RECORD, "[name] [+] [{url}  {text}]".format(name=w.company.name_cn, url=url, text=text.strip()))
        if COUNT == 0:
            log(NOTICE, "#{id} {name} {site} fetched {count} updates".format(id=w.company.id, name=w.company.name_cn, site=w.url, count=COUNT))
        else:
            log(RECORD, "#{id} {name} {site} fetched {count} updates".format(id=w.company.id, name=w.company.name_cn, site=w.url, count=COUNT))

    except Exception as e:
        try:
            w = get_website(w_id)
            log(ERROR, "#{id} {name} {site} {err}".format(id=w.id, name=w.company.name_cn, site=w.url, err=str(e)))
        except Exception as e:
            log(ERROR, str(e))
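
diff_file() is called here and in the later extract() variants but is not defined on this page. A minimal sketch of what it might do, assuming it returns only the lines that were added in the new HTML (an assumption; the real implementation may differ):

import difflib

def diff_file(old_text, new_text):
    # Hypothetical implementation: collect the lines added in new_text and join
    # them into one string; return '' when nothing changed.
    added = [
        line[2:]
        for line in difflib.ndiff(old_text.splitlines(), new_text.splitlines())
        if line.startswith('+ ')
    ]
    return '\n'.join(added)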
Example No. 4
def gen_station_num_relation():
    index_num = 0
    global train_nums
    global sign
    try:
        if len(train_nums) <= 0:
            sign = "points"
        for train_num in train_nums[:]:
            url = "https://kyfw.12306.cn/otn/queryTrainInfo/query?leftTicketDTO.train_no="
            url_parm_date = "&leftTicketDTO.train_date="
            url_suffix = "&rand_code="
            param_date = datetime.date.today() + datetime.timedelta(days=3)
            text = crawl(url + train_num.train_no + url_parm_date + str(param_date) + url_suffix)
            # if (train_num.train_no == '0300000K4009'):
            json_train_msg = json.loads(text)
            if json_train_msg['data']['data'] is not None:
                for relation in json_train_msg['data']['data']:
                    relation_info = {
                        'arrive_time': relation['arrive_time'],
                        'train_code': relation['station_train_code'],
                        'running_time': relation['running_time'],
                        'start_time': relation['start_time'],
                        'station_name': relation['station_name'],
                        'arrive_day_diff': relation['arrive_day_diff'],
                        'station_no': relation['station_no'],
                        'train_no': train_num.train_no
                    }
                    res = create_train_relation_info(**relation_info)
                    logger.info('this is info message')
                    if not res['success']:
                        logger.warning(res['msg'])
                    else:
                        logger.critical('Saved record #' + str(res['trainNumStationRelation'].id) + ' (train index ' + str(index_num) + ')')
            else:
                train_num.useful = 'F'
                res = train_num_update(**to_dict(train_num))
                if res['success']:
                    logger.critical('Updated status for ' + str(train_num.train_code))

            index_num += 1
    except Exception as e:
        # trim the global list so a later retry resumes from the failing train
        train_nums = train_nums[index_num:]
        logger.warning(e)
        logger.warning("Exception; last train crawled: " + str(train_num.train_code))
        time.sleep(10 * CRAWL_INTERVAL)
Example No. 5
def extract(w_id):
    old_html_content = ''
    try:
        # fetch the website record from the database
        currentWebsite = get_website(w_id)

        # newly crawled content
        websiteContents = crawl(currentWebsite.url)

        if not websiteContents:
            log(NOTICE, "#{id} {name} {site} 没有取得新内容".format(id=currentWebsite.company.id, name=currentWebsite.company.name_cn.encode('utf-8').strip(), site=currentWebsite.url))
            return


        # content already stored in the database
        if currentWebsite.html_content:
            old_html_content = currentWebsite.html_content.content

            # diff the crawled content against the old content
            diff_text = diff_file(old_html_content, websiteContents)

            if not diff_text:
                log(NOTICE, "#{id} {name} {site} no changes; nothing to do".format(
                    id=currentWebsite.company.id,
                    name=currentWebsite.company.name_cn.encode('utf-8').strip(),
                    site=currentWebsite.url))
                return
            else:
                parseAndSave(diff_text, currentWebsite)


        else:
            save_html_content(currentWebsite.id, websiteContents)
            log(NOTICE, "#{id} {name} {site} 保存成功".format(id=currentWebsite.company.id, name=currentWebsite.company.name_cn.encode('utf-8').strip(), site=currentWebsite.url))
            # return
            parseAndSave(websiteContents, currentWebsite)

    except Exception as e:
        print(e)
        try:
            currentWebsite = get_website(w_id)
            print(currentWebsite)

            log(ERROR, "#{id} {name} {site} {err}".format(id=currentWebsite.id, name=currentWebsite.company.name_cn.encode('utf-8').strip(), site=currentWebsite.url, err=str(e)))
        except Exception as e:
            log(ERROR, str(e))
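
parseAndSave() is not defined in this example, but the other extract() variants on this page inline what appears to be the same logic. A hedged reconstruction based on those variants, reusing the same helpers (check_content, complement_url, save_info_feed):

from bs4 import BeautifulSoup

def parseAndSave(html_text, website):
    # Hypothetical reconstruction based on the other extract() examples:
    # collect <a> tags, validate each link, complete relative URLs, and save the feed items.
    soup = BeautifulSoup(html_text, 'lxml')
    for a in soup.find_all('a'):
        if not a.string:
            continue
        url, text = a.get('href'), a.string
        if check_content(url, text):
            url = complement_url(url, website.url)
            if url:
                save_info_feed(url, text, website.id, website.company.id)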
Example No. 6
def gen_station():
    text = crawl("https://www.12306.cn/index/script/core/common/station_name_v10037.js")
    count = 0
    global sign

    # the station list is one '|'-delimited blob; peel off one field at a time
    while len(text) > 7:
        # skip the field that precedes the station name
        text = text[text.find("|") + 1:]

        name = text[0:text.find("|")]
        text = text[text.find("|") + 1:]

        big_abbr = text[0:text.find("|")]
        text = text[text.find("|") + 1:]

        full_pinyin = text[0:text.find("|")]
        text = text[text.find("|") + 1:]

        small_abbr = text[0:text.find("|")]
        text = text[text.find("|") + 1:]

        station_info = {
            'id': count,
            'big_abbr': big_abbr,
            'full_pinyin': full_pinyin,
            'small_abbr': small_abbr,
            'name': name
        }
        count += 1
        res = create_station(**station_info)
        if (not res["success"]):
            logger.warning(res["msg"])
        else:
            logger.info(res["station"].name)
        if (len(text) <=7):
            sign='train_num'
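
For comparison, the same pipe-delimited data could be parsed record by record. A hedged sketch (not the original code), assuming records are separated by '@' and the fields appear in the order the loop above implies:

def parse_stations(text):
    # Hypothetical alternative parse of the same blob: records separated by '@',
    # fields by '|', in the order (code, name, big_abbr, full_pinyin, small_abbr, index).
    stations = []
    for record in text.split('@')[1:]:  # the first chunk is the JS variable prefix
        fields = record.split('|')
        if len(fields) < 5:
            continue
        stations.append({
            'name': fields[1],
            'big_abbr': fields[2],
            'full_pinyin': fields[3],
            'small_abbr': fields[4],
        })
    return stations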
Example No. 7
def extract(w_id):
    """

    :param w_id:
    :return:
    """
    try:
        # every case where no update could be fetched is logged below

        w = get_website(w_id)
        # log(NOTICE, "开始 #{id} {name} {site} ".format(id=w.id, name=w.company.name_cn, site=w.url))
        # Todo 此处尝试调用Scrapy
        new_html_content = crawl(w.url)
        if not new_html_content:
            log(
                NOTICE,
                "#{id} {name} {site} fetched 0 updates".format(
                    id=w.company.id, name=w.company.name_cn, site=w.url))
            return

        # if website 'w' already has html_content, compare it with the new content and only proceed when a diff exists
        if w.html_content:
            old_html_content = w.html_content.content
        else:
            save_html_content(w.id, new_html_content)
            log(
                NOTICE,
                "#{id} {name} {site} fetched 0 updates".format(
                    id=w.company.id, name=w.company.name_cn, site=w.url))
            return
        diff_text = diff_file(old_html_content, new_html_content)
        if not diff_text:
            log(
                NOTICE,
                "#{id} {name} {site} fetched 0 updates".format(
                    id=w.company.id, name=w.company.name_cn, site=w.url))
            return
        save_html_content(w.id, new_html_content)

        # lxml is an HTML parser; html5lib and others can be used instead
        soup = BeautifulSoup(diff_text, 'lxml')
        items = soup.find_all('a')
        COUNT = 0

        # basic logic: collect all <a href> tags, check whether each is valid content,
        # complete the tag's URL, and store it in the info_feed table
        if items:
            for a in items:
                if a.string:
                    url, text = a.get('href'), a.string
                    check_pass = check_content(url, text)
                    if check_pass:
                        url = complement_url(url, w.url)
                        if url:
                            result = save_info_feed(url, text, w.id,
                                                    w.company.id)
                            if result:
                                COUNT += 1
                            # log(RECORD, "[name] [+] [{url}  {text}]".format(name=w.company.name_cn, url=url, text=text.strip()))
        if COUNT == 0:
            log(
                NOTICE, "#{id} {name} {site} 抓到更新 {count} 条".format(
                    id=w.company.id,
                    name=w.company.name_cn,
                    site=w.url,
                    count=COUNT))
        else:
            log(
                RECORD, "#{id} {name} {site} 抓到更新 {count} 条".format(
                    id=w.company.id,
                    name=w.company.name_cn,
                    site=w.url,
                    count=COUNT))

    except Exception as e:
        try:
            w = get_website(w_id)
            log(
                ERROR,
                "#{id} {name} {site} {err}".format(id=w.id,
                                                   name=w.company.name_cn,
                                                   site=w.url,
                                                   err=str(e)))
        except Exception as e:
            log(ERROR, str(e))
Example No. 8
def get_pku_law():
    print(keyword)
    # create the Excel workbook
    execl = xlwt.Workbook()

    index_num = 0
    url = "https://www.pkulaw.com/"
    group_json_data = {
        "library": stype,
        "className": "EffectivenessDic",
        "classCodeKeys": "",
        # QueryBase64Request: eyJGaWVsZE5hbWUiOm51bGwsIlZhbHVlIjpudWxsLCJSdWxlVHlwZSI6NCwiTWFueVZhbHVlU3BsaXQiOiJcdTAwMDAiLCJXb3JkTWF0Y2hUeXBlIjowLCJXb3JkUmF0ZSI6MCwiQ29tYmluYXRpb25UeXBlIjoyLCJDaGlsZE5vZGVzIjpbeyJGaWVsZE5hbWUiOiJLZXl3b3JkU2VhcmNoVHJlZSIsIlZhbHVlIjpudWxsLCJSdWxlVHlwZSI6NCwiTWFueVZhbHVlU3BsaXQiOiJcdTAwMDAiLCJXb3JkTWF0Y2hUeXBlIjowLCJXb3JkUmF0ZSI6MCwiQ29tYmluYXRpb25UeXBlIjoxLCJDaGlsZE5vZGVzIjpbeyJGaWVsZE5hbWUiOiJEb2N1bWVudE5PIiwiVmFsdWUiOiLogIHml6flsI/ljLoiLCJSdWxlVHlwZSI6NCwiTWFueVZhbHVlU3BsaXQiOiJcdTAwMDAiLCJXb3JkTWF0Y2hUeXBlIjoxLCJXb3JkUmF0ZSI6MCwiQ29tYmluYXRpb25UeXBlIjoyLCJDaGlsZE5vZGVzIjpbXSwiQW5hbHl6ZXIiOiJpa19tYXhfd29yZCIsIkJvb3N0IjpudWxsLCJNaW5pbXVtX3Nob3VsZF9tYXRjaCI6bnVsbH0seyJGaWVsZE5hbWUiOiJUaXRsZSIsIlZhbHVlIjoi6ICB5pen5bCP5Yy6IiwiUnVsZVR5cGUiOjQsIk1hbnlWYWx1ZVNwbGl0IjoiXHUwMDAwIiwiV29yZE1hdGNoVHlwZSI6MSwiV29yZFJhdGUiOjAsIkNvbWJpbmF0aW9uVHlwZSI6MiwiQ2hpbGROb2RlcyI6W10sIkFuYWx5emVyIjoiaWtfbWF4X3dvcmQiLCJCb29zdCI6bnVsbCwiTWluaW11bV9zaG91bGRfbWF0Y2giOm51bGx9XSwiQW5hbHl6ZXIiOm51bGwsIkJvb3N0IjpudWxsLCJNaW5pbXVtX3Nob3VsZF9tYXRjaCI6bnVsbH1dLCJBbmFseXplciI6bnVsbCwiQm9vc3QiOm51bGwsIk1pbmltdW1fc2hvdWxkX21hdGNoIjpudWxsfQ==
        "keyword": keyword,
        # advDic:
        # SearchInResult:
        "ClassFlag": stype,
        "KeywordType": "DefaultSearch",
        "MatchType": "Exact"
    }
    chl_json_data = {
        "Keywords": keyword,
        "PreviousLib": stype,
        "PreKeywords": keyword,
        "Library": stype,
        "RecordShowType": "List",
        "ClassFlag": stype
    }

    rs_json_data = {
        "Menu": "law",
        "RangeType": "Piece",
        "IsSynonymSearch": False,
        "LastLibForChangeColumn": stype,
        "IsAdv": False,
        "OrderByIndex": 4,
        "RecordShowType": "List",
        "Keywords": keyword,
        # match mode
        "MatchType": "Exact",
        "Library": stype,
        "ClassFlag": stype,
        "SearchKeywordType": "DefaultSearch",
        "PreviousLib": stype,
        "PreKeywords": keyword,
        "AfterSearch": True,
        "ClassCodeKey": "wq",
        "ShowType": "Default",
        "QueryOnClick": False,
        "Pager.PageIndex": 0,
        "GroupByIndex": 0,
        "OldPageIndex": 0,
        "Pager.PageSize": pagesize
    }
    local_path = "/law/chl"
    group_get_path = "/Tool/SingleClassResult"
    record_path = "/law/search/RecordSearch"
    res = {}
    try:
        # main_page=requests.post(url+local_path,headers=header,data=datau)
        # in the returned map, keys are group codes and values are display names
        # post_res=crawl_law_post(url+local_path,group_json_data)
        post_res = crawl_law_post(url + group_get_path, group_json_data)
        if post_res:
            group_map = json.loads(post_res)
        else:
            logger.error("request failed")
            return  # without the group map there is nothing left to do
        # group_map={}
        # group_html=BeautifulSoup(data_hrml,'html.parser').find_all('div',class_='grouping-title')
        # # loop over the groups, collecting each name and code for subsequent requests
        # for group in group_html:
        #     group_map[group.find('a').text]=group.find('a').attrs.get('groupvalue')

        for group in group_map:
            page_index = 0
            cur_map = []
            # number of documents in this group
            cur_size = int(re.sub(r"\D", "", group.get('value')))
            rs_json_data["GroupByIndex"] = page_index
            rs_json_data["Pager.PageIndex"] = page_index
            rs_json_data["ClassCodeKey"] = "," + group.get('key') + ",,,,"

            rc_data_html = crawl_law_post(url + record_path, rs_json_data)
            cur_map = get_useful_data(rc_data_html)
            cur_size -= pagesize
            #...
            while cur_size > 0:
                cur_size -= pagesize
                page_index += 1
                rs_json_data["GroupByIndex"] = page_index
                rs_json_data["Pager.PageIndex"] = page_index
                rc_data_html = crawl_law_post(url + record_path, rs_json_data)
                tem = get_useful_data(rc_data_html)
                cur_map += tem
                # cur_map.extend(get_useful_data(rc_data_html))
            res[group.get('value')] = cur_map
        # write the collected data to the Excel workbook ("测试表名" = "test sheet name")
        sheet = execl.add_sheet("测试表名", cell_overwrite_ok=True)

        for group_name in res:
            sheet = execl.add_sheet(group_name, cell_overwrite_ok=True)

            sheet.write(1, 0, group_name)
            i = 1
            cur_data = res[group_name]
            for one in cur_data:
                sheet.write(i, 0, one.get("title"))
                i += 1
        # os.mknod("a.xlsx")
        execl.save(outputdir + '/' + keyword)
        main_page = crawl(url)

    except Exception as e:
        # train_stations = train_stations[index_num:len(train_stations)]
        logger.warning(e)
        logger.error("异常终止")