def parser_models():
    file_index = 0
    model_links = file_utils.load_links_brand()
    for brand in model_links:
        for model in brand:
            file_index += 1
            proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
            useragent = {'User-Agent': get_proxy.get_useregent_list()}
            # assign the parsed HTML page to the soup variable
            soup = parser.get_html(model['href'], useragent, proxy)
            page_count = parser.get_pagination_index_models(soup)
            model_name = model['name']
            brands = re.findall(r'^[^\s]+', model_name)
            brand_name = brands[0]
            print(str(file_index) + ': ' + model_name +
                  ', page count - ' + str(page_count))
            erc_csv = parser.parser_errors(soup, brand_name, model_name)
            file_utils.save_error_code(erc_csv, brand_name, model_name)
            if page_count > 1:
                # page 1 was parsed above, so continue from page 2
                for index in range(2, page_count + 1):
                    soup = parser.get_html(
                        model['href'] + f'&page={index}', useragent, proxy)
                    erc_csv = parser.parser_errors(soup, brand_name, model_name)
                    file_utils.save_error_code(erc_csv, brand_name, model_name)

def get_save_links():
    url = 'https://printcopy.info/?mod=erc'
    proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
    useragent = {'User-Agent': get_proxy.get_useregent_list()}
    brand_list = parser.get_brand_model_links(
        parser.get_html(url, useragent, proxy), 'brandList')
    for brand in brand_list:
        print(brand['name'])
        proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
        useragent = {'User-Agent': get_proxy.get_useregent_list()}
        # assign the parsed HTML page to the soup variable
        soup = parser.get_html(brand['href'], useragent, proxy)
        page_count = parser.get_pagination_index_models(soup)
        print(page_count)
        model_link = parser.get_brand_model_links(soup, 'modelList')
        file_utils.save_model_links_csv(model_link, brand['name'], brand['name'])
        if page_count > 1:
            # page 1 was parsed above, so continue from page 2
            for index in range(2, page_count + 1):
                model_link = parser.get_brand_model_links(
                    parser.get_html(brand['href'] + f'&page={index}',
                                    useragent, proxy), 'modelList')
                file_utils.save_model_links_csv(model_link, brand['name'],
                                                brand['name'])

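# The two functions above rebuild the proxy and User-Agent dicts with the same
# two lines before every request. A small helper like the hypothetical
# build_request_headers() below could factor that out; it is only a sketch and
# assumes the existing get_proxy.get_proxies_list() and
# get_proxy.get_useregent_list() helpers, which return a single proxy address
# and a User-Agent string respectively.
def build_request_headers():
    """Return a (useragent, proxy) pair for one randomized request (sketch)."""
    proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
    useragent = {'User-Agent': get_proxy.get_useregent_list()}
    return useragent, proxy

# Possible usage inside the loops above:
#     useragent, proxy = build_request_headers()
#     soup = parser.get_html(model['href'], useragent, proxy)
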
def get_migration_index(city, move_type):
    """
    Fetch the migration scale index for a city.
    :param city: city or province name
    :param move_type: migration type ('move_in' or 'move_out')
    :return: None
    """
    url = MIGRATION_INDEX_BACE_URL + 'dt=city&id=' + str(
        CITY_NUM[city]) + '&type=' + move_type
    print(city)
    print(url)
    restext = http_utils.get_html(url)
    # the response is JSONP; strip the callback wrapper before parsing
    internal_flow = json.loads(restext[3:-1])['data']['list']
    key = list(internal_flow.keys())
    value = list(internal_flow.values())
    tempdict = {'date': key, 'value': value}
    df = pd.DataFrame(tempdict)
    # 'ANSI' is a Windows-only codec alias (mbcs)
    df.to_csv('./data/' + city + '_' + move_type + '_migration_index.csv',
              encoding='ANSI', index=False)
    print(city + '_' + move_type + '_migration_index.csv' + ' has been saved!')

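# The [3:-1] slice above assumes the endpoint returns JSONP of the exact form
# cb({...}): three leading characters and one trailing parenthesis. A slightly
# more defensive variant (a sketch, not part of the original module) locates
# the braces instead of relying on fixed offsets:
def strip_jsonp(restext):
    """Extract the JSON object from a JSONP response like cb({...}) (sketch)."""
    start = restext.find('{')
    end = restext.rfind('}')
    return restext[start:end + 1]

# json.loads(strip_jsonp(restext))['data']['list'] would then replace the slice.
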
def get_migration_city(startDate, endDate, city, move_type):
    """
    Fetch city-level migration data.
    :param startDate: start date (YYYYMMDD)
    :param endDate: end date (YYYYMMDD)
    :param city: city or province name
    :param move_type: migration type ('move_in' or 'move_out')
    :return: None
    """
    apiUrl = CITY_RANK_BASE_URL + 'dt=city&id=' + str(
        CITY_NUM[city]) + '&type=' + move_type + '&date={}'
    date = datetime.strptime(startDate, "%Y%m%d")
    end = datetime.strptime(endDate, "%Y%m%d")
    print(city)
    final_list = []
    while date <= end:
        currentDate = date.strftime('%Y%m%d')
        print(currentDate)
        date = date + timedelta(days=1)
        url = apiUrl.format(currentDate)
        print(url)
        restext = http_utils.get_html(url)
        # the response is JSONP; strip the callback wrapper before parsing
        migration_data = json.loads(restext[3:-1])['data']['list']
        result = [currentDate, city]
        for data in migration_data:
            result.append(data['city_name'])
            result.append(data['value'])
        final_list.append(result)
    if move_type == 'move_out':
        city1 = 'from_city'
        city2 = 'to_city'
    else:
        city1 = 'to_city'
        city2 = 'from_city'
    with open('./data/' + city + '_' + move_type + '_migration_city.csv', 'w',
              encoding='utf-8-sig', newline='') as outFileCsv:
        writer = csv.writer(outFileCsv)
        # header row: date, source/target city, then up to 100 (city, ratio) pairs
        result = ['date', city1]
        for i in range(1, 101):
            result.append(city2 + str(i))
            result.append('ratio' + str(i))
        writer.writerow(result)
        # data rows
        writer.writerows(final_list)
    print(city + '_' + move_type + '_migration_city.csv' + ' has been saved!')

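# Hypothetical usage of the two functions above. The city name is only an
# example and assumes it is a key of CITY_NUM; both functions write their CSVs
# into ./data as a side effect, so that directory must already exist:
#
#     get_migration_index('武汉市', 'move_in')
#     get_migration_city('20200101', '20200223', '武汉市', 'move_out')
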
def parser_models():
    file_index = 0
    model_links = file_utils.load_links_brand()
    for brand in model_links:
        brand_name = brand[0]
        for model in brand:
            file_index += 1
            proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
            useragent = {'User-Agent': get_proxy.get_useregent_list()}
            # assign the parsed HTML page to the soup variable
            soup = parser.get_html(model['href'], useragent, proxy)
            model_name = model['name']
            print(str(file_index) + '. ' + model_name)
            modules = parser.get_modules(soup, 'pcToc')
            for module in modules:
                module_name = module['name']
                soup = parser.get_html(module['href'], useragent, proxy)
                file_utils.save_partcode(
                    parser.get_partcodes(soup, brand_name['brand'], model_name,
                                         module_name),
                    brand_name['brand'], model_name)

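# A note on the structure assumed above (inferred from this code, not from the
# file_utils source): load_links_brand() appears to yield, per brand, a list
# whose first element carries the brand name under the 'brand' key and whose
# remaining elements are model dicts with 'name' and 'href' keys, e.g.:
#
#     [{'brand': 'Canon', ...}, {'name': 'iR2520', 'href': '...'}, ...]
#
# Note that `for model in brand` also visits that first element, so it is
# expected to expose 'name' and 'href' as well (or be filtered upstream).
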
def get_html(self, url, timeout=20):
    # pick one random proxy among those currently marked as available
    proxy = random.sample(
        [p for p in self.proxy_list if p.is_available()], 1)[0]
    start_at = time.time()
    html = get_html(url, proxy=proxy.proxy, timeout=timeout)
    elapsed_sec = time.time() - start_at
    # record the outcome and response time against the chosen proxy
    if html is None:
        proxy.log_fail(elapsed_sec)
        return None
    else:
        proxy.log_success(elapsed_sec)
        return html

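# get_html() above only relies on a small interface for the entries of
# self.proxy_list. Below is a minimal sketch of such a record; the names are
# inferred from the calls above and the real class may well differ.
class ProxyRecord:
    def __init__(self, proxy):
        self.proxy = proxy          # e.g. {'http': 'http://1.2.3.4:8080'}
        self.fail_count = 0
        self.success_count = 0

    def is_available(self):
        # simple example policy: drop a proxy after three consecutive failures
        return self.fail_count < 3

    def log_fail(self, elapsed_sec):
        self.fail_count += 1

    def log_success(self, elapsed_sec):
        self.fail_count = 0
        self.success_count += 1
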
def get_internal_flow(city):
    """
    Fetch the intra-city travel intensity data.
    :param city: must be a city (only cities have intra-city travel intensity)
    :return: None
    """
    url = INTERNAL_FLOW_BACE_URL + 'dt=city&id=' + str(
        CITY_NUM[city]) + '&date=20200223'
    print(city)
    print(url)
    restext = http_utils.get_html(url)
    # the response is JSONP; strip the callback wrapper before parsing
    internal_flow = json.loads(restext[3:-1])['data']['list']
    key = list(internal_flow.keys())
    value = list(internal_flow.values())
    tempdict = {'date': key, 'value': value}
    df = pd.DataFrame(tempdict)
    df.to_csv('./data/' + city + '_internal_flow.csv', encoding='ANSI',
              index=False)
    print(city + '_internal_flow.csv' + ' has been saved!')

def get_answer(parsed, as_text=False, debug=False):
    # If the cache directory doesn't exist, create it
    if not path.isdir(cache_dir):
        if debug:
            print("[Info] Cache directory does not exist, creating one")
        try:
            makedirs(cache_dir)
        except OSError as e:
            print("[Warning] Failed to create the cache directory, "
                  "continuing without cache")
        except Exception as e:
            print("[Error] Unknown error. Message: '{0}'".format(str(e)))

    # Check whether a cached file exists
    try:
        cache_file_path = cache_file_base.format(parsed.country.lower())
    except AttributeError as e:
        err_msg = "[Error] No country provided, impossible to find an answer."
        raise ChatbotException(e, err_msg, parsed.question)

    cache_need_update = True
    if path.isfile(cache_file_path):
        # Check the cached file's timestamp
        cache_timestamp = datetime.fromtimestamp(
            path.getmtime(cache_file_path))
        diff = datetime.now() - cache_timestamp
        # If the file is more than one day old, update it; otherwise just load it
        if diff.days > 0:
            cache_need_update = True
        else:
            cache_need_update = False
            if as_text:
                infobox = _get_cached_webpage(cache_file_path)
            else:
                html = _get_cached_webpage(cache_file_path + "-html")
                infobox = BeautifulSoup(html, "html.parser")
            if debug:
                print("[Info] Page found in cache")

    # No cache, or the cache needs an update: download the page
    if cache_need_update:
        if debug:
            print("[Info] Updating cache for " + parsed.country)
        # Create url for indexmundi
        country = url_encode(parsed.country)
        url = index_mundi_base_url + country
        html = get_html(url)
        soup = BeautifulSoup(html, "html.parser")
        # Find infobox table
        infobox = soup.findAll("table", attrs={"class": "infobox"})
        try:
            infobox = infobox[0]
        except IndexError as e:
            # NOTE: the Kiribati page does not exist
            # NOTE: Vietnã must be spelled "vietname"
            err_msg = "[Error] Table not found"
            raise ChatbotException(e, err_msg, parsed.question)
        # Pre-process infobox text
        # IMPORTANT: DO NOT REMOVE THE K FROM THE NORMALIZATION!!! It looks
        # like a good idea at the time, but it will break everything later!
        if as_text:
            if debug:
                print("[Info] Saving the page as plain text")
            infobox = unicodedata.normalize("NFKC", infobox.text)
            infobox = separate_words(infobox)
            # infobox = re.sub(r"\n", r" ", infobox.lower())
            _cache_webpage(infobox, cache_file_path)
        else:
            if debug:
                print("[Info] Saving the page as html")
            infobox_text = infobox.decode_contents()
            _cache_webpage(infobox_text, cache_file_path + "-html")

    # If we are working with pure text (cleared html, other source of info,
    # text cache, etc.)
    if as_text:
        # Generate a lowercase infobox for use in comparisons
        # TODO: use spacy idx
        infobox_ = re.sub(r"[-–−]", r"-", infobox.lower())
        infobox_model = pt_model(infobox)
        canon_infobox_model = pt_model(infobox)
        unstoppable_infobox = " ".join(
            [word.text for word in infobox_model if not word.is_stop])
        canon_unstoppable_infobox = " ".join(
            [word.text for word in canon_infobox_model if not word.is_stop])
        # print(unstoppable_infobox, "\n\n", canon_unstoppable_infobox)
        ans = None
        # Try searching for the answer using the question's core first; if no
        # answer is found, search using the topic.
        # IMPORTANT: this isn't working and I don't know why, help
        # if parsed.core in infobox:
        #     _, start, end = find_between(infobox_, parsed.core.lower(), " - ")
        #     # Get everything between - blahblah - and assume it is the
        #     # correct answer
        #     # TODO: clean answer
        #     ans = infobox[start:end]

        # No answer found with the core, search with the topic
        if not ans and parsed.topic:
            perm = permutations(parsed.topic.lower().split(" "))
            for p in perm:
                query = " ".join(p)
                # print(repr(query))
                _, start, end = find_between(canon_unstoppable_infobox.lower(),
                                             query, " - ")
                ans = unstoppable_infobox[start:end]
                if ans:
                    break
        if not ans:
            return 'Topic not Found'
        ans = _process_answer(re.sub(r"[-–−]", r"-", ans))
        return ans.strip()
    # If we are working with raw html (structured data)
    else:
        old_cell = None
        topic_found = False
        for child in infobox.children:
            cells = child.findAll("td")
            for cell in cells:
                if debug:
                    print()
                    print(cell.prettify())
                    print()
                # Topic found in the previous cell, try to find the answer here
                if topic_found:
                    ans = cell.text
                    ans = _process_answer(re.sub(r"[-–−]", r"-", ans))
                    return ans
                topic_found = False
                if parsed.topic.lower() in cell.text.lower():
                    topic_found = True
                    old_cell = cell
                    print(cell.prettify())
                    print("[Info] Topic found in this cell, the answer may be "
                          "in the next one.")
                if debug:
                    print("Topic found? " + str(topic_found))
                    input("Press anything to continue...")
        return 'Topic not Found'

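# find_between() is used above but defined elsewhere in the project. Judging by
# the call sites, it returns a (match, start, end) triple for the text located
# between a query string and a delimiter. The helper below is only a rough
# sketch of a compatible signature, an assumption rather than the project's
# actual implementation:
def find_between(text, query, delimiter):
    """Return (match, start, end) for the span after `query` up to `delimiter`."""
    qpos = text.find(query)
    if qpos == -1:
        return "", 0, 0
    start = qpos + len(query)
    end = text.find(delimiter, start)
    if end == -1:
        end = len(text)
    return text[start:end], start, end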