Example No. 1
def it_logging_warning_if_broken_file(xml_sitemap_bad, site_url, caplog):
    res = tm._get_urls(xml_sitemap_bad, site_url,
                       tm.get_file_type(xml_sitemap_bad))
    # caplog.text is a property on pytest's caplog fixture, not a callable.
    print('caplog.text:', caplog.text)
    assert 'sitemap.get_urls' in caplog.text
    assert res == []
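Here caplog is pytest's built-in log-capture fixture, while xml_sitemap_bad and site_url are project fixtures not shown in the snippet. A minimal sketch of what they might look like (fixture names kept, payloads are assumptions):

import pytest

@pytest.fixture
def site_url():
    # Hypothetical base URL shared by the sitemap tests.
    return 'https://example.com'

@pytest.fixture
def xml_sitemap_bad():
    # Hypothetical malformed sitemap that should make tm._get_urls log a warning.
    return '<?xml version="1.0"?><urlset><url><loc>truncated'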
Example No. 2
def scan_sp(next_step=False, max_limit=0):
    """Single crawler pass: fetch each pending page, extract and store new
    URLs (up to max_limit in total), and rank HTML pages against keywords.
    next_step is accepted but not used in this function.
    """
    logging.info('solo crawler start')
    keywords, all_robots = _init_crawler()
    pages = database.get_pages_rows(None)
    # TODO: add a check for len(pages) == 0; in that case find the
    # smallest date and select pages by it.
    add_urls_total = 0
    for page in pages:
        page_id, url, site_id, base_url = page
        request_time = time.time()
        logging.info('#BEGIN %s url %s, base_url %s', page_id, url, base_url)
        content = _get_content(url)
        robots = all_robots.get(site_id)

        if add_urls_total >= max_limit:
            # Limit reached: only detect the page type, store nothing new.
            page_type = sitemap.get_file_type(content)
            add_urls_count = 0
        else:
            new_pages_data, page_id, page_type = sitemap.scan_urls(
                content, page, robots)
            remaining = max_limit - add_urls_total
            if len(new_pages_data) > remaining:
                # Keep only as many rows as the overall limit allows.
                new_pages_data = new_pages_data[:remaining]
            add_urls_count, page_id = database.add_urls(
                new_pages_data, page_id)
            if page_type != sitemap.SM_TYPE_HTML:
                database.update_last_scan_date(page_id)

        if page_type == sitemap.SM_TYPE_HTML:
            ranks, page_id = parsers.process_ranks(
                content,
                page_id,
                keywords,
            )
            database.update_person_page_rank(page_id, ranks)

        request_time = time.time() - request_time
        logging.info('#END url %s, base_url %s, add urls %s, time %s', url,
                     base_url, add_urls_count, request_time)
        add_urls_total += add_urls_count

    logging.info('Crawler.scan: Added %s new urls on date %s', add_urls_total,
                 'NULL')
    return add_urls_total
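A hedged usage sketch: scan_sp reports progress only through the standard logging module, so a caller should configure logging first. The entry-point shape below is an assumption, not part of the project:

import logging

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    # Allow at most 100 new URLs to be stored during this pass.
    added = scan_sp(max_limit=100)
    logging.info('pass finished, %s urls added', added)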
Example No. 3
def parse_html(page_content, words_dict):
    """
    page_content - the page content (an HTML string)
    words_dict - {"person_id": [words_list]}
    Returns a dict {"person_id": rank}.
    """
    if get_file_type(page_content) != SM_TYPE_HTML:
        return {}

    min_len = 3  # minimum length for a token to count as a word
    result = {}
    logging.debug('parse_html: %s', words_dict)
    try:
        html_text = _extract_text(page_content)
        words_list = _split_text(html_text, min_len)
        result = _count_words(words_list, words_dict)
    except Exception as ex:
        logging.error('parsers.parse_html: error %s', ex)

    logging.debug('parse_html %s completed...', result)
    return result
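A usage sketch under the docstring's contract; the HTML snippet and the id-to-keywords mapping are invented for illustration, and the exact rank values depend on the unshown _split_text and _count_words helpers:

page = '<html><body>A python crawler parses python pages.</body></html>'
ranks = parse_html(page, {'7': ['python', 'crawler']})
# ranks has the shape {'7': <rank>}; non-HTML input or a parse error yields {}.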
Example No. 4
def it_return_tuple_of_urls_rec(rec_sitemap, site_url, rec_list):
    assert tm._get_urls(rec_sitemap, site_url,
                        tm.get_file_type(rec_sitemap)) == rec_list
Example No. 5
def it_return_tuple_of_urls_txt(txt_sitemap, site_url, urls_list):
    assert tm._get_urls(txt_sitemap, site_url,
                        tm.get_file_type(txt_sitemap)) == urls_list
Example No. 6
def it_return_tuple_of_urls_html(html_sitemap, site_url, urls_list):
    assert tm._get_urls(html_sitemap, site_url,
                        tm.get_file_type(html_sitemap)) == urls_list
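Examples 4 through 6 pair a sitemap payload fixture with a fixture holding the expected URL list. A sketch of one plausible pair (both values are assumptions about fixtures the snippets do not show):

import pytest

@pytest.fixture
def txt_sitemap():
    # Hypothetical plain-text sitemap: one URL per line.
    return 'https://example.com/a\nhttps://example.com/b'

@pytest.fixture
def urls_list():
    # Expected output of tm._get_urls for the txt_sitemap above.
    return ['https://example.com/a', 'https://example.com/b']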
Example No. 7
def it_return_type_of_sitemap_txt(txt_sitemap):
    assert tm.get_file_type(txt_sitemap) == tm.SM_TYPE_TXT
Example No. 8
def it_return_type_of_sitemap_html(html_sitemap):
    assert tm.get_file_type(html_sitemap) == tm.SM_TYPE_HTML
Example No. 9
def it_return_type_of_sitemap_xml(xml_sitemap):
    assert tm.get_file_type(xml_sitemap) == tm.SM_TYPE_XML
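Examples 7 through 9 only exercise tm.get_file_type against the module's type constants, so their fixtures can reuse literal payloads like those sketched above. The it_ prefixes (and the nesting indentation in the original snippets) suggest these tests are collected by pytest-describe rather than plain pytest; if so, a run might look like this (package name and path are assumptions):

# pip install pytest pytest-describe
# pytest tests/test_sitemap.py -v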