Example #1
def prepare_one_file_for_index(document, limit_domain):
    import gridfs
    from upol_search_engine.db import mongodb
    from upol_search_engine.utils import urls
    # detect / lang_detect_exception come from the langdetect package;
    # extract_content_from_pdf, utf8len and extract_words_from_url are
    # module-level helpers of the surrounding indexer module.
    from langdetect import detect, lang_detect_exception

    mongodb_client = mongodb.create_client()
    mongodb_database = mongodb.get_database(limit_domain, mongodb_client)
    fs = gridfs.GridFS(mongodb_database)
    out = fs.get(document.get('content').get('binary'))
    content = out.read()

    mongodb_client.close()

    content_hash = document.get('content').get('hashes').get('text')
    url_hash = document.get('_id')
    url = document.get('url')
    url_decoded = urls.decode(url)
    url_length = len(url)
    is_file = True
    file_type = document.get('file_type')
    filename = urls.get_filename(url_decoded)
    depth = document.get('depth')
    pagerank = document.get('pagerank')

    body_text = extract_content_from_pdf(content)

    # Skip documents with no extracted text or with too little of it
    if (body_text is None) or (len(body_text) < 500):
        return None

    # Reduce size of body_text so it fits into the database (~800 kB of UTF-8)
    while utf8len(body_text) > 800000:
        body_text = body_text[:-10000]

    max_length_detection = 10000
    body_text_length = len(body_text)

    try:
        if body_text_length < max_length_detection:
            language = detect(body_text)
        else:
            # Detect language only on a window cut from the middle of the text
            half = body_text_length // 2
            window = max_length_detection // 2
            language = detect(body_text[half - window:half + window])
    except lang_detect_exception.LangDetectException:
        # Fallback language
        language = 'cs'

    title = filename

    description = ""
    keywords = ""
    important_headlines = ""
    url_words = ' '.join(extract_words_from_url(url_decoded, limit_domain))

    row = (url_hash, url, url_decoded, url_words, title, language, keywords,
           description, important_headlines, body_text, content_hash, depth,
           is_file, file_type, pagerank, url_length)

    return row
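
A note on the language-detection step above: for long texts it samples only a centered window of at most max_length_detection characters before calling langdetect. A minimal standalone sketch of that windowed detection, assuming only the langdetect package; the helper name detect_language_windowed is ours, not part of upol_search_engine:

from langdetect import detect, lang_detect_exception


def detect_language_windowed(text, max_length=10000, fallback='cs'):
    """Detect the language of text, sampling a centered window when it is long."""
    try:
        if len(text) < max_length:
            return detect(text)

        half = len(text) // 2
        window = max_length // 2
        return detect(text[half - window:half + window])
    except lang_detect_exception.LangDetectException:
        # langdetect raises when it cannot find any usable features
        return fallback


print(detect_language_windowed('Univerzita Palackého v Olomouci vítá studenty. ' * 100))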
Example #2
def feeder_task(crawler_settings, seed, batch_size, delay_between_feeding,
                task_id):
    from upol_search_engine.db import mongodb
    from upol_search_engine.utils import urls
    from upol_search_engine.upol_crawler.core import feeder

    client = mongodb.create_client()
    database = mongodb.get_database(crawler_settings.get('limit_domain'),
                                    client)
    regex = urls.generate_regex(crawler_settings.get('limit_domain'))

    mongodb.drop_database(
        urls.domain_replace_dots(crawler_settings.get('limit_domain')))

    # Init database
    mongodb.init(database)

    feeder.load_seed(seed, database, regex, crawler_settings.get('max_depth'),
                     crawler_settings.get('blacklist'))

    blacklist = crawler_settings.get('blacklist')

    # Queue the root of every blacklisted domain once, bypassing the blacklist
    for blacklisted_domain in blacklist:
        crawl_url_task.delay('http://' + blacklisted_domain,
                             crawler_settings.get('max_depth'),
                             crawler_settings,
                             ignore_blacklist=True)

    sleeping = False
    number_of_waiting = 0
    number_of_added_links = 0

    while True:
        if not sleeping:
            feeder.feed_crawler(database, crawler_settings, batch_size)
            sleeping = True
        else:
            mongodb.update_crawler_progress(client, database, task_id)

            number_of_waiting = feeder.sleep_crawler(database,
                                                     number_of_waiting,
                                                     delay_between_feeding)

            # Stop feeding after the crawler has reported waiting twice
            if number_of_waiting >= 2:
                break

            sleeping = False

    mongodb.update_crawler_progress(client, database, task_id)
    client.close()
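
The crawler_settings passed around in these tasks is read with plain dict lookups; the keys visible across the examples are limit_domain, max_depth, blacklist, frequency_per_server, connect_max_timeout and read_max_timeout. A hedged usage sketch with illustrative values only (the real configuration, the seed format and the Celery wiring live elsewhere in upol_search_engine):

# Illustrative values; not the project's real configuration.
crawler_settings = {
    'limit_domain': 'upol.cz',
    'max_depth': 10,
    'blacklist': ['portal.upol.cz'],
    'frequency_per_server': 1,
    'connect_max_timeout': 3.0,
    'read_max_timeout': 10.0,
}

# feeder_task is normally queued through Celery (feeder_task.delay(...));
# a single start URL is assumed here as the seed.
feeder_task(crawler_settings,
            seed='https://www.upol.cz',
            batch_size=300,
            delay_between_feeding=30,
            task_id='manual-run-1')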
Example #3
def calculate_pagerank_task(crawler_settings, task_id):
    from upol_search_engine.db import mongodb
    from upol_search_engine.upol_crawler.core import pagerank

    client = mongodb.create_client()
    database = mongodb.get_database(crawler_settings.get('limit_domain'),
                                    client)

    mongodb.update_pagerank_progress(client, task_id, 'building_graph')
    graph = pagerank.build_graph(database)

    mongodb.update_pagerank_progress(client, task_id, 'calculation')
    graph_pagerank = pagerank.calculate_pagerank(graph, database)

    mongodb.update_pagerank_progress(client, task_id, 'uploading')
    pagerank.insert_pagerank_db(graph_pagerank, database)

    client.close()
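
The pagerank module itself is not shown. As a rough illustration of the same three stages (build a graph, calculate PageRank, store the scores), here is a sketch using networkx instead of the project's helpers; the edge list is invented for the example:

import networkx as nx

# Invented (from_url, to_url) pairs standing in for the crawled link data.
edges = [
    ('https://www.upol.cz/', 'https://www.upol.cz/studenti/'),
    ('https://www.upol.cz/studenti/', 'https://www.upol.cz/'),
    ('https://www.upol.cz/', 'https://www.upol.cz/o-univerzite/'),
]

# Stage 1: build the graph
graph = nx.DiGraph()
graph.add_edges_from(edges)

# Stage 2: calculate PageRank (alpha is the usual damping factor)
graph_pagerank = nx.pagerank(graph, alpha=0.85)

# Stage 3: "upload" - here we only print the url/score pairs
for url, score in sorted(graph_pagerank.items(), key=lambda item: -item[1]):
    print(url, round(score, 4))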
Example #4
def crawl_url(url, depth, crawler_settings, ignore_blacklist=False):
    try:
        client = mongodb.create_client()
        database = mongodb.get_database(crawler_settings.get('limit_domain'),
                                        client)

        allowed = limiter.is_crawl_allowed(
            url, database, crawler_settings.get('frequency_per_server'))

        if not allowed:
            mongodb.set_url_for_recrawl(database, url)
            client.close()
            return

        url, original_url, redirected, response = get_page(
            url, crawler_settings.get('connect_max_timeout'),
            crawler_settings.get('read_max_timeout'))
    except requests.exceptions.ReadTimeout:
        # Also removes the url from the queue and marks it as timed out
        mongodb.set_timeout_url(database, url)
        log.warning('(Timeout) - ReadTimeout: {0}'.format(url))
    except requests.exceptions.ConnectionError:
        # Also removes the url from the queue and marks it as timed out
        mongodb.set_timeout_url(database, url)
        log.warning('(Timeout) - ConnectionError: {0}'.format(url))
    except requests.exceptions.ChunkedEncodingError:
        # Also removes the url from the queue and marks it as timed out
        mongodb.set_timeout_url(database, url)
        log.warning('(Timeout) - ChunkedEncodingError: {0}'.format(url))
    except Exception as e:
        mongodb.delete_url(database, url)
        log.exception('Exception: {0}'.format(url))
        client.close()
        raise
    else:
        _handle_response(database, url, original_url, redirected, response,
                         depth, crawler_settings.get('max_depth'),
                         crawler_settings.get('limit_domain'),
                         crawler_settings.get('blacklist'), ignore_blacklist)

    client.close()
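
get_page is not shown, but the except clauses above imply it is a thin wrapper around requests with separate connect and read timeouts. A minimal sketch of such a wrapper; the return shape mirrors how crawl_url unpacks it, while the real implementation in upol_crawler may differ:

import requests


def get_page(url, connect_max_timeout, read_max_timeout):
    """Fetch a page and report whether the request was redirected.

    Returns (final_url, original_url, redirected, response) to match the
    unpacking in crawl_url; timeout and connection errors are intentionally
    left for the caller to handle.
    """
    response = requests.get(url,
                            timeout=(connect_max_timeout, read_max_timeout))
    redirected = len(response.history) > 0

    return response.url, url, redirected, response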
Example #5
def indexer_task(crawler_settings, indexer_settings, task_id):
    from upol_search_engine.db import mongodb
    from upol_search_engine.db import postgresql
    import locale
    from celery.result import AsyncResult
    from celery.states import PENDING, STARTED, RECEIVED, SUCCESS
    import time

    locale.setlocale(locale.LC_ALL, 'cs_CZ.utf-8')

    mongodb_client = mongodb.create_client()
    mongodb_database = mongodb.get_database(
        crawler_settings.get('limit_domain'), mongodb_client)
    mongodb_batch_size = indexer_settings.get('batch_size')

    postgresql_client = postgresql.create_client()
    postgresql_cursor = postgresql_client.cursor()
    postgresql_table_name = indexer_settings.get('table_name')
    postgresql_table_name_production = indexer_settings.get(
        'table_name_production')
    postgresql_metadata_table_name = indexer_settings.get(
        'metadata_table_name')
    postgresql_metadata_table_name_production = indexer_settings.get(
        'metadata_table_name_production')

    # Test if postgresql table is ready
    # if (not postgresql.test_if_table_exists(postgresql_client, postgresql_cursor, postgresql_table_name)) or (not postgresql.test_if_table_exists(postgresql_client, postgresql_cursor, 'metadata_tmp')):
    postgresql.reset_and_init_db(postgresql_client,
                                 postgresql_cursor,
                                 postgresql_table_name,
                                 postgresql_metadata_table_name)

    tasks_list = []

    while True:
        document_batch = mongodb.get_batch_of_ids_for_indexer(
            mongodb_database,
            mongodb_batch_size)

        document_batch = list(document_batch)

        if len(document_batch) == 0:
            break

        document_ids = []

        for document in document_batch:
            document_ids.append(document.get('representative'))

        if len(document_ids) > 0:
            mongodb.set_documents_as_indexed(mongodb_database, document_ids)
            for document_id in document_ids:
                tasks_list.append(index_document_task.delay(document_id,
                                                            task_id,
                                                            crawler_settings,
                                                            indexer_settings))

    waiting = True

    while waiting:
        n_of_running = 0

        for task in tasks_list:
            state = AsyncResult(task.task_id).status

            if state in (PENDING, STARTED, RECEIVED):
                n_of_running += 1

        if n_of_running == 0:
            waiting = False
            # Report every task which did not finish successfully
            for task in tasks_list:
                state = AsyncResult(task.task_id).status
                if state != SUCCESS:
                    print(state)

        time.sleep(10)

    postgresql.change_table_to_production(postgresql_client,
                                          postgresql_cursor,
                                          postgresql_table_name,
                                          postgresql_table_name_production)

    postgresql.create_psql_index(postgresql_client,
                                 postgresql_cursor,
                                 postgresql_table_name_production,
                                 'search_index',
                                 'search_idx')

    postgresql.change_table_to_production(postgresql_client,
                                          postgresql_cursor,
                                          postgresql_metadata_table_name,
                                          postgresql_metadata_table_name_production)

    postgresql.create_psql_index(postgresql_client,
                                 postgresql_cursor,
                                 postgresql_metadata_table_name_production,
                                 'microformat_index',
                                 'microformat_idx')

    postgresql_cursor.close()
    postgresql_client.close()
    mongodb_client.close()
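
The busy-wait loop above polls AsyncResult.status every ten seconds. Celery's ResultSet can express the same wait more compactly; a sketch of that alternative, assuming tasks_list holds the AsyncResult handles collected above (this is not what indexer_task actually does):

from celery.result import ResultSet

results = ResultSet(tasks_list)

# Block until every indexing task has finished; propagate=False keeps one
# failed task from aborting the wait, mirroring the tolerant loop above.
results.join(propagate=False, interval=10)

print('{0}/{1} tasks succeeded'.format(results.completed_count(),
                                       len(tasks_list)))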
Example #6
def index_document_task(document_id, task_id,
                        crawler_settings, indexer_settings):
    from upol_search_engine.db import mongodb
    from upol_search_engine.db import postgresql
    from upol_search_engine.upol_indexer import indexer
    from celery.utils.log import get_task_logger
    from upol_search_engine.upol_indexer import microformat
    from psycopg2 import IntegrityError
    import json
    import hashlib

    log = get_task_logger(__name__)

    mongodb_client = mongodb.create_client()
    mongodb_database = mongodb.get_database(
        crawler_settings.get('limit_domain'), mongodb_client)
    postgresql_client = postgresql.create_client()
    postgresql_cursor = postgresql_client.cursor()
    postgresql_table_name = indexer_settings.get('table_name')
    postgresql_table_name_production = indexer_settings.get(
        'table_name_production')
    postgresql_metadata_table_name = indexer_settings.get(
        'metadata_table_name')
    postgresql_metadata_table_name_production = indexer_settings.get(
        'metadata_table_name_production')

    try:
        document = mongodb.get_document_by_id(mongodb_database, document_id)

        indexed_rows = []
        copied_rows = []

        does_production_exists = postgresql.test_if_table_exists(
            postgresql_client,
            postgresql_cursor,
            postgresql_table_name_production)

        try:
            is_file = document.get('file')

            # Metadata
            if not is_file:
                soup = indexer.get_soup_from_document(document)
                metadata = microformat.find_microformat_on_page(soup)

                if metadata is not None:
                    parsed_metadata = microformat.parse_json(metadata)
                    metadata_hash = hashlib.sha1(
                        json.dumps(parsed_metadata,
                                   sort_keys=True).encode('utf-8')).hexdigest()

                    if microformat.validate_json_schema(parsed_metadata):
                        parsed_metadata, metadata_type = \
                            microformat.prepare_metadata_for_insert(
                                parsed_metadata)

                        try:
                            postgresql.insert_microformat(postgresql_client,
                                                          postgresql_cursor,
                                                          json.dumps(parsed_metadata),
                                                          metadata_hash,
                                                          metadata_type,
                                                          postgresql_metadata_table_name)
                        except IntegrityError as e:
                            log.info('METADATA duplicity: {}'.format(
                                parsed_metadata))
                    else:
                        log.info('METADATA not valid: {}'.format(
                            document.get('url')))

            if does_production_exists:
                url_hash = document.get('_id')
                content_hash = document.get('content').get('hashes').get('text')

                production_document = postgresql.get_document_by_hash(
                    postgresql_client,
                    postgresql_cursor,
                    url_hash,
                    postgresql_table_name_production)
            else:
                production_document = None

            # Index 10 of the production row holds the content hash
            # (see the row layout built in prepare_one_file_for_index)
            if (production_document is None) or \
                    (production_document[10] != content_hash):
                if is_file:
                    log.info('INDEXER: Indexing document (file).')

                    # Index only pdf this time
                    if document.get('file_type') == 'pdf':
                        try:
                            row = indexer.prepare_one_file_for_index(
                                document, crawler_settings.get('limit_domain'))
                        except Exception as e:
                            log.exception('Exception: {0}'.format(document.get('url')))
                            row = None
                    else:
                        row = None
                else:
                    log.info('INDEXER: Indexing document.')
                    row = indexer.prepare_one_document_for_index(
                        document,
                        soup,
                        crawler_settings.get('limit_domain'))

                if row is not None:
                    indexed_rows.append(row)
            else:
                if is_file:
                    log.info('INDEXER: Copying document (file).')
                else:
                    log.info('INDEXER: Copying document.')

                copied_rows.append(production_document)

                postgresql.copy_row_from_table_to_table(
                    postgresql_client,
                    postgresql_cursor,
                    url_hash,
                    postgresql_table_name_production,
                    postgresql_table_name)
        except Exception as e:
            log.exception('Exception: {0}'.format(document.get('url')))

        if len(indexed_rows) > 0:
            postgresql.insert_rows_into_index(postgresql_client,
                                              postgresql_cursor,
                                              indexed_rows,
                                              postgresql_table_name)

        mongodb.update_indexer_progress(
            mongodb_client, task_id, len(indexed_rows) + len(copied_rows))
    except Exception as e:
        log.exception('Exception: INDEXER TASK POSSIBLE FAILURE')
    finally:
        postgresql_cursor.close()
        postgresql_client.close()
        mongodb_client.close()
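
The metadata deduplication above works because the SHA-1 is taken over a canonical JSON serialization (sort_keys=True), so two dicts with the same keys and values always hash identically regardless of key order. A minimal, standard-library-only sketch of that idea:

import hashlib
import json


def metadata_hash(parsed_metadata):
    """Stable SHA-1 of a metadata dict, independent of key order."""
    canonical = json.dumps(parsed_metadata, sort_keys=True).encode('utf-8')
    return hashlib.sha1(canonical).hexdigest()


a = {'@type': 'Event', 'name': 'Den otevrenych dveri'}
b = {'name': 'Den otevrenych dveri', '@type': 'Event'}

# Key order does not matter: both dicts hash to the same value.
assert metadata_hash(a) == metadata_hash(b)
print(metadata_hash(a))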