def prepare_one_file_for_index(document, limit_domain):
    import gridfs
    from upol_search_engine.db import mongodb
    from upol_search_engine.utils import urls
    from langdetect import detect, lang_detect_exception

    # extract_content_from_pdf, utf8len and extract_words_from_url are
    # module-level helpers of the indexer (not shown here)

    # Load the binary content of the file from GridFS
    mongodb_client = mongodb.create_client()
    mongodb_database = mongodb.get_database(limit_domain, mongodb_client)
    fs = gridfs.GridFS(mongodb_database)
    out = fs.get(document.get('content').get('binary'))
    content = out.read()
    mongodb_client.close()

    content_hash = document.get('content').get('hashes').get('text')
    url_hash = document.get('_id')
    url = document.get('url')
    url_decoded = urls.decode(url)
    url_length = len(url)
    is_file = True
    file_type = document.get('file_type')
    filename = urls.get_filename(url_decoded)
    depth = document.get('depth')
    pagerank = document.get('pagerank')

    body_text = extract_content_from_pdf(content)

    # Reduce size of body_text for database
    while utf8len(body_text) > 800000:
        body_text = body_text[:-10000]

    # Skip empty or very short documents
    if (body_text == "") or (body_text is None) or (len(body_text) < 500):
        return None

    max_length_detection = 10000
    body_text_length = len(body_text)

    try:
        if body_text_length < max_length_detection:
            language = detect(body_text)
        else:
            # Detect the language from a window in the middle of the text
            half = body_text_length / 2
            language = detect(
                body_text[int(half - max_length_detection / 2):
                          int(half + max_length_detection / 2)])
    except lang_detect_exception.LangDetectException:
        # Fallback language
        language = 'cs'

    title = filename
    description = ""
    keywords = ""
    important_headlines = ""
    url_words = ' '.join(extract_words_from_url(url_decoded, limit_domain))

    row = (url_hash, url, url_decoded, url_words, title, language,
           keywords, description, important_headlines, body_text,
           content_hash, depth, is_file, file_type, pagerank, url_length)

    return row
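# Illustrative sketch: prepare_one_file_for_index() relies on module-level
# helpers that are not shown here. The two sketches below assume that
# utf8len() measures the UTF-8 encoded size of a string and that
# extract_words_from_url() splits the decoded URL into searchable words;
# they are assumptions about the approach, not the project's implementation.

import re


def utf8len_sketch(text):
    """Size of the string in bytes once encoded as UTF-8."""
    return len(text.encode('utf-8'))


def extract_words_from_url_sketch(url_decoded, limit_domain):
    """Split a decoded URL into words, dropping the common domain part."""
    stripped = url_decoded.replace(limit_domain, '')
    words = re.split(r'[^0-9a-zA-Z]+', stripped)

    return [word for word in words if len(word) > 1]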
def feeder_task(crawler_settings, seed, batch_size, delay_between_feeding,
                task_id):
    from upol_search_engine.db import mongodb
    from upol_search_engine.utils import urls
    from upol_search_engine.upol_crawler.core import feeder

    client = mongodb.create_client()
    database = mongodb.get_database(crawler_settings.get('limit_domain'),
                                    client)
    regex = urls.generate_regex(crawler_settings.get('limit_domain'))

    mongodb.drop_database(
        urls.domain_replace_dots(crawler_settings.get('limit_domain')))

    # Init database
    mongodb.init(database)

    feeder.load_seed(seed, database, regex,
                     crawler_settings.get('max_depth'),
                     crawler_settings.get('blacklist'))

    # Dispatch one crawl task for the root page of every blacklisted domain,
    # ignoring the blacklist for these seeds
    blacklist = crawler_settings.get('blacklist')

    for blacklisted_domain in blacklist:
        crawl_url_task.delay('http://' + blacklisted_domain,
                             crawler_settings.get('max_depth'),
                             crawler_settings,
                             ignore_blacklist=True)

    sleeping = False
    number_of_waiting = 0
    number_of_added_links = 0

    while True:
        if sleeping is False:
            feeder.feed_crawler(database, crawler_settings, batch_size)
            sleeping = True
        else:
            mongodb.update_crawler_progress(client, database, task_id)
            number_of_waiting = feeder.sleep_crawler(database,
                                                     number_of_waiting,
                                                     delay_between_feeding)

            if number_of_waiting >= 2:
                break

            sleeping = False

    mongodb.update_crawler_progress(client, database, task_id)
    client.close()
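# Illustrative sketch: feeder_task() is dispatched through Celery, much like
# crawl_url_task above. The invocation below is hypothetical; it assumes
# feeder_task is registered on the project's Celery app, and every value is
# a placeholder, not a real configuration.

def dispatch_feeder_sketch():
    example_settings = {
        'limit_domain': 'upol.cz',      # placeholder domain
        'max_depth': 10,
        'blacklist': [],
        'frequency_per_server': 1,
        'connect_max_timeout': 3.05,
        'read_max_timeout': 10,
    }

    return feeder_task.delay(example_settings,
                             'http://www.upol.cz',  # placeholder seed
                             100,                   # batch_size
                             5,                     # delay_between_feeding
                             'example-task-id')     # task_id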
def calculate_pagerank_task(crawler_settings, task_id):
    from upol_search_engine.db import mongodb
    from upol_search_engine.upol_crawler.core import pagerank

    client = mongodb.create_client()
    database = mongodb.get_database(crawler_settings.get('limit_domain'),
                                    client)

    mongodb.update_pagerank_progress(client, task_id, 'building_graph')
    graph = pagerank.build_graph(database)

    mongodb.update_pagerank_progress(client, task_id, 'calculation')
    graph_pagerank = pagerank.calculate_pagerank(graph, database)

    mongodb.update_pagerank_progress(client, task_id, 'uploading')
    pagerank.insert_pagerank_db(graph_pagerank, database)

    client.close()
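# Illustrative sketch: pagerank.build_graph() and pagerank.calculate_pagerank()
# are defined elsewhere. A common way to implement this step, assuming the
# link graph fits in memory, is networkx; this is an assumption about the
# approach, not the project's actual implementation.

def pagerank_sketch(edges, damping=0.85):
    """Build a directed link graph from (source, target) url-hash pairs and
    return a dict mapping each node to its PageRank score."""
    import networkx as nx

    graph = nx.DiGraph()
    graph.add_edges_from(edges)

    return nx.pagerank(graph, alpha=damping)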
def crawl_url(url, depth, crawler_settings, ignore_blacklist=False):
    try:
        client = mongodb.create_client()
        database = mongodb.get_database(crawler_settings.get('limit_domain'),
                                        client)

        allowed = limiter.is_crawl_allowed(
            url, database, crawler_settings.get('frequency_per_server'))

        if not allowed:
            mongodb.set_url_for_recrawl(database, url)
            client.close()
            return

        url, original_url, redirected, response = get_page(
            url,
            crawler_settings.get('connect_max_timeout'),
            crawler_settings.get('read_max_timeout'))
    except requests.exceptions.ReadTimeout:
        # It also removes the url from the queue and sets it as timeouted
        mongodb.set_timeout_url(database, url)
        log.warning('(Timeout) - ReadTimeout: {0}'.format(url))
        client.close()
    except requests.exceptions.ConnectionError:
        # It also removes the url from the queue and sets it as timeouted
        mongodb.set_timeout_url(database, url)
        log.warning('(Timeout) - ConnectionError: {0}'.format(url))
        client.close()
    except requests.exceptions.ChunkedEncodingError:
        # It also removes the url from the queue and sets it as timeouted
        mongodb.set_timeout_url(database, url)
        log.warning('(Timeout) - ChunkedEncodingError: {0}'.format(url))
        client.close()
    except Exception:
        mongodb.delete_url(database, url)
        log.exception('Exception: {0}'.format(url))
        client.close()
        raise
    else:
        _handle_response(database, url, original_url, redirected, response,
                         depth,
                         crawler_settings.get('max_depth'),
                         crawler_settings.get('limit_domain'),
                         crawler_settings.get('blacklist'),
                         ignore_blacklist)
        client.close()
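# Illustrative sketch: get_page() is defined elsewhere; crawl_url() passes it
# separate connect and read timeouts, which map onto requests' (connect, read)
# timeout tuple. The sketch below shows that mapping under the assumption
# that the function follows redirects and reports the final URL; it is not
# necessarily the project's implementation.

def get_page_sketch(url, connect_max_timeout, read_max_timeout):
    """Fetch a page and report whether the final URL differs from the
    requested one."""
    import requests

    response = requests.get(url,
                            timeout=(connect_max_timeout, read_max_timeout),
                            allow_redirects=True)
    redirected = response.url != url

    return response.url, url, redirected, response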
def indexer_task(crawler_settings, indexer_settings, task_id):
    from upol_search_engine.db import mongodb
    from upol_search_engine.db import postgresql
    import locale
    from celery.result import AsyncResult
    from celery.states import PENDING, STARTED, RECEIVED, SUCCESS
    import time

    locale.setlocale(locale.LC_ALL, 'cs_CZ.utf-8')

    mongodb_client = mongodb.create_client()
    mongodb_database = mongodb.get_database(
        crawler_settings.get('limit_domain'), mongodb_client)
    mongodb_batch_size = indexer_settings.get('batch_size')

    postgresql_client = postgresql.create_client()
    postgresql_cursor = postgresql_client.cursor()
    postgresql_table_name = indexer_settings.get('table_name')
    postgresql_table_name_production = indexer_settings.get(
        'table_name_production')
    postgresql_metadata_table_name = indexer_settings.get(
        'metadata_table_name')
    postgresql_metadata_table_name_production = indexer_settings.get(
        'metadata_table_name_production')

    # Test if postgresql table is ready
    # if (not postgresql.test_if_table_exists(
    #         postgresql_client, postgresql_cursor, postgresql_table_name)) or \
    #         (not postgresql.test_if_table_exists(
    #             postgresql_client, postgresql_cursor, 'metadata_tmp')):
    postgresql.reset_and_init_db(postgresql_client,
                                 postgresql_cursor,
                                 postgresql_table_name,
                                 postgresql_metadata_table_name)

    tasks_list = []

    while True:
        document_batch = mongodb.get_batch_of_ids_for_indexer(
            mongodb_database, mongodb_batch_size)

        document_batch = list(document_batch)

        if len(document_batch) == 0:
            break

        document_ids = []

        for document in document_batch:
            document_ids.append(document.get('representative'))

        if len(document_ids) > 0:
            mongodb.set_documents_as_indexed(mongodb_database, document_ids)

            counter = 0

            for document_id in document_ids:
                counter += 1
                tasks_list.append(index_document_task.delay(
                    document_id, task_id, crawler_settings, indexer_settings))

    # Wait until every dispatched indexing task has finished
    waiting = True

    while waiting:
        n_of_running = 0

        for task in tasks_list:
            state = AsyncResult(task.task_id).status

            if state == PENDING or state == STARTED or state == RECEIVED:
                n_of_running += 1

        if n_of_running == 0:
            waiting = False

            for task in tasks_list:
                state = AsyncResult(task.task_id).status

                if state != SUCCESS:
                    print(state)

        time.sleep(10)

    postgresql.change_table_to_production(postgresql_client,
                                          postgresql_cursor,
                                          postgresql_table_name,
                                          postgresql_table_name_production)

    postgresql.create_psql_index(postgresql_client,
                                 postgresql_cursor,
                                 postgresql_table_name_production,
                                 'search_index',
                                 'search_idx')

    postgresql.change_table_to_production(
        postgresql_client,
        postgresql_cursor,
        postgresql_metadata_table_name,
        postgresql_metadata_table_name_production)

    postgresql.create_psql_index(postgresql_client,
                                 postgresql_cursor,
                                 postgresql_metadata_table_name_production,
                                 'microformat_index',
                                 'microformat_idx')

    postgresql_cursor.close()
    postgresql_client.close()
    mongodb_client.close()
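# Illustrative sketch: postgresql.change_table_to_production() swaps the
# freshly built table in for the production one. A typical way to do that in
# PostgreSQL is to rename the tables inside one transaction; the sketch below
# is an assumption about the approach, not the project's implementation.

def change_table_to_production_sketch(client, cursor, tmp_table,
                                      production_table):
    """Replace the production table with the temporary one atomically."""
    from psycopg2 import sql

    cursor.execute(sql.SQL('DROP TABLE IF EXISTS {}').format(
        sql.Identifier(production_table)))
    cursor.execute(sql.SQL('ALTER TABLE {} RENAME TO {}').format(
        sql.Identifier(tmp_table), sql.Identifier(production_table)))
    client.commit()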
def index_document_task(document_id, task_id, crawler_settings,
                        indexer_settings):
    from upol_search_engine.db import mongodb
    from upol_search_engine.db import postgresql
    from upol_search_engine.upol_indexer import indexer
    from celery.utils.log import get_task_logger
    from upol_search_engine.upol_indexer import microformat
    from psycopg2 import IntegrityError
    import json
    import hashlib

    log = get_task_logger(__name__)

    mongodb_client = mongodb.create_client()
    mongodb_database = mongodb.get_database(
        crawler_settings.get('limit_domain'), mongodb_client)

    postgresql_client = postgresql.create_client()
    postgresql_cursor = postgresql_client.cursor()
    postgresql_table_name = indexer_settings.get('table_name')
    postgresql_table_name_production = indexer_settings.get(
        'table_name_production')
    postgresql_metadata_table_name = indexer_settings.get(
        'metadata_table_name')
    postgresql_metadata_table_name_production = indexer_settings.get(
        'metadata_table_name_production')

    try:
        document = mongodb.get_document_by_id(mongodb_database, document_id)

        indexed_rows = []
        copied_rows = []

        does_production_exists = postgresql.test_if_table_exists(
            postgresql_client,
            postgresql_cursor,
            postgresql_table_name_production)

        try:
            is_file = document.get('file')

            # Metadata
            if not is_file:
                soup = indexer.get_soup_from_document(document)
                metadata = microformat.find_microformat_on_page(soup)

                if metadata is not None:
                    parsed_metadata = microformat.parse_json(metadata)
                    metadata_hash = hashlib.sha1(
                        json.dumps(parsed_metadata,
                                   sort_keys=True).encode('utf-8')).hexdigest()

                    if microformat.validate_json_schema(parsed_metadata):
                        parsed_metadata, metadata_type = (
                            microformat.prepare_metadata_for_insert(
                                parsed_metadata))

                        try:
                            postgresql.insert_microformat(
                                postgresql_client,
                                postgresql_cursor,
                                json.dumps(parsed_metadata),
                                metadata_hash,
                                metadata_type,
                                postgresql_metadata_table_name)
                        except IntegrityError:
                            log.info('METADATA duplicity: {}'.format(
                                parsed_metadata))
                    else:
                        log.info('METADATA not valid: {}'.format(
                            document.get('url')))

            if does_production_exists:
                url_hash = document.get('_id')
                content_hash = document.get('content').get('hashes').get('text')

                production_document = postgresql.get_document_by_hash(
                    postgresql_client,
                    postgresql_cursor,
                    url_hash,
                    postgresql_table_name_production)
            else:
                production_document = None

            if (production_document is None or
                    production_document[10] != content_hash):
                if is_file:
                    log.info('INDEXER: Indexing document (file).')

                    # Index only pdf this time
                    if document.get('file_type') == 'pdf':
                        try:
                            row = indexer.prepare_one_file_for_index(
                                document,
                                crawler_settings.get('limit_domain'))
                        except Exception:
                            log.exception('Exception: {0}'.format(
                                document.get('url')))
                            row = None
                    else:
                        row = None
                else:
                    log.info('INDEXER: Indexing document.')
                    row = indexer.prepare_one_document_for_index(
                        document, soup, crawler_settings.get('limit_domain'))

                if row is not None:
                    indexed_rows.append(row)
            else:
                if is_file:
                    log.info('INDEXER: Copying document (file).')
                else:
                    log.info('INDEXER: Copying document.')

                copied_rows.append(production_document)
                postgresql.copy_row_from_table_to_table(
                    postgresql_client,
                    postgresql_cursor,
                    url_hash,
                    postgresql_table_name_production,
                    postgresql_table_name)
        except Exception:
            log.exception('Exception: {0}'.format(document.get('url')))

        if len(indexed_rows) > 0:
            postgresql.insert_rows_into_index(postgresql_client,
                                              postgresql_cursor,
                                              indexed_rows,
                                              postgresql_table_name)

        mongodb.update_indexer_progress(
            mongodb_client, task_id, len(indexed_rows) + len(copied_rows))
    except Exception:
        log.exception('Exception: INDEXER TASK POSSIBLE FAILURE')
    finally:
        postgresql_cursor.close()
        postgresql_client.close()
        mongodb_client.close()
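# Illustrative sketch: microformat.find_microformat_on_page() is defined
# elsewhere. Structured metadata is commonly embedded as JSON-LD inside a
# <script type="application/ld+json"> element; the sketch below shows that
# approach with BeautifulSoup and is an assumption, not the project's
# implementation.

def find_microformat_sketch(soup):
    """Return the raw JSON-LD metadata string from a parsed page, or None."""
    script = soup.find('script', attrs={'type': 'application/ld+json'})

    if script is None or script.string is None:
        return None

    return script.string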