# Imports inferred from usage in this section; 'urls' and 'document' are
# project-local helper modules (an assumption about the repository layout).
from datetime import datetime

import gridfs
import pymongo
from bson.objectid import ObjectId
from langdetect import detect

import document
import urls


def exists_url(db, url):
    """Return True if the url already exists in the database"""
    url_hash = urls.hash(url)
    result = db['Urls'].find_one({'_id': url_hash})
    return result is not None
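# Usage sketch (assumed setup, not part of the source): every helper in this
# module takes a pymongo Database handle; the client URI and database name
# below are illustrative only.
#
#   client = pymongo.MongoClient('mongodb://localhost:27017')
#   db = client['CrawlerDb']
#   if not exists_url(db, 'https://example.com/'):
#       ...  # queue the url for crawling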
def set_canonical_group_to_alias(db, original_url, canonical_group):
    """If there was a redirect, set the canonical group on the original alias url"""
    modification = {'canonical_group': canonical_group}
    return db['Urls'].find_one_and_update({'_id': urls.hash(original_url)},
                                          {'$set': modification})
def set_visited_url(db, url, response, soup, noindex, original_url=None):
    """Try to set url as visited and update other important information"""
    url_hash = urls.hash(url)
    is_redirect, is_permanent_redirect = _determine_type_of_redirect(response)

    url_addition = {}

    # Remove script tags from soup before extracting text
    for script in soup('script'):
        script.extract()

    text = soup.getText(separator='\n')

    try:
        url_addition['language'] = detect(text)
    except Exception:
        # Fallback language
        url_addition['language'] = 'cs'

    # Pairing url with canonical group id
    text_hash = document.hash_document(
        document.extract_document_text_for_hash(soup))

    url_addition['canonical_group'] = get_or_create_canonical_group(
        db, text_hash)

    url_addition['visited'] = True
    url_addition['queued'] = False
    url_addition['indexed'] = False
    url_addition['noindex'] = noindex

    url_addition['progress.last_visited'] = str(datetime.utcnow())

    url_addition['content.binary'] = response.content
    url_addition['content.hashes.text'] = text_hash
    url_addition['content.encoding'] = response.encoding

    url_addition['response.elapsed'] = str(response.elapsed)
    url_addition['response.is_redirect'] = is_redirect
    url_addition['response.is_permanent_redirect'] = is_permanent_redirect
    url_addition['response.status_code'] = response.status_code
    url_addition['response.reason'] = response.reason

    url_addition = _format_response_header(response, url_addition)

    result = db['Urls'].find_one_and_update({'_id': url_hash},
                                            {'$set': url_addition})

    # If there was a redirect, set the canonical group on the original alias url
    if original_url is not None:
        set_canonical_group_to_alias(db, original_url,
                                     url_addition['canonical_group'])

    # If the update was successful, refresh the representative of the canonical group
    if result is not None:
        _update_representatives_of_canonical_groups(
            db, url_addition['canonical_group'])

    return result is not None
def set_url_for_recrawl(db, url):
    """Set url for recrawl later"""
    url_hash = urls.hash(url)
    result = db['Urls'].find_one_and_update(
        {'_id': url_hash}, {'$set': {
            'queued': False,
            'visited': False
        }})
    return result is not None
def batch_insert_pagerank_outlinks(db, from_url, to_urls):
    """Insert a batch of outlinks into the database"""
    url_documents = []

    for to_url in to_urls:
        to_url = to_url.get('url')

        url_object = {
            'from_hash': urls.hash(from_url),
            'to_hash': urls.hash(to_url)
        }

        url_documents.append(url_object)

    try:
        result = db['PageRank'].insert_many(url_documents, ordered=False)
    except pymongo.errors.BulkWriteError:
        result = None

    return result
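# Usage sketch (assumed caller, illustrative urls): to_urls is expected to be a
# list of dicts carrying a 'url' key, as produced by the link extraction step
# in _handle_response below.
#
#   outlinks = [{'url': 'https://example.com/a'}, {'url': 'https://example.com/b'}]
#   batch_insert_pagerank_outlinks(db, 'https://example.com/', outlinks)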
def set_timeout_url(db, url):
    """Try to set url as timed out"""
    url_hash = urls.hash(url)
    result = db['Urls'].find_one_and_update({'_id': url_hash}, {
        '$set': {
            'queued': False,
            'timeout.timeout': True,
            'timeout.last_timeout': str(datetime.utcnow())
        }
    })
    return result is not None
def set_alias_visited_url(db, url):
    """Mark url as a visited alias (the source of a redirect)"""
    url_hash = urls.hash(url)

    url_addition = {}
    url_addition['visited'] = True
    url_addition['queued'] = False
    url_addition['alias'] = True
    url_addition['progress.last_visited'] = str(datetime.utcnow())

    result = db['Urls'].find_one_and_update({'_id': url_hash},
                                            {'$set': url_addition})
    return result is not None
def set_visited_invalid_url(db, url, response, reason, is_file=False):
    """Mark url as visited but invalid, recording the reason"""
    url_hash = urls.hash(url)

    url_addition = {}
    url_addition['visited'] = True
    url_addition['queued'] = False
    url_addition['invalid'] = True
    url_addition['file'] = is_file
    url_addition['invalid_reason'] = reason
    url_addition['progress.last_visited'] = str(datetime.utcnow())

    result = db['Urls'].find_one_and_update({'_id': url_hash},
                                            {'$set': url_addition})
    return result is not None
def _prepare_url_object(url, visited, queued, depth):
    """Prepare url object before inserting into database"""
    url_object = {
        '_id': urls.hash(url),
        'url': url,
        'domain': urls.domain(url),
        'depth': depth,
        'visited': visited,
        'queued': queued,
        'alias': False,
        'invalid': False,
        'file': False,
        'progress': {
            'discovered': str(datetime.utcnow())
        }
    }
    return url_object
def select_representative_for_canonical_group(db, canonical_group):
    """Return the _id (hash) of the URL best suited as representative of the canonical group"""
    urls_representatives = db['Urls'].find({
        'canonical_group': ObjectId(canonical_group),
        'alias': False,
        'invalid': False
    })

    representatives = []
    for url in urls_representatives:
        representatives.append(url.get('url'))

    # Return hash of the shortest url
    return urls.hash(min(representatives, key=len))
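# Example (illustrative urls): if a canonical group contains
# 'https://example.com/page' and 'https://example.com/page?print=1',
# the shorter 'https://example.com/page' is chosen as the representative
# and its hash is returned.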
def _handle_response(database, url, original_url, redirected, response, depth,
                     max_depth, limit_domain, blacklist,
                     ignore_blacklist=False):
    """Handle a fetched response: redirects, file downloads and regular pages."""
    try:
        url_document = mongodb.get_url(database, url)
        regex = urls.generate_regex(limit_domain)

        # Redirect handling
        if original_url != url:
            log.info('Redirect: {1} (original: {0})'.format(original_url, url))

            # Check if redirected url is valid
            is_valid_redirect, reason = validator.validate(
                url, regex, blacklist)

            if (is_valid_redirect is False) and (
                    reason == 'UrlIsBlacklisted') and ignore_blacklist:
                is_valid_redirect = True

            if is_valid_redirect:
                mongodb.set_alias_visited_url(database, original_url)

                url_document = mongodb.get_url(database, url)

                if url_document is not None:
                    if url_document.get(
                            'visited') and not url_document.get('alias'):
                        canonical_group = url_document.get('canonical_group')
                        mongodb.set_canonical_group_to_alias(
                            database, original_url, canonical_group)

                        log.info(
                            'Already visited redirect: {0} (original: {1})'.
                            format(url, original_url))
                        return
                else:
                    if not urls.is_same_domain(url, original_url):
                        depth = max_depth

                    mongodb.insert_url(database, url, False, False, depth)
            else:
                mongodb.set_visited_invalid_url(database, original_url,
                                                response, "invalid_redirect")
                mongodb.delete_pagerank_edge_to(database,
                                                urls.hash(original_url))

                log.info('Not Valid Redirect: {0} (original: {1})'.format(
                    url, original_url))
                return
        else:
            # Check if url is already visited
            if url_document is not None:
                if url_document.get('visited'):
                    log.info('Already visited: {0}'.format(url))
                    return

        # File handling
        content_type = response.headers.get('Content-Type')
        if content_type is None:
            content_type = ''

        is_content_type_file = test_content_type_file(content_type)
        is_file_valid_type = test_file_valid_type(content_type)

        if is_content_type_file:
            if not is_file_valid_type:
                mongodb.delete_pagerank_edge_to(database, urls.hash(url))
                mongodb.set_visited_invalid_url(database, url, response,
                                                "invalid_file", True)
                log.info('Not valid file: {0}'.format(url))
                return
            else:
                if original_url != url:
                    mongodb.set_visited_file_url(database, url, response,
                                                 original_url)
                else:
                    mongodb.set_visited_file_url(database, url, response)

                log.info('Done (file) [{0}]: {1}'.format(response.reason, url))
        else:
            # Handle normal page
            soup = BeautifulSoup(response.content, 'html5lib')
            no_index = link_extractor.has_noindex(soup)

            validated_urls_on_page, not_valid_urls = link_extractor.validated_page_urls(
                soup, url, regex, blacklist)

            urls_for_insert = []

            for page_url in validated_urls_on_page:
                insert_url = {'url': page_url}

                if urls.is_same_domain(url, page_url):
                    if depth - 1 != 0:
                        insert_url['depth'] = depth - 1
                    else:
                        continue
                else:
                    insert_url['depth'] = max_depth

                urls_for_insert.append(insert_url)

            if len(urls_for_insert) > 0:
                mongodb.batch_insert_url(database, urls_for_insert, False,
                                         False)
                mongodb.batch_insert_pagerank_outlinks(database, url,
                                                       urls_for_insert)

            if original_url != url:
                mongodb.set_visited_url(database, url, response, soup,
                                        no_index, original_url)
            else:
                mongodb.set_visited_url(database, url, response, soup,
                                        no_index)

            log.info('Done [{0}]: {1}'.format(response.reason, url))

        return
    except Exception as e:
        mongodb.delete_url(database, url)
        log.exception('Exception: {0} {1}'.format(url, e))
        raise
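# Caller sketch (hypothetical, not part of the source): the handler assumes a
# requests.Response, with `url` as the final (possibly redirected) address and
# `original_url` as the address that was actually requested.
#
#   response = requests.get(queued_url, allow_redirects=True)
#   _handle_response(database, response.url, queued_url,
#                    bool(response.history), response, depth, max_depth,
#                    limit_domain, blacklist)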
def set_visited_file_url(db, url, response, original_url=None):
    """Save file into database and set it as visited"""
    content_type = response.headers.get('Content-Type')

    if 'application/pdf' in content_type:
        file_type = 'pdf'
    elif 'text/plain' in content_type:
        file_type = 'txt'
    else:
        file_type = None

    url_hash = urls.hash(url)
    is_redirect, is_permanent_redirect = _determine_type_of_redirect(response)

    url_addition = {}

    # Pairing url with canonical group id
    content_hash = urls.hash_document(response.content)
    url_addition['canonical_group'] = get_or_create_canonical_group(
        db, content_hash)

    url_addition['visited'] = True
    url_addition['queued'] = False
    url_addition['indexed'] = False
    url_addition['noindex'] = False
    url_addition['file'] = True
    url_addition['file_type'] = file_type

    url_addition['progress.last_visited'] = str(datetime.utcnow())

    # Store the binary content in GridFS and keep only its id in the document
    fs = gridfs.GridFS(db)
    file_id = fs.put(response.content)

    url_addition['content.binary'] = file_id
    url_addition['content.hashes.content'] = content_hash

    url_addition['response.elapsed'] = str(response.elapsed)
    url_addition['response.is_redirect'] = is_redirect
    url_addition['response.is_permanent_redirect'] = is_permanent_redirect
    url_addition['response.status_code'] = response.status_code
    url_addition['response.reason'] = response.reason

    url_addition = _format_response_header(response, url_addition)

    result = db['Urls'].find_one_and_update({'_id': url_hash},
                                            {'$set': url_addition})

    # If there was a redirect, set the canonical group on the original alias url
    if original_url is not None:
        set_canonical_group_to_alias(db, original_url,
                                     url_addition['canonical_group'])

    # If the update was successful, refresh the representative of the canonical group
    if result is not None:
        _update_representatives_of_canonical_groups(
            db, url_addition['canonical_group'])

    return result is not None
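# Retrieval sketch (not part of the source): the ObjectId stored in
# 'content.binary' can later be used to read the file back from GridFS, e.g.
#
#   fs = gridfs.GridFS(db)
#   data = fs.get(url_document['content']['binary']).read()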
def get_url(db, url):
    """Return the url document from db, or None if it does not exist"""
    document = db['Urls'].find_one({'_id': urls.hash(url)})
    return document
def delete_url(db, url):
    """Try to delete url from db, returns True in case of success"""
    result = db['Urls'].delete_one({'_id': urls.hash(url)})
    return result.deleted_count > 0