def add_web_page_summary(self, web_page_summary, output_directory_name):
    """Resolves web page links, then indexes and writes the web page summary
    only if the resolved requested url is not already in the index."""
    # resolve requested url
    requested_url = web_page_summary['requested_url']
    resolved_requested_url = self.url_resolver.resolve(requested_url)

    # if the resolved requested url is already in the index, do nothing
    if resolved_requested_url in self.url_id_index:
        return

    # add new url to index
    self.url_id_index.add(resolved_requested_url)

    # if not in index, resolve all web page links and write to file
    resolved_normalized_a_hrefs = self.url_resolver.resolve_list(
        web_page_summary['normalized_a_hrefs'])
    resolved_normalized_img_srcs = self.url_resolver.resolve_list(
        web_page_summary['normalized_img_srcs'])

    # copy the web page summary, then add the resolved links
    # and the url id key/value pair before writing to file
    written_web_page_summary = web_page_summary.copy()
    written_web_page_summary['id'] = self.url_id_index[resolved_requested_url]
    written_web_page_summary['resolved_requested_url'] = resolved_requested_url
    written_web_page_summary['resolved_normalized_a_hrefs'] = resolved_normalized_a_hrefs
    written_web_page_summary['resolved_normalized_img_srcs'] = resolved_normalized_img_srcs

    # write file
    file_io.save('web_page_summary_file_path', written_web_page_summary,
                 [output_directory_name, written_web_page_summary['id']])
def save_term_frequency_dictionary(self, term_frequency_dictionary, content_hash, output_directory_name):
    # add new document hash to index
    self.hash_id_index.add(content_hash)
    document_id = self.hash_id_index[content_hash]

    # write file
    file_io.save('document_frequency_dict_file_path', term_frequency_dictionary,
                 [output_directory_name, document_id])
def write_log_file(self):
    self.log_info_dict = {}
    self.log_info_dict['seed_url'] = self.seed_url
    self.log_info_dict['max_urls_to_index'] = self.max_urls_to_index
    self.log_info_dict['robots_disallowed_paths'] = self.forbidden_urls

    # write file
    file_io.save('log_file', self.log_info_dict, [self.output_directory_name])
def save_indexes(self):
    logger.info("Saving Index Files")
    self.document_indexer.save_document_indexer()
    self.url_indexer.save_url_indexer()

    # save url frontier to file
    file_io.save('url_frontier_file', self.url_frontier.to_dict(), None)
def save_document_title_dictionary(self, document_title, content_hash, output_directory_name):
    # add new document hash to index
    self.hash_id_index.add(content_hash)
    document_id = self.hash_id_index[content_hash]

    # create document title dictionary with metadata
    document_title_dictionary = {
        'title': document_title['title'],
        'document_id': document_id,
        'content_hash': content_hash
    }

    logger.info("Saving Document Title, ID: %d" % document_id)

    # write file
    file_io.save('document_title_file_path', document_title_dictionary,
                 [output_directory_name, document_id])
def save_term_frequency_dictionary(self, term_frequency_dictionary, content_hash, output_directory_name):
    # add new document hash to index
    self.hash_id_index.add(content_hash)
    document_id = self.hash_id_index[content_hash]

    # add document id and content hash to the term frequency dictionary
    term_frequency_dictionary['document_id'] = document_id
    term_frequency_dictionary['content_hash'] = content_hash

    logger.info("Saving Document Term Frequency Dictionary, ID: %d" % document_id)

    # write file
    file_io.save('document_term_frequency_dictionary_file_path', term_frequency_dictionary,
                 [output_directory_name, document_id])
def save_url_indexer(self):
    # write files
    file_io.save('url_id_map_file', self.url_id_index.to_dict(), None)
    file_io.save('resolved_url_map_file', self.url_resolver.to_dict(), None)
    # map from content hash to every url known to point at the same content
    file_io.save('hash_url_list_map_file', self.hash_url_list_map, None)
def save_leader_follower_dictionary(doc_freq_matrix_dataFrame):
    logger.info("Saving Leader Follower File...")
    leader_follower_docID_json = cluster_pruning_leader_follower_dict(
        doc_freq_matrix_dataFrame, to_json=True)

    # write file
    file_io.save('leader_follower_file_path', leader_follower_docID_json, None)
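# The helper cluster_pruning_leader_follower_dict is referenced above but not shown
# here. A minimal sketch of the standard cluster-pruning idea it presumably follows:
# pick roughly sqrt(N) documents as leaders and attach every remaining document to
# its most cosine-similar leader. The function name, the random leader choice, and
# the DataFrame layout (rows = documents, columns = terms) are assumptions for
# illustration, not the repository's actual implementation.
import json
import math

import numpy as np


def sketch_leader_follower_dict(doc_freq_matrix_dataFrame, to_json=False, seed=0):
    """Illustrative only: map each leader doc id to the doc ids that follow it."""
    doc_ids = list(doc_freq_matrix_dataFrame.index)
    vectors = doc_freq_matrix_dataFrame.to_numpy(dtype=float)

    # choose ~sqrt(N) leader rows at random (assumed policy)
    rng = np.random.default_rng(seed)
    n_leaders = max(1, int(math.sqrt(len(doc_ids))))
    leader_rows = rng.choice(len(doc_ids), size=n_leaders, replace=False)

    def cosine(a, b):
        denom = (np.linalg.norm(a) * np.linalg.norm(b)) or 1.0
        return float(np.dot(a, b) / denom)

    # assign every non-leader to its nearest leader by cosine similarity
    leader_follower = {doc_ids[r]: [] for r in leader_rows}
    for row, doc_id in enumerate(doc_ids):
        if row in leader_rows:
            continue
        best_leader = max(leader_rows, key=lambda r: cosine(vectors[row], vectors[r]))
        leader_follower[doc_ids[best_leader]].append(doc_id)

    return json.dumps(leader_follower) if to_json else leader_follower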
def add_web_page_summary(self, web_page_summary, output_directory_name):
    """Resolves web page links, then indexes and writes the web page summary
    only if the resolved requested url is not already in the index."""
    # resolve requested url
    requested_url = web_page_summary['requested_url']
    resolved_requested_url = self.url_resolver.resolve(requested_url)

    # if the resolved requested url is already in the index, do nothing
    if resolved_requested_url in self.url_id_index:
        return

    logger.info("Adding new URL to index: %s" % resolved_requested_url)

    # add new url to index
    self.url_id_index.add(resolved_requested_url)

    # if not in index, resolve all web page links (when present) before writing to file
    resolved_normalized_a_hrefs = []
    if 'normalized_a_hrefs' in web_page_summary:
        resolved_normalized_a_hrefs = self.url_resolver.resolve_list(
            web_page_summary['normalized_a_hrefs'])

    resolved_normalized_img_srcs = []
    if 'normalized_img_srcs' in web_page_summary:
        resolved_normalized_img_srcs = self.url_resolver.resolve_list(
            web_page_summary['normalized_img_srcs'])

    # copy the web page summary, then add the resolved links
    # and the url_id key/value pair before writing to file
    written_web_page_summary = web_page_summary.copy()
    written_web_page_summary['url_id'] = self.url_id_index[resolved_requested_url]
    written_web_page_summary['resolved_requested_url'] = resolved_requested_url
    written_web_page_summary['resolved_normalized_a_hrefs'] = resolved_normalized_a_hrefs
    written_web_page_summary['resolved_normalized_img_srcs'] = resolved_normalized_img_srcs

    # write file
    logger.info("Saving response summary")
    file_io.save(
        'web_page_access_log_and_metadata_file_path',
        written_web_page_summary,
        [output_directory_name, written_web_page_summary['url_id']])

    # keep track of pointer urls to the same content:
    # update hash_url_list_map with every url associated with this content hash
    if 'content_hash' in web_page_summary:
        content_hash = web_page_summary['content_hash']

        requested_url_list = []
        resolved_requested_url_list = []
        redirection_urls_list = []
        try:
            requested_url_list = [web_page_summary['requested_url']]
        except KeyError:
            pass
        try:
            resolved_requested_url_list = [
                web_page_summary['resolved_requested_url']
            ]
        except KeyError:
            pass
        try:
            redirection_urls_list = web_page_summary['redirect_history']  # already a list
        except KeyError:
            pass

        urls = requested_url_list + resolved_requested_url_list + redirection_urls_list
        if content_hash in self.hash_url_list_map:
            urls += self.hash_url_list_map[content_hash]
        self.hash_url_list_map[content_hash] = list(set(urls))
def save_document_indexer(self):
    # write file
    file_io.save('hash_id_map_file', self.hash_id_index.to_dict(), None)
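# hash_id_index above (and url_id_index elsewhere) are used through a small
# interface: add(key), "key in index", index[key], and to_dict(). The class below
# is a minimal sketch written only to make that assumed interface concrete; it is
# not the repository's actual index class.
class SketchIncrementalIdIndex:
    """Illustrative only: assigns the next integer id to each newly seen key."""

    def __init__(self):
        self._key_to_id = {}

    def add(self, key):
        # assign a new id only the first time a key is seen
        if key not in self._key_to_id:
            self._key_to_id[key] = len(self._key_to_id)

    def __contains__(self, key):
        return key in self._key_to_id

    def __getitem__(self, key):
        return self._key_to_id[key]

    def to_dict(self):
        # plain dict form, suitable for passing to file_io.save
        return dict(self._key_to_id)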
def build_matrices_and_maps(indexed_directory_name_list):
    output_directory_name = '_'.join(indexed_directory_name_list)

    # create term frequency dictionaries and find unique words
    combined_full_id_tf_dict = {}
    combined_title_id_tf_dict = {}
    for indexed_directory_name in indexed_directory_name_list:
        full_id_tf_dict = load_document_term_frequency_dictionaries(
            indexed_directory_name)
        title_id_tf_dict = load_title_document_id_term_frequency_dictionaries(
            indexed_directory_name)
        combined_full_id_tf_dict.update(full_id_tf_dict)
        combined_title_id_tf_dict.update(title_id_tf_dict)
    unique_words = find_all_unique_words(
        [combined_full_id_tf_dict, combined_title_id_tf_dict])

    # create full, title and leader document vector matrices and maps
    full_document_vector_matrix, docID2row, word2col = matrix_and_maps(
        combined_full_id_tf_dict, unique_words)
    title_document_vector_matrix, _, _ = matrix_and_maps(
        combined_title_id_tf_dict, unique_words)
    leader_document_vector_matrix, leader_row_2_cluster_indices, leader_row_2_cluster_ids = \
        cluster_pruning_matrix_and_maps(full_document_vector_matrix, docID2row)

    tfidf_matrix = build_tfidf_matrix(full_document_vector_matrix)
    tfidf_leader_document_vector_matrix, tfidf_leader_row_2_cluster_indices, tfidf_leader_row_2_cluster_ids = \
        cluster_pruning_matrix_and_maps(tfidf_matrix, docID2row)

    # save matrices
    file_io.save('full_document_vector_matrix_file_path',
                 full_document_vector_matrix, [output_directory_name],
                 output_type='numpy_array')
    file_io.save('title_document_vector_matrix_file_path',
                 title_document_vector_matrix, [output_directory_name],
                 output_type='numpy_array')
    file_io.save('leader_document_vector_matrix_file_path',
                 leader_document_vector_matrix, [output_directory_name],
                 output_type='numpy_array')
    file_io.save('tfidf_matrix_file_path', tfidf_matrix,
                 [output_directory_name], output_type='numpy_array')
    file_io.save('tfidf_leader_document_vector_matrix_file_path',
                 tfidf_leader_document_vector_matrix, [output_directory_name],
                 output_type='numpy_array')

    # save all maps in one file
    matrix_maps = {
        'tfidf_leader_row_2_cluster_indices': tfidf_leader_row_2_cluster_indices,
        'tfidf_leader_row_2_cluster_ids': tfidf_leader_row_2_cluster_ids,
        'leader_row_2_cluster_indices': leader_row_2_cluster_indices,
        'leader_row_2_cluster_ids': leader_row_2_cluster_ids,
        'docID2url': get_docID2url_map(),
        'row2docID': {v: k for k, v in docID2row.items()},
        'docID2row': docID2row,
        'col2word': {v: k for k, v in word2col.items()},
        'word2col': word2col
    }
    file_io.save('matrix_maps_file_path', matrix_maps,
                 [output_directory_name], output_type='pickle_dict')
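# matrix_and_maps and build_tfidf_matrix are called above but not shown here. The
# sketch below shows one plausible shape for them, assuming id_tf_dict maps
# document id -> {term: raw count}: build a dense document-term count matrix plus
# the docID2row / word2col maps, then weight the counts with a standard
# tf * log(N / df) scheme. The function names and details are assumptions for
# illustration, not the repository's actual code.
import numpy as np


def sketch_matrix_and_maps(id_tf_dict, unique_words):
    """Illustrative only: rows are documents, columns are words."""
    docID2row = {doc_id: row for row, doc_id in enumerate(sorted(id_tf_dict))}
    word2col = {word: col for col, word in enumerate(sorted(unique_words))}

    matrix = np.zeros((len(docID2row), len(word2col)))
    for doc_id, tf_dict in id_tf_dict.items():
        for word, count in tf_dict.items():
            if word in word2col:
                matrix[docID2row[doc_id], word2col[word]] = count
    return matrix, docID2row, word2col


def sketch_build_tfidf_matrix(document_vector_matrix):
    """Illustrative only: tf-idf weighting of a raw count matrix."""
    n_documents = document_vector_matrix.shape[0]
    # document frequency: number of documents containing each term (floor of 1 to avoid /0)
    document_frequency = np.maximum((document_vector_matrix > 0).sum(axis=0), 1)
    idf = np.log(n_documents / document_frequency)
    return document_vector_matrix * idf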