def get_docID2url_map():
    hash_id_map_file = file_io.get_path("hash_id_map_file", None, force=True)
    with open(hash_id_map_file) as json_data:
        hash_id_map = json.load(json_data)

    hash_url_list_map_file = file_io.get_path("hash_url_list_map_file", None, force=True)
    with open(hash_url_list_map_file) as json_data:
        hash_url_list_map = json.load(json_data)

    # take all urls
    # docID_url_map = {hash_id_map[hash]: hash_url_list_map[hash] for hash in hash_id_map.keys()}
    # take only first url
    docID_url_map = {hash_id_map[hash]: hash_url_list_map[hash][0] for hash in hash_id_map.keys()}
    return docID_url_map
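# A minimal sketch of the mapping logic above, using in-memory dicts in place of
# the two JSON files. The hashes and URLs below are invented examples, not real
# index data; _demo_docID2url_map is a hypothetical helper for illustration only.

def _demo_docID2url_map():
    hash_id_map = {"a3f9": 0, "b7c2": 1}  # content hash -> docID
    hash_url_list_map = {
        "a3f9": ["http://example.com/", "http://example.com/index.html"],
        "b7c2": ["http://example.com/about"],
    }  # content hash -> every URL that served that content
    # same comprehension as above: keep only the first URL per document
    docID_url_map = {hash_id_map[h]: hash_url_list_map[h][0] for h in hash_id_map}
    assert docID_url_map == {0: "http://example.com/", 1: "http://example.com/about"}
    return docID_url_map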
def get_title(self, docID):
    # TODO: merge title databases so the output directory is not needed
    title_path = file_io.get_path('document_title_file_path', [self.output_directory_name, docID])
    with open(title_path) as json_data:
        dtd = json.load(json_data)
    doc_title = dtd['title']
    return doc_title
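# The per-document title file read above is assumed to be a small JSON document
# with a "title" field. A toy parse (the contents are invented):

import json

def _demo_read_title():
    dtd = json.loads('{"title": "Example Page"}')
    return dtd['title']  # -> "Example Page"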
def load_indexes(self):
    logger.info("Loading Index Files")
    self.url_indexer.load_url_indexer()
    self.document_indexer.load_document_indexer()

    # load url frontier
    url_frontier_file_path = file_io.get_path('url_frontier_file', None)
    if url_frontier_file_path is not None:
        self.url_frontier.load(url_frontier_file_path)
def load_leader_follower_dictionary():
    logger.info("Loading Leader Follower File...")
    leader_follower_docID_dict_file_path = file_io.get_path('leader_follower_file_path', None)
    if leader_follower_docID_dict_file_path is not None:
        with open(leader_follower_docID_dict_file_path) as json_data:
            leader_follower_dict_json = json.load(json_data)
        # cast docID keys back to int (JSON object keys are always strings)
        leader_follower_dict = {int(k): v for k, v in leader_follower_dict_json.items()}
        return leader_follower_dict
    else:
        logger.error("Leader Follower File Not Found")
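# Why the int() cast above is needed: JSON object keys are always strings, so
# integer docIDs come back as "3", "5", etc. A self-contained illustration with
# toy data (_demo_leader_follower_key_cast is hypothetical, for illustration only):

import json

def _demo_leader_follower_key_cast():
    raw = json.loads('{"3": [7, 9], "5": [2]}')  # leader docID -> follower docIDs
    assert list(raw.keys()) == ["3", "5"]        # keys deserialize as strings
    cast = {int(k): v for k, v in raw.items()}
    assert cast == {3: [7, 9], 5: [2]}
    return cast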
def load_url_indexer(self):
    # load url_id_index
    url_indexer_file_path = file_io.get_path('url_id_map_file', None)
    if url_indexer_file_path is not None:
        self.url_id_index.load(url_indexer_file_path)

    # load url_resolver
    url_resolver_file_path = file_io.get_path('resolved_url_map_file', None)
    if url_resolver_file_path is not None:
        self.url_resolver.load(url_resolver_file_path)

    # load hash_url_list_map
    hash_url_list_map_path = file_io.get_path('hash_url_list_map_file', None)
    if hash_url_list_map_path is not None:
        with open(hash_url_list_map_path) as json_data:
            self.hash_url_list_map = json.load(json_data)
def get_document_term_frequency_matrix(indexed_directory_name, write=True):
    # if the document term frequency matrix has already been created, load and return it
    matrix_file = file_io.get_path("document_term_frequency_matrix_file_path", None, force=True)
    if os.path.isfile(matrix_file):
        logger.info("Accessing document term frequency matrix already in existence at: %s" % matrix_file)
        return pd.read_csv(matrix_file, index_col="docID")

    # otherwise, construct it from the document frequency json files
    id_tf_dict = load_document_frequency_dicts(indexed_directory_name)
    unique_words = set()
    for doc_id, tf_dict in id_tf_dict.items():
        unique_words = unique_words | set(tf_dict.keys())

    doc_freq_matrix = pd.DataFrame(columns=unique_words)
    doc_freq_matrix.index.name = "docID"
    for doc_id, tf_dict in id_tf_dict.items():
        terms, freqs = zip(*tf_dict.items())
        df = pd.DataFrame(data=[freqs], columns=terms, index=[int(doc_id)])
        doc_freq_matrix = pd.concat([doc_freq_matrix, df], join='outer')
    doc_freq_matrix = doc_freq_matrix.fillna(value=0)

    # sort rows by docID
    doc_freq_matrix = doc_freq_matrix.sort_index()

    # write to csv
    if write:
        logger.info("Writing Document Term Frequency Matrix")
        doc_freq_matrix.to_csv(matrix_file)
    return doc_freq_matrix
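# A toy run of the construction loop above, assuming two documents with small
# term-frequency dictionaries. The words and counts are invented for
# illustration; only pandas is required.

import pandas as pd

def _demo_term_frequency_matrix():
    id_tf_dict = {"0": {"apple": 2, "pie": 1}, "1": {"pie": 3, "crust": 1}}
    unique_words = set()
    for _, tf_dict in id_tf_dict.items():
        unique_words |= set(tf_dict.keys())
    matrix = pd.DataFrame(columns=list(unique_words))
    matrix.index.name = "docID"
    for doc_id, tf_dict in id_tf_dict.items():
        terms, freqs = zip(*tf_dict.items())
        row = pd.DataFrame(data=[freqs], columns=list(terms), index=[int(doc_id)])
        # concat takes the union of columns; missing terms become NaN
        matrix = pd.concat([matrix, row], join='outer')
    matrix = matrix.fillna(value=0).sort_index()
    # e.g. matrix.loc[0, "apple"] == 2 and matrix.loc[1, "apple"] == 0
    return matrix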
def load_matrices(self, matrix_names):
    if "leader_document_vector_matrix" in matrix_names:
        # load leader document vector matrix
        ldvm_file_path = file_io.get_path(
            "leader_document_vector_matrix_file_path", [self.output_directory_name])
        self.leader_document_vector_matrix = np.load(ldvm_file_path)

    if "title_document_vector_matrix" in matrix_names:
        # load title document vector matrix
        tdvm_file_path = file_io.get_path(
            "title_document_vector_matrix_file_path", [self.output_directory_name])
        self.title_document_vector_matrix = np.load(tdvm_file_path)

    if "full_document_vector_matrix" in matrix_names:
        # load full document vector matrix
        fdvm_file_path = file_io.get_path(
            "full_document_vector_matrix_file_path", [self.output_directory_name])
        self.full_document_vector_matrix = np.load(fdvm_file_path)

    if "tfidf_matrix" in matrix_names:
        # load tfidf matrix
        tfidf_matrix_file_path = file_io.get_path(
            "tfidf_matrix_file_path", [self.output_directory_name])
        self.tfidf_matrix = np.load(tfidf_matrix_file_path)

    if "tfidf_leader_document_vector_matrix" in matrix_names:
        # load tfidf leader document vector matrix
        tfidf_ldvm_file_path = file_io.get_path(
            "tfidf_leader_document_vector_matrix_file_path", [self.output_directory_name])
        self.tfidf_leader_document_vector_matrix = np.load(tfidf_ldvm_file_path)
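# The np.load() calls above assume the matrices were written with np.save().
# A minimal round-trip sketch; the path and _demo_matrix_round_trip are made-up
# examples, not one of the real *_file_path keys:

import numpy as np

def _demo_matrix_round_trip(path="/tmp/demo_matrix.npy"):
    matrix = np.arange(6, dtype=np.float64).reshape(2, 3)
    np.save(path, matrix)  # writes a .npy file that np.load can read back
    loaded = np.load(path)
    assert (loaded == matrix).all()
    return loaded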
def query_to_vector_slow(raw_query):
    # all that is needed is the word2col dictionary
    word2col_file_path = file_io.get_path('word2col_file_path', None)
    with open(word2col_file_path) as json_data:
        word2col = json.load(json_data)

    # create empty query vector
    query_vector = np.zeros(len(word2col))

    # tokenize query
    query_tokens = text_processing.plain_text_to_tokens(raw_query)

    # update term frequencies of query vector; skip out-of-vocabulary tokens,
    # which have no column in the matrix
    for token in query_tokens:
        if token in word2col:
            query_vector[word2col[token]] += 1
    return query_vector
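# A toy end-to-end run of the vectorization above, with a hard-coded word2col
# map and whitespace tokenization standing in for text_processing (both are
# simplifying assumptions; _demo_query_to_vector is illustrative only):

import numpy as np

def _demo_query_to_vector():
    word2col = {"apple": 0, "pie": 1, "crust": 2}
    query_tokens = "apple pie apple".split()
    query_vector = np.zeros(len(word2col))
    for token in query_tokens:
        if token in word2col:
            query_vector[word2col[token]] += 1
    assert query_vector.tolist() == [2.0, 1.0, 0.0]
    return query_vector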
def load_maps(self):
    # load matrix maps (loading without optimization loads all of them)
    matrix_maps_file_path = file_io.get_path("matrix_maps_file_path", [self.output_directory_name])
    with open(matrix_maps_file_path, 'rb') as pickle_file:
        self.matrix_maps = pickle.load(pickle_file)

    self.word2col = self.matrix_maps['word2col']  # query_to_vector
    self.col2word = self.matrix_maps['col2word']  # vector_to_tokens
    self.leader_row_2_cluster_indices = self.matrix_maps['leader_row_2_cluster_indices']  # cluster
    self.leader_row_2_cluster_ids = self.matrix_maps['leader_row_2_cluster_ids']  # cluster
    self.tfidf_leader_row_2_cluster_indices = self.matrix_maps['tfidf_leader_row_2_cluster_indices']  # tfidf cluster
    self.tfidf_leader_row_2_cluster_ids = self.matrix_maps['tfidf_leader_row_2_cluster_ids']  # tfidf cluster
    self.docID2url = self.matrix_maps['docID2url']  # cluster
    self.row2docID = self.matrix_maps['row2docID']
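# The pickle read above is assumed to hold a single dict of named maps. A
# minimal sketch of writing a compatible file; the keys mirror the ones read
# above, but the values and path are toy stand-ins:

import pickle

def _demo_write_matrix_maps(path="/tmp/demo_matrix_maps.pkl"):
    matrix_maps = {
        "word2col": {"apple": 0}, "col2word": {0: "apple"},
        "leader_row_2_cluster_indices": {0: [0, 1]}, "leader_row_2_cluster_ids": {0: [3, 5]},
        "tfidf_leader_row_2_cluster_indices": {0: [0]}, "tfidf_leader_row_2_cluster_ids": {0: [3]},
        "docID2url": {3: "http://example.com/"}, "row2docID": {0: 3},
    }
    with open(path, 'wb') as pickle_file:
        pickle.dump(matrix_maps, pickle_file)
    return path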
def get_document_term_frequency_matrix(indexed_directory_name, write=True):
    id_tf_dict = load_document_frequency_dicts(indexed_directory_name)
    unique_words = set()
    for doc_id, tf_dict in id_tf_dict.items():
        unique_words = unique_words | set(tf_dict.keys())

    doc_freq_matrix = pd.DataFrame(columns=unique_words)
    for doc_id, tf_dict in id_tf_dict.items():
        terms, freqs = zip(*tf_dict.items())
        df = pd.DataFrame(data=[freqs], columns=terms, index=[doc_id])
        # concatenate row-wise; pd.merge is wrong here, since there is no
        # shared key column to join the per-document rows on
        doc_freq_matrix = pd.concat([doc_freq_matrix, df], join='outer')
    doc_freq_matrix = doc_freq_matrix.fillna(value=0)

    # write to csv
    if write:
        logger.info("Writing Document Term Frequency Matrix")
        matrix_file = file_io.get_path(
            "document_term_frequency_matrix_file_path", None, force=True)
        doc_freq_matrix.to_csv(matrix_file)
    return doc_freq_matrix
def load_log_file(indexed_directory_name):
    log_file_path = file_io.get_path('log_file', [indexed_directory_name])
    if log_file_path is not None:
        with open(log_file_path) as json_data:
            log_info = json.load(json_data)
        return log_info
def load_document_indexer(self):
    document_indexer_file_path = file_io.get_path('hash_id_map_file', None)
    if document_indexer_file_path is not None:
        self.hash_id_index.load(document_indexer_file_path)