def compute_vectors(self):
    """
    Compute the sentence vector representations and collect them into a matrix.

    :return: numpy matrix of sentence vector representations.
    """
    if not self.quiet:
        print("Computing vectorial representation...")

    vectors = np.zeros((1, 200, 134))
    embeddings = sentence_embeddings(self.sentences)
    centr_scores = centrality_scores(embeddings)
    tf_idfs = tf_idf(self.sentences, os.getcwd() + "/data/idfs.dat")

    # Features: position score, relative sentence length score, tf-idf, numerical data, centrality, title similarity.
    for j in range(min(len(self.sentences), 200)):
        sent = self.sentences[j]
        position_score = (len(self.sentences) - j) / len(self.sentences)
        length_score = len(sent) / max(len(snt) for snt in self.sentences)
        tf_idf_score = 0
        numerical_score = 0
        centrality_score = centr_scores[j]
        title_sim_score = np.inner(np.array(embeddings[j]), np.array(embeddings[-1]))

        # Computing the tf-idf and numerical scores over the sentence terms.
        terms = list(set(stem_and_stopword(sent)))
        for term in terms:
            # Due to preprocessing errors a term may not be present in the tf-idf dictionary.
            if term in tf_idfs.keys():
                tf_idf_score += tf_idfs[term]
            if term.isdigit():
                numerical_score += 1

        # Preprocessing errors may leave zero terms, so the division by zero must be avoided.
        if len(terms):
            tf_idf_score /= len(terms)
        else:
            tf_idf_score = 0

        vectors[0, j, :] = np.append(
            np.array([position_score, length_score, tf_idf_score,
                      numerical_score, centrality_score, title_sim_score]),
            np.array(embeddings[j]))

    return vectors
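
# Usage sketch (hypothetical): each 134-dimensional row built above is six handcrafted scores
# (position, length, tf-idf, numerical data, centrality, title similarity) followed by the
# 128-dimensional sentence embedding, matching the np.zeros((1, 200, 134)) allocation.
# `summarizer` is a placeholder for an instance of the class owning compute_vectors();
# it is not defined in this snippet.
doc_matrix = summarizer.compute_vectors()        # shape (1, 200, 134)
handcrafted_scores = doc_matrix[0, :, :6]        # the six handcrafted features per sentence
sentence_embedding_part = doc_matrix[0, :, 6:]   # the 128-dimensional embedding per sentence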
def store_full_sentence_matrices(index, ref):
    """
    Store the sentence matrices for the extractive summarization task.
    """
    if index < 0:
        docs, references, _ = get_duc()
        doc_path = "/dataset/duc/duc_doc_sent_matrix.dat"
        ref_path = "/dataset/duc/duc_ref_sent_matrix.dat"
    else:
        docs_pas_lists, refs_pas_lists = get_pas_lists(index)
        docs = get_sources_from_pas_lists(docs_pas_lists)
        references = get_sources_from_pas_lists(refs_pas_lists)
        dataset_path = "/dataset/nyt/" + str(index) + "/nyt" + str(index)
        doc_path = dataset_path + "_doc_sent_matrix.dat"
        ref_path = dataset_path + "_ref_sent_matrix.dat"

    docs_no = len(docs)    # First dimension: number of documents.
    max_sent_no = 200      # Second dimension: max document length (sparse), fixed in the case of nyt.
    sent_vec_len = 134     # Third dimension: size of the sentence vector representation.

    # The matrices are initialized as zeros, then filled in with a vector for each document sentence.
    refs_3d_matrix = np.zeros((docs_no, max_sent_no, sent_vec_len))
    docs_3d_matrix = np.zeros((docs_no, max_sent_no, sent_vec_len))

    # For each document the sentence list is extracted after cleaning and tokenizing the text.
    if ref:
        doc_list = references
    else:
        doc_list = docs

    for i in range(len(doc_list)):
        doc = doc_list[i]
        print("Processing doc " + str(i) + "/" + str(len(docs)))
        doc = text_cleanup(doc)
        # Splitting sentences (by dot).
        sentences = tokens(doc)

        embeddings = sentence_embeddings(sentences)
        centr_scores = centrality_scores(embeddings)
        tf_idfs = tf_idf(sentences, os.getcwd() + "/dataset/duc/duc_idfs.dat")

        # Features: position score, relative sentence length score, tf-idf, numerical data, centrality, title similarity.
        # Truncating to max_sent_no sentences to avoid overflowing the matrix.
        for j in range(min(len(sentences), max_sent_no)):
            sent = sentences[j]
            position_score = (len(sentences) - j) / len(sentences)
            length_score = len(sent) / max(len(snt) for snt in sentences)
            tf_idf_score = 0
            numerical_score = 0
            centrality_score = centr_scores[j]
            title_sim_score = np.inner(np.array(embeddings[j]), np.array(embeddings[-1]))

            # Computing the tf-idf and numerical scores over the sentence terms.
            terms = list(set(stem_and_stopword(sent)))
            for term in terms:
                # Due to preprocessing errors a term may not be present in the tf-idf dictionary.
                if term in tf_idfs.keys():
                    tf_idf_score += tf_idfs[term]
                if term.isdigit():
                    numerical_score += 1

            # Preprocessing errors may leave zero terms, so the division by zero must be avoided.
            if len(terms):
                tf_idf_score /= len(terms)
            else:
                tf_idf_score = 0

            features = np.array([position_score, length_score, tf_idf_score,
                                 numerical_score, centrality_score, title_sim_score])
            if ref:
                refs_3d_matrix[i, j, :] = np.append(features, embeddings[j])
            else:
                docs_3d_matrix[i, j, :] = np.append(features, embeddings[j])

    # Storing the matrix in the appropriate file, depending on whether references or documents were processed.
    if ref:
        with open(os.getcwd() + ref_path, "wb") as dest_f:
            pickle.dump(refs_3d_matrix, dest_f)
    else:
        with open(os.getcwd() + doc_path, "wb") as dest_f:
            pickle.dump(docs_3d_matrix, dest_f)
def get_important_words(corpus, text):
    d = tf_idf(corpus, text)[0]['stats']
    return sorted(d, key=d.get, reverse=True)[:max_words]
def main():
    fileTotalManager = base.FileTotalManager('./file_lengths.json')

    # Load the csv data into memory.
    print('loading csv data into memory...')
    genome_tags = base.load_data('../data/genome-tags.csv', base.tags_adapter,
                                 fileTotalManager.getFileTotal('genome-tags.csv'))
    movies_info = base.load_data('../data/mlmovies.csv', base.movie_info_adapter,
                                 fileTotalManager.getFileTotal('mlmovies.csv'))
    ratings_info = base.load_data('../data/mlratings.csv', base.RatingInfo,
                                  fileTotalManager.getFileTotal('mlratings.csv'))
    tags_info = base.load_data('../data/mltags.csv', base.TagInfo,
                               fileTotalManager.getFileTotal('mltags.csv'))
    print('loading completed!')
    # print(movie_actor[0].keys(), mltags[0].keys(), tags[0].keys(), mlmovies[0].keys(), mlusers[0].keys())

    print('preprocessing data...')
    # Conversion.
    min_ts, max_ts = base.convert_timestamp(tags_info, 'timestamp')
    # base.convert_timestamp(ratings_info, 'timestamp')
    genome_tags = {k['tagId']: k['tag'] for k in genome_tags}
    # movie_actor_list = base.get_moive_actor_list(movie_actor)
    # genres_movie_list, min_yr, max_yr = base.get_genre_movies_list(movies_info)
    movie_names = {k['movieid']: k['moviename'] for k in movies_info}
    # actor_names = {k['id']: k['name'] for k in actor_info}

    def tfidf_tag_weight(mr, ts):
        return (1.0 / mr) * (ts - min_ts + 1) / (max_ts - min_ts + 1)

    def no_weight(mr, ts):
        return 1

    print('building vectors')
    # actor_tags_vector
    # actors_tags_vector = base.actor_tag_vector(movie_actor, tags_info, no_weight)[1]
    # actors_idf, actors_tfidf_tags_vector = base.actor_tag_vector(movie_actor, tags_info, tfidf_tag_weight)
    # actors_idf = base.idf(actors_tfidf_tags_vector, actors_idf)
    # for actor in actors_tfidf_tags_vector.keys():
    #     actors_tfidf_tags_vector[actor] = base.tf_idf(actors_tfidf_tags_vector[actor], actors_idf, 'tf-idf')

    # movie_tags_vector
    print('Building standard movie-tag vector')
    movies_tags_vector = base.movie_tag_vector(movies_info, tags_info, no_weight)[1]
    print('\nBuilding tf-idf movie-tag vector')
    movies_idf, movies_tfidf_tags_vector = base.movie_tag_vector(
        movies_info, tags_info, tfidf_tag_weight)
    movies_idf = base.idf(movies_tfidf_tags_vector, movies_idf)
    for i, movie in enumerate(movies_tfidf_tags_vector.keys()):
        movies_tfidf_tags_vector[movie] = base.tf_idf(
            movies_tfidf_tags_vector[movie], movies_idf, 'tf-idf')

    # movie_actors_vector
    # movies_actors_vector = base.movie_actor_vector(movies_info, movie_actor, no_weight)[1]
    # movies_actor_idf, movies_tfidf_actors_vector = base.movie_actor_vector(movies_info, movie_actor, tfidf_actor_weight)
    # movies_actor_idf = base.idf(movies_tfidf_actors_vector, movies_actor_idf)
    # for movie in movies_tfidf_actors_vector.keys():
    #     movies_tfidf_actors_vector[movie] = base.tf_idf(movies_tfidf_actors_vector[movie], movies_actor_idf, 'tf-idf')

    # Create the actor-actor matrix.
    # actor_actor_similarity, actors_list, actors_index = build_actor_actor_matrix(actors_tfidf_tags_vector)
    # Create the coactor-coactor matrix.
    # coactor_coactor_matrix, coactors_list, coactors_index = build_coactor_coactor_matrix(movie_actor)
    # print('building AMY tensor')
    # Create the Actor-Movie-Year tensor (AMY tensor).
    # actor_movie_year_tensor, amy_tensor_info = build_actor_movie_year_tensor(movie_actor, movies_info)

    print('\nbuilding TMR tensor')
    # Create the Tag-Movie-Rating tensor (TMR tensor).
    tag_movie_rating, tmr_tensor_info = build_tag_movie_rating_tensor(
        genome_tags.keys(), ratings_info)

    print('creating list')
    # Create the watched-movies list per user.
    users_watched_movies = base.get_users_watched_movies(tags_info, ratings_info)

    # Create the watched-movies info.
    # watched_movies_info = base.get_moives_related_info(movies_info, ratings_info, movie_actor)
    print('preprocessing completed!')

    while True:
        command_line = input('query>')
        commands = command_line.split(' ')
        relevance_feedback = None

        if len(commands) > 0 and 'p3_task1' in commands[0]:
            if len(commands) == 3:
                if commands[2] == 'pf':
                    relevance_feedback = gen_prob_feedback_function(movies_tags_vector)
                else:
                    if not (commands[2] == 'PCA' or commands[2] == 'SVD'):
                        help()
                        continue
            elif len(commands) == 4:
                if commands[3] == 'pf':
                    relevance_feedback = gen_prob_feedback_function(movies_tags_vector)
                else:
                    help()
                    continue
            WeightConstants.initialize(movie_names, tags_info, ratings_info)

        if commands[0] == 'p3_task1a' and len(commands) > 2:
            user_id = int(commands[1])
            similarities = recommender_system_using_svd_pca(
                user_id, users_watched_movies, movies_tfidf_tags_vector,
                genome_tags, commands[2])
            print_output_using(user_id, similarities, relevance_feedback)
        elif commands[0] == 'p3_task1b' and len(commands) > 1:
            user_id = int(commands[1])
            similarities = recommender_system_using_lda(
                user_id, users_watched_movies, movies_tags_vector, genome_tags)
            print_output_using(user_id, similarities, relevance_feedback)
        elif commands[0] == 'p3_task1c' and len(commands) > 1:
            user_id = int(commands[1])
            similarities = recommender_system_using_cp(
                user_id, users_watched_movies, movies_tags_vector,
                tag_movie_rating, tmr_tensor_info, genome_tags)
            print_output_using(user_id, similarities, relevance_feedback)
        elif commands[0] == 'p3_task1d' and len(commands) > 1:
            user_id = int(commands[1])
            similarities = recommender_system_using_ppr(
                user_id, users_watched_movies, movies_tfidf_tags_vector)
            print_output_using(user_id, similarities, relevance_feedback)
        elif commands[0] == 'p3_task1e' and len(commands) > 1:
            user_id = int(commands[1])
            similarities = recommender_system_combining_all(
                user_id, users_watched_movies, movies_tfidf_tags_vector,
                movies_tags_vector, tag_movie_rating, tmr_tensor_info, genome_tags)
            print_output_using(user_id, similarities, relevance_feedback)
        elif commands[0] == 'p3_task3' and len(commands) == 3:
            lsh_indexing(genome_tags, movie_names, movies_tags_vector,
                         int(commands[1]), int(commands[2]))
        elif commands[0] == 'p3_task5' and len(commands) > 1:
            labelled_movies = {}
            n = int(input("Enter number of labels: "))
            while n > 0:
                label = input("Enter label: ")
                movie_data = input("Enter space separated movies for label " + label + ": ")
                movies = movie_data.split(" ")
                for i, m in enumerate(movies):
                    movies[i] = int(m)
                labelled_movies[label] = movies
                n -= 1
            if commands[1] == 'NN' and len(commands) > 2:
                recommender_system_for_labeling_movies(
                    movies_info, labelled_movies, genome_tags,
                    movies_tfidf_tags_vector, commands[1], int(commands[2]))
            elif commands[1] == 'SVM' or commands[1] == 'DT':
                recommender_system_for_labeling_movies(
                    movies_info, labelled_movies, genome_tags,
                    movies_tfidf_tags_vector, commands[1], 0)
        elif len(commands) > 1 and (commands[0] == 'reset' and commands[1] == 'wc'):
            WeightConstants.reset()
            print("WeightConstants data has been purged")
        else:
            help()
def extract_pas(sentences):
    """
    Extracts the PASs from a list of sentences.

    :param sentences: sentences from which to extract the PASs.
    :return: the list of extracted PASs.
    """
    # Compute the tf-idf value of every term in the document.
    tf_idfs = tf_idf(sentences, os.getcwd() + "/data/idfs.dat")
    # The longest sentence length is needed afterwards for the length score.
    longest_sent_len = max(len(sent) for sent in sentences)

    pas_list = []
    for sent in sentences:
        # Ignoring sentences that are too short or too long (usually errors).
        if 3 < len(remove_punct(sent)) and len(sent) < 1000:
            sent_index = sentences.index(sent)
            # Substituting single quotes with double quotes to avoid errors with the SRL.
            sent = re.sub(r"\'([a-zA-Z0-9])([a-zA-Z0-9 ]+)([a-zA-Z0-9])\'", r'" \1\2\3 "', sent)

            annotations = _annotator.get_annoations(remove_punct(sent).split())
            # Getting the SRL annotations from SENNA.
            sent_srl = annotations['srl']
            # Getting the POS tags from SENNA.
            parts_of_speech = annotations['pos']

            for raw_pas in sent_srl:
                accept_pas = True
                out_of_order = 0
                chk_sent = remove_punct(sent)
                # Rejecting PASs whose arguments change order w.r.t. the original sentence;
                # these represent about 10% of the total PASs, and about 80% of them are incorrect.
                for arg in raw_pas.values():
                    # Collapsing double spaces into a single space to avoid some arguments being ignored.
                    arg = remove_punct(arg.replace("  ", " "))
                    if chk_sent.find(arg) < 0:
                        accept_pas = False
                        out_of_order = 1
                        break

                if accept_pas:
                    pas = Pas(sent, parts_of_speech, sent_index,
                              sent_srl.index(raw_pas), raw_pas, out_of_order)
                    pas_list.append(pas)

    # Completing each PAS with its realization, embeddings and vector representation.
    # This is done after the initialization as all the other PASs are needed.
    realized_pass = []
    for pas in pas_list:
        realized_pass.append(realize_pas(pas))

    # The title is embedded together with the realized PASs to avoid starting another embedding process.
    realized_pass.append(sentences[0])
    pas_embeddings = sentence_embeddings(realized_pass)
    # Get the centrality scores for the PAS embeddings.
    pas_centralities = centrality_scores(pas_embeddings)

    for pas in pas_list:
        pas_index = pas_list.index(pas)
        pas.complete_pas(
            realized_pass[pas_index],
            pas_embeddings[pas_index],
            len(sentences),
            longest_sent_len,
            tf_idfs,
            pas_centralities[pas_index],
            np.inner(np.array(pas_embeddings[pas_index]), np.array(pas_embeddings[-1])))

    return pas_list
#!python3
import text as text_parser
import utils
import xlsxwriter
import re

workbook = xlsxwriter.Workbook('tf-idf.xlsx')

print('Reading texts...')
all_texts = text_parser.get_text_corpus(9999, 'texts/news')

print('Done! Computing TF-IDF ranks...')
all_ranks = utils.tf_idf(all_texts)

print('\nDone! Writing results...')
text_no = 0
for dictionary in all_ranks:
    text_no += 1
    print('Writing worksheet ' + str(text_no) + '/' + str(len(all_ranks)) +
          ' (' + dictionary['title'][:10] + '...)', end='\r')

    # Excel sheet names cannot contain []:*?/\ and are limited to 31 characters.
    sheet_name = str(text_no) + ' ' + re.sub(r'[\[\]:*?/\\]', '', dictionary['title'][:-1])
    worksheet = workbook.add_worksheet(sheet_name[:28] + ('...' if len(sheet_name) > 28 else ''))

    worksheet.write(0, 0, dictionary['title'])
    worksheet.write(1, 0, '#')
    worksheet.write(1, 1, 'Rank')
    worksheet.write(1, 2, 'Word')

    row = 2
    sorted_dict = sorted(dictionary['stats'].items(), key=lambda x: (x[1], x[0]), reverse=True)
if __name__ == "__main__":
    path = 'data/Data.csv'
    tokenized_docs, ids = read_corpus(file_path=path, has_tag=False, has_id=True)
    vocab_docs = list(Counter(chain(*tokenized_docs)).keys())

    vocab, word2id, vectors = None, None, None
    if not os.path.exists('word_vectors.npy'):
        word2vec_model = KeyedVectors.load_word2vec_format(
            './GoogleNews-vectors-negative300.bin', binary=True)
        save_word2vec_vocab(vocab_docs=vocab_docs, word2vec_model=word2vec_model)
    else:
        vocab, word2id, vectors = load_word2vec_vocab()

    tf_idf_matrix = tf_idf(tokenized_docs, vocab, word2id)
    doc_vecs = get_document_vectors_word2vec(tokenized_docs, tf_idf_matrix, word2id, vectors)

    # model = kmeans(X_train=doc_vecs[:4000], n_clusters=4)
    # doc_vecs = get_document_vectors_tfidf()
    # model = kmeans(X_train=doc_vecs[:4000], n_clusters=4)
    # predictions = model.predict(doc_vecs[4000:])
    # model = gaussian_mixture_model(doc_vecs[:4000], n_components=4)
    # print(model.predict(doc_vecs[4000:]))
    print(hierarchical_clustering(X=doc_vecs[:4000], n_clusters=10))
rank_threshold = 0.09  # filter out less relevant words, ranked in the relative range 0..1

# tf-idf rank per word and the maximum rank seen, needed for the relative ranking below.
ranks = {}
max_rank = 0
commons = set()
line = []
text = []

with open('tf_df_output.txt') as f:
    content = f.readlines()
with open('files_count.txt') as f:
    documents = int(f.read())
with open('file_name.txt') as f:
    with open(f.read().strip()) as f2:
        text = utils.get_normalized_words(f2.read())

tf_df_word = [x.strip().split() for x in content]
for [tf, df, word] in tf_df_word:
    ranks[word] = utils.tf_idf(int(tf), int(df), documents)
    if ranks[word] > max_rank:
        max_rank = ranks[word]

for word in text:
    rank = ranks[word] / max_rank
    line.append(rank)
    if rank < rank_threshold:
        commons.add(word)

all_words = set(text) - commons


def add_to_hvg(w1, w2):
    if w1 > w2:
        temp = w1
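
# For reference, utils.tf_idf above takes a raw term frequency, a document frequency and the
# total document count. Its exact formula is not shown in this snippet; the sketch below shows
# the standard weighting it presumably approximates (an assumption, not the project's actual code).
import math

def tf_idf_sketch(tf, df, documents):
    # Classic tf * idf with the usual log-scaled inverse document frequency.
    return tf * math.log(documents / df)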
def vectorize(self):
    docs = [self.tokenizer(t.encode('utf-8')) for t in self.texts]
    return utils.tf_idf(docs, normalize=True)
exit(1)

print('Reading all texts from /texts/news...')
corpus = get_text_corpus(root_dir=os.path.normpath('texts/news'))

main_text = None
for text in corpus:
    if text['filename'] == file_path:
        main_text = text

if not main_text:
    print('A mystery duck found')
    exit(1)

word_ranking = tf_idf(corpus, main_text)[0]

workbook = xlsxwriter.Workbook('article-rank.xlsx')
worksheet = workbook.add_worksheet(
    re.sub(r'[\[\]:*?/\\]', '', main_text['title'][0:28]))

worksheet.write(0, 0, main_text['title'])
worksheet.write(1, 0, '#')
worksheet.write(1, 1, 'Rank')
worksheet.write(1, 2, 'Word')

row = 2
for word in main_text['text']:
    # Rank in column 1 and word in column 2, matching the header row.
    worksheet.write(row, 0, row - 1)
    worksheet.write(row, 1, word_ranking['stats'][word] / word_ranking['max_rank'])
    worksheet.write(row, 2, word)
    row += 1

workbook.close()
textds = TextDataSet(data_path='data/')
X_train, X_valid, X_test, y_train, y_valid, y_test = textds.generate_text_dataset(
    train_size=500, valid_size=200, test_size=500)
print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)
print(X_test.shape, y_test.shape)
# print(textds.embedding_from_text(['I love coffee']).shape)

# %%
# One feature variant per distance metric: binarised counts for hamming, raw counts for
# euclidean and tf-idf weights for cosine.
knn_ds = {
    'hamming': ((X_train > 0).astype('float'), (X_valid > 0).astype('float')),
    'euclidean': (X_train, X_valid),
    'cosine': (tf_idf(X_train, alpha=1e-6, beta=1e-9), tf_idf(X_valid, alpha=1e-6, beta=1e-9)),
}

best_acc = 0
best_metric = None
best_k = 0
for metric, (X_train_, X_valid_) in knn_ds.items():
    for k in [1, 3, 5]:
        clf = TextClassifier(Knn(n_neighbors=k, metric=metric))
        clf.fit(X_train_, y_train)
        acc = clf.score(X_valid_, y_valid)
        print(metric, k, round(acc * 100, 2), sep=', ')
        if acc > best_acc:
            best_acc = acc
            best_metric = metric
            best_k = k
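
# The tf_idf(X, alpha=..., beta=...) call above weights a document-term count matrix before the
# cosine-distance kNN run. The real implementation is not shown here; the sketch below assumes
# alpha and beta are small smoothing constants for the term frequencies and document frequencies
# respectively (a hypothetical stand-in, not the project's actual code).
import numpy as np

def tf_idf_matrix_sketch(X, alpha=1e-6, beta=1e-9):
    # Row-normalised term frequencies with additive smoothing.
    tf = (X + alpha) / (X.sum(axis=1, keepdims=True) + alpha * X.shape[1])
    # Smoothed inverse document frequency over the corpus.
    df = (X > 0).sum(axis=0)
    idf = np.log(X.shape[0] / (df + beta))
    return tf * idf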