def get_word_embeddings(layer):
    path = io.get_word_embeddings_path(layer)
    exist(
        path,
        "No cached embeddings. Run the model first with the --plot option enabled.",
    )
    return io.read_csv(path, sep=",").to_numpy()
def analyze_doc_embeddings(dataset, path, id1, id2, filename):
    """Reduce the cached document embeddings to a low-dimensional space and
    plot them, highlighting the two documents given by id1 and id2."""
    embeddings = io.read_csv(path, sep=",")
    reduced_emb_doc = reduce_dimensions(embeddings)
    doc_labels = generate_doc_labels(embeddings, dataset)
    label1 = doc_labels[id1]
    label2 = doc_labels[id2]
    # assert label1 == label2
    visualize_highlight(
        reduced_emb_doc,
        id1,
        id2,
        label1,
        filename=filename,
        labels=doc_labels,
        colors=dataset_colors[dataset],
    )
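# reduce_dimensions and visualize_highlight are project helpers defined
# elsewhere in the repo. As a point of reference, a reduction helper like
# the one used above typically projects the embedding matrix down to two
# components for plotting; using PCA here is an assumption for
# illustration, not the repo's confirmed choice:
from sklearn.decomposition import PCA

def reduce_dimensions_pca(embeddings):
    """Illustrative stand-in: project (num_docs, dim) embeddings to 2D."""
    return PCA(n_components=2).fit_transform(embeddings.to_numpy())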
def analyze_word_embeddings(dataset, path, threshold, edge_type, best, n=10):
    """Map every vocabulary word to the label with the highest embedding
    score and, if `best` is set, cache the n top-scoring words per label."""
    global top_words_dict
    embeddings = io.read_csv(path, sep=",")
    embeddings_array = embeddings.to_numpy().tolist()
    unique_labels = sorted(
        {label.split("\t")[2] for label in file.get_labels(dataset)},
        reverse=True,
    )
    vocab = file.get_vocab(dataset)

    max_indices = []
    max_values = []
    all_words = []
    all_labels = []
    results_dict = {}
    for index, emb in enumerate(embeddings_array):
        array = np.array(emb)
        max_index = array.argmax()
        max_indices.append(max_index)
        max_values.append(array[max_index])
        all_words.append(vocab[index])
        all_labels.append(unique_labels[max_index])
        results_dict[index] = {
            "max_index": max_index,
            "max_value": array[max_index],
            "word": vocab[index],
            "label": unique_labels[max_index],
        }
    assert len(max_values) == len(max_indices) == len(all_words) == len(all_labels)

    results_df = pd.DataFrame.from_dict(results_dict, orient="index")
    # For each label, keep the n words with the highest max score.
    top_words = {}
    for u_label in unique_labels:
        largest = (
            results_df[results_df["label"] == u_label]
            .nlargest(n, columns="max_value")["word"]
            .tolist()
        )
        top_words[u_label] = largest
    # print(top_words)
    # key = f"{threshold}:{edge_type}"
    if best:
        top_words_dict[dataset][edge_type] = top_words
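# The per-row loop above can also be expressed with vectorized numpy/pandas
# operations. A minimal, self-contained sketch of the same argmax-and-top-n
# logic; top_words_per_label and the toy data in the example below are
# illustrative stand-ins, not the real embeddings, vocab, or label set:
import numpy as np
import pandas as pd

def top_words_per_label(embeddings, vocab, labels, n=10):
    """Assign each word to its argmax label and keep the n highest-scoring
    words per label. `embeddings` has shape (num_words, num_labels)."""
    arr = np.asarray(embeddings)
    scores = pd.DataFrame({
        "word": vocab,
        "label": [labels[i] for i in arr.argmax(axis=1)],
        "max_value": arr.max(axis=1),
    })
    return {
        label: group.nlargest(n, "max_value")["word"].tolist()
        for label, group in scores.groupby("label")
    }

# top_words_per_label([[0.9, 0.1], [0.2, 0.8]], ["cat", "dog"], ["A", "B"], n=1)
# -> {"A": ["cat"], "B": ["dog"]}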
def get_ordered_document_triples_metrics(edge_type, dataset=FLAGS.dataset):
    path = io.get_ordered_document_triples_metrics_path(edge_type, dataset)
    exist(path, "Ordered document triples metrics do not exist yet.")
    return io.read_csv(path, sep=",")


def get_eval_logs(dataset=FLAGS.dataset, version=FLAGS.version):
    path = io.get_eval_log_path(dataset, version)
    if not exists(path):
        return None
    return io.read_csv(path, sep=";")


def get_filtered_triples(dataset=FLAGS.dataset):
    return io.read_csv(io.get_filtered_word_triples_path(dataset), sep=",")


def get_all_relations():
    path = io.get_all_wiki_relations_path()
    exist(path, "Run `analyze_properties.py` to generate this file")
    return io.read_csv(path, sep="+")


def get_filtered_relations():
    path = io.get_filtered_wiki_relations_path()
    exist(path, "Run `analyze_properties.py` to generate this file")
    return io.read_csv(path, sep="\n")["ID"].tolist()


def get_doc2idf(dataset=FLAGS.dataset):
    return io.read_csv(io.get_doc2idf_path(dataset), sep=",")


def get_entity2id(dataset=FLAGS.dataset):
    return io.read_csv(io.get_entity2id_path(dataset), sep=",")