Example #1
def get_word_embeddings(layer):
    # Load the cached word embeddings for the given layer as a NumPy array;
    # fail early with a hint if the model has not produced them yet.
    path = io.get_word_embeddings_path(layer)
    exist(
        path,
        "No cached embeddings. Run the model first with the --plot option enabled."
    )
    return io.read_csv(path, sep=",").to_numpy()
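Several of these snippets guard file access with an exist(path, message) helper whose definition is not shown. A minimal sketch of what it might look like, assuming it simply aborts with the given message when the path is missing (the os.path.exists check and sys.exit call are assumptions, not code from the project):

import os
import sys

def exist(path, message):
    # Hypothetical sketch: abort with a helpful message if the file is absent.
    if not os.path.exists(path):
        sys.exit(message)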
Example #2
def analyze_doc_embeddings(dataset, path, id1, id2, filename):
    # Load the document embeddings and reduce their dimensionality for plotting.
    embeddings = io.read_csv(path, sep=",")
    reduced_emb_doc = reduce_dimensions(embeddings)

    # Labels of the two documents to highlight in the plot.
    doc_labels = generate_doc_labels(embeddings, dataset)
    label1 = doc_labels[id1]
    label2 = doc_labels[id2]
    # assert label1 == label2
    visualize_highlight(
        reduced_emb_doc,
        id1,
        id2,
        label1,
        filename=filename,
        labels=doc_labels,
        colors=dataset_colors[dataset],
    )
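reduce_dimensions and visualize_highlight are project helpers that are not included in these examples. As a rough sketch of the reduction step, assuming a plain 2-component PCA via scikit-learn (the project may equally use t-SNE or UMAP):

from sklearn.decomposition import PCA

def reduce_dimensions(embeddings, n_components=2):
    # Hypothetical sketch: project the embedding matrix down to n_components
    # coordinates suitable for a 2-D scatter plot.
    return PCA(n_components=n_components).fit_transform(embeddings.to_numpy())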
Example #3
def analyze_word_embeddings(dataset, path, threshold, edge_type, best, n=10):
    global top_words_dict
    embeddings = io.read_csv(path, sep=",")
    embeddings_array = embeddings.to_numpy()
    # Each embedding dimension corresponds to one label; keep the labels in
    # the same reverse-sorted order so indices and label names line up.
    unique_labels = sorted({label.split("\t")[2] for label in file.get_labels(dataset)}, reverse=True)
    vocab = file.get_vocab(dataset)
    max_indices = []
    max_values = []
    all_words = []
    all_labels = []
    results_dict = {}
    for index, emb in enumerate(embeddings_array):
        # Assign each word to the label whose dimension scores highest.
        max_index = emb.argmax()
        max_indices.append(max_index)
        max_values.append(emb[max_index])
        all_words.append(vocab[index])
        all_labels.append(unique_labels[max_index])
        results_dict[index] = {
            "max_index": max_index,
            "max_value": emb[max_index],
            "word": vocab[index],
            "label": unique_labels[max_index],
        }

    assert len(max_values) == len(max_indices) == len(all_words) == len(all_labels)
    results_df = pd.DataFrame.from_dict(results_dict, orient="index")

    # For every label, keep the n words with the strongest scores.
    top_words = {}
    for u_label in unique_labels:
        largest = results_df[results_df["label"] == u_label].nlargest(n, columns="max_value")["word"].tolist()
        top_words[u_label] = largest

    # print(top_words)
    # key = f"{threshold}:{edge_type}"
    if best:
        top_words_dict[dataset][edge_type] = top_words
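To make the argmax labelling and per-label nlargest selection above concrete, here is a tiny self-contained illustration; the words, labels, and scores are invented for demonstration:

import numpy as np
import pandas as pd

# Toy scores: one row per word, one column per label.
emb = np.array([
    [0.1, 0.9],  # "cat"  -> strongest on label index 1
    [0.8, 0.2],  # "tree" -> strongest on label index 0
    [0.3, 0.7],  # "dog"  -> strongest on label index 1
])
vocab = ["cat", "tree", "dog"]
labels = ["plant", "animal"]

rows = {
    i: {"max_value": row.max(), "word": vocab[i], "label": labels[row.argmax()]}
    for i, row in enumerate(emb)
}
df = pd.DataFrame.from_dict(rows, orient="index")
top_words = {
    lab: df[df["label"] == lab].nlargest(2, columns="max_value")["word"].tolist()
    for lab in labels
}
print(top_words)  # {'plant': ['tree'], 'animal': ['cat', 'dog']}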
Example #4
def get_ordered_document_triples_metrics(edge_type, dataset=FLAGS.dataset):
    path = io.get_ordered_document_triples_metrics_path(edge_type, dataset)
    exist(path, "Ordered document triples metrics do not exist yet.")
    return io.read_csv(path, sep=",")
Example #5
def get_eval_logs(dataset=FLAGS.dataset, version=FLAGS.version):
    path = io.get_eval_log_path(dataset, version)
    if not exists(path):
        return None
    return io.read_csv(path, sep=';')
Example #6
def get_filtered_triples(dataset=FLAGS.dataset):
    return io.read_csv(io.get_filtered_word_triples_path(dataset), sep=",")
Example #7
def get_all_relations():
    path = io.get_all_wiki_relations_path()
    exist(path, "Run `analyze_properties.py` to generate this file")
    # The relations file uses "+" as its column separator.
    return io.read_csv(path, sep="+")
Example #8
def get_filtered_relations():
    path = io.get_filtered_wiki_relations_path()
    exist(path, "Run `analyze_properties.py` to generate this file")
    # One relation ID per line: a newline-separated read yields a single "ID" column.
    return io.read_csv(path, sep="\n")["ID"].tolist()
Example #9
def get_doc2idf(dataset=FLAGS.dataset):
    return io.read_csv(io.get_doc2idf_path(dataset), sep=",")
Example #10
def get_entity2id(dataset=FLAGS.dataset):
    return io.read_csv(io.get_entity2id_path(dataset), sep=",")