Example #1
def make_similarity_matrix(mentions,
                           nlp,
                           acronym_dict,
                           n_jobs=-1):
    num_mentions = len(mentions)

    """
        Compute Similarity Matrix
    """

    expanded_mentions = list()
    for mid, mention in enumerate(mentions):
        expanded_list = [mention['orth_with_ws'].copy()]
        distances.expand_mention(expanded_list, acronym_dict)

        expanded_mentions.append(expanded_list)

    """
        Removing Ignore_Words
        Compute and Store Vectors
    """

    expanded_mentions_vectors = list()
    expanded_mentions_vector_averages = list()

    lt = LoopTimer(update_after=200, avg_length=20000, target=len(expanded_mentions))
    for expm in expanded_mentions:
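        # drop ignore words, then join the surviving tokens (which still carry
        # their trailing whitespace) back into one string per expansion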
        list_strings = ["".join(token for token in tokens
                                if token.strip().lower() not in distances.ignore_words)
                        for tokens in expm]
        list_spacy = [nlp(string) for string in list_strings]
        m_vectors = [doc.vector for doc in list_spacy]
        avg_vec = [sum(m_vectors) / len(m_vectors)]
        expanded_mentions_vector_averages.append(avg_vec)
        expanded_mentions_vectors.append(m_vectors)
        lt.update("Calc Vectors")

    """
        Compute Vector-Similarity-Matrix
    """
    print()
    lt = LoopTimer(update_after=5, avg_length=1000, target=num_mentions)
    sim_matrix = list()
    for vid, v in enumerate(expanded_mentions_vectors):
        sub_sims = Parallel(n_jobs=n_jobs)(delayed(distances.occ_vec)(v, u) for u in expanded_mentions_vectors)
        lt.update("Calc Affinity-Matrix")
        sim_matrix.append(sub_sims)

    return np.array(sim_matrix)
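
Every excerpt on this page drives a project-specific LoopTimer progress helper whose implementation is not shown here. The class below is a minimal, hypothetical stand-in based only on what the excerpts use (the update_after, avg_length and target keyword arguments and an update(message) call), so the snippets can be exercised outside the original repository; it is not the project's actual class, and the assumption that update() returns the running iteration count simply mirrors how the examples assign its return value.

import sys
import time


class LoopTimer:
    """Hypothetical stand-in for the project's progress helper."""

    def __init__(self, update_after=1, avg_length=1000, target=None):
        self.update_after = update_after  # print every N update() calls
        self.avg_length = avg_length      # rate-averaging window (unused in this sketch)
        self.target = target              # expected total number of iterations
        self.count = 0
        self.start = time.time()

    def update(self, message=""):
        self.count += 1
        if self.count % self.update_after == 0:
            elapsed = max(time.time() - self.start, 1e-9)
            rate = self.count / elapsed
            eta = ""
            if self.target:
                remaining = max(self.target - self.count, 0)
                eta = f" | ETA {remaining / max(rate, 1e-9):.0f}s"
            sys.stdout.write(f"\r{message} [{self.count}/{self.target or '?'}] "
                             f"{rate:.1f} it/s{eta}")
            sys.stdout.flush()
        return self.count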
Example #2
    mentions, sorted_mentions, replace_dic = load_cluster_dic(
        path=os.path.join(path_to_mlgenome, occ_file_name))

    nlp = spacy.load(os.path.join(paths.to_root, "models", nlp_model))
    vocab = nlp.vocab.from_disk(
        os.path.join(path_to_annotations, "spacy.vocab"))
    infoDF = pd.read_pickle(os.path.join(path_to_annotations,
                                         'info_db.pandas'))

    lemma_s_list = list()
    lemma_d_list = list()
    abstract_id_list = list()

    target = len(infoDF)
    lt = LoopTimer(update_after=10, avg_length=1000, target=target)
    for abstract_id, row in infoDF.iterrows():
        doc = Doc(vocab).from_disk(
            os.path.join(path_to_annotations, f"{abstract_id}.spacy"))

        doc = replace_cluster_in_doc(doc, replace_dic, sorted_mentions, nlp)

        lemma_s_list.append(doc_2_token(doc, split_sentences=True))
        lemma_d_list.append(doc_2_token(doc, split_sentences=False))
        abstract_id_list.append(abstract_id)

        breaker = lt.update(f"Create Pandas - {len(lemma_d_list)}")

    dictionary = Dictionary(lemma_d_list)
    id_d_list = [dictionary.doc2idx(document) for document in lemma_d_list]
    id_s_list = [[dictionary.doc2idx(sentence) for sentence in document]
Example #3
        venues.append(mag)
for region in jourven_list.journals:
    vr = jourven_list.journals[region]
    for mag in vr:
        journals.append(mag)

filerange = [0, 1]
filerange[1] = min(filerange[1], 39)
blastfile = 1 if filerange[1] == 39 else 0
target = min(39, (filerange[1] - filerange[0])) * 1000000 + blastfile * 219709
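# Each raw dump file is assumed to hold 1,000,000 records, except the last
# one (index 39) with 219,709, so target estimates how many JSON lines the
# loop below will touch.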

key_error = 0
mass_error = 0
prune_error = 0

lt = LoopTimer(update_after=500, avg_length=1000000, target=target)
for filename in file_list[filerange[0]:filerange[1]]:
    cur_path = os.path.join(paths.raw_dir, filename)
    with open(cur_path) as file:
        for idx, file_line in enumerate(file):
            update_string = f"Prep  - Count:{count} |  key: {key_error} - different: {mass_error} - One Char: {prune_error}"
            break_p = lt.update(update_string)
            data = json.loads(file_line)
            if not all(key in data for key in req_keys):
                key_error += 1
                continue
            title = data['title']
            abstract = data['paperAbstract']
            abstract_id = data['id']

            year = data['year']
Example #4
    infoDF = pd.read_pickle(os.path.join(path_to_annotations,
                                         'info_db.pandas'))

    db_size = len(infoDF)

    predictions = dict()
    targets = dict()

    target_vector = list()
    feature_vector = list()

    docs_lemma = list()
    docs_pos = list()

    lc = LoopTimer(update_after=100,
                   avg_length=5000,
                   target=min(data_size, db_size))
    for idx, (abstract_id, df_row) in enumerate(infoDF.iterrows()):
        doc = Doc(vocab).from_disk(
            os.path.join(path_to_annotations, f"{abstract_id}.spacy"))

        doc_target_vector, doc_lemma_list, doc_pos_list = doc_to_target_vector(
            doc, rules, trigger_words_=None if allow_tw else trigger_words)

        doc_feature_vector = doc_to_feature_vector(
            doc,
            target_vector_=doc_target_vector,
            trigger_words_=None if allow_tw else trigger_words)

        for tv, fv, lv, pv in zip(doc_target_vector, doc_feature_vector,
                                  doc_lemma_list, doc_pos_list):
Example #5
def optimize(init_cluster, sim_matrix, p, tol=0.00005):
    n_clusters = init_cluster.shape[1]
    n_mentions = init_cluster.shape[0]
    m_range = range(n_mentions)

    cluster = np.copy(init_cluster)

    nnls_t = 0
    build_matrix_t = 0
    get_best_solution_t = 0
    calc_cost_t = 0

    best_cocc = cost_occ(sim_matrix, init_cluster)

    print()
    print(f"Start Optimization with: {best_cocc}")

    init_line = np.ones(n_clusters + 1).reshape(1, n_clusters + 1)
    init_line[0][0] = -1
    ones_vector = np.ones((n_mentions, 1))
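    # init_line appends the row [-1, 1, ..., 1] (with right-hand side 0) to the
    # least-squares system, softly tying the first unknown to the sum of the
    # remaining per-cluster coefficients; ones_vector is reused below when
    # building mid_matrix.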

    lt = LoopTimer(update_after=1, avg_length=20000)

    best_cluster = np.copy(cluster)

    while True:
        for v in m_range:
            """
                Construct Matrix
            """
            bm_t_start = time()

            zj_vec = sim_matrix[v, :].reshape(n_mentions, 1)
            mid_matrix = cluster * (ones_vector + zj_vec)
            a_matrix = np.concatenate((np.concatenate((zj_vec, -mid_matrix), axis=1), init_line), axis=0)

            a_matrix = np.delete(a_matrix, v, axis=0)

            n_sj_vec = cluster.sum(1).reshape(n_mentions, 1) * zj_vec
            b_vec = np.concatenate((-n_sj_vec, np.array([[0]])), axis=0).reshape(n_mentions+1)
            b_vec = np.delete(b_vec, v, axis=0)

            bm_t_end = time()
            build_matrix_t += (bm_t_end - bm_t_start)

            """
                Non Negative Least Squares
            """
            nnls_t_start = time()
            nnls_result = nnls(a_matrix, b_vec)[0][1:]
            nnls_ind = np.argpartition(nnls_result, -p)[-p:]
            nnls_ind_sorted = nnls_ind[np.argsort(nnls_result[nnls_ind])][::-1]
            nnls_t_end = time()
            nnls_t += (nnls_t_end - nnls_t_start)

            """
                Retrieve best feasible solution 
            """
            gbs_t_start = time()
            min_dist = float("inf")
            best_sq = float("inf")
            for q in range(1, p+1):
                Sq_ = nnls_ind_sorted[0:q]
                sq = set(Sq_)
                dist = 0
                for j in m_range:
                    j_clusters = np.where(cluster[j, :] == 1)[0]
                    dist += abs(distances.occ_h(sq, j_clusters) - sim_matrix[v, j])

                if dist < min_dist:
                    min_dist = dist
                    best_sq = np.copy(Sq_)

            for i in range(n_clusters):
                cluster[v, i] = 1 if i in best_sq else 0

            gbs_t_end = time()
            get_best_solution_t += (gbs_t_end - gbs_t_start)

        cc_t_start = time()
        new_cocc = cost_occ(sim_matrix, cluster)
        cc_t_end = time()
        calc_cost_t += (cc_t_end-cc_t_start)

        sum_t = calc_cost_t + build_matrix_t + nnls_t + get_best_solution_t
        bmt = round((build_matrix_t / sum_t) * 100, 2)
        nnlst = round((nnls_t/sum_t)*100, 2)
        gbst = round((get_best_solution_t/sum_t) * 100, 2)
        cct = round((calc_cost_t / sum_t) * 100, 2)
        print()
        lt.update(f"Optimize: {best_cocc} -> {new_cocc} | Build Matrix: {bmt} % |  NNLS: {nnlst} % | GBS: {gbst} % | CCT: {cct} %")
        print()
        if abs(best_cocc - new_cocc) < tol or new_cocc > best_cocc:
            break
        best_cocc = new_cocc
        best_cluster = np.copy(cluster)
    print()
    print(f"End Optimization with: {best_cocc}")
    return best_cluster, best_cocc
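
For reference, the inner step of optimize() above relies on non-negative least squares. The snippet below is a minimal, self-contained sketch of just that call, independent of the clustering code: scipy.optimize.nnls solves min ||Ax - b||_2 subject to x >= 0, and optimize() then keeps only the p largest coefficients via np.argpartition.

import numpy as np
from scipy.optimize import nnls

# Tiny over-determined system: find x >= 0 minimizing ||A x - b||_2.
A = np.array([[1.0, 0.0],
              [1.0, 1.0],
              [0.0, 2.0]])
b = np.array([1.0, 2.0, 1.0])

x, res_norm = nnls(A, b)
print(x)         # non-negative solution, approximately [1.22, 0.56]
print(res_norm)  # Euclidean norm of A @ x - b, approximately 0.33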
Example #6
# RF Model
rf_model = keras.models.load_model(os.path.join(path_to_rfl, rf_model_fn))

aid_list = list()
rf_list = list()

print("Loading Vocab...")
vocab = Vocab().from_disk(os.path.join(path_to_annotations, "spacy.vocab"))
infoDF = pd.read_pickle(os.path.join(path_to_annotations, 'info_db.pandas'))
db_size = len(infoDF)
"""
    Abstract Parser
    Assign a rhetorical function to every sentence of every abstract
"""

lc = LoopTimer(update_after=1, avg_length=200, target=db_size)
for idx, (abstract_id, df_row) in enumerate(infoDF.iterrows()):

    doc = Doc(vocab).from_disk(
        os.path.join(path_to_annotations, f"{abstract_id}.spacy"))
    feature_vector = np.array(doc_to_feature_vector(doc))

    if any(entry is None for entry in feature_vector):
        breaker = lc.update(f"Abstract Parser - BAD FV - {len(aid_list)}")
        continue

    prediction_distr = rf_model.predict(feature_vector)
    prediction = [pred.argmax() for pred in prediction_distr]

    aid_list.append(abstract_id)
    rf_list.append(prediction)
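
    # Note: assuming prediction_distr is the usual 2-D (sentences x classes)
    # Keras output, the list comprehension above is equivalent to the single
    # vectorized call prediction_distr.argmax(axis=1).tolist().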
Example #7
                        "algorithms",
                        "based",
                        "function",
                        "functions",
                        "other",
                        "large",
                        "larger",
                        "twitter",
                        "such"]

collect_ml = set()
sentence_id = 0
train_list = list()
s_sid_list = list()

lt = LoopTimer(update_after=200, avg_length=2000, target=targ)

# Iterating over all abstracts
for abstract_id, row in infoDF.iterrows():
    ori_doc = Doc(vocab).from_disk(os.path.join(path_to_annotations, f"{abstract_id}.spacy"))


    # Training set is built on sentences, so iterate over sentences
    for sent in ori_doc.sents:
        sentence = sent.as_doc()

        # get ML matches
        matches = matcher(sentence)

        ent_list = list()
Example #8
path_to_mlgenome = os.path.join(paths.to_root, "mlgenome", nlp_model)
path_to_annotations = os.path.join(paths.to_root, "annotations_version", nlp_model)
path_to_pandas = os.path.join(paths.to_root, "pandas", nlp_model)

if not os.path.isdir(path_to_mlgenome):
    print(f"Create Directory {path_to_mlgenome}")
    os.mkdir(path_to_mlgenome)

print("Loading Vocab...")
vocab = Vocab().from_disk(os.path.join(path_to_annotations, "spacy.vocab"))
infoDF = pd.read_pickle(os.path.join(path_to_annotations, 'info_db.pandas'))

acronym_dictionary = dict()
entities = set()

lt = LoopTimer(update_after=1, avg_length=1000, target=len(infoDF))
for abstract_id, row in infoDF.iterrows():

    file_path = os.path.join(path_to_annotations, f"{abstract_id}.spacy")
    doc = Doc(vocab).from_disk(file_path)

    for sentence in doc.sents:
        for ent in sentence.ents:
            entities.add(ent.text.lower())
            definition_span = find_definition_candidate(sentence, ent)
            if definition_span is not None:
                acronym_string = ent.text.lower()
                acronym_orth = [token.orth_ for token in ent]

                d_string = definition_span.text.lower()
                d_orth = [token.orth_.lower() for token in definition_span]
Example #9
if not os.path.isdir(path_to_mlgenome):
    print(f"Create Directory {path_to_mlgenome}")
    os.mkdir(path_to_mlgenome)

path_to_annotations = os.path.join(paths.to_root, "annotations_version", nlp_model)

vocab = Vocab().from_disk(os.path.join(path_to_annotations, "spacy.vocab"))
infoDF = pd.read_pickle(os.path.join(path_to_annotations, 'info_db.pandas'))

window_size = 3

mentions = list()
unique_mentions = list()
um_set = dict()

lt = LoopTimer(update_after=100, avg_length=1000, target=len(infoDF))
for abstract_id, row in infoDF.iterrows():
    file_path = os.path.join(path_to_annotations, f"{abstract_id}.spacy")
    doc = Doc(vocab).from_disk(file_path)

    for sentence in doc.sents:
        for ent in sentence.ents:
            m_string = ent.text

            m_orth = [token.orth_.lower() for token in ent]
            m_orth_with_ws = [token.text_with_ws.lower() for token in ent]
            m_pos = [token.pos_ for token in ent]
            m_lemma = [token.lemma_.lower() for token in ent]
            m_lemma_with_ws = [f"{token.lemma_.lower()}{token.whitespace_}" for token in ent]
            m_length = sum([len(token.orth_) for token in ent])
            m_starts_with_cap = ent[0].orth_[0] == ent[0].orth_[0].upper()
Example #10
rflabel_dict = {"id2label": id2label, "label2id": label2id}

learning_features, holdback_features, learning_targets, holdback_targets = train_test_split(
    all_features,
    all_int_targets,
    test_size=0.2,
    random_state=42,
    shuffle=True)

print(f"Learning Feature-Vector-Shape: {learning_features.shape}")
print(f"Holdback Feature-Vector-Shape: {holdback_features.shape}")
result_list = list()

print()
lc = LoopTimer(update_after=1, avg_length=1, target=len(clfs))
print(f"Feature Type: {feature_type} | Allow TW: {allow_tw}")
for name, clf in clfs:
    lc.update(f"{name} starting")
    clf.fit(learning_features, learning_targets)
    prediction = clf.predict(holdback_features)
    gold_prediction = clf.predict(all_gold_features)
    print(f"{name}:")
    print("----------")
    scoring = Scoring(holdback_targets,
                      prediction,
                      target_dic=rflabel_dict['id2label'])
    scoring.print()
    print()
    print("Gold Label Test")
    scoring = Scoring(all_gold_int_targets,
Example #11
    else:
        continue

    segments_per_year = dict()
    segments_per_year['num_abstracts'] = len(aid_of_year)
    segments_per_year['rf'] = dict()

    # segments_per_sentence = dict()
    # segments_per_sentence['num_abstracts'] = len(aid_of_year)
    # segments_per_sentence['rf'] = dict()

    segments_per_abstract = dict()
    segments_per_abstract['num_abstracts'] = len(aid_of_year)
    segments_per_abstract['rf'] = dict()

    lc = LoopTimer(update_after=100, avg_length=200, target=len(aid_of_year))
    for num_abstracts, abstract_id in enumerate(aid_of_year):

        rf_index = aid_list.index(abstract_id)

        rf_pred = rf_list[rf_index]

        doc = Doc(vocab).from_disk(
            os.path.join(path_to_annotations, f"{abstract_id}.spacy"))

        token_list = list()
        for sentence in doc.sents:
            sent_as_doc = sentence.as_doc()
            sent_as_doc = replace_cluster_in_doc(sent_as_doc, replace_dic,
                                                 sorted_mentions, nlp)
            token_list.append(doc_2_token(sent_as_doc, split_sentences=False))
Example #12
                                   nlp_model)
path_to_old_annotations = os.path.join(paths.to_root, "annotations_version",
                                       old_nlp_model)

if not os.path.isdir(path_to_annotations):
    print(f"Create Directory {path_to_annotations}")
    os.mkdir(path_to_annotations)

print("Load Vocab and NLP...")
nlp = spacy.load(nlp_path)
old_vocab = Vocab().from_disk(
    os.path.join(path_to_old_annotations, "spacy.vocab"))
old_infoDF = pd.read_pickle(
    os.path.join(path_to_old_annotations, 'info_db.pandas'))

print("Starting")
lt = LoopTimer(update_after=10, avg_length=1000, target=len(old_infoDF))
for abstract_id, row in old_infoDF.iterrows():
    file_path = os.path.join(path_to_old_annotations, f"{abstract_id}.spacy")
    old_doc = Doc(old_vocab).from_disk(file_path)
    abstract = old_doc.text
    doc = nlp(abstract)

    doc.to_disk(os.path.join(path_to_annotations, f"{abstract_id}.spacy"))
    lt.update("Re-Preprocess")

nlp.vocab.to_disk(os.path.join(path_to_annotations, "spacy.vocab"))
print(f"Vocab Size: {len(nlp.vocab)}")

old_infoDF.to_pickle(os.path.join(path_to_annotations, 'info_db.pandas'))
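
As a variant of the loop above (reusing the names it defines), the abstracts could also be streamed through nlp.pipe so spaCy batches them internally instead of calling nlp() once per text; a sketch under that assumption:

# (text, abstract_id) pairs, read lazily from the old annotations
texts = ((Doc(old_vocab).from_disk(
              os.path.join(path_to_old_annotations, f"{aid}.spacy")).text, aid)
         for aid, _ in old_infoDF.iterrows())

for doc, abstract_id in nlp.pipe(texts, as_tuples=True, batch_size=64):
    doc.to_disk(os.path.join(path_to_annotations, f"{abstract_id}.spacy"))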
Example #13
    topics = list()
    for topic, comp in enumerate(lda_model.components_):
        norm_factor = np.sum(comp)
        tc_prob = comp[topic_cluster_id] / norm_factor
        topics.append((topic, tc_prob))
    topics_probs[topic_cluster_id] = topics

x = list()
for year in years:
    x.append(year)

y_topics = dict()
y_topics_sentences = dict()
y_topics_abstracts = dict()
lc = LoopTimer(update_after=1,
               avg_length=100,
               target=len(topic_cluster_ids) * len(years))
for topic_cluster_id in topic_cluster_ids:

    topic_probs = topics_probs[topic_cluster_id]
    y_topics[topic_cluster_id] = list()
    y_topics_sentences[topic_cluster_id] = list()
    y_topics_abstracts[topic_cluster_id] = list()

    for year in years:
        # num_sentences = sum([lda_per_sentence[year][rf].shape[0] for rf in rf_labels])
        num_abstracts = sum(
            [lda_per_abstract[year][rf].shape[0] for rf in rf_labels])

        y_topics[topic_cluster_id].append(
            sum(  # over RF
Example #14
    for category in [j for j in rules if j in cat_set]:
        string = f"| Learning {category} |"
        lines = "".join(["-" for i in range(len(string))])
        print(lines)
        print(string)
        print(lines)

        learn_rules = rules[category]
        for it in range(0, iterations):
            """
            =============================
                FIND PHRASES BY RULES
            =============================
            """
            patterns = list()
            lt = LoopTimer(update_after=500, avg_length=10000, target=db_size)
            for abstract_id, row in infoDF.iterrows():
                doc = Doc(vocab).from_disk(
                    os.path.join(path_to_annotations, f"{abstract_id}.spacy"))
                patterns.extend(
                    find_phrases_by_rule(doc, learn_rules, phrase_boundaries))
                n = lt.update(f"Find Phrases - {len(patterns)}")

            print()
            """
            =============================
                    BUILD MATCHER
            =============================
            """
            matcher = Matcher(vocab)
            lt = LoopTimer(update_after=10000,