def make_similarity_matrix(mentions, nlp, acronym_dict, n_jobs=-1):
    num_mentions = len(mentions)

    """
    Compute Similarity Matrix
    """
    expanded_mentions = list()
    for mid, mention in enumerate(mentions):
        expanded_list = [mention['orth_with_ws'].copy()]
        distances.expand_mention(expanded_list, acronym_dict)
        expanded_mentions.append(expanded_list)

    """
    Removing Ignore_Words
    Compute and Store Vectors
    """
    expanded_mentions_vectors = list()
    expanded_mentions_vector_averages = list()

    lt = LoopTimer(update_after=200, avg_length=20000, target=len(expanded_mentions))
    for expm in expanded_mentions:
        list_strings = ["".join([token for token in tokens
                                 if token.strip().lower() not in distances.ignore_words])
                        for tokens in expm]
        list_spacy = [nlp(string) for string in list_strings]
        m_vectors = [doc.vector for doc in list_spacy]
        avg_vec = [sum(m_vectors) / len(m_vectors)]

        expanded_mentions_vector_averages.append(avg_vec)
        expanded_mentions_vectors.append(m_vectors)
        lt.update("Calc Vectors")

    """
    Compute Vector-Similarity-Matrix
    """
    print()
    lt = LoopTimer(update_after=5, avg_length=1000, target=num_mentions)
    sim_matrix = list()
    for vid, v in enumerate(expanded_mentions_vectors):
        sub_sims = Parallel(n_jobs=n_jobs)(delayed(distances.occ_vec)(v, u)
                                           for u in expanded_mentions_vectors)
        lt.update("Calc Affinity-Matrix")
        sim_matrix.append(sub_sims)

    return np.array(sim_matrix)
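# Minimal usage sketch for make_similarity_matrix, assuming it is run in the same
# module as the definition above (so distances, LoopTimer, Parallel are available).
# Assumptions not taken from the original code: the concrete mention strings below
# are invented, and an empty acronym_dict is passed when no expansions are known.
# Each mention is a dict with an 'orth_with_ws' list of lowercased tokens that keep
# their trailing whitespace, matching the mention-extraction step further down.
import spacy

nlp_example = spacy.load("en_core_web_md")  # any spaCy model with word vectors

example_mentions = [
    {"orth_with_ws": ["support ", "vector ", "machine"]},
    {"orth_with_ws": ["svm"]},
    {"orth_with_ws": ["neural ", "network"]},
]

example_sim = make_similarity_matrix(example_mentions, nlp_example, acronym_dict={}, n_jobs=1)
print(example_sim.shape)  # -> (3, 3)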
mentions, sorted_mentions, replace_dic = load_cluster_dic(path=os.path.join(path_to_mlgenome, occ_file_name))

nlp = spacy.load(os.path.join(paths.to_root, "models", nlp_model))
vocab = nlp.vocab.from_disk(os.path.join(path_to_annotations, "spacy.vocab"))

infoDF = pd.read_pickle(os.path.join(path_to_annotations, 'info_db.pandas'))

lemma_s_list = list()
lemma_d_list = list()
abstract_id_list = list()

target = len(infoDF)
lt = LoopTimer(update_after=10, avg_length=1000, target=target)
for abstract_id, row in infoDF.iterrows():
    doc = Doc(vocab).from_disk(os.path.join(path_to_annotations, f"{abstract_id}.spacy"))
    doc = replace_cluster_in_doc(doc, replace_dic, sorted_mentions, nlp)

    lemma_s_list.append(doc_2_token(doc, split_sentences=True))
    lemma_d_list.append(doc_2_token(doc, split_sentences=False))
    abstract_id_list.append(abstract_id)

    breaker = lt.update(f"Create Pandas - {len(lemma_d_list)}")

dictionary = Dictionary(lemma_d_list)

id_d_list = [dictionary.doc2idx(document) for document in lemma_d_list]
id_s_list = [[dictionary.doc2idx(sentence) for sentence in document]
             for document in lemma_s_list]
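# Side note (illustrative, not part of the original pipeline): the two comprehensions
# above rely on gensim's Dictionary.doc2idx, which maps tokens to their integer ids
# and returns -1 for tokens that are not in the dictionary. The toy documents here
# are invented for the example.
from gensim.corpora import Dictionary as _Dictionary

_toy_dict = _Dictionary([["neural", "network"], ["network", "topology"]])
print(_toy_dict.doc2idx(["network", "unknown_token"]))  # -> [<id of "network">, -1]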
        venues.append(mag)

for region in jourven_list.journals:
    vr = jourven_list.journals[region]
    for mag in vr:
        journals.append(mag)

filerange = [0, 1]
filerange[1] = min(filerange[1], 39)
blastfile = 1 if filerange[1] == 39 else 0
target = min(39, (filerange[1] - filerange[0])) * 1000000 + blastfile * 219709

key_error = 0
mass_error = 0
prune_error = 0

lt = LoopTimer(update_after=500, avg_length=1000000, target=target)
for filename in file_list[filerange[0]:filerange[1]]:
    cur_path = os.path.join(paths.raw_dir, filename)
    with open(cur_path) as file:
        for idx, file_line in enumerate(file):
            update_string = f"Prep - Count:{count} | key: {key_error} - different: {mass_error} - One Char: {prune_error}"
            break_p = lt.update(update_string)

            data = json.loads(file_line)

            if not all(key in data for key in req_keys):
                key_error += 1
                continue

            title = data['title']
            abstract = data['paperAbstract']
            abstract_id = data['id']
            year = data['year']
infoDF = pd.read_pickle(os.path.join(path_to_annotations, 'info_db.pandas'))
db_size = len(infoDF)

predictions = dict()
targets = dict()

target_vector = list()
feature_vector = list()
docs_lemma = list()
docs_pos = list()

lc = LoopTimer(update_after=100, avg_length=5000, target=min(data_size, db_size))
for idx, (abstract_id, df_row) in enumerate(infoDF.iterrows()):
    doc = Doc(vocab).from_disk(os.path.join(path_to_annotations, f"{abstract_id}.spacy"))

    doc_target_vector, doc_lemma_list, doc_pos_list = doc_to_target_vector(
        doc, rules, trigger_words_=None if allow_tw else trigger_words)
    doc_feature_vector = doc_to_feature_vector(
        doc,
        target_vector_=doc_target_vector,
        trigger_words_=None if allow_tw else trigger_words)

    for tv, fv, lv, pv in zip(doc_target_vector, doc_feature_vector, doc_lemma_list, doc_pos_list):
def optimize(init_cluster, sim_matrix, p, tol=0.00005):
    n_clusters = init_cluster.shape[1]
    n_mentions = init_cluster.shape[0]
    m_range = range(n_mentions)

    cluster = np.copy(init_cluster)

    nnls_t = 0
    build_matrix_t = 0
    get_best_solution_t = 0
    calc_cost_t = 0

    best_cocc = cost_occ(sim_matrix, init_cluster)
    print()
    print(f"Start Optimization with: {best_cocc}")

    init_line = np.ones(n_clusters + 1).reshape(1, n_clusters + 1)
    init_line[0][0] = -1
    ones_vector = np.ones((n_mentions, 1))

    lt = LoopTimer(update_after=1, avg_length=20000)

    # keep a 2-D copy of the best clustering found so far
    best_cluster = np.copy(cluster)

    while True:
        for v in m_range:
            """ Construct Matrix """
            bm_t_start = time()
            zj_vec = sim_matrix[v, :].reshape(n_mentions, 1)
            mid_matrix = cluster * (ones_vector + zj_vec)
            a_matrix = np.concatenate((np.concatenate((zj_vec, -mid_matrix), axis=1), init_line), axis=0)
            a_matrix = np.delete(a_matrix, v, axis=0)

            n_sj_vec = cluster.sum(1).reshape(n_mentions, 1) * zj_vec
            b_vec = np.concatenate((-n_sj_vec, np.array([[0]])), axis=0).reshape(n_mentions + 1)
            b_vec = np.delete(b_vec, v, axis=0)
            bm_t_end = time()
            build_matrix_t += (bm_t_end - bm_t_start)

            """ Non Negative Least Squares """
            nnls_t_start = time()
            nnls_result = nnls(a_matrix, b_vec)[0][1:]
            nnls_ind = np.argpartition(nnls_result, -p)[-p:]
            nnls_ind_sorted = nnls_ind[np.argsort(nnls_result[nnls_ind])][::-1]
            nnls_t_end = time()
            nnls_t += (nnls_t_end - nnls_t_start)

            """ Retrieve best feasible solution """
            gbs_t_start = time()
            min_dist = float("inf")
            best_sq = float("inf")
            for q in range(1, p + 1):
                Sq_ = nnls_ind_sorted[0:q]
                sq = set(Sq_)
                dist = 0
                for j in m_range:
                    j_clusters = np.where(cluster[j, :] == 1)[0]
                    dist += abs(distances.occ_h(sq, j_clusters) - sim_matrix[v, j])
                if dist < min_dist:
                    min_dist = dist
                    best_sq = np.copy(Sq_)

            for i in range(n_clusters):
                cluster[v, i] = 1 if i in best_sq else 0
            gbs_t_end = time()
            get_best_solution_t += (gbs_t_end - gbs_t_start)

        cc_t_start = time()
        new_cocc = cost_occ(sim_matrix, cluster)
        cc_t_end = time()
        calc_cost_t += (cc_t_end - cc_t_start)

        sum_t = calc_cost_t + build_matrix_t + nnls_t + get_best_solution_t
        bmt = round((build_matrix_t / sum_t) * 100, 2)
        nnlst = round((nnls_t / sum_t) * 100, 2)
        gbst = round((get_best_solution_t / sum_t) * 100, 2)
        cct = round((calc_cost_t / sum_t) * 100, 2)

        print()
        lt.update(f"Optimize: {best_cocc} -> {new_cocc} | Build Matrix: {bmt} % | NNLS: {nnlst} % | GBS: {gbst} % | CCT: {cct} %")
        print()

        if abs(best_cocc - new_cocc) < tol or new_cocc > best_cocc:
            break

        best_cocc = new_cocc
        best_cluster = np.copy(cluster)

    print()
    print(f"End Optimization with: {best_cocc}")

    return best_cluster, best_cocc
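# Minimal usage sketch for optimize, assuming it is run in the same module as the
# definition above (so cost_occ, distances, LoopTimer, nnls and time are available).
# The random similarity matrix and hard initial assignment below are invented for
# the example; they are not data from the original pipeline.
import numpy as np

_n_mentions, _n_clusters, _p = 20, 5, 2
_rng = np.random.default_rng(42)

# symmetric similarity matrix in [0, 1] with unit diagonal
_sim = _rng.random((_n_mentions, _n_mentions))
_sim = (_sim + _sim.T) / 2
np.fill_diagonal(_sim, 1.0)

# hard initial assignment: each mention starts in exactly one cluster
_init_cluster = np.zeros((_n_mentions, _n_clusters))
_init_cluster[np.arange(_n_mentions), _rng.integers(0, _n_clusters, _n_mentions)] = 1

_best_cluster, _best_cost = optimize(_init_cluster, _sim, p=_p)
print(_best_cluster.shape, _best_cost)  # -> (20, 5) and the final cost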
# RF Model
rf_model = keras.models.load_model(os.path.join(path_to_rfl, rf_model_fn))

aid_list = list()
rf_list = list()

print("Loading Vocab...")
vocab = Vocab().from_disk(os.path.join(path_to_annotations, "spacy.vocab"))
infoDF = pd.read_pickle(os.path.join(path_to_annotations, 'info_db.pandas'))
db_size = len(infoDF)

"""
Abstract Parser
Assign a rhetorical function to every sentence of every abstract
"""
lc = LoopTimer(update_after=1, avg_length=200, target=db_size)
for idx, (abstract_id, df_row) in enumerate(infoDF.iterrows()):
    doc = Doc(vocab).from_disk(os.path.join(path_to_annotations, f"{abstract_id}.spacy"))

    feature_vector = np.array(doc_to_feature_vector(doc))

    if any(entry is None for entry in feature_vector):
        breaker = lc.update(f"Abstract Parser - BAD FV - {len(aid_list)}")
        continue

    prediction_distr = rf_model.predict(feature_vector)
    prediction = [pred.argmax() for pred in prediction_distr]

    aid_list.append(abstract_id)
    rf_list.append(prediction)
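# Side note (illustrative, not from the original code): for a 2-D array of
# per-sentence class probabilities, the list comprehension above is equivalent
# to a single vectorized argmax along the label axis. The toy array is invented.
import numpy as np

_distr = np.array([[0.1, 0.7, 0.2],
                   [0.6, 0.3, 0.1]])  # (n_sentences, n_labels)
assert [p.argmax() for p in _distr] == list(_distr.argmax(axis=1))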
"algorithms", "based", "function", "functions", "other", "large", "larger", "twitter", "such"] collect_ml = set() sentence_id = 0 train_list = list() s_sid_list = list() lt = LoopTimer(update_after=200, avg_length=2000, target=targ) # Iterating over all abstracts for abstract_id, row in infoDF.iterrows(): ori_doc = Doc(vocab).from_disk(os.path.join(path_to_annotations, f"{abstract_id}.spacy")) # Training set is built on sentences, so iterate over sentences for sent in ori_doc.sents: sentence = sent.as_doc() # get ML matches matches = matcher(sentence) ent_list = list()
path_to_mlgenome = os.path.join(paths.to_root, "mlgenome", nlp_model)
path_to_annotations = os.path.join(paths.to_root, "annotations_version", nlp_model)
path_to_pandas = os.path.join(paths.to_root, "pandas", nlp_model)

if not os.path.isdir(path_to_mlgenome):
    print(f"Create Directory {path_to_mlgenome}")
    os.mkdir(path_to_mlgenome)

print("Loading Vocab...")
vocab = Vocab().from_disk(os.path.join(path_to_annotations, "spacy.vocab"))
infoDF = pd.read_pickle(os.path.join(path_to_annotations, 'info_db.pandas'))

acronym_dictionary = dict()
entities = set()

lt = LoopTimer(update_after=1, avg_length=1000, target=len(infoDF))
for abstract_id, row in infoDF.iterrows():
    file_path = os.path.join(path_to_annotations, f"{abstract_id}.spacy")
    doc = Doc(vocab).from_disk(file_path)

    for sentence in doc.sents:
        for ent in sentence.ents:
            entities.add(ent.text.lower())

            definition_span = find_definition_candidate(sentence, ent)
            if definition_span is not None:
                acronym_string = ent.text.lower()
                acronym_orth = [token.orth_ for token in ent]
                d_string = definition_span.text.lower()
                d_orth = [token.orth_.lower() for token in definition_span]
if not os.path.isdir(path_to_mlgenome):
    print(f"Create Directory {path_to_mlgenome}")
    os.mkdir(path_to_mlgenome)

path_to_annotations = os.path.join(paths.to_root, "annotations_version", nlp_model)

vocab = Vocab().from_disk(os.path.join(path_to_annotations, "spacy.vocab"))
infoDF = pd.read_pickle(os.path.join(path_to_annotations, 'info_db.pandas'))

window_size = 3

mentions = list()
unique_mentions = list()
um_set = dict()

lt = LoopTimer(update_after=100, avg_length=1000, target=len(infoDF))
for abstract_id, row in infoDF.iterrows():
    file_path = os.path.join(path_to_annotations, f"{abstract_id}.spacy")
    doc = Doc(vocab).from_disk(file_path)

    for sentence in doc.sents:
        for ent in sentence.ents:
            m_string = ent.text
            m_orth = [token.orth_.lower() for token in ent]
            m_orth_with_ws = [token.text_with_ws.lower() for token in ent]
            m_pos = [token.pos_ for token in ent]
            m_lemma = [token.lemma_.lower() for token in ent]
            m_lemma_with_ws = [f"{token.lemma_.lower()}{token.whitespace_}" for token in ent]
            m_length = sum([len(token.orth_) for token in ent])
            m_starts_with_cap = ent[0].orth_[0] == ent[0].orth_[0].upper()
rflabel_dict = {"id2label": id2label, "label2id": label2id}

learning_features, holdback_features, learning_targets, holdback_targets = train_test_split(
    all_features, all_int_targets, test_size=0.2, random_state=42, shuffle=True)

print(f"Learning Feature-Vector-Shape: {learning_features.shape}")
print(f"Holdback Feature-Vector-Shape: {holdback_features.shape}")

result_list = list()

print()
lc = LoopTimer(update_after=1, avg_length=1, target=len(clfs))
print(f"Feature Type: {feature_type} | Allow TW: {allow_tw}")
for name, clf in clfs:
    lc.update(f"{name} starting")
    clf.fit(learning_features, learning_targets)
    prediction = clf.predict(holdback_features)
    gold_prediction = clf.predict(all_gold_features)

    print(f"{name}:")
    print("----------")
    scoring = Scoring(holdback_targets, prediction, target_dic=rflabel_dict['id2label'])
    scoring.print()
    print()
    print("Gold Label Test")
    scoring = Scoring(all_gold_int_targets, gold_prediction, target_dic=rflabel_dict['id2label'])
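# Side note (illustrative, not part of the original pipeline): Scoring is a
# project-local class; the same holdback evaluation could be cross-checked with
# scikit-learn's classification_report using the id2label mapping. The tiny
# label set and vectors below are hypothetical.
from sklearn.metrics import classification_report

_id2label = {0: "background", 1: "method"}
_targets = [0, 1, 1, 0]
_preds = [0, 1, 0, 0]
print(classification_report(_targets, _preds,
                            target_names=[_id2label[i] for i in sorted(_id2label)]))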
    else:
        continue

    segments_per_year = dict()
    segments_per_year['num_abstracts'] = len(aid_of_year)
    segments_per_year['rf'] = dict()

    # segments_per_sentence = dict()
    # segments_per_sentence['num_abstracts'] = len(aid_of_year)
    # segments_per_sentence['rf'] = dict()

    segments_per_abstract = dict()
    segments_per_abstract['num_abstracts'] = len(aid_of_year)
    segments_per_abstract['rf'] = dict()

    lc = LoopTimer(update_after=100, avg_length=200, target=len(aid_of_year))
    for num_abstracts, abstract_id in enumerate(aid_of_year):
        rf_index = aid_list.index(abstract_id)
        rf_pred = rf_list[rf_index]

        doc = Doc(vocab).from_disk(os.path.join(path_to_annotations, f"{abstract_id}.spacy"))

        token_list = list()
        for sentence in doc.sents:
            sent_as_doc = sentence.as_doc()
            sent_as_doc = replace_cluster_in_doc(sent_as_doc, replace_dic, sorted_mentions, nlp)
            token_list.append(doc_2_token(sent_as_doc, split_sentences=False))
path_to_annotations = os.path.join(paths.to_root, "annotations_version", nlp_model)
path_to_old_annotations = os.path.join(paths.to_root, "annotations_version", old_nlp_model)

if not os.path.isdir(path_to_annotations):
    print(f"Create Directory {path_to_annotations}")
    os.mkdir(path_to_annotations)

print("Load Vocab and NLP...")
nlp = spacy.load(nlp_path)
old_vocab = Vocab().from_disk(os.path.join(path_to_old_annotations, "spacy.vocab"))
old_infoDF = pd.read_pickle(os.path.join(path_to_old_annotations, 'info_db.pandas'))

print("Starting")
lt = LoopTimer(update_after=10, avg_length=1000, target=len(old_infoDF))
for abstract_id, row in old_infoDF.iterrows():
    file_path = os.path.join(path_to_old_annotations, f"{abstract_id}.spacy")
    old_doc = Doc(old_vocab).from_disk(file_path)

    abstract = old_doc.text
    doc = nlp(abstract)
    doc.to_disk(os.path.join(path_to_annotations, f"{abstract_id}.spacy"))

    lt.update("Re-Preprocess")

nlp.vocab.to_disk(os.path.join(path_to_annotations, "spacy.vocab"))
print(f"Vocab Size: {len(nlp.vocab)}")
old_infoDF.to_pickle(os.path.join(path_to_annotations, 'info_db.pandas'))
    topics = list()
    for topic, comp in enumerate(lda_model.components_):
        norm_factor = np.sum(comp)
        tc_prob = comp[topic_cluster_id] / norm_factor
        topics.append((topic, tc_prob))
    topics_probs[topic_cluster_id] = topics

x = list()
for year in years:
    x.append(year)

y_topics = dict()
y_topics_sentences = dict()
y_topics_abstracts = dict()

lc = LoopTimer(update_after=1, avg_length=100, target=len(topic_cluster_ids) * len(years))
for topic_cluster_id in topic_cluster_ids:
    topic_probs = topics_probs[topic_cluster_id]

    y_topics[topic_cluster_id] = list()
    y_topics_sentences[topic_cluster_id] = list()
    y_topics_abstracts[topic_cluster_id] = list()

    for year in years:
        # num_sentences = sum([lda_per_sentence[year][rf].shape[0] for rf in rf_labels])
        num_abstracts = sum([lda_per_abstract[year][rf].shape[0] for rf in rf_labels])

        y_topics[topic_cluster_id].append(
            sum(  # over RF
for category in [j for j in rules if j in cat_set]:
    string = f"| Learning {category} |"
    lines = "".join(["-" for i in range(len(string))])
    print(lines)
    print(string)
    print(lines)

    learn_rules = rules[category]

    for it in range(0, iterations):
        """
        =============================
        FIND PHRASES BY RULES
        =============================
        """
        patterns = list()
        lt = LoopTimer(update_after=500, avg_length=10000, target=db_size)
        for abstract_id, row in infoDF.iterrows():
            doc = Doc(vocab).from_disk(os.path.join(path_to_annotations, f"{abstract_id}.spacy"))
            patterns.extend(find_phrases_by_rule(doc, learn_rules, phrase_boundaries))
            n = lt.update(f"Find Phrases - {len(patterns)}")
        print()

        """
        =============================
        BUILD MATCHER
        =============================
        """
        matcher = Matcher(vocab)
        lt = LoopTimer(update_after=10000,
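# Illustration only (not the original build step): how collected token patterns are
# typically registered with a spaCy Matcher and applied to a Doc. The pattern below
# is hand-written; the real patterns come from find_phrases_by_rule. Assumes the
# spaCy 3.x Matcher.add signature.
import spacy
from spacy.matcher import Matcher

_nlp = spacy.blank("en")
_matcher = Matcher(_nlp.vocab)
_matcher.add("PHRASE", [[{"LOWER": "support"}, {"LOWER": "vector"}, {"LOWER": "machine"}]])

_doc = _nlp("We train a support vector machine on the abstracts.")
for _match_id, _start, _end in _matcher(_doc):
    print(_doc[_start:_end].text)  # -> "support vector machine"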