def main():
    from utility_functions import get_config, get_cmd_args
    config = get_config()
    args = get_cmd_args()
    path = config['paths'][args.location][args.corpus]['path_out']
    pe = PatternExtractor(path)
    pe.extract()
def __init__(self,
             clusters: Dict[int, cluster_type],
             cluster_centers: Dict[int, List[float]],
             subcorpora: Dict[int, Set[int]],
             level: int) -> None:
    """Initialize a Scorer object.

    Args:
        clusters: A dict mapping each cluster-id to a cluster. Each
            cluster is a set of term-ids.
        cluster_centers: A dict mapping each cluster-id to its
            cluster-center.
        subcorpora: A dict mapping each cluster-id to the relevant
            doc-ids.
        level: Level in the taxonomy. The root is level 0.
    """
    self.config = get_config()
    self.pop_df_version = self.config['pop_df_version']
    self.pop_sum_version = self.config['pop_sum_version']
    self.pop_no_denominator = self.config['pop_no_denominator']
    self.pop_scores = self.config['pop_scores']
    self.con_scores = self.config['con_scores']
    self.l1_normalize = self.config['l1_normalize']
    self.kl_divergence = self.config['kl_divergence']
    self.clusters = clusters
    self.clusters_inv = self.inverse_cluster(clusters)
    self.cluster_centers = cluster_centers
    self.subcorpora = subcorpora
    self.level = level
    if self.clusters.keys() != self.subcorpora.keys():
        print('WARNING! Clusters and subcorpora do not correspond!')
        print('Cluster keys: ', self.clusters.keys())
        print('Subcorpora keys: ', self.subcorpora.keys())
        pdb.set_trace()
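# A minimal sketch of the mapping that `inverse_cluster` is assumed to
# produce (the helper name `inverse_cluster_sketch` is hypothetical; the
# real method lives on the Scorer class): it inverts
# {cluster-id: set of term-ids} into {term-id: cluster-id} so a term's
# cluster can be looked up in O(1).
from typing import Dict, Set

def inverse_cluster_sketch(clusters: Dict[int, Set[int]]) -> Dict[int, int]:
    return {term_id: label
            for label, cluster in clusters.items()
            for term_id in cluster}

# Usage: inverse_cluster_sketch({0: {1, 2}, 1: {3}}) -> {1: 0, 2: 0, 3: 1}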
def get_paths() -> Dict[str, str]:
    """Generate paths for postprocessing."""
    config = get_config()
    args = get_cmd_args()
    pout = config['paths'][args.location][args.corpus]['path_out']
    paths = {
        'out': pout,
        'tax_csv': os.path.join(pout, 'concept_terms/tax_labels_sim.csv'),
        'tax_png': os.path.join(pout, 'concept_terms/taxonomy.png')
    }
    return paths
def main():
    from utility_functions import get_config, get_cmd_args
    config = get_config()
    args = get_cmd_args()
    path = config['paths'][args.location][args.corpus]['path_out']
    idxer = Indexer(path, False)  # True for tg-processing
    # idxer.index_tokens()
    # idxer.index_lemmas()
    # idxer.build_token_contains()
    # idxer.build_lemma_contains()
    idxer.hierarch_rels_to_token_idx()
def main():
    config = get_config()
    args = get_cmd_args()
    path_out = config['paths'][args.location][args.corpus]['path_out']
    path_tax = os.path.join(path_out, 'hierarchy/taxonomy.csv')
    taxonomy = load_taxonomy(path_tax)
    load_term_ids_to_embs_global(config['lemmatized'], config['emb_type'],
                                 path_out)
    for node in taxonomy:
        clus_center = get_clus_center(node, path_out)
        most_center_term = get_most_center_term(node, clus_center, path_out)
        write_to_csv(node, clus_center, most_center_term, path_out)
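# A hedged sketch of what `get_most_center_term` is assumed to compute
# (name and signature are illustrative only): the term whose embedding
# is closest to the node's cluster center by cosine similarity.
from typing import Dict
import numpy as np

def most_center_term_sketch(clus_center: np.ndarray,
                            term_embs: Dict[int, np.ndarray]) -> int:
    def cos_sim(a: np.ndarray, b: np.ndarray) -> float:
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
    return max(term_embs, key=lambda t: cos_sim(term_embs[t], clus_center))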
def load_doc_embeddings(path_out: str) -> Dict[int, np.ndarray]:
    """Load document embeddings computed from term embeddings and tfidf.

    The embedding d_e of each document d is the average of the
    tfidf-weighted embeddings of its terms:
        d_e = avg(tfidf(t_1..t_n) * emb(t_1..t_n))
    where t_1..t_n are the terms in d.

    Args:
        path_out: Path to the output directory.
    Returns:
        doc_embeddings: {doc-id: embedding}
    """
    config = get_config()
    lemmatized = config['lemmatized']
    emb_type = config['embeddings']
    if not lemmatized:
        if emb_type == 'Word2Vec':
            path_doc_embs = os.path.join(
                path_out, 'embeddings/doc_embs_token_Word2Vec.pickle')
        elif emb_type == 'GloVe':
            path_doc_embs = os.path.join(
                path_out, 'embeddings/doc_embs_token_GloVe.pickle')
        elif emb_type == 'ELMo':
            path_doc_embs = os.path.join(
                path_out, 'embeddings/doc_embs_token_ELMo.txt')
        else:
            raise Exception('Error! Embedding type not recognized.')
    else:
        if emb_type == 'Word2Vec':
            path_doc_embs = os.path.join(
                path_out, 'embeddings/doc_embs_lemma_Word2Vec.pickle')
        elif emb_type == 'GloVe':
            path_doc_embs = os.path.join(
                path_out, 'embeddings/doc_embs_lemma_GloVe.pickle')
        else:
            raise Exception('Error! Embedding type not recognized.')

    if path_doc_embs.endswith('.pickle'):
        with open(path_doc_embs, 'rb') as f:
            doc_embeddings = pickle.load(f)
    elif path_doc_embs.endswith('.txt'):
        num_docs = 1889656  # Number of documents in the corpus.
        doc_embeddings = np.empty((num_docs, 1024))
        with open(path_doc_embs, 'r', encoding='utf8') as f:
            for i, line in enumerate(f):
                emb_str = line.split(',')
                emb = np.array([float(x) for x in emb_str])
                doc_embeddings[i] = emb
    else:
        raise Exception('Error! File type for embedding not known.')
    return doc_embeddings
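# A worked toy example of the formula in the docstring above,
# d_e = avg(tfidf(t_i) * emb(t_i)); the numbers are made up, the real
# values come from the frequency and embedding files.
import numpy as np

term_embs = {1: np.array([1.0, 0.0]), 2: np.array([0.0, 1.0])}
tfidf_weights = {1: 0.5, 2: 2.0}
doc_emb = np.mean([tfidf_weights[t] * term_embs[t] for t in term_embs],
                  axis=0)
# doc_emb -> array([0.25, 1.0])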
def main():
    from utility_functions import get_config, get_cmd_args  # , prep_output_dir
    config = get_config()
    args = get_cmd_args()
    path_in = config['paths'][args.location][args.corpus]['path_in']
    path_out = config['paths'][args.location][args.corpus]['path_out']
    path_lang_model = config['paths'][args.location]['path_lang_model']
    # prep_output_dir(path_out)
    max_docs = None
    if args.corpus == 'dblp':
        dp = DBLPLingPreprocessor(path_in, path_out, path_lang_model,
                                  max_docs)
        dp.preprocess_corpus()
    elif args.corpus == 'sp':
        sp = SPLingPreprocessor(path_in, path_out, path_lang_model, max_docs)
        sp.preprocess_corpus()
def main():
    from utility_functions import get_config, get_cmd_args
    config = get_config()
    args = get_cmd_args()
    path = config['paths'][args.location][args.corpus]['path_out']
    fa = FreqAnalyzer(path)
    print('Calculate token term frequencies...')
    fa.calc_tf('t')
    print('Calculate lemma term frequencies...')
    fa.calc_tf('l')
    print('Calculate token document frequencies...')
    fa.calc_df('t')
    print('Calculate lemma document frequencies...')
    fa.calc_df('l')
    print('Calculate token tfidf-values...')
    fa.calc_tfidf('t')
    print('Calculate lemma tfidf-values...')
    fa.calc_tfidf('l')
    print('Calculate document lengths...')
    fa.calc_dl()
    print('Done')
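# For reference, a standard tf-idf formulation that FreqAnalyzer is
# assumed to implement (the exact weighting scheme may differ):
# tfidf(t, d) = tf(t, d) * log(N / df(t)), with N the number of
# documents, tf the within-document term count and df the number of
# documents containing t.
import math

def tfidf_sketch(tf: int, df: int, num_docs: int) -> float:
    return tf * math.log(num_docs / df)

# Usage: tfidf_sketch(tf=3, df=10, num_docs=1000) -> 3 * log(100) ≈ 13.8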
    for t in terms:
        if t not in embedded_terms:
            not_in_et.append(t)
    if len(not_in_et) != 0:
        msg1 = 'Error! Not all terms have embeddings. '
        msg2 = 'Num terms without embeddings: {}. '.format(len(not_in_et))
        if len(not_in_et) < 20:
            msg3 = 'Terms without embeddings: {}'.format(not_in_et)
        else:
            msg3 = ''
        raise Exception(msg1 + msg2 + msg3)


def load_terms(path_terms: str) -> Set[int]:
    terms = set()
    with open(path_terms, 'r', encoding='utf8') as f:
        for line in f:
            terms.add(int(line.strip('\n')))
    return terms


if __name__ == '__main__':
    from utility_functions import get_config, get_cmd_args
    config = get_config()
    args = get_cmd_args()
    path = config['paths'][args.location][args.corpus]['path_out']
    print('Test if all token terms have embeddings...')
    test_all_token_terms_have_embeddings(path)
    print('Test if all lemma terms have embeddings...')
    test_all_lemma_terms_have_embeddings(path)
def generate_taxonomy() -> None:
    """Generate a taxonomy for a preprocessed corpus.

    1. Set paths.
    2. Load data.
    3. Start recursive taxonomy generation.
    """
    # Define globals.
    global idx_to_term
    global path_embeddings_global
    global path_term_distr
    global max_depth

    # Load cmd args and configs.
    print('Load and parse cmd args...')
    config = get_config()
    args = get_cmd_args()
    lemmatized = config['lemmatized']
    emb_type = config['embeddings']
    threshold = config['threshold']
    max_depth = config['max_depth']

    # Set paths.
    print('Set paths...')
    path_out = config['paths'][args.location][args.corpus]['path_out']
    if lemmatized:
        path_term_ids = os.path.join(
            path_out, 'processed_corpus/lemma_terms_idxs.txt')
        path_idx_to_term = os.path.join(path_out, 'indexing/idx_to_lemma.json')
        path_df = os.path.join(path_out, 'frequencies/df_lemmas.json')
        # path_tf = os.path.join(path_out, 'frequencies/tf_lemmas.json')
        # path_tfidf = os.path.join(path_out, 'frequencies/tfidf_lemmas.json')
        path_term_distr = os.path.join(
            path_out, 'frequencies/term_distr_lemmas.json')
        path_base_corpus = os.path.join(
            path_out, 'processed_corpus/pp_lemma_corpus.txt')
        path_base_corpus_ids = os.path.join(
            path_out, 'processed_corpus/lemma_idx_corpus.txt')
        if emb_type == 'GloVe' or emb_type == 'Word2Vec':
            path_embeddings_global = os.path.join(
                path_out,
                'embeddings/embs_lemma_global_{}.vec'.format(emb_type))
        else:
            path_embeddings_global = os.path.join(
                path_out,
                'embeddings/embs_lemma_global_{}.pickle'.format(emb_type))
    else:
        path_term_ids = os.path.join(
            path_out, 'processed_corpus/token_terms_idxs.txt')
        path_idx_to_term = os.path.join(path_out, 'indexing/idx_to_token.json')
        path_df = os.path.join(path_out, 'frequencies/df_tokens.json')
        # path_tf = os.path.join(path_out, 'frequencies/tf_tokens.json')
        # path_tfidf = os.path.join(path_out, 'frequencies/tfidf_tokens.json')
        path_term_distr = os.path.join(
            path_out, 'frequencies/term_distr_tokens.json')
        path_base_corpus = os.path.join(
            path_out, 'processed_corpus/pp_token_corpus.txt')
        path_base_corpus_ids = os.path.join(
            path_out, 'processed_corpus/token_idx_corpus.txt')
        if emb_type == 'GloVe' or emb_type == 'Word2Vec':
            path_embeddings_global = os.path.join(
                path_out,
                'embeddings/embs_token_global_{}.vec'.format(emb_type))
        else:
            path_embeddings_global = os.path.join(
                path_out,
                'embeddings/embs_token_{}_avg.pickle'.format(emb_type))
    # path_dl = os.path.join(path_out, 'frequencies/dl.json')
    path_taxonomy = os.path.join(path_out, 'hierarchy/taxonomy.csv')
    tax_file = open(path_taxonomy, 'w', encoding='utf8', newline='')
    csv_writer = csv.writer(tax_file, delimiter=',')

    # Define starting variables.
    print('Load term-ids...')
    term_ids = load_term_ids(path_term_ids)
    print('Load idx-term mappings...')
    with open(path_idx_to_term, 'r', encoding='utf8') as f:
        idx_to_term_str = json.load(f)
        idx_to_term = {int(k): v for k, v in idx_to_term_str.items()}
    print('Load global embeddings...')
    term_ids_to_embs_global = Embeddings.load_term_embeddings(
        term_ids, path_embeddings_global, idx_to_term)
    print('Load base corpus...')
    base_corpus = get_base_corpus(path_base_corpus)
    print('Load df-base...')
    with open(path_df, 'r', encoding='utf8') as f:
        # {word_id: [doc_id1, ...]}
        df_base_str = json.load(f)
    df_base = {int(k): [int(i) for i in v] for k, v in df_base_str.items()}
    del df_base_str
    print('Load term distr file...')
    global term_distr_base
    with open(path_term_distr, 'rb') as f:
        term_distr_base = pickle.load(f)

    # Start recursive taxonomy generation.
    rec_find_children(term_ids_local=term_ids,
                      term_ids_global=term_ids,
                      base_corpus=base_corpus,
                      path_base_corpus_ids=path_base_corpus_ids,
                      cur_node_id=0,
                      level=0,
                      df_base=df_base,
                      df=df_base,
                      # cur_repr_terms=[],
                      path_out=path_out,
                      cur_corpus=base_corpus,
                      csv_writer=csv_writer,
                      threshold=threshold,
                      term_ids_to_embs_global=term_ids_to_embs_global,
                      emb_type=emb_type,
                      max_iter=config['max_iter'])
    tax_file.close()
    print('Done.')
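# A high-level, illustrative sketch of the recursion that
# rec_find_children is assumed to perform; every helper here is a
# stand-in, not the real implementation. Per node: cluster the local
# terms, derive per-cluster subsets, and recurse until max_depth.
from typing import Set

def rec_sketch(term_ids: Set[int], level: int, max_depth: int) -> None:
    if level >= max_depth or not term_ids:
        return
    # Stand-in for embedding-based clustering into (up to) 5 clusters.
    term_list = sorted(term_ids)
    clusters = {i: set(term_list[i::5]) for i in range(5)}
    for label, cluster_terms in clusters.items():
        # The real pipeline scores terms, pushes them up or down the
        # hierarchy, builds the cluster's subcorpus and writes a row
        # to taxonomy.csv at this point.
        rec_sketch(cluster_terms, level + 1, max_depth)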
def main():
    args = get_cmd_args()
    location = args.location
    corpus = args.corpus
    config = get_config()
    path_out = config['paths'][location][corpus]['path_out']
    emb_type = config['embeddings']

    if not args.skip_prep:
        prep_output_dir(path_out)

    # Copy TG files into the directory system.
    papers_to_pp_token_corpus(config, location, corpus)
    copy_keywords_to_terms(config, location, corpus)

    # Index corpus.
    if not args.skip_idxer:
        # print('Start indexing...')
        idxer = Indexer(path_out)
        # idxer.index_tokens()
        # print('Finished indexing.')
        print('Start building subtoken index...')
        idxer.build_token_contains()
        print('Finished building subtoken index.')

    # Frequency analysis.
    if not args.skip_freq_an:
        print('Start frequency analysis for tf, df and dl...')
        fa = FreqAnalyzer(path_out)
        print('Calculate token term frequencies...')
        fa.calc_tf('t')
        print('Calculate token document frequencies...')
        fa.calc_df('t')
        print('Calculate tfidf for tokens...')
        fa.calc_tfidf('t')
        print('Calculate document lengths...')
        fa.calc_dl()
        print('Finished frequency analysis.')

    # Train term embeddings.
    if not args.skip_embeddings:
        emb_types = ['Word2Vec', 'GloVe', 'ELMo']
        for etype in emb_types:
            Embedding = get_emb(etype)
            print('Train {} token embeddings...'.format(etype))
            path_input = os.path.join(
                path_out, 'processed_corpus/token_idx_corpus.txt')
            embs_fname = Embedding.train(
                path_input, 'embs_token_global_' + etype, path_out)
            print('{} embeddings written to: {}'.format(etype, embs_fname))

    # Compute document embeddings.
    if not args.skip_doc_embs:
        print('Calculating document embeddings...')
        doc_embedder = DocEmbedder(path_out, emb_type)
        doc_embedder.embed_token_docs()
        print('Finished document embeddings.')

    # Build the term-distribution pickle.
    if not args.skip_word_distr:
        print('Create term distributions pickle file...')
        path_tf = os.path.join(path_out, 'frequencies/tf_tokens.json')
        path_tfidf = os.path.join(path_out, 'frequencies/tfidf_tokens.json')
        path_dl = os.path.join(path_out, 'frequencies/dl.json')
        path_term_distr = os.path.join(
            path_out, 'frequencies/term_distr_tokens.json')

        # Load frequencies.
        with open(path_tf, 'r', encoding='utf8') as f_tf:
            tf_base = json.load(f_tf)
        with open(path_tfidf, 'r', encoding='utf8') as f_tfidf:
            tfidf_base = json.load(f_tfidf)
        with open(path_dl, 'r', encoding='utf8') as f_dl:
            dl_base = json.load(f_dl)

        # Create term_distr.
        term_distr_base = defaultdict(dict)  # from collections import defaultdict
        for doc_id in tfidf_base:
            for word_id in tf_base[doc_id]:
                tf = tf_base[doc_id][word_id]
                tfidf = tfidf_base[doc_id][word_id]
                term_distr_base[int(doc_id)][int(word_id)] = (tf, tfidf)
            term_distr_base[int(doc_id)][-1] = dl_base[doc_id]

        # Dump term_distr.
        with open(path_term_distr, 'wb') as f:
            pickle.dump(term_distr_base, f)
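# For illustration, the assumed shape of the dumped structure (the
# values below are made up): per document, word-id -> (tf, tfidf),
# with the document length stored under the sentinel key -1.
#
#   term_distr_base[3][17] -> (2, 0.84)  # word 17 occurs twice in doc 3
#   term_distr_base[3][-1] -> 241        # doc 3 has 241 tokens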
def main():
    global idx_to_term
    from utility_functions import get_config, get_cmd_args
    config = get_config()
    args = get_cmd_args()
    path_out = config['paths'][args.location][args.corpus]['path_out']
    path_idx_to_term = os.path.join(path_out, 'indexing/idx_to_token.json')
    print('Load idx-term mappings...')
    with open(path_idx_to_term, 'r', encoding='utf8') as f:
        idx_to_term_str = json.load(f)
        idx_to_term = {int(k): v for k, v in idx_to_term_str.items()}
    taxonomy = load_taxonomy(path_out)
    global ls
    ls = LabelScorer(config, args)

    # Each run writes one label file; the flags select which scores
    # contribute to the labeling metric.
    runs = [
        ('repr score', 'tax_labels_repr.csv',
         dict(cos=False, label_score=False, hypo_score=False,
              incl_score=False)),
        ('sim score', 'tax_labels_sim.csv',
         dict(cos=True, label_score=False, hypo_score=False,
              incl_score=False)),
        ('label score', 'tax_labels_ls.csv',
         dict(cos=True, label_score=True, hypo_score=True,
              incl_score=True)),
        ('label score without cos score', 'tax_labels_ls_no_cos.csv',
         dict(cos=False, label_score=True, hypo_score=True,
              incl_score=True)),
        ('label score without hypo score', 'tax_labels_ls_no_hypo.csv',
         dict(cos=True, label_score=True, hypo_score=False,
              incl_score=True)),
        ('label score without incl score', 'tax_labels_ls_no_incl.csv',
         dict(cos=True, label_score=True, hypo_score=True,
              incl_score=False)),
        ('label score, only hypo score', 'tax_labels_ls_only_hypo.csv',
         dict(cos=False, label_score=True, hypo_score=True,
              incl_score=False)),
        ('label score, only incl score', 'tax_labels_ls_only_incl.csv',
         dict(cos=False, label_score=True, hypo_score=False,
              incl_score=True)),
    ]
    for metric, fname, flags in runs:
        print('Run labeling with {} as metric...'.format(metric))
        path_labels = os.path.join(path_out, 'concept_terms', fname)
        with open(path_labels, 'w', encoding='utf8', newline='') as f:
            csv_writer = csv.writer(f, delimiter=',')
            rec_find_labels(path_out, taxonomy, 10, [], 0, csv_writer,
                            **flags)
    print('Done')