def text_rank_pke(cls, corpus: Corpus):
    # define the set of valid Part-of-Speeches
    pos = {'NOUN', 'PROPN', 'ADJ'}

    # 1. create a TextRank extractor.
    extractor = pke.unsupervised.TextRank()
    if corpus.language == Language.DE:
        lan = "de"
    else:
        lan = "en"

    # 2. load the content of the document.
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating TextRank"):
        extractor.load_document(input=document.text,
                                language=lan,
                                normalization="lemmatization")

        # 3. build the graph representation of the document and rank the words.
        #    Keyphrase candidates are composed from the 33-percent
        #    highest-ranked words.
        extractor.candidate_weighting(window=2, pos=pos, top_percent=0.33)

        # 4. get the 10-highest scored candidates as keyphrases
        # keyphrases = extractor.get_n_best(n=top_k)
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)

    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.TEXT_RANK_PKE)
def main():
    # load trained model
    print('Loading neural machine translator with attention:')
    train_set = Corpus(TRAIN_SRC, TRAIN_TGT)
    dev_set = Corpus(DEV_SRC, DEV_TGT)
    m = load_model(train_set, dev_set, MODEL)
    print('Model loaded!')

    # dev_set.target_sentences = dev_set.target_sentences[100:200]
    # dev_set.source_sentences = dev_set.source_sentences[100:200]

    # translate sentences
    print('\nTranslating . . .\n')
    sample_output = np.random.choice(len(dev_set.target_sentences), 5, False)
    greedy, beam = gen_all(m, dev_set.source_sentences, BSIZE)
    for sample in sample_output:
        print('Target: {}'.format(' '.join(dev_set.target_sentences[sample])))
        print('Greedy: {}'.format(' '.join(greedy[sample])))
        print('Beam search: {}'.format(' '.join(beam[sample])))
        print('----------')

    greedy_score = get_bleu_score(greedy, dev_set.target_sentences)
    beam_score = get_bleu_score(beam, dev_set.target_sentences)
    print('Greedy bleu score: ', greedy_score)
    print('Beam search bleu score: ', beam_score)
def tfidf_skl(cls, corpus: Corpus):
    if corpus.language == Language.EN:
        stop_words = stopwords.words("english")
    elif corpus.language == Language.DE:
        stop_words = stopwords.words("german")
    else:
        raise UserWarning("No stopwords for language!")

    tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words,
                                       ngram_range=(cls.min_nrgam, cls.max_ngram),
                                       min_df=2)
    tfidf_matrix = tfidf_vectorizer.fit_transform(
        [document.text for document in corpus.get_documents(as_list=True)])
    doc_id_lookup = {
        i: document.doc_id
        for i, document in enumerate(corpus.get_documents(as_list=True))
    }
    features = tfidf_vectorizer.get_feature_names()

    keywords = {}
    for i, doc in tqdm(enumerate(tfidf_matrix),
                       desc="Calculating tf-idf",
                       total=tfidf_matrix.shape[0]):
        df = pd.DataFrame(doc.T.todense(), index=features, columns=["tfidf"])
        top_key_words = df.sort_values(by=["tfidf"], ascending=False)[:cls.top_k]
        keywords[doc_id_lookup[i]] = list(top_key_words.index)

    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.TFIDF_SKL)
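# For reference, the per-document top-k ranking done by tfidf_skl can be
# reproduced with plain scikit-learn and numpy, outside the Corpus/KeywordType
# machinery. This is only a minimal sketch: the toy documents and the top_k
# value are made up for illustration; use get_feature_names() instead of
# get_feature_names_out() on older scikit-learn versions (as above).
def tfidf_top_k_sketch():
    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer

    docs = ["the cat sat on the mat", "the dog chased the cat"]  # toy documents
    top_k = 3

    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(docs)              # sparse (n_docs, n_terms)
    terms = np.array(vectorizer.get_feature_names_out())

    for row in matrix:
        weights = row.toarray().ravel()
        # indices of the top_k largest tf-idf weights, highest first
        top = np.argsort(weights)[::-1][:top_k]
        print(list(zip(terms[top], weights[top])))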
def yearwise_documents(corpus: Corpus,
                       aggregation_func: Callable = len,
                       printing: bool = False,
                       as_dict: bool = False):
    # bin the documents by their date (year)
    year_bins = defaultdict(list)
    for doc in corpus.get_documents():
        year_bins[doc.date].append(doc)

    # aggregate each year's documents into a value (document count by default)
    result = {
        year: aggregation_func(
            Corpus(source=docs,
                   language=corpus.language,
                   name=f'{corpus.name}_yearwise'))
        for year, docs in year_bins.items() if year is not None
    }
    result = OrderedDict(sorted(result.items()))

    if as_dict:
        return result

    years = []
    counts = []
    for year, count in result.items():
        years.append(year)
        counts.append(count)
        if printing:
            print(f'{year}: {count}')
    # print(years)
    # print(counts)
    return years, counts
def parse_and_preprocess_src(data_source, corpus_destination, preprocess=True):
    if re.search("bundestag", data_source.lower()):
        name = "bundestag"
        raw_corpus = DataHandler.get_bundestag_speeches(directory=data_source)
    elif re.search("sustainability", data_source.lower()):
        name = "sustainability"
        raw_corpus = DataHandler.get_sustainability_data(path=data_source)
    elif re.search("unv1.0-tei", data_source.lower()):
        name = "united_nations"
        raw_corpus = DataHandler.get_un_texts(directory=data_source)
    elif re.search("state_of_the_union", data_source.lower()):
        name = "state_of_the_union"
        raw_corpus = DataHandler.get_state_of_the_union(directory=data_source)
    else:
        name = "abstracts"
        raw_corpus = DataHandler.get_abstracts(path=data_source)

    language = raw_corpus[0].language
    print('loaded', len(raw_corpus), 'documents')

    if preprocess:
        Preprocessor.preprocess(raw_corpus, language=language)
        print('preprocessed', len(raw_corpus), 'documents')

    corpus = Corpus(source=raw_corpus, language=language, name=name)
    print('parsed', len(corpus.get_documents(as_list=True)),
          'documents to a Corpus')

    corpus.save_corpus(corpus_destination)
def single_rank_pke(cls, corpus: Corpus):
    # define the set of valid Part-of-Speeches
    pos = {'NOUN', 'PROPN', 'ADJ'}

    # 1. create a SingleRank extractor.
    extractor = pke.unsupervised.SingleRank()
    if corpus.language == Language.DE:
        lan = "de"
    else:
        lan = "en"

    # 2. load the content of the document.
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating SingleRank"):
        extractor.load_document(input=document.text,
                                language=lan,
                                normalization="lemmatization")

        # 3. select the longest sequences of nouns and adjectives as candidates.
        extractor.candidate_selection(pos=pos)

        # 4. weight the candidates using the sum of their word's scores that are
        #    computed using random walk. In the graph, nodes are words of
        #    certain part-of-speech (nouns and adjectives) that are connected if
        #    they occur in a window of 10 words.
        extractor.candidate_weighting(window=10, pos=pos)

        # 5. get the 10-highest scored candidates as keyphrases
        # keyphrases = extractor.get_n_best(n=top_k)
        # corpus.assign_keywords(keywords={document.doc_id: keyphrases},
        #                        keyword_type=KeywordType.SINGLE_RANK_PKE)
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)

    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.SINGLE_RANK_PKE)
def main(train_src_file, train_tgt_file, dev_src_file, dev_tgt_file,
         model_file, num_epochs, embeddings_init=None, seed=0):
    print('reading train corpus ...')
    train_set = Corpus(train_src_file, train_tgt_file)
    # assert()
    print('reading dev corpus ...')
    dev_set = Corpus(dev_src_file, dev_tgt_file)
    # test_set = Corpus(test_src_file)

    print('Initializing neural machine translator with attention:')
    # src_vocab_size, tgt_vocab_size, tgt_idx2word, word_d, gru_d, gru_layers
    encoder_decoder = nmt_dynet_attention(
        len(train_set.source_word2idx), len(train_set.target_word2idx),
        train_set.source_word2idx, train_set.source_idx2word,
        train_set.target_word2idx, train_set.target_idx2word, 50, 50, 2)

    trainer = SimpleSGDTrainer(encoder_decoder.model)

    sample_output = np.random.choice(len(dev_set.target_sentences), 5, False)
    losses = []
    best_bleu_score = 0
    for epoch in range(num_epochs):
        print('Starting epoch', epoch)
        # shuffle the training data
        combined = list(zip(train_set.source_sentences,
                            train_set.target_sentences))
        random.shuffle(combined)
        train_set.source_sentences[:], train_set.target_sentences[:] = zip(*combined)

        print('Training . . .')
        sentences_processed = 0
        for src_sentence, tgt_sentence in zip(train_set.source_sentences,
                                              train_set.target_sentences):
            loss = encoder_decoder.get_loss(src_sentence, tgt_sentence)
            loss_value = loss.value()
            loss.backward()
            trainer.update()
            sentences_processed += 1
            if sentences_processed % 4000 == 0:
                print('sentences processed: ', sentences_processed)

        # Accumulate average losses over training to plot
        val_loss = get_val_set_loss(encoder_decoder, dev_set)
        print('Validation loss this epoch', val_loss)
        losses.append(val_loss)

        print('Translating . . .')
        translated_sentences = encoder_decoder.translate_all(dev_set.source_sentences)

        print('translating {} source sentences...'.format(len(sample_output)))
        for sample in sample_output:
            print('Target: {}\nTranslation: {}\n'.format(
                ' '.join(dev_set.target_sentences[sample]),
                ' '.join(translated_sentences[sample])))

        bleu_score = get_bleu_score(translated_sentences,
                                    dev_set.target_sentences)
        print('bleu score: ', bleu_score)
        if bleu_score > best_bleu_score:
            best_bleu_score = bleu_score
            # save the model
            encoder_decoder.save(model_file)

    print('best bleu score: ', best_bleu_score)
def cleaning_authors(config, overwrite=False):
    corpus_names = [
        "bundestag_corpus",
        # "sustainability_corpus",
        # "abstract_corpus"
    ]
    languages = [Language.DE, Language.EN, Language.EN]
    wlc = 0  # documents skipped because they are not in English
    m_a = 0  # documents with multiple authors
    s_a = 0  # documents with a single author
    for i, corpus_name in enumerate(corpus_names):
        corpus = Corpus(source=config["corpora"][corpus_name],
                        language=languages[i],
                        name=corpus_name)
        # corpus = DataHandler.load_corpus(config["corpora"][corpus_name])
        for d in corpus.get_documents():
            if d.author:
                if isinstance(d.author, float) and np.isnan(d.author):
                    d.author = None
                else:
                    if corpus_name == "bundestag_corpus":
                        authors = [d.author]
                    elif corpus_name == "sustainability_corpus":
                        if isinstance(d.author, str):
                            # split the comma-separated author string and
                            # re-pair adjacent fields
                            authors = [a.strip() for a in d.author.split(',')]
                            authors = [
                                f'{j}. {i}'
                                for i, j in zip(authors[::2], authors[1::2])
                            ]
                        else:
                            authors = d.author
                    else:
                        if d.language != "English":
                            wlc += 1
                            continue
                        if isinstance(d.author, str):
                            authors = [a.strip() for a in d.author.split(',')]
                            authors = [
                                f'{j}. {i}'
                                for i, j in zip(authors[::2], authors[1::2])
                            ]
                        else:
                            authors = d.author
                    if len(authors) > 1:
                        m_a += 1
                        print(d.author, authors)
                    else:
                        s_a += 1
                    d.author = authors

        if not overwrite:
            os.rename(src=config["corpora"][corpus_name],
                      dst=create_new_filepath_uncleaned(
                          config["corpora"][corpus_name]))
        corpus.save_corpus(config["corpora"][corpus_name])
    print(wlc, m_a, s_a)
def count_non_years(corpus: Corpus):
    without_year = [d for d in corpus.get_documents() if d.date is None]
    print(
        len([
            d.date for d in corpus.get_documents()
            if d.date and len(str(d.date)) != 4
        ]))
    with_year = [d for d in corpus.get_documents() if d.date]
    print(f'{len(without_year)} / {len(with_year)}')
def build_word2vec():
    print("Tokens: ", Corpus(CORPUS_FILE).count_tokens())
    w2v = Word2Vec(Corpus(CORPUS_FILE),
                   size=100,
                   window=5,
                   min_count=1,
                   workers=4)
    w2v.save(MODEL_FILE)
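# Quick usage sketch for the model saved by build_word2vec. The query words
# ("climate", "energy") are arbitrary examples and must exist in the corpus
# vocabulary. Note that size=100 above is the gensim 3.x parameter name;
# gensim 4+ calls it vector_size, while the query API below works in both.
def query_word2vec_sketch():
    from gensim.models import Word2Vec

    model = Word2Vec.load(MODEL_FILE)
    # nearest neighbours by cosine similarity in the embedding space
    print(model.wv.most_similar("climate", topn=5))
    print(model.wv.similarity("climate", "energy"))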
def main(args):
    # Set the random seed manually for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
    else:
        print("Note that our pre-trained models require CUDA to evaluate.")

    ###########################################################################
    # Load the models
    ###########################################################################
    ae_args, gan_args, idx2word, autoencoder, gan_gen, gan_disc \
        = load_models(args.ae_args, args.gan_args, args.vocab_file,
                      args.ae_model, args.g_model, args.d_model)

    ###########################################################################
    # Generation code
    ###########################################################################
    # Generate sentences
    corpus = Corpus(args.data_path, args.dict_file, vocab_size=len(idx2word))
    source, _ = next(BatchGen(corpus.get_chunks(size=2), args.ngenerations))
    prev_sent = [
        decode_idx(corpus.dictionary, sent) for sent in source.tolist()
    ]
    source = Variable(source, volatile=True)
    sentences = generate(autoencoder,
                         gan_gen,
                         inp=source,
                         vocab=idx2word,
                         sample=args.sample,
                         maxlen=args.maxlen)

    if not args.noprint:
        print("\nSentence generations:\n")
        for prev, sent in zip(prev_sent, sentences):
            print(prev)
            print(" ", sent)
            print("")

    with open(args.outf, "w") as f:
        f.write("Sentence generations:\n\n")
        for prev, sent in zip(prev_sent, sentences):
            f.write(prev + '\n')
            f.write("-> " + sent + '\n\n')
def main():
    # prepare corpus
    corpus = Corpus(args.data_file, args.dict_file, vocab_size=args.vocab_size)

    # dumping vocabulary
    with open(os.path.join(out_dir, 'vocab.json'), 'w') as f:
        json.dump(corpus.dictionary.word2idx, f)

    # save arguments
    ntokens = len(corpus.dictionary.word2idx)
    args.ntokens = ntokens
    with open(os.path.join(out_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    log.info('[Data Loaded.]')

    autoencoder = AutoEncoder()

    if args.split:
        train, valid = corpus.get_data(split=args.split)
        valid = batchify(valid, args.batch_size, shuffle=False)
    else:
        train = corpus.get_data()

    for epoch in range(1, args.epochs + 1):
        # shuffle train data in each epoch
        batches = batchify(train, args.batch_size, shuffle=True)

        global_iters = 0
        start_time = datetime.now()
        for i, batch in enumerate(batches):
            loss = autoencoder.update(batch)
            if i % args.log_interval == 0 and i > 0:
                log.info(('[Epoch {} {}/{} Loss {:.5f} ETA {}]').format(
                    epoch, i, len(batches), loss,
                    str((datetime.now() - start_time) / (i + 1) *
                        (len(batches) - i - 1)).split('.')[0]))
            global_iters += 1
            if global_iters % 100 == 0:
                autoencoder.anneal()

        if args.split:
            word_acc, sent_acc = autoencoder.evaluate(valid)
            msg = 'Epoch {} word acc: {} | sent acc: {}'.format(
                epoch, word_acc, sent_acc)
            log.warn(msg)

        autoencoder.save(out_dir, 'autoencoder_model_{}.pt'.format(epoch))
def eval_acc(net, device, b_size=10):
    corpus = Corpus('mr', test=True)
    sum_all = 0    # number of correctly classified examples
    sum_count = 0  # total number of examples seen
    A_E = 1        # number of evaluation passes
    for e in range(A_E):
        i = 0
        b = 0
        A_B = int(len(corpus.xs) // b_size)  # number of batches
        while i >= 0:
            x, y = batchify(corpus, i, b_size)
            if x is None:
                i = 1
                break
            x, y = x.to(device), y.to(device)
            out = net(x)
            # for binary labels, y XOR argmax(out) is 0 on a match, so
            # (1 - xor) sums to the number of correct predictions
            sum_all += (1 - (y ^ torch.argmax(out, dim=-1))).sum().to(
                torch.device('cpu'))
            sum_count += y.shape[0]
            i += b_size
            b += 1
            # if (b % 50 == 0):
            #     print('E: {}/{} | B: {}/{}'.format(e, A_E, b, A_B))
    return float(sum_all) / float(sum_count)
def cleaning_punctuation(config, overwrite=False):
    corpus_names = [
        "bundestag_corpus", "sustainability_corpus", "abstract_corpus"
    ]
    languages = [Language.DE, Language.EN, Language.EN]
    for i, corpus_name in enumerate(corpus_names):
        corpus = Corpus(source=config["corpora"][corpus_name],
                        language=languages[i],
                        name=corpus_name)
        remove_punctuation(corpus)
        if not overwrite:
            os.rename(src=config["corpora"][corpus_name],
                      dst=create_new_filepath_uncleaned(
                          config["corpora"][corpus_name]))
        corpus.save_corpus(config["corpora"][corpus_name])
def evaluate(
        model_path,
        corpus_path,
        pairs_path,
        batch_size=100,
):
    model = torch.load(model_path)
    model = model.cuda()
    model.eval()

    corpus = Corpus([tuple([corpus_path, os.path.dirname(corpus_path)])])
    pairs_batch_loader = FileLoader(
        [tuple([pairs_path, os.path.dirname(pairs_path)])], batch_size)

    code = []
    nl = []
    for data in tqdm.tqdm(pairs_batch_loader):
        # map() returns an iterator in Python 3, so materialize it before indexing
        data = list(map(corpus.get, data))
        batch = (make_batch(model.embedding_layer, data[0][0]),
                 make_batch(model.embedding_layer, data[1][0]))
        batch = [x.cuda() for x in batch]
        batch = (Variable(batch[0], volatile=True),
                 Variable(batch[1], volatile=True))
        # embed code and NL
        repr_left = model(batch[0])
        repr_right = model(batch[1])
        # accumulate for evaluation
        code.extend(repr_left.cpu().data.numpy())
        nl.extend(repr_right.cpu().data.numpy())

    code = np.array(code)
    nl = np.array(nl)
    sim_mat = cosine_similarity(nl, code)
    ans_locs = location_of_correct(sim_mat)

    summary = {}
    mr = np.mean(ans_locs)
    mrr = get_mrr(ans_locs)
    summary["mrr"] = mrr

    cutoffs = [1, 5, 10]
    fracs = []
    for c in cutoffs:
        frac = get_fraction_correct_at(ans_locs, c)
        fracs.append(frac)

    print("Num obs: {}".format(code.shape[0]))
    print("Mean Rank: {}".format(mr))
    print("MRR: {}".format(mrr))
    for c, f in zip(cutoffs, fracs):
        print("Fraction Correct@{}: {}".format(c, f))
        summary["success@{}".format(c)] = f
    return summary
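# The evaluation above relies on helpers (location_of_correct, get_mrr,
# get_fraction_correct_at) defined elsewhere in the repository. Under the
# assumption that query i's correct code snippet corresponds to column i of
# the similarity matrix, the ranking metrics can be sketched with plain numpy
# as follows; this is an illustrative sketch, not the repository's implementation.
def ranking_metrics_sketch(sim_mat):
    import numpy as np

    # 1-based rank of the correct (diagonal) candidate for each query row
    order = np.argsort(-sim_mat, axis=1)
    ranks = np.array([
        int(np.where(order[i] == i)[0][0]) + 1
        for i in range(sim_mat.shape[0])
    ])
    mean_rank = float(np.mean(ranks))
    mrr = float(np.mean(1.0 / ranks))                  # mean reciprocal rank
    success_at = {k: float(np.mean(ranks <= k)) for k in (1, 5, 10)}
    return mean_rank, mrr, success_at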
def main():
    parser = argparse.ArgumentParser(
        description='Extracts keywords for given algorithm on given corpora')
    parser.add_argument('-a',
                        '--algorithm',
                        help='Algorithm to use like rake or tfidf',
                        default="rake")
    parser.add_argument('-c',
                        '--corpora',
                        help='Corpora to annotate as list',
                        nargs='+',
                        default=['state_of_the_union'])
    parser.add_argument('-t',
                        '--translate',
                        help='Translate keywords',
                        action='store_true')
    args = vars(parser.parse_args())

    config = ConfigLoader.get_config()

    # remove and use actual args
    chosen_corpora = [
        # 'state_of_the_union',
        'bundestag', 'abstract', 'sustainability'
    ]  # args['corpora']

    PathMetaData = namedtuple('PathMetaData', 'path corpus_name language')
    paths_and_meta_data = [
        PathMetaData(config["corpora"]["bundestag_corpus"], "bundestag",
                     Language.DE),
        PathMetaData(config["corpora"]["abstract_corpus"], "abstract",
                     Language.EN),
        PathMetaData(config["corpora"]["sustainability_corpus"],
                     "sustainability", Language.EN),
        PathMetaData(config["corpora"]["state_of_the_union_corpus"],
                     "state_of_the_union", Language.EN),
        PathMetaData(config["corpora"]["united_nations_corpus"],
                     "united_nations", Language.EN)
    ]
    paths_and_meta_data = [
        path_meta for path_meta in paths_and_meta_data
        if path_meta.corpus_name in chosen_corpora
    ]

    corpora = [
        Corpus(source=path_meta.path,
               name=path_meta.corpus_name,
               language=path_meta.language)
        for path_meta in paths_and_meta_data
    ]

    for corpus, path_meta in zip(corpora, paths_and_meta_data):
        corpus.save_corpus_without_text(modify_path(path_meta.path))
def topical_page_rank_pke(cls, corpus: Corpus):
    # define the set of valid Part-of-Speeches
    pos = {'NOUN', 'PROPN', 'ADJ'}

    # define the grammar for selecting the keyphrase candidates
    grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"

    # 1. create a TopicalPageRank extractor.
    extractor = pke.unsupervised.TopicalPageRank()
    if corpus.language == Language.DE:
        lan = "de"
    else:
        lan = "en"

    # 2. load the content of the document.
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating Topical PageRank"):
        extractor.load_document(input=document.text,
                                language=lan,
                                normalization="lemmatization")

        # 3. select the noun phrases as keyphrase candidates.
        extractor.candidate_selection(grammar=grammar)

        # 4. weight the keyphrase candidates using Single Topical PageRank.
        #    Builds a word-graph in which edges connecting two words occurring
        #    in a window are weighted by co-occurrence counts.
        extractor.candidate_weighting(
            window=10, pos=pos,
            lda_model='path/to/lda_model')  # todo: find model

        # 5. get the 10-highest scored candidates as keyphrases
        # keyphrases = extractor.get_n_best(n=top_k)
        # corpus.assign_keywords(keywords={document.doc_id: keyphrases},
        #                        keyword_type=KeywordType.TOPICAL_PAGE_RANK_PKE)
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)

    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.TOPICAL_PAGE_RANK_PKE)
def encode(content, word_delimiter="|", tag_delimiter="/", num_step=60):
    # Create corpus instance
    corpus = Corpus(word_delimiter=word_delimiter, tag_delimiter=tag_delimiter)

    # Add text to corpus
    corpus.add_text(content)

    # Create index for character and tag
    char_index = index_builder(constant.CHARACTER_LIST,
                               constant.CHAR_START_INDEX)
    tag_index = index_builder(constant.TAG_LIST, constant.TAG_START_INDEX)

    # Generate input
    inb = InputBuilder(corpus, char_index, tag_index, num_step, y_one_hot=False)

    # Display encoded content
    np.set_printoptions(threshold=np.inf)
    print("[Input]")
    print(inb.x)
    print("[Label]")
    print(inb.y)
def yake_pke(cls, corpus: Corpus):
    # 1. create a YAKE extractor.
    extractor = pke.unsupervised.YAKE()
    if corpus.language == Language.DE:
        lan = "de"
        stop_list = stopwords.words('german')
    else:
        lan = "en"
        stop_list = stopwords.words('english')

    # 2. load the content of the document.
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating YAKE"):
        extractor.load_document(input=document.text,
                                language=lan,
                                normalization="lemmatization")

        # 3. select {1-3}-grams not containing punctuation marks and not
        #    beginning/ending with a stopword as candidates.
        extractor.candidate_selection(n=3, stoplist=stop_list)

        # 4. weight the candidates using YAKE weighting scheme, a window (in
        #    words) for computing left/right contexts can be specified.
        window = 2
        extractor.candidate_weighting(window=window,
                                      stoplist=stop_list,
                                      use_stems=True)

        # 5. get the 10-highest scored candidates as keyphrases.
        #    redundant keyphrases are removed from the output using levenshtein
        #    distance and a threshold.
        threshold = 0.8
        # keyphrases = extractor.get_n_best(n=top_k, threshold=threshold)
        # corpus.assign_keywords(keywords={document.doc_id: keyphrases},
        #                        keyword_type=KeywordType.YAKE_PKE)
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k,
                                                         threshold=threshold)

    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.YAKE_PKE)
def position_rank_pke(cls, corpus: Corpus):
    # define the set of valid Part-of-Speeches
    pos = {'NOUN', 'PROPN', 'ADJ'}

    # define the grammar for selecting the keyphrase candidates
    grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"

    # 1. create a PositionRank extractor.
    extractor = pke.unsupervised.PositionRank()
    if corpus.language == Language.DE:
        lan = "de"
    else:
        lan = "en"

    # 2. load the content of the document.
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating PositionRank"):
        extractor.load_document(input=document.text,
                                language=lan,
                                normalization="lemmatization")

        # 3. select the noun phrases up to 3 words as keyphrase candidates.
        extractor.candidate_selection(grammar=grammar, maximum_word_number=3)

        # 4. weight the candidates using the sum of their word's scores that are
        #    computed using random walk biased with the position of the words
        #    in the document. In the graph, nodes are words (nouns and
        #    adjectives only) that are connected if they occur in a window of
        #    10 words.
        extractor.candidate_weighting(window=10, pos=pos)

        # 5. get the 10-highest scored candidates as keyphrases
        # corpus.assign_keywords(keywords={document.doc_id: keyphrases},
        #                        keyword_type=KeywordType.POSITION_RANK_PKE)
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)

    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.POSITION_RANK_PKE)
def main():
    parser = argparse.ArgumentParser(
        description='Extracts keywords for given algorithm on given corpora')
    parser.add_argument('-c',
                        '--corpora',
                        help='Corpora to annotate as list',
                        nargs='+',
                        default=['bundestag', 'abstract'])
    parser.add_argument('-t',
                        '--translate',
                        help='Translate keywords',
                        action='store_true')
    args = vars(parser.parse_args())

    config = ConfigLoader.get_config()
    chosen_corpora = args['corpora']

    PathMetaData = namedtuple('PathMetaData', 'path corpus_name language')
    paths_and_meta_data = [
        PathMetaData(config["corpora"]["state_of_the_union_corpus"],
                     "state_of_the_union", Language.EN),
        PathMetaData(config["corpora"]["bundestag_corpus"], "bundestag",
                     Language.DE),
        PathMetaData(config["corpora"]["abstract_corpus"], "abstract",
                     Language.EN),
        PathMetaData(config["corpora"]["sustainability_corpus"],
                     "sustainability", Language.EN),
        PathMetaData(config["corpora"]["united_nations_corpus"],
                     "united_nations", Language.EN)
    ]
    paths_and_meta_data = [
        path_meta for path_meta in paths_and_meta_data
        if path_meta.corpus_name in chosen_corpora
    ]

    print(f'Yearwise of {chosen_corpora}')
    corpora = [
        Corpus(source=path_meta.path,
               name=path_meta.corpus_name,
               language=path_meta.language)
        for path_meta in paths_and_meta_data
    ]
    corpora = [corpus.year_wise_pseudo_documents() for corpus in corpora]

    for corpus, path_meta in zip(corpora, paths_and_meta_data):
        corpus.save_corpus(modify_path(path_meta.path))
        corpus.save_corpus_without_text(
            modify_path(path_meta.path, without_text=True))
def topic_rank_pke(cls, corpus: Corpus):
    # define the set of valid Part-of-Speeches
    pos = {'NOUN', 'PROPN', 'ADJ'}

    # 1. create a TopicRank extractor.
    extractor = pke.unsupervised.TopicRank()
    if corpus.language == Language.DE:
        lan = "de"
        stop_list = stopwords.words('german')
    else:
        lan = "en"
        stop_list = stopwords.words('english')

    # 2. load the content of the document.
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating TopicRank"):
        stop_list += list(string.punctuation)
        stop_list += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        extractor.load_document(input=document.text,
                                language=lan,
                                normalization="lemmatization")

        # 3. select candidates that do not contain punctuation or stopwords.
        extractor.candidate_selection(pos=pos, stoplist=stop_list)

        # 4. build topics by grouping candidates with HAC (average linkage,
        #    threshold of 1/4 of shared stems). Weight the topics using random
        #    walk, and select the first occurring candidate from each topic.
        extractor.candidate_weighting(threshold=0.74, method='average')

        # 5. get the 10-highest scored candidates as keyphrases
        # keyphrases = extractor.get_n_best(n=top_k)
        # corpus.assign_keywords(keywords={document.doc_id: keyphrases},
        #                        keyword_type=KeywordType.TOPIC_RANK_PKE)
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)

    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.TOPIC_RANK_PKE)
def build_phrase_model():
    phrase_list = load_phrases()
    phrases = Phrases(Corpus(CORPUS_FILE))
    bigrams = Phraser(phrases)
    bigrams.save(MODEL_FILE)
    years = Corpus(CORPUS_FILE).get_years()
    authors = Corpus(CORPUS_FILE).get_authors()
    with open(OUT_FILE, "w") as f:
        for i, line in tqdm(enumerate(bigrams[Corpus(CORPUS_FILE)])):
            line = remove_under(line)
            line = check_phrase_list(phrase_list, line)
            line = [authors[i]] + line
            line = [years[i]] + line
            f.write("{}\n".format(" ".join(remove_under(line))))
def multipartite_rank_pke(cls, corpus: Corpus):
    # define the set of valid Part-of-Speeches
    pos = {'NOUN', 'PROPN', 'ADJ'}

    # 1. create a MultipartiteRank extractor.
    extractor = pke.unsupervised.MultipartiteRank()
    # compare against the Language enum, as in the other extractors
    if corpus.language == Language.DE:
        lan = "de"
        stop_list = stopwords.words('german')
    else:
        lan = "en"
        stop_list = stopwords.words('english')

    # 2. load the content of the document.
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating MultipartiteRank"):
        stop_list += list(string.punctuation)
        stop_list += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        extractor.load_document(input=document.text,
                                language=lan,
                                normalization="lemmatization")

        # 3. select candidates that do not contain punctuation or stopwords.
        extractor.candidate_selection(pos=pos, stoplist=stop_list)

        # 4. build the Multipartite graph and rank candidates using random walk,
        #    alpha controls the weight adjustment mechanism, see TopicRank for
        #    threshold/method parameters.
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.74,
                                      method='average')

        # 5. get the 10-highest scored candidates as keyphrases
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)

    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.MULTIPARTITE_RANK_PKE)
def main(args):
    ###########################################################################
    # Load the models
    ###########################################################################
    model_args, idx2word, autoencoder, inverter, gan_gen, gan_disc = \
        load_models(args.load_path)

    # Set the random seed manually for reproducibility.
    random.seed(model_args['seed'])
    np.random.seed(model_args['seed'])
    torch.manual_seed(model_args['seed'])
    if torch.cuda.is_available():
        torch.cuda.manual_seed(model_args['seed'])
    else:
        print("Note that our pre-trained models require CUDA to evaluate.")

    ###########################################################################
    # Load data
    ###########################################################################
    corpus = Corpus(model_args['data_path'],
                    maxlen=model_args['maxlen'],
                    vocab_size=model_args['vocab_size'],
                    lowercase=model_args['lowercase'])
    if args.test:
        eval_batch_size = 1
        test_data = batchify(corpus.test, eval_batch_size, shuffle=False)
    else:
        train_data = batchify(corpus.train, model_args['batch_size'],
                              shuffle=True)
    print("Loaded data!")

    ###########################################################################
    # Perturbations
    ###########################################################################
    ring_rng = np.linspace(0., 1., 100)
    n_rng = len(test_data) if args.test else len(train_data)
    for idx in range(n_rng):
        data_batch = test_data[idx] if args.test else train_data[idx]
        for l, r in zip(ring_rng, ring_rng[1:]):
            flg = perturb(data_batch,
                          autoencoder,
                          idx2word,
                          model_args['sample'],
                          model_args['maxlen'],
                          left=l,
                          right=r,
                          n_samples=5,
                          epoch=idx,
                          gpu=model_args['cuda'])
            if flg:
                break
def tfidf_pke(cls, corpus: Corpus):
    stop_list = list(string.punctuation)

    # 1. create a TfIdf extractor.
    extractor = pke.unsupervised.TfIdf()

    # 2. load the content of the document.
    if corpus.language == Language.DE:
        lan = "de"
    else:
        lan = "en"

    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating TF-IDF PKE"):
        extractor.load_document(input=document.text,
                                language=lan,
                                normalization="lemmatization")

        # 3. select {1-3}-grams not containing punctuation marks as candidates.
        #    must link spacy languages to language code
        extractor.candidate_selection(n=3, stoplist=stop_list)

        # pke.compute_document_frequency(input_dir='/path/to/collection/of/documents/',
        #                                output_file='output.tsv.gz',
        #                                extension='xml',
        #                                language='en',
        #                                normalization="lemmatization",
        #                                stoplist=stop_list)
        #
        # # 4. weight the candidates using a `tf` x `idf`
        # df = pke.load_document_frequency_file(input_file='output.tsv.gz')
        #
        # extractor.candidate_weighting(df=df)
        extractor.candidate_weighting()

        # 5. get the 10-highest scored candidates as keyphrases
        # keyphrases = extractor.get_n_best(n=top_k)
        # corpus.assign_keywords(keywords={document.doc_id: keyphrases},
        #                        keyword_type=KeywordType.TFIDF_PKE)
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)

    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.TFIDF_PKE)
def cleaning_un(config, overwrite=True):
    corpus = Corpus(source=config["corpora"]["united_nations_corpus"],
                    language=Language.EN,  # treated as English in the keyword scripts
                    name="united_nations_corpus")
    corpus = Corpus(source=[d for d in corpus.get_documents() if d.date],
                    language=corpus.language,
                    name=corpus.name)
    print("1", len(corpus))
    for d in corpus.get_documents():
        d.date = int(d.date)
    print("2", len(corpus))
    if not overwrite:
        os.rename(src=config["corpora"]["united_nations_corpus"],
                  dst=create_new_filepath_uncleaned(
                      config["corpora"]["united_nations_corpus"]))
    corpus.save_corpus(config["corpora"]["united_nations_corpus"])
def cleaning_bundestag(config, overwrite=True):
    corpus = Corpus(source=config["corpora"]["bundestag_corpus"],
                    language=Language.DE,
                    name="bundestag_corpus")
    # corpus = DataHandler.load_corpus(config["corpora"]["bundestag_corpus"])
    corpus = Corpus(source=[d for d in corpus.get_documents() if d.date],
                    language=corpus.language,
                    name=corpus.name)
    print("1", len(corpus))
    for d in corpus.get_documents():
        d.date = int(d.date)
    print("2", len(corpus))
    if not overwrite:
        os.rename(src=config["corpora"]["bundestag_corpus"],
                  dst=create_new_filepath_uncleaned(
                      config["corpora"]["bundestag_corpus"]))
    corpus.save_corpus(config["corpora"]["bundestag_corpus"])
def cleaning_abstracts(config, overwrite=True):
    corpus = Corpus(source=config["corpora"]["abstract_corpus"],
                    language=Language.EN,
                    name="abstract_corpus")
    # corpus = DataHandler.load_corpus(config["corpora"]["abstract_corpus"])
    print("1", len(corpus))
    corpus = Corpus([
        d for d in corpus.get_documents()
        if d.date and len(str(d.date)) == 4 and d.date.isnumeric()
    ],
                    name=corpus.name,
                    language=Language.EN)
    for d in corpus.get_documents():
        d.date = int(d.date)
    print("2", len(corpus))
    if not overwrite:
        os.rename(src=config["corpora"]["abstract_corpus"],
                  dst=create_new_filepath_uncleaned(
                      config["corpora"]["abstract_corpus"]))
    corpus.save_corpus(config["corpora"]["abstract_corpus"])
def data_generator(args):
    file, testfile, valfile = getattr(observations, args.dataset)('data/')
    # Just replace <eos> with another unusual character here (one that is
    # not in PTB)
    file, testfile, valfile = (file.replace('<eos>', chr(255)),
                               testfile.replace('<eos>', chr(255)),
                               valfile.replace('<eos>', chr(255)))
    file_len = len(file)
    valfile_len = len(valfile)
    testfile_len = len(testfile)

    ############################################################
    # Use the following if you want to pickle the loaded data
    pickle_name = "{0}.corpus".format(args.dataset)
    if os.path.exists(pickle_name):
        print("Loading cached data...")
        corpus = pickle.load(open(pickle_name, 'rb'))
    else:
        corpus = Corpus(file + " " + valfile + " " + testfile)
        pickle.dump(corpus, open(pickle_name, 'wb'))
    ############################################################

    return file, file_len, valfile, valfile_len, testfile, testfile_len, corpus