def main():
    """Evaluate a keyword-extraction algorithm on the chosen corpora.

    CLI options select the algorithm, the corpora to compare and the number
    of top keywords considered during evaluation.
    """
    parser = argparse.ArgumentParser(
        description='Extracts keywords for given algorithm on given corpora')
    parser.add_argument('-a', '--algorithm',
                        help='Algorithm to use like rake or tfidf',
                        default="rake")
    parser.add_argument('-c', '--corpora',
                        help='Two Corpora to operate on ',
                        nargs='+',
                        default=['state_of_the_union', 'abstract'])
    parser.add_argument('-k', '--top_k',
                        help='number of elements for output',
                        type=int,
                        default=100)
    args = vars(parser.parse_args())

    config = ConfigLoader.get_config()

    # Fixed: the parsed CLI arguments were previously shadowed by
    # hard-coded debug values (the old "# remove and use actual args" TODO).
    chosen_corpora = args['corpora']
    algorithm = args['algorithm']
    top_k = args['top_k']
    # NOTE(review): yearwise is still hard-coded — expose as a CLI flag if
    # per-document evaluation is ever needed.
    yearwise = True

    evaluate_single(config, algorithm, chosen_corpora, top_k,
                    use_unassigned=True, yearwise=yearwise)
def main():
    """Load the corpora selected on the command line and re-save them
    without their document texts (metadata-only copies).
    """
    parser = argparse.ArgumentParser(
        description='Extracts keywords for given algorithm on given corpora')
    parser.add_argument('-a', '--algorithm',
                        help='Algorithm to use like rake or tfidf',
                        default="rake")
    parser.add_argument('-c', '--corpora',
                        help='Corpora to annotate as list',
                        nargs='+',
                        default=['state_of_the_union'])
    # NOTE(review): -t/--translate is parsed but never used in this script.
    parser.add_argument('-t', '--translate', help='Translate keywords',
                        action='store_true')
    args = vars(parser.parse_args())

    config = ConfigLoader.get_config()

    # Fixed: the CLI selection was shadowed by a hard-coded corpus list.
    chosen_corpora = args['corpora']

    PathMetaData = namedtuple('PathMetaData', 'path corpus_name language')

    paths_and_meta_data = [
        PathMetaData(config["corpora"]["bundestag_corpus"],
                     "bundestag", Language.DE),
        PathMetaData(config["corpora"]["abstract_corpus"],
                     "abstract", Language.EN),
        PathMetaData(config["corpora"]["sustainability_corpus"],
                     "sustainability", Language.EN),
        PathMetaData(config["corpora"]["state_of_the_union_corpus"],
                     "state_of_the_union", Language.EN),
        PathMetaData(config["corpora"]["united_nations_corpus"],
                     "united_nations", Language.EN)
    ]

    # Keep only the corpora the user asked for.
    paths_and_meta_data = [path_meta for path_meta in paths_and_meta_data
                           if path_meta.corpus_name in chosen_corpora]

    corpora = [Corpus(source=path_meta.path,
                      name=path_meta.corpus_name,
                      language=path_meta.language)
               for path_meta in paths_and_meta_data]

    for corpus, path_meta in zip(corpora, paths_and_meta_data):
        corpus.save_corpus_without_text(modify_path(path_meta.path))
def main():
    """Run the corpus-cleaning step for the United Nations corpus.

    Cleaning deletes unusable documents and replaces the date field with a
    plain year integer. Cleaners for the other corpora are kept below as
    disabled one-off steps, to be re-enabled manually when needed.
    """
    cfg = ConfigLoader.get_config()

    # One-off cleaners for the other corpora (run manually when required):
    # cleaning_abstracts(cfg, overwrite=False)
    # cleaning_sustainability(cfg, overwrite=False)
    # cleaning_bundestag(cfg, overwrite=True)
    # cleaning_authors(cfg, overwrite=True)

    cleaning_un(cfg, overwrite=False)
def main():
    """Build year-wise pseudo documents for the selected corpora and save
    them, both with and without the document texts.
    """
    parser = argparse.ArgumentParser(
        description='Extracts keywords for given algorithm on given corpora')
    parser.add_argument('-c', '--corpora',
                        help='Corpora to annotate as list',
                        nargs='+',
                        default=['bundestag', 'abstract'])
    parser.add_argument('-t', '--translate', help='Translate keywords',
                        action='store_true')
    args = vars(parser.parse_args())

    config = ConfigLoader.get_config()
    chosen_corpora = args['corpora']

    PathMetaData = namedtuple('PathMetaData', 'path corpus_name language')

    # All known corpora with their config path and document language.
    all_meta = [
        PathMetaData(config["corpora"]["state_of_the_union_corpus"],
                     "state_of_the_union", Language.EN),
        PathMetaData(config["corpora"]["bundestag_corpus"],
                     "bundestag", Language.DE),
        PathMetaData(config["corpora"]["abstract_corpus"],
                     "abstract", Language.EN),
        PathMetaData(config["corpora"]["sustainability_corpus"],
                     "sustainability", Language.EN),
        PathMetaData(config["corpora"]["united_nations_corpus"],
                     "united_nations", Language.EN)
    ]

    # Restrict to the corpora requested on the command line.
    paths_and_meta_data = [meta for meta in all_meta
                           if meta.corpus_name in chosen_corpora]

    print(f'Yearwise of {chosen_corpora}')

    # Load each corpus and immediately aggregate it into one pseudo
    # document per year.
    corpora = []
    for meta in paths_and_meta_data:
        loaded = Corpus(source=meta.path, name=meta.corpus_name,
                        language=meta.language)
        corpora.append(loaded.year_wise_pseudo_documents())

    for corpus, meta in zip(corpora, paths_and_meta_data):
        corpus.save_corpus(modify_path(meta.path))
        corpus.save_corpus_without_text(
            modify_path(meta.path, without_text=True))
def main():
    """Fill in missing keyword translations in the given keyword files.

    For each keyword entry, whichever translation side (English or German)
    is still ``None`` is produced via the authenticated Google Cloud
    Translation client when OAuth credentials are configured, otherwise via
    the unauthenticated ``googletrans`` fallback. Translations are cached
    on disk; the cache is saved even when the run is interrupted.
    """
    parser = argparse.ArgumentParser(
        description='Translates keywords in keyword files of given paths')
    parser.add_argument(
        '-p', '--paths',
        help='Paths of keyword files to translate',
        nargs='+',
        default=['data/bundestag_corpus_rake_keywords_yearwise.json'])
    args = vars(parser.parse_args())
    # -p data/bundestag_corpus_rake_keywords.json

    config = ConfigLoader.get_config()
    paths = args['paths']
    cache_file = config["translator"]["cache_file"]
    google_client_secrets_file = config["translator"][
        "google_client_secret_file"]

    # No per-request timeout is configured; passed through to translate().
    # (Fixed: the old code had a dead `timeout = timeout` self-assignment.)
    timeout = None

    # An empty string in the config means "no credentials configured".
    if google_client_secrets_file:
        # Authenticated Google Cloud translation via an OAuth browser flow.
        appflow = flow.InstalledAppFlow.from_client_secrets_file(
            google_client_secrets_file,
            scopes=['https://www.googleapis.com/auth/cloud-platform'])
        appflow.run_local_server()  # launch browser
        # appflow.run_console()
        translator = g_translate.Client(credentials=appflow.credentials)
    else:
        # fallback: unauthenticated googletrans client
        translator = googletrans.Translator()

    try:
        cache = load_cache_from_file(cache_file)
    except Exception as e:
        # A missing/corrupt cache is not fatal — start with an empty one.
        logging.warning("Loading of file failed")
        logging.warning(e)
        cache = {"de2en": {}, "en2de": {}}

    def iterate_keywords(data):
        # Walk every document's keyword list and fill in whichever
        # translation direction is still missing.
        tqdm_bar = tqdm(data.items(), total=len(data.keys()))
        for doc_id, keywords in tqdm_bar:
            if keywords:
                for keyword in keywords:
                    en_translation = keyword["english_translation"]
                    ger_translation = keyword["german_translation"]
                    if en_translation is None:
                        translated = translate(ger_translation, cache,
                                               translator, timeout,
                                               dest="en")
                        keyword["english_translation"] = translated
                    if ger_translation is None:
                        translated = translate(en_translation, cache,
                                               translator, timeout,
                                               dest="de")
                        keyword["german_translation"] = translated

    try:
        for path in paths:
            logging.debug(f'loading keywords at \"{path}\"')
            with open(path, encoding='utf-8') as f:
                data = json.load(f)

            logging.debug('translating keywords ...')
            iterate_keywords(data)

            logging.debug(f'saving keywords with translations at \"{path}\"')
            with open(path, "w", encoding='utf-8') as f:
                json.dump(data, f, indent=1, ensure_ascii=True)
    except KeyboardInterrupt:
        logging.debug('process was interrupted')
    finally:
        # Always persist the translation cache, even on interruption.
        logging.debug('saving ...')
        save_cache_to_file(cache, cache_file)
import logging from typing import List, Union from flask import Flask, render_template, request, Response from utils import ConfigLoader, Corpus, Keyword, KeywordType, Language, KeywordTranslator, CorpusFilter from simple_statistics import yearwise_documents logging.basicConfig(level=logging.INFO) def modify_path(path: str, algorithm: str): return path.replace('.json', f'_{algorithm}.json') logging.info('importing corpora ...') config = ConfigLoader.get_config() corpus_data = {} keyword_data = {} min_year = 5000 max_year = 0 logging.info('importing corpora and keywords data ...') start_time = time.time() for corpus_name in config["corpora_for_viz"]: logging.info(corpus_name) with open(config["corpora_for_viz"][corpus_name]["corpus"]) as corpus_file: corpus_data[corpus_name] = json.load(corpus_file) with open(config["corpora_for_viz"][corpus_name] ["keywords"]) as keyword_file:
def main():
    """Exploratory demo of keyword extraction and keyword translation.

    Loads one corpus, extracts keywords with TF-IDF (on year-wise pseudo
    documents) and RAKE (on the plain corpus), then demonstrates the
    keyword translator on a handful of keywords. This is scratch/demo
    code: the loop structure and commented-out lines are intentional
    experiments, not production logic.
    """
    # load configuration parameters from config file
    config = ConfigLoader.get_config()

    # Alternative corpora kept for manual switching:
    # corpus = Corpus(source=config["corpora"]["abstract_corpus"], language=Language.EN, name="abstract_corpus")
    # corpus = Corpus(source=config["corpora"]["bundestag_corpus"], language=Language.DE, name="bundestag_corpus")
    corpus = Corpus(source=config["corpora"]["sustainability_corpus"],
                    language=Language.EN,
                    name="sustainability_corpus")
    # print(len(corpus))
    # test = DocumentsFilter.filter(corpus, has_tags=['test'])
    # print(set([x.tags for x in test]))
    # print(len(test))
    # # exit(0)

    # Work on a small sample to keep the demo fast.
    corpus = corpus.get_n_documents_as_corpus(n=100)

    # build yearwise pseudo documents
    pseudo_corpus = corpus.year_wise_pseudo_documents()

    # extract keywords: TF-IDF on the year-wise pseudo documents,
    # RAKE on the sampled corpus itself.
    KeyPhraseExtractor.tfidf_skl(corpus=pseudo_corpus)
    print([d.keywords for d in pseudo_corpus.get_documents()])
    KeyPhraseExtractor.rake(corpus=corpus)
    print([d.keywords for d in corpus.get_documents()])

    # key_words_post = Document.group_keywords_year_wise(corpus)
    # key_words_pre = Document.transform_pseudo_docs_keywords_to_dict(KeyPhraseExtractor.rake(documents=pseudo_corpus))
    # print(KeyPhraseExtractor.get_top_k_keywords(key_words_post_group, 10))
    # print(KeyPhraseExtractor.get_top_k_keywords(key_words_pre_group, 10))
    # format: {year->list fo keywords}

    # Translate at most ~100 keywords of the first document only — the
    # unconditional `break` below leaves the outer loop after one document.
    kwt = KeywordTranslator(cache_file=config["translator"]["cache_file"])
    counter = 0
    for doc in corpus.get_documents():
        for keyword in doc.keywords:
            if counter > 100:
                break
            kwt.translate(keyword)
            print(keyword)
            counter += 1
        break

    print('extracting keywords with rake ...')
    # Run RAKE on a single document and translate its keywords one by one.
    rake_keywords = KeyPhraseExtractor.rake(corpus=corpus.get_documents()[0])
    rake_keywords_keys = list(rake_keywords.keys())
    print('rake keywords dict keys:', rake_keywords_keys)
    kwt = KeywordTranslator(cache_file=config["translator"]["cache_file"])
    list_of_keywords = []
    for k in rake_keywords[rake_keywords_keys[0]]:
        # NOTE(review): keywords are stored as german_translation even for
        # an English corpus — presumably the translator fills in the other
        # side; confirm against KeywordTranslator.translate.
        kw = Keyword(german_translation=k, keyword_type=KeywordType.RAKE)
        kwt.translate(kw)
        list_of_keywords.append(kw)
        print('{} \t {} \t\t\t {}'.format(kw.source_language,
                                          kw.english_translation,
                                          kw.german_translation))
def main():
    """Extract keywords for the chosen corpora with the chosen algorithm.

    Depending on configuration, keywords are either assigned back into the
    corpus file or written to a separate ``*_keywords.json`` file. With
    ``yearwise`` the corpora are first aggregated into one pseudo document
    per year.
    """
    parser = argparse.ArgumentParser(
        description='Extracts keywords for given algorithm on given corpora')
    parser.add_argument('-a', '--algorithm',
                        help='Algorithm to use like rake or tfidf_skl',
                        default="rake")
    parser.add_argument('-c', '--corpora',
                        help='Corpora to annotate as list',
                        nargs='+',
                        default=['bundestag'])
    parser.add_argument('-t', '--translate', help='Translate keywords',
                        action='store_true')
    args = vars(parser.parse_args())

    config = ConfigLoader.get_config()

    algorithm = args['algorithm']
    # Fixed: the -t flag was previously ignored (hard-coded to False).
    translate_keywords = args['translate']
    chosen_corpora = args['corpora']
    # NOTE(review): these two are still hard-coded switches.
    assign_keywords = False
    yearwise = True

    PathMetaData = namedtuple('PathMetaData', 'path corpus_name language')

    paths_and_meta_data = [
        PathMetaData(config["corpora"]["state_of_the_union_corpus"],
                     "state_of_the_union", Language.EN),
        PathMetaData(config["corpora"]["bundestag_corpus"],
                     "bundestag", Language.DE),
        PathMetaData(config["corpora"]["abstract_corpus"],
                     "abstract", Language.EN),
        PathMetaData(config["corpora"]["sustainability_corpus"],
                     "sustainability", Language.EN),
        PathMetaData(config["corpora"]["united_nations_corpus"],
                     "united_nations", Language.EN)
    ]

    paths_and_meta_data = [path_meta for path_meta in paths_and_meta_data
                           if path_meta.corpus_name in chosen_corpora]

    if yearwise:
        # Year-level pseudo documents are much larger, so allow more
        # keywords and longer n-grams.
        KeyPhraseExtractor.top_k = 1000
        KeyPhraseExtractor.max_ngram = 3

    # Dispatch table from algorithm name to extractor function.
    use = {
        "rake": KeyPhraseExtractor.rake,
        "tfidf_skl": KeyPhraseExtractor.tfidf_skl,
        "tfidf_pke": KeyPhraseExtractor.tfidf_pke,
        "text_rank": KeyPhraseExtractor.text_rank,
        "text_rank_pke": KeyPhraseExtractor.text_rank_pke,
        "yake": KeyPhraseExtractor.yake_pke,
        "single_rank": KeyPhraseExtractor.single_rank_pke,
        "topic_rank": KeyPhraseExtractor.topic_rank_pke,
        "topical_page_rank": KeyPhraseExtractor.topical_page_rank_pke,
        "position_rank": KeyPhraseExtractor.position_rank_pke,
        "multipartite_rank": KeyPhraseExtractor.multipartite_rank_pke
    }
    keyword_extractor = use[algorithm]

    print(f'Applied {algorithm} on {chosen_corpora} '
          f'with translation={translate_keywords}')

    corpora = [Corpus(source=path_meta.path,
                      name=path_meta.corpus_name,
                      language=path_meta.language)
               for path_meta in paths_and_meta_data]

    if yearwise:
        corpora = [corpus.year_wise_pseudo_documents() for corpus in corpora]

    for corpus, path_meta in zip(corpora, paths_and_meta_data):
        keyword_extractor(corpus=corpus)

        # Fixed: translation used to run BEFORE extraction, when no
        # keywords existed yet, so it never translated anything.
        if translate_keywords:
            kwt = KeywordTranslator(
                cache_file=config["translator"]["cache_file"])
            corpus.translate_keywords(kwt)

        if assign_keywords:
            # Store keywords inside the corpus file itself.
            new_path = str(path_meta.path).replace('.json',
                                                   f"_{algorithm}.json")
            corpus.save_corpus(new_path)
        else:
            # Store keywords in a separate doc_id -> keywords JSON file.
            new_path = str(path_meta.path).replace(
                '.json', f"_{algorithm}_keywords.json")
            if yearwise:
                new_path = str(new_path).replace('.json', f"_yearwise.json")
            keyword_storage = {
                doc_id: document.keywords
                for doc_id, document in corpus.documents.items()
            }
            with open(new_path, 'w', encoding='utf-8') as f:
                json.dump(keyword_storage, f, ensure_ascii=False, indent=1,
                          default=lambda o: o.__dict__)
            print(f'wrote file {new_path}')
def test_filter():
    """Exercise CorpusFilter with each supported filter criterion and check
    that every returned document actually satisfies the criterion.
    """
    config = ConfigLoader.get_config(relative_path="..")
    # Fixed: the corpora were all mislabeled "sustainability_corpus" and the
    # Bundestag corpus was tagged with the wrong language.
    corpora = [
        Corpus(source=config["corpora"]["abstract_corpus"],
               language=Language.EN,
               name="abstract_corpus"),
        Corpus(source=config["corpora"]["bundestag_corpus"],
               language=Language.DE,
               name="bundestag_corpus"),
        Corpus(source=config["corpora"]["sustainability_corpus"],
               language=Language.EN,
               name="sustainability_corpus")
    ]

    for corpus in corpora:
        corpus = corpus.get_n_documents_as_corpus(200)

        # test text_contains_one_of: each hit must contain at least one term
        test_text_words = ['climate', 'klima']
        test = CorpusFilter.filter(corpus,
                                   text_contains_one_of=test_text_words)
        for t in test:
            assert any(ttw in t.text for ttw in test_text_words)

        # test date_in_range (range(2015, 2016) covers only the year 2015)
        test_date_range = range(2015, 2016)
        test = CorpusFilter.filter(corpus, date_in_range=test_date_range)
        for t in test:
            assert t.date in test_date_range

        # test is_one_of_languages
        # Fixed: the old check used `is not` against the whole list, which
        # is always true for a string, so a match would fail the test.
        test_languages = ['english', 'en']
        test = CorpusFilter.filter(corpus,
                                   is_one_of_languages=test_languages)
        for t in test:
            assert t.language.lower() in test_languages

        # test is_one_of_doc_ids — identity (`is`) replaced with equality
        test_doc_id = '0'
        test = CorpusFilter.filter(corpus, is_one_of_doc_ids=[test_doc_id])
        for t in test:
            assert t.doc_id == test_doc_id

        # test has_authors
        test_author = 'test'
        test = CorpusFilter.filter(corpus, has_authors=[test_author])
        for t in test:
            assert test_author in t.author

        # test has_tags
        test_tags = 'test'
        test = CorpusFilter.filter(corpus, has_tags=[test_tags])
        for t in test:
            assert test_tags in t.tags

        # test is_one_of_parties
        # Fixed: test_parties is already a list; it was wrapped in another
        # list, so the filter received [["cdu", "FdP"]].
        test_parties = ["cdu", "FdP"]
        test = CorpusFilter.filter(corpus, is_one_of_parties=test_parties)
        for t in test:
            assert t.party.lower() in [x.lower() for x in test_parties]

        # test ratings_in_range
        test_rating_range = range(0, 7)
        test = CorpusFilter.filter(corpus,
                                   ratings_in_range=test_rating_range)
        for t in test:
            assert t.rating in test_rating_range

    # TODO: Test for keywords
def main():
    """Print document and token counts plus year-wise statistics for the
    four corpora.

    This is a reporting/inspection script: the large comment blocks below
    record the output of earlier runs (year lists followed by the
    per-year document or token counts) so the numbers can be consulted
    without re-running the expensive aggregation.
    """
    config = ConfigLoader.get_config()
    corpora = [
        Corpus(source=config["corpora"]["abstract_corpus"],
               language=Language.EN, name="abstract_corpus"),
        Corpus(source=config["corpora"]["bundestag_corpus"],
               language=Language.DE, name="bundestag_corpus"),
        Corpus(source=config["corpora"]["sustainability_corpus"],
               language=Language.EN, name="sustainability_corpus"),
        Corpus(source=config["corpora"]["state_of_the_union_corpus"],
               language=Language.EN, name="state_of_the_union_corpus")
    ]

    # count_non_years(corpora[0])
    # count_non_years(corpora[1])
    # count_non_years(corpora[2])
    # Results: non date vs useable date
    # abstract: 54135 / 261215, 1387 don't have a year as date but a string
    # bundestag: 0 / 877973
    # sustainability 3 / 221034

    # Total document counts per corpus.
    print(document_number(corpora[0]))
    print(document_number(corpora[1]))
    print(document_number(corpora[2]))
    print(document_number(corpora[3]))

    # Total token counts per corpus.
    print(token_number(corpora[0]))
    print(token_number(corpora[1]))
    print(token_number(corpora[2]))
    print(token_number(corpora[3]))
    # Results: token number
    # abstract: 59314582
    # bundestag: 226300348
    # sustainability: 52878146

    # Abstract corpus: documents per year.
    yearwise_documents(corpora[0], aggregation_func=len)
    # [1900, 1904, 1951, 1961, 1965, 1972, 1974, 1975, 1976, 1978, 1979, 1980, 1981, 1983, 1984, 1985, 1986, 1987, 1988,
    # 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
    # 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
    # [1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 3, 2, 1, 4, 14, 28, 47, 44, 124, 714, 962, 1080, 1143, 1513, 2104, 2341,
    # 2554, 2862, 2947, 3470, 3617, 4230, 4495, 4827, 5655, 6948, 8331, 10287, 11750, 14345, 16149, 19308, 20899,
    # 23429, 26201, 28937, 29835]

    # Abstract corpus: tokens per year.
    yearwise_documents(corpora[0], aggregation_func=token_number)
    # [1900, 1904, 1951, 1961, 1965, 1972, 1974, 1975, 1976, 1978, 1979, 1980, 1981, 1983, 1984, 1985, 1986, 1987, 1988,
    # 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
    # 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
    # [237, 289, 26, 196, 299, 4, 2, 302, 13, 35, 163, 2, 513, 13, 3, 354, 2763, 5930, 10297, 9573, 20802, 124895,
    # 172925, 202836, 227647, 303919, 435539, 496060, 558721, 628000, 653111, 770258, 822043, 937258, 1009178, 1078762,
    # 1283970, 1593002, 1880724, 2268271, 2621783, 3192629, 3664511, 4406424, 4775594, 5367972, 6024271,
    # 6682090, 7080373]

    # Bundestag corpus: documents per year.
    yearwise_documents(corpora[1], aggregation_func=len)
    # [1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967,
    # 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986,
    # 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
    # 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]
    # [1540, 7359, 7846, 7492, 6252, 5534, 5794, 7532, 6738, 4469, 4446, 7027, 5950, 7756, 8704, 12078, 13355, 14542,
    # 15855, 15673, 14876, 15917, 16901, 8760, 15082, 16343, 17110, 11914, 14095, 15597, 14811, 8937, 14207, 14647,
    # 9904, 16009, 19397, 16843, 10560, 16032, 16220, 11704, 14972, 14102, 17113, 11485, 16825, 17482, 13614, 9905,
    # 15310, 14208, 14124, 10926, 12884, 14305, 7757, 14210, 13508, 14408, 10609, 16643, 17751, 16497, 11335, 15374,
    # 14794, 13705, 5829, 17021, 9469]

    # Bundestag corpus: tokens per year.
    yearwise_documents(corpora[1], aggregation_func=token_number)
    # [1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967,
    # 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986,
    # 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
    # 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]
    # [612509, 2854944, 3061777, 3034065, 2113852, 2406060, 2380625, 2660021, 2460495, 2114953, 1715064, 2049805,
    # 1614656, 1634229, 1867580, 2135204, 2055406, 2452521, 2553521, 2575640, 2464189, 2675640, 2836025, 1644761,
    # 2665313, 3244912, 3004963, 2657335, 2751084, 2919374, 3366152, 2159773, 2722208, 3171091, 2280604, 3443955,
    # 3855233, 3566063, 2569335, 3565324, 4173720, 3067311, 3987509, 3832524, 4291976, 3145478, 4291797, 4338335,
    # 3925125, 3094547, 4464993, 4373147, 4392056, 3738766, 3946187, 4129635, 2350304, 4330315, 3983980, 4532271,
    # 3752798, 5167090, 5442241, 5468729, 3942007, 4846052, 4613129, 4046021, 1607377, 4583019, 2525648]

    # Sustainability corpus: documents per year.
    yearwise_documents(corpora[2], aggregation_func=len)
    # [1986, 1987, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
    # 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
    # [1, 1, 39, 297, 476, 572, 749, 1017, 1117, 1327, 1479, 1673, 1953, 2072, 2246, 2762, 2971, 3593, 4149, 5313, 6234,
    # 7880, 9095, 10858, 12484, 15035, 17163, 20084, 23485, 29233, 35676]

    # Sustainability corpus: tokens per year.
    yearwise_documents(corpora[2], aggregation_func=token_number)