def main():
    parser = argparse.ArgumentParser(
        description='Extracts keywords for given algorithm on given corpora')
    parser.add_argument('-a',
                        '--algorithm',
                        help='Algorithm to use, e.g. rake or tfidf',
                        default="rake")
    parser.add_argument('-c',
                        '--corpora',
                        help='Two corpora to operate on',
                        nargs='+',
                        default=['state_of_the_union', 'abstract'])
    parser.add_argument('-k',
                        '--top_k',
                        help='number of elements for output',
                        type=int,
                        default=100)
    args = vars(parser.parse_args())

    config = ConfigLoader.get_config()

    # TODO: remove these overrides and use the actual args
    chosen_corpora = ['sustainability', 'bundestag']  # args['corpora']
    algorithm = "rake"  # args['algorithm']
    top_k = 100  # args['top_k']
    yearwise = True

    evaluate_single(config,
                    algorithm,
                    chosen_corpora,
                    top_k,
                    use_unassigned=True,
                    yearwise=yearwise)
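# Example invocation once the debug overrides above are removed
# (the script name is hypothetical, flags as defined by the parser above):
#   python evaluate_keywords.py -a tfidf -c state_of_the_union abstract -k 50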
def main():
    parser = argparse.ArgumentParser(
        description='Extracts keywords for given algorithm on given corpora')
    parser.add_argument('-a',
                        '--algorithm',
                        help='Algorithm to use, e.g. rake or tfidf',
                        default="rake")
    parser.add_argument('-c',
                        '--corpora',
                        help='Corpora to annotate as list',
                        nargs='+',
                        default=['state_of_the_union'])
    parser.add_argument('-t',
                        '--translate',
                        help='Translate keywords',
                        action='store_true')
    args = vars(parser.parse_args())

    config = ConfigLoader.get_config()

    # TODO: remove these overrides and use the actual args
    chosen_corpora = [
        # 'state_of_the_union',
        'bundestag',
        'abstract',
        'sustainability'
    ]  # args['corpora']

    PathMetaData = namedtuple('PathMetaData', 'path corpus_name language')
    paths_and_meta_data = [
        PathMetaData(config["corpora"]["bundestag_corpus"], "bundestag",
                     Language.DE),
        PathMetaData(config["corpora"]["abstract_corpus"], "abstract",
                     Language.EN),
        PathMetaData(config["corpora"]["sustainability_corpus"],
                     "sustainability", Language.EN),
        PathMetaData(config["corpora"]["state_of_the_union_corpus"],
                     "state_of_the_union", Language.EN),
        PathMetaData(config["corpora"]["united_nations_corpus"],
                     "united_nations", Language.EN)
    ]
    paths_and_meta_data = [
        path_meta for path_meta in paths_and_meta_data
        if path_meta.corpus_name in chosen_corpora
    ]

    corpora = [
        Corpus(source=path_meta.path,
               name=path_meta.corpus_name,
               language=path_meta.language)
        for path_meta in paths_and_meta_data
    ]

    for corpus, path_meta in zip(corpora, paths_and_meta_data):
        corpus.save_corpus_without_text(modify_path(path_meta.path))
def main():
    # load configuration parameters from config file
    config = ConfigLoader.get_config()

    # deletes unusable documents and replaces date with year int
    # cleaning_abstracts(config, overwrite=False)
    # cleaning_sustainability(config, overwrite=False)
    # cleaning_bundestag(config, overwrite=True)
    #
    # cleaning_authors(config, overwrite=True)
    cleaning_un(config, overwrite=False)
def main():
    parser = argparse.ArgumentParser(
        description='Extracts keywords for given algorithm on given corpora')
    parser.add_argument('-c',
                        '--corpora',
                        help='Corpora to annotate as list',
                        nargs='+',
                        default=['bundestag', 'abstract'])
    parser.add_argument('-t',
                        '--translate',
                        help='Translate keywords',
                        action='store_true')
    args = vars(parser.parse_args())

    config = ConfigLoader.get_config()

    chosen_corpora = args['corpora']

    PathMetaData = namedtuple('PathMetaData', 'path corpus_name language')
    paths_and_meta_data = [
        PathMetaData(config["corpora"]["state_of_the_union_corpus"],
                     "state_of_the_union", Language.EN),
        PathMetaData(config["corpora"]["bundestag_corpus"], "bundestag",
                     Language.DE),
        PathMetaData(config["corpora"]["abstract_corpus"], "abstract",
                     Language.EN),
        PathMetaData(config["corpora"]["sustainability_corpus"],
                     "sustainability", Language.EN),
        PathMetaData(config["corpora"]["united_nations_corpus"],
                     "united_nations", Language.EN)
    ]

    paths_and_meta_data = [
        path_meta for path_meta in paths_and_meta_data
        if path_meta.corpus_name in chosen_corpora
    ]

    print(f'Building yearwise pseudo documents for {chosen_corpora}')

    corpora = [
        Corpus(source=path_meta.path,
               name=path_meta.corpus_name,
               language=path_meta.language)
        for path_meta in paths_and_meta_data
    ]

    corpora = [corpus.year_wise_pseudo_documents() for corpus in corpora]
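    # year_wise_pseudo_documents is assumed to merge all documents that share
    # a year into a single pseudo document, judging by its name and how the
    # result is saved below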

    for corpus, path_meta in zip(corpora, paths_and_meta_data):
        corpus.save_corpus(modify_path(path_meta.path))
        corpus.save_corpus_without_text(
            modify_path(path_meta.path, without_text=True))
def main():
    parser = argparse.ArgumentParser(
        description='Translates keywords in keyword files of given paths')
    parser.add_argument(
        '-p',
        '--paths',
        help='Paths of keyword files to translate',
        nargs='+',
        default=['data/bundestag_corpus_rake_keywords_yearwise.json'])
    args = vars(parser.parse_args())
    # example: -p data/bundestag_corpus_rake_keywords.json
    config = ConfigLoader.get_config()
    paths = args['paths']

    cache_file = config["translator"]["cache_file"]
    google_client_secrets_file = config["translator"][
        "google_client_secret_file"]
    timeout = None

    if google_client_secrets_file:

        appflow = flow.InstalledAppFlow.from_client_secrets_file(
            google_client_secrets_file,
            scopes=['https://www.googleapis.com/auth/cloud-platform'])
        appflow.run_local_server()  # launch browser
        # appflow.run_console()
        translator = g_translate.Client(credentials=appflow.credentials)

    else:  # fallback
        translator = googletrans.Translator()


    try:
        cache = load_cache_from_file(cache_file)
    except Exception as e:
        logging.warning("Loading of file failed")
        logging.warning(e)
        cache = {"de2en": {}, "en2de": {}}

    def iterate_keywords(data):
        tqdm_bar = tqdm(data.items(), total=len(data.keys()))
        for doc_id, keywords in tqdm_bar:
            if keywords:
                for keyword in keywords:
                    en_translation = keyword["english_translation"]
                    ger_translation = keyword["german_translation"]
                    if en_translation is None:
                        translated = translate(ger_translation,
                                               cache,
                                               translator,
                                               timeout,
                                               dest="en")
                        keyword["english_translation"] = translated
                    if ger_translation is None:
                        translated = translate(en_translation,
                                               cache,
                                               translator,
                                               timeout,
                                               dest="de")
                        keyword["german_translation"] = translated

    try:
        for path in paths:
            logging.debug(f'loading keywords at \"{path}\"')
            with open(path, encoding='utf-8') as f:
                data = json.load(f)

            logging.debug('translating keywords ...')
            iterate_keywords(data)

            logging.debug(f'saving keywords with translations at \"{path}\"')
            with open(path, "w", encoding='utf-8') as f:
                json.dump(data, f, indent=1, ensure_ascii=True)

    except KeyboardInterrupt:
        logging.debug('process was interrupted')
    finally:
        logging.debug('saving ...')
        save_cache_to_file(cache, cache_file)
import json
import logging
import time
from typing import List, Union
from flask import Flask, render_template, request, Response

from utils import ConfigLoader, Corpus, Keyword, KeywordType, Language, KeywordTranslator, CorpusFilter
from simple_statistics import yearwise_documents

logging.basicConfig(level=logging.INFO)


def modify_path(path: str, algorithm: str):
    return path.replace('.json', f'_{algorithm}.json')
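# Usage example: modify_path('data/bundestag_corpus.json', 'rake')
# returns 'data/bundestag_corpus_rake.json'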


logging.info('importing corpora ...')
config = ConfigLoader.get_config()

corpus_data = {}
keyword_data = {}
min_year = 5000
max_year = 0

logging.info('importing corpora and keywords data ...')
start_time = time.time()

for corpus_name in config["corpora_for_viz"]:
    logging.info(corpus_name)
    with open(config["corpora_for_viz"][corpus_name]["corpus"]) as corpus_file:
        corpus_data[corpus_name] = json.load(corpus_file)
    with open(config["corpora_for_viz"][corpus_name]
              ["keywords"]) as keyword_file:
        keyword_data[corpus_name] = json.load(keyword_file)
def main():
    # load configuration parameters from config file
    config = ConfigLoader.get_config()

    # corpus = Corpus(source=config["corpora"]["abstract_corpus"], language=Language.EN, name="abstract_corpus")
    # corpus = Corpus(source=config["corpora"]["bundestag_corpus"], language=Language.DE, name="bundestag_corpus")
    corpus = Corpus(source=config["corpora"]["sustainability_corpus"],
                    language=Language.EN,
                    name="sustainability_corpus")

    # print(len(corpus))
    # test = DocumentsFilter.filter(corpus, has_tags=['test'])
    # print(set([x.tags for x in test]))
    # print(len(test))
    #
    # exit(0)

    corpus = corpus.get_n_documents_as_corpus(n=100)

    # build yearwise pseudo documents

    pseudo_corpus = corpus.year_wise_pseudo_documents()
    # extract keywords
    KeyPhraseExtractor.tfidf_skl(corpus=pseudo_corpus)
    print([d.keywords for d in pseudo_corpus.get_documents()])

    KeyPhraseExtractor.rake(corpus=corpus)
    print([d.keywords for d in corpus.get_documents()])
    # key_words_post = Document.group_keywords_year_wise(corpus)
    # key_words_pre = Document.transform_pseudo_docs_keywords_to_dict(KeyPhraseExtractor.rake(documents=pseudo_corpus))

    # print(KeyPhraseExtractor.get_top_k_keywords(key_words_post_group, 10))
    # print(KeyPhraseExtractor.get_top_k_keywords(key_words_pre_group, 10))
    # format: {year -> list of keywords}

    kwt = KeywordTranslator(cache_file=config["translator"]["cache_file"])

    counter = 0
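    # translate only the first ~100 keywords of the first document
    # (the two breaks below keep this demo run small)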
    for doc in corpus.get_documents():
        for keyword in doc.keywords:
            if counter > 100:
                break
            kwt.translate(keyword)
            print(keyword)
            counter += 1
        break

    print('extracting keywords with rake ...')
    rake_keywords = KeyPhraseExtractor.rake(corpus=corpus.get_documents()[0])
    rake_keywords_keys = list(rake_keywords.keys())
    print('rake keywords dict keys:', rake_keywords_keys)

    kwt = KeywordTranslator(cache_file=config["translator"]["cache_file"])
    list_of_keywords = []

    for k in rake_keywords[rake_keywords_keys[0]]:
        kw = Keyword(german_translation=k, keyword_type=KeywordType.RAKE)
        kwt.translate(kw)
        list_of_keywords.append(kw)
        print('{} \t {} \t\t\t {}'.format(kw.source_language,
                                          kw.english_translation,
                                          kw.german_translation))
def main():
    parser = argparse.ArgumentParser(
        description='Extracts keywords for given algorithm on given corpora')
    parser.add_argument('-a',
                        '--algorithm',
                        help='Algorithm to use, e.g. rake or tfidf_skl',
                        default="rake")
    parser.add_argument('-c',
                        '--corpora',
                        help='Corpora to annotate as list',
                        nargs='+',
                        default=['bundestag'])
    parser.add_argument('-t',
                        '--translate',
                        help='Translate keywords',
                        action='store_true')
    args = vars(parser.parse_args())

    config = ConfigLoader.get_config()

    # TODO: remove these overrides and use the actual args
    algorithm = args['algorithm']
    translate_keywords = False  # args['translate']
    chosen_corpora = args['corpora']
    assign_keywords = False
    yearwise = True

    PathMetaData = namedtuple('PathMetaData', 'path corpus_name language')
    paths_and_meta_data = [
        PathMetaData(config["corpora"]["state_of_the_union_corpus"],
                     "state_of_the_union", Language.EN),
        PathMetaData(config["corpora"]["bundestag_corpus"], "bundestag",
                     Language.DE),
        PathMetaData(config["corpora"]["abstract_corpus"], "abstract",
                     Language.EN),
        PathMetaData(config["corpora"]["sustainability_corpus"],
                     "sustainability", Language.EN),
        PathMetaData(config["corpora"]["united_nations_corpus"],
                     "united_nations", Language.EN)
    ]

    paths_and_meta_data = [
        path_meta for path_meta in paths_and_meta_data
        if path_meta.corpus_name in chosen_corpora
    ]
    if yearwise:
        KeyPhraseExtractor.top_k = 1000
        KeyPhraseExtractor.max_ngram = 3
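    # dispatch table: maps the --algorithm value to the matching
    # KeyPhraseExtractor method, e.g. extractors["yake"] resolves to
    # KeyPhraseExtractor.yake_pke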
    extractors = {
        "rake": KeyPhraseExtractor.rake,
        "tfidf_skl": KeyPhraseExtractor.tfidf_skl,
        "tfidf_pke": KeyPhraseExtractor.tfidf_pke,
        "text_rank": KeyPhraseExtractor.text_rank,
        "text_rank_pke": KeyPhraseExtractor.text_rank_pke,
        "yake": KeyPhraseExtractor.yake_pke,
        "single_rank": KeyPhraseExtractor.single_rank_pke,
        "topic_rank": KeyPhraseExtractor.topic_rank_pke,
        "topical_page_rank": KeyPhraseExtractor.topical_page_rank_pke,
        "position_rank": KeyPhraseExtractor.position_rank_pke,
        "multipartite_rank": KeyPhraseExtractor.multipartite_rank_pke
    }

    keyword_extractor = extractors[algorithm]

    print(
        f'Applied {algorithm} on {chosen_corpora} with translation={translate_keywords}'
    )

    corpora = [
        Corpus(source=path_meta.path,
               name=path_meta.corpus_name,
               language=path_meta.language)
        for path_meta in paths_and_meta_data
    ]

    if yearwise:
        corpora = [corpus.year_wise_pseudo_documents() for corpus in corpora]

    for corpus, path_meta in zip(corpora, paths_and_meta_data):
        if translate_keywords:
            kwt = KeywordTranslator(
                cache_file=config["translator"]["cache_file"])
            corpus.translate_keywords(kwt)
        keyword_extractor(corpus=corpus)
        if assign_keywords:
            new_path = str(path_meta.path).replace('.json',
                                                   f"_{algorithm}.json")
            corpus.save_corpus(new_path)
        else:
            new_path = str(path_meta.path).replace(
                '.json', f"_{algorithm}_keywords.json")
            if yearwise:
                new_path = new_path.replace('.json', '_yearwise.json')
            keyword_storage = {
                doc_id: document.keywords
                for doc_id, document in corpus.documents.items()
            }
            with open(new_path, 'w', encoding='utf-8') as f:
                json.dump(keyword_storage,
                          f,
                          ensure_ascii=False,
                          indent=1,
                          default=lambda o: o.__dict__)
            print(f'wrote file {new_path}')
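            # The written JSON is assumed to have the shape
            # {doc_id: [keyword.__dict__, ...], ...}, since Keyword objects
            # are serialized via the default=lambda o: o.__dict__ hook above.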
def test_filter():

    config = ConfigLoader.get_config(relative_path="..")

    corpora = [
        Corpus(source=config["corpora"]["abstract_corpus"], language=Language.EN, name="abstract_corpus"),
        Corpus(source=config["corpora"]["bundestag_corpus"], language=Language.DE, name="bundestag_corpus"),
        Corpus(source=config["corpora"]["sustainability_corpus"], language=Language.EN, name="sustainability_corpus")
    ]

    for corpus in corpora:
        corpus = corpus.get_n_documents_as_corpus(200)

        # test text_contains_one_of
        test_text_words = ['climate', 'klima']
        test = CorpusFilter.filter(corpus, text_contains_one_of=test_text_words)
        for t in test:
            assert any(ttw in t.text for ttw in test_text_words)

        # test date_in_range
        test_date_range = range(2015, 2016)
        test = CorpusFilter.filter(corpus, date_in_range=test_date_range)
        for t in test:
            assert t.date in test_date_range

        # test is_one_of_languages
        test_languages = ['english', 'en']
        test = CorpusFilter.filter(corpus, is_one_of_languages=test_languages)
        for t in test:
            assert t.language.lower() in test_languages

        # test is_one_of_doc_ids
        test_doc_id = '0'
        test = CorpusFilter.filter(corpus, is_one_of_doc_ids=[test_doc_id])
        for t in test:
            assert t.doc_id == test_doc_id

        # test has_authors
        test_author = 'test'
        test = CorpusFilter.filter(corpus, has_authors=[test_author])
        for t in test:
            assert test_author in t.author

        # test has_tags
        test_tags = 'test'
        test = CorpusFilter.filter(corpus, has_tags=[test_tags])
        for t in test:
            assert test_tags in t.tags

        # test is_one_of_parties
        test_parties = ["cdu", "FdP"]
        test = CorpusFilter.filter(corpus, is_one_of_parties=test_parties)
        for t in test:
            assert t.party.lower() in [x.lower() for x in test_parties]

        # test ratings_in_range
        test_rating_range = range(0, 7)
        test = CorpusFilter.filter(corpus, ratings_in_range=test_rating_range)
        for t in test:
            assert t.rating in test_rating_range

        # TODO: Test for keywords

        assert True
def main():
    config = ConfigLoader.get_config()

    corpora = [
        Corpus(source=config["corpora"]["abstract_corpus"],
               language=Language.EN,
               name="abstract_corpus"),
        Corpus(source=config["corpora"]["bundestag_corpus"],
               language=Language.DE,
               name="bundestag_corpus"),
        Corpus(source=config["corpora"]["sustainability_corpus"],
               language=Language.EN,
               name="sustainability_corpus"),
        Corpus(source=config["corpora"]["state_of_the_union_corpus"],
               language=Language.EN,
               name="state_of_the_union_corpus")
    ]

    # count_non_years(corpora[0])
    # count_non_years(corpora[1])
    # count_non_years(corpora[2])

    # Results: non-date vs usable date
    # abstract: 54135 / 261215; 1387 have a string instead of a year as the date
    # bundestag: 0 / 877973
    # sustainability: 3 / 221034

    print(document_number(corpora[0]))
    print(document_number(corpora[1]))
    print(document_number(corpora[2]))
    print(document_number(corpora[3]))

    print(token_number(corpora[0]))
    print(token_number(corpora[1]))
    print(token_number(corpora[2]))
    print(token_number(corpora[3]))

    # Results: token number
    # abstract: 59314582
    # bundestag: 226300348
    # sustainability: 52878146

    yearwise_documents(corpora[0], aggregation_func=len)
    # [1900, 1904, 1951, 1961, 1965, 1972, 1974, 1975, 1976, 1978, 1979, 1980, 1981, 1983, 1984, 1985, 1986, 1987, 1988,
    #  1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
    #  2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
    # [1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 3, 2, 1, 4, 14, 28, 47, 44, 124, 714, 962, 1080, 1143, 1513, 2104, 2341,
    # 2554, 2862, 2947, 3470, 3617, 4230, 4495, 4827, 5655, 6948, 8331, 10287, 11750, 14345, 16149, 19308, 20899,
    # 23429, 26201, 28937, 29835]
    yearwise_documents(corpora[0], aggregation_func=token_number)
    # [1900, 1904, 1951, 1961, 1965, 1972, 1974, 1975, 1976, 1978, 1979, 1980, 1981, 1983, 1984, 1985, 1986, 1987, 1988,
    #  1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
    #  2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
    # [237, 289, 26, 196, 299, 4, 2, 302, 13, 35, 163, 2, 513, 13, 3, 354, 2763, 5930, 10297, 9573, 20802, 124895,
    # 172925, 202836, 227647, 303919, 435539, 496060, 558721, 628000, 653111, 770258, 822043, 937258, 1009178, 1078762,
    # 1283970, 1593002, 1880724, 2268271, 2621783, 3192629, 3664511, 4406424, 4775594, 5367972, 6024271,
    # 6682090, 7080373]

    yearwise_documents(corpora[1], aggregation_func=len)
    # [1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967,
    #  1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986,
    #  1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
    #  2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]
    # [1540, 7359, 7846, 7492, 6252, 5534, 5794, 7532, 6738, 4469, 4446, 7027, 5950, 7756, 8704, 12078, 13355, 14542,
    #  15855, 15673, 14876, 15917, 16901, 8760, 15082, 16343, 17110, 11914, 14095, 15597, 14811, 8937, 14207, 14647,
    #  9904, 16009, 19397, 16843, 10560, 16032, 16220, 11704, 14972, 14102, 17113, 11485, 16825, 17482, 13614, 9905,
    #  15310, 14208, 14124, 10926, 12884, 14305, 7757, 14210, 13508, 14408, 10609, 16643, 17751, 16497, 11335, 15374,
    #  14794, 13705, 5829, 17021, 9469]
    yearwise_documents(corpora[1], aggregation_func=token_number)
    # [1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967,
    #  1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986,
    #  1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
    #  2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]
    # [612509, 2854944, 3061777, 3034065, 2113852, 2406060, 2380625, 2660021, 2460495, 2114953, 1715064, 2049805,
    # 1614656, 1634229, 1867580, 2135204, 2055406, 2452521, 2553521, 2575640, 2464189, 2675640, 2836025, 1644761,
    # 2665313, 3244912, 3004963, 2657335, 2751084, 2919374, 3366152, 2159773, 2722208, 3171091, 2280604, 3443955,
    # 3855233, 3566063, 2569335, 3565324, 4173720, 3067311, 3987509, 3832524, 4291976, 3145478, 4291797, 4338335,
    # 3925125, 3094547, 4464993, 4373147, 4392056, 3738766, 3946187, 4129635, 2350304, 4330315, 3983980, 4532271,
    # 3752798, 5167090, 5442241, 5468729, 3942007, 4846052, 4613129, 4046021, 1607377, 4583019, 2525648]

    yearwise_documents(corpora[2], aggregation_func=len)
    # [1986, 1987, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
    #  2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
    # [1, 1, 39, 297, 476, 572, 749, 1017, 1117, 1327, 1479, 1673, 1953, 2072, 2246, 2762, 2971, 3593, 4149, 5313, 6234,
    #  7880, 9095, 10858, 12484, 15035, 17163, 20084, 23485, 29233, 35676]
    yearwise_documents(corpora[2], aggregation_func=token_number)