def main():
    parser = argparse.ArgumentParser(
        description='Extracts keywords for given algorithm on given corpora')
    parser.add_argument('-a',
                        '--algorithm',
                        help='Algorithm to use, e.g. rake or tfidf',
                        default="rake")
    parser.add_argument('-c',
                        '--corpora',
                        help='Two corpora to operate on',
                        nargs='+',
                        default=['state_of_the_union', 'abstract'])
    parser.add_argument('-k',
                        '--top_k',
                        help='number of elements for output',
                        type=int,
                        default=100)
    args = vars(parser.parse_args())

    config = ConfigLoader.get_config()

    # TODO: remove these overrides and use the actual args
    chosen_corpora = ['sustainability', 'bundestag']  # args['corpora']
    algorithm = "rake"  # args['algorithm']
    top_k = 100  # args['top_k']
    yearwise = True

    evaluate_single(config,
                    algorithm,
                    chosen_corpora,
                    top_k,
                    use_unassigned=True,
                    yearwise=yearwise)
Example #2
def finish_simulation(simulation_setting, repetitions, logDir):
    analyzers = [rep['analyzer'] for rep in repetitions]

    cnf.save_config(simulation_setting, logDir + 'settings.yaml')

    # write graph metrics to csv
    for ind, analyzer in enumerate(analyzers):
        analyzer.write(logDir + 'graph_' + str(ind) + '.csv')

    # build mean and std over all analyzers
    metrics_mean = []
    metrics_std = []
    metrics_mean.append(analyzers[0].results['Version'])
    metrics_std.append(analyzers[0].results['Version'])
    for metric in analyzers[0].metrics:
        if metric.getMetricName() != 'Version':
            metric_combined = np.array([
                analyzer.results[metric.getMetricName()]
                for analyzer in analyzers
            ])  # a row is an analyzer
            metrics_mean.append(np.mean(metric_combined, axis=0))
            metrics_std.append(np.std(metric_combined, axis=0))

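    # write one combined CSV per statistic: a header row of metric names,
    # then one value row per version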
    for suffix, contents in zip(['mean', 'std'], [metrics_mean, metrics_std]):
        with open(logDir + 'metrics_' + suffix + '.csv', 'w',
                  newline='') as csv_file:
            combinedCsv = csv.writer(csv_file)
            combinedCsv.writerow(
                [metric.getMetricName() for metric in analyzers[0].metrics])
            for i in range(len(analyzers[0].results['Version'])):
                combinedCsv.writerow(
                    [row_contents[i] for row_contents in contents])

    mean = {
        metric.getMetricName(): metrics_mean[i]
        for i, metric in enumerate(analyzers[0].metrics)
    }
    std = {
        metric.getMetricName(): metrics_std[i]
        for i, metric in enumerate(analyzers[0].metrics)
    }

    return {
        'mean': mean,
        'std': std,
    }
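
# finish_simulation above only implies the analyzer interface. A minimal stub
# satisfying that implied interface, for illustration (hypothetical; the real
# analyzer class is not part of this listing):
import csv

class StubMetric:
    def __init__(self, name):
        self.name = name

    def getMetricName(self):
        return self.name

class StubAnalyzer:
    def __init__(self, results):
        # results maps metric name -> list of values, one entry per version
        self.results = results
        self.metrics = [StubMetric(name) for name in results]

    def write(self, path):
        # one column per metric, one row per version
        with open(path, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(list(self.results))
            writer.writerows(zip(*self.results.values()))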
Example #3
def main():
    parser = argparse.ArgumentParser(
        description='Extracts keywords for given algorithm on given corpora')
    parser.add_argument('-a',
                        '--algorithm',
                        help='Algorithm to use, e.g. rake or tfidf',
                        default="rake")
    parser.add_argument('-c',
                        '--corpora',
                        help='Corpora to annotate as list',
                        nargs='+',
                        default=['state_of_the_union'])
    parser.add_argument('-t',
                        '--translate',
                        help='Translate keywords',
                        action='store_true')
    args = vars(parser.parse_args())

    config = ConfigLoader.get_config()

    # TODO: remove these overrides and use the actual args
    chosen_corpora = [
        # 'state_of_the_union',
        'bundestag',
        'abstract',
        'sustainability'
    ]  # args['corpora']

    PathMetaData = namedtuple('PathMetaData', 'path corpus_name language')
    paths_and_meta_data = [
        PathMetaData(config["corpora"]["bundestag_corpus"], "bundestag",
                     Language.DE),
        PathMetaData(config["corpora"]["abstract_corpus"], "abstract",
                     Language.EN),
        PathMetaData(config["corpora"]["sustainability_corpus"],
                     "sustainability", Language.EN),
        PathMetaData(config["corpora"]["state_of_the_union_corpus"],
                     "state_of_the_union", Language.EN),
        PathMetaData(config["corpora"]["united_nations_corpus"],
                     "united_nations", Language.EN)
    ]
    paths_and_meta_data = [
        path_meta for path_meta in paths_and_meta_data
        if path_meta.corpus_name in chosen_corpora
    ]

    corpora = [
        Corpus(source=path_meta.path,
               name=path_meta.corpus_name,
               language=path_meta.language)
        for path_meta in paths_and_meta_data
    ]

    for corpus, path_meta in zip(corpora, paths_and_meta_data):
        corpus.save_corpus_without_text(modify_path(path_meta.path))
def main():
    # load configuration parameters from config file
    config = ConfigLoader.get_config()

    # deletes unusable documents and replaces date with year int
    # cleaning_abstracts(config, overwrite=False)
    # cleaning_sustainability(config, overwrite=False)
    # cleaning_bundestag(config, overwrite=True)
    #
    # cleaning_authors(config, overwrite=True)
    cleaning_un(config, overwrite=False)
def main():
    parser = argparse.ArgumentParser(
        description='Extracts keywords for given algorithm on given corpora')
    parser.add_argument('-c',
                        '--corpora',
                        help='Corpora to annotate as list',
                        nargs='+',
                        default=['bundestag', 'abstract'])
    parser.add_argument('-t',
                        '--translate',
                        help='Translate keywords',
                        action='store_true')
    args = vars(parser.parse_args())

    config = ConfigLoader.get_config()

    chosen_corpora = args['corpora']

    PathMetaData = namedtuple('PathMetaData', 'path corpus_name language')
    paths_and_meta_data = [
        PathMetaData(config["corpora"]["state_of_the_union_corpus"],
                     "state_of_the_union", Language.EN),
        PathMetaData(config["corpora"]["bundestag_corpus"], "bundestag",
                     Language.DE),
        PathMetaData(config["corpora"]["abstract_corpus"], "abstract",
                     Language.EN),
        PathMetaData(config["corpora"]["sustainability_corpus"],
                     "sustainability", Language.EN),
        PathMetaData(config["corpora"]["united_nations_corpus"],
                     "united_nations", Language.EN)
    ]

    paths_and_meta_data = [
        path_meta for path_meta in paths_and_meta_data
        if path_meta.corpus_name in chosen_corpora
    ]

    print(f'Building yearwise pseudo documents for {chosen_corpora}')

    corpora = [
        Corpus(source=path_meta.path,
               name=path_meta.corpus_name,
               language=path_meta.language)
        for path_meta in paths_and_meta_data
    ]

    corpora = [corpus.year_wise_pseudo_documents() for corpus in corpora]

    for corpus, path_meta in zip(corpora, paths_and_meta_data):
        corpus.save_corpus(modify_path(path_meta.path))
        corpus.save_corpus_without_text(
            modify_path(path_meta.path, without_text=True))
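
# The scripts above rely on a modify_path helper that is not part of this
# listing. A minimal sketch of what it presumably does, assuming it tags the
# .json filename with a suffix (hypothetical; the real helper may differ):
def modify_path(path: str, without_text: bool = False) -> str:
    suffix = '_yearwise_without_text' if without_text else '_yearwise'
    return path.replace('.json', f'{suffix}.json')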
def main():
    parser = argparse.ArgumentParser(
        description='Translates keywords in keyword files of given paths')
    parser.add_argument(
        '-p',
        '--paths',
        help='Paths of keyword files to translate',
        nargs='+',
        default=['data/bundestag_corpus_rake_keywords_yearwise.json'])
    args = vars(parser.parse_args())
    # -p data/bundestag_corpus_rake_keywords.json
    config = ConfigLoader.get_config()
    paths = args['paths']

    cache_file = config["translator"]["cache_file"]
    google_client_secrets_file = config["translator"][
        "google_client_secret_file"]
    translator = None
    timeout = None

    if google_client_secrets_file:
        appflow = flow.InstalledAppFlow.from_client_secrets_file(
            google_client_secrets_file,
            scopes=['https://www.googleapis.com/auth/cloud-platform'])
        appflow.run_local_server()  # launch browser
        # appflow.run_console()
        translator = g_translate.Client(credentials=appflow.credentials)

    else:  # fallback
        translator = googletrans.Translator()


    try:
        cache = load_cache_from_file(cache_file)
    except Exception as e:
        logging.warning("Loading of file failed")
        logging.warning(e)
        cache = {"de2en": {}, "en2de": {}}

    def iterate_keywords(data):
        tqdm_bar = tqdm(data.items(), total=len(data.keys()))
        for doc_id, keywords in tqdm_bar:
            if keywords:
                for keyword in keywords:
                    en_translation = keyword["english_translation"]
                    ger_translation = keyword["german_translation"]
                    if en_translation is None:
                        translated = translate(ger_translation,
                                               cache,
                                               translator,
                                               timeout,
                                               dest="en")
                        keyword["english_translation"] = translated
                    if ger_translation is None:
                        translated = translate(en_translation,
                                               cache,
                                               translator,
                                               timeout,
                                               dest="de")
                        keyword["german_translation"] = translated

    try:
        for path in paths:
            logging.debug(f'loading keywords at \"{path}\"')
            with open(path, encoding='utf-8') as f:
                data = json.load(f)

            logging.debug('translating keywords ...')
            iterate_keywords(data)

            logging.debug(f'saving keywords with translations at \"{path}\"')
            with open(path, "w", encoding='utf-8') as f:
                json.dump(data, f, indent=1, ensure_ascii=True)

    except KeyboardInterrupt:
        logging.debug('process was interrupted')
    finally:
        logging.debug('saving ...')
        save_cache_to_file(cache, cache_file)
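
# The translate helper called in iterate_keywords is not part of this listing.
# A minimal cache-aware sketch, assuming the cache maps "de2en"/"en2de"
# direction keys to {source: translation} dicts, shown for the googletrans
# fallback only (hypothetical; the real helper and its timeout handling differ):
def translate(text, cache, translator, timeout, dest="en"):
    direction = "de2en" if dest == "en" else "en2de"
    if text in cache[direction]:
        # cache hit: reuse the stored translation
        return cache[direction][text]
    # cache miss: googletrans returns an object exposing .text
    # (the Google Cloud client takes target_language= and returns a dict)
    result = translator.translate(text, dest=dest).text
    cache[direction][text] = result
    return result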
import json
import logging
import time
from typing import List, Union
from flask import Flask, render_template, request, Response

from utils import ConfigLoader, Corpus, Keyword, KeywordType, Language, KeywordTranslator, CorpusFilter
from simple_statistics import yearwise_documents

logging.basicConfig(level=logging.INFO)


def modify_path(path: str, algorithm: str):
    return path.replace('.json', f'_{algorithm}.json')


logging.info('importing corpora ...')
config = ConfigLoader.get_config()

corpus_data = {}
keyword_data = {}
min_year = 5000
max_year = 0

logging.info('importing corpora and keywords data ...')
start_time = time.time()

for corpus_name in config["corpora_for_viz"]:
    logging.info(corpus_name)
    with open(config["corpora_for_viz"][corpus_name]["corpus"]) as corpus_file:
        corpus_data[corpus_name] = json.load(corpus_file)
    with open(config["corpora_for_viz"][corpus_name]
              ["keywords"]) as keyword_file:
def main():
    # load configuration parameters from config file
    config = ConfigLoader.get_config()

    # corpus = Corpus(source=config["corpora"]["abstract_corpus"], language=Language.EN, name="abstract_corpus")
    # corpus = Corpus(source=config["corpora"]["bundestag_corpus"], language=Language.DE, name="bundestag_corpus")
    corpus = Corpus(source=config["corpora"]["sustainability_corpus"],
                    language=Language.EN,
                    name="sustainability_corpus")

    # print(len(corpus))
    # test = DocumentsFilter.filter(corpus, has_tags=['test'])
    # print(set([x.tags for x in test]))
    # print(len(test))
    #
    # exit(0)

    corpus = corpus.get_n_documents_as_corpus(n=100)

    # build yearwise pseudo documents

    pseudo_corpus = corpus.year_wise_pseudo_documents()
    # extract keywords
    KeyPhraseExtractor.tfidf_skl(corpus=pseudo_corpus)
    print([d.keywords for d in pseudo_corpus.get_documents()])

    KeyPhraseExtractor.rake(corpus=corpus)
    print([d.keywords for d in corpus.get_documents()])
    # key_words_post = Document.group_keywords_year_wise(corpus)
    # key_words_pre = Document.transform_pseudo_docs_keywords_to_dict(KeyPhraseExtractor.rake(documents=pseudo_corpus))

    # print(KeyPhraseExtractor.get_top_k_keywords(key_words_post_group, 10))
    # print(KeyPhraseExtractor.get_top_k_keywords(key_words_pre_group, 10))
    # format: {year -> list of keywords}

    kwt = KeywordTranslator(cache_file=config["translator"]["cache_file"])

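    # demo: translate at most 100 keywords, and only for the first document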
    counter = 0
    for doc in corpus.get_documents():
        for keyword in doc.keywords:
            if counter > 100:
                break
            kwt.translate(keyword)
            print(keyword)
            counter += 1
        break

    print('extracting keywords with rake ...')
    rake_keywords = KeyPhraseExtractor.rake(corpus=corpus.get_documents()[0])
    rake_keywords_keys = list(rake_keywords.keys())
    print('rake keywords dict keys:', rake_keywords_keys)

    kwt = KeywordTranslator(cache_file=config["translator"]["cache_file"])
    list_of_keywords = []

    for k in rake_keywords[rake_keywords_keys[0]]:
        kw = Keyword(german_translation=k, keyword_type=KeywordType.RAKE)
        kwt.translate(kw)
        list_of_keywords.append(kw)
        print('{} \t {} \t\t\t {}'.format(kw.source_language,
                                          kw.english_translation,
                                          kw.german_translation))
def main():
    parser = argparse.ArgumentParser(
        description='Extracts keywords for given algorithm on given corpora')
    parser.add_argument('-a',
                        '--algorithm',
                        help='Algorithm to use, e.g. rake or tfidf_skl',
                        default="rake")
    parser.add_argument('-c',
                        '--corpora',
                        help='Corpora to annotate as list',
                        nargs='+',
                        default=['bundestag'])
    parser.add_argument('-t',
                        '--translate',
                        help='Translate keywords',
                        action='store_true')
    args = vars(parser.parse_args())

    config = ConfigLoader.get_config()

    # TODO: remove these overrides and use the actual args
    algorithm = args['algorithm']
    translate_keywords = False  # args['translate']
    chosen_corpora = args['corpora']
    assign_keywords = False
    yearwise = True

    PathMetaData = namedtuple('PathMetaData', 'path corpus_name language')
    paths_and_meta_data = [
        PathMetaData(config["corpora"]["state_of_the_union_corpus"],
                     "state_of_the_union", Language.EN),
        PathMetaData(config["corpora"]["bundestag_corpus"], "bundestag",
                     Language.DE),
        PathMetaData(config["corpora"]["abstract_corpus"], "abstract",
                     Language.EN),
        PathMetaData(config["corpora"]["sustainability_corpus"],
                     "sustainability", Language.EN),
        PathMetaData(config["corpora"]["united_nations_corpus"],
                     "united_nations", Language.EN)
    ]

    paths_and_meta_data = [
        path_meta for path_meta in paths_and_meta_data
        if path_meta.corpus_name in chosen_corpora
    ]
    if yearwise:
        KeyPhraseExtractor.top_k = 1000
        KeyPhraseExtractor.max_ngram = 3
    use = {
        "rake": KeyPhraseExtractor.rake,
        "tfidf_skl": KeyPhraseExtractor.tfidf_skl,
        "tfidf_pke": KeyPhraseExtractor.tfidf_pke,
        "text_rank": KeyPhraseExtractor.text_rank,
        "text_rank_pke": KeyPhraseExtractor.text_rank_pke,
        "yake": KeyPhraseExtractor.yake_pke,
        "single_rank": KeyPhraseExtractor.single_rank_pke,
        "topic_rank": KeyPhraseExtractor.topic_rank_pke,
        "topical_page_rank": KeyPhraseExtractor.topical_page_rank_pke,
        "position_rank": KeyPhraseExtractor.position_rank_pke,
        "multipartite_rank": KeyPhraseExtractor.multipartite_rank_pke
    }

    keyword_extractor = use[algorithm]

    print(
        f'Applied {algorithm} on {chosen_corpora} with translation={translate_keywords}'
    )

    corpora = [
        Corpus(source=path_meta.path,
               name=path_meta.corpus_name,
               language=path_meta.language)
        for path_meta in paths_and_meta_data
    ]

    if yearwise:
        corpora = [corpus.year_wise_pseudo_documents() for corpus in corpora]

    for corpus, path_meta in zip(corpora, paths_and_meta_data):
        if translate_keywords:
            kwt = KeywordTranslator(
                cache_file=config["translator"]["cache_file"])
            corpus.translate_keywords(kwt)
        keyword_extractor(corpus=corpus)
        if assign_keywords:
            new_path = str(path_meta.path).replace('.json',
                                                   f"_{algorithm}.json")
            corpus.save_corpus(new_path)
        else:
            new_path = str(path_meta.path).replace(
                '.json', f"_{algorithm}_keywords.json")
            if yearwise:
                new_path = new_path.replace('.json', '_yearwise.json')
            keyword_storage = {
                doc_id: document.keywords
                for doc_id, document in corpus.documents.items()
            }
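            # Keyword objects serialize through their __dict__ (default= below)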
            with open(new_path, 'w', encoding='utf-8') as f:
                json.dump(keyword_storage,
                          f,
                          ensure_ascii=False,
                          indent=1,
                          default=lambda o: o.__dict__)
            print(f'wrote file {new_path}')
Example #10
def main():
    pool = mp.Pool()

    log.info("Loading Config.")
    settings = cnf.load_config()
    results = {}
    for simulation_setting in settings:
        simulation_dir = './experiment/' + simulation_setting['sim_name'] + '/'
        os.makedirs(simulation_dir, exist_ok=True)
        cnf.save_config(simulation_setting, simulation_dir + 'settings.yaml')

        stepConfigs = cnf.get_iteration_steps(simulation_setting)
        results[simulation_setting['sim_name']] = {
            'dir': simulation_dir,
            'steps': [],
        }
        for ind, stepConfig in enumerate(stepConfigs):
            stepDir = simulation_dir + str(ind) + '/'
            os.makedirs(stepDir, exist_ok=True)

            stepResult = pool.apply_async(run_simulation,
                                          args=(stepConfig.copy(), stepDir))

            results[simulation_setting['sim_name']]['steps'].append({
                'settings': stepConfig,
                'result': stepResult,
                'stepDir': stepDir,
            })

    pool.close()
    # monitor progress
    ready = False
    while not ready:
        total = sum([
            step['settings']['sim_repetitions'] for sim in results.values()
            for step in sim['steps']
        ])
        finished = sum([
            step['settings']['sim_repetitions'] for sim in results.values()
            for step in sim['steps'] if step['result'].ready()
        ])
        print(str(finished) + ' of ' + str(total) + ' jobs finished')
        ready = (total <= finished)
        try:
            time.sleep(1)
        except KeyboardInterrupt:
            pass

    if any(not step['result'].successful() for sim in results.values()
           for step in sim['steps']):
        log.error('an exception occurred in a simulation')

    pool.join()
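
# run_simulation is not part of this listing. Given finish_simulation above,
# it presumably runs sim_repetitions repetitions, collects one analyzer per
# repetition, and aggregates them; a hypothetical wiring sketch:
def run_simulation(step_config, step_dir):
    repetitions = [
        {'analyzer': ...}  # placeholder: one analyzer per simulation run
        for _ in range(step_config['sim_repetitions'])
    ]
    return finish_simulation(step_config, repetitions, step_dir)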
Example #11
def test_filter():

    config = ConfigLoader.get_config(relative_path="..")

    corpora = [
        Corpus(source=config["corpora"]["abstract_corpus"], language=Language.EN, name="abstract_corpus"),
        Corpus(source=config["corpora"]["bundestag_corpus"], language=Language.DE, name="bundestag_corpus"),
        Corpus(source=config["corpora"]["sustainability_corpus"], language=Language.EN, name="sustainability_corpus")
    ]

    for corpus in corpora:
        corpus = corpus.get_n_documents_as_corpus(200)

        # test text_contains
        test_text_words = ['climate', 'klima']
        test = CorpusFilter.filter(corpus, text_contains_one_of=test_text_words)
        for t in test:
            assert any(ttw in t.text for ttw in test_text_words)

        # test date_in_range
        test_date_range = range(2015,2016)
        test = CorpusFilter.filter(corpus, date_in_range=test_date_range)
        for t in test:
            assert t.date in test_date_range

        # test is_one_of_languages
        test_languages = ['english', 'en']
        test = CorpusFilter.filter(corpus, is_one_of_languages=test_languages)
        for t in test:
            assert t.language.lower() in test_languages

        # test is_one_of_doc_ids
        test_doc_id = '0'
        test = CorpusFilter.filter(corpus, is_one_of_doc_ids=[test_doc_id])
        for t in test:
            assert t.doc_id == test_doc_id

        # test has_authors
        test_author = 'test'
        test = CorpusFilter.filter(corpus, has_authors=[test_author])
        for t in test:
            assert test_author in t.author

        # test has_tags
        test_tags = 'test'
        test = CorpusFilter.filter(corpus, has_tags=[test_tags])
        for t in test:
            assert test_tags in t.tags

        # test is_one_of_parties
        test_parties = ["cdu", "FdP"]
        test = CorpusFilter.filter(corpus, is_one_of_parties=test_parties)
        for t in test:
            assert t.party.lower() in [x.lower() for x in test_parties]

        # test ratings_in_range
        test_rating_range = range(0, 7)
        test = CorpusFilter.filter(corpus, ratings_in_range=test_rating_range)
        for t in test:
            assert t.rating in test_rating_range

        # TODO: Test for keywords

def main():
    config = ConfigLoader.get_config()

    corpora = [
        Corpus(source=config["corpora"]["abstract_corpus"],
               language=Language.EN,
               name="abstract_corpus"),
        Corpus(source=config["corpora"]["bundestag_corpus"],
               language=Language.DE,
               name="bundestag_corpus"),
        Corpus(source=config["corpora"]["sustainability_corpus"],
               language=Language.EN,
               name="sustainability_corpus"),
        Corpus(source=config["corpora"]["state_of_the_union_corpus"],
               language=Language.EN,
               name="state_of_the_union_corpus")
    ]

    # count_non_years(corpora[0])
    # count_non_years(corpora[1])
    # count_non_years(corpora[2])

    # Results: non date vs useable date
    # abstract: 54135 / 261215, 1387 don't have a year as date but a string
    # bundestag: 0 / 877973
    # sustainability 3 / 221034

    print(document_number(corpora[0]))
    print(document_number(corpora[1]))
    print(document_number(corpora[2]))
    print(document_number(corpora[3]))

    print(token_number(corpora[0]))
    print(token_number(corpora[1]))
    print(token_number(corpora[2]))
    print(token_number(corpora[3]))

    # Results: token number
    # abstract: 59314582
    # bundestag: 226300348
    # sustainability: 52878146

    yearwise_documents(corpora[0], aggregation_func=len)
    # [1900, 1904, 1951, 1961, 1965, 1972, 1974, 1975, 1976, 1978, 1979, 1980, 1981, 1983, 1984, 1985, 1986, 1987, 1988,
    #  1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
    #  2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
    # [1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 3, 2, 1, 4, 14, 28, 47, 44, 124, 714, 962, 1080, 1143, 1513, 2104, 2341,
    # 2554, 2862, 2947, 3470, 3617, 4230, 4495, 4827, 5655, 6948, 8331, 10287, 11750, 14345, 16149, 19308, 20899,
    # 23429, 26201, 28937, 29835]
    yearwise_documents(corpora[0], aggregation_func=token_number)
    # [1900, 1904, 1951, 1961, 1965, 1972, 1974, 1975, 1976, 1978, 1979, 1980, 1981, 1983, 1984, 1985, 1986, 1987, 1988,
    #  1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
    #  2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
    # [237, 289, 26, 196, 299, 4, 2, 302, 13, 35, 163, 2, 513, 13, 3, 354, 2763, 5930, 10297, 9573, 20802, 124895,
    # 172925, 202836, 227647, 303919, 435539, 496060, 558721, 628000, 653111, 770258, 822043, 937258, 1009178, 1078762,
    # 1283970, 1593002, 1880724, 2268271, 2621783, 3192629, 3664511, 4406424, 4775594, 5367972, 6024271,
    # 6682090, 7080373]

    yearwise_documents(corpora[1], aggregation_func=len)
    # [1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967,
    #  1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986,
    #  1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
    #  2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]
    # [1540, 7359, 7846, 7492, 6252, 5534, 5794, 7532, 6738, 4469, 4446, 7027, 5950, 7756, 8704, 12078, 13355, 14542,
    #  15855, 15673, 14876, 15917, 16901, 8760, 15082, 16343, 17110, 11914, 14095, 15597, 14811, 8937, 14207, 14647,
    #  9904, 16009, 19397, 16843, 10560, 16032, 16220, 11704, 14972, 14102, 17113, 11485, 16825, 17482, 13614, 9905,
    #  15310, 14208, 14124, 10926, 12884, 14305, 7757, 14210, 13508, 14408, 10609, 16643, 17751, 16497, 11335, 15374,
    #  14794, 13705, 5829, 17021, 9469]
    yearwise_documents(corpora[1], aggregation_func=token_number)
    # [1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967,
    #  1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986,
    #  1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
    #  2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]
    # [612509, 2854944, 3061777, 3034065, 2113852, 2406060, 2380625, 2660021, 2460495, 2114953, 1715064, 2049805,
    # 1614656, 1634229, 1867580, 2135204, 2055406, 2452521, 2553521, 2575640, 2464189, 2675640, 2836025, 1644761,
    # 2665313, 3244912, 3004963, 2657335, 2751084, 2919374, 3366152, 2159773, 2722208, 3171091, 2280604, 3443955,
    # 3855233, 3566063, 2569335, 3565324, 4173720, 3067311, 3987509, 3832524, 4291976, 3145478, 4291797, 4338335,
    # 3925125, 3094547, 4464993, 4373147, 4392056, 3738766, 3946187, 4129635, 2350304, 4330315, 3983980, 4532271,
    # 3752798, 5167090, 5442241, 5468729, 3942007, 4846052, 4613129, 4046021, 1607377, 4583019, 2525648]

    yearwise_documents(corpora[2], aggregation_func=len)
    # [1986, 1987, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
    #  2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
    # [1, 1, 39, 297, 476, 572, 749, 1017, 1117, 1327, 1479, 1673, 1953, 2072, 2246, 2762, 2971, 3593, 4149, 5313, 6234,
    #  7880, 9095, 10858, 12484, 15035, 17163, 20084, 23485, 29233, 35676]
    yearwise_documents(corpora[2], aggregation_func=token_number)
Example #13
def main():
    # parsing the command line arguments
    parser = argparse.ArgumentParser(prog=sys.argv[0], add_help=True)
    parser.add_argument('-g', '--game', default='ody')
    parser.add_argument('-b', '--batch_id', default=0)
    parser.add_argument('-e', '--env', default='local')
    parser.add_argument('-a', '--async_sort_process', default=1)  # MMING
    parser.add_argument('-p', '--async_push', default=0)
    parser.add_argument('-s', '--sort_data', default=0)
    parser.add_argument('-f', '--process_file', default=1)
    parser.add_argument('-t', '--process_time', default=0)
    parser.add_argument('-c', '--cleanup', default=1)
    parser.add_argument('-j', '--job_id', default=-1)
    parser.add_argument('-d', '--start_ts', default=0)

    # retrieve the arguments
    args = vars(parser.parse_args(sys.argv[1:]))
    game = args['game']
    batch_id = args['batch_id']
    env = args['env']
    async_sort_process = int(args['async_sort_process'])
    async_push = int(args['async_push'])
    sort_data = int(args['sort_data'])
    process_file = int(args['process_file'])
    process_time = int(args['process_time'])
    cleanup = int(args['cleanup'])
    job_id = int(args['job_id'])
    start_ts = int(args['start_ts'])

    # start the timer for the process
    timer = MZTimer(process_time)

    # get the config
    config = ConfigLoader(game, env, 'daily_snapshot').config

    message = "Dumped eco data from game: {}\n\n".format(
        time.strftime('%H:%M:%S', time.gmtime(process_time)))
    current_ts = TimeUtil.get_current_timestamp()
    if start_ts == 0:
        start_ts = current_ts
    current_time = TimeUtil.ts2str(current_ts)
    user = config['target']['user']
    processing_dir = config['source']['processing_dir']
    processed_dir = config['source']['processed_dir']
    working_dir = config['source']['working_dir']
    not_sent_dir = config['source']['not_sent_dir']
    archive_temp_dir = config['target']['archive_tmp_dir']
    target_archive_dir = config['target']['archive_dir']
    clusters = config['target']['clusters'].split(',')
    job_ids = config['target']['job_ids'].split(',')
    default_cluster = clusters[0]
    target_temp_dir = '{}/temp_{}'.format(archive_temp_dir, job_id)
    daily_snapshot_temp_dir = construct_daily_snapshot_temp_dir_path(
        archive_temp_dir, start_ts)
    daily_snapshot_control_dir = construct_daily_snapshot_control_dir_path(
        archive_temp_dir, start_ts)

    pool = multiprocessing.Pool()

    # sanity check
    if job_id < 0:
        clean_up_source_files(working_dir)
        subject = "Invalid job_id [{} UTC]".format(current_time)
        EmailUtil.send_email(config['email']['alert'], subject, message)
        sys.exit(0)

    # sort and compress the files
    if process_file == 1:
        print('Sorting and compressing the files...')
        prefixes = config['source']['prefixes'].split(',')

        res = True
        if async_sort_process == 1:
            res = pool.map(
                partial(sort_and_compress,
                        game=game,
                        batch_id=batch_id,
                        job_id=job_id,
                        start_ts=start_ts,
                        sort_data=sort_data,
                        processing_dir=processing_dir,
                        working_dir=working_dir), prefixes)
            res = check_results(res)
        else:
            for prefix in prefixes:
                res = sort_and_compress(prefix, game, batch_id, job_id,
                                        start_ts, sort_data, processing_dir,
                                        working_dir)

        if not res:
            clean_up_source_files(working_dir)
            subject = "Error in sorting and compressing [{} UTC]".format(
                current_time)
            EmailUtil.send_email(config['email']['alert'], subject, message)
            sys.exit(0)

        timer.stop()
        message += "Sorted and Compressed files: {}\n\n".format(
            timer.sub_process_time_str)

    # send compressed files to archive server's temp
    print('Sending processed files to archive server...')
    timer.sub_start()

    files = glob(os.path.join(working_dir, '*.gz'))
    hosts = config['target']['hosts'].split(',')
    results = {}
    for host in hosts:
        # create target temp dir if it does not exist on the archive server
        subprocess.call([
            'ssh', '{}@{}'.format(user, host), 'mkdir', '-p', target_temp_dir
        ])

        if async_push == 1:
            results[host] = pool.map(
                partial(send_files,
                        temp_dir=target_temp_dir,
                        host=host,
                        user=user), files)
        else:
            results[host] = []
            for log_file in files:
                results[host].append(
                    send_files(log_file, target_temp_dir, host, user))
    timer.stop()
    message += "Pushed files to archive servers: {}\n\n".format(
        timer.sub_process_time_str)

    # move the files to aggregated (if all exit status are 0) or not_sent (otherwise)
    timer.sub_start()
    failed = False
    for (n, log_file) in enumerate(files):
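        # a file counts as sent only if every host returned exit status 0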
        exit_status = max([results[host][n] for host in results])
        if exit_status == 0:
            # successfully sent
            date = TimeUtil.get_date(current_ts)
            dest_dir = os.path.join(processed_dir, date)
            OSUtil.mkdir(dest_dir)
            shutil.move(log_file, dest_dir)
        else:
            # send failed; move working to not_sent directory
            failed = True
            failed_hosts = [host for host in results if results[host][n] != 0]
            for i, host in enumerate(failed_hosts):
                host_not_sent_dir = os.path.join(not_sent_dir, host)
                OSUtil.mkdir(host_not_sent_dir)
                if i == len(failed_hosts) - 1:
                    # move it
                    shutil.move(log_file, host_not_sent_dir)
                else:
                    # copy it
                    shutil.copy(log_file, host_not_sent_dir)

    if cleanup == 1:
        clean_up_source_files(processing_dir)

    if failed:
        subject = "[{}-ds] Error sending files to archive server. [{} UTC]".format(
            game, TimeUtil.get_current_time())
        EmailUtil.send_email(config['email']['alert'], subject, message)
        sys.exit(0)

    # move all the files to the remote archive dir
    print "Moving files to final temp direcoty on archive servers..."
    timer.sub_start()
    for host in hosts:
        user_host = '{}@{}'.format(user, host)
        # create temp and control dirs if they do not exist
        subprocess.call(
            ['ssh', user_host, 'mkdir', '-p', daily_snapshot_temp_dir])
        subprocess.call(
            ['ssh', user_host, 'mkdir', '-p', daily_snapshot_control_dir])

        src = os.path.join(target_temp_dir, '*')
        dest = daily_snapshot_temp_dir + '/'
        print('ssh', user_host, 'mv', src, dest)
        subprocess.call(['ssh', user_host, 'mv', src, dest])

        # mark single job success
        success_log_file_path = '{}/{}'.format(
            daily_snapshot_control_dir,
            construct_success_log_file_name(job_id))
        print(success_log_file_path)
        subprocess.call([
            'ssh', user_host, 'echo ' + str(TimeUtil.get_current_timestamp()) +
            ' > ' + success_log_file_path
        ])

    timer.stop()
    message += "Moved files to final temp dir: {}\n\n".format(
        timer.sub_process_time_str)

    # move the log files from the final temp to final destinations
    last_job = False
    for host in hosts:
        user_host = '{}@{}'.format(user, host)
        if are_all_jobs_completed(host, user, daily_snapshot_control_dir,
                                  job_ids):
            last_job = True
            timer.sub_start()
            # move files from the final temp to default cluster
            src = os.path.join(daily_snapshot_temp_dir, '*')
            default_cluster_temp_dir = construct_cluster_temp_dir(
                archive_temp_dir, default_cluster)
            subprocess.call([
                'ssh', '{}@{}'.format(user, host), 'mkdir', '-p',
                default_cluster_temp_dir
            ])
            print('ssh', user_host, 'mv', src, default_cluster_temp_dir)
            subprocess.call(
                ['ssh', user_host, 'mv', src, default_cluster_temp_dir])

            # copy files from the default cluster temp to other cluster temps
            for cluster in clusters:
                if cluster != default_cluster:
                    cluster_temp_dir = construct_cluster_temp_dir(
                        archive_temp_dir, cluster)
                    subprocess.call([
                        'ssh', '{}@{}'.format(user, host), 'mkdir', '-p',
                        cluster_temp_dir
                    ])

                    # copy files from first temp directory to others
                    src = os.path.join(default_cluster_temp_dir, '*')
                    print('ssh', user_host, 'cp', src, cluster_temp_dir)
                    subprocess.call(
                        ['ssh', user_host, 'cp', src, cluster_temp_dir])

            # move files from each cluster temp to the cluster final destination
            for cluster in clusters:
                cluster_target_temp_dir = construct_cluster_temp_dir(
                    archive_temp_dir, cluster)
                src = os.path.join(cluster_target_temp_dir, '*')
                cluster_target_archive_dir = target_archive_dir.format(
                    cluster=cluster)
                dest = cluster_target_archive_dir + '/'
                print('ssh', user_host, 'mv', src, dest)
                subprocess.call(['ssh', user_host, 'mv', src, dest])

            # clean up the success log
            subprocess.call([
                'ssh', user_host,
                'rm -rf {}/*'.format(daily_snapshot_control_dir)
            ])
            timer.stop()
            message += "Moved files to final destinations on {}: {}\n\n".format(
                host, timer.sub_process_time_str)

    message += "The whole process ran in {}.\n\n".format(
        timer.process_time_str)

    # send email out
    subject = "[{}] Successfully Sending Daily Snapshot Data. Job ID: {} [{} UTC]".format(
        game, job_id, TimeUtil.get_current_time())
    if last_job:
        recipients = config['email']['success']
    else:
        recipients = config['email']['sub_success']
    EmailUtil.send_email(recipients, subject, message)
    sys.exit(0)