示例#1
0
def main():
    """Command-line entry point: analyse corpora with an omorfi FSA and
    collect lemma / proper-noun statistics.

    Parses -f/--fsa, -i/--input and -m/--master from argv, streams each
    matching corpus file line by line, analyses every whitespace token and
    writes frequency statistics to proper_contexts_full.log and
    lemmas.freqs.  Exits the process with status 0.
    """
    global total_token_count, sent
    a = ArgumentParser()
    a.add_argument(
        '-f', '--fsa', metavar='FSAFILE', required=True,
        help="HFST's optimised lookup binary data for the transducer to be applied")
    a.add_argument(
        '-i', '--input', metavar="INFILE", type=str, required=True,
        dest="infile", help="source of analysis data")
    a.add_argument(
        '-m', '--master', metavar="TSVFILE", type=str, required=True,
        dest="tsvfile", help="source of existing lexical data")
    opts = a.parse_args()
    if opts.infile:
        test_corpora_files = glob(opts.infile)
    else:
        test_corpora_files = glob("*.text")
    # hard-coded log targets for now
    proper_stats = open('proper_contexts_full.log', 'w')
    lemma_stats = open('lemmas.freqs', 'w')
    omorfi = Omorfi()
    omorfi.load_filename(opts.fsa)
    # close the master TSV once the lemmas are gathered (handle was
    # previously leaked)
    with open(opts.tsvfile) as master_tsv:
        gather_lemmas(master_tsv)
    test_corpora = list()
    for test_corpus_file in test_corpora_files:
        try:
            test_corpora.append(open(test_corpus_file))
        except IOError as ioe:
            print("Failed to open corpus ", test_corpus_file, ":", ioe)
    # Pad punctuation with spaces in one C-level pass instead of eight
    # chained str.replace() calls per line; the table is loop-invariant.
    punct_pad = str.maketrans({c: " " + c + " " for c in ".,:;?!()"})
    for test_corpus in test_corpora:
        print('lines from', test_corpus)
        linen = 0
        for line in test_corpus:
            linen += 1
            if (linen % 500000) == 0:
                print(
                    linen, "...! Time to reload everything because memory is leaking very badly indeed!")
                # reset the global sentence buffer and rebuild the FSA
                sent = list()
                omorfi = None
                omorfi = Omorfi()
                omorfi.load_filename(opts.fsa)
                gc.collect()

            if (linen % 1000) == 0:
                print(linen, "...", end='\r')
            line = line.translate(punct_pad)
            for token in line.split():
                total_token_count += 1
                analyses = omorfi.analyse(token)
                add_to_sent(analyses, token)
                stat_word_ids(token, analyses)
        test_corpus.close()  # handle was previously leaked
    print("Testing statistics")
    print("Writing accurate statistics")
    print_proper_stats(proper_stats)
    print_lemma_stats(lemma_stats)
    proper_stats.close()
    lemma_stats.close()
    exit(0)
示例#2
0
def main():
    """Command-line entry point: analyse corpora with an omorfi FSA and
    collect lemma / proper-noun statistics.

    Parses -f/--fsa, -i/--input and -m/--master from argv, streams each
    matching corpus file line by line, analyses every whitespace token and
    writes frequency statistics to proper_contexts_full.log and
    lemmas.freqs.  Exits the process with status 0.
    """
    # BUG FIX: 'sent' must be declared global here — the periodic reload
    # branch below does 'sent = list()' to reset the shared sentence
    # buffer, and without the declaration that assignment only created a
    # dead local and the global buffer was never cleared.
    global total_token_count, sent
    a = ArgumentParser()
    a.add_argument(
        '-f',
        '--fsa',
        metavar='FSAFILE',
        required=True,
        help=
        "HFST's optimised lookup binary data for the transducer to be applied")
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=str,
                   required=True,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-m',
                   '--master',
                   metavar="TSVFILE",
                   type=str,
                   required=True,
                   dest="tsvfile",
                   help="source of existing lexical data")
    opts = a.parse_args()
    if opts.infile:
        test_corpora_files = glob(opts.infile)
    else:
        test_corpora_files = glob("*.text")
    # hard-coded log targets for now
    proper_stats = open('proper_contexts_full.log', 'w')
    lemma_stats = open('lemmas.freqs', 'w')
    omorfi = Omorfi()
    omorfi.load_filename(opts.fsa)
    # close the master TSV once the lemmas are gathered (handle was
    # previously leaked)
    with open(opts.tsvfile) as master_tsv:
        gather_lemmas(master_tsv)
    test_corpora = list()
    for test_corpus_file in test_corpora_files:
        try:
            test_corpora.append(open(test_corpus_file))
        except IOError as ioe:
            print("Failed to open corpus ", test_corpus_file, ":", ioe)
    # Pad punctuation with spaces in one C-level pass instead of eight
    # chained str.replace() calls per line; the table is loop-invariant.
    punct_pad = str.maketrans({c: " " + c + " " for c in ".,:;?!()"})
    for test_corpus in test_corpora:
        print('lines from', test_corpus)
        linen = 0
        for line in test_corpus:
            linen += 1
            if (linen % 500000) == 0:
                print(
                    linen,
                    "...! Time to reload everything because memory is leaking very badly indeed!"
                )
                # reset the global sentence buffer and rebuild the FSA;
                # the dead local 'previous = list()' was dropped
                sent = list()
                omorfi = None
                omorfi = Omorfi()
                omorfi.load_filename(opts.fsa)
                gc.collect()

            if (linen % 1000) == 0:
                print(linen, "...", end='\r')
            line = line.translate(punct_pad)
            for token in line.split():
                total_token_count += 1
                analyses = omorfi.analyse(token)
                add_to_sent(analyses, token)
                stat_word_ids(token, analyses)
        test_corpus.close()  # handle was previously leaked
    print("Testing statistics")
    print("Writing accurate statistics")
    print_proper_stats(proper_stats)
    print_lemma_stats(lemma_stats)
    proper_stats.close()
    lemma_stats.close()
    exit(0)