import gc
from argparse import ArgumentParser
from glob import glob
from sys import exit

# Assumed import path for omorfi's Python API; adjust to the installed
# package layout if it differs.
from omorfi.omorfi import Omorfi


def main():
    global total_token_count, sent
    a = ArgumentParser()
    a.add_argument(
        '-f', '--fsa', metavar='FSAFILE', required=True,
        help="HFST's optimised lookup binary data for the transducer to "
             "be applied")
    a.add_argument(
        '-i', '--input', metavar="INFILE", type=str, required=True,
        dest="infile", help="source of analysis data")
    a.add_argument(
        '-m', '--master', metavar="TSVFILE", type=str, required=True,
        dest="tsvfile", help="source of existing lexical data")
    opts = a.parse_args()
    if opts.infile:
        test_corpora_files = glob(opts.infile)
    else:
        test_corpora_files = glob("*.text")
    # hard-coded logs for now
    # lemma_log = open('missing_word_ids.log', 'w')
    # case_log = open('missing_nominal_cases.log', 'w')
    # comp_log = open('missing_comparatives.log', 'w')
    # adposition_log = open('adposition_complements.log', 'w')
    # adposition_stats = open('adposition_complements_full.log', 'w')
    # adjective_log = open('adjective_agreements.log', 'w')
    proper_stats = open('proper_contexts_full.log', 'w')
    # open('../src/probabilistics/lemmas.freqs', 'w')
    lemma_stats = open('lemmas.freqs', 'w')
    # case_stats = open('../src/probabilistics/cases.freqs', 'w')
    omorfi = Omorfi()
    omorfi.load_filename(opts.fsa)
    gather_lemmas(open(opts.tsvfile))
    test_corpora = list()
    for test_corpus_file in test_corpora_files:
        try:
            test_corpora.append(open(test_corpus_file))
        except IOError as ioe:
            print("Failed to open corpus", test_corpus_file, ":", ioe)
    for test_corpus in test_corpora:
        print('lines from', test_corpus.name)
        linen = 0
        for line in test_corpus:
            linen += 1
            if (linen % 500000) == 0:
                print(linen, "...! Time to reload everything because"
                      " memory is leaking very badly indeed!")
                # Work around the leak: reset the sentence buffer, drop
                # the old automaton, reload the transducer and force a
                # garbage-collection pass.
                sent = list()
                omorfi = None
                omorfi = Omorfi()
                omorfi.load_filename(opts.fsa)
                gc.collect()
            if (linen % 1000) == 0:
                print(linen, "...", end='\r')
            # Naive tokenisation: pad punctuation with spaces, then split
            # on whitespace.
            for punct in ".,:;?!()":
                line = line.replace(punct, " " + punct + " ")
            for token in line.split():
                total_token_count += 1
                analyses = omorfi.analyse(token)
                add_to_sent(analyses, token)
                stat_word_ids(token, analyses)
                # stat_nominal_cases(token, analyses, case_log)
                # stat_adjective_comps(token, analyses, comp_log)
    print("Testing statistics")
    # test_zero_lemmas(lemma_log)
    # test_zero_cases(case_log)
    # test_zero_comps(comp_log)
    # test_case_deviations()
    # test_adposition_complements(adposition_log)
    # test_adjective_agreements(adjective_log)
    print("Writing accurate statistics")
    # print_adposition_stats(adposition_stats)
    print_proper_stats(proper_stats)
    print_lemma_stats(lemma_stats)
    # print_case_stats(case_stats)
    exit(0)
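
# A minimal usage sketch, assuming this module is saved as
# lemma_statistics.py (a hypothetical name) and that the optimised lookup
# automaton and master TSV lexicon exist at the given paths; the input
# pattern is quoted so the script's own glob() expands it:
#
#   python lemma_statistics.py -f omorfi.hfst.ol \
#       -i 'corpus-*.text' -m lexemes.tsv
#
# The conventional entry-point guard keeps the module importable as well
# as runnable as a script.
if __name__ == '__main__':
    main()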