def build_and_eval():
    utility.make_dir(OUTPUT_DIR)

    print('Building lexicon')
    poli_lexicon = list(
        lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE))
    sost_lexicon = list(
        lexicon.loadLexiconFromFile(
            corpora.DIZIONARIO_SOSTANTIVI_AUGMENTED_PAISA_FILE))
    agg_lexicon = list(
        lexicon.loadLexiconFromFile(
            corpora.DIZIONARIO_AGGETTIVI_AUGMENTED_PAISA_FILE))
    lex_set = set(poli_lexicon + sost_lexicon + agg_lexicon)
    lex_solution_set = set(sost_lexicon + agg_lexicon)
    lexicon.printLexiconToFile(lex_set, LEX_FREQ_FILE)
    lexicon.printLexiconToFile(lex_solution_set, SOLUTION_LEX_FREQ_FILE)

    print('Computing coverage')
    scorer.computeCoverageOfGameWordLex(lex_set, lex_solution_set,
                                        corpora.GAME_SET_100_FILE,
                                        COVERAGE_WORD_GAME100_FILE)

    print('Building association matrix')
    matrix = matrix_dict.Matrix_Dict(lex_set, lex_solution_set)
    matrix.add_patterns_from_corpus(corpora.PAISA_RAW_INFO)
    matrix.compute_association_scores()
    matrix.write_matrix_to_file(MATRIX_FILE)

    print('Eval')
    scorer.evaluate_kbest_MeanReciprocalRank(matrix, corpora.GAME_SET_100_FILE,
                                             EVAL_WORD_GAME100_FILE)
    scorer.evaluate_kbest_MeanReciprocalRank(matrix,
                                             corpora.NLP4FUN_DEV_TSV_v2_tv_FILE,
                                             EVAL_NLP4FUN_DEV_TV_FILE)
    scorer.evaluate_kbest_MeanReciprocalRank(matrix,
                                             corpora.NLP4FUN_DEV_TSV_v2_bg_FILE,
                                             EVAL_NLP4FUN_DEV_BG_FILE)
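# The evaluate_kbest_MeanReciprocalRank calls above score the matrix against a
# game set. As a reference for what Mean Reciprocal Rank measures, here is a
# minimal, self-contained sketch (plain MRR over ranked candidate lists; the
# names ranked_candidates and gold are illustrative, not the project API):

def mean_reciprocal_rank_sketch(ranked_candidates, gold):
    # ranked_candidates: one best-first candidate list per game instance.
    # gold: the correct solutions, aligned with ranked_candidates.
    total = 0.0
    for candidates, solution in zip(ranked_candidates, gold):
        if solution in candidates:
            total += 1.0 / (candidates.index(solution) + 1)  # rank is 1-based
    return total / len(gold)

# Example: solutions ranked 1st, 3rd, and missing -> (1 + 1/3 + 0) / 3 = 0.444
# mean_reciprocal_rank_sketch([['a', 'b'], ['x', 'y', 'b'], ['z']],
#                             ['a', 'b', 'b'])  # -> 0.444...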
def build_and_eval():
    utility.make_dir(OUTPUT_DIR)

    print('Building lexicon')
    poli_lexicon = list(
        lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE))
    sost_lexicon = list(
        lexicon.loadLexiconFromFile(
            corpora.DIZIONARIO_SOSTANTIVI_AUGMENTED_PAISA_FILE))
    agg_lexicon = list(
        lexicon.loadLexiconFromFile(
            corpora.DIZIONARIO_AGGETTIVI_AUGMENTED_PAISA_FILE))
    lex_set = set(poli_lexicon + sost_lexicon + agg_lexicon)
    lex_solution_set = set(sost_lexicon + agg_lexicon)
    '''
    # Disabled alternative: build the lexicon directly from PAISA frequency lists.
    poli_lexicon = list(lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE))
    sost_lexicon = list(corpora.getSostantiviSetFromPaisa(min_freq=1000, inflected=True))
    print('\nSize of sostantivi lex: {}'.format(len(sost_lexicon)))
    agg_lexicon = list(corpora.getAggettiviSetFromPaisa(min_freq=1000, inflected=True))
    print('\nSize of agg lex: {}'.format(len(agg_lexicon)))
    lex_set = set(poli_lexicon + sost_lexicon + agg_lexicon)
    '''
    lexicon.printLexiconToFile(lex_set, LEX_FREQ_FILE)
    lexicon.printLexiconToFile(lex_solution_set, SOLUTION_LEX_FREQ_FILE)

    print('Computing lex coverage')
    scorer.computeCoverageOfGameWordLex(lex_set, lex_solution_set,
                                        corpora.GAME_SET_100_FILE,
                                        COVERAGE_WORD_GAME100_FILE)

    print('Building association matrix')
    matrix = Matrix_Dict(lex_set, lex_solution_set)
    matrix.add_patterns_from_corpus(corpora.PAISA_RAW_INFO)
    matrix.add_patterns_from_corpus(corpora.DE_MAURO_POLIREMATICHE_INFO,
                                    weight=DE_MAURO_WEIGHT)
    matrix.add_patterns_from_corpus(corpora.PROVERBI_INFO,
                                    weight=PROVERBI_WEIGHT)
    matrix.add_patterns_from_corpus(corpora.ITWAC_RAW_INFO, weight=1)
    matrix.add_patterns_from_corpus(corpora.WIKI_IT_TITLES_INFO,
                                    weight=WIKI_IT_WEIGHT)
    # matrix.add_patterns_from_corpus(corpora.WIKI_IT_TEXT_INFO, weight=1)
    corpora.addBigramFromPolirematicheInMatrix(matrix, DE_MAURO_WEIGHT)
    corpora.addBigramFromCompunds(matrix, lex_set, min_len=4,
                                  weight=COMPOUNDS_WEIGHT)
    matrix.compute_association_scores()
    matrix.write_matrix_to_file(MATRIX_FILE)

    print('Eval')
    scorer.evaluate_kbest_MeanReciprocalRank(matrix, corpora.GAME_SET_100_FILE,
                                             EVAL_WORD_GAME100_FILE)
def build_and_eval():
    utility.make_dir(OUTPUT_DIR)

    print('\nBuilding lexicon')
    lex_set = lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE)
    lex_solution_set = lex_set
    '''
    poli_lexicon = list(lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE))
    sost_lexicon = list(corpora.getSostantiviSetFromPaisa(min_freq=100, inflected=True))
    print('\nSize of sostantivi lex: {}'.format(len(sost_lexicon)))
    agg_lexicon = list(corpora.getAggettiviSetFromPaisa(min_freq=100, inflected=True))
    print('\nSize of agg lex: {}'.format(len(agg_lexicon)))
    lex_set = set(poli_lexicon + sost_lexicon + agg_lexicon)
    lex_solution_set = set(sost_lexicon + agg_lexicon)
    # lex_solution_set = lex_set
    '''

    print('\nComputing lex coverage')
    scorer.computeCoverageOfGameWordLex(lex_set, lex_solution_set,
                                        corpora.GAME_SET_100_FILE,
                                        COVERAGE_WORD_GAME100_FILE)

    print('\nBuilding association matrix')
    matrix = Matrix(lex_set, lex_solution_set)
    matrix.add_patterns_from_corpus(corpora.DE_MAURO_POLIREMATICHE_INFO)
    corpora.addBigramFromPolirematicheInMatrix(matrix, weight=1)
    # corpora.addBigramFromCompunds(matrix, lex_set, min_len=4, weight=10)
    matrix.compute_association_scores()
    matrix.write_matrix_to_file(MATRIX_FILE)

    print('\nEval')
    scorer.evaluate_kbest_MeanReciprocalRank(matrix, corpora.GAME_SET_100_FILE,
                                             EVAL_WORD_GAME100_FILE)
def build():
    import sys

    utility.make_dir(OUTPUT_DIR)

    print('Building lexicon')
    poli_lexicon = list(
        lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE))
    sost_lexicon = list(
        lexicon.loadLexiconFromFile(
            corpora.DIZIONARIO_SOSTANTIVI_AUGMENTED_PAISA_FILE))
    agg_lexicon = list(
        lexicon.loadLexiconFromFile(
            corpora.DIZIONARIO_AGGETTIVI_AUGMENTED_PAISA_FILE))
    lex_set = set(poli_lexicon + sost_lexicon + agg_lexicon)
    lex_solution_set = set(sost_lexicon + agg_lexicon)
    lexicon.printLexiconToFile(lex_set, LEX_FREQ_FILE)
    lexicon.printLexiconToFile(lex_solution_set, SOLUTION_LEX_FREQ_FILE)

    def add_patterns_from_corpus(corpus_info):
        lines_extractor = corpora.extract_lines(corpus_info)
        source = corpus_info['name']
        patterns_count = 0
        print("Adding patterns from source: {}".format(source))
        tot_lines = corpus_info['lines']
        for n, line in enumerate(lines_extractor, 1):
            patterns_count += patterns_extraction.addPatternsFromLineInMongo(
                line, lex_set, source)
            if n % 1000 == 0:
                # Overwrite the same console line with the progress percentage.
                sys.stdout.write("Progress: {0:.1f}%\r".format(
                    float(n) * 100 / tot_lines))
                sys.stdout.flush()
        print('Extracted patterns: {}'.format(patterns_count))

    # print('Computing lex coverage')
    # scorer.computeCoverageOfGameWordLex(lex_set, lex_solution_set,
    #                                     corpora.GAME_SET_100_FILE,
    #                                     COVERAGE_WORD_GAME100_FILE)

    print('Adding patterns in db')
    add_patterns_from_corpus(corpora.DE_MAURO_POLIREMATICHE_INFO)
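# addPatternsFromLineInMongo is the project's extractor and its internals are
# not shown here. As a hedged sketch of what pattern extraction of this kind
# typically does, the hypothetical version below scans a line for two lexicon
# words X and Y and keeps the middle context between them (regex tokenization,
# one middle token; not the actual storage schema or signature):

import re

def extract_patterns_sketch(line, lex_set):
    tokens = re.findall(r"\w+", line.lower())
    patterns = []
    for i in range(len(tokens) - 2):
        x, mid, y = tokens[i], tokens[i + 1], tokens[i + 2]
        if x in lex_set and y in lex_set:
            patterns.append((x, mid, y))
    return patterns

# extract_patterns_sketch('il gatto nero dorme', {'gatto', 'dorme'})
# -> [('gatto', 'nero', 'dorme')]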
def build_and_eval():
    utility.make_dir(OUTPUT_DIR)

    print('Building lexicon')
    poli_lexicon = list(
        lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE))
    sost_lexicon = list(
        lexicon.loadLexiconFromFile(
            corpora.DIZIONARIO_SOSTANTIVI_AUGMENTED_PAISA_FILE))
    agg_lexicon = list(
        lexicon.loadLexiconFromFile(
            corpora.DIZIONARIO_AGGETTIVI_AUGMENTED_PAISA_FILE))
    lex_set = set(poli_lexicon + sost_lexicon + agg_lexicon)
    lexicon_freq = {w: 1 for w in lex_set}
    lex_solution_set = set(sost_lexicon + agg_lexicon)
    print('Lex size: {}'.format(len(lex_set)))
    print('Solution Lex size: {}'.format(len(lex_solution_set)))
    lexicon.printLexFreqToFile(lexicon_freq, LEX_FREQ_FILE)
    # solution_lexicon_freq = {w: 1 for w in lex_solution_set}

    print('Computing lex_set coverage')
    scorer.computeCoverageOfGameWordLex(lex_set, lex_solution_set,
                                        corpora.GAME_SET_100_FILE,
                                        COVERAGE_WORD_GAME100_FILE)

    print('Building association matrix')
    matrix = matrix_dict.Matrix_Dict(lex_set, lex_solution_set)
    matrix.add_patterns_from_corpus(corpora.PAISA_RAW_INFO)
    matrix.add_patterns_from_corpus(corpora.DE_MAURO_POLIREMATICHE_INFO,
                                    weight=DE_MAURO_WEIGHT)
    corpora.addBigramFromPolirematicheInMatrix(matrix, DE_MAURO_WEIGHT)
    matrix.compute_association_scores()
    matrix.write_matrix_to_file(MATRIX_FILE)

    print('Eval')
    scorer.evaluate_kbest_MeanReciprocalRank(matrix, corpora.GAME_SET_100_FILE,
                                             EVAL_WORD_GAME100_FILE)
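# compute_association_scores lives in matrix_dict and its exact formula is not
# shown here. A common choice for turning co-occurrence counts into association
# scores is pointwise mutual information (PMI); the sketch below illustrates
# that idea only, under the assumption of a simple (w1, w2) -> count mapping,
# and is not the project's actual implementation:

import math
from collections import defaultdict

def pmi_scores_sketch(pair_counts):
    # pair_counts: dict mapping (w1, w2) -> co-occurrence count.
    total = sum(pair_counts.values())
    w1_counts, w2_counts = defaultdict(int), defaultdict(int)
    for (w1, w2), c in pair_counts.items():
        w1_counts[w1] += c
        w2_counts[w2] += c
    # PMI(w1, w2) = log2( p(w1, w2) / (p(w1) * p(w2)) )
    return {
        (w1, w2): math.log2((c / total) /
                            ((w1_counts[w1] / total) * (w2_counts[w2] / total)))
        for (w1, w2), c in pair_counts.items()
    }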
def buildDizSostantiviAugmentedPaisaInflected():
    import lexicon

    lemma_inflections_dict = getLemmasInflectionsDict()
    # Base/inflected file pairs for both nouns (sostantivi) and adjectives.
    diz_base_inflected = [
        [DIZIONARIO_SOSTANTIVI_AUGMENTED_PAISA_FILE,
         DIZIONARIO_SOSTANTIVI_AUGMENTED_PAISA_INFLECTED_FILE],
        [DIZIONARIO_AGGETTIVI_AUGMENTED_PAISA_FILE,
         DIZIONARIO_AGGETTIVI_AUGMENTED_PAISA_INFLECTED_FILE],
    ]
    for diz_base_file, diz_inflected_file in diz_base_inflected:
        word_set = lexicon.loadLexiconFromFile(diz_base_file)  # returns a set
        inflected_words = set()
        for lemma in word_set:
            if lemma in lemma_inflections_dict:
                inflected_words.update(lemma_inflections_dict[lemma])
        # Write the base lemmas together with all their known inflections.
        word_set.update(inflected_words)
        lexicon.printLexiconToFile(word_set, diz_inflected_file)
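# getLemmasInflectionsDict is assumed here to map a lemma to the set of its
# inflected forms, e.g. (illustrative Italian noun, not real project data):
#
#   {'gatto': {'gatto', 'gatti', 'gatta', 'gatte'}, ...}
#
# so the loop above expands every dictionary lemma with all known inflections
# before the inflected dictionary is written out.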
def builDizAugmentedPaisa(lexPosFreqFile, lexPosBaseFile, min_freq,
                          output_file):
    import lexicon

    vowels = set('aeiou')
    output_file_log = output_file + '_log'
    paisa_pos_lex_freq = lexicon.loadLexFreqFromFile(lexPosFreqFile)
    diz_base = lexicon.loadLexiconFromFile(lexPosBaseFile)

    # Stems (word minus its final vowel) of all base-dictionary entries.
    diz_sostantivi_prefix = set()
    for w in diz_base:
        if len(w) > 1 and w[-1] in vowels:
            diz_sostantivi_prefix.add(w[:-1])

    with open(output_file_log, 'w') as f_out:
        for w, f in sorted(paisa_pos_lex_freq.items(), key=lambda x: -x[1]):
            # Add a frequent PAISA word if it is a new vowel-final variant of
            # a known stem (e.g. a different gender or number ending).
            if (f >= min_freq and len(w) > 1 and w not in diz_base
                    and w[:-1] in diz_sostantivi_prefix and w[-1] in vowels):
                diz_base.add(w)
                origin = next(o for o in diz_base
                              if o[:-1] == w[:-1] and o != w
                              and len(o) == len(w) and o[-1] in vowels)
                f_out.write('{}->{}\n'.format(origin, w))
    lexicon.printLexiconToFile(diz_base, output_file)
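# Worked example of the augmentation rule above, on hypothetical data: if the
# base dictionary contains 'gatto', its stem 'gatt' enters
# diz_sostantivi_prefix; a frequent PAISA word 'gatta' (f >= min_freq, final
# vowel, same stem, not yet in the dictionary) is then added to diz_base, and
# the log file records the pair:
#
#   gatto->gatta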