def compute_correlation_score_match(matrix, game_set_file, output_file_clues_matched, output_file_solutions_matched):
    """Correlate association scores with solver success on a game set.

    For each game (5 clues + gold solution) the best candidate solution
    is computed.  Its summed score and matched-clue count go to
    `output_file_clues_matched` (one row per game); the summed scores,
    split into guessed vs. missed columns, go to
    `output_file_solutions_matched`.
    """
    import corpora
    import lexicon

    # Score assigned to clue/solution pairs absent from the matrix.
    unfound_pair_score = matrix.get_min_association_score()
    nouns_lex_freq_dict = lexicon.loadLexFreqFromFile(
        corpora.PAISA_SOSTANTIVI_FREQ_FILE)
    # Nouns ordered by descending corpus frequency; the most frequent one
    # serves as a fallback solution when the solver returns nothing.
    most_freq_nouns = [
        item[0]
        for item in sorted(nouns_lex_freq_dict.items(), key=lambda x: -x[1])
    ]
    game_set = read_game_set_tab(game_set_file)
    output_lines_clues_matched = ['\t'.join(['Scores', 'Matched'])]
    scores_guessed = []  # summed scores of games solved correctly
    scores_missed = []   # summed scores of games solved incorrectly
    for game_words in game_set:
        clues = game_words[:5]
        gold_solution = game_words[5]
        result = getBestWordAssociationGroups(
            matrix, clues, unfound_pair_score, nBest=100)
        if len(result) == 0:
            # No candidate found: fall back to the most frequent noun.
            best_solution = most_freq_nouns[0]
            clues_matched_count = 0
            scores_sum = unfound_pair_score * 5
        else:
            best_result = result[0]
            best_solution = best_result['ranked_solution']
            clues_matched_count = best_result['clues_matched_count']
            scores_sum = best_result['scores_sum']
        if best_solution == gold_solution:
            scores_guessed.append(scores_sum)
        else:
            scores_missed.append(scores_sum)
        output_lines_clues_matched.append('{}\t{}'.format(
            scores_sum, clues_matched_count))
    with open(output_file_clues_matched, 'w') as f_out:
        print_write(f_out, '\n'.join(output_lines_clues_matched))
    with open(output_file_solutions_matched, 'w') as f_out:
        print_write(f_out,
                    '\t'.join(['Scores Guessed', 'Scores Missed']) + '\n')
        # Emit the two score lists side by side, one value per column,
        # padding the shorter column with blanks.
        max_lines = max(len(scores_guessed), len(scores_missed))
        for i in range(max_lines):
            if i < len(scores_guessed):
                print_write(f_out, str(scores_guessed[i]))
            if i < len(scores_missed):
                print_write(f_out, '\t' + str(scores_missed[i]))
            print_write(f_out, '\n')
def getAggettiviSetFromPaisa(min_freq, inflected):
    """Return the set of Paisà adjectives with frequency >= min_freq.

    When `inflected` is true, the set is expanded with the known
    inflected forms of each frequent adjective.
    """
    import lexicon
    freq_by_adjective = lexicon.loadLexFreqFromFile(PAISA_AGGETTIVI_FREQ_FILE)
    frequent_adjectives = [
        word for word, freq in freq_by_adjective.items() if freq >= min_freq
    ]
    adjectives = set(frequent_adjectives)
    if inflected:
        inflections_by_lemma = getLemmasInflectionsDict()
        for lemma in frequent_adjectives:
            if lemma in inflections_by_lemma:
                adjectives.update(inflections_by_lemma[lemma])
    return adjectives
def getSostantiviSetFromPaisa(min_freq, inflected):
    """Return the set of Paisà nouns with frequency >= min_freq.

    When `inflected` is true, the set is expanded with the known
    inflected forms of each frequent noun.
    """
    import lexicon
    noun_freqs = lexicon.loadLexFreqFromFile(PAISA_SOSTANTIVI_FREQ_FILE)
    frequent_nouns = [w for w, f in noun_freqs.items() if f >= min_freq]
    nouns = set(frequent_nouns)
    if inflected:
        inflections = getLemmasInflectionsDict()
        for lemma in frequent_nouns:
            try:
                nouns.update(inflections[lemma])
            except KeyError:
                # Lemma has no recorded inflections; keep only the lemma.
                pass
    return nouns
def builDizAugmentedPaisa(lexPosFreqFile, lexPosBaseFile, min_freq, output_file):
    """Augment a base lexicon with frequent Paisà words that look like
    vowel-ending variants of words already in the lexicon.

    A candidate word w (freq >= min_freq, not already in the base
    lexicon) is added when it ends in a vowel and some base word shares
    its stem (w minus the final vowel).  Each addition is logged as
    'origin->w' in `output_file + '_log'`; the augmented lexicon is
    written to `output_file`.
    """
    import lexicon
    vowels = set('aeiou')  # set for O(1) membership tests
    output_file_log = output_file + '_log'
    paisa_pos_lex_freq = lexicon.loadLexFreqFromFile(lexPosFreqFile)
    diz_base = lexicon.loadLexiconFromFile(lexPosBaseFile)
    # Stems (word minus final vowel) of all vowel-ending base words.
    diz_sostantivi_prefix = set()
    for w in diz_base:
        if len(w) > 1 and w[-1] in vowels:
            diz_sostantivi_prefix.add(w[:-1])
    with open(output_file_log, 'w') as f_out:
        # Most frequent candidates first, so the log is frequency-ordered.
        for w, f in sorted(paisa_pos_lex_freq.items(), key=lambda x: -x[1]):
            if (f >= min_freq and len(w) > 1 and w not in diz_base
                    and w[:-1] in diz_sostantivi_prefix
                    and w[-1] in vowels):
                # Find a witness base word sharing the stem.  Note that
                # o[:-1] == w[:-1] already implies len(o) == len(w), and
                # looking the witness up BEFORE adding w means the search
                # never has to exclude w itself.
                origin = next(o for o in diz_base
                              if o[:-1] == w[:-1] and o[-1] in vowels)
                diz_base.add(w)
                f_out.write('{}->{}\n'.format(origin, w))
    lexicon.printLexiconToFile(diz_base, output_file)
def batch_solver(matrix, game_set_file, output_file, nBest=100, extra_search=False):
    """Solve every game in `game_set_file` and write a TSV report.

    Each output row contains the 5 clues, the best solution, how many
    clues it matched, its per-clue scores and their sum, the remaining
    nBest-1 candidate solutions, and the elapsed time in milliseconds.

    Args:
        matrix: word-association matrix (provides
            get_min_association_score and feeds the group search).
        game_set_file: tab-separated games, 5 clues per line.
        output_file: destination TSV report path.
        nBest: number of candidate solutions to report per game.
        extra_search: when True and fewer than 100 results were found,
            retry with morphologically normalized clues and append
            those results.
    """
    import time
    import corpora
    import lexicon
    from lexicon import morph_normalize_word

    unfound_pair_score = matrix.get_min_association_score()
    lex_freq_dict = lexicon.loadLexFreqFromFile(corpora.PAISA_LEX_FREQ_FILE)
    nouns_lex_freq_dict = lexicon.loadLexFreqFromFile(
        corpora.PAISA_SOSTANTIVI_FREQ_FILE)
    # Nouns by descending frequency: fallback solutions when the solver
    # returns nothing (or too few candidates to fill nBest slots).
    most_freq_nouns = [
        item[0]
        for item in sorted(nouns_lex_freq_dict.items(), key=lambda x: -x[1])
    ]
    game_set = read_game_set_tab(game_set_file)
    output_lines_clues_matched = []
    for game_words in game_set:
        start_time = time.time()
        clues = game_words[:5]
        result = getBestWordAssociationGroups(matrix, clues,
                                              unfound_pair_score, nBest)
        if extra_search and len(result) < 100:
            morphed_clues = [
                morph_normalize_word(c, lex_freq_dict) for c in clues
            ]
            if morphed_clues != clues:
                result += getBestWordAssociationGroups(
                    matrix, morphed_clues, unfound_pair_score, nBest)
                # resorting results (omitting if we want to give more
                # relevance to unmorphed clues)
                # result = sorted(result, key=lambda r: r['scores_sum'])
        if len(result) == 0:
            # No candidates at all: fall back to frequency ranking.
            best_solution = most_freq_nouns[0]
            clues_matched_count = 0
            scores = -9999
            scores_sum = -9999
            remaining_solutions = most_freq_nouns[1:nBest]
        else:
            best_result = result[0]
            other_results = result[1:]
            best_solution = best_result['ranked_solution']
            clues_matched_count = best_result['clues_matched_count']
            scores = best_result['scores']
            scores_sum = best_result['scores_sum']
            remaining_solutions = [
                r['ranked_solution'] for r in other_results
            ]
            if len(remaining_solutions) < (nBest - 1):
                # Pad with frequent nouns not already reported.
                # Bug fix: membership must be tested against the solution
                # strings (`remaining_solutions`), not against the result
                # dicts (`other_results`) — a string never equals a dict,
                # so the old check always passed and could duplicate
                # already-listed solutions.
                missing_count = nBest - 1 - len(remaining_solutions)
                missing_nouns = [
                    n for n in most_freq_nouns
                    if n != best_solution and n not in remaining_solutions
                ][:missing_count]
                remaining_solutions += missing_nouns
        remaining_solutions_str = ', '.join(remaining_solutions)
        # Bug fix: multiply by 1000 BEFORE rounding; the old code rounded
        # the elapsed seconds to a whole integer first, so every game
        # faster than half a second reported 0 ms.
        elapsed_time = int(round((time.time() - start_time) * 1000))
        report_fields = clues + [
            best_solution, clues_matched_count, scores, scores_sum,
            remaining_solutions_str, elapsed_time
        ]
        output_lines_clues_matched.append('\t'.join(
            str(x) for x in report_fields))
    print('Input lines: {}'.format(len(game_set)))
    print('Output lines: {}'.format(len(output_lines_clues_matched)))
    with open(output_file, 'w') as f_out:
        print_write(f_out, '\n'.join(output_lines_clues_matched))
def coverage():
    """Compute the coverage of the game-word lexicon over the 100-game set
    and write the result to COVERAGE_WORD_GAME100_FILE."""
    freq_lexicon = lexicon.loadLexFreqFromFile(LEX_FREQ_FILE)
    scorer.computeCoverageOfGameWordLex(freq_lexicon,
                                        corpora.GAME_SET_100_FILE,
                                        COVERAGE_WORD_GAME100_FILE)