def drawSimPlot(entity, category):
    """Plot the similarity of `entity` against every cached entity of `category`.

    Computes the top-k TF-IDF terms for `entity`, reads one cached term list
    per ``.txt`` file under ``category_entity_cache_dir/<category>/``, scores
    each against the entity's terms, and plots score vs. file index.

    Relies on module-level names defined elsewhere in this file:
    `wiki_parser`, `wiki_trivia_metric_calculator`, `os`, `plt`, and
    `category_entity_cache_dir`.
    """
    wiki_parser_instance = wiki_parser.WikiParser()
    wiki_trivia_metric_calculator_instance = (
        wiki_trivia_metric_calculator.WikiTriviaMetricCalculator())
    tokens = wiki_parser_instance.getEntityTokens(entity)
    topk1 = wiki_trivia_metric_calculator_instance.getTopKTFIDFforEntity(tokens)

    full_path = category_entity_cache_dir + category + '/'
    outer_list = []
    for (root, dirs, files) in os.walk(full_path):
        for file_name in files:
            if file_name.endswith('.txt'):
                # `with` guarantees the handle is closed; the original
                # opened each file and never closed it.
                with open(os.path.join(root, file_name), "r") as current_file:
                    outer_list.append(
                        [line.replace('\n', '') for line in current_file])

    sim_list = [
        wiki_trivia_metric_calculator_instance.getEntitySimilarity(topk1, inner)
        for inner in outer_list
    ]
    if not sim_list:
        # Nothing to plot; also avoids `zip(*[])` unpacking below.
        return

    # Unpack (index, score) pairs explicitly. The original subscripted the
    # result of zip() (`tups[0]`), which raises TypeError on Python 3 where
    # zip returns an iterator.
    xs, ys = zip(*enumerate(sim_list))
    plt.plot(xs, ys, label=str("10"), color="r")
    plt.ylim((0.0, 1.0))
    plt.legend().draggable()
    plt.show()
import wiki_trivia_metric_calculator
import wiki_parser

if __name__ == "__main__":
    # Smoke test: score the top TF-IDF terms of two entities against
    # each other and print the similarity.
    metric_calc = wiki_trivia_metric_calculator.WikiTriviaMetricCalculator()
    entity_parser = wiki_parser.WikiParser()

    messi_tokens = entity_parser.getEntityTokens("Lionel Messi")
    # print messi_tokens
    metric_calc.GetModel()
    messi_topk = metric_calc.getTopKTFIDFforEntity(messi_tokens)
    print(messi_topk)

    ronaldo_tokens = entity_parser.getEntityTokens("Cristiano Ronaldo")
    ronaldo_topk = metric_calc.getTopKTFIDFforEntity(ronaldo_tokens)
    print(ronaldo_topk)

    print(metric_calc.getEntitySimilarity(messi_topk, ronaldo_topk))
def main():
    """Command-line driver for Wikipedia dump processing (Python 2).

    Depending on the options given, this can: index a raw .bz2 dump,
    load a previously built index, extract aligned es/en article pairs,
    dump filtered interwiki links, or extract Wiktionary translation
    entries for one language.
    """
    cwd = os.getcwd()
    parser = OptionParser()
    parser.add_option("-c", "--create_dump", dest="dump_file", default="",
                      help="Index a Wikipedia dump (.bz2)")
    parser.add_option("-o", "--output_file", dest="output_file", default="",
                      help="Location where the indexed dump will be printed")
    parser.add_option("-i", "--index_file", dest="index_file", default="",
                      help="Location of a previously saved index")
    parser.add_option("-d", "--wiki_file", dest="wiki_file", default="",
                      help="Location of a previously saved wikitext file")
    parser.add_option("--inter_wiki", dest="interwiki_file", default="",
                      help="Location of an interwiki links SQL file")
    parser.add_option("--iw_out", dest="iw_out", default="",
                      help="Print interwiki links to this file")
    parser.add_option(
        "-l", "--language_code", dest="language_code", default="",
        help="Language code of the target Wikipedia of the interwiki links")
    parser.add_option(
        "--dict_trans", dest="dict_trans", default="",
        help="Print Wiktionary entries for the given language.\n" +
        "The language should be specified as a full name.")
    parser.add_option(
        "--dict_trans_out", dest="dict_trans_out", default="",
        help="Dictionary entries are printed here (tab separated)")
    # For creating document pairs
    parser.add_option("--text_out", dest="text_out", default="",
                      help="Output cleaned wikitext to this file")
    parser.add_option("--iw_in", dest="iw_in", default="",
                      help="Saved interwiki links")
    (opts, args) = parser.parse_args()

    wd = wiki_dump.WikiDump()
    # Mode 1: build a new index from a raw .bz2 dump.
    if opts.dump_file and opts.output_file:
        wd.CreateDump(opts.dump_file, opts.output_file,
                      opts.output_file + '.index')

    # Used to identify pages outside of the main namespace
    # (e.g. "Talk:", "Category:" — anything with a "Prefix:" title).
    special_page = re.compile('^\S+:')

    # TODO: Temporary, many things not handled in the options
    # Mode 2: write aligned source/target article pairs. The es/en model
    # and dump paths are hard-coded here (see TODO above).
    if opts.text_out and opts.iw_in:
        print "Writing article pairs from", opts.iw_in, "to", opts.text_out
        source_wp = wiki_parser.WikiParser('old_models/es_model.pickle')
        target_wp = wiki_parser.WikiParser('old_models/en_sbreak.pickle')
        source_dump = wiki_dump.WikiDump()
        source_dump.LoadIndex(cwd + '/data/es_dump.index', cwd + '/data/es_dump')
        print 'Done loading es_dump.index'
        target_dump = wiki_dump.WikiDump()
        target_dump.LoadIndex(cwd + '/data/en_dump.index', cwd + '/data/en_dump')
        print 'Done loading en_dump.index'
        source_out = open(opts.text_out + '.source', 'w')
        target_out = open(opts.text_out + '.target', 'w')
        count = 0
        title_list = open(opts.iw_in, mode='r')
        for line in title_list:
            # Each input line is "<target_title>\t<source_title>".
            (target_title, source_title) = line.strip().split('\t')
            # Skip non-main-namespace pages on either side.
            if special_page.match(source_title) or special_page.match(
                    target_title):
                continue
            source_wt = source_dump.GetArticle(source_title)
            target_wt = target_dump.GetArticle(target_title)
            if not source_wt or not target_wt:
                continue
            # Skip redirect stubs ("#REDIRECT"/"#REDIRECCION", any case).
            if re.match('^#REDIREC', source_wt, re.IGNORECASE) or re.match(
                    '^#REDIREC', target_wt, re.IGNORECASE):
                continue
            source_sents = source_wp.ToPlainText(source_wt)
            if len(source_sents) == 0:
                continue
            target_sents = target_wp.ToPlainText(target_wt)
            if len(target_sents) == 0:
                continue
            print source_title, "\t\t", target_title
            # One article per record; blank line separates documents.
            source_out.write('\n'.join(source_sents).encode('utf-8') + '\n\n')
            target_out.write('\n'.join(target_sents).encode('utf-8') + '\n\n')
            count += 1
        print "Wrote", count, "document pairs"
        source_out.close()
        target_out.close()

    # Mode 3: load a previously built index into `wd` (used by the
    # interwiki and dictionary modes below).
    if opts.index_file and opts.wiki_file:
        wd.LoadIndex(opts.index_file, opts.wiki_file)

    # Mode 4: dump interwiki links for one language, filtered to
    # main-namespace pages on both sides.
    if opts.interwiki_file and opts.language_code and opts.iw_out:
        iw_file = opts.interwiki_file
        lc = opts.language_code
        iw_out = open(opts.iw_out, 'w')
        for source_title, target_title in wd.IterateInterwiki(iw_file, lc):
            if not special_page.match(
                    source_title) and not special_page.match(target_title):
                iw_out.write(source_title + "\t" + target_title + "\n")
        iw_out.close()

    # Mode 5: extract Wiktionary translation entries for one language.
    if opts.dict_trans:
        # The third group will contain the entries
        dict_out = None
        if opts.dict_trans_out:
            dict_out = open(opts.dict_trans_out, 'w')
        # Matches a translation line for the requested language, e.g.
        # "* [[Spanish]]: {{t|es|palabra}}"; the language link brackets
        # are optional.
        dict_line = re.compile(
            r'^\*\s*(\[\[|)' + opts.dict_trans + r'(\]\]|):(.*)$',
            re.IGNORECASE)
        print dict_line.pattern
        # Matches an individual translation entry.
        # Groups:
        # 1: Template type ('+' '-' or '')
        # 2: Language code
        # 3: Translation
        # 4: Rest of the options (TODO)
        dict_entry = re.compile(
            '\{\{t(\+|\-|)\|([^\|\}]+)\|([^\|\}]+)(\|[^\|\}]*)*\}\}')
        print dict_entry.pattern
        for title, wiki_text in wd.IterateArticles():
            if special_page.match(title):
                continue
            for line in wiki_text.splitlines():
                line_match = dict_line.search(line)
                if line_match:
                    entries = line_match.group(3)
                    print entries
                    for entry in dict_entry.finditer(entries):
                        print "\t", entry.groups()
                        if dict_out:
                            # group(3) is the translation text itself.
                            dict_out.write(title + "\t" + entry.group(3) + "\n")
        if dict_out:
            dict_out.close()
import algorithm_wrapper
import wikipedia as wiki
import pdb
import wiki_parser
import wiki_trivia_metric_calculator

if __name__ == "__main__":
    # Run the trivia algorithm once per line of input.txt, printing each
    # result.
    wiki_parser_instance = wiki_parser.WikiParser()
    wiki_trivia_metric_calculator_instance = (
        wiki_trivia_metric_calculator.WikiTriviaMetricCalculator())
    print("Init done")
    # `with` closes input.txt even if an iteration raises; the original
    # only called close() on the success path.
    with open("input.txt", "r") as target:
        for line in target:
            line = line.replace('\n', '')
            print(algorithm_wrapper.triviaAlgorithm(
                line, wiki_parser_instance,
                wiki_trivia_metric_calculator_instance))