def drawSimPlot(entity, category):
    """Plot the similarity of `entity` against every cached entity of `category`.

    Computes the top-k TF-IDF terms for `entity`, reads one cached term list
    per ``.txt`` file under ``category_entity_cache_dir/<category>/``, scores
    each against the entity's terms, and plots score vs. file index.

    Relies on module-level names defined elsewhere in this file:
    `wiki_parser`, `wiki_trivia_metric_calculator`, `os`, `plt`, and
    `category_entity_cache_dir`.
    """
    wiki_parser_instance = wiki_parser.WikiParser()
    wiki_trivia_metric_calculator_instance = (
        wiki_trivia_metric_calculator.WikiTriviaMetricCalculator())
    tokens = wiki_parser_instance.getEntityTokens(entity)
    topk1 = wiki_trivia_metric_calculator_instance.getTopKTFIDFforEntity(tokens)

    full_path = category_entity_cache_dir + category + '/'
    outer_list = []
    for (root, dirs, files) in os.walk(full_path):
        for file_name in files:
            if file_name.endswith('.txt'):
                # `with` guarantees the handle is closed; the original
                # opened each file and never closed it.
                with open(os.path.join(root, file_name), "r") as current_file:
                    outer_list.append(
                        [line.replace('\n', '') for line in current_file])

    sim_list = [
        wiki_trivia_metric_calculator_instance.getEntitySimilarity(topk1, inner)
        for inner in outer_list
    ]
    if not sim_list:
        # Nothing to plot; also avoids `zip(*[])` unpacking below.
        return

    # Unpack (index, score) pairs explicitly. The original subscripted the
    # result of zip() (`tups[0]`), which raises TypeError on Python 3 where
    # zip returns an iterator.
    xs, ys = zip(*enumerate(sim_list))
    plt.plot(xs, ys, label=str("10"), color="r")
    plt.ylim((0.0, 1.0))
    plt.legend().draggable()
    plt.show()
import wiki_trivia_metric_calculator
import wiki_parser

if __name__ == "__main__":
    # Smoke test: score the top TF-IDF terms of two entities against
    # each other and print the similarity.
    metric_calc = wiki_trivia_metric_calculator.WikiTriviaMetricCalculator()
    entity_parser = wiki_parser.WikiParser()

    messi_tokens = entity_parser.getEntityTokens("Lionel Messi")
    # print messi_tokens
    metric_calc.GetModel()
    messi_topk = metric_calc.getTopKTFIDFforEntity(messi_tokens)
    print(messi_topk)

    ronaldo_tokens = entity_parser.getEntityTokens("Cristiano Ronaldo")
    ronaldo_topk = metric_calc.getTopKTFIDFforEntity(ronaldo_tokens)
    print(ronaldo_topk)

    print(metric_calc.getEntitySimilarity(messi_topk, ronaldo_topk))
def main():
    """Command-line driver for Wikipedia dump processing (Python 2).

    Depending on the options given, this can: index a raw .bz2 dump,
    load a previously built index, extract aligned es/en article pairs,
    dump filtered interwiki links, or extract Wiktionary translation
    entries for one language.
    """
    cwd = os.getcwd()
    parser = OptionParser()
    parser.add_option("-c", "--create_dump", dest="dump_file", default="",
                      help="Index a Wikipedia dump (.bz2)")
    parser.add_option("-o", "--output_file", dest="output_file", default="",
                      help="Location where the indexed dump will be printed")
    parser.add_option("-i", "--index_file", dest="index_file", default="",
                      help="Location of a previously saved index")
    parser.add_option("-d", "--wiki_file", dest="wiki_file", default="",
                      help="Location of a previously saved wikitext file")
    parser.add_option("--inter_wiki", dest="interwiki_file", default="",
                      help="Location of an interwiki links SQL file")
    parser.add_option("--iw_out", dest="iw_out", default="",
                      help="Print interwiki links to this file")
    parser.add_option(
        "-l", "--language_code", dest="language_code", default="",
        help="Language code of the target Wikipedia of the interwiki links")
    parser.add_option(
        "--dict_trans", dest="dict_trans", default="",
        help="Print Wiktionary entries for the given language.\n" +
        "The language should be specified as a full name.")
    parser.add_option(
        "--dict_trans_out", dest="dict_trans_out", default="",
        help="Dictionary entries are printed here (tab separated)")
    # For creating document pairs
    parser.add_option("--text_out", dest="text_out", default="",
                      help="Output cleaned wikitext to this file")
    parser.add_option("--iw_in", dest="iw_in", default="",
                      help="Saved interwiki links")
    (opts, args) = parser.parse_args()

    wd = wiki_dump.WikiDump()
    # Mode 1: build a new index from a raw .bz2 dump.
    if opts.dump_file and opts.output_file:
        wd.CreateDump(opts.dump_file, opts.output_file,
                      opts.output_file + '.index')

    # Used to identify pages outside of the main namespace
    # (e.g. "Talk:", "Category:" — anything with a "Prefix:" title).
    special_page = re.compile('^\S+:')

    # TODO: Temporary, many things not handled in the options
    # Mode 2: write aligned source/target article pairs. The es/en model
    # and dump paths are hard-coded here (see TODO above).
    if opts.text_out and opts.iw_in:
        print "Writing article pairs from", opts.iw_in, "to", opts.text_out
        source_wp = wiki_parser.WikiParser('old_models/es_model.pickle')
        target_wp = wiki_parser.WikiParser('old_models/en_sbreak.pickle')
        source_dump = wiki_dump.WikiDump()
        source_dump.LoadIndex(cwd + '/data/es_dump.index', cwd + '/data/es_dump')
        print 'Done loading es_dump.index'
        target_dump = wiki_dump.WikiDump()
        target_dump.LoadIndex(cwd + '/data/en_dump.index', cwd + '/data/en_dump')
        print 'Done loading en_dump.index'
        source_out = open(opts.text_out + '.source', 'w')
        target_out = open(opts.text_out + '.target', 'w')
        count = 0
        title_list = open(opts.iw_in, mode='r')
        for line in title_list:
            # Each input line is "<target_title>\t<source_title>".
            (target_title, source_title) = line.strip().split('\t')
            # Skip non-main-namespace pages on either side.
            if special_page.match(source_title) or special_page.match(
                    target_title):
                continue
            source_wt = source_dump.GetArticle(source_title)
            target_wt = target_dump.GetArticle(target_title)
            if not source_wt or not target_wt:
                continue
            # Skip redirect stubs ("#REDIRECT"/"#REDIRECCION", any case).
            if re.match('^#REDIREC', source_wt, re.IGNORECASE) or re.match(
                    '^#REDIREC', target_wt, re.IGNORECASE):
                continue
            source_sents = source_wp.ToPlainText(source_wt)
            if len(source_sents) == 0:
                continue
            target_sents = target_wp.ToPlainText(target_wt)
            if len(target_sents) == 0:
                continue
            print source_title, "\t\t", target_title
            # One article per record; blank line separates documents.
            source_out.write('\n'.join(source_sents).encode('utf-8') + '\n\n')
            target_out.write('\n'.join(target_sents).encode('utf-8') + '\n\n')
            count += 1
        print "Wrote", count, "document pairs"
        source_out.close()
        target_out.close()

    # Mode 3: load a previously built index into `wd` (used by the
    # interwiki and dictionary modes below).
    if opts.index_file and opts.wiki_file:
        wd.LoadIndex(opts.index_file, opts.wiki_file)

    # Mode 4: dump interwiki links for one language, filtered to
    # main-namespace pages on both sides.
    if opts.interwiki_file and opts.language_code and opts.iw_out:
        iw_file = opts.interwiki_file
        lc = opts.language_code
        iw_out = open(opts.iw_out, 'w')
        for source_title, target_title in wd.IterateInterwiki(iw_file, lc):
            if not special_page.match(
                    source_title) and not special_page.match(target_title):
                iw_out.write(source_title + "\t" + target_title + "\n")
        iw_out.close()

    # Mode 5: extract Wiktionary translation entries for one language.
    if opts.dict_trans:
        # The third group will contain the entries
        dict_out = None
        if opts.dict_trans_out:
            dict_out = open(opts.dict_trans_out, 'w')
        # Matches a translation line for the requested language, e.g.
        # "* [[Spanish]]: {{t|es|palabra}}"; the language link brackets
        # are optional.
        dict_line = re.compile(
            r'^\*\s*(\[\[|)' + opts.dict_trans + r'(\]\]|):(.*)$',
            re.IGNORECASE)
        print dict_line.pattern
        # Matches an individual translation entry.
        # Groups:
        # 1: Template type ('+' '-' or '')
        # 2: Language code
        # 3: Translation
        # 4: Rest of the options (TODO)
        dict_entry = re.compile(
            '\{\{t(\+|\-|)\|([^\|\}]+)\|([^\|\}]+)(\|[^\|\}]*)*\}\}')
        print dict_entry.pattern
        for title, wiki_text in wd.IterateArticles():
            if special_page.match(title):
                continue
            for line in wiki_text.splitlines():
                line_match = dict_line.search(line)
                if line_match:
                    entries = line_match.group(3)
                    print entries
                    for entry in dict_entry.finditer(entries):
                        print "\t", entry.groups()
                        if dict_out:
                            # group(3) is the translation text itself.
                            dict_out.write(title + "\t" + entry.group(3) + "\n")
        if dict_out:
            dict_out.close()
import algorithm_wrapper
import wikipedia as wiki
import pdb
import wiki_parser
import wiki_trivia_metric_calculator

if __name__ == "__main__":
    # Run the trivia algorithm once per line of input.txt, printing each
    # result.
    wiki_parser_instance = wiki_parser.WikiParser()
    wiki_trivia_metric_calculator_instance = (
        wiki_trivia_metric_calculator.WikiTriviaMetricCalculator())
    print("Init done")
    # `with` closes input.txt even if an iteration raises; the original
    # only called close() on the success path.
    with open("input.txt", "r") as target:
        for line in target:
            line = line.replace('\n', '')
            print(algorithm_wrapper.triviaAlgorithm(
                line, wiki_parser_instance,
                wiki_trivia_metric_calculator_instance))