Example #1
def build_entity_vocab(dump_db_file: str, white_list: List[TextIO], **kwargs):
    dump_db = DumpDB(dump_db_file)
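    # Each white-list entry is an open file handle; collect one entity title per line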
    white_list = [line.rstrip() for f in white_list for line in f]
    EntityVocab.build(dump_db,
                      white_list=white_list,
                      language=dump_db.language,
                      **kwargs)
Example #2
def build_from_p_e_m_file(p_e_m_file, dump_db_file, wiki_mention_db_file,
                          **kwargs):
    dump_db = DumpDB(dump_db_file)
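    # Tokenizer and lowercase normalizer are passed through to the mention-DB build step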
    tokenizer = BasicTokenizer(do_lower_case=False)
    normalizer = BertLowercaseNormalizer()
    wiki_mention_db = MentionDB(wiki_mention_db_file)
    MentionDB.build_from_p_e_m_file(p_e_m_file, dump_db, wiki_mention_db,
                                    tokenizer, normalizer, **kwargs)
Example #3
def setUp():
    global dump_db, dump_db_file

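    # Build a DumpDB from the bundled sample Wikipedia dump for use in the tests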
    dump_file = pkg_resources.resource_filename(
        __name__, 'test_data/enwiki-pages-articles-sample.xml.bz2')
    dump_reader = WikiDumpReader(dump_file)
    dump_db_file = NamedTemporaryFile()

    DumpDB.build(dump_reader, dump_db_file.name, 1, 1)
    dump_db = DumpDB(dump_db_file.name)
Example #4
def build_wikipedia_pretraining_dataset(dump_db_file: str, tokenizer_name: str,
                                        entity_vocab_file: str,
                                        output_dir: str,
                                        sentence_tokenizer: str, **kwargs):
    dump_db = DumpDB(dump_db_file)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    sentence_tokenizer = SentenceTokenizer.from_name(sentence_tokenizer)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    entity_vocab = EntityVocab(entity_vocab_file)
    WikipediaPretrainingDataset.build(dump_db, tokenizer, sentence_tokenizer,
                                      entity_vocab, output_dir, **kwargs)
Example #5
File: main.py Project: yifding/luke
def create_candidate_list(dump_db_file, out_file, data_dir):
    dump_db = DumpDB(dump_db_file)

    titles = set()
    valid_titles = frozenset(dump_db.titles())

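    # Collect every candidate title that resolves (via redirects) to a page present in the dump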
    reader = EntityDisambiguationDataset(data_dir)
    for documents in reader.get_all_datasets():
        for document in documents:
            for mention in document.mentions:
                candidates = mention.candidates
                for candidate in candidates:
                    title = dump_db.resolve_redirect(candidate.title)
                    if title in valid_titles:
                        titles.add(title)

    for title in titles:
        out_file.write(title + "\n")
Example #6
def build_entity_linker(dump_db_file, **kwargs):
    dump_db = DumpDB(dump_db_file)
    tokenizer = RegexpTokenizer()
    EntityLinker.build(dump_db, tokenizer, **kwargs)
Example #7
def build_from_wikipedia(dump_db_file, **kwargs):
    dump_db = DumpDB(dump_db_file)
    tokenizer = BasicTokenizer(do_lower_case=False)
    normalizer = BertLowercaseNormalizer()
    MentionDB.build_from_wikipedia(dump_db, tokenizer, normalizer, **kwargs)
Example #8
File: main.py Project: yifding/luke
def create_redirect_tsv(dump_db_file, out_file):
    dump_db = DumpDB(dump_db_file)

    for src, dest in dump_db.redirects():
        out_file.write(f"{src}\t{dest}\n")
Example #9
File: main.py Project: yifding/luke
def create_title_list(dump_db_file, out_file):
    dump_db = DumpDB(dump_db_file)

    for title in dump_db.titles():
        out_file.write(f"{title}\n")
Example #10
    def __init__(
        self,
        dump_db_file: str,  # Location of the file built by build-dump-db
        tokenizer_name: str,  # Tokenizer to use, e.g. Maltehb/danish-bert-botxo for Danish BERT
        entity_vocab_file: str,  # Built by build-entity-vocab
        out_dir: str,  # Where to put the finished dataset. All contents are removed before saving
        validation_prob: float,  # Probability that each finished document is marked as part of the validation set
        max_entities: int,  # Only up to this many entities are included in each sequence
        max_entity_span: int,  # Maximum number of tokens an entity can span before the sequence is discarded
        min_sentence_length: int,  # Minimum number of tokens a sentence must span to be included
        max_articles: int | None,
        max_vocab_size: int,
    ):
        if not wikipedia2vec_available:
            raise ModuleNotFoundError(
                "Pretrain data generation requires installation of the optional requirement `wikipedia2vec`"
            )
        log("Reading dump database at %s" % dump_db_file)
        self.dump_db = DumpDB(dump_db_file)
        log("Building tokeninizer: %s" % tokenizer_name)
        self.tokenizer_name = tokenizer_name
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        log("Building sentence tokenizer: %s" % self.tokenizer_language)
        self.sentence_tokenizer = ICUSentenceTokenizer(self.tokenizer_language)
        log("Loading entity vocab at %s" % entity_vocab_file)
        self.entity_vocab = load_entity_vocab(entity_vocab_file)
        # Make sure IDs on non-ignored entities are contiguous
        num = 0
        for entity_info in self.entity_vocab.values():
            entity_info["id"] = num
            num += 1
        log("Entity vocab has size %i" % num)

        self.out_dir = out_dir
        self.data_file = os.path.join(self.out_dir, self.data_file)
        self.token_map_file = os.path.join(self.out_dir, self.token_map_file)
        self.max_seq_length = self.tokenizer.model_max_length
        self.validation_prob = validation_prob
        self.max_entities = max_entities
        self.max_entity_span = max_entity_span
        self.min_sentence_length = min_sentence_length
        # Get maximum number of tokens in a sequence excluding start and end tokens
        self.max_num_tokens = self.max_seq_length - 2
        self.max_articles = max_articles
        self.vocab_size = self.tokenizer.vocab_size if max_vocab_size == -1 \
            else min(max_vocab_size, self.tokenizer.vocab_size)

        # Target every article title in the dump database
        self.target_titles = list(self.dump_db.titles())

        # Remove old datafile if it exists
        if os.path.isfile(self.data_file):
            log.debug("Removing old datafile '%s'" % self.data_file)
            os.remove(self.data_file)

        self.examples = list()
Example #11
import copy
import joblib
import argparse
import numpy as np
# Assumed import paths for the classes used below (not shown in the original snippet):
# DumpDB and Dictionary ship with wikipedia2vec; the serialized word trie is a marisa-trie Trie
from wikipedia2vec.dump_db import DumpDB
from wikipedia2vec.dictionary import Dictionary
from marisa_trie import Trie

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--entity_file", type=str, required=True)
    parser.add_argument("--src", type=str, required=True)
    parser.add_argument("--tgt", type=str, required=True)
    parser.add_argument("--dumpdb", type=str, required=True)

    args = parser.parse_args()

    dictionary = Dictionary.load(args.src)
    dumpdb = DumpDB(args.dumpdb)

    with open(args.entity_file) as handle:
        # Strip trailing newlines so titles match the keys in the redirect mapping
        all_needed_entities_raw = {line.rstrip("\n") for line in handle}

    title2dest_title = dict(dumpdb.redirects())
    all_needed_entities = {
        title2dest_title.get(title, title) for title in all_needed_entities_raw
    }

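    # Load the raw serialized dictionary to access its word trie and word statistics directly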
    src_file = joblib.load(args.src)

    old_word_dict = Trie()
    old_word_dict.frombytes(src_file['word_dict'])

    old_word_stats = src_file['word_stats']
Example #12
# -*- coding: utf-8 -*-

import sys
import Levenshtein
from collections import Counter
from wikipedia2vec.dump_db import DumpDB

dump_db = DumpDB(sys.argv[1])
pair_counter = Counter()
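# Count character pairs that differ by a single substitution between redirect source and target titles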

for (title1, title2) in dump_db.redirects():
    ops = Levenshtein.editops(title1.lower(), title2.lower())
    if len(ops) == 1:
        (op, p1, p2) = ops[0]
        if op == 'replace':
            pair_counter[frozenset((title1[p1], title2[p2]))] += 1

for (pair, count) in pair_counter.most_common():
    print('%s\t%s\t%d' % (*list(pair), count))
Example #13
def generate_redirect_file(dump_db_file, out_file, compress):
    data = dict(DumpDB(dump_db_file).redirects())
    joblib.dump(data, out_file, compress=compress)
Example #14
def build_wiki_link_db(common_args, dump_db_file, mention_db_file, **kwargs):
    dump_db = DumpDB(dump_db_file)
    mention_db = MentionDB(mention_db_file)
    WikiLinkDB.build(dump_db, mention_db, **kwargs)