Example #1
import os

import spacy

# load_squad_tokens, Vocabulary, PAD_TOKEN and UNK_TOKEN come from the
# project's own modules; their imports were omitted in the original snippet.


def main(args):
    # Load spaCy's English model for tokenization only; the unused pipeline
    # components are disabled so the model loads faster.
    spacy_en = spacy.load('en_core_web_sm', disable=['vectors', 'textcat', 'tagger', 'parser', 'ner'])

    # Optionally lower-case every token.
    postprocess = str.lower if args.lower else (lambda x: x)

    def tokenizer(x):
        return [postprocess(token.text) for token in spacy_en(x) if not token.is_space]

    # Choose which SQuAD fields to draw tokens from: context (0), question (1), or both.
    if args.only_question:
        indices = [1]
        desc = 'question'
    elif args.only_context:
        indices = [0]
        desc = 'context'
    else:
        indices = [0, 1]
        desc = 'question_context'

    # Encode the build settings into the output filename,
    # e.g. vocab_question_min-freq5_max_size30000.txt
    basename, ext = os.path.splitext(args.vocab_path)
    min_freq = args.min_freq if args.min_freq else ''
    max_size = args.max_size if args.max_size else ''
    filename = f'{basename}_{desc}_min-freq{min_freq}_max_size{max_size}{ext}'

    squad_tokens = load_squad_tokens(args.train_path, tokenizer, indices=indices)
    Vocabulary.build(squad_tokens, args.min_freq, args.max_size, (PAD_TOKEN, UNK_TOKEN), filename)
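For reference, here is a minimal sketch of the command-line interface the main functions on this page appear to assume. The attribute names (train_path, vocab_path, lower, only_question, only_context, min_freq, max_size) are taken directly from what main() reads off args; the flag spellings, defaults, and help strings are assumptions, not the original code.

import argparse

# Hypothetical CLI wiring, reconstructed from the attributes main() accesses.
parser = argparse.ArgumentParser(description='Build a vocabulary from SQuAD data.')
parser.add_argument('train_path', help='path to the SQuAD training file')
parser.add_argument('vocab_path', help='where to write the vocabulary')
parser.add_argument('--lower', action='store_true', help='lower-case all tokens')
parser.add_argument('--only-question', action='store_true',
                    help='build the vocabulary from questions only')
parser.add_argument('--only-context', action='store_true',
                    help='build the vocabulary from contexts only')
parser.add_argument('--min-freq', type=int, default=None,
                    help='drop tokens rarer than this')
parser.add_argument('--max-size', type=int, default=None,
                    help='cap the vocabulary size')

if __name__ == '__main__':
    main(parser.parse_args())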
Example #2
import os

# get_tokenizer, load_squad_tokens, Vocabulary, PAD_TOKEN and UNK_TOKEN come
# from the project's own modules; their imports were omitted in the snippet.


def main(args):
    # Same as Example #1, except the spaCy tokenizer is built by a shared helper.
    tokenizer = get_tokenizer(lower=args.lower, as_str=True)

    if args.only_question:
        indices = [1]
        desc = 'question'
    elif args.only_context:
        indices = [0]
        desc = 'context'
    else:
        indices = [0, 1]
        desc = 'question_context'

    basename, ext = os.path.splitext(args.vocab_path)
    min_freq = args.min_freq if args.min_freq else ''
    max_size = args.max_size if args.max_size else ''
    filename = f'{basename}_{desc}_min-freq{min_freq}_max_size{max_size}{ext}'

    squad_tokens = load_squad_tokens(args.train_path,
                                     tokenizer,
                                     indices=indices)
    Vocabulary.build(squad_tokens, args.min_freq, args.max_size,
                     (PAD_TOKEN, UNK_TOKEN), filename)
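Example #2 factors the tokenizer out into get_tokenizer, whose implementation is not shown on this page. A plausible version, consistent with the inline tokenizer in Example #1, might look like the sketch below; the exact meaning of the as_str flag is an assumption.

import spacy

def get_tokenizer(lower=False, as_str=True):
    # Assumed implementation mirroring the inline tokenizer from Example #1.
    # `as_str` is taken to mean "return plain strings instead of spaCy tokens".
    spacy_en = spacy.load('en_core_web_sm',
                          disable=['vectors', 'textcat', 'tagger', 'parser', 'ner'])
    postprocess = str.lower if lower else (lambda x: x)

    def tokenize(text):
        tokens = [t for t in spacy_en(text) if not t.is_space]
        return [postprocess(t.text) for t in tokens] if as_str else tokens

    return tokenize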
Example #3
def test_build(self):
    tokens = ['rock', 'n', 'roll']
    # Build a vocabulary with min_freq=1, max_size=4 and '<pad>' as the only
    # special token; passing None as the path skips saving to disk.
    token_to_index, index_to_token = Vocabulary.build(
        tokens, 1, 4, ('<pad>',), None)
    tokens += ['<pad>']
    self.assertCountEqual(token_to_index.keys(), tokens)
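The test pins down the contract of Vocabulary.build: it takes an iterable of tokens, a minimum frequency, a maximum size, a tuple of special tokens, and an optional save path, and returns a (token_to_index, index_to_token) pair. A minimal sketch consistent with that contract follows; the frequency filtering order, the placement of the special tokens, and the serialization format are assumptions, not the project's actual implementation.

import json
from collections import Counter

class Vocabulary:
    @staticmethod
    def build(tokens, min_freq, max_size, specials, filename):
        # Count token frequencies and keep tokens that meet min_freq
        # (a falsy min_freq keeps everything), most frequent first.
        counts = Counter(tokens)
        kept = [t for t, c in counts.most_common() if not min_freq or c >= min_freq]
        # Assumption: specials are prepended before the size cap is applied.
        kept = list(specials) + kept
        if max_size:
            kept = kept[:max_size]
        token_to_index = {t: i for i, t in enumerate(kept)}
        index_to_token = {i: t for t, i in token_to_index.items()}
        if filename is not None:
            # Assumed serialization format; the real project may differ.
            with open(filename, 'w') as f:
                json.dump(token_to_index, f)
        return token_to_index, index_to_token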