def main(args):
    """Build and save a vocabulary file from SQuAD training data.

    Tokenizes the configured fields (question, context, or both) with a
    stripped-down spaCy pipeline and writes the resulting vocabulary to a
    filename derived from ``args.vocab_path``.
    """
    # Only tokenization is needed, so every other pipeline component is off.
    spacy_en = spacy.load(
        'en_core_web_sm',
        disable=['vectors', 'textcat', 'tagger', 'parser', 'ner'])

    # Optional lowercasing applied to each token's surface text.
    if args.lower:
        postprocess = str.lower
    else:
        def postprocess(text):
            return text

    def tokenizer(x):
        # Drop pure-whitespace tokens; keep everything else, post-processed.
        out = []
        for token in spacy_en(x):
            if token.is_space:
                continue
            out.append(postprocess(token.text))
        return out

    # Field selection: index 0 = context, index 1 = question.
    if args.only_question:
        indices, desc = [1], 'question'
    elif args.only_context:
        indices, desc = [0], 'context'
    else:
        indices, desc = [0, 1], 'question_context'

    basename, ext = os.path.splitext(args.vocab_path)
    # Falsy (absent/zero) limits are rendered as empty strings in the name.
    min_freq = args.min_freq if args.min_freq else ''
    max_size = args.max_size if args.max_size else ''
    filename = f'{basename}_{desc}_min-freq{min_freq}_max_size{max_size}{ext}'

    squad_tokens = load_squad_tokens(args.train_path, tokenizer,
                                     indices=indices)
    Vocabulary.build(squad_tokens, args.min_freq, args.max_size,
                     (PAD_TOKEN, UNK_TOKEN), filename)
def main(args):
    """Build and save a vocabulary file from SQuAD training data.

    Uses the project's shared tokenizer factory, then writes the vocabulary
    to a filename derived from ``args.vocab_path`` and the chosen fields.
    """
    tokenizer = get_tokenizer(lower=args.lower, as_str=True)

    # Field selection: index 0 = context, index 1 = question.
    if args.only_question:
        indices, desc = [1], 'question'
    elif args.only_context:
        indices, desc = [0], 'context'
    else:
        indices, desc = [0, 1], 'question_context'

    basename, ext = os.path.splitext(args.vocab_path)
    # Falsy (absent/zero) limits are rendered as empty strings in the name.
    min_freq = args.min_freq if args.min_freq else ''
    max_size = args.max_size if args.max_size else ''
    filename = f'{basename}_{desc}_min-freq{min_freq}_max_size{max_size}{ext}'

    squad_tokens = load_squad_tokens(args.train_path, tokenizer,
                                     indices=indices)
    Vocabulary.build(squad_tokens, args.min_freq, args.max_size,
                     (PAD_TOKEN, UNK_TOKEN), filename)
def test_build(self):
    """Vocabulary.build returns a mapping covering the input tokens plus
    the requested special tokens."""
    tokens = ['rock', 'n', 'roll']
    mapping, inverse = Vocabulary.build(tokens, 1, 4, ('<pad>',), None)
    # The special token must appear alongside every source token.
    tokens.append('<pad>')
    self.assertCountEqual(mapping.keys(), tokens)