def test_init_with_nested_field_as_nesting_field(self):
    nesting_field = data.NestedField(data.Field())

    with pytest.raises(ValueError) as excinfo:
        data.NestedField(nesting_field)
    assert "nesting field must not be another NestedField" in str(
        excinfo.value)
def test_pad_when_fix_length_is_not_none(self):
    nesting_field = data.Field(tokenize=list, unk_token="<cunk>",
                               pad_token="<cpad>", init_token="<w>",
                               eos_token="</w>")
    CHARS = data.NestedField(
        nesting_field, init_token="<s>", eos_token="</s>", fix_length=3)
    minibatch = [
        ["john", "loves", "mary"],
        ["mary", "cries"]
    ]
    expected = [
        [
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("john") + ["</w>", "<cpad>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
        ],
        [
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
        ]
    ]

    assert CHARS.pad(minibatch) == expected

    # test include_lengths
    nesting_field = data.Field(tokenize=list, unk_token="<cunk>",
                               pad_token="<cpad>", init_token="<w>",
                               eos_token="</w>")
    CHARS = data.NestedField(nesting_field, init_token="<s>",
                             eos_token="</s>", include_lengths=True,
                             fix_length=3)
    arr, seq_len, words_len = CHARS.pad(minibatch)
    assert arr == expected
    assert seq_len == [3, 3]
    assert words_len == [[3, 6, 3], [3, 6, 3]]
def __init__(self):
    super(DMNIterator, self).__init__()

    # Define text nested field
    self.text_sent = data.Field(sequential=True, lower=True,
                                tokenize=lambda x: x.split(" "))
    self.text_doc = data.NestedField(self.text_sent,
                                     tokenize=lambda x: x.split("<EOS>"),
                                     include_lengths=True)

    # Define entity nested field
    self.entity_sent = data.Field(sequential=True,
                                  tokenize=lambda x: x.split(" "),
                                  unk_token=None)
    self.entity_doc = data.NestedField(self.entity_sent,
                                       tokenize=lambda x: x.split("<EOS>"))

    # Define label nested field
    self.label_sent = data.Field(sequential=True,
                                 tokenize=lambda x: x.split(" "),
                                 unk_token=None)
    self.label_doc = data.NestedField(self.label_sent,
                                      tokenize=lambda x: x.split("<EOS>"))

    # Define offset nested field
    self.offset_sent = self.InfoField(sequential=True,
                                      tokenize=lambda x: x.split(" "),
                                      use_vocab=False)
    self.offset_doc = self.NestedInfoField(self.offset_sent,
                                           tokenize=lambda x: x.split("<EOS>"),
                                           use_vocab=False)

    # Define length nested field
    self.length_sent = self.InfoField(sequential=True,
                                      tokenize=lambda x: x.split(" "),
                                      use_vocab=False, pad_token=None)
    self.length_doc = self.NestedInfoField(self.length_sent,
                                           tokenize=lambda x: x.split("<EOS>"),
                                           use_vocab=False)

    # Define word attention field
    self.word_attn_sent = self.InfoField(sequential=True,
                                         tokenize=lambda x: x.split(" "),
                                         use_vocab=False)
    self.word_attn_doc = self.NestedInfoField(self.word_attn_sent,
                                              tokenize=lambda x: x.split("<EOS>"),
                                              use_vocab=False)

    # Define sentence attention field
    self.sent_attn_doc = self.InfoField(sequential=True,
                                        tokenize=lambda x: x.split("<EOS>"),
                                        use_vocab=False)

    # Define doc id field
    self.doc_id = self.InfoField(sequential=False, use_vocab=False)

    self.vectors = None
def get_data_iter():
    WORD = data.Field(init_token='<bos>', eos_token='<eos>',
                      include_lengths=True)
    UD_TAG = data.Field(init_token='<bos>', eos_token='<eos>')
    PTB_TAG = data.Field(init_token='<bos>', eos_token='<eos>')
    CHAR_NESTING = data.Field(tokenize=list, init_token='<bos>',
                              eos_token='<eos>')
    CHAR = data.NestedField(CHAR_NESTING, init_token='<bos>',
                            eos_token='<eos>', include_lengths=True)

    train, val, test = datasets.UDPOS.splits(
        fields=((('word', 'char'), (WORD, CHAR)),
                ('tag', UD_TAG), ('ptbtag', PTB_TAG)),
        root='.data',
        train='en-ud-tag.v2.train.txt',
        validation='en-ud-tag.v2.dev.txt',
        test='en-ud-tag.v2.test.txt'
    )

    WORD.build_vocab(train, min_freq=args.word_min_freq)
    UD_TAG.build_vocab(train)
    PTB_TAG.build_vocab(train)
    CHAR.build_vocab(train)

    args.word2idx = WORD.vocab.stoi
    args.tag2idx = PTB_TAG.vocab.stoi
    args.char2idx = CHAR.vocab.stoi
    args.tag_bos = PTB_TAG.init_token
    args.tag_eos = PTB_TAG.eos_token
    args.tag_pad = PTB_TAG.pad_token

    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test),
        batch_sizes=(args.train_batch_size,
                     args.val_batch_size,
                     args.val_batch_size),
        device=args.device,
        repeat=False)
    return train_iter, val_iter, test_iter
def __init__(self):
    self.CHAR_NESTING = data.Field(batch_first=True, tokenize=list)
    self.char_field = data.NestedField(self.CHAR_NESTING,
                                       tokenize=lambda x: x.split(),
                                       fix_length=60)
    # Build the word-level fields
    self.TEXT = data.Field(batch_first=True, lower=True,
                           tokenize=lambda x: x.split(), fix_length=60)
    self.bigram = data.Field(batch_first=True, lower=True,
                             tokenize=lambda x: n_gram_tokenizer(x, 2),
                             fix_length=60)
    self.trigram = data.Field(batch_first=True, lower=True,
                              tokenize=lambda x: n_gram_tokenizer(x, 3),
                              fix_length=60)
    # The label field must be a LabelField!
    self.LABEL = data.LabelField(use_vocab=True, dtype=torch.long)

    self.WORD_FIELD = [("sentence_word", self.TEXT), ("label", self.LABEL)]
    self.CHAR_FIELD = [("sentence_char", self.char_field),
                       ("sentence_word", self.TEXT),
                       ("label", self.LABEL)]
    self.BIGRAM_FIELD = [("sentence_word", self.TEXT),
                         ("sentence_bigram", self.bigram),
                         ("label", self.LABEL)]
def get_dataset(base_path, batch_size, pretrained_embedding=None,
                is_inference=False):
    sentence = data.Field(lower=False, include_lengths=True,
                          batch_first=True)
    char_nesting = data.Field(lower=False, tokenize=list)
    char_sentence = data.NestedField(char_nesting, include_lengths=True)
    tags = data.Field(batch_first=True)

    train, val, test = SequenceTaggingDataset.splits(
        path=base_path,
        train="train.txt",
        validation="dev.txt",
        test="test.txt",
        fields=[(("sentence", "char_sentence"), (sentence, char_sentence)),
                ("tags", tags)])

    tags.build_vocab(train.tags)
    if not pretrained_embedding:
        sentence.build_vocab(train.sentence, min_freq=5)
    else:
        sentence.build_vocab(train.sentence, vectors=pretrained_embedding)
    char_sentence.build_vocab(train.char_sentence)

    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test), [batch_size] * 3,
        repeat=False, shuffle=True,
        sort_key=lambda x: len(x.sentence), sort_within_batch=True)
    return sentence, char_sentence, tags, val_iter, train_iter, test_iter
def test_serialization(self):
    nesting_field = data.Field(batch_first=True)
    field = data.NestedField(nesting_field)
    ex1 = data.Example.fromlist(["john loves mary"], [("words", field)])
    ex2 = data.Example.fromlist(["mary cries"], [("words", field)])
    dataset = data.Dataset([ex1, ex2], [("words", field)])
    field.build_vocab(dataset)
    examples_data = [
        [
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("john") + ["</w>", "<cpad>"],
            ["<w>"] + list("loves") + ["</w>"],
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
        ],
        [
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>"] + list("cries") + ["</w>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
            ["<cpad>"] * 7,
        ]
    ]

    field_pickle_filename = "char_field.pl"
    field_pickle_path = os.path.join(self.test_dir, field_pickle_filename)
    torch.save(field, field_pickle_path)

    loaded_field = torch.load(field_pickle_path)
    assert loaded_field == field

    original_numericalization = field.numericalize(examples_data)
    pickled_numericalization = loaded_field.numericalize(examples_data)

    assert torch.all(
        torch.eq(original_numericalization, pickled_numericalization))
def load_data(data_path):
    input_word = data.Field(init_token="<bos>", eos_token="<eos>",
                            batch_first=True, lower=True,
                            include_lengths=True)
    input_char_nesting = data.Field(tokenize=list, init_token="<bos>",
                                    eos_token="<eos>", batch_first=True)
    input_char = data.NestedField(input_char_nesting,
                                  init_token="<bos>", eos_token="<eos>")
    label = data.Field(init_token="<bos>", eos_token="<eos>",
                       batch_first=True)

    fields = [(('input_word', 'input_char'), (input_word, input_char)),
              ('label', label)]
    dataset = read_data(data_path, fields)

    for item in dataset:
        print(item.input_word)
        print(item.input_char)
        print(item.label)
        break
def __init__(self, args_dict):
    """
    Params:
        args_dict: ...

    The dataset json is read and split into three jsons:
    "train.json", "val.json", "test.json".
    """
    self.args_dict = args_dict

    # Create data fields
    self.ID = data.Field()
    self.LABEL = data.LabelField()
    if self.args_dict['net_type'] == 'han':
        max_sent_len = (self.args_dict['max_sent_len']
                        if self.args_dict['max_sent_len'] != 0 else None)
        max_doc_len = (self.args_dict['max_doc_len']
                       if self.args_dict['max_doc_len'] != 0 else None)
        # Nested sentence tokens
        nest_field = data.Field(
            pad_token='<pad>',
            fix_length=max_sent_len)  # fix the number of words per sentence
        self.TEXT = data.NestedField(
            nest_field,
            fix_length=max_doc_len)  # fix the number of sentences per doc
    else:
        self.TEXT = data.Field()  # word tokens

    # Modify rob name
    self.rob_item = self.args_dict['rob_item']
    self.under_sample_ratio = self.args_dict['under_sample_ratio']
def test_preprocess(self):
    nesting_field = data.Field(
        tokenize=list, preprocessing=lambda xs: [x.upper() for x in xs])
    field = data.NestedField(nesting_field,
                             preprocessing=lambda xs: reversed(xs))

    preprocessed = field.preprocess("john loves mary")

    assert preprocessed == [list("MARY"), list("LOVES"), list("JOHN")]
def test_init_when_nesting_field_has_include_lengths_equal_true(self):
    nesting_field = data.Field(include_lengths=True)

    with pytest.raises(ValueError) as excinfo:
        data.NestedField(nesting_field)
    assert "nesting field cannot have include_lengths=True" in str(
        excinfo.value)
def test_numericalize(self):
    nesting_field = data.Field(batch_first=True)
    field = data.NestedField(nesting_field)
    ex1 = data.Example.fromlist(["john loves mary"], [("words", field)])
    ex2 = data.Example.fromlist(["mary cries"], [("words", field)])
    dataset = data.Dataset([ex1, ex2], [("words", field)])
    field.build_vocab(dataset)
    examples_data = [
        [
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("john") + ["</w>", "<cpad>"],
            ["<w>"] + list("loves") + ["</w>"],
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
        ],
        [
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>"] + list("cries") + ["</w>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
            ["<cpad>"] * 7,
        ]
    ]
    numericalized = field.numericalize(examples_data, device=-1)

    assert numericalized.dim() == 3
    assert numericalized.size(0) == len(examples_data)
    for example, numericalized_example in zip(examples_data, numericalized):
        verify_numericalized_example(
            field, example, numericalized_example, batch_first=True)
def test_pad_when_pad_first_is_true(self):
    nesting_field = data.Field(tokenize=list, unk_token="<cunk>",
                               pad_token="<cpad>", init_token="<w>",
                               eos_token="</w>")
    CHARS = data.NestedField(nesting_field, init_token="<s>",
                             eos_token="</s>", pad_first=True)
    minibatch = [
        [list("john"), list("loves"), list("mary")],
        [list("mary"), list("cries")],
    ]
    expected = [
        [
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("john") + ["</w>", "<cpad>"],
            ["<w>"] + list("loves") + ["</w>"],
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
        ],
        [
            ["<cpad>"] * 7,
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>"] + list("cries") + ["</w>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
        ]
    ]

    assert CHARS.pad(minibatch) == expected

    # test include_lengths
    nesting_field = data.Field(tokenize=list, unk_token="<cunk>",
                               pad_token="<cpad>", init_token="<w>",
                               eos_token="</w>")
    CHARS = data.NestedField(nesting_field, init_token="<s>",
                             eos_token="</s>", include_lengths=True,
                             pad_first=True)
    arr, seq_len, words_len = CHARS.pad(minibatch)
    assert arr == expected
    assert seq_len == [5, 4]
    assert words_len == [[3, 6, 7, 6, 3], [0, 3, 6, 7, 3]]
def load_data_word_lstm_char(path_file_data,
                             name_file_train,
                             name_file_test=None,
                             min_freq_word=1,
                             min_freq_char=1,
                             batch_size=2):
    inputs_word = data.Field(init_token="<bos>", eos_token="<eos>",
                             batch_first=True)
    inputs_char_nesting = data.Field(tokenize=list, init_token="<bos>",
                                     eos_token="<eos>", batch_first=True)
    inputs_char = data.NestedField(inputs_char_nesting,
                                   init_token="<bos>", eos_token="<eos>")
    labels = data.LabelField(sequential=False)
    fields = [(('inputs_word', 'inputs_char'), (inputs_word, inputs_char)),
              ('labels', labels)]
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    if name_file_test is not None:
        train, test = data.TabularDataset.splits(
            path=path_file_data,
            train=name_file_train,
            test=name_file_test,
            fields=tuple(fields),
            format='csv',
            skip_header=True,
            csv_reader_params={'delimiter': '|'})
        inputs_word.build_vocab(train.inputs_word, test.inputs_word,
                                min_freq=min_freq_word)
        inputs_char.build_vocab(train.inputs_char, test.inputs_char,
                                min_freq=min_freq_char)
        labels.build_vocab(train.labels)
        train_iter, test_iter = data.BucketIterator.splits(
            datasets=(train, test),
            batch_size=batch_size,
            sort_key=lambda x: len(x.inputs_word),
            device=device)
        dict_return = {'iters': (train_iter, test_iter),
                       'vocabs': (inputs_word.vocab, inputs_char.vocab,
                                  labels.vocab)}
    else:
        path_file_data_train = path_file_data + name_file_train
        train = data.TabularDataset(path_file_data_train,
                                    fields=tuple(fields),
                                    format='csv',
                                    skip_header=True,
                                    csv_reader_params={'delimiter': '|'})
        inputs_word.build_vocab(train.inputs_word, min_freq=min_freq_word)
        inputs_char.build_vocab(train.inputs_char, min_freq=min_freq_char)
        labels.build_vocab(train.labels)
        train_iter = data.BucketIterator(
            train,
            batch_size=batch_size,
            sort_key=lambda x: len(x.inputs_word),
            device=device)
        # Keep 'iters' a tuple in both branches
        dict_return = {'iters': (train_iter,),
                       'vocabs': (inputs_word.vocab, inputs_char.vocab,
                                  labels.vocab)}

    return dict_return
def __init__(self, args):
    path = '.data/squad'
    dataset_path = path + '/torchtext/'
    train_examples_path = dataset_path + 'train_examples.pt'
    dev_examples_path = dataset_path + 'dev_examples.pt'

    self.RAW = data.RawField()
    self.CHAR_NESTING = data.Field(batch_first=True, tokenize=list,
                                   lower=True)
    self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize)
    self.WORD = data.Field(batch_first=True, tokenize=word_tokenize,
                           lower=True, include_lengths=True)
    self.LABEL = data.Field(sequential=False, unk_token=None,
                            use_vocab=False)

    dict_fields = {
        'id': ('id', self.RAW),
        's_idx': ('s_idx', self.LABEL),
        'e_idx': ('e_idx', self.LABEL),
        'context': [('c_word', self.WORD), ('c_char', self.CHAR)],
        'question': [('q_word', self.WORD), ('q_char', self.CHAR)]
    }

    list_fields = [('id', self.RAW), ('s_idx', self.LABEL),
                   ('e_idx', self.LABEL),
                   ('c_word', self.WORD), ('c_char', self.CHAR),
                   ('q_word', self.WORD), ('q_char', self.CHAR)]

    if os.path.exists(dataset_path):
        print("loading splits...")
        dev_examples = torch.load(dev_examples_path)
        self.dev = data.Dataset(examples=dev_examples, fields=list_fields)
    else:
        print("building splits...")
        self.dev = data.TabularDataset(path=path + '/dev-v1.1.jsonl',
                                       format='json',
                                       fields=dict_fields)
        os.makedirs(dataset_path)
        torch.save(self.dev.examples, dev_examples_path)

    print("building vocab...")
    self.CHAR.build_vocab(self.dev, min_freq=10000)
    self.WORD.build_vocab(self.dev,
                          vectors=GloVe(name='6B', dim=args.word_dim),
                          max_size=80000)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.dev_iter = data.BucketIterator(self.dev, batch_size=60,
                                        device=device, sort=True,
                                        sort_key=lambda x: len(x.c_word))
def load_data():
    input_word = data.Field(init_token="<bos>", eos_token="<eos>",
                            batch_first=True, lower=True,
                            include_lengths=True)
    input_char_nesting = data.Field(tokenize=list, init_token="<bos>",
                                    eos_token="<eos>", batch_first=True)
    input_char = data.NestedField(input_char_nesting,
                                  init_token="<bos>", eos_token="<eos>")
    label = data.Field(init_token="<bos>", eos_token="<eos>",
                       batch_first=True)

    fields = [(('input_word', 'input_char'), (input_word, input_char)),
              (None, None),
              ('label', label)]

    train, valid, test = datasets.CoNLL2000Chunking.splits(fields)

    for item in train:
        print(item.__dict__.keys())
        print(item.input_word)
        print(item.input_char)
        print(item.label)
        break

    input_word.build_vocab(train.input_word, test.input_word,
                           valid.input_word,
                           vectors=GloVe(name='6B', dim=300))
    # Build the char vocab from the char fields (not the word fields)
    input_char.build_vocab(train.input_char, test.input_char,
                           valid.input_char)
    label.build_vocab(train.label)

    vocab_word = input_word.vocab
    print(vocab_word.vectors)
    # word_embeddings = vocab_word.vectors
    # vocab_word_size = len(vocab_word)
    #
    # vocab_char = input_char.vocab
    # vocab_char_size = len(vocab_char)

    train_iter, test_iter, valid_iter = data.BucketIterator.splits(
        (train, test, valid), batch_size=32,
        sort_key=lambda x: len(x.input_word),
        repeat=False, sort_within_batch=True, shuffle=True)

    return {
        'iter': (train_iter, valid_iter, test_iter),
        'vocabs': (input_word.vocab, input_char.vocab, label.vocab)
    }
def __init__(self, args):
    path = './data/squad'
    dataset_path = path + '/torchtext/'
    train_examples_path = dataset_path + 'train_examples.pt'
    dev_examples_path = dataset_path + 'dev_examples.pt'

    print("[+] Preprocessing data files..")
    # preprocess_file() writes its output next to the input with a trailing
    # 'l' appended to the name (e.g. train-v1.1.json -> train-v1.1.jsonl)
    if not os.path.exists(f'{path}/{args.train_file}l'):
        self.preprocess_file(f'{path}/{args.train_file}')
    if not os.path.exists(f'{path}/{args.dev_file}l'):
        self.preprocess_file(f'{path}/{args.dev_file}')

    self.RAW = data.RawField()
    # tokenize=list splits each word into its characters,
    # e.g. "char" -> ['c', 'h', 'a', 'r']
    self.CHAR_NESTING = data.Field(batch_first=True, tokenize=list,
                                   lower=True)
    # A NestedField wraps another field: the outer tokenizer splits the text
    # into words, and CHAR_NESTING then tokenizes each word into characters
    self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize)
    self.WORD = data.Field(batch_first=True, tokenize=word_tokenize,
                           lower=True, include_lengths=True)
    self.LABEL = data.Field(sequential=False, unk_token=None,
                            use_vocab=False)

    dict_field = {'id': ('id', self.RAW),
                  's_idx': ('s_idx', self.LABEL),
                  'e_idx': ('e_idx', self.LABEL),
                  'context': [('c_word', self.WORD), ('c_char', self.CHAR)],
                  'questions': [('q_word', self.WORD),
                                ('q_char', self.CHAR)]}

    list_field = [('id', self.RAW), ('s_idx', self.LABEL),
                  ('e_idx', self.LABEL),
                  ('c_word', self.WORD), ('c_char', self.CHAR),
                  ('q_word', self.WORD), ('q_char', self.CHAR)]

    if os.path.exists(dataset_path):
        print("[+] Loading splits...")
        train_examples = torch.load(train_examples_path)
        dev_examples = torch.load(dev_examples_path)
        self.train = data.Dataset(examples=train_examples,
                                  fields=list_field)
        self.dev = data.Dataset(examples=dev_examples, fields=list_field)
    else:
        print("[+] Building splits...")
        self.train, self.dev = data.TabularDataset.splits(
            path=path,
            train=f'{args.train_file}l',
            validation=f'{args.dev_file}l',
            format='json',
            fields=dict_field)
        os.makedirs(dataset_path)
        torch.save(self.train.examples, train_examples_path)
        torch.save(self.dev.examples, dev_examples_path)

    # Cut too-long contexts in the training set for efficiency
    if args.context_threshold > 0:
        self.train.examples = [e for e in self.train.examples
                               if len(e.c_word) <= args.context_threshold]

    print("building iterators...")
    self.train_iter, self.dev_iter = data.BucketIterator.splits(
        (self.train, self.dev),
        batch_sizes=[args.train_batch_size, args.dev_batch_size],
        device=args.gpu_num,
        sort_key=lambda x: len(x.c_word))
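# A minimal sketch answering the questions that were in the comments above:
# how data.NestedField composes an outer (word) tokenizer with an inner
# (character) field. Assumes NLTK's word_tokenize, as in the snippet above;
# the demo function name is illustrative, not part of the original code.
def demo_nested_field_preprocess():
    char_nesting = data.Field(batch_first=True, tokenize=list, lower=True)
    char_field = data.NestedField(char_nesting, tokenize=word_tokenize)
    # The outer tokenizer splits the text into words; the nesting field
    # then tokenizes each word into its (lowercased) characters.
    tokens = char_field.preprocess("Char field")
    assert tokens == [['c', 'h', 'a', 'r'], ['f', 'i', 'e', 'l', 'd']]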
def test_pad_when_nesting_field_has_fix_length(self):
    nesting_field = data.Field(tokenize=list, unk_token="<cunk>",
                               pad_token="<cpad>", init_token="<w>",
                               eos_token="</w>", fix_length=5)
    CHARS = data.NestedField(nesting_field, init_token="<s>",
                             eos_token="</s>")
    minibatch = [["john", "loves", "mary"], ["mary", "cries"]]
    expected = [
        [
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 2,
            ["<w>"] + list("joh") + ["</w>"],
            ["<w>"] + list("lov") + ["</w>"],
            ["<w>"] + list("mar") + ["</w>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 2,
        ],
        [
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 2,
            ["<w>"] + list("mar") + ["</w>"],
            ["<w>"] + list("cri") + ["</w>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 2,
            ["<cpad>"] * 5,
        ]
    ]

    assert CHARS.pad(minibatch) == expected

    # test include_lengths
    nesting_field = data.Field(tokenize=list, unk_token="<cunk>",
                               pad_token="<cpad>", init_token="<w>",
                               eos_token="</w>", fix_length=5)
    CHARS = data.NestedField(nesting_field, init_token="<s>",
                             eos_token="</s>", include_lengths=True)
    arr, seq_len, words_len = CHARS.pad(minibatch)
    assert arr == expected
    assert seq_len == [5, 4]
    assert words_len == [[3, 5, 5, 5, 3], [3, 5, 5, 3, 0]]
def get_input_processor_words(vocab_word, vocab_char=None,
                              convert_digits=True):
    """
    Returns a function that converts text into a processed batch. Required
    during inference.

    Parameters:
        vocab_word: Instance of torchtext.Vocab for the input word vocabulary
        vocab_char (optional): Instance of torchtext.Vocab for the input
            per-word character vocabulary
        convert_digits: If True, converts numbers to single 0's
    """
    inputs_word = data.Field(
        init_token="<bos>", eos_token="<eos>", batch_first=True, lower=True,
        preprocessing=data.Pipeline(
            lambda w: '0' if convert_digits and w.isdigit() else w))

    # Set the vocab object manually without building from a training dataset
    inputs_word.vocab = vocab_word

    if vocab_char is not None:
        inputs_char_nesting = data.Field(tokenize=list, init_token="<bos>",
                                         eos_token="<eos>",
                                         batch_first=True)
        inputs_char = data.NestedField(inputs_char_nesting,
                                       init_token="<bos>",
                                       eos_token="<eos>")
        # Set the vocab object manually without building from a training
        # dataset
        inputs_char.vocab = inputs_char_nesting.vocab = vocab_char

        fields = [(('inputs_word', 'inputs_char'),
                   (inputs_word, inputs_char))]
    else:
        fields = [('inputs_word', inputs_word)]

    def input_processor_fn(inputs):
        if not isinstance(inputs, list):
            inputs = [inputs]
        examples = []
        for line in inputs:
            examples.append(data.Example.fromlist([line], fields))

        dataset = data.Dataset(examples, fields)
        # Entire input in one batch
        return data.Batch(
            data=dataset, dataset=dataset,
            device=torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu"))

    return input_processor_fn
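# A hypothetical usage sketch of the processor returned above. vocab_word
# and vocab_char are assumed to have been saved from an earlier training
# run; the function name and example sentences are illustrative only.
def demo_input_processor(vocab_word, vocab_char):
    process = get_input_processor_words(vocab_word, vocab_char)
    batch = process(["The quick brown fox", "jumped over the lazy dog"])
    # batch.inputs_word: word ids, shape (2, max_seq_len)
    # batch.inputs_char: char ids, shape (2, max_seq_len, max_word_len)
    return batch.inputs_word, batch.inputs_char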
def __init__(self, data_path, glove_size, batch_size,
             train_file='train.csv', dev_file='dev.csv'):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Defining the Fields
    self.RAW = data.RawField(is_target=False)
    self.WORDS = data.Field(batch_first=True, tokenize=post_ptbtokenizer,
                            lower=True, include_lengths=True)
    self.CHAR = data.NestedField(data.Field(batch_first=True,
                                            tokenize=list, lower=True),
                                 tokenize=post_ptbtokenizer)
    self.INDEX = data.Field(sequential=False, unk_token=None,
                            use_vocab=False)

    fields = {
        'id': ('id', self.RAW),
        'context_ptb_tok': [('context_words', self.WORDS),
                            ('context_char', self.CHAR)],
        'question_ptb_tok': [('question_words', self.WORDS),
                             ('question_char', self.CHAR)],
        'answer_ptb_tok': [('answer_words', self.WORDS),
                           ('answer_char', self.CHAR)],
        'start_idx': ('start_idx', self.INDEX),
        'end_idx': ('end_idx', self.INDEX)
    }

    print('Loading CSV Data Into Torch Tabular Dataset')
    self.train, self.dev = data.TabularDataset.splits(path=data_path,
                                                      train=train_file,
                                                      validation=dev_file,
                                                      format='csv',
                                                      fields=fields)

    print('Building Vocabulary')
    self.CHAR.build_vocab(self.train, self.dev)
    self.WORDS.build_vocab(self.train, self.dev,
                           vectors=GloVe(name='6B', dim=glove_size))

    print('Creating Iterators')
    self.train_iter = PreprocessData.create_train_iterator(
        self.train, device, batch_size)
    self.dev_iter = PreprocessData.create_dev_iterator(
        self.dev, device, batch_size)
def read_files(args):
    target_path = args.target_path
    if args.is_from_scratch:
        plausible_path = args.plausible_path
        implausible_path = args.implausible_path
        prepare_tsv(plausible_path, implausible_path, target_path,
                    option='combined')

    nesting_field = data.Field(batch_first=True,
                               tokenize=word_tokenizer,
                               unk_token='<unk>',
                               include_lengths=False,
                               sequential=True,
                               fix_length=args.word_max_len)
    text_field = data.NestedField(nesting_field,
                                  tokenize=sent_tokenize,
                                  fix_length=args.sent_max_len)
    label_field = data.Field(sequential=False, use_vocab=False,
                             batch_first=True, dtype=torch.float)
    fields = [('text', text_field), ('label', label_field)]

    train_path = os.path.join(target_path, 'train.tsv')
    logger.debug('Reading training samples from {}'.format(train_path))
    train = PlausibleDataset(path=train_path,
                             format='tsv',
                             skip_header=True,
                             fields=fields)

    dev_path = os.path.join(target_path, 'dev.tsv')
    logger.debug('Reading dev samples from {}'.format(dev_path))
    dev = PlausibleDataset(path=dev_path,
                           format='tsv',
                           skip_header=True,
                           fields=fields)

    test_path = os.path.join(target_path, 'test.tsv')
    logger.debug('Reading test samples from {}'.format(test_path))
    test = PlausibleDataset(path=test_path,
                            format='tsv',
                            skip_header=True,
                            fields=fields)

    logging.info('Initializing the vocabulary...')
    text_field.build_vocab(train,
                           max_size=args.max_vocab_size,
                           vectors=get_embeddings(args.embedding_name),
                           unk_init=torch.Tensor.normal_)

    return train, dev, test, text_field, label_field
def test_build_vocab_from_iterable(self):
    nesting_field = data.Field(unk_token="<cunk>", pad_token="<cpad>")
    CHARS = data.NestedField(nesting_field)
    CHARS.build_vocab(
        [[list("aaa"), list("bbb"), ["c"]], [list("bbb"), list("aaa")]],
        [[list("ccc"), list("bbb")], [list("bbb")]],
    )

    expected = "a b c <cunk> <cpad>".split()
    assert len(CHARS.vocab) == len(expected)
    for c in expected:
        assert c in CHARS.vocab.stoi
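# Worth noting, and relied on by get_input_processor_words earlier in this
# section (which assigns one vocab to both fields): NestedField.build_vocab
# shares the resulting vocab object with its nesting field. A small sketch
# (the function name is illustrative; CHARS is a built NestedField as above).
def demo_shared_vocab(CHARS):
    assert CHARS.vocab is CHARS.nesting_field.vocab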
def test_pad_when_nesting_field_is_not_sequential(self):
    nesting_field = data.Field(sequential=False, unk_token="<cunk>",
                               pad_token="<cpad>", init_token="<w>",
                               eos_token="</w>")
    CHARS = data.NestedField(nesting_field, init_token="<s>",
                             eos_token="</s>")
    minibatch = [
        ["john", "loves", "mary"],
        ["mary", "cries"]
    ]
    expected = [
        ["<s>", "john", "loves", "mary", "</s>"],
        ["<s>", "mary", "cries", "</s>", "<pad>"],
    ]

    assert CHARS.pad(minibatch) == expected
def test_build_vocab_from_dataset(self):
    nesting_field = data.Field(tokenize=list, unk_token="<cunk>",
                               pad_token="<cpad>", init_token="<w>",
                               eos_token="</w>")
    CHARS = data.NestedField(nesting_field, init_token="<s>",
                             eos_token="</s>")
    ex1 = data.Example.fromlist(["aaa bbb c"], [("chars", CHARS)])
    ex2 = data.Example.fromlist(["bbb aaa"], [("chars", CHARS)])
    dataset = data.Dataset([ex1, ex2], [("chars", CHARS)])

    CHARS.build_vocab(dataset, min_freq=2)

    expected = "a b <w> </w> <s> </s> <cunk> <cpad>".split()
    assert len(CHARS.vocab) == len(expected)
    for c in expected:
        assert c in CHARS.vocab.stoi
def test_build_vocab(self):
    nesting_field = data.Field(tokenize=list, init_token="<w>",
                               eos_token="</w>")

    field = data.NestedField(nesting_field, init_token='<s>',
                             eos_token='</s>',
                             include_lengths=True,
                             pad_first=True)

    sources = [
        [['a'], ['s', 'e', 'n', 't', 'e', 'n', 'c', 'e'], ['o', 'f'],
         ['d', 'a', 't', 'a'], ['.']],
        [['y', 'e', 't'], ['a', 'n', 'o', 't', 'h', 'e', 'r']],
        [['o', 'n', 'e'], ['l', 'a', 's', 't'], ['s', 'e', 'n', 't']]
    ]

    field.build_vocab(sources, vectors='glove.6B.50d',
                      unk_init=init.normal_, vectors_cache=".vector_cache")
def load_iters(batch_size=32, device="cpu", data_path='data', vectors=None,
               word2lower=True):
    zero_char_in_word = lambda ex: [re.sub(r'\d', '0', w) for w in ex]
    zero_char = lambda w: [re.sub(r'\d', '0', c) for c in w]

    WORD_TEXT = data.Field(lower=word2lower, batch_first=True,
                           include_lengths=True,
                           preprocessing=zero_char_in_word)
    CHAR_NESTING = data.Field(
        tokenize=list, preprocessing=zero_char)  # split a word into chars
    CHAR_TEXT = data.NestedField(CHAR_NESTING)
    LABEL = data.Field(unk_token=None, pad_token="O", batch_first=True)

    train_data = ConllDataset(WORD_TEXT, CHAR_TEXT, LABEL,
                              os.path.join(data_path, "train.txt"))
    dev_data = ConllDataset(WORD_TEXT, CHAR_TEXT, LABEL,
                            os.path.join(data_path, "dev.txt"))
    test_data = ConllDataset(WORD_TEXT, CHAR_TEXT, LABEL,
                             os.path.join(data_path, "test.txt"))

    if vectors is not None:
        WORD_TEXT.build_vocab(train_data.word, vectors=vectors,
                              unk_init=unk_init)
    else:
        WORD_TEXT.build_vocab(train_data.word)
    CHAR_TEXT.build_vocab(train_data.char)
    LABEL.build_vocab(train_data.label)

    train_iter, dev_iter = BucketIterator.splits(
        (train_data, dev_data),
        batch_sizes=(batch_size, batch_size),
        device=device,
        sort_key=lambda x: len(x.word),
        sort_within_batch=True,
        repeat=False,
        shuffle=True)
    test_iter = Iterator(test_data, batch_size=batch_size, device=device,
                         sort=False, sort_within_batch=False,
                         repeat=False, shuffle=False)
    return train_iter, dev_iter, test_iter, WORD_TEXT, CHAR_TEXT, LABEL
def cadec(self, opt, tag_type='ner'):
    """
    cadec: CADEC (Parser only. You must place the files.)
    Extract the CADEC dataset using torchtext.
    """
    logger.info('---------- CADEC = %s ---------' % (tag_type))
    train_file = mapping_files[opt.lang]

    # Set up fields with the batch dimension first
    inputs_word = data.Field(
        batch_first=True,
        fix_length=opt.maxlen,
        lower=opt.lower,
        preprocessing=data.Pipeline(
            lambda w: '0' if opt.convert_digits and w.isdigit() else w))

    inputs_char_nesting = data.Field(tokenize=list, batch_first=True,
                                     fix_length=opt.maxlen)
    inputs_char = data.NestedField(inputs_char_nesting)

    inputs_case = data.Field(
        batch_first=True,
        fix_length=opt.maxlen,
        preprocessing=data.Pipeline(lambda w: self.getCasing(w)))

    labels = data.Field(batch_first=True, unk_token=None,
                        fix_length=opt.maxlen)
    # pad_token=None,
    # preprocessing=data.Pipeline(lambda w: labels_map[w]))

    id = data.Field(batch_first=True, use_vocab=False)

    self.fields = ([(('inputs_word', 'inputs_char', 'inputs_case'),
                     (inputs_word, inputs_char, inputs_case))] +
                   [('labels', labels) if label == tag_type
                    else (None, None) for label in ['ner']] +
                   [('id', id)])

    # Load the data
    datafile = NERDataset.splits(path='.', train=train_file,
                                 separator='\t', encoding='utf-8',
                                 fields=tuple(self.fields))[0]
    self.train, self.val, self.test = datafile.split(
        split_ratio=[5610, 1000, 1000])

    return inputs_word, inputs_char, inputs_case, labels
def __init__(self, args): print("args=",args) path = './data/squad' logging.info("Preprocessing Data - First Phase :: Reading And Transforming") self.preprocess('{}/{}'.format(path, args.train_file),draft=args.draft) self.preprocess('{}/{}'.format(path, args.dev_file),draft=args.draft) self.RAW = data.RawField(); self.RAW.is_target = False self.CHAR_NESTING = data.Field(batch_first = True, tokenize = list, lower=True) self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize = word_tokenize) self.WORD = data.Field(batch_first = True, tokenize = word_tokenize, lower = True, include_lengths = True) self.LABEL = data.Field(sequential = False, unk_token = None, use_vocab = False) dict_fields = {'qid' : ('qid', self.RAW), 'start_idx': ('start_idx', self.LABEL), 'end_idx' : ('end_idx', self.LABEL), 'context' : [('c_word', self.WORD), ('c_char', self.CHAR)], 'question' : [('q_word', self.WORD), ('q_char', self.CHAR)]} logging.info("Preprocessing Data - Second Phase :: To Torchtext") self.train, self.dev = data.TabularDataset.splits(path=path, train=args.train_file + 'l', \ validation=args.dev_file + 'l', format='json', fields=dict_fields) if args.max_token_len > 0: self.train.examples = [e for e in self.train.examples if len(e.c_word) <= args.max_token_len] logging.info("Preprocessing Data - Third Phase :: Building Vocabulary") self.CHAR.build_vocab(self.train, self.dev) self.WORD.build_vocab(self.train, self.dev, vectors=GloVe(name='6B', dim=args.word_dim)) logging.info("Preprocessing Data - Fourth Phase :: Building Itertors") device = torch.device("cuda:{}".format(args.GPU) if torch.cuda.is_available() else "cpu") self.train_iter = data.BucketIterator(dataset = self.train, batch_size = args.train_batch_size, \ sort_key = lambda x : len(x.c_word), device=device) self.dev_iter = data.BucketIterator(dataset = self.dev, batch_size = args.dev_batch_size, sort_key = lambda x : len(x.c_word), device=device)