def prepare_fields(pad_t):
    WORD_field = data.Field(use_vocab=False, batch_first=True,
                            sequential=True, pad_token=pad_t)
    WORD_nested_field = NestedField(
        data.Field(use_vocab=False, batch_first=True, sequential=True,
                   pad_token=pad_t))
    PAD_field = data.Field(use_vocab=False, batch_first=True,
                           sequential=True, pad_token=0)
    PAD_nested_field = NestedField(
        data.Field(use_vocab=False, batch_first=True, sequential=True,
                   pad_token=0))
    MASK_nested_field = NestedField(
        data.Field(use_vocab=False, batch_first=True, sequential=True,
                   pad_token=1.))
    fields = {
        'id': data.RawField(),
        'question': data.RawField(),
        'answers': data.RawField(),
        'src': WORD_nested_field,
        'src_mask': PAD_nested_field,
        'doc_mask': MASK_nested_field,
        'target': WORD_field,
        'target_mask': PAD_field,
    }
    return fields
def eval_input_fn():
    path, exts = find_path_and_exts(filenames[0], filenames[1])
    SRC_TEXT = data.RawField(
        preprocessing=lambda s: preprocessing(s, params),
        postprocessing=lambda s: postprocessing(s, params))
    TRG_TEXT = data.RawField(
        preprocessing=lambda s: preprocessing(s, params, add_bos=True),
        postprocessing=lambda s: postprocessing(s, params))
    LABEL_TEXT = data.RawField(
        preprocessing=lambda s: preprocessing(s, params),
        postprocessing=lambda s: postprocessing(s, params))
    MASK = data.Field(sequential=False, use_vocab=False)
    fields = [("source", SRC_TEXT), ("source_mask", MASK),
              ("target", TRG_TEXT), ("target_mask", MASK),
              ("label", LABEL_TEXT)]
    dataset = TranslationDataset(path, exts, fields, mode="eval")
    iterator = MTIterator(dataset, params.decode_batch_size, params,
                          mode="eval", continuous=True, sort=False,
                          shuffle=False)
    return iterator
def prepare_fields(pad_t, encode_like_dpr=False):
    WORD_field = data.Field(use_vocab=False, batch_first=True,
                            sequential=True, pad_token=pad_t,
                            fix_length=256 if encode_like_dpr else None)
    return [('id', data.RawField()),
            ('raw_question', data.RawField()),
            ('input', WORD_field),
            ('segment_mask', data.Field(use_vocab=False, batch_first=True,
                                        sequential=True, pad_token=0,
                                        fix_length=256 if encode_like_dpr else None)),
            ('input_mask', data.Field(use_vocab=False, batch_first=True,
                                      sequential=True,
                                      pad_token=float(encode_like_dpr),
                                      fix_length=256 if encode_like_dpr else None)),
            ('pos', data.Field(sequential=False, use_vocab=False,
                               batch_first=True, is_target=True)),
            ('hard_neg', data.Field(sequential=False, use_vocab=False,
                                    batch_first=True, is_target=True)),
            ('answers', data.RawField()),
            ('human_answer', data.RawField())]
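# Usage sketch for the prepare_fields variant above (not part of the original
# code): assumes `data` is torchtext's legacy data module, that 0 is the
# tokenizer's pad id, and the example values below are made up.
fields = prepare_fields(pad_t=0)
example = data.Example.fromlist(
    ['q-1',                       # id
     'what is torchtext?',        # raw_question
     [101, 2054, 2003, 102],      # input (token ids)
     [0, 0, 0, 0],                # segment_mask
     [1., 1., 1., 1.],            # input_mask
     3,                           # pos
     7,                           # hard_neg
     ['an answer'],               # answers
     'a human answer'],           # human_answer
    fields)
dataset = data.Dataset([example], fields)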
def load_dataset(path, include_lengths=True, lower=False, stop_words=None,
                 load_raw=False, load_id=False, float_label=True):
    TEXT = data.Field(include_lengths=include_lengths, lower=lower,
                      stop_words=stop_words)
    label_type = torch.float if float_label else torch.long
    LABEL = data.LabelField(dtype=label_type)
    RAW = data.RawField()
    ID = data.RawField()
    fields = {'text': ('text', TEXT), 'label': ('label', LABEL)}
    if load_raw:
        fields['raw'] = ('raw', RAW)
        RAW.is_target = True
    if load_id:
        fields['id'] = ('id', ID)
        ID.is_target = True
    splits = data.TabularDataset.splits(
        path=path, train='train.json', validation='valid.json',
        test='test.json', format='json', fields=fields)
    return splits, (TEXT, LABEL, RAW, ID)
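# Usage sketch for the load_dataset above (not part of the original code):
# assumes `data` is torchtext's legacy data module and that train.json /
# valid.json / test.json exist under ./data.
(train, valid, test), (TEXT, LABEL, RAW, ID) = load_dataset(
    './data', load_raw=True, load_id=True)
TEXT.build_vocab(train, min_freq=2)
LABEL.build_vocab(train)
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train, valid, test), batch_size=32,
    sort_key=lambda x: len(x.text), sort_within_batch=True)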
def load_nli_dataset(path, lower=False, stop_words=None, load_raw=True,
                     load_id=True, float_label=False):
    TEXT = data.Field(lower=lower, stop_words=stop_words)
    label_type = torch.float if float_label else torch.long
    LABEL = data.LabelField(dtype=label_type)
    RAW = data.RawField()
    ID = data.RawField()
    fields = {'premise': ('premise', TEXT),
              'hypothesis': ('hypothesis', TEXT),
              'label': ('label', LABEL)}
    if load_raw:
        fields['raw_premise'] = ('raw_premise', RAW)
        fields['raw_hypothesis'] = ('raw_hypothesis', RAW)
    if load_id:
        fields['id'] = ('id', ID)
    splits = data.TabularDataset.splits(
        path=path, train='train.json', validation='valid.json',
        test='test.json', format='json', fields=fields)
    return splits, (TEXT, LABEL, RAW, ID)
def make_generate_data(config, input_src, input_mt):
    bert_path = config['bert']['path']
    tokenizer = BertTokenizer.from_pretrained(bert_path)
    unk_id = tokenizer.vocab['[UNK]']
    pad_id = tokenizer.vocab['[PAD]']
    unk_fun = lambda: unk_id
    vocab = defaultdict(unk_fun)
    for k, v in tokenizer.vocab.items():
        vocab[k] = v
    src_trg_field = data.Field(eos_token=None, pad_token=pad_id,
                               batch_first=True, include_lengths=True,
                               sequential=True, use_vocab=False)
    mask_field = data.RawField(
        postprocessing=tensorify(batch_fun, torch.float32))
    id_mask = data.RawField(postprocessing=tensorify(batch_fun, torch.long))
    test_data = TestMergeDataset(src_path=input_src, trg_path=input_mt,
                                 fields=(src_trg_field, mask_field, id_mask),
                                 bos_token='[CLS]', sep_token='[SEP]',
                                 vocab=vocab)
    return test_data
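# Usage sketch (not part of the original code): the config layout, BERT path
# and file names are made-up assumptions; TestMergeDataset, tensorify and
# batch_fun are the helpers referenced elsewhere in this module.
config = {'bert': {'path': 'bert-base-uncased'}}
test_data = make_generate_data(config, 'newstest.src.bpe', 'newstest.mt.bpe')
test_iter = data.Iterator(test_data, batch_size=32, train=False, sort=False)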
def mktestset(self, args):
    path = args.path.replace("train", 'test')
    fields = self.fields
    ds = data.TabularDataset(path=path, format='tsv', fields=fields)
    ds.fields["rawent"] = data.RawField()
    for x in ds:
        x.rawent = x.ent.split(" ; ")
        x.ent = self.vec_ents(x.ent, self.ENT)
        x.rel = self.mkGraphs(x.rel, len(x.ent[1]))
        if args.sparse:
            x.rel = (self.adjToSparse(x.rel[0]), x.rel[1])
        x.tgt = x.out
        x.out = [y.split("_")[0] + ">" if "_" in y else y for y in x.out]
        x.sordertgt = torch.LongTensor(
            [int(y) + 3 for y in x.sorder.split(" ")])
        x.sorder = [[int(z) for z in y.strip().split(" ") if len(z) > 0]
                    for y in x.sorder.split("-1")[:-1]]
    ds.fields["tgt"] = self.TGT
    ds.fields["rawent"] = data.RawField()
    ds.fields["sordertgt"] = data.RawField()
    dat_iter = data.Iterator(ds, 1, device=args.device,
                             sort_key=lambda x: len(x.src),
                             train=False, sort=False)
    return dat_iter
def __init__(self, args):
    self.args = args
    self.ID = data.RawField()
    self.PID = data.RawField()
    self.TEXT = data.Field(batch_first=True)
    self.POSITION = data.RawField()
def mkVocabs(self, args):
    args.path = args.datadir + args.data
    self.INP = data.Field(sequential=True, batch_first=True,
                          init_token="<start>", eos_token="<eos>",
                          include_lengths=True)
    self.OUTP = data.Field(sequential=True, batch_first=True,
                           init_token="<start>", eos_token="<eos>",
                           include_lengths=True)
    self.TGT = data.Field(sequential=True, batch_first=True,
                          init_token="<start>", eos_token="<eos>")
    self.ENT = data.RawField()
    self.REL = data.RawField()
    self.REL.is_target = False
    self.ENT.is_target = False
    self.fields = [("src", self.INP), ("ent", self.ENT), ("rel", self.REL),
                   ("tgt", self.TGT), ("out", self.OUTP)]
    train = data.TabularDataset(path=args.path, format='tsv',
                                fields=self.fields)
    print('building vocab')
    self.OUTP.build_vocab(train, min_freq=args.outunk)
    self.TGT.vocab = copy(self.OUTP.vocab)
    specials = zip(
        "method material otherscientificterm metric task".split(" "),
        range(40))
    for x in specials:
        s = "<" + x[0] + "_" + str(x[1]) + ">"
        self.TGT.vocab.stoi[s] = len(self.TGT.vocab.itos) + x[1]
    self.INP.build_vocab(train, min_freq=args.outunk)
    '''
    self.INP.vocab.stoi['<pad>'] = 0
    self.INP.vocab.stoi['<unk>'] = 1
    self.INP.vocab.itos = ['<pad>', '<unk>'] + self.INP.vocab.itos[2:]
    '''
    self.REL.special = ['<pad>', '<unk>', 'ROOT']
    with open(args.datadir + "/" + args.relvocab) as f:
        rvocab = [x.strip() for x in f.readlines()]
    self.REL.size = len(rvocab)
    rvocab += [x + "_inv" for x in rvocab]
    relvocab = self.REL.special + rvocab
    self.REL.itos = relvocab
    self.ENT.itos, self.ENT.stoi = self.build_ent_vocab(args.path)
    print('done')
def prepare_fields(self):
    return [
        ('id', data.RawField()),
        ('title', data.RawField()),
        ('psg', data.RawField()),
        ('label', data.Field(sequential=False, use_vocab=False,
                             batch_first=True, dtype=torch.float,
                             is_target=True)),
    ]
def _create_examples(self, tokenizer, data_dir):
    return PsychDataset(
        data.TabularDataset(
            data_dir, 'json', {
                'options-for-correct-answers':
                    ('utterance', data.RawField(
                        utterance_processor(
                            tokenizer,
                            speaker=self.agent[0].upper() if self.agent else None))),
                'messages-so-far':
                    ('context', data.RawField(context_processor(tokenizer)))
            }))
def load_dataset(batch_size, path_data):
    tokenize = lambda x: x.split()
    captions = data.Field(sequential=True, tokenize=tokenize, lower=True,
                          include_lengths=True, batch_first=True,
                          fix_length=1500)
    category = data.Field(sequential=False)
    channel_id = data.RawField()
    channel_id.is_target = False
    chunk = data.RawField()
    chunk.is_target = False
    video_id = data.RawField()
    video_id.is_target = False
    seeds_captions = data.TabularDataset(path=path_data, format='csv',
                                         skip_header=True,
                                         fields=[('captions', captions),
                                                 ('category', category),
                                                 ('channel_id', channel_id),
                                                 ('chunk', chunk),
                                                 ('video_id', video_id)])
    train_data, test_data = seeds_captions.split(split_ratio=0.8,
                                                 stratified=True,
                                                 strata_field='category')
    train_data, valid_data = train_data.split(split_ratio=0.8,
                                              stratified=True,
                                              strata_field='category')
    captions.build_vocab(train_data, test_data, valid_data,
                         vectors=GloVe(name='6B', dim=300))
    category.build_vocab(train_data, specials_first=False)
    vocab_size = len(captions.vocab)
    class_size = len(category.vocab) - 1
    print(vocab_size)
    train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train_data, valid_data, test_data), batch_size=batch_size,
        sort_key=lambda x: len(x.captions), repeat=False, shuffle=True)
    return (captions, category, train_iter, valid_iter, test_iter,
            vocab_size, class_size)
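# Usage sketch for the load_dataset above (not part of the original code):
# the CSV path is made up and `data` is torchtext's legacy data module.
# Because the captions field uses include_lengths=True, batch.captions is a
# (token_ids, lengths) pair.
captions, category, train_iter, valid_iter, test_iter, vocab_size, n_classes = \
    load_dataset(batch_size=32, path_data='seeds_captions.csv')
for batch in train_iter:
    tokens, lengths = batch.captions
    labels = batch.category
    break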
def iters(cls, config, **kwargs):
    """
    Create the iterator objects for splits of the SemEval dataset.

    :param config: Configuration object; batch_size, device, shuffle,
        vectors, cache and datasets_dir are read from it.
    :param kwargs: Extra keyword arguments forwarded to ``cls.splits``.
    :return: Train, validation and test ``BucketIterator`` objects.
    """
    vectors = vocab.Vectors(name=config.vectors, cache=config.cache)
    ID = data.RawField()
    TEXT = data.Field(batch_first=True, tokenize=lambda x: x, fix_length=20)
    TAG = data.Field(batch_first=True, tokenize=lambda x: x, fix_length=20)
    RAW = data.RawField()
    REL = data.Field(sequential=False, use_vocab=False, batch_first=True,
                     tensor_type=torch.FloatTensor,
                     postprocessing=data.Pipeline(get_class_probs))
    CONF = data.RawField()
    # TAG.preprocessing = shrink_chunk
    train, val, test = cls.splits(ID, TEXT, REL, CONF, RAW, TAG,
                                  root=config.datasets_dir, **kwargs)
    TEXT.build_vocab(train)
    config.n_embed = len(TEXT.vocab)
    config.d_embed = vectors.dim
    TEXT.vocab.load_vectors(vectors)
    config.weights = TEXT.vocab.vectors
    config.n_classes = 2
    return data.BucketIterator.splits((train, val, test),
                                      batch_size=config.batch_size,
                                      shuffle=config.shuffle,
                                      device=config.device,
                                      repeat=False)
def splits(cls, text_field, root_path, img_dir, **kwargs):
    train, valid, test = 'train.pkl', 'val.pkl', 'test.pkl'
    fields = {
        'image_id': ('image_id', data.RawField()),
        'img_1c_feat': ('img_1c_feat', data.RawField()),
        'img_to_load': ('img_to_load', data.RawField()),
        'caption': ('caption', text_field),
        'caption_id': ('caption_id', data.RawField()),
    }
    train_data = None if train is None else cls(root_path, img_dir, train,
                                                fields, True, **kwargs)
    val_data = None if valid is None else cls(root_path, img_dir, valid,
                                              fields, False, **kwargs)
    test_data = None if test is None else cls(root_path, img_dir, test,
                                              fields, False, **kwargs)
    return tuple(d for d in (train_data, val_data, test_data)
                 if d is not None)
def mkiters(self, train):
    args = self.args
    c = Counter([len(x.out) for x in train])
    t1, t2, t3 = [], [], []
    print("Sorting training data by len")
    for x in train:
        l = len(x.out)
        if l < 100:
            t1.append(x)
        elif l > 100 and l < 220:
            t2.append(x)
        else:
            t3.append(x)
    t1d = data.Dataset(t1, self.fields)
    t2d = data.Dataset(t2, self.fields)
    t3d = data.Dataset(t3, self.fields)
    valid = data.TabularDataset(path=args.path.replace("train", "val"),
                                format='tsv', fields=self.fields)
    print("ds sizes:", end='\t')
    for ds in [t1d, t2d, t3d, valid]:
        print(len(ds.examples), end='\t')
        for x in ds:
            x.rawent = x.ent.split(" ; ")
            x.ent = self.vec_ents(x.ent, self.ENT)
            x.rel = self.mkGraphs(x.rel, len(x.ent[1]))
            if args.sparse:
                x.rel = (self.adjToSparse(x.rel[0]), x.rel[1])
            x.tgt = x.out
            x.out = [y.split("_")[0] + ">" if "_" in y else y for y in x.out]
            x.sordertgt = torch.LongTensor(
                [int(y) + 3 for y in x.sorder.split(" ")])
            x.sorder = [[int(z) for z in y.strip().split(" ")]
                        for y in x.sorder.split("-1")[:-1]]
        ds.fields["tgt"] = self.TGT
        ds.fields["rawent"] = data.RawField()
        ds.fields["sordertgt"] = data.RawField()
    self.t1_iter = data.Iterator(t1d, args.t1size, device=args.device,
                                 sort_key=lambda x: len(x.out),
                                 repeat=False, train=True)
    self.t2_iter = data.Iterator(t2d, args.t2size, device=args.device,
                                 sort_key=lambda x: len(x.out),
                                 repeat=False, train=True)
    self.t3_iter = data.Iterator(t3d, args.t3size, device=args.device,
                                 sort_key=lambda x: len(x.out),
                                 repeat=False, train=True)
    self.val_iter = data.Iterator(valid, args.t3size, device=args.device,
                                  sort_key=lambda x: len(x.out),
                                  sort=False, repeat=False, train=False)
def preprocess(self):
    print("\nLoading data... ", end="", flush=True)
    process = MakeLabelVector()
    set_label_vector = process.set_label_vector
    get_label_vector = process.get_label_vector

    # Define fields for torchtext
    length = self.params["sequence_length"]
    self.ID = data.RawField(is_target=False)
    self.LABEL = data.RawField(set_label_vector, get_label_vector, True)
    self.TEXT = data.Field(sequential=True, lower=True, fix_length=length)
    fields = [
        ("id", self.ID),
        ("label", self.LABEL),
        ("text", self.TEXT),
    ]
    datasets = data.TabularDataset.splits(
        path="./",
        train=self.params["train_data_path"],
        validation=self.params["valid_data_path"],
        test=self.params["test_data_path"],
        format="tsv",
        fields=fields,
    )
    if self.params["params_search"]:
        self.train, self.valid = datasets
    else:
        self.train, self.valid, self.test = datasets
    print("Done.", flush=True)

    # Convert words to IDs
    print("Converting text to ID... ", end="", flush=True)
    if self.params["params_search"]:
        self.TEXT.build_vocab(self.train, self.valid)
    else:
        self.TEXT.build_vocab(self.train, self.valid, self.test)
    self.TEXT.vocab.load_vectors("glove.6B.300d")
    print("Done.\n", flush=True)

    # Add parameters that haven't yet been defined
    self.params["uniq_of_cat"] = process.uniq_of_cat
    self.params["num_of_class"] = len(process.uniq_of_cat)
def __init__(self, batch_size, word_dim):
    self.RAW = data.RawField()
    self.TEXT = data.Field(batch_first=True)
    self.LABEL = data.Field(sequential=False, unk_token=None)

    self.train, self.dev, self.test = data.TabularDataset.splits(
        path='.data/quora',
        train='train.tsv',
        validation='dev.tsv',
        test='test.tsv',
        format='tsv',
        fields=[('label', self.LABEL),
                ('q1', self.TEXT),
                ('q2', self.TEXT),
                ('id', self.RAW)])

    self.TEXT.build_vocab(self.train, self.dev, self.test,
                          vectors=GloVe(name='6B', dim=word_dim))
    self.LABEL.build_vocab(self.train)

    sort_key = lambda x: data.interleave_keys(len(x.q1), len(x.q2))

    self.train_iter, self.dev_iter, self.test_iter = \
        data.BucketIterator.splits((self.train, self.dev, self.test),
                                   device=-1,
                                   batch_sizes=[batch_size] * 3,
                                   sort_key=sort_key)

    self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
def make_test(self, args):
    path = args.path.replace("train", 'test')
    fields = self.fields
    dataset = data.TabularDataset(path=path, format='tsv', fields=fields)
    dataset.fields["rawent"] = data.RawField()
    dataset.fields["rawent"].is_target = False
    for row in dataset:
        row.rawent = row.ent.split(" ; ")
        # row.ent: tuple of ((# of entities in x, max entity len), (# of entities))
        row.ent = self.vectorize_entity(row.ent, self.ENT)
        # row.rel: tuple of (adj, rel)
        row.rel = self.make_graph(row.rel, len(row.ent[1]))
        row.tgt = row.out
        row.out = [
            token.split("_")[0] + ">" if "_" in token else token
            for token in row.out
        ]
    dataset.fields["tgt"] = self.TARGET
    test_iter = data.Iterator(dataset, 1, device=args.device,
                              sort_key=lambda x: len(x.title),
                              train=False, sort=False)
    return test_iter
def __init__(self, args):
    self.RAW = data.RawField()
    self.TEXT = data.Field(batch_first=True)
    self.LABEL = data.Field(sequential=False, unk_token=None)

    self.train, self.dev, self.test = data.TabularDataset.splits(
        path='.data/trecqa',
        train='train.tsv',
        validation='dev.tsv',
        test='test.tsv',
        format='tsv',
        fields=[('label', self.LABEL),
                ('s1', self.TEXT),
                ('s2', self.TEXT)])

    self.TEXT.build_vocab(self.train, self.dev, self.test,
                          vectors=GloVe(name='840B', dim=300))
    self.LABEL.build_vocab(self.train)

    sort_key = lambda x: data.interleave_keys(len(x.s1), len(x.s2))

    self.train_iter, self.dev_iter, self.test_iter = \
        data.BucketIterator.splits((self.train, self.dev, self.test),
                                   batch_sizes=[args.batch_size] * 3,
                                   device=args.gpu,
                                   sort_key=sort_key)

    self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
    # for <pad>
    self.char_vocab = {'': 0}
    # for <unk> and <pad>
    self.characterized_words = [[0] * self.max_word_len,
                                [0] * self.max_word_len]

    if args.use_char_emb:
        self.build_char_vocab()
def __init__(self, args):
    self.RAW = data.RawField(is_target=False)
    self.TEXT = data.Field(batch_first=True, tokenize='spacy', lower=True)
    # self.LABEL = data.Field(sequential=False, unk_token=None)
    self.LABEL = data.LabelField()

    self.train, self.dev, self.test = data.TabularDataset.splits(
        path='/media/fch/Data/leo/text-similarity/data/quora',
        train='train.tsv',
        validation='dev.tsv',
        test='test.tsv',
        format='tsv',
        fields=[('label', self.LABEL),
                ('q1', self.TEXT),
                ('q2', self.TEXT),
                ('id', self.RAW)])

    vectors = Vectors(
        name='/media/fch/Data/leo/text-similarity/glove/glove.840B.300d.txt',
        cache='/media/fch/Data/leo/text-similarity/.vector_cache')
    self.TEXT.build_vocab(self.train, self.dev, self.test, vectors=vectors)
    self.LABEL.build_vocab(self.train)

    sort_key = lambda x: data.interleave_keys(len(x.q1), len(x.q2))

    self.train_iter, self.dev_iter, self.test_iter = \
        data.BucketIterator.splits((self.train, self.dev, self.test),
                                   batch_sizes=[args.batch_size] * 3,
                                   device=args.device,
                                   sort_key=sort_key)

    self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
def __init__(self, src_path, trg_path, fields, bos_token='[CLS]',
             sep_token='[SEP]', vocab=None, **kwargs):
    src_len = data.RawField(
        postprocessing=tensorify(identity_fun, torch.long))
    trg_len = data.RawField(
        postprocessing=tensorify(identity_fun, torch.long))
    attention_mask = data.RawField(
        postprocessing=tensorify(batch_fun, torch.long))
    if not isinstance(fields[0], (tuple, list)):
        fields = [('src_trg', fields[0]), ('label_mask', fields[1]),
                  ('id_mask', fields[2]), ('src_len', src_len),
                  ('trg_len', trg_len), ('attention_mask', attention_mask)]
    examples = []
    with open(src_path) as src_file, open(trg_path) as trg_file:
        for src_line, trg_line in zip(src_file, trg_file):
            src_line, trg_line = (src_line.strip().split(" "),
                                  trg_line.strip().split(" "))
            src_line = ([vocab[bos_token]] + [vocab[x] for x in src_line] +
                        [vocab[sep_token]])
            trg_line = [vocab[x] for x in trg_line]
            label_mask = [0.0] * len(src_line) + [1.0] * len(trg_line)
            if src_line != '' and trg_line != '':
                merged_line = src_line + trg_line
                att_mask = [1] * len(merged_line)
                id_mask = [0] * len(src_line) + [1] * len(trg_line)
                assert len(merged_line) == len(id_mask)
                assert len(label_mask) == len(merged_line)
                examples.append(
                    data.Example.fromlist([
                        merged_line, label_mask, id_mask,
                        len(src_line), len(trg_line), att_mask
                    ], fields))
    super(TestMergeDataset, self).__init__(examples, fields, **kwargs)
def __init__(self, args):
    path = '.data/squad'
    dataset_path = path + '/torchtext/'
    train_examples_path = dataset_path + 'train_examples.pt'
    dev_examples_path = dataset_path + 'dev_examples.pt'

    self.RAW = data.RawField()
    self.CHAR_NESTING = data.Field(batch_first=True, tokenize=list,
                                   lower=True)
    self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize)
    self.WORD = data.Field(batch_first=True, tokenize=word_tokenize,
                           lower=True, include_lengths=True)
    self.LABEL = data.Field(sequential=False, unk_token=None,
                            use_vocab=False)

    dict_fields = {
        'id': ('id', self.RAW),
        's_idx': ('s_idx', self.LABEL),
        'e_idx': ('e_idx', self.LABEL),
        'context': [('c_word', self.WORD), ('c_char', self.CHAR)],
        'question': [('q_word', self.WORD), ('q_char', self.CHAR)]
    }
    list_fields = [('id', self.RAW), ('s_idx', self.LABEL),
                   ('e_idx', self.LABEL), ('c_word', self.WORD),
                   ('c_char', self.CHAR), ('q_word', self.WORD),
                   ('q_char', self.CHAR)]

    if os.path.exists(dataset_path):
        print("loading splits...")
        dev_examples = torch.load(dev_examples_path)
        self.dev = data.Dataset(examples=dev_examples, fields=list_fields)
    else:
        print("building splits...")
        self.dev = data.TabularDataset(path=path + '/dev-v1.1.jsonl',
                                       format='json', fields=dict_fields)
        os.makedirs(dataset_path)
        torch.save(self.dev.examples, dev_examples_path)

    print("building vocab...")
    self.CHAR.build_vocab(self.dev, min_freq=10000)
    self.WORD.build_vocab(self.dev,
                          vectors=GloVe(name='6B', dim=args.word_dim),
                          max_size=80000)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.dev_iter = data.BucketIterator(self.dev, batch_size=60,
                                        device=device, sort=True,
                                        sort_key=lambda x: len(x.c_word))
def get_fields(cls, text_field, label_field, with_genre=False):
    fields = {
        'label': ('label', label_field),
        'sentence1': ('premise', text_field),
        'sentence2': ('hypothesis', text_field)
    }
    if with_genre:
        fields['genre'] = ('genre', data.RawField())
    return fields
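# Usage sketch (not part of the original code): `NLIDataset` is a made-up
# stand-in for the class that defines get_fields(), and the jsonl path is an
# assumption.
TEXT = data.Field(batch_first=True, lower=True)
LABEL = data.LabelField()
fields = NLIDataset.get_fields(TEXT, LABEL, with_genre=True)
train = data.TabularDataset(path='multinli_train.jsonl', format='json',
                            fields=fields)
TEXT.build_vocab(train)
LABEL.build_vocab(train)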
def __init__(self, args):
    path = './data/squad'
    dataset_path = path + '/torchtext/'
    train_examples_path = dataset_path + 'train_examples.pt'
    dev_examples_path = dataset_path + 'dev_examples.pt'

    print("[+] Preprocessing data files..")
    # preprocess_file() writes its output to the same name with a trailing
    # 'l' (e.g. train-v1.1.json -> train-v1.1.jsonl), which is why the
    # existence check and the split paths below append an 'l'.
    if not os.path.exists(f'{path}/{args.train_file}l'):
        self.preprocess_file(f'{path}/{args.train_file}')
    if not os.path.exists(f'{path}/{args.dev_file}l'):
        self.preprocess_file(f'{path}/{args.dev_file}')

    self.RAW = data.RawField()
    # tokenize=list splits a word into characters, e.g. 'char' -> ['c', 'h', 'a', 'r']
    self.CHAR_NESTING = data.Field(batch_first=True, tokenize=list,
                                   lower=True)
    # NestedField wraps the character field: word_tokenize splits the text
    # into words, and CHAR_NESTING then splits each word into characters.
    self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize)
    self.WORD = data.Field(batch_first=True, tokenize=word_tokenize,
                           lower=True, include_lengths=True)
    self.LABEL = data.Field(sequential=False, unk_token=None,
                            use_vocab=False)

    dict_field = {
        'id': ('id', self.RAW),
        's_idx': ('s_idx', self.LABEL),
        'e_idx': ('e_idx', self.LABEL),
        'context': [('c_word', self.WORD), ('c_char', self.CHAR)],
        'questions': [('q_word', self.WORD), ('q_char', self.CHAR)]
    }
    list_field = [('id', self.RAW), ('s_idx', self.LABEL),
                  ('e_idx', self.LABEL), ('c_word', self.WORD),
                  ('c_char', self.CHAR), ('q_word', self.WORD),
                  ('q_char', self.CHAR)]

    if os.path.exists(dataset_path):
        print("[+] Loading splits....")
        train_examples = torch.load(train_examples_path)
        dev_examples = torch.load(dev_examples_path)
        self.train = data.Dataset(examples=train_examples, fields=list_field)
        self.dev = data.Dataset(examples=dev_examples, fields=list_field)
    else:
        print('[+] building splits...')
        self.train, self.dev = data.TabularDataset.splits(
            path=path,
            train=f'{args.train_file}l',
            validation=f'{args.dev_file}l',
            format='json',
            fields=dict_field)
        os.makedirs(dataset_path)
        torch.save(self.train.examples, train_examples_path)
        torch.save(self.dev.examples, dev_examples_path)

    # cut too long contexts in the training set for efficiency
    if args.context_threshold > 0:
        self.train.examples = [e for e in self.train.examples
                               if len(e.c_word) <= args.context_threshold]

    print("building iterators...")
    self.train_iter, self.dev_iter = \
        data.BucketIterator.splits((self.train, self.dev),
                                   batch_sizes=[args.train_batch_size,
                                                args.dev_batch_size],
                                   device=args.gpu_num,
                                   sort_key=lambda x: len(x.c_word))
def main():
    global WORD
    WORD = data.Field(include_lengths=True, batch_first=True,
                      eos_token=None, init_token=None)
    LABEL = data.Field(sequential=False, batch_first=True)
    TREE = data.RawField(postprocessing=ListOpsDataset.tree_field(WORD))
    TREE.is_target = False
    train = ListOpsDataset(
        "data/train_d20s.tsv",
        (("word", WORD), ("label", LABEL), ("tree", TREE)),
        filter_pred=lambda x: 5 < len(x.word) < config["train_len"],
    )
    WORD.build_vocab(train)
    LABEL.build_vocab(train)
    valid = ListOpsDataset(
        "data/test_d20s.tsv",
        (("word", WORD), ("label", LABEL), ("tree", TREE)),
        filter_pred=lambda x: 5 < len(x.word) < 150,
    )

    train_iter = TokenBucket(train, batch_size=1500, device="cuda:0",
                             key=lambda x: len(x.word))
    train_iter.repeat = False
    # iterate over the held-out split for validation
    valid_iter = data.BucketIterator(valid, batch_size=50, train=False,
                                     sort=False, device="cuda:0")

    NT = 1
    T = len(WORD.vocab)
    V = T

    if True:
        tree_lstm = TreeLSTM(config["H"], len(WORD.vocab) + 100,
                             len(LABEL.vocab)).cuda()
        for p in tree_lstm.parameters():
            if p.dim() > 1:
                torch.nn.init.xavier_uniform_(p)
        model = SpanLSTM(NT, len(WORD.vocab), config["H"]).cuda()
        for p in model.parameters():
            if p.dim() > 1:
                torch.nn.init.xavier_uniform_(p)
        wandb.watch((model, tree_lstm))
        print(wandb.config)
        tree = run_train(train_iter, valid_iter, model, tree_lstm, V)
    else:
        print("loading")
        model, tree_lstm = torch.load("cp.yoyo.model")
        print(valid_sup(valid_iter, model, tree_lstm, V))
def __init__(self, path, text_field, label_field, keepneutral=False,
             neutral=None, size=None, shuffle=True, **kwargs):
    # Get the Stanford Sentiment140 dataset fields
    SENT_ID = data.RawField()
    DATE = data.RawField()
    QUERY = data.RawField()
    USER = data.RawField()
    fields = [("label", label_field), ("id", SENT_ID), ("date", DATE),
              ("query", QUERY), ("user", USER), ("text", text_field)]

    # Create the torchtext dataset for all examples
    examples = []
    df = pd.read_csv(
        path,
        encoding='latin-1',
        header=0,
        names=["label", "id", "date", "query", "user", "text"])
    if shuffle:
        df = df.sample(frac=1)
    if neutral is not None:
        df_neutral = pd.read_csv(neutral, index_col=0, header=None).T
        df_neutral.columns = ["text"]
        df_neutral["label"] = [2] * len(df_neutral)
        df = pd.concat([df_neutral, df])
    for (_, entry) in df.iloc[0:size].iterrows():
        if not keepneutral and entry["label"] == 2:
            continue
        example = data.Example.fromlist(entry, fields)
        examples.append(example)
    super(Sentiment140, self).__init__(examples, fields, **kwargs)
def __init__(self, data_path, glove_size, batch_size, train_file='train.csv',
             dev_file='dev.csv'):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Defining the Fields
    self.RAW = data.RawField(is_target=False)
    self.WORDS = data.Field(batch_first=True, tokenize=post_ptbtokenizer,
                            lower=True, include_lengths=True)
    self.CHAR = data.NestedField(data.Field(batch_first=True, tokenize=list,
                                            lower=True),
                                 tokenize=post_ptbtokenizer)
    self.INDEX = data.Field(sequential=False, unk_token=None,
                            use_vocab=False)
    fields = {
        'id': ('id', self.RAW),
        'context_ptb_tok': [('context_words', self.WORDS),
                            ('context_char', self.CHAR)],
        'question_ptb_tok': [('question_words', self.WORDS),
                             ('question_char', self.CHAR)],
        'answer_ptb_tok': [('answer_words', self.WORDS),
                           ('answer_char', self.CHAR)],
        'start_idx': ('start_idx', self.INDEX),
        'end_idx': ('end_idx', self.INDEX)
    }

    print('Loading CSV Data Into Torch Tabular Dataset')
    self.train, self.dev = data.TabularDataset.splits(path=data_path,
                                                      train=train_file,
                                                      validation=dev_file,
                                                      format='csv',
                                                      fields=fields)

    print('Building Vocabulary')
    self.CHAR.build_vocab(self.train, self.dev)
    self.WORDS.build_vocab(self.train, self.dev,
                           vectors=GloVe(name='6B', dim=glove_size))

    print('Creating Iterators')
    self.train_iter = PreprocessData.create_train_iterator(
        self.train, device, batch_size)
    self.dev_iter = PreprocessData.create_dev_iterator(
        self.dev, device, batch_size)
def __init__(self, args):
    if args.datastories:
        tokenizer = SocialTokenizer(lowercase=True)
    else:
        tokenizer = TweetTokenizer()

    self.RAW = data.RawField()
    self.TEXT = data.Field(batch_first=True, include_lengths=True,
                           lower=True, tokenize=tokenizer.tokenize)
    self.LABEL = data.Field(sequential=False, unk_token=None)

    self.train, self.dev, self.test = datasets.EMO.splits(
        args, self.RAW, self.TEXT, self.LABEL, args.train_data_path,
        args.valid_data_path, args.test_data_path)

    self.TEXT.build_vocab(self.train, self.dev, self.test,
                          vectors=GloVe(name='840B', dim=300))
    if args.fasttext:
        self.FASTTEXT = data.Field(batch_first=True, include_lengths=True,
                                   lower=True, tokenize=tokenizer.tokenize)
        self.FASTTEXT.vocab = copy.deepcopy(self.TEXT.vocab)
        self.FASTTEXT.vocab.set_vectors(self.FASTTEXT.vocab.stoi,
                                        vectors=FastText(language='en'),
                                        dim=300)
    self.LABEL.build_vocab(self.train)

    self.train_iter, self.dev_iter, self.test_iter = \
        data.BucketIterator.splits((self.train, self.dev, self.test),
                                   batch_size=args.batch_size,
                                   device=args.device,
                                   repeat=False)

    self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
    # for <pad>
    self.char_vocab = {'': 0}
    # for <unk> and <pad>
    self.characterized_words = [[0] * self.max_word_len,
                                [0] * self.max_word_len]

    if args.char_emb:
        self.build_char_vocab()

    filehandler = open('./data/vocab.obj', 'wb')
    pickle.dump(self.TEXT.vocab, filehandler)
    filehandler = open('./data/label.obj', 'wb')
    pickle.dump(self.LABEL.vocab, filehandler)
def load_dataset_for_transformer(path, tokenizer, lower=False,
                                 stop_words=None, load_raw=True,
                                 load_id=True, float_label=True,
                                 max_len=512):
    postpro = lambda xs, _: [tokenizer.convert_tokens_to_ids(x[:max_len])
                             for x in xs]
    TEXT = data.Field(use_vocab=False, postprocessing=postpro,
                      pad_token=tokenizer.pad_token_id, lower=lower,
                      stop_words=stop_words)
    label_type = torch.float if float_label else torch.long
    LABEL = data.LabelField(dtype=label_type)
    RAW = data.RawField()
    ID = data.RawField()
    fields = {'text': ('text', TEXT), 'label': ('label', LABEL)}
    if load_raw:
        fields['raw'] = ('raw', RAW)
        RAW.is_target = True
    if load_id:
        fields['id'] = ('id', ID)
        ID.is_target = True
    splits = data.TabularDataset.splits(
        path=path, train='train.json', validation='valid.json',
        test='test.json', format='json', fields=fields)
    return splits, (TEXT, LABEL, RAW, ID)
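# Usage sketch for load_dataset_for_transformer (not part of the original
# code): assumes the `transformers` package is available, that `data` is
# torchtext's legacy data module, and that the JSON 'text' field already
# holds WordPiece-tokenizable text under ./data.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
(train, valid, test), (TEXT, LABEL, RAW, ID) = load_dataset_for_transformer(
    './data', tokenizer)
LABEL.build_vocab(train)
train_iter = data.BucketIterator(train, batch_size=16,
                                 sort_key=lambda x: len(x.text))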
def __init__(self, args):
    # self.RAW = data.RawField(is_target=False)
    self.RAW = data.RawField()
    self.RAW.is_target = False
    # tokenizer = lambda x: list(jieba.cut(x))
    self.tokenize = lambda x: [char for char in x]
    self.TEXT = data.Field(batch_first=True, fix_length=32,
                           tokenize=self.tokenize)
    self.LABEL = data.LabelField()

    self.train, self.dev, self.test = data.TabularDataset.splits(
        # path='./Bimpm/data/Docomo',
        path='/home/lsy2018/TextClassification/DATA/DATA_DOUBAN/data_1024/',
        train='train.csv',
        validation='dev.csv',
        test='test.csv',
        format='csv',
        fields=[('id', self.RAW),
                ('q1', self.TEXT),
                ('q2', self.TEXT),
                ('label', self.LABEL)])

    # vectors = Vectors(name='/home/fch/leo/text-similarity/BIMPM_new/.vector_cache/glove.840B.300d.txt')
    # vectors = Vectors(name='./data/Glove/glove.6B.300d.txt')
    vectors = Vectors(
        name='/home/lsy2018/wlw/Bimpm/data/Embedding/sgns.financial.bigram-char_cleaned.txt')
    self.TEXT.build_vocab(self.train, self.dev, self.test, vectors=vectors)
    self.LABEL.build_vocab(self.train)

    sort_key = lambda x: data.interleave_keys(len(x.q1), len(x.q2))

    self.train_iter, self.dev_iter, self.test_iter = \
        data.BucketIterator.splits((self.train, self.dev, self.test),
                                   batch_sizes=[args.batch_size] * 3,
                                   device=args.device,
                                   sort_key=sort_key)
    # print('train_iter:', type(self.train_iter), self.train.shape)
    # print('test_iter:', type(self.test_iter), self.test.shape)
    # print('dev_iter:', type(self.dev_iter), self.dev.shape)

    self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
    # for <pad>
    self.char_vocab = {'': 0}
    # for <unk> and <pad>
    self.characterized_words = [[0] * self.max_word_len,
                                [0] * self.max_word_len]

    if args.use_char_emb:
        self.build_char_vocab()