def create_iter(self, batch_size):
    """
    Build the iterators.
    :param batch_size: size of each batch
    :return: (train_iter, test_iter)
    """
    # define the torchtext Fields
    fields = [('english', self.english), ('chinese', self.chinese)]
    examples = []
    # build one Example per English/Chinese sentence pair
    for en, ch in zip(self.english_list, self.chinese_list):
        item = [en, ch]
        examples.append(data.Example.fromlist(item, fields))
    # split into training and test sets
    train, test = Dataset(examples=examples,
                          fields=fields).split(split_ratio=0.8)
    self.english.build_vocab(train)
    self.chinese.build_vocab(train)
    self.english_voca_size = len(self.english.vocab)
    self.chinese_voca_size = len(self.chinese.vocab)
    train_iter, test_iter = Iterator.splits(
        (train, test),
        batch_sizes=(batch_size, len(test)),
        sort_key=lambda x: len(x.english),
        sort_within_batch=True,
        device=-1)
    return train_iter, test_iter
def load_dataset(data_path, train_batch_size=4096, dev_batch_size=1, max_len=100):
    """
    This assumes that the data is already pre-processed using the Moses tokenizer.
    Returns iterators for the training/dev dataset.

    Arguments:
        data_path: path of the dataset
        train_batch_size: batch size of the training data (defined in terms of
            number of tokens or sentences, depending on the model_type)
        dev_batch_size: batch size of the dev data (usually one)
        max_len: max length of sequences in a batch
    """
    SRC = Field(tokenize=lambda s: s.split(), init_token="<s>",
                eos_token="</s>", batch_first=True, include_lengths=True)
    TRG = Field(tokenize=lambda s: s.split(), init_token="<s>",
                eos_token="</s>", batch_first=True, include_lengths=True)

    # create a TranslationDataset for both the train and dev set
    train_data = datasets.TranslationDataset(
        exts=("train.de", "train.en"), fields=(SRC, TRG), path=data_path,
        filter_pred=lambda x: len(vars(x)['src']) <= max_len and len(vars(x)['trg']) <= max_len)
    dev_data = datasets.TranslationDataset(
        exts=("dev.de", "dev.en"), fields=(SRC, TRG), path=data_path)

    # load in the test set
    test_examples = []
    with open(data_path + "test.de", "r") as f:
        for test_example in f.readlines():
            example = data.Example()
            setattr(example, "src", test_example.split())
            test_examples.append(example)
    test_data = data.Dataset(test_examples, fields=[("src", SRC)])

    # build the (shared) vocab using the training data
    SRC.build_vocab(train_data.src, train_data.trg)
    TRG.build_vocab(train_data.src, train_data.trg)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # use a custom DataIterator in order to minimize padding in a sequence
    # and to pack each batch fully so as to maximize the computation done on the GPU
    train_iterator = DataIterator(train_data, batch_size=train_batch_size, device=device,
                                  repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                                  batch_size_fn=batch_size_fn, train=True,
                                  sort_within_batch=True, shuffle=True)

    # use a regular Iterator since we want to be able to compare our translations
    # to a gold-standard file; a DataIterator would return the translations in
    # shuffled/random order
    dev_iterator = Iterator(dev_data, batch_size=dev_batch_size,
                            train=False, sort=False, repeat=False, device=device)

    # create a test iterator for the test data
    test_iterator = Iterator(test_data, batch_size=1, train=False,
                             sort=False, repeat=False, device=device)
    print(len(test_iterator))
    return train_iterator, dev_iterator, test_iterator, SRC, TRG
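# Usage sketch for load_dataset (illustrative only): "data/iwslt/" is a placeholder
# directory assumed to contain Moses-tokenized train.de/train.en, dev.de/dev.en and
# test.de files; batch sizes are the function defaults.
train_it, dev_it, test_it, SRC, TRG = load_dataset(
    "data/iwslt/", train_batch_size=4096, dev_batch_size=1, max_len=100)
print("source vocab:", len(SRC.vocab), "target vocab:", len(TRG.vocab))
for batch in dev_it:
    src_tokens, src_lengths = batch.src  # include_lengths=True yields (tensor, lengths)
    break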
def product_dataset(path, text_field, label_field):
    fields = [('text', text_field), ('label', label_field)]
    texts, labels = read_product(path, 1, 1)
    examples = []
    # build one Example per (text, label) pair
    for text, label in zip(texts, labels):
        examples.append(data.Example.fromlist([text, label], fields))
    ret_data = data.Dataset(examples, fields)
    # sort longest-first
    ret_data.sort_key = lambda x: -1 * len(x.text)
    return ret_data
def create_iter(self, split_ratio, batch_size=1000, device=-1):
    fields = [("text", self.TEXT), ("label", self.LABEL)]
    examples = []
    for review, label in zip(self.reviews, self.labels):
        item = [review, label]
        examples.append(data.Example.fromlist(item, fields))
    train, valid, test = Dataset(examples=examples,
                                 fields=fields).split(split_ratio=split_ratio)
    self.TEXT.build_vocab(train)
    self.LABEL.build_vocab(train)
    voca_size = len(self.TEXT.vocab)
    train_iter, val_iter, test_iter = data.Iterator.splits(
        (train, valid, test),
        sort_key=lambda x: len(x.text),
        batch_sizes=(batch_size, len(valid), len(test)),
        device=device)
    return train_iter, val_iter, test_iter, voca_size
def read(self, path, text_field, label_field):
    fields = [('text', text_field), ('label', label_field)]
    texts = self.read_dataset(path)
    examples = []
    for text in texts:
        # skip texts that are too short to be useful
        if len(text) < 3:
            continue
        # _data = [text[:-1], text[1:]]
        _data = [text, text]
        # print(_data)
        examples.append(data.Example.fromlist(_data, fields))
    ret_data = data.Dataset(examples, fields)
    # sort longest-first
    ret_data.sort_key = lambda x: -1 * len(x.text)
    return ret_data
def tokenize(self, text, output='conllu'):
    # split the input into non-empty lines and wrap each line as an Example
    text = [t for t in text.split("\n") if len(t) > 0]
    examples = [data.Example.fromlist([t], fields=[('text', self.TEXT)]) for t in text]
    dataset = data.Dataset(examples, fields=[('text', self.TEXT)])
    data_iter = data.BucketIterator(dataset, batch_size=self.BATCH_SIZE,
                                    sort_key=lambda x: len(x.text),
                                    sort_within_batch=True, shuffle=False,
                                    device=device)
    # predict a per-character tag sequence for each batch
    with torch.no_grad():
        preds = []
        for batch in data_iter:
            t, l = batch.text
            predictions = self.model(t, l)
            predictions = predictions.float()
            _, rounded_preds = torch.max(torch.sigmoid(predictions), 2)
            preds.append(rounded_preds)
    sents = []
    tokens = []
    # pair each input line with a predicted tag sequence
    # (note: the prediction list is traversed in reverse order here)
    for item in list(zip(text, preds[::-1])):
        line = item[0]
        tags = item[1]
        token = ''
        # tag 1 marks the last character of a token; any other tag continues it
        for i in tqdm(range(len(tags[0]))):
            if int(tags[0][i]) == 0:
                token += line[i]
            elif int(tags[0][i]) == 1:
                token += line[i]
                if output == 'conllu':
                    space_after = 1 if line[i + 1] == ' ' else 0
                    tokens.append((token.strip(), space_after))
                else:
                    tokens.append(token.strip())
                token = ''
            else:
                token += line[i]
        # flush the trailing token of the line
        if output == 'conllu':
            tokens.append((token.strip(), 0))
        else:
            tokens.append(token.strip())
        token = ''
        sents.append(tokens)
        tokens = []
    return sents
def imdb_dataset(path, text_field, label_field):
    fields = [('text', text_field), ('label', label_field)]
    # positive reviews get label 1, negative reviews label 0
    texts = read_imdb(path, 'pos', 1, 1)
    labels = [1] * len(texts)
    _texts = read_imdb(path, 'neg', 1, 1)
    labels += [0] * len(_texts)
    texts += _texts
    examples = []
    for text, label in zip(texts, labels):
        examples.append(data.Example.fromlist([text, label], fields))
    ret_data = data.Dataset(examples, fields)
    # sort longest-first
    ret_data.sort_key = lambda x: -1 * len(x.text)
    return ret_data
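# Usage sketch for imdb_dataset (illustrative only): "aclImdb/train/" is a placeholder
# IMDB-style directory with pos/ and neg/ subfolders readable by read_imdb; the Field
# settings are assumptions, not part of the original code.
TEXT = data.Field(tokenize=lambda s: s.split(), lower=True, batch_first=True)
LABEL = data.Field(sequential=False, use_vocab=False)
imdb_train = imdb_dataset("aclImdb/train/", TEXT, LABEL)
TEXT.build_vocab(imdb_train, max_size=25000)
train_iter = data.BucketIterator(imdb_train, batch_size=64,
                                 sort_key=imdb_train.sort_key, shuffle=True)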
def create_iter(self, split_ratio, batch_size=100):
    fields = [("text", self.TEXT), ("label", self.LABEL)]
    examples = []
    for index, context in enumerate(self.file):
        d = context.split('\t')
        # item = [text, label]
        item = [d[1], d[0].strip()]
        examples.append(data.Example.fromlist(item, fields))
    train, valid, test = Dataset(
        examples=examples, fields=fields).split(split_ratio=split_ratio)
    self.TEXT.build_vocab(train)
    self.LABEL.build_vocab(train)
    voca_size = len(self.TEXT.vocab)
    train_iter, val_iter, test_iter = data.Iterator.splits(
        (train, valid, test),
        sort_key=lambda x: len(x.text),
        batch_sizes=(batch_size, len(valid), len(test)))
    return train_iter, val_iter, test_iter, voca_size
def load_pickle(PATH, FIELDNAMES, FIELD):
    dataList = []
    with open(PATH, "rb") as input_file:
        while True:
            try:
                # read the next pickled dictionary instance
                inputInstance = pickle.load(input_file)
                # pull out the two requested fields
                dataInstance = [inputInstance[FIELDNAMES[0]],
                                inputInstance[FIELDNAMES[1]]]
                # create an Example and add it to the list
                dataList.append(data.Example.fromlist(dataInstance, fields=FIELD))
            except EOFError:
                break
    # finally wrap the examples in a Dataset object
    exampleListObject = data.Dataset(dataList, fields=FIELD)
    return exampleListObject
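# Usage sketch for load_pickle (illustrative only): "qa_pairs.pkl" is a placeholder
# file assumed to contain a stream of pickled dicts with "question"/"answer" keys;
# the field names and Field settings are assumptions.
QUESTION = data.Field(tokenize=lambda s: s.split(), lower=True)
ANSWER = data.Field(tokenize=lambda s: s.split(), lower=True)
qa_fields = [("question", QUESTION), ("answer", ANSWER)]
qa_dataset = load_pickle("qa_pairs.pkl", ["question", "answer"], qa_fields)
QUESTION.build_vocab(qa_dataset)
ANSWER.build_vocab(qa_dataset)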
def __init__(self, path, src_field, trg_field, sep='\t', **kwargs):
    """Create a dataset instance given a path and fields.

    Arguments:
        path: Path to the data file.
        src_field: The field that will be used for source data.
        trg_field: The field that will be used for target data.
        kwargs: Passed to the constructor of data.Dataset.
    """
    fields = [('src', src_field), ('trg', trg_field)]
    examples = []
    with open(path, errors='ignore') as f:
        for line in f:
            s = line.strip().split(sep)
            # skip malformed lines that do not contain exactly one source/target pair
            if len(s) != 2:
                continue
            src, trg = s[0], s[1]
            e = data.Example()
            setattr(e, "src", src_field.preprocess(src))
            setattr(e, "trg", trg_field.preprocess(trg))
            examples.append(e)
    super(TranslateDataset, self).__init__(examples, fields, **kwargs)
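# Usage sketch for TranslateDataset (illustrative only): "parallel.tsv" is a placeholder
# path with one tab-separated sentence pair per line; Field settings and vocab sizes
# are assumptions.
SRC_FIELD = data.Field(tokenize=lambda s: s.split(), init_token="<s>", eos_token="</s>")
TRG_FIELD = data.Field(tokenize=lambda s: s.split(), init_token="<s>", eos_token="</s>")
pairs = TranslateDataset("parallel.tsv", SRC_FIELD, TRG_FIELD)
SRC_FIELD.build_vocab(pairs, max_size=30000)
TRG_FIELD.build_vocab(pairs, max_size=30000)
pair_iter = data.BucketIterator(pairs, batch_size=64,
                                sort_key=lambda x: len(x.src), sort_within_batch=True)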