def load_generation_data():
    """
    Loads all data necessary for training and evaluating a code/comment
    generation model. Data is read from a TSV file of data instances located
    in the directory pointed to by CODE_CORPUS / "input". The dataset is
    split into training, dev, and test portions, and pretrained word
    embedding vectors for code and comments are loaded from the same
    directory and used to build the field vocabularies.

    :returns: [Tuple] -- (TRAIN dataset, DEV dataset, TEST dataset,
                          code vocab with pretrained vectors,
                          docstring vocab with pretrained vectors)
    """
    input_path = CODE_CORPUS / "input"

    # Create a field variable for each field that will be in our TSV file
    code_field = data.Field(sequential=True, tokenize=lambda s: s.split(" "),
                            include_lengths=True, use_vocab=True)
    comm_field = data.Field(sequential=True, tokenize=lambda s: s.split(" "),
                            include_lengths=True, use_vocab=True)

    # Used to create a tabular dataset from the TSV file
    train_val_fields = [("code", code_field), ("comm", comm_field)]

    # Build the large tabular dataset using the defined fields
    tsv_file_path = input_path / "generation_dataset.tsv"
    tab_data = data.TabularDataset(str(tsv_file_path), "TSV", train_val_fields)

    # Split the large dataset into TRAIN, DEV, TEST portions
    train_data, dev_data, test_data = tab_data.split(
        split_ratio=[0.85, 0.05, 0.1])

    # Load the pretrained word embedding vectors
    code_vec_path = input_path / "code-vectors.txt"
    comm_vec_path = input_path / "comm-vectors.txt"
    code_vectors = vocab.Vectors(str(code_vec_path), str(input_path))
    comm_vectors = vocab.Vectors(str(comm_vec_path), str(input_path))

    # Build the known-word vocab for code and comments from the pretrained vectors
    code_field.build_vocab(train_data, dev_data, test_data, vectors=code_vectors)
    comm_field.build_vocab(train_data, dev_data, test_data, vectors=comm_vectors)

    # Return the dataset splits and the field vocabularies with pretrained vectors
    return (train_data, dev_data, test_data,
            code_field.vocab, comm_field.vocab)
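# --------------------------------------------------------------------------
# Hedged usage sketch (added illustration, not part of the original
# snippet): how the splits and vocabularies returned above could feed a
# BucketIterator and pretrained embedding layers. The batch size of 32 and
# the nn import are illustrative assumptions.
#
# import torch.nn as nn
# train_data, dev_data, test_data, code_vocab, comm_vocab = load_generation_data()
# train_iter, dev_iter, test_iter = data.BucketIterator.splits(
#     (train_data, dev_data, test_data),
#     batch_size=32,
#     sort_key=lambda x: len(x.code),
#     sort_within_batch=True)
# code_embedding = nn.Embedding.from_pretrained(code_vocab.vectors)
# comm_embedding = nn.Embedding.from_pretrained(comm_vocab.vectors)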
def sst_word_char(path, word_field, char_field, label_field, batch_size,
                  device, word_emb_file, char_emb_file):
    fields = {
        'text': [('text_word', word_field), ('text_char', char_field)],
        'label': ('label', label_field)
    }
    train, dev, test = data.TabularDataset.splits(path=path,
                                                  train='train.jsonl',
                                                  validation='dev.jsonl',
                                                  test='test.jsonl',
                                                  format='json',
                                                  skip_header=True,
                                                  fields=fields)

    word_vectors = vocab.Vectors(word_emb_file)
    char_vectors = vocab.Vectors(char_emb_file)

    word_field.build_vocab(train, dev, test, max_size=25000,
                           vectors=word_vectors,
                           unk_init=torch.Tensor.normal_)
    char_field.build_vocab(train, dev, test, max_size=94,
                           vectors=char_vectors,
                           unk_init=torch.Tensor.normal_)
    label_field.build_vocab(train, dev, test)

    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test),
        batch_sizes=(batch_size, len(dev), len(test)),
        sort_key=lambda x: len(x.text_word),
        sort_within_batch=True,
        repeat=False,
        shuffle=True,
        device=device)

    return train_iter, dev_iter, test_iter
def __init__(self, batch_size=100):
    print('Device: ' + str(device))

    self.candidate_title = data.Field(sequential=True, lower=True,
                                      tokenize=tokenizer,
                                      include_lengths=True, use_vocab=True)
    self.candidate_resume = data.Field(sequential=True, lower=True,
                                       include_lengths=True, use_vocab=True)
    self.job_title = data.Field(sequential=True, lower=True,
                                tokenize=tokenizer,
                                include_lengths=True, use_vocab=True)
    self.job_description = data.Field(sequential=True, lower=True,
                                      include_lengths=True, use_vocab=True)
    self.match_status = data.Field(sequential=False, use_vocab=False)

    self.train_set, self.validation_set = data.TabularDataset.splits(
        path='./gdrive/My Drive/Colab Notebooks/data/TalentFox/',
        train='train_data.csv',
        validation='val_data.csv',
        format='csv',
        fields=[('index', None),
                ('job_title', self.job_title),
                ('job_description', self.job_description),
                ('candidate_title', self.candidate_title),
                ('candidate_resume', self.candidate_resume),
                ('match_status', self.match_status)],
        skip_header=True,
    )

    self.train_iter, self.validation_iter = data.BucketIterator.splits(
        (self.train_set, self.validation_set),
        batch_size=batch_size,
        shuffle=True,
        device=device,
        sort_key=lambda x: len(x.job_description),
        sort_within_batch=True,
        repeat=True)

    self.match_status.build_vocab(self.train_set)

    # Load the German fastText vectors once and reuse them for every field
    url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.de.vec'
    fasttext_vectors = vocab.Vectors('wiki.de.vec', url=url)
    self.job_title.build_vocab(self.train_set, vectors=fasttext_vectors)
    self.job_description.build_vocab(self.train_set, vectors=fasttext_vectors)
    self.candidate_title.build_vocab(self.train_set, vectors=fasttext_vectors)
    self.candidate_resume.build_vocab(self.train_set, vectors=fasttext_vectors)
def load_sst2(path, text_field, label_field, batch_size, device,
              embedding_file):
    train, dev, test = data.TabularDataset.splits(path=path,
                                                  train='train.tsv',
                                                  validation='dev.tsv',
                                                  test='test.tsv',
                                                  format='tsv',
                                                  skip_header=True,
                                                  fields=[('text', text_field),
                                                          ('label', label_field)])
    print("the size of train: {}, dev: {}, test: {}".format(
        len(train.examples), len(dev.examples), len(test.examples)))

    vectors = vocab.Vectors(embedding_file)
    text_field.build_vocab(train, dev, test, max_size=25000, vectors=vectors,
                           unk_init=torch.Tensor.normal_)
    label_field.build_vocab(train, dev, test)

    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test),
        batch_sizes=(batch_size, len(dev), len(test)),
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
        repeat=False,
        shuffle=True,
        device=device)

    return train_iter, dev_iter, test_iter
def get_embedding_weights(embedding):
    embeddings_file = ''
    cache = ''
    if embedding == 'glove_specific':
        embeddings_file = 'glove.vec'
        cache = 'specific-embeddings'
    elif embedding == 'glove_generic':
        embeddings_file = 'glove.6B.300d.txt'
        cache = '.vector_cache'
    elif embedding == 'fasttext_specific':
        embeddings_file = 'fasttext.vec'
        cache = 'specific-embeddings'
    elif embedding == 'fasttext_generic':
        embeddings_file = 'crawl-300d-2M.vec'
        cache = '.fasttext_cache'
    elif embedding == 'word2vec_specific':
        embeddings_file = 'word2vec.vec'
        cache = 'specific-embeddings'
    elif embedding == 'word2vec_generic':
        embeddings_file = 'embeddings.vec'
        cache = '.word2vec_cache'
    else:
        raise ValueError('Unknown embedding name: {}'.format(embedding))

    model = vocab.Vectors(name=embeddings_file, cache=cache)
    return torch.FloatTensor(model.vectors)
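# Hedged usage sketch (added illustration, not part of the original
# snippet): feed the returned weight matrix into an nn.Embedding layer.
# The 'glove_generic' key and the freeze flag are illustrative assumptions.
#
# import torch.nn as nn
# weights = get_embedding_weights('glove_generic')
# embedding_layer = nn.Embedding.from_pretrained(weights, freeze=False)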
def create_text_and_label(SEED, ratio, filename):
    TEXT = data.Field(sequential=True, tokenize='spacy', include_lengths=True)
    LABEL = data.LabelField(tokenize='spacy', is_target=True, sequential=False)
    fields = [('text', TEXT), ('label', LABEL)]

    # splits() returns a tuple even for a single split, so unpack the first element
    train_data = data.TabularDataset.splits(path='', train=filename,
                                            format='csv', fields=fields,
                                            skip_header=True)
    train_data = train_data[0]

    if ratio == 8:
        ratio = 0.8
    else:
        ratio = 0.7

    train_data, valid_data = train_data.split(split_ratio=ratio,
                                              random_state=random.seed(SEED))

    custom_embeddings = vocab.Vectors(
        name=os.path.join(GLOVE, 'glove.6B.100d.txt'))

    MAX_VOCAB_SIZE = 25000
    TEXT.build_vocab(train_data,
                     max_size=MAX_VOCAB_SIZE,
                     vectors=custom_embeddings,
                     unk_init=torch.Tensor.normal_)
    LABEL.build_vocab(train_data)

    return TEXT, LABEL, train_data, valid_data
def load_conv_data(file, g_sequence_len, embed_file=None, min_freq=1):
    TEXT = data.Field(tokenize=tokenize_en, lower=True,
                      fix_length=g_sequence_len, batch_first=True,
                      eos_token='<eos>', init_token='<sos>')
    LABEL = data.Field(sequential=False, unk_token=None)

    tb = data.TabularDataset(file, format='tsv',
                             fields=[('text1', TEXT), ('text2', TEXT),
                                     ('label', LABEL)])

    if embed_file:
        TEXT.build_vocab(tb, vectors=vocab.Vectors(embed_file),
                         min_freq=min_freq)
    else:
        TEXT.build_vocab(tb, min_freq=min_freq)
    LABEL.build_vocab(tb)

    # Group the examples by label so each label gets its own Dataset
    label_names = LABEL.vocab.itos
    label_examples = [[] for _ in label_names]
    for each in tb:
        label_examples[label_names.index(each.label)].append(each)
    label_datasets = [
        data.Dataset(label_examples[i],
                     fields=[('text1', TEXT), ('text2', TEXT),
                             ('label', LABEL)])
        for i in range(len(label_names))
    ]

    return tb, TEXT, LABEL, label_names, label_datasets
def load_news(config, text_field, band_field):
    fields = {'text': ('text', text_field), 'label': ('label', band_field)}
    word_vectors = vocab.Vectors(config.embedding_file)

    train, val, test = data.TabularDataset.splits(path=config.data_path,
                                                  train='train.csv',
                                                  validation='val.csv',
                                                  test='test.csv',
                                                  format='csv',
                                                  fields=fields)
    print("the size of train: {}, dev: {}, test: {}".format(
        len(train.examples), len(val.examples), len(test.examples)))

    text_field.build_vocab(train, val, test, max_size=config.n_vocab,
                           vectors=word_vectors,
                           unk_init=torch.Tensor.normal_)
    band_field.build_vocab(train, val, test)

    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test),
        batch_sizes=(config.batch_size, config.batch_size, config.batch_size),
        sort=False,
        device=config.device,
        sort_within_batch=False,
        shuffle=False)

    return train_iter, val_iter, test_iter
def __init__(self, num_docs, text_field, path):
    # vec_dim, num_docs, num_words)
    super(DMemb, self).__init__()

    vectors = vocab.Vectors(path)
    text_field.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)
    num_words, vec_dim = vectors.vectors.shape

    # paragraph matrix
    self._D = nn.Parameter(torch.randn(num_docs, vec_dim),
                           requires_grad=True)

    # word matrix
    self._W = nn.Embedding.from_pretrained(
        torch.FloatTensor(text_field.vocab.vectors))

    # Rows equal to the zero/<unk> row are re-filled with the pretrained
    # vector of a case-insensitive match, if one exists
    z = self._W.weight[0, :]
    for key, value in text_field.vocab.stoi.items():
        a = self._W.weight[value, :]
        if bool(torch.all(torch.eq(z, a))):
            for keyv, valuev in vectors.stoi.items():
                word = keyv.lower()
                if word == key:
                    # print(key, keyv)
                    self._W.weight[value, :] = vectors.vectors[
                        vectors.stoi[keyv], :]

    # output layer parameters
    self._O = nn.Parameter(torch.FloatTensor(vec_dim, num_words).zero_(),
                           requires_grad=True)
def load_race(path, id_field, word_field, label_field, train_batch_size,
              dev_batch_size, test_batch_size, device, word_embed_file,
              cache_dir):
    fields = {
        'race_id': ('race_id', id_field),
        'article': ('article', word_field),
        'question': ('question', word_field),
        'option_0': ('option_0', word_field),
        'option_1': ('option_1', word_field),
        'option_2': ('option_2', word_field),
        'option_3': ('option_3', word_field),
        'label': ('label', label_field)
    }

    word_vectors = vocab.Vectors(word_embed_file, cache_dir)

    train, dev, test = data.TabularDataset.splits(path=path,
                                                  train='train.jsonl',
                                                  validation='dev.jsonl',
                                                  test='test.jsonl',
                                                  format='json',
                                                  fields=fields)
    print("the size of train: {}, dev: {}, test: {}".format(
        len(train.examples), len(dev.examples), len(test.examples)))

    word_field.build_vocab(train, dev, test, max_size=50000,
                           vectors=word_vectors,
                           unk_init=torch.Tensor.normal_)
    label_field.build_vocab(train, dev, test)

    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test),
        batch_sizes=(train_batch_size, dev_batch_size, test_batch_size),
        sort_key=lambda x: len(x.article),
        device=device,
        shuffle=True)

    return train_iter, dev_iter, test_iter
def load_dataset(data_dir, embeddings_path, max_vocab_size):
    # Load train and test data
    text = data.Field(sequential=True, tokenize='spacy', include_lengths=True)
    label = data.LabelField(sequential=False, tokenize='spacy', is_target=True)
    fields = [('text', text), ('label', label)]
    train_data, test_data = data.TabularDataset.splits(
        path=data_dir,
        train='train.csv',
        test='test.csv',
        format='csv',
        fields=fields,
        skip_header=True,
    )

    # Load embeddings
    embeddings = vocab.Vectors(name=embeddings_path)

    # Build vocabulary
    text.build_vocab(train_data, max_size=max_vocab_size, vectors=embeddings,
                     unk_init=torch.Tensor.normal_)
    label.build_vocab(train_data)

    return train_data, test_data, text, label
def get_dataset(self):
    self.embeddings = vocab.Vectors(name=self.index_path,
                                    cache=self.cache_path)
    self.vocabulary = torchtext.data.Field()

    # Add pad and unk tokens to the embedding vocabulary, each mapped to a
    # new all-zero 300-dim row appended to the vector matrix
    for token in (self.vocabulary.pad_token, self.vocabulary.unk_token):
        self.embeddings.stoi[token] = len(self.embeddings.stoi)
        self.embeddings.vectors = torch.cat(
            [self.embeddings.vectors, torch.zeros(1, 300)], dim=0)

    for lang in ['en', 'hi', 'gu', 'pa', 'or', 'mr', 'bn']:
        for d in self.dataset:
            if self.lang_map[d["Target_ID"]] == lang:
                try:
                    self.targets.append(self.embeddings.vectors[
                        self.embeddings.stoi[d["Target_keyword"]]])
                    self.src_lang.append(self.lang_map[d["Source_ID"]])
                    self.target_lang.append(self.lang_map[d["Target_ID"]])
                    self.phrases.append(d["Source_text"])
                except KeyError:
                    # Skip entries whose target keyword has no embedding
                    # print(d["Target_keyword"] + " not found")
                    pass
def load_data(
    path,
    id_field,
    word_field,
    label_field,
    train_batch_size,
    dev_batch_size,
    test_batch_size,
    device,
    word_embed_file,
    cache_dir,
):
    fields = {
        "article": ("article", word_field),
        "question": ("question", word_field),
        "option_0": ("option_0", word_field),
        "option_1": ("option_1", word_field),
        "option_2": ("option_2", word_field),
        "option_3": ("option_3", word_field),
        "option_4": ("option_4", word_field),
        "label": ("label", label_field),
    }

    word_vectors = vocab.Vectors(word_embed_file, cache_dir)

    train, dev = data.TabularDataset.splits(
        path=path,
        train="Task_2_train_trial.jsonl",
        validation="Task_2_dev.jsonl",
        test=None,
        format="json",
        fields=fields,
    )
    print("the size of train: {}, dev: {}".format(len(train.examples),
                                                  len(dev.examples)))

    word_field.build_vocab(train, dev, max_size=50000, vectors=word_vectors,
                           unk_init=torch.Tensor.normal_)
    label_field.build_vocab(train, dev)

    train_iter, dev_iter = data.BucketIterator.splits(
        (train, dev),
        batch_sizes=(train_batch_size, dev_batch_size),
        sort_key=lambda x: len(x.article),
        device=device,
        shuffle=True,
    )
    return train_iter, dev_iter
def load_then_visualize_embeddings(path):
    """Visualizes pretrained embeddings in TensorBoard.

    Args:
        path: Path to the pretrained vector file.
    """
    writer = SummaryWriter()
    v = vocab.Vectors(path)
    writer.add_embedding(v.vectors, metadata=v.itos)
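# Hedged usage sketch (added illustration, not part of the original
# snippet): visualize a GloVe file and inspect it with TensorBoard. The
# file name is an illustrative assumption; SummaryWriter logs to ./runs by
# default.
#
# load_then_visualize_embeddings('glove.6B.100d.txt')
# # then run:  tensorboard --logdir runs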
def load_embedding(self):
    embed_path = self.args_dict['embed_path']
    custom_embedding = vocab.Vectors(name=os.path.basename(embed_path),
                                     cache=os.path.dirname(embed_path))
    # e.g. custom_embedding.stoi['cat'] gives a row index,
    #      custom_embedding.vectors[6] gives that row's vector
    return custom_embedding
def load_custom_embeddings():
    weibo_word_vector = os.path.join('/', 'home', 'wzw',
                                     'pretrained_word_embeddings',
                                     custom_word_embedding)
    cache = os.path.join('/', 'home', 'wzw', 'pretrained_word_embeddings',
                         'cache.' + custom_word_embedding)
    custom_embeddings = vocab.Vectors(name=weibo_word_vector,
                                      cache=cache,
                                      unk_init=torch.Tensor.normal_)
    return custom_embeddings
def load_sst2(path, text_field, label_field, batch_size, embedding_file,
              cache_file):
    # 2. Define the datasets
    train, dev = data.TabularDataset.splits(path=path,
                                            train='train.tsv',
                                            validation='dev.tsv',
                                            format='tsv',
                                            skip_header=True,
                                            fields=[('text', text_field),
                                                    ('label', label_field)])
    # Note: the test set is processed separately, so the splits method
    # cannot be used for it.
    test = data.TabularDataset(path + 'test.tsv',
                               format='tsv',
                               skip_header=True,
                               fields=[('index', label_field),
                                       ('text', text_field)])
    print("the size of train: {}, dev: {}, test: {}".format(
        len(train), len(dev), len(test)))
    print("the result of dataset: ", train[0].text, train[0].label)

    # 3. Build the vocab; its size is the number of words in text_field
    vectors = vocab.Vectors(embedding_file, cache_file)
    text_field.build_vocab(train, dev, test, max_size=25000, vectors=vectors,
                           unk_init=torch.Tensor.normal_)
    label_field.build_vocab(train, dev, test)

    # 4. Build the iterators
    train_iter, dev_iter = data.BucketIterator.splits(
        (train, dev),
        batch_sizes=(batch_size, batch_size),
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
        repeat=False,
        shuffle=True)
    # The test iterator is likewise built separately
    test_iter = data.Iterator(test, batch_size=len(test), train=False,
                              sort=False)
    print("the size of train_iter: {}, dev_iter: {}, test_iter: {}".format(
        len(train_iter), len(dev_iter), len(test_iter)))
    # for batch_idx, (X_train_var, y_train_var) in enumerate(train_iter):
    #     print("the shape of train_x: {}, train_y: {}".format(
    #         X_train_var.shape, y_train_var.shape))
    #     break
    return train_iter, dev_iter, test_iter
def build_vocab(self) -> None:
    def extend_vocab(field, word_lst, using_vector=False):
        cnt_add_w = 0
        for w in word_lst:
            if w not in field.vocab.stoi:
                cnt_add_w += 1
                field.vocab.itos.append(w)
                field.vocab.stoi[w] = len(field.vocab.itos) - 1
            # else:
            #     self.logger.warning(w + ' is already in the field')
        if using_vector:
            # self.logger.info('Add ' + str(cnt_add_w) + ' zero vectors into vocab.vectors')
            field.vocab.vectors = torch.cat(
                (field.vocab.vectors,
                 torch.zeros(cnt_add_w, self.word_embedding_size)), 0)

    self.logger.info('Building vocabularies')
    self.logger.info('Loading pretrained vectors from ' + self.pretrained_emb_path)
    pretrained_vec = vocab.Vectors(os.path.basename(self.pretrained_emb_path),
                                   os.path.dirname(self.pretrained_emb_path))
    self.WORDS.build_vocab(self.train_dataset, min_freq=self.min_freq,
                           vectors=pretrained_vec)
    extend_vocab(self.WORDS, self.singletons, using_vector=True)

    # print vocab to file
    with open(os.path.join(self.save_to, 'vocab.txt'), 'w') as f_write:
        for w in self.WORDS.vocab.itos:
            f_write.write(w + '\n')

    # Re-initialize all-zero embeddings (unknowns and singletons) with small
    # random values
    cnt_zero = 0
    zero_words = []
    for cnt, each_vec in enumerate(self.WORDS.vocab.vectors):
        if each_vec.sum().item() == 0:
            cnt_zero += 1
            cur_word = self.WORDS.vocab.itos[cnt]
            assert (cur_word.startswith('unk') or cur_word == '<unk>'
                    or cur_word in self.singletons)
            self.WORDS.vocab.vectors[cnt] = torch.empty(
                self.word_embedding_size).normal_(0, 0.05)
            zero_words.append(cur_word)
    self.logger.info('There are ' + str(cnt_zero) + ' zero embeddings')
    print('Zero words = ', zero_words[-25:] + zero_words[:25])
    assert cnt_zero > 1

    self.POS_TAGS.build_vocab(self.train_dataset)
    self.NONTERMS.build_vocab(self.train_dataset)
    extend_vocab(self.NONTERMS, ['<w>'])
    self.ACTIONS.build_vocab()
    assert self.ACTIONS.vocab.itos[2] == 'NP(TOP -> S)'

    self.num_words = len(self.WORDS.vocab)
    self.num_pos = len(self.POS_TAGS.vocab)
    self.num_nt = len(self.NONTERMS.vocab)
    self.num_actions = len(self.ACTIONS.vocab)
    self.logger.info('Found %d words, %d POS tags, %d nonterminals, %d actions',
                     self.num_words, self.num_pos, self.num_nt,
                     self.num_actions)
def __init__(self):
    glove = torchvocab.Vectors(name=os.path.join(Constants.Data.datadir,
                                                 Constants.Data.glove_path))
    counter = Counter([w for w in glove.stoi])
    self.vocab = torchvocab.Vocab(counter,
                                  vectors=glove,
                                  specials=[Constants.SpecialTokens.pad,
                                            Constants.SpecialTokens.unk])
    self.embedding_layer = nn.Embedding.from_pretrained(self.vocab.vectors)
def load_w2v_vectors(fname):
    """
    Load pre-trained word2vec word embeddings from a local file.

    :param fname: file name
    :return: word2vec word embeddings
    """
    print("Loading word2vec model from {}".format(fname))
    if not os.path.exists('model/w2v.mod'):
        # Convert the binary word2vec file to text format once and cache it
        model = gensim.models.KeyedVectors.load_word2vec_format(
            fname, binary=True, limit=1000000)
        model.save_word2vec_format('model/w2v.mod')
    return vocab.Vectors('model/w2v.mod')
def generate_embedding(self, path, dimensions):
    r"""Generates an embedding layer from a file of pretrained embeddings.

    Args:
        path: Path to the vector file of pretrained embeddings.
        dimensions: Dimensionality of the embedding layer for sequences.
    """
    v = vocab.Vectors(path)
    emb = build_embedding(self.vocab_dict, self.vocab_len, dimensions, v)
    return emb, dimensions, v
def load_squad(path, raw_field, word_field, label_field, train_batch_size,
               dev_batch_size, device, word_embedding_file, cache_dir):
    if os.path.exists(cache_dir):
        print("dataset has been cached, loading splits...")
        list_fields = [('id', raw_field), ('s_idx', label_field),
                       ('e_idx', label_field), ('context', word_field),
                       ('question', word_field)]
        train_examples = torch.load(cache_dir + 'train_examples.pt')
        dev_examples = torch.load(cache_dir + 'dev_examples.pt')
        train = data.Dataset(examples=train_examples, fields=list_fields)
        dev = data.Dataset(examples=dev_examples, fields=list_fields)
    else:
        dict_field = {
            'id': ('id', raw_field),
            's_idx': ('s_idx', label_field),
            'e_idx': ('e_idx', label_field),
            'context': ('context', word_field),
            'question': ('question', word_field)
        }
        train, dev = data.TabularDataset.splits(path=path,
                                                train='train.jsonl',
                                                validation='dev.jsonl',
                                                format='json',
                                                fields=dict_field)
        os.makedirs(cache_dir)
        torch.save(train.examples, cache_dir + 'train_examples.pt')
        torch.save(dev.examples, cache_dir + 'dev_examples.pt')

    print("the size of train: {}, dev: {}".format(len(train.examples),
                                                  len(dev.examples)))

    word_field.build_vocab(train, dev,
                           vectors=vocab.Vectors(word_embedding_file),
                           max_size=25000,
                           unk_init=torch.Tensor.normal_)

    print("building iterators...")
    train_iter, dev_iter = data.BucketIterator.splits(
        (train, dev),
        batch_sizes=[train_batch_size, dev_batch_size],
        device=device,
        sort_key=lambda x: len(x.context))  # examples expose 'context', not 'c_word'

    return train_iter, dev_iter
def main():
    text_field = data.Field(lower=True)
    label_field = data.Field(sequential=False)
    logging.critical('starting loading data')
    train_iter, dev_iter, total_steps = vulgar(text_field, label_field, args,
                                               device=-1, repeat=False)

    if args.load_vec:
        if args.load_vec == 'hi':
            args.load_vec = 'model/hi_1105_ml_100.w2v'
        logging.critical('start load word2vec')
        embeddings_file = args.load_vec
        vectors = vocab.Vectors(embeddings_file)
        text_field.vocab.set_vectors(vectors.stoi, vectors.vectors,
                                     vectors.dim)
        embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(text_field.vocab.vectors))
        args.embed_dim = vectors.dim
        embedding.weight.requires_grad = True
        # logging.critical(embedding.weight.requires_grad)
    else:
        # update args and print
        args.embed_num = len(text_field.vocab)
        embedding = nn.Embedding(args.embed_num, args.embed_dim)

    args.class_num = len(label_field.vocab) - 1  # subtract 1 for the <unk> token
    args.cuda = (not args.no_cuda) and torch.cuda.is_available()
    del args.no_cuda
    args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]  # hyphens in option names become underscores in args
    args.save_dir = os.path.join(
        args.save_dir, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))

    logging.critical('Parameters:')
    for attr, value in sorted(args.__dict__.items()):
        logging.critical("\t{}={}".format(attr.upper(), value))

    # model
    cnn = model.CNN_Text(args, embedding)
    if args.snapshot is not None:
        logging.critical('\nLoading model from {}...'.format(args.snapshot))
        cnn.load_state_dict(torch.load(args.snapshot))

    if args.cuda:
        torch.cuda.set_device(args.device)
        cnn = cnn.cuda()

    try:
        train.train(train_iter, dev_iter, cnn, args, total_steps)
    except KeyboardInterrupt:
        print('\n' + '-' * 89)
        print('Exiting from training early')
def iters(cls, config, **kwargs):
    """
    Create the iterator objects for splits of the SemEval dataset.

    :param config: Configuration object providing vectors, cache,
        datasets_dir, batch_size, shuffle, and device (-1 for CPU).
    :param kwargs: Passed through to cls.splits.
    :return: BucketIterator splits for train, validation, and test.
    """
    vectors = vocab.Vectors(name=config.vectors, cache=config.cache)

    ID = data.RawField()
    TEXT = data.Field(batch_first=True, tokenize=lambda x: x, fix_length=20)
    TAG = data.Field(batch_first=True, tokenize=lambda x: x, fix_length=20)
    RAW = data.RawField()
    REL = data.Field(sequential=False, use_vocab=False, batch_first=True,
                     tensor_type=torch.FloatTensor,
                     postprocessing=data.Pipeline(get_class_probs))
    CONF = data.RawField()
    # TAG.preprocessing = shrink_chunk

    train, val, test = cls.splits(ID, TEXT, REL, CONF, RAW, TAG,
                                  root=config.datasets_dir, **kwargs)

    TEXT.build_vocab(train)
    config.n_embed = len(TEXT.vocab)
    config.d_embed = vectors.dim
    TEXT.vocab.load_vectors(vectors)
    config.weights = TEXT.vocab.vectors
    config.n_classes = 2

    return data.BucketIterator.splits((train, val, test),
                                      batch_size=config.batch_size,
                                      shuffle=config.shuffle,
                                      device=config.device,
                                      repeat=False)
def GetIterator(TEXT, LABEL, path, args, **kwargs):
    """
    Build the data iterators.

    args:
        TEXT: Field object created with torchtext.data
        LABEL: Field object created with torchtext.data
    return:
        train_iter: iterator over the training set
        dev_iter: iterator over the validation set
        test_iter: iterator over the test set
    """
    # Define the tokenize rule for TEXT
    TEXT.tokenize = tokenizer

    # Create the tabular datasets
    train_dataset, dev_dataset, test_dataset = data.TabularDataset.splits(
        path=path, format='csv', skip_header=True,
        train='cnews.train.csv', validation='cnews.val.csv',
        test='cnews.test.csv',
        fields=[('label', LABEL), ('text', TEXT)])

    if args.static and args.pretrainedEmbeddingName and args.pretrainedEmbeddingPath:
        # Load pretrained word vectors; name is the vector file name and
        # cache is the directory that contains it
        vectors = vocab.Vectors(name=args.pretrainedEmbeddingName,
                                cache=args.pretrainedEmbeddingPath)
        # Build the vocabulary for TEXT
        TEXT.build_vocab(train_dataset, dev_dataset, vectors=vectors)
    else:
        TEXT.build_vocab(train_dataset, dev_dataset)
    # Build the vocabulary for LABEL
    LABEL.build_vocab(train_dataset, dev_dataset)

    train_iter, dev_iter, test_iter = data.Iterator.splits(
        (train_dataset, dev_dataset, test_dataset),
        batch_sizes=(args.batch_size,
                     len(dev_dataset) // 8,   # batch sizes must be integers
                     len(test_dataset) // 8),
        sort_key=lambda x: len(x.text),
        **kwargs)

    return train_iter, dev_iter, test_iter
def dataloader(text_field, label_field, user_field, args, wdir=None,
               u2vdir=None, **kargs):
    train_data, dev_data, test_data = mydatasets.MR.splits(text_field,
                                                           label_field,
                                                           user_field,
                                                           args=args)

    if args.pretrained_embed_words:
        custom_embed = vocab.Vectors(name=wdir, max_vectors=100000)
        text_field.build_vocab(train_data, dev_data, test_data,
                               vectors=custom_embed)
        # print(args.custom_embed)
    else:
        text_field.build_vocab(train_data, dev_data, test_data)

    if args.pretrained_embed_users:
        custom_embed_u = vocab.Vectors(name=u2vdir, max_vectors=8000)
        user_field.build_vocab(train_data, dev_data, test_data,
                               vectors=custom_embed_u)
    else:
        user_field.build_vocab(train_data, dev_data, test_data)

    label_field.build_vocab(train_data, dev_data, test_data)

    # split valid and train (10%)
    train_iter, dev_iter, test_iter = data.Iterator.splits(
        (train_data, dev_data, test_data),
        batch_sizes=(args.batch_size, len(dev_data), len(test_data)),
        **kargs)

    return train_iter, dev_iter, test_iter
def dataset():
    build_csv()
    data_dir = "/home/donchan/Documents/DATA/jigsaw"

    start_t = time()
    vec = vocab.Vectors('glove.6B.100d.txt',
                        '/home/donchan/Documents/DATA/glove_embedding/')

    TEXT = Field(sequential=True, tokenize=tokenizer2, lower=True)
    LABEL = Field(sequential=False, use_vocab=False)
    datafields = [
        ("id", None),  # we won't be needing the id, so we pass in None as the field
        ("comment_text", TEXT),
        ("toxic", LABEL),
        ("severe_toxic", LABEL),
        ("obscene", LABEL),
        ("threat", LABEL),
        ("insult", LABEL),
        ("identity_hate", LABEL)
    ]

    train, val = TabularDataset.splits(path=data_dir,
                                       train='traindf.csv',
                                       validation='valdf.csv',
                                       format='csv',
                                       skip_header=True,
                                       fields=datafields)
    print("train val length", len(train), len(val))
    # print(train[0].comment_text)
    # print(train[0].toxic, train[0].severe_toxic, train[0].threat,
    #       train[0].insult, train[0].identity_hate)

    TEXT.build_vocab(train, val, vectors=vec, min_freq=2)
    # LABEL.build_vocab(train, val)
    print("time to build vocab", (time() - start_t))
    print("length of vocabulary", len(TEXT.vocab), TEXT.vocab.vectors.shape)
    print("- " * 20)
    print("* most common words.")
    print(TEXT.vocab.freqs.most_common(20))

    return train, val, TEXT, LABEL
def init_workspace():
    if not os.path.exists(prodirectory):
        print("directory at " + prodirectory)
        os.makedirs(prodirectory)
    else:
        print("warning: directory already exists")

    global multi_classes
    multi_classes = [data.LabelField() for _ in range(3)]
    word_field = data.Field(tokenize=lambda x: x.split(','),
                            include_lengths=True, batch_first=True,
                            fix_length=MAX_SEQ_LEN)

    print("load torch data ")
    class_fields = [('w', word_field), ('cate1_id', multi_classes[0]),
                    ('cate2_id', multi_classes[1]),
                    ('cate3_id', multi_classes[2])]
    train = data.TabularDataset(TRAINFILE, 'tsv', skip_header=True,
                                fields=class_fields)
    valid = data.TabularDataset(VALFILE, 'tsv', skip_header=True,
                                fields=class_fields)
    test = data.TabularDataset(TESTFILE, 'tsv', skip_header=True,
                               fields=[('w', word_field)])

    # discretization
    word_field.build_vocab(train, valid, test)
    for cls in multi_classes:
        cls.build_vocab(train, valid)

    trainiter = data.BucketIterator(train, batch_size=BATCH_SIZE,
                                    sort_key=lambda x: len(x.w), shuffle=True)
    valiter = data.BucketIterator(valid, batch_size=BATCH_SIZE, shuffle=False)
    testiter = data.BucketIterator(test, batch_size=BATCH_SIZE, shuffle=False)

    vectors = vocab.Vectors(W2VFILE)
    print("Word2vec model Loaded")
    word_field.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)

    return word_field.vocab.vectors, trainiter, valiter, testiter
def from_pretrained_embedding(cls, embedding_file_path, inner_module_init_func,
                              cache_path="../data/cache"):
    word_embeddings = vocab.Vectors(embedding_file_path, cache=cache_path)
    # The padding row is appended after the pretrained rows, so its index
    # equals the number of pretrained words
    padding_idx, embedding_length = word_embeddings.vectors.shape
    padding_embedding = np.zeros(embedding_length)

    inner_module = inner_module_init_func(embedding_length)

    torch_embedding = torch.from_numpy(
        np.row_stack((word_embeddings.vectors, padding_embedding)))
    nn_embedding = nn.Embedding.from_pretrained(embeddings=torch_embedding,
                                                freeze=False,
                                                padding_idx=padding_idx)
    return cls(inner_module,
               pretrained_embedding=word_embeddings,
               nn_embedding=nn_embedding)
def rnn_iter(train_path, test_path, batchsize, TEXT, LABEL):
    train = RnnDataset(train_path, text_field=TEXT, label_field=LABEL, aug=1)
    test = RnnDataset(test_path, text_field=TEXT, label_field=None, aug=1)

    # Pass in the dataset used to build the vocabulary
    vectors = vocab.Vectors(name="wordvec.txt", cache="data")
    TEXT.build_vocab(test, vectors=vectors)
    weight_matrix = TEXT.vocab.vectors

    # Build iterators for the training and test sets at the same time
    train_iter, test_iter = data.BucketIterator.splits(
        (train, test),
        batch_sizes=(batchsize, batchsize),
        device=torch.device('cuda'),
        sort_key=lambda x: len(x.text),
        sort_within_batch=False)

    return train_iter, test_iter, weight_matrix