def load_dataset(config, train_pos='train.pos', train_neg='train.neg',
                 dev_pos='dev.pos', dev_neg='dev.neg',
                 test_pos='test.pos', test_neg='test.neg'):
    root = config.data_path
    roots = re.split(', +', root)
    if len(roots) > 1:
        logger.info("Combining datasets...")
        files = {'train.pos': [], 'train.neg': [], 'dev.pos': [],
                 'dev.neg': [], 'test.pos': [], 'test.neg': []}
        for dir_path in roots:
            for file in files.keys():
                with open(dir_path + file, 'r', encoding='utf8') as f:
                    files[file].extend(f.readlines())
        for file, sents in files.items():
            with open('./data/style_transfer/%s' % file, 'w', encoding='utf8') as f:
                for sent in sents:
                    f.write('%s' % sent)
        root = './data/style_transfer/'

    TEXT = data.Field(batch_first=True, eos_token='<eos>')

    dataset_fn = lambda name: data.TabularDataset(
        path=root + name,
        format='tsv',
        fields=[('text', TEXT)]
    )

    train_pos_set, train_neg_set = map(dataset_fn, [train_pos, train_neg])
    dev_pos_set, dev_neg_set = map(dataset_fn, [dev_pos, dev_neg])
    test_pos_set, test_neg_set = map(dataset_fn, [test_pos, test_neg])

    TEXT.build_vocab(train_pos_set, train_neg_set, min_freq=config.min_freq)

    if config.load_pretrained_embed:
        start = time.time()
        vectors = torchtext.vocab.GloVe('6B', dim=config.embed_size,
                                        cache=config.pretrained_embed_path)
        TEXT.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)
        print('vectors', TEXT.vocab.vectors.size())
        print('load embedding took {:.2f} s.'.format(time.time() - start))

    vocab = TEXT.vocab

    dataiter_fn = lambda dataset, train: data.BucketIterator(
        dataset=dataset,
        batch_size=config.batch_size,
        shuffle=train,
        repeat=train,
        sort_key=lambda x: len(x.text),
        sort_within_batch=False,
        device=config.device
    )

    train_pos_iter, train_neg_iter = map(lambda x: dataiter_fn(x, True), [train_pos_set, train_neg_set])
    dev_pos_iter, dev_neg_iter = map(lambda x: dataiter_fn(x, False), [dev_pos_set, dev_neg_set])
    test_pos_iter, test_neg_iter = map(lambda x: dataiter_fn(x, False), [test_pos_set, test_neg_set])

    train_iters = DatasetIterator(train_pos_iter, train_neg_iter)
    dev_iters = DatasetIterator(dev_pos_iter, dev_neg_iter)
    test_iters = DatasetIterator(test_pos_iter, test_neg_iter)

    return train_iters, dev_iters, test_iters, vocab
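# `DatasetIterator` is used above but not defined in this snippet. A minimal
# sketch of what such a wrapper might look like, assuming it simply pairs the
# positive and negative BucketIterators so each step yields (pos_batch, neg_batch).
# This is an illustrative assumption, not the original implementation.
class DatasetIterator(object):
    def __init__(self, pos_iter, neg_iter):
        self.pos_iter = pos_iter
        self.neg_iter = neg_iter

    def __iter__(self):
        # zip stops at the shorter of the two iterators
        return zip(iter(self.pos_iter), iter(self.neg_iter))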
    ('id', id_variable),           # we process this as id field
    ('tweet', text_variable),      # process it as text
    ('subtask_a', None),           # process it as label
    ('encoded_subtask_a', None)
]

test_fields = [
    ('id', id_variable),           # we process this as id field
    ('tweet', text_variable),      # process it as text
    ('subtask_a', None),           # process it as label
    ('encoded_subtask_a', None)
]

# Creating our train and test data
train_data = data.TabularDataset(path=os.path.join(TEMP_DIRECTORY, TRAIN_FILE),
                                 format='tsv', skip_header=True, fields=train_fields)
dev_data = data.TabularDataset(path=os.path.join(TEMP_DIRECTORY, DEV_FILE),
                               format='tsv', skip_header=True, fields=dev_fields)
test_data = data.TabularDataset(path=os.path.join(TEMP_DIRECTORY, TEST_FILE),
                                format='tsv', skip_header=True, fields=dev_fields)

vec = vocab.Vectors(DANISH_EMBEDDING_PATH, cache=VECTOR_CACHE)

dev_preds = np.zeros((len(dev_data), N_FOLD))
def get_dataset(fix_length=100, lower=False, vectors=None):
    if vectors is not None:
        lower = True
    LOGGER.debug('Preparing CSV files...')
    # prepare_csv(train, test)

    TEXT = data.Field(sequential=True, lower=True, include_lengths=True,
                      batch_first=True, fix_length=25)
    NUM_FEATURE = data.Field(use_vocab=False, sequential=False, dtype=torch.float16)
    KEYWORD = data.Field(use_vocab=True, sequential=True)
    LOCATION = data.Field(use_vocab=True, sequential=True)
    ID = data.Field(use_vocab=False, sequential=False, dtype=torch.float16)
    # LABEL = data.LabelField(dtype=torch.float)
    LABEL = data.Field(use_vocab=True, sequential=False, dtype=torch.float16)

    tv_datafields = [
        ("id", None),  # we won't be needing the id, so we pass in None as the field
        ("keyword", None),
        ("location", None),
        ("text", TEXT),
        ("word_count", NUM_FEATURE),
        ("char_count", NUM_FEATURE),
        ("stop_word_count", NUM_FEATURE),
        ("punctuation_count", NUM_FEATURE),
        ("mention_count", NUM_FEATURE),
        ("hashtag_count", NUM_FEATURE),
        ("target", LABEL)]

    LOGGER.debug('Reading train csv files...')
    train_temp, val_temp = data.TabularDataset.splits(
        path='data/', format='csv', skip_header=True,
        train='train_train.csv', validation='val_val.csv',
        fields=tv_datafields
    )

    LOGGER.debug('Reading test csv file...')
    test_temp = data.TabularDataset(
        path='data/prepared_df_test.csv',
        format='csv', skip_header=True,
        fields=tv_datafields[:-1]
    )

    LOGGER.debug('Building vocabulary...')
    MAX_VOCAB_SIZE = 25000
    # TODO: check whether this leaks information, since the vocabulary
    #       is also built from the validation and test sets.
    TEXT.build_vocab(
        train_temp, val_temp, test_temp,
        max_size=MAX_VOCAB_SIZE,
        min_freq=10,
        vectors=GloVe(name='6B', dim=300)  # we use it to get the vocabulary of words
    )
    LABEL.build_vocab(train_temp)
    # KEYWORD.build_vocab(
    #     train_temp, val_temp, test_temp,
    #     max_size=MAX_VOCAB_SIZE,
    # )
    # LOCATION.build_vocab(
    #     train_temp, val_temp, test_temp,
    #     max_size=MAX_VOCAB_SIZE,
    # )

    word_embeddings = TEXT.vocab.vectors
    vocab_size = len(TEXT.vocab)

    train_iter = get_iterator(train_temp, batch_size=32, train=True, shuffle=True, repeat=False)
    val_iter = get_iterator(val_temp, batch_size=32, train=True, shuffle=True, repeat=False)
    test_iter = get_iterator(test_temp, batch_size=32, train=False, shuffle=False, repeat=False)

    LOGGER.debug('Done preparing the datasets')
    return TEXT, vocab_size, word_embeddings, train_iter, val_iter, test_iter
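# `get_iterator` is called above but not defined in this snippet. A minimal
# sketch, assuming it is a thin wrapper around data.BucketIterator; the real
# helper may differ (e.g. in device handling or sorting behaviour).
def get_iterator(dataset, batch_size, train=True, shuffle=True, repeat=False):
    return data.BucketIterator(
        dataset,
        batch_size=batch_size,
        train=train,
        shuffle=shuffle,
        repeat=repeat,
        sort_key=lambda x: len(x.text),   # group examples of similar length
        sort_within_batch=False,
    )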
# Prepare data
text_field = data.Field(
    # tokenize=apply_preprocessing,
    lower=True)
label_field = data.Field(sequential=False, use_vocab=False, is_target=True)

print("Creating TabularDatasets for training ({}) and validation ({})...".format(
    SPLIT_RATIO, 1.0 - SPLIT_RATIO))
trainds, valds = data.TabularDataset(
    path=data_file_path,
    format='csv',
    csv_reader_params={'delimiter': '|'},
    fields=[
        ('', None),  # ('Unnamed: 0', None),
        ('anon_id', None),
        ('text', text_field),
        ('label', label_field)
    ],
    skip_header=True).split(split_ratio=SPLIT_RATIO)

print("Loading vocab from embedding file: {}".format(embedding_file_path))

# Load/prepare pre-trained embedding vectors (FastText)
vectors = vocab.Vectors(name=embedding_file_path)
text_field.build_vocab(trainds, valds, vectors=vectors)
print("Vocab size: {}".format(len(text_field.vocab)))
def __init__(
    self,
    train_fn,
    batch_size=64,
    valid_ratio=.2,
    device=-1,
    max_vocab=999999,
    min_freq=1,
    use_eos=False,
    shuffle=True,
):
    '''
    DataLoader initialization.
    :param train_fn: Train-set filename
    :param batch_size: Batchify data for a certain batch size.
    :param device: Device-id to load data (-1 for CPU)
    :param max_vocab: Maximum vocabulary size
    :param min_freq: Minimum frequency for a word to be kept.
    :param use_eos: If True, append <EOS> to the end of every sentence.
    :param shuffle: If True, randomly shuffle the input data.
    '''
    super().__init__()

    # Define the fields of the input file.
    # The input file consists of two fields.
    self.label = data.Field(sequential=False,
                            use_vocab=True,
                            unk_token=None)
    self.text = data.Field(
        use_vocab=True,
        batch_first=True,
        include_lengths=False,
        eos_token='<EOS>' if use_eos else None,
    )

    # The two columns are delimited by TAB, so we use TabularDataset to load
    # them from the input file and then split it into train and validation sets.
    # Each row consists of two columns: the label field and the text field.
    train, valid = data.TabularDataset(
        path=train_fn,
        format='tsv',
        fields=[
            ('label', self.label),
            ('text', self.text),
        ],
    ).split(split_ratio=(1 - valid_ratio))

    # The loaded datasets are fed into the train and valid iterators.
    # We sort input sentences by length, to group similar lengths together.
    self.train_loader, self.valid_loader = data.BucketIterator.splits(
        (train, valid),
        batch_size=batch_size,
        device='cuda:%d' % device if device >= 0 else 'cpu',
        shuffle=shuffle,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
    )

    # Finally, build vocabularies for the label and text fields.
    # This creates the mapping table between words and indices.
    self.label.build_vocab(train)
    self.text.build_vocab(train, max_size=max_vocab, min_freq=min_freq)
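# A minimal usage sketch, assuming the enclosing class is called `DataLoader`
# and the TSV file has "<label>\t<text>" rows; the class name and the path
# 'train.tsv' are hypothetical, only the attribute names follow the __init__ above.
loader = DataLoader('train.tsv', batch_size=128, device=-1)
print('|vocab| =', len(loader.text.vocab), '|labels| =', len(loader.label.vocab))
for batch in loader.train_loader:
    x, y = batch.text, batch.label   # x: (batch, length), y: (batch,)
    break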
cudnn.benchmark = True  # fire on all cylinders

# go through rigamaroo to do ..utils.display_results import show_performance
if __package__ is None:
    import sys
    from os import path
    sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
    from utils.display_results import get_performance

# ============================ 20 Newsgroups ============================ #
TEXT_20ng = data.Field(pad_first=True, lower=True, fix_length=100)
LABEL_20ng = data.Field(sequential=False)

train_20ng = data.TabularDataset(path='./.data/20newsgroups/20ng-train.txt',
                                 format='csv',
                                 fields=[('label', LABEL_20ng), ('text', TEXT_20ng)])
test_20ng = data.TabularDataset(path='./.data/20newsgroups/20ng-test.txt',
                                format='csv',
                                fields=[('label', LABEL_20ng), ('text', TEXT_20ng)])

TEXT_20ng.build_vocab(train_20ng, max_size=10000)
LABEL_20ng.build_vocab(train_20ng, max_size=10000)
print('vocab length (including special tokens):', len(TEXT_20ng.vocab))

train_iter_20ng = data.BucketIterator(train_20ng, batch_size=args.batch_size, repeat=False)
test_iter_20ng = data.BucketIterator(test_20ng,
def main(): print("Using device: {}" "\n".format(str(device))) # Load the training dataset, and create a dataloader to generate a batch. textField = data.Field(lower=True, include_lengths=True, batch_first=True, tokenize=student.tokenise, preprocessing=student.preprocessing, postprocessing=student.postprocessing, stop_words=student.stopWords) labelField = data.Field(sequential=False, use_vocab=False, is_target=True) dataset = data.TabularDataset( 'train.json', 'json', { 'reviewText': ('reviewText', textField), 'rating': ('rating', labelField), 'businessCategory': ('businessCategory', labelField) }) textField.build_vocab(dataset, vectors=student.wordVectors) #print(len(textField.vocab)) # Allow training on the entire dataset, or split it for training and validation. if student.trainValSplit == 1: trainLoader = data.BucketIterator(dataset, shuffle=True, batch_size=student.batchSize, sort_key=lambda x: len(x.reviewText), sort_within_batch=True) else: train, validate = dataset.split(split_ratio=student.trainValSplit) trainLoader, valLoader = data.BucketIterator.splits( (train, validate), shuffle=True, batch_size=student.batchSize, sort_key=lambda x: len(x.reviewText), sort_within_batch=True) # Get model and optimiser from student. net = student.net.to(device) lossFunc = student.lossFunc optimiser = student.optimiser # Train. for epoch in range(student.epochs): runningLoss = 0 for i, batch in enumerate(trainLoader): # Get a batch and potentially send it to GPU memory. inputs = textField.vocab.vectors[batch.reviewText[0]].to(device) length = batch.reviewText[1].to(device) rating = batch.rating.to(device) businessCategory = batch.businessCategory.to(device) # PyTorch calculates gradients by accumulating contributions to them # (useful for RNNs). Hence we must manually set them to zero before # calculating them. optimiser.zero_grad() # Forward pass through the network. ratingOutput, categoryOutput = net(inputs, length) loss = lossFunc(ratingOutput, categoryOutput, rating, businessCategory) # Calculate gradients. loss.backward() # Minimise the loss according to the gradient. optimiser.step() runningLoss += loss.item() if i % 32 == 31: print("Epoch: %2d, Batch: %4d, Loss: %.3f" % (epoch + 1, i + 1, runningLoss / 32)) runningLoss = 0 # Save model. torch.save(net.state_dict(), 'savedModel.pth') print("\n" "Model saved to savedModel.pth") # Test on validation data if it exists. if student.trainValSplit != 1: net.eval() correctRatingOnlySum = 0 correctCategoryOnlySum = 0 bothCorrectSum = 0 with torch.no_grad(): for batch in valLoader: # Get a batch and potentially send it to GPU memory. inputs = textField.vocab.vectors[batch.reviewText[0]].to( device) length = batch.reviewText[1].to(device) rating = batch.rating.to(device) businessCategory = batch.businessCategory.to(device) # Convert network output to integer values. 
ratingOutputs, categoryOutputs = student.convertNetOutput( *net(inputs, length)) # Calculate performance #print("rating = ", rating) #print("rating outputs = ", ratingOutputs) #print("category = ", businessCategory) #print("category_outputs = ", categoryOutputs) correctRating = rating == ratingOutputs.flatten() correctCategory = businessCategory == categoryOutputs.flatten() correctRatingOnlySum += torch.sum(correctRating & ~correctCategory).item() correctCategoryOnlySum += torch.sum(correctCategory & ~correctRating).item() bothCorrectSum += torch.sum(correctRating & correctCategory).item() correctRatingOnlyPercent = correctRatingOnlySum / len(validate) correctCategoryOnlyPercent = correctCategoryOnlySum / len(validate) bothCorrectPercent = bothCorrectSum / len(validate) neitherCorrectPer = 1 - correctRatingOnlyPercent \ - correctCategoryOnlyPercent \ - bothCorrectPercent score = 100 * (bothCorrectPercent + 0.5 * correctCategoryOnlyPercent + 0.1 * correctRatingOnlyPercent) print("\n" "Rating incorrect, business category incorrect: {:.2%}\n" "Rating correct, business category incorrect: {:.2%}\n" "Rating incorrect, business category correct: {:.2%}\n" "Rating correct, business category correct: {:.2%}\n" "\n" "Weighted score: {:.2f}".format(neitherCorrectPer, correctRatingOnlyPercent, correctCategoryOnlyPercent, bothCorrectPercent, score))
def create_dataset(opt, SRC, TRG): print("creating dataset and iterator... ") if opt.task == 'toy_task' or opt.task == 'e_snli_o': # Load in validation data f_in, f_out = open(opt.data_path + '/val_in.txt', 'r', encoding='utf-8'), open(opt.data_path + '/val_out.txt', 'r', encoding='utf-8') in_ = [x.replace('\n', '') for x in f_in.readlines()] out_ = [x.replace('\n', '') for x in f_out.readlines()] raw_data = {'src': in_, 'trg': out_} df = pd.DataFrame(raw_data, columns=["src", "trg"]) mask = (df['src'].str.count(' ') < opt.max_strlen) & (df['trg'].str.count(' ') < opt.max_strlen) df = df.loc[mask] df.to_csv("translate_transformer_temp.csv", index=False) data_fields = [('src', SRC), ('trg', TRG)] val = data.TabularDataset('./translate_transformer_temp.csv', format='csv', fields=data_fields, skip_header=True) os.remove('translate_transformer_temp.csv') val_iter = MyIterator(val, batch_size=opt.batchsize, repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)), train=False, shuffle=False) elif opt.task == 'e_snli_r': # Load in validation data f_in, f_out = open(opt.data_path + '/val_in.txt', 'r', encoding='utf-8'), open(opt.data_path + '/val_out.txt', 'r', encoding='utf-8') if opt.label_path is None: raise AssertionError( 'Need to provide a path to label data for validation checks') f_label = open(opt.label_path + '/val_out.txt', 'r', encoding='utf-8') in_ = [x.replace('\n', '') for x in f_in.readlines()] out_ = [x.replace('\n', '') for x in f_out.readlines()] labels_ = [x.replace('\n', '') for x in f_label.readlines()] out1, out2, out3 = [], [], [] for o in out_: split = o.split(' @@SEP@@ ') out1.append(split[0]) out2.append(split[1]) out3.append(split[2]) raw_data = { 'src': in_, 'trg1': out1, 'trg2': out2, 'trg3': out3, 'labels': labels_ } df = pd.DataFrame(raw_data, columns=["src", "trg1", "trg2", "trg3", "labels"]) mask = (df['src'].str.count(' ') < opt.max_strlen) & (df['trg1'].str.count(' ') < opt.max_strlen) & \ (df['trg2'].str.count(' ') < opt.max_strlen) & (df['trg3'].str.count(' ') < opt.max_strlen) df = df.loc[mask] df.to_csv("translate_transformer_temp.csv", index=False) data_fields = [('src', SRC), ('trg1', TRG), ('trg2', TRG), ('trg3', TRG), ('label', opt.classifier_TRG)] val = data.TabularDataset('./translate_transformer_temp.csv', format='csv', fields=data_fields, skip_header=True) os.remove('translate_transformer_temp.csv') val_iter = MyIterator( val, batch_size=opt.batchsize, repeat=False, sort_key=lambda x: (len(x.src), len(x.trg1), len(x.trg2), len(x.trg3)), train=False, shuffle=False) else: # cos_e raise NotImplementedError( "No implementation provided in process.py for cos-e (yet)") ##### TRAIN DATA ##### raw_data = { 'src': [line for line in opt.src_data], 'trg': [line for line in opt.trg_data] } df = pd.DataFrame(raw_data, columns=["src", "trg"]) mask = (df['src'].str.count(' ') < opt.max_strlen) & (df['trg'].str.count(' ') < opt.max_strlen) df = df.loc[mask] df.to_csv("translate_transformer_temp.csv", index=False) data_fields = [('src', SRC), ('trg', TRG)] train = data.TabularDataset('./translate_transformer_temp.csv', format='csv', fields=data_fields, skip_header=True) print('desired batch size', opt.batchsize) train_iter = MyIterator( train, batch_size=opt.batchsize, # device=opt.device, repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)), train=True, shuffle=True) os.remove('translate_transformer_temp.csv') if opt.load_weights is None: if opt.checkpoint > 0: try: os.mkdir("weights") except: print( "weights folder already exists, run program with 
-load_weights weights to load them" ) quit() pickle.dump(SRC, open('weights/SRC.pkl', 'wb')) pickle.dump(TRG, open('weights/TRG.pkl', 'wb')) opt.src_pad = SRC.vocab.stoi['<pad>'] opt.trg_pad = TRG.vocab.stoi['<pad>'] opt.train_len = get_len(train_iter) print('number of train batches:', opt.train_len) print('number of val batches:', get_len(val_iter)) return train_iter, val_iter
def load_data_pair_task(path_file_data, name_file_train, name_file_test=None, device_set="cuda:0", min_freq_word=1, min_freq_char=1, batch_size=2, cache_folder=None, name_vocab=None, path_vocab_pre_built=None): inputs_word_query = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True, include_lengths=True) inputs_char_query_nesting = data.Field(tokenize=list, init_token="<bos>", eos_token="<eos>", batch_first=True) inputs_char_query = data.NestedField(inputs_char_query_nesting, init_token="<bos>", eos_token="<eos>") inputs_word_document = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True, include_lengths=True) inputs_char_document_nesting = data.Field(tokenize=list, init_token="<bos>", eos_token="<eos>", batch_first=True) inputs_char_document = data.NestedField(inputs_char_document_nesting, init_token="<bos>", eos_token="<eos>") labels = data.LabelField(sequential=False) fields = ([(('inputs_word_query', 'inputs_char_query'), (inputs_word_query, inputs_char_query)), (('inputs_word_document', 'inputs_char_document'), (inputs_word_document, inputs_char_document)), ('labels', labels)]) if name_file_test is not None: train, test = data.TabularDataset.splits(path=path_file_data, train=name_file_train, test=name_file_test, fields=tuple(fields), format='csv', skip_header=False, csv_reader_params={ 'delimiter': '\t', 'quoting': 3 }) if path_vocab_pre_built is None: if cache_folder is not None and name_vocab is not None: inputs_word_document.build_vocab( train.inputs_word_document, test.inputs_word_document, min_freq=min_freq_word, vectors=[MyPretrainedVector(name_vocab, cache_folder)]) else: inputs_word_document.build_vocab(train.inputs_word_document, test.inputs_word_document, min_freq=min_freq_word) inputs_char_document.build_vocab(train.inputs_char_document, test.inputs_char_document, min_freq=min_freq_char) inputs_word_query.vocab = inputs_word_document.vocab inputs_char_query.vocab = inputs_char_query_nesting.vocab = \ inputs_char_document_nesting.vocab = inputs_char_document.vocab labels.build_vocab(train.labels) else: vocabs = torch.load(path_vocab_pre_built) inputs_word_document.vocab = inputs_word_query.vocab = vocabs[0] inputs_char_document.vocab = inputs_char_query.vocab = \ inputs_char_document_nesting.vocab = inputs_char_query_nesting.vocab = vocabs[1] labels.vocab = vocabs[2] train_iter, test_iter = data.BucketIterator.splits( datasets=(train, test), batch_size=batch_size, shuffle=True, sort=False, device=torch.device( device_set if torch.cuda.is_available() else "cpu")) dict_return = { 'iters': (train_iter, test_iter), 'vocabs': (inputs_word_document.vocab, inputs_char_document.vocab, labels.vocab) } else: path_file_data_train = os.path.join(path_file_data, name_file_train) train = data.TabularDataset(path_file_data_train, fields=tuple(fields), format='csv', skip_header=True, csv_reader_params={ 'delimiter': '\t', 'quoting': 3 }) if path_vocab_pre_built is None: if cache_folder is not None and name_vocab is not None: inputs_word_document.build_vocab( train.inputs_word_document, min_freq=min_freq_word, vectors=[MyPretrainedVector(name_vocab, cache_folder)]) else: inputs_word_document.build_vocab(train.inputs_word_document, min_freq=min_freq_word) inputs_char_document.build_vocab(train.inputs_char_document, min_freq=min_freq_char) inputs_word_query.vocab = inputs_word_document.vocab inputs_char_query.vocab = inputs_char_query_nesting.vocab = \ inputs_char_document_nesting.vocab = inputs_char_document.vocab labels.build_vocab(train.labels) else: vocabs = 
torch.load(path_vocab_pre_built) inputs_word_document.vocab = inputs_word_query.vocab = vocabs[0] inputs_char_document.vocab = inputs_char_query.vocab = \ inputs_char_document_nesting.vocab = inputs_char_query_nesting.vocab = vocabs[1] labels.vocab = vocabs[2] train_iter = data.BucketIterator( train, batch_size=batch_size, shuffle=True, sort=False, device=torch.device( device_set if torch.cuda.is_available() else "cpu")) dict_return = { 'iters': [train_iter], 'vocabs': (inputs_word_document.vocab, inputs_char_document.vocab, labels.vocab) } return dict_return
def train():
    INPUTS_DIR = os.getenv('VH_INPUTS_DIR', '/valohai/inputs/')  # ,'/stockage/Research_Team_Ressources/valohai_test/')
    dataset_path = get_first_file(os.path.join(INPUTS_DIR, 'dataset'))
    word_vectors_path = get_first_file((os.path.join(INPUTS_DIR, 'word_vectors')))
    try:
        with open(word_vectors_path, 'rb') as my_pickle:
            TEXT = pickle.load(my_pickle)
    except IOError:
        print("IOError")
        pass

    LABEL = data.Field(sequential=False, preprocessing=custom_preprocess_label, use_vocab=False)

    dataset = data.TabularDataset(path=dataset_path,
                                  format='csv',
                                  fields=[('Num', None), ('Label', LABEL), ('id', None),
                                          ('date', None), ('flag', None), ('user', None),
                                          ('Text', TEXT)],
                                  skip_header=True)

    nb_train = 1000000
    ratio_train = nb_train / len(dataset)
    nb_test = 500000
    ratio_test = nb_test / len(dataset)
    ratio_other = 1 - ratio_train - ratio_test

    train_dataset, other_dataset, test_dataset = dataset.split(
        split_ratio=[ratio_train, ratio_test, ratio_other])

    train_iter, test_iter = BucketIterator.splits(
        (train_dataset, test_dataset),  # we pass in the datasets we want the iterator to draw data from
        batch_sizes=(FLAGS.batch_size, FLAGS.batch_size),
        device=num_device,  # if you want to use the GPU, specify the GPU number here
        sort_key=lambda x: len(x.Text),  # the BucketIterator needs to be told what function to use to group the data
        sort_within_batch=False,
        repeat=False  # we pass repeat=False because we want to wrap this Iterator layer.
    )

    n_vocab = len(TEXT.vocab)
    model = ConvNet(n_vocab, embed_size=FLAGS.embedding_size, num_classes=2).to(device)

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=FLAGS.learning_rate)
    criterion.to(device)

    num_epoch = FLAGS.epochs
    for epoch in range(num_epoch):
        print("epoch : ", epoch)
        model.train()
        print(run_epoch(train_iter, model, criterion, TEXT, optimizer))
        model.eval()
        print(run_epoch(test_iter, model, criterion, TEXT, None))
    model.eval()
    print(run_epoch(test_iter, model, criterion, TEXT, None))

    # Saving weights and biases as outputs of the task.
    outputs_dir = os.getenv('VH_OUTPUTS_DIR', '/valohai/outputs/')
    filename = os.path.join(outputs_dir, 'model.pth')
    torch.save(model, filename)
    filename_text = os.path.join(outputs_dir, 'text.pickle')
    # pickle.dump expects an open file object, not a path string
    with open(filename_text, 'wb') as f:
        pickle.dump(TEXT, f)
def caption_iterator(start_token, end_token, pad_token, train_meta_path, val_1_meta_path, val_2_meta_path, min_freq, batch_size, device, phase, use_categories, use_subs): spacy_en = spacy.load('en') print(f'Preparing dataset for {phase}') def tokenize_en(txt): return [token.text for token in spacy_en.tokenizer(txt)] CAPTION = data.ReversibleField( tokenize='spacy', init_token=start_token, eos_token=end_token, pad_token=pad_token, lower=True, batch_first=True, is_target=True ) INDEX = data.Field( sequential=False, use_vocab=False, batch_first=True ) if use_categories: # preprocessing: if there is no category replace with -1 (unique number) CATEGORY = data.Field( sequential=False, use_vocab=False, batch_first=True, preprocessing=data.Pipeline(lambda x: -1 if len(x) == 0 else int(float(x))) ) # filter the dataset if the a category is missing (31 -> 41 (count = 1 :())) filter_pred = lambda x: vars(x)['category_32'] != -1 and vars(x)['category_32'] != 31 else: CATEGORY = None filter_pred = None if use_subs: SUBS = data.ReversibleField( tokenize='spacy', init_token=start_token, eos_token=end_token, pad_token=pad_token, lower=True, batch_first=True ) else: SUBS = None # the order has to be the same as in the table fields = [ ('video_id', None), ('caption', CAPTION), ('start', None), ('end', None), ('duration', None), ('category_32', CATEGORY), ('subs', SUBS), ('phase', None), ('idx', INDEX), ] dataset = data.TabularDataset( path=train_meta_path, format='tsv', skip_header=True, fields=fields, filter_pred=filter_pred ) CAPTION.build_vocab(dataset.caption, min_freq=min_freq) train_vocab = CAPTION.vocab train_subs_vocab = None if use_subs: SUBS.build_vocab(dataset.subs, min_freq=min_freq) train_subs_vocab = SUBS.vocab if phase == 'val_1': dataset = data.TabularDataset( path=val_1_meta_path, format='tsv', skip_header=True, fields=fields, filter_pred=filter_pred ) elif phase == 'val_2': dataset = data.TabularDataset( path=val_2_meta_path, format='tsv', skip_header=True, fields=fields, filter_pred=filter_pred ) # sort_key = lambda x: data.interleave_keys(len(x.caption), len(x.caption)) sort_key = lambda x: 0 #len(x.caption) datasetloader = data.BucketIterator( dataset, batch_size, sort_key=sort_key, device=device, repeat=False, shuffle=True ) return train_vocab, train_subs_vocab, datasetloader
def tokenizer(text):
    return [token.text for token in nlp.tokenizer(text)]


qid = None
text_field = data.Field(sequential=True, tokenize=tokenizer, lower=True)
target_field = data.Field(sequential=False, use_vocab=False, is_target=True, dtype=torch.long)

df = data.TabularDataset(path=PATH_TO_TRAINING_DATA, format='CSV',
                         fields=[('qid', qid), ('question_text', text_field),
                                 ('target', target_field)],
                         skip_header=True)
df_test = data.TabularDataset(path=PATH_TO_TEST_DATA, format='CSV',
                              fields=[('qid', qid), ('question_text', text_field)],
                              skip_header=True)

vec = vocab.Vectors(PATH_TO_EMB_FILE)
text_field.build_vocab(df, df_test, vectors=vec)

train, val = df.split(split_ratio=[0.8, 0.2])

train_dl, val_dl = data.Iterator.splits(
    (train, val),
def data_loader(opt): TEXT = data.Field(sequential=True, tokenize=tokenizer, batch_first=True, pad_first=False, lower=True, include_lengths=False, pad_token='<pad>', fix_length=opt.seq_len ) LABEL = data.Field(sequential=False, unk_token=None) fields = [ ('APP_ID', None), ('LABEL', LABEL), ('ACTION', TEXT)] # read datasets print('reading data ...') train = data.TabularDataset( path=DirConfig.train_path, format='tsv', skip_header=True, fields=fields) dev = data.TabularDataset( path=DirConfig.dev_path, format='tsv', skip_header=True, fields=fields) test = data.TabularDataset( path=DirConfig.test_path, format='tsv', skip_header=True, fields=fields) TEXT.build_vocab(train, dev, test) LABEL.build_vocab(train) print('ACTION:') print('\tvocab size:', len(TEXT.vocab)) print('LABEL:') print('\tvocab size:', len(LABEL.vocab)) print('\t', LABEL.vocab.stoi.items()) print('\t', LABEL.vocab.itos) print('\tDataset:') print('\t# Train:', len(train.examples)) print('\t\tLABEL:', train.examples[0].LABEL) print('\t\tACTION:', train.examples[0].ACTION) print('\t# Dev:', len(dev.examples)) print('\t\tLABEL:', dev.examples[0].LABEL) print('\t\tACTION:', dev.examples[0].ACTION) print('\t# Test:', len(test.examples)) print('\t\tLABEL:', test.examples[0].LABEL) print('\t\tACTION:', test.examples[0].ACTION) print('=========================') batch = data.BucketIterator.splits(datasets=[train, dev, test], batch_sizes=[opt.batch_size] * 3, sort_key=lambda x: len(x.ACTION), device=opt.device, sort_within_batch=True, repeat=False) batch = [list(b) for b in batch] return batch, TEXT.vocab
if not args.cuda:
    args.gpu = -1
if torch.cuda.is_available() and args.cuda:
    print("Note: You are using GPU for training")
    torch.cuda.set_device(args.gpu)
    torch.cuda.manual_seed(args.seed)
if torch.cuda.is_available() and not args.cuda:
    print("Warning: CUDA is available but not in use. You are using the CPU for training.")

# Set up the data for training
TEXT = data.Field(lower=True)
ED = data.Field()
train = data.TabularDataset(path=os.path.join(args.output, 'dete_train.txt'),
                            format='tsv',
                            fields=[('text', TEXT), ('ed', ED)])
field = [('id', None), ('sub', None), ('entity', None), ('relation', None),
         ('obj', None), ('text', TEXT), ('ed', ED)]
dev, test = data.TabularDataset.splits(path=args.output,
                                       validation='valid.txt',
                                       test='test.txt',
                                       format='tsv',
                                       fields=field)
TEXT.build_vocab(train, dev, test)
ED.build_vocab(train, dev)

match_embedding = 0
if os.path.isfile(args.vector_cache):
    stoi, vectors, dim = torch.load(args.vector_cache)
    TEXT.vocab.vectors = torch.Tensor(len(TEXT.vocab), dim)
LABEL = data.LabelField()

SEED = 1234
MAX_VOCAB_SIZE = 25_000
BATCH_SIZE = 64
EMBEDDING_DIM = 50
N_FILTERS = 50
FILTER_SIZES = [1]
DROPOUT = 0.5
N_EPOCHS = 100

torch.manual_seed(SEED)

fields = {'question': ('text', TEXT), 'name': ('label', LABEL)}

train_data = data.TabularDataset(
    path=os.path.join(os.path.dirname(__file__), 'eva_nlp_training.json'),
    format='json',
    fields=fields)

TEXT.build_vocab(train_data,
                 max_size=MAX_VOCAB_SIZE,
                 vectors="glove.6B.50d",
                 unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = len(LABEL.vocab)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

train_iterator = data.BucketIterator(
("threat", LABEL), ("obscene", LABEL), ("insult", LABEL), ("identity_hate", LABEL) ] test_datafields = [ ("id", None), # we won't be needing the id, so we pass in None as the field ("comment_text", TEXT) ] SEED = 1 BATCH_SIZE = 64 data_dir = '/media/feng/storage/Downloads/jigsaw' train_data = data.TabularDataset(path=os.path.join(data_dir, 'train.csv'), format='csv', skip_header=True, fields=trainval_datafields) # valid_data = data.TabularDataset(path=os.path.join(data_dir, 'train.csv'), # format='csv', skip_header=True, fields=tv_datafields) train_data, valid_data = train_data.split(split_ratio=0.8, stratified=False, strata_field='toxic', random_state=random.seed(SEED)) test_data = data.TabularDataset( path=os.path.join(data_dir, "test.csv"), # the file path format='csv', skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data! fields=test_datafields)
def convert_text_to_idx(args): path_text = os.path.join(cwd, args.data_dir, args.text_file) path_out = os.path.join(cwd, args.data_dir, args.idx_file) if args.write_data_idx: f_out = open(path_out, 'w') ## --------------------------------------- ## ## -- Tokenize by BERT-- ## """ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') data = [] with open(path_text) as f: lines = f.read().split('\n') aa = lines[2].split('\t')[-1] pdb.set_trace() data = [tokenizer.tokenize(line) for line in lines] pdb.set_trace() """ ## --Tokenize by torchtext-- ## """ tokenize = lambda x: x.split() TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=30) datafields = [('eid', None),('idxP',None),('idxC',None),('MaxDegree',None),('MaxL',None),('text', TEXT)] train_data = data.TabularDataset(path=path_text, format='tsv', skip_header=False, fields=datafields) TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300)) table = build_loopup_table(TEXT.vocab.freqs.most_common(5000)) """ ## --Build table by idf-- ## tokenize = lambda x: x TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=30) datafields = [('eid', None), ('idxP', None), ('idxC', None), ('MaxDegree', None), ('MaxL', None), ('interval', None), ('text', TEXT)] train_data = data.TabularDataset(path=path_text, format='tsv', skip_header=False, fields=datafields) corpus = [] for dd in train_data: dd_text = dd.text corpus.append(dd_text) vectorizer = TfidfVectorizer(token_pattern=r'\S+') X = vectorizer.fit_transform(corpus) indices = np.argsort(vectorizer.idf_) # sort from large to small by IDF feature_names = vectorizer.get_feature_names() top_n = 5000 # Find the top n words by IDF top_features = [feature_names[i] for i in indices[:top_n]] table = {} idx = 0 for feature in top_features: table[feature] = idx idx += 1 ## ======================================== ## #with open(os.path.join(cwd, 'data/{}/dict_0.json'.format(args.data_dir.split('/')[-1])), 'w') as f_dict: # f_dict.write(json.dumps(table)) # Save the vocabulary dictionary with open(path_text, 'r') as f: raw_lines = f.read().rstrip().split('\n') print('Writing idx:count data file') cnt = 0 for line in tqdm(raw_lines): text = line.split('\t')[-1] idx_count = text_to_idx_count(text, table) temp = line.split('\t')[:-1] temp.append(idx_count) new_line = '\t'.join(temp) cnt += 1 if args.write_data_idx: if cnt == len(raw_lines): f_out.write('{}'.format(new_line)) else: f_out.write('{}\n'.format(new_line)) if args.write_data_idx: f_out.close()
if not args.save_dir.is_dir():
    args.save_dir.mkdir()

# creating dated path for saving updated datasets later
if not (args.path/args.now).is_dir():
    (args.path/args.now).mkdir()

# creating dataframes
print('\nCreating DataFrames ... \n')
train_df = pd.read_csv(args.path/'train.csv', header=None, names=args.names)
valid_df = pd.read_csv(args.path/'val.csv', header=None, names=args.names)
test_df = pd.read_csv(args.path/'test.csv', header=None, names=args.names)
test_df = helpers.check_batch_size(test_df, len(test_df['text']), args)

# copying validation set to new dated path
print('Copying validation set to time specific folder. \n')
valid_df.to_csv(args.path/args.now/'val.csv', index=False, header=False)

# creating datasets
train_ds = data.TabularDataset(path=args.path/'train.csv', format='csv',
                               fields=args.datafields)
label_field.build_vocab(train_ds)
args.class_num = len(label_field.vocab) - 1

# creating DataBunch objects for language modelling and classification
print('\nCreating DataBunch objects...')
data_lm = TextLMDataBunch.from_df(args.path, train_df=train_df, valid_df=valid_df,
                                  test_df=test_df, text_cols=0, label_cols=1)
data_clas = TextClasDataBunch.from_df(args.path, train_df=train_df, valid_df=valid_df,
                                      test_df=test_df, text_cols=0, label_cols=1,
                                      vocab=data_lm.train_ds.vocab, bs=args.bs)

# fine-tuning language model
print('\nFine-tuning language model ...')
helpers.language_model(data_lm, args)

# creating a classifier
print('\nTraining classifier ...')
model = helpers.classifier(data_clas, args)
def main():
    # if GPU is available, use GPU
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("Use " + str(device))

    # Load the training dataset, and create a dataloader to generate a batch.
    # Lowercasing and length tracking are handled automatically.
    textField = data.Field(
        lower=True,
        include_lengths=True,
        batch_first=True,
        preprocessing=preprocessing,    # word-level preprocessing, e.g. normalising past-tense forms
        postprocessing=postprocessing,
        stop_words=get_stopwords())     # drop every word in the stop-word list
    labelField = data.Field(sequential=False, use_vocab=False, is_target=True)

    dataset = data.TabularDataset('train.csv', 'csv', {
        'text': ('text', textField),
        'target': ('target', labelField)
    })

    # convert the data to vectors, using the textField defined above
    textField.build_vocab(dataset, vectors=config.wordVectors)

    # split the dataset into training and validation sets
    train_dataset, validate_dataset = dataset.split(
        split_ratio=config.proportion_of_val_dataset,
        stratified=True,
        strata_field='target')

    train_loader, val_loader = data.BucketIterator.splits(
        (train_dataset, validate_dataset),
        shuffle=True,
        batch_size=config.batchSize,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True)

    net = get_model(config.dim, config.from_old_model, config.model_path).to(device)
    criterion = config.criterion
    params = net.parameters()

    # create optimizer
    if config.optimizer_name == "SGD":
        optimizer = toptim.SGD(params, lr=config.learning_rate)
    elif config.optimizer_name == "Adam":
        optimizer = toptim.Adam(params, lr=config.learning_rate)
    elif config.optimizer_name == "AdamW":
        optimizer = AdamW(params, lr=config.learning_rate, weight_decay=1e-6)

    # mixed-precision speed-up
    if config.use_apex:
        net, optimizer = amp.initialize(net, optimizer, opt_level="O1")

    train_start = time.time()

    for epoch in range(config.epochs):
        '''
        # change lr by epoch
        adjust_learning_rate(optimizer, epoch)
        '''
        # start train
        train(net, train_loader, config.criterion, optimizer, epoch, device, log, textField)
        # start val
        val(net, val_loader, config.criterion, optimizer, epoch, device, log, train_start, textField)

    print("Final saved model is epoch " + str(best_val_acc[0]) + ", acc: " + str(best_val_acc[1]) + ".")
    log.write("Final saved model is epoch " + str(best_val_acc[0]) + ", acc: " + str(best_val_acc[1]) + "\n")
    print("Done.")
    log.write("Done.\n")
def create_datasets(self):
    """
    Load data, build vocabulary and create Iterator objects for train, validation and test data.

    Returns:
        - train_iter : Iterator object for train batches of size self.train_batch_size to iterate over.
        - val_iter : Iterator object for val batches of size self.val_batch_size to iterate over.
        - test_iter : Iterator object for test batches of size self.test_batch_size to iterate over.
    """
    if self.seed:
        random.seed(14)

    # Create fields
    tokenizer = lambda x: x.split()
    ID = data.Field()
    TEXT = data.Field(tokenize=tokenizer, init_token='<bos>', eos_token='<eos>', lower=True)
    TARGET = data.LabelField(dtype=torch.float)
    train_fields = [('id', None), ('text', TEXT), ('target', TARGET)]

    # Data
    train_data = data.TabularDataset(path=self.path, format='csv',
                                     skip_header=True, fields=train_fields)

    # Split
    train, val, test = train_data.split(split_ratio=[0.6, 0.2, 0.2],
                                        random_state=random.getstate())

    # Vocab
    if self.use_embedding:
        TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300), min_freq=5)
        self.embedding_matrix = TEXT.vocab.vectors
    else:
        TEXT.build_vocab(train_data, min_freq=5)
    TARGET.build_vocab(train_data)

    # Iterators
    train_iter = data.BucketIterator(
        train,
        sort_key=lambda x: len(x.text),    # sort sequences by length (dynamic padding)
        batch_size=self.train_batch_size,  # batch size
        device=self.device                 # select device (e.g. CPU)
    )
    val_iter = data.BucketIterator(val,
                                   sort_key=lambda x: len(x.text),
                                   batch_size=self.val_batch_size,
                                   device=self.device)
    test_iter = data.Iterator(test,
                              batch_size=self.test_batch_size,
                              device=self.device,
                              train=False,
                              sort=False,
                              sort_within_batch=False)

    return train_iter, val_iter, test_iter
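# A minimal sketch of consuming the returned iterators, assuming `dataset_wrapper`
# is an already-constructed instance of the class that owns create_datasets above.
# The wrapper name is hypothetical; the attribute names follow the fields defined above.
train_iter, val_iter, test_iter = dataset_wrapper.create_datasets()
for batch in train_iter:
    text = batch.text        # (seq_len, batch) tensor of token indices
    target = batch.target    # (batch,) float tensor of labels
    break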
train.drop_duplicates(subset="text", inplace = True) SEED = 1234 torch.manual_seed(SEED) TEXT = data.Field(tokenize = 'spacy', batch_first=True, include_lengths=True) LABEL = data.LabelField(dtype = torch.float, batch_first=True) train.to_csv("train_formatted.csv", index=False) fields = [('text',TEXT), ('label',LABEL)] train = data.TabularDataset( path = 'train_formatted.csv', format = 'csv', fields = fields, skip_header =True ) import random train, valid = train.split(split_ratio=0.9, random_state = random.seed(SEED)) TEXT.build_vocab(train, vectors ="glove.6B.100d") LABEL.build_vocab(train) BATCH_SIZE = 64 train_iterator, valid_iterator = data.BucketIterator.splits(
def loader(self): tokenize = lambda x: self.lemmatizer.lemmatize( re.sub(r'<.*?>|[^\w\s]|\d+', '', x)).split() TEXT = data.Field(sequential=True, tokenize=tokenize, include_lengths=True, batch_first=True, dtype=torch.long) PRONOUN = data.Field(sequential=False, batch_first=True) P_OFFSET = data.Field(sequential=False, batch_first=True) A = data.Field(sequential=False, batch_first=True) B = data.Field(sequential=False, batch_first=True) A_OFFSET = data.Field(sequential=False, batch_first=True) B_OFFSET = data.Field(sequential=False, batch_first=True) A_COREF = data.Field(sequential=False, batch_first=True) B_COREF = data.Field(sequential=False, batch_first=True) NE_LABEL = data.LabelField( batch_first=True, sequential=False) #tokenize is removed since default is none input_fields = [('ID', None), ('Text', TEXT), ('Pronoun', PRONOUN), ('Pronoun_off', P_OFFSET), ('A', A), ('A_off', A_OFFSET), ('A_coref', A_COREF), ('B', A), ('B_off', B_OFFSET), ('B_coref', B_COREF), ('URL', None)] train = data.TabularDataset(path=self.train_path, format='tsv', fields=input_fields, skip_header=True) valid = data.TabularDataset(path=self.valid_path, format='tsv', fields=input_fields, skip_header=True) test = data.TabularDataset(path=self.test_path, format='tsv', fields=input_fields, skip_header=True) ##MAP WORDS & FIGURE OUT THE MAX SIZE FOR BUILDING VOCAB TEXT.build_vocab(train, max_size=30000, vectors=GloVe(name='6B', dim=300)) # Glove Embedding PRONOUN.build_vocab(train) # NE emb list_of_A = [x for x in train.A] list_of_B = [x for x in train.B] AB_concat = list_of_A + list_of_B NE_LABEL.build_vocab(AB_concat) word_emb = TEXT.vocab.vectors #pro_emb = PRONOUN.vocab.vectors #NE_emb = NE_LABEL.vocab.vectors vocab_size = len(TEXT.vocab) # if want to use bucket iterator (batching) train_data, valid_data, test_data = data.BucketIterator.splits( (train, valid, test), batch_size=self.batch_size, repeat=False, shuffle=True) print("Length of Text Vocabulary: " + str(len(TEXT.vocab))) print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size()) print("NE Length: " + str(len(NE_LABEL.vocab))) print( "\nSize of train set: {} \nSize of validation set: {} \nSize of test set: {}" .format(len(train_data.dataset), len(valid_data.dataset), len(test_data.dataset))) return TEXT, PRONOUN, NE_LABEL, word_emb, train_data, valid_data, test_data, train, valid, test
LABEL_snli.build_vocab(train_snli, max_size=10000)
print('vocab length (including special tokens):', len(TEXT_snli.vocab))

# make iterators
train_iter_snli, val_iter_snli, test_iter_snli = data.BucketIterator.splits(
    (train_snli, val_snli, test_snli), batch_size=args.batch_size, repeat=False)
# ============================ SNLI ============================ #

# ============================ Multi30K ============================ #
TEXT_m30k = data.Field(pad_first=True, lower=True)

m30k_data = data.TabularDataset(path='./.data/multi30k/train.txt',
                                format='csv', fields=[('text', TEXT_m30k)])

TEXT_m30k.build_vocab(train_sst.text, max_size=10000)
print('vocab length (including special tokens):', len(TEXT_m30k.vocab))

train_iter_m30k = data.BucketIterator(m30k_data, batch_size=args.batch_size, repeat=False)
# ============================ Multi30K ============================ #

# ============================ WMT16 ============================ #
TEXT_wmt16 = data.Field(pad_first=True, lower=True)

wmt16_data = data.TabularDataset(path='./.data/wmt16/wmt16_sentences',
                                 format='csv',
logger.info(labels)

TEXT = ReversibleField(sequential=True, include_lengths=True, lower=False)

label_size = 42  # 18 if args.dataset != "multi_top_snomed_no_des" else 42
LABEL = MultiLabelField(sequential=True, use_vocab=False, label_size=label_size,
                        tensor_type=torch.FloatTensor)

# load in adobe
if args.abbr:
    adobe_test = data.TabularDataset(
        path='../../data/csu/adobe_abbr_matched_snomed_multi_label_no_des_test.tsv',
        format='tsv',
        fields=[('Text', TEXT), ('Description', LABEL)])
else:
    adobe_test = data.TabularDataset(
        path='../../data/csu/adobe_snomed_multi_label_no_des_test.tsv',
        format='tsv',
        fields=[('Text', TEXT), ('Description', LABEL)])

if args.dataset == 'multi_top_snomed_no_des':
    train, val, test = data.TabularDataset.splits(
        path='../../data/csu/',
        train='snomed_multi_label_no_des_train.tsv',
        validation='snomed_multi_label_no_des_valid.tsv',
        test='snomed_multi_label_no_des_test.tsv',
        format='tsv',
PATH = '/media/ubuntu/1TO/DTU/courses/DeepLearning/DeepLearning_summarization/saved_network'
glove_dim = 50
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# TODO: How do we enable sort in dataloader?

#%%
"""
Data loader part
"""
TEXT = data.Field(init_token='<bos>', eos_token='<eos>', sequential=True)
LABEL = data.Field(init_token='<bos>', eos_token='<eos>', sequential=True)

train_set = data.TabularDataset(path, 'CSV',
                                fields=[('data', TEXT), ('label', LABEL)],
                                skip_header=True)
validation_set = data.TabularDataset(path_val, 'CSV',
                                     fields=[('data', TEXT), ('label', LABEL)],
                                     skip_header=True)

TEXT.build_vocab(train_set, max_size=vocab_size,
                 vectors="glove.6B." + str(glove_dim) + "d")
LABEL.vocab = TEXT.vocab
vocab = TEXT.vocab

# GloVe embedding function
embed = torch.nn.Embedding(len(vocab), glove_dim)
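# The snippet above creates `embed` but stops before loading weights into it.
# A minimal sketch of the usual next step: copy the GloVe vectors stored in
# TEXT.vocab into the embedding table and look up one batch. This is a common
# pattern, not necessarily what the original file does afterwards.
embed.weight.data.copy_(vocab.vectors)   # fill the table with the GloVe vectors

train_iter = data.BucketIterator(train_set, batch_size=32,
                                 sort_key=lambda x: len(x.data))
batch = next(iter(train_iter))
embedded = embed(batch.data)             # (seq_len, batch, glove_dim)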
def subjective_bot(): # these are for debugging # game_name_list = ['Counter-Strike Global Offensive', 'Transformice', 'Dead Island Epidemic', 'Dota 2', 'Team Fortress 2', 'War Thunder', "Garry's Mod", 'Injustice Gods Among Us Ultimate Edition', 'Loadout', 'Geometry Dash'] # hour_list = [6.0, 3.0, 2.0, 820.0, 250.0, 50.0, 36.0, 25.0, 14.0, 13.0] # # SpeedRunners # game_name_list = ['Dota 2','Warframe','The Elder Scrolls V Skyrim','DayZ','DARK SOULS II','Trove','Fallout 4','Starbound','Endless Legend','Warhammer 40,000 Dawn of War II'] # hour_list = [600.0, 300.0, 200.0, 820.0, 250.0, 500.0, 360.0, 250.0, 54.0, 130.0] # # Endless Space # game_name_list = ['Dota 2' ,'Counter-Strike Global Offensive' ,'Warhammer 40,000 Dawn of War II - Chaos Rising' ,"NOBUNAGA'S AMBITION Sphere of Influence",'Endless Space','Shadowrun Hong Kong' ,'The Dark Eye Chains of Satinav','Demonicon' ,"Shadowrun Dragonfall - Director's Cut",'Total War SHOGUN 2' ] # hour_list = [100,100,100,100,5,20,20,5,5,10] # # new: The Elder Scrolls V Skyrim # game_name_list= ['Dota 2','Dota 2','Dota 2','Dota 2','Dota 2','Dota 2','Dota 2','Dota 2','Dota 2','Dota 2'] # hour_list = [100, 500, 500, 500, 700, 200, 200, 500, 500, 10] newplayer = False model = torch.load('shuffledmodel_0.52.pt') # model.cuda() print('Hello There! Welcome to Check This Out!') print('Loading Essential Tools...') TEXT = data.Field(sequential=True, include_lengths=True, tokenize='spacy') LABEL = data.Field(sequential=False, use_vocab=False) abstract_data = data.TabularDataset(path='./data/abstract_tsv.tsv', skip_header=True, format='tsv', fields=[('text', TEXT), ('label', LABEL)]) TEXT.build_vocab(abstract_data) Vocab = TEXT.vocab TEXTn = data.Field(sequential=True, include_lengths=True, tokenize='spacy') LABELn = data.Field(sequential=False, use_vocab=False) abstract_datan = data.TabularDataset(path='./data/full_abstract_tsv.tsv', skip_header=True, format='tsv', fields=[('text', TEXTn), ('label', LABELn)]) TEXTn.build_vocab(abstract_datan) Vocabfull = TEXTn.vocab glove = torchtext.vocab.GloVe(name='6B', dim=100) Vocabfull.load_vectors(glove) embeds = nn.Embedding.from_pretrained(Vocabfull.vectors) abstract_dictionary = convert_csv_to_dict('./data/abstracts_final.csv') game_name_list = [] hour_list = [] print('Complete!\n') for i in range(10): # this is for entering the name of games name_true = 0 while name_true != 1: name = input('Please enter NAME of game #{}:'.format(i + 1)) if name not in abstract_dictionary.keys(): print('Sorry! The game is not recognized, please try again!') else: name_true = 1 game_name_list.append(name) # this is for entering the number of hour hour_true = 0 while hour_true == 0: hour = input('Enter in HOURS, how much you have played this game:') try: float(hour) hour_list.append(float(hour)) hour_true = 1 print('\n') except: print('Sorry! The input is not valid, please try again!') while not newplayer: newgamelist = game_name_list[:] newhours = hour_list[:] name_true = 0 print('\n') while name_true != 1: name = input('Please enter NAME of the NEW GAME:') # if name == "newplayer!": # newplayer = True # break if name not in abstract_dictionary.keys(): print('Sorry! 
The game is not recognized, please try again!') else: name_true = 1 newgamelist.append(name) newhours.append(0) #==========================================================# # print('\n') print('Let us think about it!') temp_input = [] for i in range(11): temp = [newgamelist[i], newhours[i]] temp_input.append(temp) net_cnn = torch.load('cnn_model_epoch0.pkl') abstract_list_cnn, hour_list_cnn, label_cnn = convert_data_cnn( temp_input, Vocabfull, embeds, abstract_dictionary) prediction_cnn = net_cnn.forward(abstract_list_cnn, hour_list_cnn) prediction_cnn = prediction_cnn.detach().numpy() max_cnn = prediction_cnn.argmax() results = ['%.3f' % elem for elem in prediction_cnn.tolist()] print('CNN:') print(results) # print( # 'the prediction of the cnn model is:' + str(prediction_cnn[0]) + ', ' + str(prediction_cnn[1]) + ', ' + str( # prediction_cnn[2]) + ', ' + str(prediction_cnn[3])) if max_cnn == 0: print( 'I believe the player will be playing this new game for: 0 - 10 hours' ) if max_cnn == 1: print( 'I believe the player will be playing this new game for: 10 - 35 hours' ) if max_cnn == 2: print( 'I believe the player will be playing this new game for: 35 - 85 hours' ) if max_cnn == 3: print( 'I believe the player will be playing this new game for: above 85 hours' ) #==========================================================# intomodel = [] for i in range(11): intomodel.append(newgamelist[i]) if newhours[i] < 10: intomodel.extend([1, 0, 0, 0]) elif newhours[i] < 35: intomodel.extend([0, 1, 0, 0]) elif newhours[i] < 85: intomodel.extend([0, 0, 1, 0]) else: intomodel.extend([0, 0, 0, 1]) intomodel = intomodel[:-4] nameindex = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50] absfeatures = [] for l in nameindex: absfeatures.append( sentence_preprocess_rnn(abstract_dictionary[intomodel[l]], Vocab).cuda()) for k in range(50): if k not in nameindex: absfeatures.append(torch.tensor(float(intomodel[k])).cuda()) predict = model(absfeatures) # print(predict) results_rnn = ['%.3f' % elem for elem in predict.detach().tolist()] predict = predict.argmax() #===============================================================# print('RNN:') print(results_rnn) if predict == 0: print( "Got it! I think you will play this game for less than 10 hours!" ) elif predict == 1: print( "Got it! I think you will play this game for 10 to 35 hours!") elif predict == 2: print( "Got it! I think you will play this game for 35 to 85 hours!") else: print( "Got it! I think you will play this game for more than 85 hours!" )
TEXT = data.Field(include_lengths=True, tokenize='spacy')
LABEL = data.LabelField()
OTHER = data.RawField()
OTHER.is_target = False

devset_fields = {"sentence": ("sentence", TEXT),
                 "claim": ("claim", TEXT),
                 "org_sentence": ("org_sentence", OTHER),
                 "docid_claimid_sentno": ("docid_claimid_sentno", OTHER)}

with open("/content/gdrive/My Drive/TEXT_VOCAB_5EPOCH", "rb") as f:
    TEST_TEXT = dill.load(f)
    print("Text Load Successful")

with open("/content/gdrive/My Drive/LABEL_VOCAB_5EPOCH", "rb") as f:
    TEST_LABEL = dill.load(f)
    print("Label Load Successful")

devset = data.TabularDataset(dev_path, format="CSV", fields=devset_fields, skip_header=False)
print(len(devset))
print(vars(devset.examples[0]))

TEXT.build_vocab(devset)
LABEL.build_vocab(devset)

# Overwrite the freshly built vocabs with the ones saved at training time,
# so token and label indices match the trained model.
TEXT.vocab = TEST_TEXT.vocab
TEXT.vocab.itos = TEST_TEXT.vocab.itos
TEXT.vocab.stoi = TEST_TEXT.vocab.stoi
LABEL.vocab = TEST_LABEL.vocab
LABEL.vocab.itos = TEST_LABEL.vocab.itos
LABEL.vocab.stoi = TEST_LABEL.vocab.stoi
def __init__(self, path='data', glove_p='glove', train_file='train.csv', valid_file='valid.csv', test_file='test.csv', vocab_file=None, batch_size=32, embed_dim=100, max_vocab_size=None, min_freq=1, max_seq_len=None, gpu=False, use_fasttext=False, padded=False): self.batch_size = batch_size self.device = 0 if gpu else -1 self.sort_key = lambda x: len(x.context) #print (self.sort_key) if not padded: self.TEXT = data.Field(lower=True, pad_token='__pad__', unk_token='<UNK>', batch_first=True, tokenize=clean_str) else: self.TEXT = data.Field(lower=True, include_lengths=True, fix_length=max_seq_len, unk_token='<UNK>', batch_first=True, tokenize=clean_str) self.LABEL = data.Field(sequential=False, tensor_type=torch.FloatTensor, unk_token=None, batch_first=True) file_format = train_file[-3:] # Only take data with max length 160 # f = lambda ex: len(ex.context) <= max_seq_len and len(ex.response) f = None self.train = data.TabularDataset(path='{}/{}'.format(path, train_file), format=file_format, skip_header=True, fields=[('context', self.TEXT), ('response', self.TEXT), ('label', self.LABEL)], filter_pred=f) self.valid, self.test = data.TabularDataset.splits( path=path, validation=valid_file, test=test_file, format=file_format, skip_header=True, fields=[('context', self.TEXT), ('positive', self.TEXT), ('negative_1', self.TEXT), ('negative_2', self.TEXT), ('negative_3', self.TEXT), ('negative_4', self.TEXT), ('negative_5', self.TEXT), ('negative_6', self.TEXT), ('negative_7', self.TEXT), ('negative_8', self.TEXT), ('negative_9', self.TEXT)]) if vocab_file is None: if use_fasttext: print("building vocabulary") # self.TEXT.build_vocab( # self.train, max_size=max_vocab_size, min_freq=3, # vectors="fasttext.en.300d" # ) self.TEXT.build_vocab(self.train, max_size=max_vocab_size, min_freq=5, vectors="fasttext.en.300d") else: self.TEXT.build_vocab(self.train, max_size=max_vocab_size, min_freq=min_freq, vectors=GloVe('6B', dim=embed_dim)) vocab = self.TEXT.vocab self.TEXT.build_vocab(self.train, max_size=max_vocab_size, min_freq=min_freq, vectors=GloVe('840B', dim=embed_dim)) else: specials = list( OrderedDict.fromkeys(tok for tok in [ self.TEXT.unk_token, self.TEXT.pad_token, self.TEXT.init_token, self.TEXT.eos_token ] if tok is not None)) with open(f'{path}/{vocab_file}', 'r') as f: counter = Counter(f.read().split('\n')) if use_fasttext: print("Using fasttext") vocab = Vocab(counter, specials=specials, vectors="fasttext.en.300d") else: vocab = Vocab(counter, specials=specials, vectors=GloVe('6B', dim=embed_dim)) self.TEXT.vocab = vocab self.LABEL.build_vocab(self.train) print(vocab.stoi['__pad__']) print(vocab.itos[25], vocab.itos[32]) self.dataset_size = len(self.train.examples) self.vocab_size = len(self.TEXT.vocab.itos) self.embed_dim = embed_dim #self.vectors = self.load_glove_embeddings(glove_p+'/glove.6B.50d.txt', self.TEXT.vocab.stoi) self.vectors = self.TEXT.vocab.vectors
def main():
    opt = parse_args()

    src_field = data.Field()
    label_field = data.Field(pad_token=None, unk_token=None)
    train = data.TabularDataset(path=opt.train_path, format='tsv',
                                fields=[('text', src_field), ('label', label_field)])
    test = data.TabularDataset(path=opt.test_path, format='tsv',
                               fields=[('text', src_field), ('label', label_field)])

    src_field.build_vocab(train, max_size=100000, min_freq=2, vectors="glove.6B.300d")
    label_field.build_vocab(train)
    print("Training size: {0}, Testing size: {1}".format(len(train), len(test)))

    classifier = LSTMClassifier(300, 512, len(label_field.vocab), src_field.vocab.vectors)
    if torch.cuda.is_available():
        classifier.cuda()

    train_iter = data.BucketIterator(dataset=train, batch_size=opt.batch_size,
                                     device=device, repeat=False)
    test_iter = data.BucketIterator(dataset=test, batch_size=5,
                                    device=device, repeat=False)

    for param in classifier.parameters():
        param.data.uniform_(-0.08, 0.08)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(classifier.parameters())

    step = 0
    for epoch in range(15):
        test_acc = 0
        for batch in test_iter:
            test_acc += evaluate(classifier, batch)
        print('Test accuracy: {0}'.format(test_acc / len(test)))

        running_loss = 0.0
        for batch in train_iter:
            optimizer.zero_grad()
            pred = classifier(batch.text)
            loss = criterion(pred, batch.label.view(-1))
            running_loss += loss.item()  # loss.data[0] is deprecated since PyTorch 0.4
            loss.backward()
            optimizer.step()
            step += 1
            if step % opt.log_every == 0:
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, step + 1, running_loss / opt.log_every))
                running_loss = 0.0

        torch.save(classifier, os.path.join("model_{0}".format(epoch + 1)))
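# `evaluate` is called above but not defined in this snippet. A plausible
# minimal sketch, assuming it returns the number of correctly classified
# examples in a batch (so dividing by len(test) gives accuracy); the original
# helper may differ.
def evaluate(classifier, batch):
    with torch.no_grad():
        pred = classifier(batch.text)                       # (batch, num_classes)
        correct = (pred.argmax(dim=1) == batch.label.view(-1)).sum().item()
    return correct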
def make_vocab(self, args): args.path = args.datadir + args.data self.INPUT = data.Field(sequential=True, batch_first=True, init_token="<start>", eos_token="<eos>", include_lengths=True) # Title self.OUTPUT = data.Field( sequential=True, batch_first=True, init_token="<start>", eos_token="<eos>", include_lengths=True) # Gold Abstract, preprocessed self.TARGET = data.Field(sequential=True, batch_first=True, init_token="<start>", eos_token="<eos>") self.ENT_TYPE = data.Field(sequential=True, batch_first=True, eos_token="<eos>") # Entity Type self.ENT = data.RawField() # Entity self.REL = data.RawField() # Relation between entities self.REL.is_target = False self.ENT.is_target = False self.fields = [("title", self.INPUT), ("ent", self.ENT), ("nerd", self.ENT_TYPE), ("rel", self.REL), ("out", self.OUTPUT)] train = data.TabularDataset(path=args.path, format='tsv', fields=self.fields) print('Building Vocab... ', end='') # Output Vocab # mapping from generics to indices are at the last of the vocab # also includes indexed generics, (e.g. <method_0>) but in mixed order self.OUTPUT.build_vocab(train, min_freq=args.outunk) generics = [ '<method>', '<material>', '<otherscientificterm>', '<metric>', '<task>' ] # Entity Types self.OUTPUT.vocab.itos.extend(generics) for generic in generics: self.OUTPUT.vocab.stoi[generic] = self.OUTPUT.vocab.itos.index( generic) # Target Vocab # Same as Output Vocab, except for the indexed generics' indices # len(vocab) = 11738 / <method_0>, <material_0> ... : 11738, <method_1>, ... : 11739 and so on. self.TARGET.vocab = copy(self.OUTPUT.vocab) entity_types = [ 'method', 'material', 'otherscientificterm', 'metric', 'task' ] for entity_type in entity_types: for idx in range(40): s = "<" + entity_type + "_" + str(idx) + ">" self.TARGET.vocab.stoi[s] = len(self.TARGET.vocab.itos) + idx # Entity Type Vocab # Indices for not-indexed generics are same with those of output vocab self.ENT_TYPE.build_vocab(train, min_freq=0) for x in generics: self.ENT_TYPE.vocab.stoi[x] = self.OUTPUT.vocab.stoi[x] # Title Vocab self.INPUT.build_vocab(train, min_freq=args.entunk) # Relation Vocab # Adds relations.vocab + inverse of relations.vocab self.REL.special = ['<pad>', '<unk>', 'ROOT'] with open(args.datadir + "/" + args.relvocab) as f: rel_vocab = [x.strip() for x in f.readlines()] self.REL.size = len(rel_vocab) rel_vocab += [x + "_inv" for x in rel_vocab] rel_vocab += self.REL.special self.REL.itos = rel_vocab self.ENT.itos, self.ENT.stoi = self.build_ent_vocab(args.path) print('Done') if not self.args.eval: self.make_iterator(train)