device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #################################### # Hyper-parameters # #################################### BATCH_SIZE = 64 LEARNING_RATE = 1e-3 #################################### # Preparing Data # #################################### # 1. data.Field() TEXT = data.Field(tokenize='spacy', include_lengths=True) LABELS = data.LabelField() # 2. data.TabularDataset train_data, test_data = data.TabularDataset.splits(path=dataset_path, train="train.tsv", test="test.tsv", fields=[('labels', LABELS), ('text', TEXT)], format="tsv") # train_data, test_data = datasets.IMDB.splits(TEXT, LABELS) print("Number of train_data = {}".format(len(train_data))) print("Number of test_data = {}".format(len(test_data))) print("vars(train_data[0]) = {}\n".format(vars(train_data[0])))
# convert neutral, positive and negative to numeric # sentiment_map = {'neutral': 0, 'positive': 1, 'negative': -1} # final_df['airline_sentiment'] = final_df['airline_sentiment'].map(sentiment_map) # split into train, test, val (.7, .15, .15) train_df, testval_df = train_test_split(final_df, test_size=0.3) test_df, val_df = train_test_split(testval_df, test_size=0.5) # convert df back to csv, with column names train_df.to_csv(data_dir + '/train.csv', index=False) test_df.to_csv(data_dir + '/test.csv', index=False) val_df.to_csv(data_dir + '/val.csv', index=False) # load into torchtext ID = data.Field() TEXT = data.Field(tokenize='spacy') SENTIMENT = data.LabelField(dtype=torch.float) AIRLINE = data.Field() # access using batch.id, batch.text etc fields = [('id', ID), ('text', TEXT), ('airline', AIRLINE), ('label', SENTIMENT)] train_data, valid_data, test_data = data.TabularDataset.splits( path=data_dir, train='train.csv', validation='val.csv', test='test.csv', format='csv', fields=fields, skip_header=True) # build iterators MAX_VOCAB_SIZE = 10_000
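# A minimal continuation sketch (not from the original) of the iterator-building
# step the comment above anticipates: build the vocabularies, then create
# BucketIterators that batch similar-length tweets together. The GloVe vector
# name and batch size here are assumptions.
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE, vectors='glove.6B.100d')
SENTIMENT.build_vocab(train_data)
AIRLINE.build_vocab(train_data)
ID.build_vocab(train_data)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=64,
    sort_key=lambda x: len(x.text),
    sort_within_batch=False,
    device=device)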
import time
import random
import sys

import pandas as pd
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext import data, datasets

SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

print('begin to load dataset')
TEXT = data.Field(tokenize='spacy', include_lengths=True)
LABEL = data.LabelField(dtype=torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
train_data, valid_data = train_data.split(random_state=random.getstate())

MAX_VOCAB_SIZE = 25_000

print('building vocab')
TEXT.build_vocab(train_data,
                 max_size=MAX_VOCAB_SIZE,
                 vectors="glove.6B.100d",
                 unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

BATCH_SIZE = 64
return train_iter, dev_iter, test_iter if __name__ == "__main__": data_dir = "/home/songyingxin/datasets/SST-2" CHAR_NESTING = data.Field(batch_first=True, tokenize=list, lower=True) char_field = data.NestedField(CHAR_NESTING, tokenize='spacy') word_field = data.Field(tokenize='spacy', lower=True, include_lengths=True, fix_length=100) label_field = data.LabelField(dtype=torch.long) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") word_emb_file = "/home/songyingxin/datasets/WordEmbedding/glove/glove.840B.300d.txt" char_emb_file = "/home/songyingxin/datasets/WordEmbedding/glove/glove.840B.300d-char.txt" train_iter, dev_iter, test_iter = sst_word_char( data_dir, word_field, char_field, label_field, 32, device, word_emb_file, char_emb_file) for batch in train_iter: print(batch)
def load_dataset(test_sen=None):
    """
    tokenizer : Breaks sentences into a list of words. If sequential=False,
        no tokenization is applied.
    Field : A class that stores information about the way of preprocessing.
    fix_length : An important property of TorchText is that we can let the
        input be variable length, and TorchText will dynamically pad each
        sequence to the longest sequence in that "batch". But here we are
        using fix_length, which will pad each sequence to a fixed length.
    build_vocab : It will first make a vocabulary (dictionary) mapping all
        the unique words present in the train_data to an index, and then use
        the GloVe word embedding to map each index to the corresponding word
        embedding.
    vocab.vectors : This returns a torch tensor of shape
        (vocab_size x embedding_dim) containing the pre-trained word
        embeddings.
    BucketIterator : Defines an iterator that batches examples of similar
        lengths together to minimize the amount of padding needed.
    """
    # TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200)
    LABEL = data.LabelField(dtype=torch.float)
    INDEX = data.Field(dtype=torch.long)
    TEXT = data.Field(sequential=True,
                      fix_length=20000,
                      tokenize=tokenizer,
                      pad_first=True,
                      dtype=torch.long,
                      lower=True,
                      batch_first=True)
    train_data, test_data = data.TabularDataset.splits(
        path='.',
        format='csv',
        skip_header=True,
        train='blogs_training.csv',
        validation='blogs_testing.csv',
        fields=[('index', None), ('text', TEXT), ('fileIndex', None),
                ('label', LABEL), ('age', None), ('industry', None),
                ('hscope', None)])
    # train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
    TEXT.build_vocab(train_data, vectors=GloVe(name='twitter.27B', dim=100))
    LABEL.build_vocab(train_data)
    pickle.dump(TEXT, open("TEXT.pickle", "wb"))

    word_embeddings = TEXT.vocab.vectors
    print("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    # Further splitting of training_data to create new training_data & validation_data
    train_data, valid_data = train_data.split()
    train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=32,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True)
    '''Alternatively we can also use the default configurations'''
    # train_iter, test_iter = datasets.IMDB.iters(batch_size=32)

    vocab_size = len(TEXT.vocab)
    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter
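# A small usage sketch (assumption, not part of the original): the
# word_embeddings tensor returned above can seed an nn.Embedding layer so a
# model starts from the pre-trained GloVe vectors described in the docstring.
import torch.nn as nn

TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_dataset()
embedding = nn.Embedding(vocab_size, word_embeddings.size(1))
embedding.weight.data.copy_(word_embeddings)  # (vocab_size x embedding_dim)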
    'ninp': emsize,
    'nhid': nhid,
    'nlayers': nlayers,
    'dropout': dropout,
    'tie_weights': tied,
}

# TODO:
# Rewrite this entire thing: https://github.com/pytorch/text/issues/664

# We have to use the same numericalization as in the example before.
TEXT = data.Field(sequential=True,
                  include_lengths=True,
                  use_vocab=True,
                  tokenize=lambda x: tokenizer.encode(x).tokens)
LABELS = data.LabelField(dtype=torch.float, is_target=True)
NAMES = data.RawField(is_target=False)

# Fields are added by column, left to right, in the underlying table
fields = [('name', NAMES), ('label', LABELS), ('text', TEXT)]

train, dev, test = data.TabularDataset.splits(
    path='/Users/phi/Dropbox/projects/picotext/journal/2020-05-23T1315/tmp/processed',
    format='CSV',
    fields=fields,
    train='train.csv',
    validation='dev.csv',
    test='test.csv')

TEXT.build_vocab()  # We'll fill this w/ the tokenizer
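# A hedged sketch of what "fill this w/ the tokenizer" could look like: copy
# the subword vocabulary from the (Hugging Face) tokenizers object into the
# empty torchtext vocab so both sides numericalize identically. get_vocab()
# is the tokenizers-library accessor; treating the rest as an assumption, not
# the original author's code.
stoi = tokenizer.get_vocab()  # token -> id
itos = [tok for tok, _ in sorted(stoi.items(), key=lambda kv: kv[1])]
TEXT.vocab.stoi = stoi
TEXT.vocab.itos = itos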
def predict(csv1, csv2): train = csv1 test = csv2 #encoding='gb18030' #print(train.shape) print('Now loading and predicting........') train_df, valid_df = train_test_split(train) import spacy spacy_en = spacy.load("en_ner_bionlp13cg_md") def tokenizer(text): # create a tokenizer function return [tok.text for tok in spacy_en.tokenizer(text)] TEXT = data.Field(tokenize=tokenizer, include_lengths=True) LABEL = data.LabelField(dtype=torch.float) class DataFrameDataset(data.Dataset): def __init__(self, df, fields, is_test=False, **kwargs): examples = [] for i, row in df.iterrows(): label = row.Label if not is_test else None text = row.TEXT examples.append(data.Example.fromlist([text, label], fields)) super().__init__(examples, fields, **kwargs) @staticmethod def sort_key(ex): return len(ex.text) @classmethod def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs): train_data, val_data, test_data = (None, None, None) data_field = fields if train_df is not None: #print('do train') train_data = cls(train_df.copy(), data_field, **kwargs) if val_df is not None: #print('do valid') val_data = cls(val_df.copy(), data_field, **kwargs) if test_df is not None: #print('do test') test_data = cls(test_df.copy(), data_field, **kwargs) return tuple(d for d in (train_data, val_data, test_data) if d is not None) fields = [('text', TEXT), ('label', LABEL)] train_ds, val_ds, test_ds = DataFrameDataset.splits(fields, train_df=train_df, val_df=valid_df, test_df=test) MAX_VOCAB_SIZE = 10000 TEXT.build_vocab(train_ds, max_size=MAX_VOCAB_SIZE, vectors='glove.6B.50d', unk_init=torch.Tensor.zero_) LABEL.build_vocab(train_ds) BATCH_SIZE = 64 * 2 device = 'cpu' train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits( (train_ds, val_ds, test_ds), batch_size=BATCH_SIZE, sort_within_batch=True, device=device) INPUT_DIM = len(TEXT.vocab) EMBEDDING_DIM = 50 HIDDEN_DIM = 50 OUTPUT_DIM = 1 N_LAYERS = 2 BIDIRECTIONAL = True DROPOUT = 0.1 PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] # padding class LSTM_net(nn.Module): def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx): super().__init__() self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx) self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout) self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim) self.fc2 = nn.Linear(hidden_dim, 1) def forward(self, text, text_lengths): embedded = self.embedding(text) packed_embedded = nn.utils.rnn.pack_padded_sequence( embedded, text_lengths) packed_output, (hidden, cell) = self.rnn(packed_embedded) hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1) #hidden=hidden[-1,:,:] output = self.fc1(hidden) output = self.fc2(output) return output from sklearn.metrics import roc_auc_score def binary_accuracy(preds, y): """ Returns accuracy per batch, i.e. 
        if you get 8/10 right, this returns 0.8, NOT 8
        """
        rounded_preds = (torch.sigmoid(preds) > 0.41).float()
        correct = (rounded_preds == y).float()  # convert into float for division
        acc = correct.sum() / len(correct)
        return acc, rounded_preds, torch.sigmoid(preds)

    def evaluate(model, iterator):
        epoch_acc = 0
        model.eval()
        pred_collect = torch.empty(0)
        y_collect = torch.empty(0)
        y_prob = torch.empty(0)
        with torch.no_grad():
            for batch in iterator:
                text, text_lengths = batch.text
                predictions = model(text, text_lengths).squeeze(1)
                acc, pred_y, prob = binary_accuracy(predictions, batch.label)
                epoch_acc = acc.item() + epoch_acc
                pred_collect = torch.cat([pred_collect, pred_y])
                y_collect = torch.cat([y_collect, batch.label])
                y_prob = torch.cat([y_prob, prob])
        try:
            auc = roc_auc_score(y_collect.cpu().data.numpy(),
                                pred_collect.cpu().data.numpy())
        except ValueError:  # roc_auc_score raises if only one class is present
            auc = 'UNAVAILABLE'
        return epoch_acc / len(iterator), auc, y_collect, y_prob, pred_collect

    model = LSTM_net(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
                     N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)
    model.load_state_dict(torch.load('LSTM_MODEL', map_location='cpu'))
    a, b, my_lab, my_prob, my_pred = evaluate(model, test_iterator)
    # back to label class
    return ('Yes, this patient might have readmission' if my_pred.data.numpy()
            else 'No, this patient might not have readmission')
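# A quick sanity check (illustration only, not from the original; assumes
# binary_accuracy is in scope) of the docstring's claim: with the 0.41 sigmoid
# cut-off, four of these five logits match their labels, so the batch
# accuracy comes out as 0.8, not 4.
preds = torch.tensor([3.0, -2.0, 1.0, -1.0, 2.0])
y = torch.tensor([1.0, 0.0, 1.0, 1.0, 1.0])
acc, rounded, probs = binary_accuracy(preds, y)
print(acc)  # tensor(0.8000)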
def test_model(test_data_dir):
    """ Use trained models to get the final prediction """
    pretrained_models = ['bert-base-uncased', 'xlnet-base-cased', 'roberta-base']

    # load testing data into pandas DataFrame
    with open(test_data_dir) as f:
        test_lines = [line.rstrip('\n')[line.rstrip('\n').find(',') + 1:] for line in f]
    test_df = pd.DataFrame(test_lines, columns=['text'])
    # the model input requires a label column; we won't actually use it
    test_df['label'] = 1

    for pretrained_model in pretrained_models:
        # load model
        if pretrained_model == 'bert-base-uncased':
            from transformers import BertForSequenceClassification as SequenceClassificationModel
            selected_epochs = bert_picks
        elif pretrained_model == 'xlnet-base-cased':
            from transformers import XLNetForSequenceClassification as SequenceClassificationModel
            selected_epochs = xlnet_picks
        elif pretrained_model == 'roberta-base':
            from transformers import RobertaForSequenceClassification as SequenceClassificationModel
            selected_epochs = roberta_picks
        config = AutoConfig.from_pretrained(pretrained_model)
        model = SequenceClassificationModel(config)

        # load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
        init_token_idx = tokenizer.cls_token_id
        eos_token_idx = tokenizer.sep_token_id
        pad_token_idx = tokenizer.pad_token_id
        unk_token_idx = tokenizer.unk_token_id
        max_input_length = tokenizer.max_model_input_sizes[pretrained_model]

        # the xlnet model has no max_model_input_sizes entry, but it actually
        # has a limit, so we set it manually
        if max_input_length is None:
            max_input_length = 512

        def tokenize_and_cut(sentence):
            """ Tokenize the sentence and cut it if it's too long """
            tokens = tokenizer.tokenize(sentence)
            # - 2 is for cls and sep tokens
            tokens = tokens[:max_input_length - 2]
            return tokens

        # Field handles the conversion to Tensor (tokenizing)
        TEXT = data.Field(batch_first=True,
                          use_vocab=False,
                          tokenize=tokenize_and_cut,
                          preprocessing=tokenizer.convert_tokens_to_ids,
                          init_token=init_token_idx,
                          eos_token=eos_token_idx,
                          pad_token=pad_token_idx,
                          unk_token=unk_token_idx)
        LABEL = data.LabelField(dtype=torch.long, use_vocab=False)

        # transform DataFrame into torchtext Dataset
        print('Transforming testing data for', pretrained_model, 'model')
        test_data = DataFrameDataset.splits(text_field=TEXT, label_field=LABEL, test_df=test_df)

        BATCH_SIZE = 32
        # get gpu if possible
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        test_iterator = data.Iterator(test_data,
                                      batch_size=BATCH_SIZE,
                                      device=device,
                                      shuffle=False,
                                      sort=False,
                                      train=False)

        for selected_epoch in selected_epochs:
            # load trained model
            model.load_state_dict(
                torch.load(os.path.join(
                    'models',
                    f'{pretrained_model}-e{selected_epoch:02}-model.pt'
                ), map_location=device))
            model = model.eval()

            # get predictions of test data
            print(f'Testing for {pretrained_model} epoch {selected_epoch}')
            predictions = test(model, test_iterator)

            # map predictions back to the original labels
            label_map = {0: -1, 1: 1}
            corrected_predictions = list(map(lambda x: label_map[x], predictions))

            # load data into dataframe
            submission = pd.read_csv('predictions_test/sample_submission.csv')
            submission.Prediction = corrected_predictions
            submission.to_csv(os.path.join('predictions_test',
                                           f'{pretrained_model}-e{selected_epoch:02}.csv'),
                              index=False)

    test_predictions('predictions_test')
def classify(tokenizerType):
    #Load dataset
    TEXT = data.Field(tokenize=tokenizerOptions[tokenizerType],
                      include_lengths=True,
                      lower=True)
    LABEL = data.LabelField(dtype=torch.float, sequential=False, use_vocab=False)
    fields = [('text', TEXT), ('label', LABEL)]
    train_data = data.TabularDataset(path='amazon_reviews.txt',
                                     format='tsv',
                                     fields=fields)

    #Split dataset into train, validation and test
    train_data, valid_data, test_data = train_data.split(
        split_ratio=[0.64, 0.2, 0.16], random_state=random.seed(SEED))

    #Build vocabulary using predefined vectors
    TEXT.build_vocab(train_data,
                     vectors="glove.6B.100d",
                     unk_init=torch.Tensor.normal_)
    LABEL.build_vocab(train_data)
    #print(TEXT.vocab.itos[:100])

    #Use GPU, if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    #Create iterators to get data in batches
    train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
        datasets=(train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        device=device,
        sort_key=lambda x: len(x.text),
        sort=False,
        sort_within_batch=True)

    INPUT_DIM = len(TEXT.vocab)
    EMBEDDING_DIM = 100
    HIDDEN_DIM = 256
    OUTPUT_DIM = 1

    model = LSTM(vocab_size=INPUT_DIM,
                 embedding_dim=EMBEDDING_DIM,
                 hidden_dim=HIDDEN_DIM,
                 output_dim=OUTPUT_DIM,
                 n_layers=3,
                 bidirectional=True,
                 dropout=0.5,
                 pad_idx=TEXT.vocab.stoi[TEXT.pad_token])

    #Replace initial weights of embedding with pre-trained embedding
    pretrained_embeddings = TEXT.vocab.vectors
    model.embedding.weight.data.copy_(pretrained_embeddings)

    #Set UNK and PAD embeddings to zero
    model.embedding.weight.data[TEXT.vocab.stoi[TEXT.unk_token]] = torch.zeros(EMBEDDING_DIM)
    model.embedding.weight.data[TEXT.vocab.stoi[TEXT.pad_token]] = torch.zeros(EMBEDDING_DIM)

    #Adam optimizer and binary cross-entropy loss
    optimizer = optim.Adam(model.parameters())
    criterion = nn.BCEWithLogitsLoss()

    #Transfer model and criterion to GPU
    model = model.to(device)
    criterion = criterion.to(device)

    best_valid_loss = float('inf')
    train_loss_list = []
    valid_loss_list = []
    for epoch in range(N_EPOCHS):
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'best-model.pt')
        train_loss_list.append(train_loss)
        valid_loss_list.append(valid_loss)

    print(tokenizerType + ":")
    plotLoss(train_loss_list, valid_loss_list)

    model.load_state_dict(torch.load('best-model.pt'))
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
    print("\n")
dev.to_csv(os.path.join(TEMP_DIRECTORY, DEV_FILE), header=True, sep='\t', index=False, encoding='utf-8')
test.to_csv(os.path.join(TEMP_DIRECTORY, TEST_FILE), header=True, sep='\t', index=False, encoding='utf-8')

id_variable = data.Field()
text_variable = data.Field(batch_first=True, tokenize=pipeline, fix_length=FIXED_LENGTH)
target_variable = data.LabelField(dtype=torch.float)

train_fields = [
    ('id', None),  # we don't need this, so no processing
    ('tweet', text_variable),  # process it as text
    ('subtask_a', None),  # raw label column, not processed
    ('encoded_subtask_a', target_variable)  # process it as label
]

dev_fields = [
    ('id', id_variable),  # we process this as an id field
    ('tweet', text_variable),  # process it as text
    ('subtask_a', None),  # raw label column, not processed
    ('encoded_subtask_a', None)
]
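# A hedged continuation sketch (not from the original): with the two field
# lists above, the TSVs written earlier can be wrapped as TabularDatasets.
# TRAIN_FILE is an assumed constant alongside DEV_FILE and TEST_FILE.
train_dataset = data.TabularDataset(path=os.path.join(TEMP_DIRECTORY, TRAIN_FILE),
                                    format='tsv', skip_header=True,
                                    fields=train_fields)
dev_dataset = data.TabularDataset(path=os.path.join(TEMP_DIRECTORY, DEV_FILE),
                                  format='tsv', skip_header=True,
                                  fields=dev_fields)
text_variable.build_vocab(train_dataset)
target_variable.build_vocab(train_dataset)
id_variable.build_vocab(dev_dataset)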
def train_text_model(userName, projectName, projectType, numEpochs=10):
    S3_BUCKET_OUTPUT = 'gauravp-eva4-capstone-models'

    # Find number of users
    s3 = boto3.client('s3',
                      aws_access_key_id='aws_access_key_id',
                      aws_secret_access_key='aws_secre')

    print('Delete model files corresponding to current session')
    savedTokenizerName = f'{userName}_{projectName}_{projectType}.pkl'
    print(f'Deleting {savedTokenizerName}')
    s3.delete_object(Bucket=S3_BUCKET_OUTPUT, Key=savedTokenizerName)
    savedModelName = f'{userName}_{projectName}_{projectType}.pt'
    print(f'Deleting {savedModelName}')
    s3.delete_object(Bucket=S3_BUCKET_OUTPUT, Key=savedModelName)
    model_info_file_name = f'{userName}_{projectName}_{projectType}.json'
    print(f'Deleting {model_info_file_name}')
    s3.delete_object(Bucket=S3_BUCKET_OUTPUT, Key=model_info_file_name)

    print('Preparing train val splits')
    datasetPath = f'./user_data/{userName}/{projectName}/{projectType}/train_data'
    texts = []
    labels = []
    for dirName in os.listdir(datasetPath):
        dirPath = os.path.join(datasetPath, dirName)
        print(dirPath)
        #print(resizedDirPath)
        count = 0
        for fileName in os.listdir(dirPath):
            filePath = os.path.join(dirPath, fileName)
            #print(filePath)
            labelName = filePath.split('/')[-2]
            print('className: ', labelName, filePath)
            with open(filePath, newline='') as f:
                reader = csv.reader(f)
                row = next(reader)
                print(row)
                texts.append(row[0])
                labels.append(labelName)
    print(len(texts))
    print(len(labels))

    # Defining Fields
    # We are using spacy as a tokenizer
    dataset_text = data.Field(sequential=True,
                              tokenize='spacy',
                              batch_first=True,
                              include_lengths=True)
    dataset_label = data.LabelField(is_target=True,
                                    batch_first=True,
                                    sequential=False)

    # Define names of dataset and its label
    fields = [('dataset_text', dataset_text), ('dataset_label', dataset_label)]

    # We will gather data into a list
    example = [
        data.Example.fromlist([texts[i], labels[i]], fields)
        for i in range(len(texts))
    ]

    # Define userDataset consisting of data from dataframe and fields defined by us
    userDataset = data.Dataset(example, fields)

    # split dataset into training and validation
    (train, valid) = userDataset.split(split_ratio=[0.70, 0.30])
    print((len(train), len(valid)))
    print(vars(train.examples[10]))

    # Build vocab for text data as well as text labels
    dataset_text.build_vocab(train)
    dataset_label.build_vocab(train)
    num_classes = len(dataset_label.vocab)
    print('Size of input vocab : ', len(dataset_text.vocab))
    print('Size of label vocab : ', len(dataset_label.vocab))
    print('Top 10 most frequent words :',
          list(dataset_text.vocab.freqs.most_common(10)))
    print('Labels : ', dataset_label.vocab.stoi)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_iterator, valid_iterator = data.BucketIterator.splits(
        (train, valid),
        batch_size=32,
        sort_key=lambda x: len(x.dataset_text),
        sort_within_batch=True,
        device=device)

    with open('tokenizer.pkl', 'wb') as tokens:
        pickle.dump(dataset_text.vocab.stoi, tokens)

    # Define hyperparameters
    size_of_vocab = len(dataset_text.vocab)
    embedding_dim = 300
    num_hidden_nodes = 100
    num_output_nodes = len(dataset_label.vocab)
    num_layers = 2
    dropout = 0.2

    # Instantiate the model
    model = classifier(size_of_vocab,
                       embedding_dim,
                       num_hidden_nodes,
                       num_output_nodes,
                       num_layers,
                       dropout=dropout)
    print(model)

    # No. of trainable parameters
    def count_parameters(model):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f'The model has {count_parameters(model):,} trainable parameters')

    import torch.optim as optim

    # define optimizer and loss
    optimizer = optim.Adam(model.parameters(), lr=2e-4)
    criterion = nn.CrossEntropyLoss()

    # define metric
    def binary_accuracy(preds, y):
        # take the class with the highest score as the prediction
        _, predictions = torch.max(preds, 1)
        correct = (predictions == y).float()
        acc = correct.sum() / len(correct)
        return acc

    # push to cuda if available
    model = model.to(device)
    criterion = criterion.to(device)

    # train loop
    def train(model, iterator, optimizer, criterion):
        # initialize every epoch
        epoch_loss = 0
        epoch_acc = 0

        # set the model in training phase
        model.train()
        for batch in iterator:
            # resets the gradients after every batch
            optimizer.zero_grad()

            # retrieve text and no. of words
            dataset_text, dataset_text_lengths = batch.dataset_text

            # convert to 1D tensor
            predictions = model(dataset_text, dataset_text_lengths).squeeze()

            # compute the loss
            loss = criterion(predictions, batch.dataset_label)

            # compute the accuracy
            acc = binary_accuracy(predictions, batch.dataset_label)

            # backpropagate the loss and compute the gradients
            loss.backward()

            # update the weights
            optimizer.step()

            # loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        return epoch_loss / len(iterator), epoch_acc / len(iterator)

    # evaluate loop
    def evaluate(model, iterator, criterion):
        # initialize every epoch
        epoch_loss = 0
        epoch_acc = 0

        # deactivating dropout layers
        model.eval()

        # deactivates autograd
        with torch.no_grad():
            for batch in iterator:
                # retrieve text and no. of words
                dataset_text, dataset_text_lengths = batch.dataset_text

                # convert to 1d tensor
                predictions = model(dataset_text, dataset_text_lengths).squeeze()

                # compute loss and accuracy
                loss = criterion(predictions, batch.dataset_label)
                acc = binary_accuracy(predictions, batch.dataset_label)

                # keep track of loss and accuracy
                epoch_loss += loss.item()
                epoch_acc += acc.item()
        return epoch_loss / len(iterator), epoch_acc / len(iterator)

    N_EPOCHS = numEpochs
    best_valid_loss = float('inf')
    best_valid_acc = 0.0
    best_train_acc = 0.0
    for epoch in range(N_EPOCHS):
        # train the model
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

        # evaluate the model
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

        # save the best model
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), './saved_weights.pt')
        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc
        if train_acc > best_train_acc:
            best_train_acc = train_acc

        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc * 100:.2f}% \n')

    model.load_state_dict(torch.load('./saved_weights.pt'))

    savedModelName = f'{userName}_{projectName}_{projectType}.pt'
    print("Saved Model Name", savedModelName)
    torch.save(model, savedModelName)
    savedTokenizerName = f'{userName}_{projectName}_{projectType}.pkl'
    os.rename('./tokenizer.pkl', savedTokenizerName)

    # prepare model information file
    model_info = {}
    model_info['numClasses'] = num_classes
    model_info['classNames'] = dataset_label.vocab.itos
    model_info['modelName'] = savedModelName
    model_info['userName'] = userName
    model_info['projectName'] = projectName
    model_info['bestTestAcc'] = best_valid_acc
    model_info['bestTrainAcc'] = best_train_acc
    print(model_info)
    model_info_file_name = f'{userName}_{projectName}_{projectType}.json'
    with open(model_info_file_name, "w") as outfile:
        json.dump(model_info, outfile)

    print('Saving model info and model to s3')
    # S3_BUCKET_OUTPUT = 'gauravp-eva4-capstone-models'  # Find number of users
    # s3 = boto3.client('s3', aws_access_key_id='aws_access_key_id', aws_secret_access_key='aws_secret_access_key')
    s3.upload_file(model_info_file_name, S3_BUCKET_OUTPUT, model_info_file_name)
    s3.upload_file(savedModelName, S3_BUCKET_OUTPUT, savedModelName)
    s3.upload_file(savedTokenizerName, S3_BUCKET_OUTPUT, savedTokenizerName)
    print("Done!!!")
def train_section_model(case_folder, params=None): """Trains a section formatting model. If no specific parameters are specified, the best identified values are used. OUTPUT: Trained model, text vocabulary and label vocabulary""" _prepare_data(case_folder) if params is None: params = {'embedding_dim': 100, 'num_hidden_nodes': 32, 'num_output_nodes': 5, 'bidirection': True, 'num_layers': 2, 'dropout': 0.2} t.backends.cudnn.deterministic = True TEXT = data.Field(tokenize='spacy', batch_first=True, include_lengths=True) LABEL = data.LabelField(dtype=t.long, batch_first=True) fields = [('text', TEXT), ('label', LABEL)] training_data = data.TabularDataset(path='externals/tmp/dataset.csv', format='csv', fields=fields, skip_header=True) train_data, valid_data = training_data.split(split_ratio=0.2, random_state=random.seed(2020)) TEXT.build_vocab(training_data, min_freq=1, vectors="glove.6B.100d") LABEL.build_vocab(training_data) # check whether cuda is available device = t.device('cuda' if t.cuda.is_available() else 'cpu') # set batch size BATCH_SIZE = 32 # Load an iterator train_iterator, valid_iterator = data.BucketIterator.splits( (train_data, valid_data), batch_size=BATCH_SIZE, sort_key=lambda x: len(x.text), sort_within_batch=True, device=device) # define hyperparameters size_of_vocab = len(TEXT.vocab) params['size_of_vocab'] = size_of_vocab # instantiate the model model = internal_functions.classifier(size_of_vocab, params['embedding_dim'], params['num_hidden_nodes'], params['num_output_nodes'], params['num_layers'], bidirectional=params['bidirection'], dropout=params['dropout']) # Initialize the pretrained embedding pretrained_embeddings = TEXT.vocab.vectors model.embedding.weight.data.copy_(pretrained_embeddings) model, optimizer, criterion = internal_functions.optimizer_and_loss(model, device) # Now, we train the model N_EPOCHS = 10 best_valid_loss = float('inf') for epoch in range(N_EPOCHS): # train the model model, train_loss, train_acc = internal_functions.train(model, train_iterator, optimizer, criterion) # evaluate the model valid_loss, valid_acc = internal_functions.evaluate(model, valid_iterator, criterion) # save the best model if valid_loss < best_valid_loss: best_valid_loss = valid_loss _save_obj({'params': params, 'model': model.state_dict(), 'vocab_dict': TEXT.vocab.stoi, 'label_dict': LABEL.vocab.stoi, 'acc': valid_acc, 'timestamp': datetime.datetime.utcnow()}, 'externals/tmp/section_model') # print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%') # print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc * 100:.2f}%') os.remove('externals/tmp/dataset.csv') if not os.path.exists('externals/'+device.type+'_section_model.pkl'): shutil.move('externals/tmp/section_model.pkl', 'externals/'+device.type+'_section_model.pkl') return model, TEXT.vocab.stoi, LABEL.vocab.stoi
def test_stratified_dataset_split(self): num_examples, num_labels = 30, 3 self.write_test_splitting_dataset(num_examples=num_examples, num_labels=num_labels) text_field = data.Field() label_field = data.LabelField() fields = [('text', text_field), ('label', label_field)] dataset = data.TabularDataset(path=self.test_dataset_splitting_path, format="csv", fields=fields) # Default split ratio expected_train_size = 21 expected_test_size = 9 train, test = dataset.split(stratified=True) assert len(train) == expected_train_size assert len(test) == expected_test_size # Test array arguments with same ratio split_ratio = [0.7, 0.3] train, test = dataset.split(split_ratio=split_ratio, stratified=True) assert len(train) == expected_train_size assert len(test) == expected_test_size # Test strata_field argument train, test = dataset.split(split_ratio=split_ratio, stratified=True, strata_field='label') assert len(train) == expected_train_size assert len(test) == expected_test_size # Test invalid field name strata_field = 'dummy' with pytest.raises(ValueError): dataset.split(split_ratio=split_ratio, stratified=True, strata_field=strata_field) # Test uneven stratify sizes num_examples, num_labels = 28, 3 self.write_test_splitting_dataset(num_examples=num_examples, num_labels=num_labels) # 10 examples for class 1 and 9 examples for classes 2,3 dataset = data.TabularDataset(path=self.test_dataset_splitting_path, format="csv", fields=fields) expected_train_size = 7 + 6 + 6 expected_test_size = 3 + 3 + 3 train, test = dataset.split(split_ratio=split_ratio, stratified=True) assert len(train) == expected_train_size assert len(test) == expected_test_size split_ratio = [0.7, 0.3] train, test = dataset.split(split_ratio=split_ratio, stratified=True) assert len(train) == expected_train_size assert len(test) == expected_test_size # Add validation set split_ratio = [0.6, 0.3, 0.1] expected_train_size = 6 + 5 + 5 expected_valid_size = 1 + 1 + 1 expected_test_size = 3 + 3 + 3 train, valid, test = dataset.split(split_ratio=split_ratio, stratified=True) assert len(train) == expected_train_size assert len(valid) == expected_valid_size assert len(test) == expected_test_size
def test_dataset_split_arguments(self): num_examples, num_labels = 30, 3 self.write_test_splitting_dataset(num_examples=num_examples, num_labels=num_labels) text_field = data.Field() label_field = data.LabelField() fields = [('text', text_field), ('label', label_field)] dataset = data.TabularDataset(path=self.test_dataset_splitting_path, format="csv", fields=fields) # Test default split ratio (0.7) expected_train_size = 21 expected_test_size = 9 train, test = dataset.split() assert len(train) == expected_train_size assert len(test) == expected_test_size # Test array arguments with same ratio split_ratio = [0.7, 0.3] train, test = dataset.split(split_ratio=split_ratio) assert len(train) == expected_train_size assert len(test) == expected_test_size # Add validation set split_ratio = [0.6, 0.3, 0.1] expected_train_size = 18 expected_valid_size = 3 expected_test_size = 9 train, valid, test = dataset.split(split_ratio=split_ratio) assert len(train) == expected_train_size assert len(valid) == expected_valid_size assert len(test) == expected_test_size # Test ratio normalization split_ratio = [6, 3, 1] train, valid, test = dataset.split(split_ratio=split_ratio) assert len(train) == expected_train_size assert len(valid) == expected_valid_size assert len(test) == expected_test_size # Test only two splits returned for too small valid split size split_ratio = [0.66, 0.33, 0.01] expected_length = 2 splits = dataset.split(split_ratio=split_ratio) assert len(splits) == expected_length # Test invalid arguments split_ratio = 1.1 with pytest.raises(AssertionError): dataset.split(split_ratio=split_ratio) split_ratio = -1. with pytest.raises(AssertionError): dataset.split(split_ratio=split_ratio) split_ratio = [0.7] with pytest.raises(AssertionError): dataset.split(split_ratio=split_ratio) split_ratio = [1, 2, 3, 4] with pytest.raises(AssertionError): dataset.split(split_ratio=split_ratio) split_ratio = "string" with pytest.raises(ValueError): dataset.split(split_ratio=split_ratio)
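# A hedged sketch of the helper the two tests above rely on (the real
# write_test_splitting_dataset lives elsewhere in the test suite; this is an
# assumed reconstruction, written as a plain function): it writes num_examples
# CSV rows, cycling the label so each of num_labels classes gets an (almost)
# equal share — e.g. 30 examples over 3 labels is 10 per class, which the
# default 0.7 split turns into the 21/9 sizes asserted above.
import csv

def write_test_splitting_dataset(path, num_examples, num_labels):
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        for i in range(num_examples):
            writer.writerow(['some sample text', 'label%d' % (i % num_labels)])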
def main(): parser = argparse.ArgumentParser() parser.add_argument('--train', type=Path, required=True) parser.add_argument('--dev', type=Path, required=True) parser.add_argument('--output-dir', type=Path, required=True) parser.add_argument('--epochs', type=int, default=20) parser.add_argument('--batch_size', type=int, default=512) args = parser.parse_args() device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') TEXT = data.Field(tokenize='spacy', lower=True) LABEL = data.LabelField() train_csv = os.path.join('/', args.output_dir, 'train.csv') json_to_csv(args.train, train_csv) dev_csv = os.path.join('/', args.output_dir, 'dev.csv') json_to_csv(args.dev, dev_csv) train_data, val_data = data.TabularDataset.splits(path=args.output_dir, train='train.csv', validation='dev.csv', format='csv', skip_header=True, fields=[ ('sentence1', TEXT), ('sentence2', TEXT), ('gold_label', LABEL) ]) TEXT.build_vocab(train_data, min_freq=2, vectors="glove.6B.300d", unk_init=torch.Tensor.normal_) field_path = os.path.join('/', args.output_dir, 'bilstm-field.pt') torch.save(TEXT, field_path, pickle_module=dill) LABEL.build_vocab(train_data) train_iterator, valid_iterator = data.BucketIterator.splits( (train_data, val_data), batch_size=args.batch_size, device=device, sort_key=lambda x: len(x.sentence1), sort_within_batch=False) pad_idx = TEXT.vocab.stoi[TEXT.pad_token] model = BiLSTM(input_dim=len(TEXT.vocab), embedding_dim=300, hidden_dim=300, lstm_layers=2, fc_layers=3, output_dim=len(LABEL.vocab), dropout=0.25, pad_idx=pad_idx).to(device) model.embedding.weight.data[pad_idx] = torch.zeros(300) model.embedding.weight.requires_grad = True optimizer = optim.Adam(model.parameters()) ce_loss = nn.CrossEntropyLoss().to(device) #torch.set_default_tensor_type('torch.cuda.FloatTensor') best_valid_loss = float('inf') model_path = os.path.join('/', args.output_dir, 'bilstm.pt') for epoch in range(args.epochs): train_loss = 0 train_acc = 0 model.train() for batch in train_iterator: prem = batch.sentence1 hypo = batch.sentence2 labels = batch.gold_label optimizer.zero_grad() predictions = model(prem, hypo) loss = ce_loss(predictions, labels) acc = accuracy(predictions, labels) loss.backward() optimizer.step() train_loss += loss.item() train_acc += acc.item() train_loss = train_loss / len(train_iterator) train_acc = train_acc / len(train_iterator) valid_loss = 0 valid_acc = 0 model.eval() with torch.no_grad(): for batch in valid_iterator: prem = batch.sentence1 hypo = batch.sentence2 labels = batch.gold_label predictions = model(prem, hypo) loss = ce_loss(predictions, labels) acc = accuracy(predictions, labels) valid_loss += loss.item() valid_acc += acc.item() valid_loss = valid_loss / len(valid_iterator) valid_acc = valid_acc / len(valid_iterator) if valid_loss < best_valid_loss: best_valid_loss = valid_loss torch.save(model.state_dict(), model_path) print(f'Epoch: {epoch+1:02}') print( f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%' ) print( f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%' )
def main(config):
    if not os.path.exists(config.model_dir):
        os.makedirs(config.model_dir)
    if not os.path.exists(config.log_dir):
        os.makedirs(config.log_dir)

    print("\t \t \t the model name is {}".format(config.model_name))
    device, n_gpu = get_device()

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.seed)
        # make cuDNN use deterministic algorithms so results are reproducible
        torch.backends.cudnn.deterministic = True

    """ SST-2 data preparation """
    text_field = data.Field(tokenize='spacy',
                            lower=True,
                            include_lengths=True,
                            fix_length=config.sequence_length)
    label_field = data.LabelField(dtype=torch.long)
    train_iterator, dev_iterator, test_iterator = load_sst2(
        config.data_path, text_field, label_field, config.batch_size, device,
        config.glove_word_file, config.cache_path)

    """ Word vector preparation """
    pretrained_embeddings = text_field.vocab.vectors

    model_file = config.model_dir + 'model1.pt'

    """ Model preparation """
    if config.model_name == "TextCNN":
        from TextCNN import TextCNN
        filter_sizes = [int(val) for val in config.filter_sizes.split()]
        model = TextCNN.TextCNN(config.glove_word_dim, config.filter_num,
                                filter_sizes, config.output_dim,
                                config.dropout, pretrained_embeddings)
    elif config.model_name == "TextRNN":
        from TextRNN import TextRNN
        model = TextRNN.TextRNN(config.glove_word_dim, config.output_dim,
                                config.hidden_size, config.num_layers,
                                config.bidirectional, config.dropout,
                                pretrained_embeddings)
    elif config.model_name == "LSTMATT":
        from LSTM_ATT import LSTMATT
        model = LSTMATT.LSTMATT(config.glove_word_dim, config.output_dim,
                                config.hidden_size, config.num_layers,
                                config.bidirectional, config.dropout,
                                pretrained_embeddings)
    elif config.model_name == 'TextRCNN':
        from TextRCNN import TextRCNN
        model = TextRCNN.TextRCNN(config.glove_word_dim, config.output_dim,
                                  config.hidden_size, config.num_layers,
                                  config.bidirectional, config.dropout,
                                  pretrained_embeddings)

    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()

    model = model.to(device)
    criterion = criterion.to(device)

    if config.do_train:
        train(config.epoch_num, model, train_iterator, dev_iterator, optimizer,
              criterion, ['0', '1'], model_file, config.log_dir,
              config.print_step, 'word')

    model.load_state_dict(torch.load(model_file))
    test_loss, test_acc, test_report = evaluate(model, test_iterator,
                                                criterion, ['0', '1'], 'word')
    print("-------------- Test -------------")
    print("\t Loss: {} | Acc: {} | Micro avg F1: {} | Macro avg F1: {} | Weighted avg F1: {}"
          .format(test_loss, test_acc, test_report['micro avg']['f1-score'],
                  test_report['macro avg']['f1-score'],
                  test_report['weighted avg']['f1-score']))
def main(file_path, batch_size, base_model, num_epochs):
    """Train movie sentiment model"""
    # %%
    # base_model = "roberta-base"
    # batch_size=8
    # num_epochs=5
    print("Initializing models")
    tokenizer = RobertaTokenizerFast.from_pretrained(base_model)
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    model = RoBERTaSentimentClassifier(device=device, base_model=base_model)
    print(f"Using device {model.device}")

    #%%
    train_cache = Path(".data/cache/train_data")
    val_cache = Path(".data/cache/validate_data")
    if train_cache.exists() and val_cache.exists():
        print("Load cached datasets")
        train = load_cached_dataset(train_cache)
        val = load_cached_dataset(val_cache)
    else:
        print("Generating datasets")
        PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
        UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

        # set up fields
        TEXT = data.Field(use_vocab=False,
                          include_lengths=False,
                          batch_first=True,
                          lower=False,
                          fix_length=512,
                          tokenize=tokenizer.encode,
                          pad_token=PAD_INDEX,
                          unk_token=UNK_INDEX)
        LABEL = data.LabelField()

        # make splits for data
        train, test = datasets.IMDB.splits(TEXT, LABEL)
        LABEL.build_vocab(train)
        test, val = test.split(split_ratio=0.9)

        print("Cache train and validate sets")
        save_cached_dataset(train, train_cache)
        save_cached_dataset(val, val_cache)

    print("Prepare dataset iterators")
    # make iterator for splits
    train_iter, val_iter = data.BucketIterator.splits((train, val),
                                                      batch_size=batch_size,
                                                      device=device)

    #%%
    for batch in val_iter:
        if batch.text.shape[0] != batch.label.shape[0]:
            print(batch)
    # print(batch.text.shape, batch.label.shape)
    # break

    #%%
    #dir(val_iter)

    #%%
    # initialize running values
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []
    best_valid_loss = float("Inf")

    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

    for item in train_iter:
        print(item)
        break

    print("Start training")
    for epoch in range(1, num_epochs + 1):
        print(f"Epoch {epoch}")
        train_iter.init_epoch()
        val_iter.init_epoch()
        for i, (text, labels) in enumerate(tqdm(train_iter, desc="train")):
            labels = labels.type(torch.LongTensor)
            labels = labels.to(device)
            output = model(text, labels)
            loss, _ = output
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

        model.eval()
        with torch.no_grad():
            answers = []
            # validation loop
            for i, (text, labels) in enumerate(tqdm(val_iter, desc="validate")):
                labels = labels.type(torch.LongTensor)
                labels = labels.to(device)
                output = model(text, labels)
                loss, preds = output
                correct = torch.argmax(preds, dim=1) == labels
                answers.extend(correct.cpu().tolist())
                valid_running_loss += loss.item()
        average_accuracy = sum([1 for a in answers if a]) / len(answers)

        # evaluation: average the running losses over the number of batches
        average_train_loss = running_loss / len(train_iter)
        average_valid_loss = valid_running_loss / len(val_iter)
        train_loss_list.append(average_train_loss)
        valid_loss_list.append(average_valid_loss)
        global_steps_list.append(global_step)

        # resetting running values
        running_loss = 0.0
        valid_running_loss = 0.0
        model.train()

        # print progress (epoch already starts at 1)
        print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}, Valid Acc: {:.4f}'
              .format(epoch, num_epochs, global_step,
                      num_epochs * len(train_iter), average_train_loss,
                      average_valid_loss, average_accuracy))

        # checkpoint
        if best_valid_loss > average_valid_loss:
            best_valid_loss = average_valid_loss
            save_checkpoint(file_path + '/' + 'model.pt', model, best_valid_loss)
            save_metrics(file_path + '/' + 'metrics.pt', train_loss_list,
                         valid_loss_list, global_steps_list)

    save_metrics(file_path + '/' + 'metrics.pt', train_loss_list,
                 valid_loss_list, global_steps_list)
    print('Finished Training!')
def create_fields():
    TEXT = data.Field(sequential=True, tokenize="basic_english")
    LABEL = data.LabelField(dtype=torch.float)
    return TEXT, LABEL
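# Usage sketch (illustrative; the TSV path and column layout are assumptions):
# the two fields returned above plug straight into a TabularDataset.
TEXT, LABEL = create_fields()
dataset = data.TabularDataset(path='reviews.tsv', format='tsv',
                              fields=[('text', TEXT), ('label', LABEL)])
TEXT.build_vocab(dataset)
LABEL.build_vocab(dataset)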
prediction = model(tensor, length_tensor) #prediction return prediction.item() SEED = 42 BATCH_SIZE = 64 torch.manual_seed(SEED) embedding_dim = 100 num_hidden_nodes = 32 num_output_nodes = 1 num_layers = 2 bidirection = True dropout = 0.2 TEXT = data.Field(tokenize='spacy', batch_first=True, include_lengths=True) LABEL = data.LabelField(dtype=torch.float, batch_first=True) fields = [(None, None), ('text', TEXT), ('label', LABEL)] training_data = data.TabularDataset(path='quora.csv', format='csv', fields=fields, skip_header=True) train_data, valid_data = training_data.split(split_ratio=0.8, random_state=random.seed(SEED)) TEXT.build_vocab(train_data, min_freq=3, vectors="glove.6B.100d") size_of_vocab = len(TEXT.vocab) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') model = classifier(size_of_vocab,
        np.random.binomial(1, p=self.p_word_dropout,
                           size=tuple(data.size())).astype('uint8'))
        if self.gpu:
            mask = mask.cuda()

        # Set dropped words to <unk>
        data[mask] = self.UNK_IDX
        return Variable(data)

###########################################################################

mTEXT = data.Field(tokenize='spacy')
mLABEL = data.LabelField(dtype=torch.float)

print("loading dataset male_sent_obftrain_less700.tsv...")
mtrain = data.TabularDataset.splits(path='../sent/ori_gender_data/',
                                    train='male_sent_obftrain_less700.tsv',
                                    format='tsv',
                                    fields=[('Text', mTEXT),
                                            ('Label', mLABEL)])[0]

print("creating vocab for mTEXT")
mTEXT.build_vocab(mtrain, max_size=60000, vectors="glove.6B.100d")
mLABEL.build_vocab(mtrain)
mLABEL.vocab.stoi['1'] = 1
mLABEL.vocab.stoi['2'] = 2
mLABEL.vocab.stoi['3'] = 3
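# A self-contained illustration (assumption, not from the original) of the
# word-dropout idea in the method above: sample a Bernoulli mask and replace
# the selected token ids with an <unk> index.
import numpy as np
import torch

UNK_IDX = 0
tokens = torch.tensor([[5, 9, 2, 7], [3, 4, 8, 6]])
mask = torch.from_numpy(
    np.random.binomial(1, p=0.3, size=tuple(tokens.size())).astype('bool'))
tokens[mask] = UNK_IDX  # dropped positions now point at the <unk> embedding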
def main(config, model_filename): if not os.path.exists(config.output_dir): os.makedirs(config.output_dir) if not os.path.exists(config.cache_dir): os.makedirs(config.cache_dir) model_file = os.path.join(config.output_dir, model_filename) # Prepare the device gpu_ids = [int(device_id) for device_id in config.gpu_ids.split()] device, n_gpu = get_device(gpu_ids[0]) if n_gpu > 1: n_gpu = len(gpu_ids) # Set Random Seeds random.seed(config.seed) torch.manual_seed(config.seed) np.random.seed(config.seed) if n_gpu > 0: torch.cuda.manual_seed_all(config.seed) torch.backends.cudnn.deterministic = True # Prepare the data id_field = data.RawField() id_field.is_target = False text_field = data.Field(tokenize='spacy', lower=True, include_lengths=True) label_field = data.LabelField(dtype=torch.long) train_iterator, dev_iterator, test_iterator = load_data( config.data_path, id_field, text_field, label_field, config.train_batch_size, config.dev_batch_size, config.test_batch_size, device, config.glove_word_file, config.cache_dir) # Word Vector word_emb = text_field.vocab.vectors if config.model_name == "GAReader": from Baselines.GAReader.GAReader import GAReader model = GAReader(config.glove_word_dim, config.output_dim, config.hidden_size, config.rnn_num_layers, config.ga_layers, config.bidirectional, config.dropout, word_emb) print(model) # optimizer = optim.Adam(model.parameters(), lr=config.lr) optimizer = optim.SGD(model.parameters(), lr=config.lr) criterion = nn.CrossEntropyLoss() model = model.to(device) criterion = criterion.to(device) if config.do_train: train(config.epoch_num, model, train_iterator, dev_iterator, optimizer, criterion, ['0', '1', '2', '3', '4'], model_file, config.log_dir, config.print_step, config.clip) model.load_state_dict(torch.load(model_file)) test_loss, test_acc, test_report = evaluate(model, test_iterator, criterion, ['0', '1', '2', '3', '4']) print("-------------- Test -------------") print("\t Loss: {} | Acc: {} | Macro avg F1: {} | Weighted avg F1: {}". format(test_loss, test_acc, test_report['macro avg']['f1-score'], test_report['weighted avg']['f1-score']))
def main(args): aspects = [] with open(args.dataset + "/" + args.dataset + "_aspects.txt") as f: for line in f: lst = line.split() aspect = lst[0].lower() aspects.append(aspect) print(aspects) TEXT = data.Field(tokenize=tokenizer) train_data = data.TabularDataset(path=args.dataset + "/" + args.dataset + "_train.csv", format='csv', fields=[('text', TEXT)]) LABEL = data.LabelField() test_data = data.TabularDataset(path=args.dataset + "/" + args.dataset + "_test.csv", format='csv', fields=[('text', TEXT), ('label', LABEL)]) embedding = torchtext.vocab.Vectors(args.dataset + "/" + args.dataset + ".200d.txt") MAX_VOCAB_SIZE = 40000 TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE, vectors=embedding, unk_init=torch.Tensor.normal_) LABEL.build_vocab(test_data) print(LABEL.vocab.stoi) print(LABEL.vocab.itos) BATCH_SIZE = int(len(train_data) / 500) if torch.cuda.is_available(): torch.cuda.set_device(6) device = torch.device('cuda') else: device = torch.device('cpu') #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #device = torch.device('cpu') train_iterator = data.BucketIterator(train_data, batch_size=BATCH_SIZE, device=device, sort=False) test_iterator = data.BucketIterator(test_data, batch_size=len(test_data), device=device, sort=False) LABEL_KPLUS = data.LabelField() test_kplus_data = data.TabularDataset(path=args.dataset + "/" + args.dataset + "_test_kplus.csv", format='csv', fields=[('text', TEXT), ('label', LABEL_KPLUS)]) test_kplus_iterator = data.BucketIterator(test_kplus_data, batch_size=len(test_kplus_data), device=device, sort=False) LABEL_KPLUS.build_vocab(test_kplus_data) print(LABEL_KPLUS.vocab.stoi) from sklearn import metrics def train_metric(preds, label): max_preds = preds.argmax(dim=1) max_label = label.argmax(dim=1) acc = metrics.accuracy_score(max_label.cpu().numpy(), max_preds.cpu().numpy()) return acc def train(model, pseudolabel, iterator, optimizer): criterion = nn.KLDivLoss() epoch_loss = 0 epoch_acc = 0 model.train() pseudolabel.eval() for batch in iterator: optimizer.zero_grad() probs, _ = model(batch.text) #[batch size, output dim] p, q = pseudolabel(batch.text) loss = criterion(torch.log(probs), p.detach()) acc = train_metric(probs, p.detach()) loss.backward() optimizer.step() epoch_loss += loss.item() epoch_acc += acc return epoch_loss / len(iterator), epoch_acc / len(iterator) def evaluate(model, eval_data, LABEL): preds = [] labels = [] for e in eval_data.examples: pred = predict(model, e.text) preds.append(pred) labels.append(LABEL.vocab.stoi[e.label]) f1 = metrics.f1_score(labels, preds, average='weighted') acc = metrics.accuracy_score(labels, preds) return acc, f1 def predict_class(model, sentence, min_len=5): model.eval() tokenized = [tok for tok in tokenizer(sentence)] if len(tokenized) < min_len: tokenized += ['<pad>'] * (min_len - len(tokenized)) indexed = [TEXT.vocab.stoi[t] for t in tokenized] tensor = torch.LongTensor(indexed).to(device) tensor = tensor.unsqueeze(1) preds, _ = model(tensor) max_preds = preds.argmax(dim=1) return max_preds.item() def predict(model, sentence, min_len=5): model.eval() if len(sentence) < min_len: sentence += ['<pad>'] * (min_len - len(sentence)) indexed = [TEXT.vocab.stoi[t] for t in sentence] tensor = torch.LongTensor(indexed).to(device) tensor = tensor.unsqueeze(1) preds, _ = model(tensor) max_preds = preds.argmax(dim=1) return max_preds.item() def predict_pseudolabel(model, sentence, min_len=5): model.eval() if len(sentence) < min_len: sentence += ['<pad>'] * (min_len - len(sentence)) indexed = 
[TEXT.vocab.stoi[t] for t in sentence] tensor = torch.LongTensor(indexed).to(device) tensor = tensor.unsqueeze(1) _, preds = model(tensor) max_preds = preds.argmax(dim=1) return max_preds.item() def get_qs(model, sentence, min_len=5): model.eval() if len(sentence) < min_len: sentence += ['<pad>'] * (min_len - len(sentence)) indexed = [TEXT.vocab.stoi[t] for t in sentence] tensor = torch.LongTensor(indexed).to(device) tensor = tensor.unsqueeze(1) p, q = model(tensor) max_q = torch.max(q, 1)[1] return q, max_q def get_p(model, sentence, min_len=5): model.eval() if len(sentence) < min_len: sentence += ['<pad>'] * (min_len - len(sentence)) indexed = [TEXT.vocab.stoi[t] for t in sentence] tensor = torch.LongTensor(indexed).to(device) tensor = tensor.unsqueeze(1) preds, classes = model(tensor) return preds, classes import datetime time = int(datetime.datetime.now().timestamp()) if not os.path.exists('outputs'): os.makedirs('outputs') import logging logging.basicConfig(filename='outputs/' + str(time) + 'train-' + args.dataset + '.log', level=logging.DEBUG) logging.debug("no filtering: " + str(args.no_filtering)) logging.debug("no tuning: " + str(args.no_tuning)) import collections seed_words_d = collections.defaultdict(set) with open(args.dataset + "/" + args.dataset + "_seeds.txt") as f: for line in f: lst = line.split() w1 = lst[0].lower() w2 = lst[1].lower() seed_words_d[w2].add(w1) seed_words = sorted(seed_words_d.items(), key=lambda x: LABEL.vocab.stoi[x[0]]) print(seed_words) def get_seed_embedding(seed_words): SEED_WORDS = [] for w, lst in seed_words: temp = [] for e in lst: temp.append( TEXT.vocab.vectors[TEXT.vocab.stoi[e]].unsqueeze(0)) embeds = torch.cat(temp) embed = torch.mean(embeds, dim=0) SEED_WORDS.append(embed.unsqueeze(0)) SEED_WORDS = torch.cat(SEED_WORDS) SEED_WORDS = SEED_WORDS.unsqueeze(1) SEED_WORDS = SEED_WORDS.unsqueeze(1) return SEED_WORDS SEED_WORDS = get_seed_embedding(seed_words) print(SEED_WORDS.shape) def init_kmodel(SEED_WORDS): INPUT_DIM = len(TEXT.vocab) EMBEDDING_DIM = 200 N_FILTERS = 100 FILTER_SIZES = [2, 3, 4] KOUTPUT_DIM = len(LABEL.vocab) DROPOUT = 0.5 PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] k_model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, KOUTPUT_DIM, DROPOUT, PAD_IDX) k_model = k_model.to(device) #k_model.load_state_dict(torch.load('k-model.pt')) k_pseudolabel = PseudoLabel(INPUT_DIM, EMBEDDING_DIM, KOUTPUT_DIM, KOUTPUT_DIM, PAD_IDX, SEED_WORDS) k_pseudolabel.eval() k_pseudolabel = k_pseudolabel.to(device) pretrained_embeddings = TEXT.vocab.vectors k_model.embedding.weight.data.copy_(pretrained_embeddings) k_pseudolabel.embedding.weight.data.copy_(pretrained_embeddings) UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token] k_model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM) k_model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM) k_pseudolabel.embedding.weight.data[UNK_IDX] = torch.zeros( EMBEDDING_DIM) k_pseudolabel.embedding.weight.data[PAD_IDX] = torch.zeros( EMBEDDING_DIM) return k_model, k_pseudolabel k_model, k_pseudolabel = init_kmodel(SEED_WORDS) k_model_optimizer = optim.Adam( filter(lambda p: p.requires_grad, k_model.parameters())) N_EPOCHS = 5 for epoch in range(N_EPOCHS): print("epoch: ", epoch + 1) train_loss, train_acc = train(k_model, k_pseudolabel, train_iterator, k_model_optimizer) print("training loss: ", train_loss) print("training accuracy: ", train_acc) valid_acc, valid_f1 = evaluate(k_model, test_data, LABEL) print("validation accuracy: ", valid_acc) print('validation F1:', valid_f1) 
torch.cuda.empty_cache() preds = [] labels = [] for e in test_data.examples: pred = predict(k_model, e.text) preds.append(pred) labels.append(LABEL.vocab.stoi[e.label]) def log_info(labels, preds): print(metrics.accuracy_score(labels, preds)) logging.debug(metrics.accuracy_score(labels, preds)) print(metrics.precision_score(labels, preds, average='weighted')) logging.debug( metrics.precision_score(labels, preds, average='weighted')) print(metrics.recall_score(labels, preds, average='weighted')) logging.debug(metrics.recall_score(labels, preds, average='weighted')) print(metrics.f1_score(labels, preds, average='weighted')) logging.debug(metrics.f1_score(labels, preds, average='weighted')) m = confusion_matrix(labels, preds) print(m) logging.debug(m) log_info(labels, preds) logging.debug("k pseudolabel") preds = [] labels = [] for e in test_data.examples: pred = predict_pseudolabel(k_pseudolabel, e.text) preds.append(pred) labels.append(LABEL.vocab.stoi[e.label]) log_info(labels, preds) def compute_threshold(): lst1 = [] lst2 = [] for e in train_data.examples: qs, _ = get_qs(k_pseudolabel, e.text) preds, _ = get_p(k_model, e.text) vs = [v.item() for v in preds.squeeze(0) if v.item() != 0] h_norm = (-1 / math.log(preds.shape[1])) * sum( [v * math.log(v) for v in vs]) #if e.label == 'miscellaneous': #lst1.append(int(h_norm*100)) #else: lst2.append(int(h_norm * 100)) a = np.array(lst2) threshold = np.quantile(a, args.quantile) / 100 return threshold threshold = compute_threshold() #threshold = 0.2 print("threshold:", threshold) logging.debug("threshold:" + str(threshold)) def init_kplusmodel(SEED_WORDS): INPUT_DIM = len(TEXT.vocab) EMBEDDING_DIM = 200 N_FILTERS = 100 FILTER_SIZES = [2, 3, 4] OUTPUT_DIM = len(LABEL_KPLUS.vocab) DROPOUT = 0.5 PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] kplus_model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX) kplus_model = kplus_model.to(device) kplus_pseudolabel = PseudoLabelPlus(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM - 1, OUTPUT_DIM - 1, PAD_IDX, SEED_WORDS, k_model, threshold, args.upperbound, device, LABEL_KPLUS.vocab.stoi) kplus_pseudolabel = kplus_pseudolabel.to(device) kplus_pseudolabel.eval() pretrained_embeddings = TEXT.vocab.vectors kplus_model.embedding.weight.data.copy_(pretrained_embeddings) kplus_pseudolabel.embedding.weight.data.copy_(pretrained_embeddings) UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token] kplus_model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM) kplus_model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM) kplus_pseudolabel.embedding.weight.data[UNK_IDX] = torch.zeros( EMBEDDING_DIM) kplus_pseudolabel.embedding.weight.data[PAD_IDX] = torch.zeros( EMBEDDING_DIM) return kplus_model, kplus_pseudolabel kplus_model, kplus_pseudolabel = init_kplusmodel(SEED_WORDS) kplus_model_optimizer = optim.Adam( filter(lambda p: p.requires_grad, kplus_model.parameters())) N_EPOCHS = 5 for epoch in range(N_EPOCHS): print("epoch: ", epoch + 1) train_loss, train_acc = train(kplus_model, kplus_pseudolabel, train_iterator, kplus_model_optimizer) print("training loss: ", train_loss) print("training accuracy: ", train_acc) valid_acc, valid_f1 = evaluate(kplus_model, test_kplus_data, LABEL_KPLUS) print("validation accuracy: ", valid_acc) print('validation F1:', valid_f1) torch.cuda.empty_cache() preds = [] labels = [] for e in test_kplus_data.examples: pred = predict(kplus_model, e.text) preds.append(pred) labels.append(LABEL_KPLUS.vocab.stoi[e.label]) log_info(labels, preds) logging.debug("kplus 
pseudolabel") preds = [] labels = [] for e in test_kplus_data.examples: pred = predict_pseudolabel(kplus_pseudolabel, e.text) preds.append(pred) labels.append(LABEL_KPLUS.vocab.stoi[e.label]) log_info(labels, preds) import nltk import string from nltk.corpus import stopwords stop_words_en = list(set(stopwords.words('english'))) stop_words_fr = list(set(stopwords.words('french'))) stop_words_sp = list(set(stopwords.words('spanish'))) stop_words = set(stop_words_en + stop_words_fr + stop_words_sp) def update_seeds(seed_words_d, no_filtering, no_tuning): tf1 = collections.defaultdict(dict) pool1 = collections.defaultdict(dict) kl = nn.KLDivLoss() for e in test_data.examples: #p_orig, label = get_qs(k_pseudolabel, e.text) p_orig, label = get_p(k_model, e.text) kls = [] words = [] #label = LABEL.vocab.itos[label.item()] label = e.label for i in range(len(e.text)): tmp = e.text[i] if tmp not in tf1[label]: tf1[label][tmp] = 0 tf1[label][tmp] += 1 if tmp in stop_words or tmp in string.punctuation or tmp == '<unk>' or tmp == '<pad>': continue e.text[i] = '<unk>' #p_new, _ = get_qs(k_pseudolabel, e.text) p_new, _ = get_p(k_model, e.text) loss = kl(torch.log(p_orig.detach()), p_new.detach()) kls.append(loss.item()) words.append(tmp) e.text[i] = tmp lst = list(zip(words, kls)) lst.sort(key=lambda x: x[1], reverse=True) #print(lst[:len_]) if not no_tuning: for i in range(len(lst) // 4): threshold = 5e-2 if lst[i][1] > threshold: if lst[i][0] not in pool1[label]: pool1[label][lst[i][0]] = 0 pool1[label][lst[i][0]] += lst[i][1] pops1 = collections.defaultdict(dict) aspects1 = list(tf1.keys()) for i in range(len(aspects1)): for word in tf1[aspects1[i]]: sum_ = 0 for j in range(len(aspects1)): if word in tf1[aspects1[j]]: sum_ += tf1[aspects1[j]][word] pops1[aspects1[i]][word] = tf1[aspects1[i]][word] / sum_ dists1 = collections.defaultdict(dict) for i in range(len(aspects1)): for word in tf1[aspects1[i]]: max_ = 0 for j in range(len(aspects1)): if word in tf1[aspects1[j]]: max_ = max(max_, tf1[aspects1[j]][word]) dists1[aspects1[i]][word] = tf1[aspects1[i]][word] / max_ scores1 = collections.defaultdict(dict) for i in range(len(aspects1)): if no_tuning: for word in tf1[aspects1[i]]: scores1[aspects1[i]][word] = pops1[ aspects1[i]][word] * dists1[aspects1[i]][word] else: for word in pool1[aspects1[i]]: scores1[aspects1[i]][word] = pops1[ aspects1[i]][word] * dists1[aspects1[i]][word] candidates1 = collections.defaultdict(list) for aspect in aspects1: candidates1[aspect] = sorted(scores1[aspect].items(), key=lambda x: x[1], reverse=True) commons1 = set() aspects1 = list(candidates1.keys()) for i in range(len(aspects1) - 1): for j in range(i + 1, len(aspects1)): lst1, _ = zip(*candidates1[aspects1[i]]) lst2, _ = zip(*candidates1[aspects1[j]]) common = set.intersection(set(lst1), set(lst2)) for c in common: commons1.add(c) miscs = set() if not no_filtering: tf2 = collections.defaultdict(dict) pool2 = collections.defaultdict(dict) kl = nn.KLDivLoss() for e in test_kplus_data.examples: #p_orig, label = get_qs(kplus_pseudolabel, e.text) p_orig, label = get_p(kplus_model, e.text) kls = [] words = [] #label = LABEL.vocab.itos[label.item()] label = e.label for i in range(len(e.text)): tmp = e.text[i] if tmp not in tf2[label]: tf2[label][tmp] = 0 tf2[label][tmp] += 1 if tmp in stop_words or tmp in string.punctuation or tmp == '<unk>' or tmp == '<pad>': continue e.text[i] = '<unk>' #p_new, _ = get_qs(kplus_pseudolabel, e.text) p_new, _ = get_p(kplus_model, e.text) loss = kl(torch.log(p_orig.detach()), 
                kls.append(loss.item())
                words.append(tmp)
                e.text[i] = tmp
            lst = list(zip(words, kls))
            lst.sort(key=lambda x: x[1], reverse=True)
            #print(lst[:len_])
            for i in range(len(lst) // 4):
                threshold = 1e-2
                if lst[i][1] > threshold:
                    if lst[i][0] not in pool2[label]:
                        pool2[label][lst[i][0]] = 0
                    pool2[label][lst[i][0]] += lst[i][1]

        pops2 = collections.defaultdict(dict)
        aspects2 = list(tf2.keys())
        for i in range(len(aspects2)):
            for word in tf2[aspects2[i]]:
                sum_ = 0
                for j in range(len(aspects2)):
                    if word in tf2[aspects2[j]]:
                        sum_ += tf2[aspects2[j]][word]
                pops2[aspects2[i]][word] = tf2[aspects2[i]][word] / sum_

        dists2 = collections.defaultdict(dict)
        for i in range(len(aspects2)):
            for word in tf2[aspects2[i]]:
                max_ = 0
                for j in range(len(aspects2)):
                    if word in tf2[aspects2[j]]:
                        max_ = max(max_, tf2[aspects2[j]][word])
                dists2[aspects2[i]][word] = tf2[aspects2[i]][word] / max_

        scores2 = collections.defaultdict(dict)
        for i in range(len(aspects2)):
            for word in pool2[aspects2[i]]:
                scores2[aspects2[i]][word] = (pops2[aspects2[i]][word] *
                                              dists2[aspects2[i]][word])

        candidates2 = collections.defaultdict(list)
        for aspect in aspects2:
            candidates2[aspect] = sorted(scores2[aspect].items(),
                                         key=lambda x: x[1],
                                         reverse=True)
        print(candidates2['miscellaneous'])
        for i in range(len(candidates2['miscellaneous'])):
            word, score = candidates2['miscellaneous'][i]
            if score > 1e-2:
                miscs.add(word)

    for aspect in aspects:
        if not no_filtering:
            for word in miscs:
                if word in seed_words_d[aspect]:
                    seed_words_d[aspect].remove(word)
        i = 0
        while len(seed_words_d[aspect]) < args.seedword_limit and i < len(
                candidates1[aspect]):
            word, score = candidates1[aspect][i]
            if (word not in seed_words_d[aspect] and word not in commons1
                    and word not in miscs
                    and score >= args.score_threshold):
                seed_words_d[aspect].add(word)
            i += 1

    # Drop seed words shared between two aspects.
    commons2 = set()
    aspects2 = list(seed_words_d.keys())
    for i in range(len(aspects2) - 1):
        for j in range(i + 1, len(aspects2)):
            lst1 = seed_words_d[aspects2[i]]
            lst2 = seed_words_d[aspects2[j]]
            common = set.intersection(set(lst1), set(lst2))
            for c in common:
                commons2.add(c)
    for aspect in aspects2:
        for c in commons2:
            if c in seed_words_d[aspect]:
                seed_words_d[aspect].remove(c)


update_seeds(seed_words_d, args.no_filtering, args.no_tuning)
print(seed_words_d)
for k in seed_words_d:
    seed_words_d[k] = list(seed_words_d[k])
for k in seed_words_d:
    seed_words_d[k] = set(seed_words_d[k])
seed_words = sorted(seed_words_d.items(),
                    key=lambda x: LABEL.vocab.stoi[x[0]])
print(seed_words)
logging.debug(seed_words)

get_seed_embedding(seed_words)
k_model, k_pseudolabel = init_kmodel(SEED_WORDS)
k_model_optimizer = optim.Adam(
    filter(lambda p: p.requires_grad, k_model.parameters()))

N_EPOCHS = 5
for epoch in range(N_EPOCHS):
    print("epoch: ", epoch + 1)
    train_loss, train_acc = train(k_model, k_pseudolabel, train_iterator,
                                  k_model_optimizer)
    print("training loss: ", train_loss)
    print("training accuracy: ", train_acc)
    valid_acc, valid_f1 = evaluate(k_model, test_data, LABEL)
    print("validation accuracy: ", valid_acc)
    print('validation F1:', valid_f1)

torch.cuda.empty_cache()

preds = []
labels = []
for e in test_data.examples:
    pred = predict(k_model, e.text)
    preds.append(pred)
    labels.append(LABEL.vocab.stoi[e.label])
log_info(labels, preds)

logging.debug("k pseudolabel")
preds = []
labels = []
for e in test_data.examples:
    pred = predict_pseudolabel(k_pseudolabel, e.text)
    preds.append(pred)
    labels.append(LABEL.vocab.stoi[e.label])
log_info(labels, preds)

threshold = compute_threshold()
#threshold = 0.2
print("threshold:", threshold)
logging.debug("threshold:" + str(threshold))
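# Note on the KL term in update_seeds() above: nn.KLDivLoss expects its
# *input* as log-probabilities and its *target* as probabilities, so
# kl(torch.log(p_orig), p_new) measures KL(p_new || p_orig), i.e. how far
# the prediction drifts once a word is masked. A minimal sanity check with
# hypothetical distributions (illustration only):
import torch
import torch.nn as nn

kl_demo = nn.KLDivLoss(reduction='sum')
p_orig = torch.tensor([[0.7, 0.2, 0.1]])   # prediction with the word present
p_new = torch.tensor([[0.4, 0.4, 0.2]])    # prediction with the word masked
drift = kl_demo(torch.log(p_orig), p_new)  # = sum p_new * log(p_new / p_orig)
print(drift.item())                        # > 0: masking changed the prediction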
logging.debug("threshold:" + str(threshold)) kplus_model, kplus_pseudolabel = init_kplusmodel(SEED_WORDS) kplus_model_optimizer = optim.Adam( filter(lambda p: p.requires_grad, kplus_model.parameters())) N_EPOCHS = 5 for epoch in range(N_EPOCHS): print("epoch: ", epoch + 1) train_loss, train_acc = train(kplus_model, kplus_pseudolabel, train_iterator, kplus_model_optimizer) print("training loss: ", train_loss) print("training accuracy: ", train_acc) valid_acc, valid_f1 = evaluate(kplus_model, test_kplus_data, LABEL_KPLUS) print("validation accuracy: ", valid_acc) print('validation F1:', valid_f1) torch.cuda.empty_cache() preds = [] labels = [] for e in test_kplus_data.examples: pred = predict(kplus_model, e.text) preds.append(pred) labels.append(LABEL_KPLUS.vocab.stoi[e.label]) log_info(labels, preds) logging.debug("kplus pseudolabel") preds = [] labels = [] for e in test_kplus_data.examples: pred = predict_pseudolabel(kplus_pseudolabel, e.text) preds.append(pred) labels.append(LABEL_KPLUS.vocab.stoi[e.label]) log_info(labels, preds) for i in range(3): logging.debug("iteration: " + str(i + 1)) import copy seed_words_d_copy = copy.deepcopy(seed_words_d) update_seeds(seed_words_d, args.no_filtering, args.no_tuning) print(seed_words_d) seed_words = sorted(seed_words_d.items(), key=lambda x: LABEL.vocab.stoi[x[0]]) print(seed_words) logging.debug(seed_words) if seed_words_d == seed_words_d_copy: break get_seed_embedding(seed_words) k_model, k_pseudolabel = init_kmodel(SEED_WORDS) k_model_optimizer = optim.Adam( filter(lambda p: p.requires_grad, k_model.parameters())) N_EPOCHS = 5 for epoch in range(N_EPOCHS): print("epoch: ", epoch + 1) train_loss, train_acc = train(k_model, k_pseudolabel, train_iterator, k_model_optimizer) print("training loss: ", train_loss) print("training accuracy: ", train_acc) valid_acc, valid_f1 = evaluate(k_model, test_data, LABEL) print("validation accuracy: ", valid_acc) print('validation F1:', valid_f1) torch.cuda.empty_cache() preds = [] labels = [] for e in test_data.examples: pred = predict(k_model, e.text) preds.append(pred) labels.append(LABEL.vocab.stoi[e.label]) log_info(labels, preds) logging.debug("k pseudolabel") preds = [] labels = [] for e in test_data.examples: pred = predict_pseudolabel(k_pseudolabel, e.text) preds.append(pred) labels.append(LABEL.vocab.stoi[e.label]) log_info(labels, preds) threshold = compute_threshold() #threshold = 0.2 print("threshold:", threshold) logging.debug("threshold:" + str(threshold)) kplus_model, kplus_pseudolabel = init_kplusmodel(SEED_WORDS) kplus_model_optimizer = optim.Adam( filter(lambda p: p.requires_grad, kplus_model.parameters())) N_EPOCHS_2 = 5 for epoch in range(N_EPOCHS_2): print("epoch: ", epoch + 1) train_loss, train_acc = train(kplus_model, kplus_pseudolabel, train_iterator, kplus_model_optimizer) print("training loss: ", train_loss) print("training accuracy: ", train_acc) valid_acc, valid_f1 = evaluate(kplus_model, test_kplus_data, LABEL_KPLUS) print("validation accuracy: ", valid_acc) print('validation F1:', valid_f1) torch.cuda.empty_cache() preds = [] labels = [] for e in test_kplus_data.examples: pred = predict(kplus_model, e.text) preds.append(pred) labels.append(LABEL_KPLUS.vocab.stoi[e.label]) log_info(labels, preds) logging.debug("kplus pseudolabel") preds = [] labels = [] for e in test_kplus_data.examples: pred = predict_pseudolabel(kplus_pseudolabel, e.text) preds.append(pred) labels.append(LABEL_KPLUS.vocab.stoi[e.label]) log_info(labels, preds)
def cross_val_score(
        Model,
        model_kwargs,
        model_path,
        custom_embeddings,
        vocab_kwargs,
        data_path,
        label_column,
        text_column,
        other_fields,
        process_text,
        process_labels,
        Optimizer,
        optimizer_kwargs,
        criterion,
        batch_size,
        n_epochs,
        writer,
        device,
):
    p = Path(data_path)
    n_files = len(list(p.glob('*.json')))
    # We assume each fold contributes exactly two files: test and train.
    assert n_files % 2 == 0
    n_splits = n_files // 2

    # Track validation accuracy both at the last epoch and at the epoch with
    # the best loss.
    best_accuracy = []
    final_accuracy = []
    for fold in range(n_splits):
        # Every model needs fields for the text and the target label.
        TEXT = data.Field(**process_text)
        LABEL = data.LabelField(**process_labels)
        fields = {label_column: ('label', LABEL), text_column: ('text', TEXT)}
        # Some models need extra fields; those are defined by the caller.
        fields.update(other_fields)

        train_data = data.TabularDataset(
            path=Path(data_path, f'train_{fold}.json'),
            format='json',
            fields=fields,
        )
        test_data = data.TabularDataset(
            path=Path(data_path, f'test_{fold}.json'),
            format='json',
            fields=fields,
        )

        TEXT.build_vocab(train_data, vectors=custom_embeddings, **vocab_kwargs)
        LABEL.build_vocab(train_data)

        input_dim = len(TEXT.vocab)
        output_dim = len(LABEL.vocab)
        pad_idx = TEXT.vocab.stoi[TEXT.pad_token]

        model = Model(input_dim, output_dim, pad_idx=pad_idx, **model_kwargs)

        if custom_embeddings is not None:
            embeddings = TEXT.vocab.vectors
            model.embedding.weight.data.copy_(embeddings)

        # Explicitly zero out the <pad> embedding.
        model.embedding.weight.data[pad_idx] = torch.zeros(
            model_kwargs['embedding_dim'])

        optimizer = Optimizer(model.parameters(), **optimizer_kwargs)

        model = model.to(device)
        criterion = criterion.to(device)

        train_iterator, test_iterator = data.BucketIterator.splits(
            (train_data, test_data),
            batch_size=batch_size,
            sort_key=lambda ex: len(ex.text),
            sort_within_batch=True,
            device=device)

        best_valid_acc, final_valid_acc = train_model(model,
                                                      train_iterator,
                                                      test_iterator,
                                                      optimizer,
                                                      criterion,
                                                      model_path + f'_{fold}',
                                                      n_epochs=n_epochs,
                                                      comment=f'fold_{fold}',
                                                      writer=writer)
        best_accuracy.append(best_valid_acc)
        final_accuracy.append(final_valid_acc)

    return best_accuracy, final_accuracy
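# A hedged usage sketch for cross_val_score(); every concrete value here
# (SimpleRNN, the ./folds path, the kwargs) is hypothetical and only shows
# how the arguments fit together:
best_acc, final_acc = cross_val_score(
    Model=SimpleRNN,                      # any class exposing an .embedding
    model_kwargs={'embedding_dim': 100},
    model_path='checkpoints/rnn',
    custom_embeddings=None,
    vocab_kwargs={'max_size': 25_000},
    data_path='./folds',                  # train_0.json / test_0.json, ...
    label_column='label',
    text_column='text',
    other_fields={},
    process_text={'tokenize': 'spacy'},
    process_labels={},
    Optimizer=optim.Adam,
    optimizer_kwargs={'lr': 1e-3},
    criterion=nn.CrossEntropyLoss(),
    batch_size=64,
    n_epochs=5,
    writer=None,
    device=device,
)
print(sum(best_acc) / len(best_acc))      # mean cross-validation accuracy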
def initialise_train(self):
    # Set random seeds for reproducibility.
    SEED = 1234
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

    # Use torchtext.data to create the dataset. The text field feeds raw
    # sentences through the BERT tokenizer and maps tokens to ids.
    TEXT = data.Field(batch_first=True,
                      use_vocab=False,
                      tokenize=self.tokenize_and_cut,
                      preprocessing=self.tokenizer.convert_tokens_to_ids,
                      init_token=self.init_token_idx,
                      eos_token=self.eos_token_idx,
                      pad_token=self.pad_token_idx,
                      unk_token=self.unk_token_idx)
    LABEL = data.LabelField(dtype=torch.float)

    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
    # random.seed() returns None, but it reseeds the global RNG as a side
    # effect, so the split below is still deterministic.
    train_data, valid_data = train_data.split(
        random_state=random.seed(SEED))

    print(f"Number of training examples: {len(train_data)}")
    print(f"Number of validation examples: {len(valid_data)}")
    print(f"Number of testing examples: {len(test_data)}")
    print(vars(train_data.examples[6]))

    tokens = self.tokenizer.convert_ids_to_tokens(
        vars(train_data.examples[6])['text'])
    print(tokens)

    LABEL.build_vocab(train_data)
    print(LABEL.vocab.stoi)

    # Freeze the BERT parameters to speed up training.
    for name, param in self.model.named_parameters():
        if name.startswith('bert'):
            param.requires_grad = False

    # Set up iterators, optimizer, and loss for training.
    train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=self.BATCH_SIZE,
        device=self.device)

    optimizer = optim.Adam(self.model.parameters())
    criterion = nn.BCEWithLogitsLoss()

    self.model = self.model.to(self.device)
    criterion = criterion.to(self.device)

    N_EPOCHS = 5
    best_valid_loss = float('inf')

    # Training loop: keep the checkpoint with the best validation loss.
    for epoch in range(N_EPOCHS):
        start_time = time.time()

        train_loss, train_acc = self.train(self.model, train_iterator,
                                           optimizer, criterion)
        valid_loss, valid_acc = self.evaluate(self.model, valid_iterator,
                                              criterion)

        end_time = time.time()
        epoch_mins, epoch_secs = self.epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(self.model.state_dict(), 'tut6-model.pt')

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')
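# initialise_train() assumes a tokenize_and_cut helper on the class. A
# minimal sketch under the usual BERT length constraint (sequences capped at
# max_input_length, leaving room for the init/eos special tokens added by
# the Field); self.tokenizer and self.max_input_length are assumed
# attributes:
def tokenize_and_cut(self, sentence):
    tokens = self.tokenizer.tokenize(sentence)
    # Reserve two positions for the init and eos tokens.
    tokens = tokens[:self.max_input_length - 2]
    return tokens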
def __init__(self, args):
    self.TEXT = data.Field(lower=args.lower, tokenize=lambda x: x.split())
    # We'll use NestedField to tokenize each word into a list of chars.
    if args.char:
        CHAR_NESTING = data.Field()
        self.char_text = data.NestedField(CHAR_NESTING)
    self.LABEL = data.LabelField()
    self.ids = data.Field(sequential=True, use_vocab=True)

    if args.char:
        fields = {
            'question': ('question', self.TEXT),
            'char_question': ('question_c', self.char_text),
            'label': ('label', self.LABEL),
            'text': ('answer', self.TEXT),
            'char_text': ('answer_c', self.char_text)
        }
        test_fields = {
            '__id__': ('q_id', self.ids),
            'question': ('question', self.TEXT),
            'char_question': ('question_c', self.char_text),
            'text': ('answer', self.TEXT),
            'char_text': ('answer_c', self.char_text),
            'id': ('a_id', self.ids)
        }
    else:
        fields = {
            'question': ('question', self.TEXT),
            'label': ('label', self.LABEL),
            'text': ('answer', self.TEXT)
        }
        test_fields = {
            '__id__': ('q_id', self.ids),
            'id': ('a_id', self.ids),
            'question': ('question', self.TEXT),
            'text': ('answer', self.TEXT)
        }

    data_zalo = data.TabularDataset(
        path='../wikiqa_zalo/data/train_pr.json',
        format='json',
        fields=fields)
    data_submission = data.TabularDataset(
        path='../wikiqa_zalo/data/test_pr_submission.json',
        format='json',
        fields=test_fields)

    self.train, self.test = data_zalo.split(0.8,
                                            random_state=random.seed(SEED))
    self.train, self.dev = self.train.split(0.8,
                                            random_state=random.seed(SEED))
    # len(data_zalo), len(train), len(test), len(valid)

    self.TEXT.build_vocab(self.train, self.dev, self.test)
    self.ids.build_vocab(data_submission)
    if args.char:
        self.char_text.build_vocab(self.train, self.dev, self.test)

    if args.word_vectors:
        if os.path.isfile(args.vector_cache):
            print('Found pretrained word embeddings')
            cache, name = ('/'.join(args.vector_cache.split('/')[:-1]),
                           args.vector_cache.split('/')[-1])
            vectors = Vectors(cache=cache, name=name)
            self.TEXT.vocab.set_vectors(vectors.stoi, vectors.vectors,
                                        vectors.dim)
            # inputs.vocab.vectors = torch.load(args.vector_cache)
        else:
            print('Not found pretrained word embeddings\nDownloading')
            self.TEXT.vocab.load_vectors(args.word_vectors)
            makedirs(os.path.dirname(args.vector_cache))
            torch.save(self.TEXT.vocab.vectors, args.vector_cache)

    if args.char_vectors and args.char:
        print('Found pretrained character embeddings')
        cache, name = ('/'.join(args.char_vectors.split('/')[:-1]),
                       args.char_vectors.split('/')[-1])
        char_vectors = Vectors(cache=cache, name=name)
        self.char_text.vocab.set_vectors(char_vectors.stoi,
                                         char_vectors.vectors,
                                         char_vectors.dim)

    self.LABEL.build_vocab(self.train)

    def sort_key(x):
        return data.interleave_keys(len(x.question), len(x.answer))

    self.train_iter, self.dev_iter, self.test_iter = data.BucketIterator.splits(
        (self.train, self.dev, self.test),
        batch_size=args.batch_size,
        device=args.device,
        sort_key=sort_key,
        sort_within_batch=False)
    self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
    self.submission_iter = data.BucketIterator(data_submission,
                                               batch_size=args.batch_size,
                                               device=args.device,
                                               sort_within_batch=False)
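# How the NestedField above tokenizes: the outer field splits text into
# words, and the nested field splits each word into characters. A small
# illustration with legacy torchtext (preprocess() only, no vocab needed;
# the demo field names are hypothetical):
CHAR_NESTING_DEMO = data.Field(tokenize=list, lower=True)
CHARS_DEMO = data.NestedField(CHAR_NESTING_DEMO)
print(CHARS_DEMO.preprocess('New York'))
# [['n', 'e', 'w'], ['y', 'o', 'r', 'k']]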
ApparelTEXT = data.Field(tokenize='spacy')
ApparelLABEL = data.LabelField(tensor_type=torch.FloatTensor)
print("loading dataset clean_Apparel300.tsv...")
Appareltrain = data.TabularDataset.splits(
    path='../stanford-corenlp-full-2018-10-05/stanfordSentimentTreebank/',
    train='mytrain2.tsv',
    format='tsv',
    fields=[('Text', ApparelTEXT), ('Label', ApparelLABEL)])[0]
ApparelTEXT.build_vocab(Appareltrain,
                        max_size=60000,
                        vectors="glove.6B.300d",
                        min_freq=1)
ApparelLABEL.build_vocab(Appareltrain)
# Re-map each label string to its numeric value.
for a, b in ApparelLABEL.vocab.stoi.items():
    ApparelLABEL.vocab.stoi[a] = float(a)
'''
JewelryTEXT = data.Field(tokenize='spacy')
JewelryLABEL = data.LabelField(tensor_type=torch.FloatTensor)
print("loading dataset stanford-sentiment-treebank.train.tsv...")
Jewelrytrain = data.TabularDataset.splits(
    path='../stanford-corenlp-full-2018-10-05/stanfordSentimentTreebank/',
    train='stanford-sentiment-treebank.train.tsv',
    format='tsv',
    fields=[('Text', JewelryTEXT), ('Label', JewelryLABEL)])[0]
JewelryTEXT.build_vocab(Jewelrytrain,
                        max_size=60000,
                        vectors="glove.6B.300d",
                        min_freq=1)
JewelryLABEL.build_vocab(Jewelrytrain)
for a, b in JewelryLABEL.vocab.stoi.items():
    JewelryLABEL.vocab.stoi[a] = float(a)

ShoesTEXT = data.Field(tokenize='spacy')
ShoesLABEL = data.LabelField(tensor_type=torch.FloatTensor)
print("loading dataset stanford-sentiment-treebank.train.tsv...")
Shoestrain = data.TabularDataset.splits(
BATCH_SIZE = 256
LEARNING_RATE = 1e-3
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
NUM_EPOCHS = 20
LAMBDA = 1e-3

####################################
#          Preparing Data          #
####################################

# 1. data.Field()
TEXT = data.Field(include_lengths=True, pad_token='<pad>', unk_token='<unk>')
TAG_LABEL = data.LabelField()
AGE_LABEL = data.LabelField()
GENDER_LABEL = data.LabelField()

# 2. data.TabularDataset
train_data, test_data = data.TabularDataset.splits(
    path=TrustPilot_processed_dataset_path,
    train="train.csv",
    test="test.csv",
    fields=[('text', TEXT), ('tag_label', TAG_LABEL),
            ('age_label', AGE_LABEL), ('gender_label', GENDER_LABEL)],
    format="csv")

# 3. Split train_data to train_data, valid_data
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

print("Number of train_data = {}".format(len(train_data)))
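# A hedged continuation of the snippet above: build the vocabularies and the
# bucket iterators for all three label fields. These are standard
# legacy-torchtext calls, but the vocab size and GloVe vectors are assumed
# values, not taken from the original script:
TEXT.build_vocab(train_data, max_size=25_000, vectors="glove.6B.100d")
TAG_LABEL.build_vocab(train_data)
AGE_LABEL.build_vocab(train_data)
GENDER_LABEL.build_vocab(train_data)

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda ex: len(ex.text),
    sort_within_batch=True,   # needed because include_lengths=True
    device=device)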
from torchtext import data, datasets

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if __name__ == '__main__':
    print('Using device:', device)

"""## Download dataset

First we will download the dataset using
[torchtext](https://torchtext.readthedocs.io/en/latest/index.html), which is
a package that supports NLP for PyTorch. The following command will get you
three objects `train_data`, `val_data` and `test_data`. To access the data:

*   To access the list of textual tokens - `train_data[0].text`
*   To access the label - `train_data[0].label`
"""

if __name__ == '__main__':
    train_data, val_data, test_data = datasets.SST.splits(
        data.Field(tokenize='spacy'),
        data.LabelField(dtype=torch.float),
        filter_pred=lambda ex: ex.label != 'neutral')

    print('{:d} train and {:d} test samples'.format(
        len(train_data), len(test_data)))
    print('Sample text:', train_data[0].text)
    print('Sample label:', train_data[0].label)

"""# 1. Define the Dataset Class (4 points)

In the following cell, we will define the dataset class. You need to
implement the following functions:

*   `build_dictionary()` - creates the dictionaries `ixtoword` and
    `wordtoix`. Converts the text of all examples into sequences of text ids
    and stores them in `textual_ids`. If a word is not present in your
    dictionary, it should use `<unk>`. Use the hyperparameter `THRESHOLD` to
    control which words appear in the dictionary, based on their frequency
    in the training data. Note that a word's frequency must be
    `>= THRESHOLD` for it to be included in the dictionary. Also make sure
    that `<end>` is at idx 0 and `<unk>` is at idx 1.
*   `get_label()` - returns the value `1` if the label in the dataset is
    `positive`, and `0` if it is `negative`. The returned item should be a
    `torch.LongTensor`. A minimal sketch of both functions follows below.
"""
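# A minimal sketch of the two functions described above, assuming the
# training texts are lists of tokens and the labels are the strings
# 'positive'/'negative'. The names follow the spec; everything else is an
# illustrative assumption, not the official solution:
from collections import Counter

def build_dictionary(train_texts, THRESHOLD):
    counts = Counter(tok for text in train_texts for tok in text)
    ixtoword = ['<end>', '<unk>']  # <end> -> idx 0, <unk> -> idx 1
    ixtoword += [w for w, c in counts.items() if c >= THRESHOLD]
    wordtoix = {w: i for i, w in enumerate(ixtoword)}
    return ixtoword, wordtoix

def get_label(label):
    # 1 for positive, 0 for negative, returned as a LongTensor.
    return torch.LongTensor([1 if label == 'positive' else 0])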
# Colab setup (notebook cells): mount Drive and change into the project dir.
from google.colab import drive
drive.mount('/content/drive/')
# The two lines below are notebook magics, not Python:
#   cd drive/My Drive/DL2/
#   cd DL2

import torch
# handling text data
from torchtext import data
import torch.optim as optim

tokenize = lambda x: x.split()
TEXT = data.Field(sequential=True,
                  tokenize=tokenize,
                  lower=True,
                  include_lengths=True,
                  batch_first=True,
                  fix_length=200)
LABEL = data.LabelField()

# fields = [('label', LABEL), ('text', TEXT)]
fields = [(None, None), ('label', LABEL), ('text', TEXT)]

train_data = data.TabularDataset(path='p_training_data.csv',
                                 fields=fields,
                                 format='csv',
                                 skip_header=True)
valid_data = data.TabularDataset(path='p_validation_data.csv',
                                 fields=fields,
                                 format='csv',
                                 skip_header=True)

# print preprocessed text
print(vars(train_data.examples[0]))

SEED = 2019
# Torch
torch.manual_seed(SEED)
# Cuda algorithms
torch.backends.cudnn.deterministic = True
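# A hedged sketch of the next step: build the vocab and iterate. Because the
# TEXT field above was declared with include_lengths=True, batch.text is a
# (token_ids, lengths) pair. The vocab size and batch size here are assumed
# values:
TEXT.build_vocab(train_data, max_size=10_000)
LABEL.build_vocab(train_data)

train_iterator = data.BucketIterator(train_data,
                                     batch_size=64,
                                     sort_key=lambda ex: len(ex.text),
                                     sort_within_batch=True)
for batch in train_iterator:
    text, text_lengths = batch.text   # lengths come from include_lengths=True
    print(text.shape, text_lengths.shape)
    break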
def build_field(stop_word):
    for ii in range(len(stop_word)):
        stop_word[ii] = str(stop_word[ii])
    text_field = data.Field(stop_words=stop_word)
    label_field = data.LabelField(use_vocab=False)
    return text_field, label_field
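# A hedged usage sketch for build_field(); the stop-word list is
# hypothetical. Note the str() cast inside the function, which lets
# non-string entries pass through safely, and use_vocab=False on the label
# field, which assumes the labels in the data are already numeric:
stop_words = ['the', 'a', 'an', 0, 1]     # mixed types get cast to str
text_field, label_field = build_field(stop_words)
fields = [('text', text_field), ('label', label_field)]
# These fields can then be handed to a TabularDataset as in the snippets above.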