def all_exemplars(dataset, model, tokenizer, similarity_function):
    with torch.no_grad():
        relations_indices = list(range(19))  # all relations (both ways each + Other)
        data = [e for e in dataset if e['r'] in relations_indices]
        results = []
        exemplar_reps = torch.load('exemplars/all_exemplars.pt').cuda()
        for idx, entry in enumerate(data):
            printProgressBar(idx, len(data))
            original_sentence = entry['text']
            og_ids, og_mask = preprocess_sentence(original_sentence, tokenizer)  # shape (1, 64)
            model_hidden_states = model(og_ids, attention_mask=og_mask).last_hidden_state  # shape (1, sentence_length, h)
            og_rep = model_hidden_states[:, 0, :]  # use the CLS output: first hidden state: shape (1, h)
            similarities = similarity_function(torch.cat([og_rep, exemplar_reps])).cpu().numpy()
            chosen_r = np.argmax(similarities)

            d1 = {
                'chosen_r': str(chosen_r),
                'chosen_r_name': RELATION_LIST[chosen_r],
                'r_label': str(entry['r']),
                'r_label_name': entry['r_name'],
                'original_sentence': str(original_sentence),
                'e1': entry['e1'],
                'e2': entry['e2'],
                'label': entry['r_name'],
            }
            d2 = {v + '_score': str(similarities[i]) for i, v in enumerate(RELATION_LIST.values())}
            result = {**d1, **d2}
            results.append(result)

        with open('all_exemplars_results.json', 'w') as file:
            json.dump(results, file, indent=4)
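
# A hedged sketch (not necessarily the repository's actual implementation) of the `similarity_function`
# interface assumed by all_exemplars and the cause_vs_component_* functions: the input is a stacked
# tensor whose first row is the reference sentence representation and whose remaining rows are the
# candidate representations; the output is one score per candidate. Cosine similarity is an assumption
# here; any row-wise similarity with the same signature would fit.
def cosine_similarity_to_first(stacked_reps):
    """stacked_reps: tensor of shape (1 + n_candidates, h); returns a tensor of shape (n_candidates,)."""
    reference = stacked_reps[0:1, :]   # shape (1, h)
    candidates = stacked_reps[1:, :]   # shape (n_candidates, h)
    return torch.nn.functional.cosine_similarity(reference, candidates, dim=1)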
def create_vocabulary():
    """
    Using RNN_CONFIG['vocab_using_n_tweets'] tweets from the train.csv dataset, \n
    creates a vocabulary with RNN_CONFIG['AE_vocab_size'] words. \n
    The vocabulary is an ordered dictionary: the keys are the word stems and the values each word's index. \n
    :return: None, dumps the vocabulary as a .json file at data/vocab.json
    """
    with open(cfg['csv_relative_path'], newline='') as csvfile:
        data = list(csv.reader(csvfile))[1:]

    vocab = {}
    ps = PorterStemmer()
    for idx, line in enumerate(data[:RNN_CONFIG['vocab_using_n_tweets']]):
        printProgressBar(idx, RNN_CONFIG['vocab_using_n_tweets'], 'creating dictionary')
        for word in line[COLUMN_NAME_TO_IDX['text']].lower().split(' '):
            w = ps.stem(word)
            if w in vocab:
                vocab[w] += 1
            else:
                vocab[w] = 1

    # sort the vocabulary by descending occurrences and keep the most frequent stems
    vocab = OrderedDict([(k, idx) for idx, (k, _) in enumerate(
        sorted(vocab.items(), key=lambda item: item[1], reverse=True)
        [:RNN_CONFIG['AE_vocab_size']])])

    with open('data/vocab.json', 'w') as f:
        json.dump(vocab, f, indent=4)
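
# A minimal usage sketch for the vocabulary produced above, assuming it has already been dumped to
# data/vocab.json. The helper below is illustrative (it is not defined elsewhere in the repository);
# stemming at lookup time mirrors how the vocabulary was built, and out-of-vocabulary stems return None.
def word_to_index(word, vocab, stemmer=PorterStemmer()):
    """Map a raw word to its vocabulary index (or None if its stem is out of vocabulary)."""
    return vocab.get(stemmer.stem(word.lower()))

# with open('data/vocab.json', 'r') as f:
#     vocab = json.load(f)
# word_to_index('running', vocab)  # index of the stem 'run', if it made the vocabulary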
def export_RNN_regressor(checkpoint_path):
    """
    :param checkpoint_path: relative path to a PyTorch .pth checkpoint
    :return: None, dumps a prediction text file in the model's training folder
    """
    checkpoint = torch.load(checkpoint_path)
    model = RNN(checkpoint['net_config'])
    model.load_state_dict(checkpoint['model'])
    model = model.eval().cuda()

    test_dataset = TweetDataset(dataset_type='test')
    test_loader = DataLoader(test_dataset, batch_size=TRAIN_CONFIG['batch_size'],
                             num_workers=TRAIN_CONFIG['workers'], collate_fn=collate_function,
                             shuffle=False, pin_memory=True)

    with open(DATASET_CONFIG['test_csv_relative_path'], newline='') as csvfile:
        test_data = list(csv.reader(csvfile))[1:]
    ids = [datum[0] for datum in test_data]

    n = len(test_loader)
    with open("checkpoints/{}/predictions.txt".format(checkpoint['train_config']['experiment_name']), 'w') as f:
        writer = csv.writer(f)
        writer.writerow(["TweetID", "NoRetweets"])
        current_idx = 0
        for batch_index, batch in enumerate(test_loader):
            printProgressBar(batch_index, n)
            batch_size = batch['numeric'].shape[0]
            numeric = batch['numeric'].cuda()
            text = batch['embedding'].cuda()

            # undo the log(1 + x) transform if the model was trained on log counts
            prediction = torch.exp(model(text, numeric)) - 1 if EXPORT_CONFIG['log'] \
                else model(text, numeric)
            if EXPORT_CONFIG['threshold']:
                prediction[prediction > EXPORT_CONFIG['threshold']] = EXPORT_CONFIG['threshold']

            for idx_in_batch in range(batch_size):
                writer.writerow([str(ids[current_idx + idx_in_batch]),
                                 str(int(prediction[idx_in_batch].item()))])
            current_idx += batch_size

    print("Export done! :)")
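
# A hedged usage sketch: the checkpoint path below is illustrative, not a file that necessarily exists
# in this repository; 'best.pth' is the name train() uses for its best validation checkpoint, and the
# post-processing (log inversion, clipping) is governed by EXPORT_CONFIG.
# export_RNN_regressor('checkpoints/my_experiment/best.pth')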
def cause_vs_component_replace(dataset, model, tokenizer, similarity_function):
    with torch.no_grad():
        relations_indices = [1, 2, 9, 10]  # keep Cause-Effect and Component-Whole, both directions
        data = [e for e in dataset if e['r'] in relations_indices]
        results = []
        for idx, entry in enumerate(data):
            printProgressBar(idx, len(data))
            original_sentence = entry['text']
            og_ids, og_mask = preprocess_sentence(original_sentence, tokenizer)  # shape (1, 64)
            comparison_sentences = [create_comparison_sentence(original_sentence, entry['e1'], entry['e2'], r_idx)
                                    for r_idx in relations_indices]
            encoded_comparison_sentences = [preprocess_sentence(s, tokenizer) for s in comparison_sentences]
            ids = torch.cat([og_ids] + [e[0] for e in encoded_comparison_sentences])  # stacking input ids
            mask = torch.cat([og_mask] + [e[1] for e in encoded_comparison_sentences])  # stacking attention masks

            model_hidden_states = model(ids, attention_mask=mask).last_hidden_state  # shape (5, sentence_length, h)
            model_output = model_hidden_states[:, 0, :]  # use the CLS output: first hidden state: shape (5, h)
            similarities = similarity_function(model_output).cpu().numpy()
            cause_score = max(similarities[:2])
            component_score = max(similarities[2:])
            is_cause = cause_score > component_score

            result = {
                'Cause-Effect(e1,e2)_score': str(similarities[0]),
                'Cause-Effect(e1,e2)_sentence': str(comparison_sentences[0]),
                'Cause-Effect(e2,e1)_score': str(similarities[1]),
                'Cause-Effect(e2,e1)_sentence': str(comparison_sentences[1]),
                'cause_score': str(cause_score),
                'Component-Whole(e1,e2)_score': str(similarities[2]),
                'Component-Whole(e1,e2)_sentence': str(comparison_sentences[2]),
                'Component-Whole(e2,e1)_score': str(similarities[3]),
                'Component-Whole(e2,e1)_sentence': str(comparison_sentences[3]),
                'component_score': str(component_score),
                'is_cause': str(is_cause),
                'original_sentence': str(original_sentence),
                'e1': entry['e1'],
                'e2': entry['e2'],
                'label': entry['r_name'],
                'is_cause_gt': str(entry['r'] in [1, 2])
            }
            results.append(result)

        with open('cc_replace_results.json', 'w') as file:
            json.dump(results, file, indent=4)
def get_data(dataset, message):
    N = len(dataset)
    data = np.zeros((N, XGBOOST_CONFIG['numeric_data_size'] + XGBOOST_CONFIG['embedding_size'] + 1))  # 1 for answer
    loader = DataLoader(dataset, batch_size=TRAIN_CONFIG['batch_size'], num_workers=TRAIN_CONFIG['workers'],
                        collate_fn=collate_function, shuffle=False)
    current_idx = 0
    n = len(loader)
    print('')
    for batch_index, batch in enumerate(loader):
        printProgressBar(batch_index, n, prefix=message)
        batch_size = batch['numeric'].shape[0]
        numeric = batch['numeric'].cuda()
        text = batch['embedding'].cuda()

        if XGBOOST_CONFIG['embedding_use_hidden']:
            embedding = embed(text, numeric[:, :checkpoint['net_config']['numeric_data_size']])[1]
        elif XGBOOST_CONFIG['embedding_use_output']:
            embedding = torch.exp(embed(text, numeric[:, :checkpoint['net_config']['numeric_data_size']])[0]) - 1
        else:  # expecting a built-in embedding layer -> taking the mean of the embeddings
            embedding = embed.emb(text).mean(axis=1)

        data[current_idx:current_idx + batch_size, XGBOOST_CONFIG['numeric_data_size']:-1] = \
            embedding.detach().cpu().numpy()
        data[current_idx:current_idx + batch_size, :XGBOOST_CONFIG['numeric_data_size']] = \
            numeric.detach().cpu().numpy()
        data[current_idx:current_idx + batch_size, -1] = batch['target'].numpy()
        current_idx += batch_size

    return data
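
# A hedged sketch of how the array returned by get_data could feed an XGBoost regressor. The
# feature/target split follows the layout built above (last column is the target); the XGBoost
# hyperparameters are illustrative, not the repository's actual configuration, and get_data also
# relies on module-level `embed`/`checkpoint` objects defined elsewhere in the repository.
# import xgboost as xgb
# train_array = get_data(TweetDataset(dataset_type='train'), 'building train matrix')
# dtrain = xgb.DMatrix(train_array[:, :-1], label=train_array[:, -1])
# booster = xgb.train({'objective': 'reg:squarederror', 'max_depth': 8}, dtrain, num_boost_round=200)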
def parser(file_name):
    """Parse the OpenEdu website"""
    # getting the first page
    r = api.get('https://courses.openedu.ru/api/courses/v1/courses/?page=1')
    dict_of_data = r.json()
    data = course_encode(str(dict_of_data))
    with open(file_name, 'w+') as f:
        f.write(data)
    pages = int(dict_of_data['pagination']['num_pages'])

    # initiate the progress bar
    printProgressBar(0, pages, prefix='Progress of parsing: ',
                     suffix='Complete {} of {}'.format(0, pages), length=30)

    # getting all the other pages
    for i in range(2, pages + 1):
        printProgressBar(i, pages, prefix='Progress of parsing: ',
                         suffix='Complete {} of {}'.format(i, pages), length=30)
        r = api.get('https://courses.openedu.ru/api/courses/v1/courses/?page=' + str(i))
        dict_of_data = r.json()
        text_to_write = list(str(dict_of_data['results']))
        text_to_write[0] = ','
        text_to_write.append('}')
        text_to_write = course_encode(''.join(text_to_write))
        text_to_write = str.encode(''.join(text_to_write))
        with open(file_name, 'rb+') as f:
            f.seek(-2, 2)  # step back over the last two bytes so the new page is appended inside the existing structure
            f.write(text_to_write)

    return dict_of_data
def save_exemplar_representations(batch_size=64, workers=8):
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert = BertModel.from_pretrained("bert-base-uncased").cuda()

    with open('data/semeval_val.json', 'r') as f:
        data_val = json.load(f)
    dataset = ProcessedTextDataset([e['text'] for e in data_val], bert_tokenizer,
                                   labels=[e['r'] for e in data_val])
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=workers)
    n_batches = len(loader)

    sums = torch.zeros(19, 768).cuda()  # n_relations * BERT-base hidden size
    counts = torch.zeros(19).cuda()
    with torch.no_grad():
        for batch_idx, batch in enumerate(loader):
            printProgressBar(batch_idx, n_batches, prefix='Processing all exemplars ...')
            input_ids = batch['input_ids'].cuda()
            masks = batch['mask'].cuda()
            labels = batch['label'].cuda()
            model_hidden_states = bert(input_ids, attention_mask=masks).last_hidden_state
            model_output = model_hidden_states[:, 0, :]
            for r in range(19):
                is_r = labels == r  # mask of examples labelled with the relation r
                n_r = is_r.sum()  # number of examples with relation r
                if n_r > 0:  # there might be no example of this class in the batch
                    sums[r, :] += torch.sum(model_output[is_r, :], dim=0)
                    counts[r] += n_r

    # average the CLS representations per relation over the whole validation set
    # (accumulating sums and counts avoids the bias of summing per-batch means)
    averages = sums / counts.clamp(min=1).unsqueeze(1)

    file = 'exemplars/all_exemplars.pt'
    if not os.path.exists('exemplars/'):
        os.makedirs('exemplars/')
    torch.save(averages.cpu(), file)
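
# A hedged usage sketch tying the exemplar pipeline together: save the per-relation exemplars once,
# then score a dataset against them. `cosine_similarity_to_first` is the illustrative similarity
# sketched near the top of this file, not a function the repository necessarily defines.
# save_exemplar_representations()
# with open('data/semeval_val.json', 'r') as f:
#     dataset = json.load(f)
# bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# bert = BertModel.from_pretrained('bert-base-uncased').cuda().eval()
# all_exemplars(dataset, bert, bert_tokenizer, cosine_similarity_to_first)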
def cause_vs_component_examplars(dataset, model, tokenizer, similarity_function):
    with torch.no_grad():
        relations_indices = [1, 2, 9, 10]  # keep Cause-Effect and Component-Whole, both directions
        data = [e for e in dataset if e['r'] in relations_indices]
        results = []
        all_reps = torch.load('exemplars/all_exemplars.pt').cuda()
        exemplar_reps = all_reps[relations_indices, :]
        for idx, entry in enumerate(data):
            printProgressBar(idx, len(data))
            original_sentence = entry['text']
            og_ids, og_mask = preprocess_sentence(original_sentence, tokenizer)  # shape (1, 64)
            model_hidden_states = model(og_ids, attention_mask=og_mask).last_hidden_state  # shape (1, sentence_length, h)
            og_rep = model_hidden_states[:, 0, :]  # use the CLS output: first hidden state: shape (1, h)
            similarities = similarity_function(torch.cat([og_rep, exemplar_reps])).cpu().numpy()
            cause_score = max(similarities[:2])
            component_score = max(similarities[2:])
            is_cause = cause_score > component_score

            result = {
                'Cause-Effect(e1,e2)_score': str(similarities[0]),
                'Cause-Effect(e2,e1)_score': str(similarities[1]),
                'cause_score': str(cause_score),
                'Component-Whole(e1,e2)_score': str(similarities[2]),
                'Component-Whole(e2,e1)_score': str(similarities[3]),
                'component_score': str(component_score),
                'is_cause': str(is_cause),
                'original_sentence': str(original_sentence),
                'e1': entry['e1'],
                'e2': entry['e2'],
                'label': entry['r_name'],
                'is_cause_gt': str(entry['r'] in [1, 2])
            }
            results.append(result)

        with open('cc_exemplars_results.json', 'w') as file:
            json.dump(results, file, indent=4)
def val(model, val_loader, writer, step, infer):
    """
    Computes the loss on the validation set and logs it to tensorboard \n
    The loss is computed on a fixed subset with the first [val_batches] batches, defined in the config file \n
    :param model: a PyTorch NN to evaluate
    :param val_loader: a PyTorch DataLoader
    :param writer: a tensorboard writer object
    :param step: the current training step
    :param infer: inference function (see above)
    :return: the average validation loss over the evaluated batches
    """
    print('\n')
    model.eval()
    val_losses = []
    n = len(val_loader)
    with torch.no_grad():
        for batch_idx, batch in enumerate(val_loader):
            # run only on a subset
            if batch_idx >= cfg['val_batches']:
                break
            batch_val_loss = infer(model, batch).item()

            # log
            printProgressBar(batch_idx, min(n, cfg['val_batches']), suffix='\tValidation ...')
            val_losses.append(batch_val_loss)

    val_loss = sum(val_losses) / len(val_losses)
    writer.add_scalar('Steps/val_loss', val_loss, step)
    print('\n')
    print('Finished validation with loss {:.4f}'.format(val_loss))
    return val_loss
def train(model, infer_train, infer_val, load_checkpoint=None):
    """
    Train the RNN model using the parameters defined in the config file \n
    :param model: a PyTorch NN
    :param infer_train: the inference function used for training (see above)
    :param infer_val: the inference function used for validating (see above)
    :param load_checkpoint: if None, does nothing, otherwise starts training from the given path to a .pth checkpoint
    :return:
    """
    global checkpoint_name
    print('Initialising {}'.format(cfg['experiment_name']))
    checkpoint_folder = 'checkpoints/{}/'.format(cfg['experiment_name'])
    if not os.path.exists(checkpoint_folder):
        os.makedirs(checkpoint_folder)
    tb_folder = 'tb/{}/'.format(cfg['experiment_name'])
    if not os.path.exists(tb_folder):
        os.makedirs(tb_folder)

    writer = SummaryWriter(logdir=tb_folder, flush_secs=30)
    optimiser = Adam(model.parameters(), lr=cfg['learning_rate'], weight_decay=cfg['weight_decay'])

    train_dataset = TweetDataset(dataset_type='train')
    train_loader = DataLoader(train_dataset, batch_size=cfg['batch_size'], num_workers=cfg['workers'],
                              collate_fn=collate_function, shuffle=True, pin_memory=True)
    val_dataset = TweetDataset(dataset_type='val')
    val_loader = DataLoader(val_dataset, batch_size=cfg['batch_size'], num_workers=cfg['workers'],
                            collate_fn=collate_function, shuffle=False, pin_memory=True)

    if load_checkpoint:
        checkpoint = torch.load(load_checkpoint)
        assert model.config == checkpoint['net_config'], \
            "The provided checkpoint has a different configuration, loading is impossible"
        start_epoch = checkpoint['epoch'] + 1
        epochs = cfg['epochs'] + start_epoch
        step = checkpoint['step']
        model.load_state_dict(checkpoint['model'])
        optimiser.load_state_dict(checkpoint['optimiser'])
        print("Loaded the checkpoint at {}".format(load_checkpoint))
    else:
        start_epoch, step = 0, 0
        epochs = cfg['epochs']

    init_loss = 0.
    avg_loss = AverageMeter()
    best_mae = 1e10

    print('Sanity val')
    val(model, val_loader, writer, 0, infer_val)
    model.train()

    print('Starting training')
    for epoch in range(start_epoch, epochs):
        loader_length = len(train_loader)
        epoch_start = time.time()

        for batch_idx, batch in enumerate(train_loader):
            optimiser.zero_grad()
            loss = infer_train(model, batch)
            loss.backward()

            if epoch == 0 and batch_idx == 0:
                init_loss = loss

            # logging
            elapsed = time.time() - epoch_start
            progress = batch_idx / loader_length
            est = datetime.timedelta(seconds=int(elapsed / progress)) if progress > 0.001 else '-'
            avg_loss.update(loss)
            suffix = '\tloss {:.4f}/{:.4f}\tETA [{}/{}]'.format(avg_loss.avg, init_loss,
                                                                datetime.timedelta(seconds=int(elapsed)), est)
            printProgressBar(batch_idx, loader_length, suffix=suffix,
                             prefix='Epoch [{}/{}]\tStep [{}/{}]'.format(epoch, epochs - 1,
                                                                         batch_idx, loader_length))
            writer.add_scalar('Steps/train_loss', loss, step)

            # saving the model
            if step % cfg['checkpoint_every'] == 0:
                checkpoint_name = '{}/epoch_{}.pth'.format(checkpoint_folder, epoch)
                torch.save({'model': model.state_dict(), 'epoch': epoch, 'batch_idx': batch_idx,
                            'step': step, 'optimiser': optimiser.state_dict(), 'train_config': cfg,
                            'net_config': model.config, 'dataset_config': DATASET_CONFIG},
                           checkpoint_name)

            step += 1
            optimiser.step()

            # validating
            if step % cfg['val_every'] == 0:
                mae = val(model, val_loader, writer, step, infer_val)
                if mae < best_mae:
                    best_mae = mae
                    print('New best model with validation loss {:.2f}'.format(best_mae))
                    torch.save({'model': model.state_dict(), 'epoch': epoch, 'batch_idx': batch_idx,
                                'step': step, 'optimiser': optimiser.state_dict(), 'train_config': cfg,
                                'net_config': model.config, 'dataset_config': DATASET_CONFIG},
                               '{}/best.pth'.format(checkpoint_folder))
                model.train()

        # end of epoch
        print('')
        writer.add_scalar('Epochs/train_loss', avg_loss.avg, epoch)
        avg_loss.reset()
        checkpoint_name = '{}/epoch_{}.pth'.format(checkpoint_folder, epoch)
        torch.save({'model': model.state_dict(), 'epoch': epoch, 'batch_idx': loader_length,
                    'step': step, 'optimiser': optimiser.state_dict(), 'train_config': cfg,
                    'net_config': model.config, 'dataset_config': DATASET_CONFIG},
                   checkpoint_name)

    # finished training
    writer.close()
    print('Training finished :)')
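
# A hedged sketch of the inference functions train() expects: each takes (model, batch) and returns a
# scalar loss tensor (see val() above). The MSE-on-log-counts objective and the batch keys
# ('embedding', 'numeric', 'target') follow their use elsewhere in this file; treat the exact loss and
# the RNN(RNN_CONFIG) construction as assumptions, not the repository's definitive training setup.
# def infer_mse_log(model, batch):
#     text = batch['embedding'].cuda()
#     numeric = batch['numeric'].cuda()
#     target = batch['target'].cuda().float()
#     prediction = model(text, numeric).squeeze(-1)
#     return torch.nn.functional.mse_loss(prediction, torch.log(target + 1))
#
# model = RNN(RNN_CONFIG).cuda()
# train(model, infer_mse_log, infer_mse_log)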