def all_exemplars(dataset, model, tokenizer, similarity_function):
    with torch.no_grad():
        relations_indices = list(range(19))  # all relations (both ways each + Other)
        data = [e for e in dataset if e['r'] in relations_indices]
        results = []

        exemplar_reps = torch.load('exemplars/all_exemplars.pt').cuda()

        for idx, entry in enumerate(data):
            printProgressBar(idx, len(data))
            original_sentence = entry['text']
            og_ids, og_mask = preprocess_sentence(original_sentence, tokenizer)  # shape (1, 64)
            model_hidden_states = model(og_ids,
                                        attention_mask=og_mask).last_hidden_state  # shape (1, sentence_length, h)
            og_rep = model_hidden_states[:, 0, :]  # use the CLS output: first hidden state: shape (1, h)
            similarities = similarity_function(torch.cat([og_rep, exemplar_reps])).cpu().numpy()
            chosen_r = np.argmax(similarities)

            d1 = {
                'chosen_r': str(chosen_r),
                'chosen_r_name': RELATION_LIST[chosen_r],
                'r_label': str(entry['r']),
                'r_label_name': entry['r_name'],
                'original_sentence': str(original_sentence),
                'e1': entry['e1'],
                'e2': entry['e2'],
                'label': entry['r_name'],
            }
            d2 = {name + '_score': str(similarities[r_idx]) for r_idx, name in enumerate(RELATION_LIST.values())}
            result = {**d1, **d2}
            results.append(result)

        with open('all_exemplars_results.json', 'w') as file:
            json.dump(results, file, indent=4)
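
# A minimal sketch of a similarity_function compatible with the call above,
# assuming the intended behaviour is the cosine similarity between the sentence
# representation (row 0) and each exemplar/candidate representation (rows 1..n).
# The name cosine_to_first is illustrative, not taken from the original code.
def cosine_to_first(reps):
    # reps: (1 + n_candidates, h) -> similarity of row 0 to every other row, shape (n_candidates,)
    return torch.nn.functional.cosine_similarity(reps[:1].expand_as(reps[1:]), reps[1:], dim=1)

# Hypothetical call, assuming the SemEval validation split and a BERT encoder
# loaded as elsewhere in this file:
# all_exemplars(data_val, bert, bert_tokenizer, cosine_to_first)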

# Example #2
def create_vocabulary():
    """
    Using RNN_CONFIG['vocab_using_n_tweets'] tweets from the train.csv dataset, \n
    creates a vocabulary with RNN_CONFIG['AE_vocab_size']] words.\n
    The vocabulary is an ordered dictionary: the keys are the word radicals and the keys each word's index.\n
    :return: None, dumps the vocabulary as a .json file at data/vocab.json
    """
    with open(cfg['csv_relative_path'], newline='') as csvfile:
        data = list(csv.reader(csvfile))[1:]

    vocab = {}
    ps = PorterStemmer()

    for idx, line in enumerate(data[:RNN_CONFIG['vocab_using_n_tweets']]):
        printProgressBar(idx, RNN_CONFIG['vocab_using_n_tweets'],
                         'creating dictionary')
        for word in line[COLUMN_NAME_TO_IDX['text']].lower().split(' '):
            w = ps.stem(word)
            if w in vocab:
                vocab[w] += 1
            else:
                vocab[w] = 1

    # keep the RNN_CONFIG['AE_vocab_size'] most frequent stems (sorted by descending occurrences) and map each one to its rank
    vocab = OrderedDict([(k, idx) for idx, (k, _) in enumerate(
        sorted(vocab.items(), key=lambda item: item[1], reverse=True)
        [:RNN_CONFIG['AE_vocab_size']])])

    with open('data/vocab.json', 'w') as f:
        json.dump(vocab, f, indent=4)
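
# A small usage sketch: load the vocabulary written above and map a tweet to
# word indices. Treating len(vocab) as the out-of-vocabulary index is an
# assumption, not something defined by the original code.
def tweet_to_indices(tweet, vocab, stemmer=None):
    stemmer = stemmer or PorterStemmer()
    unk = len(vocab)  # assumed out-of-vocabulary index
    return [vocab.get(stemmer.stem(word), unk) for word in tweet.lower().split(' ')]

# with open('data/vocab.json', 'r') as f:
#     vocab = json.load(f)
# indices = tweet_to_indices('this tweet will probably get zero retweets', vocab)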

# Example #3
def export_RNN_regressor(checkpoint_path):
    """
    :param checkpoint_path: relative path to a PyTorch .pth checkpoint
    :return: None, dumps a prediction text file in the model's training folder
    """
    checkpoint = torch.load(checkpoint_path)
    model = RNN(checkpoint['net_config'])
    model.load_state_dict(checkpoint['model'])
    model = model.eval().cuda()

    test_dataset = TweetDataset(dataset_type='test')
    test_loader = DataLoader(test_dataset,
                             batch_size=TRAIN_CONFIG['batch_size'],
                             num_workers=TRAIN_CONFIG['workers'],
                             collate_fn=collate_function,
                             shuffle=False,
                             pin_memory=True)

    with open(DATASET_CONFIG['test_csv_relative_path'], newline='') as csvfile:
        test_data = list(csv.reader(csvfile))[1:]

    ids = [datum[0] for datum in test_data]
    n = len(test_loader)

    with open(
            "checkpoints/{}/predictions.txt".format(
                checkpoint['train_config']['experiment_name']), 'w') as f:
        writer = csv.writer(f)
        writer.writerow(["TweetID", "NoRetweets"])
        current_idx = 0
        for batch_index, batch in enumerate(test_loader):
            printProgressBar(batch_index, n)
            batch_size = batch['numeric'].shape[0]

            numeric = batch['numeric'].cuda()
            text = batch['embedding'].cuda()
            if EXPORT_CONFIG['log']:
                prediction = torch.exp(model(text, numeric)) - 1
            else:
                prediction = model(text, numeric)

            if EXPORT_CONFIG['threshold']:
                prediction[prediction > EXPORT_CONFIG['threshold']] = EXPORT_CONFIG['threshold']

            for idx_in_batch in range(batch_size):
                writer.writerow([
                    str(ids[current_idx + idx_in_batch]),
                    str(int(prediction[idx_in_batch].item()))
                ])

            current_idx += batch_size

    print("Exportation done! :)")
def cause_vs_component_replace(dataset, model, tokenizer, similarity_function):
    with torch.no_grad():
        relations_indices = [1, 2, 9, 10]  # keep Cause-Effect and Component-Whole, both directions
        data = [e for e in dataset if e['r'] in relations_indices]
        results = []
        for idx, entry in enumerate(data):
            printProgressBar(idx, len(data))
            original_sentence = entry['text']
            og_ids, og_mask = preprocess_sentence(original_sentence, tokenizer)  # shape (1, 64)

            comparison_sentences = [create_comparison_sentence(original_sentence, entry['e1'], entry['e2'], r_idx)
                                    for r_idx in relations_indices]
            encoded_comparison_sentences = [preprocess_sentence(s, tokenizer) for s in comparison_sentences]
            ids = torch.cat([og_ids] + [e[0] for e in encoded_comparison_sentences])  # stacking input ids
            mask = torch.cat([og_mask] + [e[1] for e in encoded_comparison_sentences])  # stacking attention masks

            model_hidden_states = model(ids, attention_mask=mask).last_hidden_state  # shape (5, sentence_length, h)
            model_output = model_hidden_states[:, 0, :]  # use the CLS output: first hidden state: shape (5, h)

            similarities = similarity_function(model_output).cpu().numpy()
            cause_score = max(similarities[:2])
            component_score = max(similarities[2:])
            is_cause = cause_score > component_score

            result = {
                'Cause-Effect(e1,e2)_score': str(similarities[0]),
                'Cause-Effect(e1,e2)_sentence': str(comparison_sentences[0]),
                'Cause-Effect(e2,e1)_score': str(similarities[1]),
                'Cause-Effect(e2,e1)_sentence': str(comparison_sentences[1]),
                'cause_score': str(cause_score),
                'Component-Whole(e1,e2)_score': str(similarities[2]),
                'Component-Whole(e1,e2)_sentence': str(comparison_sentences[2]),
                'Component-Whole(e2,e1)_score': str(similarities[3]),
                'Component-Whole(e2,e1)_sentence': str(comparison_sentences[3]),
                'component_score': str(component_score),
                'is_cause': str(is_cause),
                'original_sentence': str(original_sentence),
                'e1': entry['e1'],
                'e2': entry['e2'],
                'label': entry['r_name'],
                'is_cause_gt': str(entry['r'] in [1, 2])
            }
            results.append(result)

        with open('cc_replace_results.json', 'w') as file:
            json.dump(results, file, indent=4)
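
# create_comparison_sentence is used above but not defined in this excerpt. A
# minimal template-based sketch of what it could look like, assuming relation
# indices 1/2 are Cause-Effect(e1,e2)/(e2,e1) and 9/10 are
# Component-Whole(e1,e2)/(e2,e1), as the comments above suggest. The templates
# and the choice to append them to the sentence are illustrative assumptions.
def create_comparison_sentence_sketch(sentence, e1, e2, r_idx):
    templates = {
        1: '{e1} causes {e2}.',
        2: '{e2} causes {e1}.',
        9: '{e1} is a component of {e2}.',
        10: '{e2} is a component of {e1}.',
    }
    return sentence + ' ' + templates[r_idx].format(e1=e1, e2=e2)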

# Example #5
    def get_data(dataset, message):
        N = len(dataset)
        data = np.zeros((N, XGBOOST_CONFIG['numeric_data_size'] +
                         XGBOOST_CONFIG['embedding_size'] + 1))  # +1 column for the target
        loader = DataLoader(dataset,
                            batch_size=TRAIN_CONFIG['batch_size'],
                            num_workers=TRAIN_CONFIG['workers'],
                            collate_fn=collate_function,
                            shuffle=False)
        current_idx = 0
        n = len(loader)
        print('')
        for batch_index, batch in enumerate(loader):
            printProgressBar(batch_index, n, prefix=message)
            batch_size = batch['numeric'].shape[0]

            numeric = batch['numeric'].cuda()
            text = batch['embedding'].cuda()

            if XGBOOST_CONFIG['embedding_use_hidden']:
                embedding = embed(
                    text,
                    numeric[:, :checkpoint['net_config']['numeric_data_size']]
                )[1]
            elif XGBOOST_CONFIG['embedding_use_output']:
                embedding = torch.exp(
                    embed(
                        text, numeric[:, :checkpoint['net_config']
                                      ['numeric_data_size']])[0]) - 1
            else:  # expecting a built-in embedding layer -> taking the mean of the embeddings
                embedding = embed.emb(text).mean(axis=1)

            data[current_idx:current_idx+batch_size, XGBOOST_CONFIG['numeric_data_size']:-1] = \
                embedding.detach().cpu().numpy()
            data[current_idx:current_idx+batch_size, :XGBOOST_CONFIG['numeric_data_size']] = \
                numeric.detach().cpu().numpy()
            data[current_idx:current_idx + batch_size,
                 -1] = batch['target'].numpy()

            current_idx += batch_size

        return data
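
# A sketch of how the matrix built by get_data could be consumed, assuming the
# last column holds the regression target (as filled above) and that an XGBoost
# regressor is trained downstream; the hyper-parameters are placeholders rather
# than values from XGBOOST_CONFIG. (get_data above is an inner helper of its
# training routine; this sketch only consumes its output.)
def fit_xgboost_on_matrix(matrix):
    import xgboost as xgb
    X, y = matrix[:, :-1], matrix[:, -1]
    params = {'objective': 'reg:squarederror', 'max_depth': 6, 'eta': 0.1}
    return xgb.train(params, xgb.DMatrix(X, label=y), num_boost_round=200)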

# Example #6
def parser(file_name):
    '''Parse the OpenEdu course catalogue through its public API'''

    # getting the first page
    r = api.get('https://courses.openedu.ru/api/courses/v1/courses/?page=1')
    data = course_encode(str(r.json()))
    with open(file_name, 'w+') as f:
        f.write(data)
    pages = int(r.json()['pagination']['num_pages'])

    # initialise the progress bar
    printProgressBar(0,
                     pages,
                     prefix='Progress of parsing: ',
                     suffix='Complete {} of {}'.format(0, pages),
                     length=30)

    # getting all remaining pages
    for i in range(2, pages + 1):

        printProgressBar(i,
                         pages,
                         prefix='Progress of parsing: ',
                         suffix='Complete {} of {}'.format(i, pages),
                         length=30)

        r = api.get(
            'https://courses.openedu.ru/api/courses/v1/courses/?page=' +
            str(i))
        dict_of_data = r.json()
        text_to_write = list(str(dict_of_data['results']))
        text_to_write[0] = ','
        text_to_write.append('}')
        text_to_write = course_encode(''.join(text_to_write))
        text_to_write = str.encode(''.join(text_to_write))

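        # splice this page into the existing file: seek to just before the closing
        # characters and overwrite them with the encoded continuation of the results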
        with open(file_name, 'rb+') as f:
            f.seek(-2, 2)
            f.write(text_to_write)

    return dict_of_data

# Example #7
def save_exemplar_representations(batch_size=64, workers=8):
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert = BertModel.from_pretrained("bert-base-uncased").cuda()

    with open('data/semeval_val.json', 'r') as f:
        data_val = json.load(f)

    dataset = ProcessedTextDataset([e['text'] for e in data_val],
                                   bert_tokenizer,
                                   labels=[e['r'] for e in data_val])
    loader = DataLoader(dataset,
                        batch_size=batch_size,
                        shuffle=False,
                        num_workers=workers)
    n_batches = len(loader)
    averages = torch.zeros(19, 768).cuda()  # n_relations x BERT-base hidden size
    counts = torch.zeros(19).cuda()  # number of examples seen per relation

    with torch.no_grad():
        for batch_idx, batch in enumerate(loader):
            printProgressBar(batch_idx,
                             n_batches,
                             prefix='Processing all exemplars ...')
            input_ids = batch['input_ids'].cuda()
            masks = batch['mask'].cuda()
            labels = batch['label'].cuda()
            model_hidden_states = bert(input_ids,
                                       attention_mask=masks).last_hidden_state
            model_output = model_hidden_states[:, 0, :]

            for r in range(19):
                is_r = labels == r  # mask of the examples labelled with relation r
                n_r = is_r.sum()  # number of examples with relation r in this batch
                if n_r > 0:  # there might be no example of this class in the batch
                    averages[r, :] += torch.sum(model_output[is_r, :], dim=0)
                    counts[r] += n_r

    # accumulating sums and counts, then dividing once at the end, gives the true
    # per-relation mean instead of a sum of per-batch means
    averages /= counts.clamp(min=1).unsqueeze(1)

    file = 'exemplars/all_exemplars.pt'
    if not os.path.exists('exemplars/'):
        os.makedirs('exemplars/')
    torch.save(averages.cpu(), file)
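
# The exemplar file written above is what all_exemplars and
# cause_vs_component_examplars load, so save_exemplar_representations has to run
# first. A minimal sanity-check sketch (19 relations x BERT-base hidden size 768,
# as in the code above):
def check_exemplar_file(path='exemplars/all_exemplars.pt'):
    reps = torch.load(path)
    assert reps.shape == (19, 768), reps.shape
    return reps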
def cause_vs_component_examplars(dataset, model, tokenizer, similarity_function):
    with torch.no_grad():
        relations_indices = [1, 2, 9, 10]  # keep Cause-Effect and Component-Whole, both directions
        data = [e for e in dataset if e['r'] in relations_indices]
        results = []

        all_reps = torch.load('exemplars/all_exemplars.pt').cuda()
        exemplar_reps = all_reps[relations_indices, :]

        for idx, entry in enumerate(data):
            printProgressBar(idx, len(data))
            original_sentence = entry['text']
            og_ids, og_mask = preprocess_sentence(original_sentence, tokenizer)  # shape (1, 64)
            model_hidden_states = model(og_ids,
                                        attention_mask=og_mask).last_hidden_state  # shape (1, sentence_length, h)
            og_rep = model_hidden_states[:, 0, :]  # use the CLS output: first hidden state: shape (1, h)
            similarities = similarity_function(torch.cat([og_rep, exemplar_reps])).cpu().numpy()
            cause_score = max(similarities[:2])
            component_score = max(similarities[2:])
            is_cause = cause_score > component_score

            result = {
                'Cause-Effect(e1,e2)_score': str(similarities[0]),
                'Cause-Effect(e2,e1)_score': str(similarities[1]),
                'cause_score': str(cause_score),
                'Component-Whole(e1,e2)_score': str(similarities[2]),
                'Component-Whole(e2,e1)_score': str(similarities[3]),
                'component_score': str(component_score),
                'is_cause': str(is_cause),
                'original_sentence': str(original_sentence),
                'e1': entry['e1'],
                'e2': entry['e2'],
                'label': entry['r_name'],
                'is_cause_gt': str(entry['r'] in [1, 2])
            }
            results.append(result)

        with open('cc_exemplars_results.json', 'w') as file:
            json.dump(results, file, indent=4)

# Example #9
def val(model, val_loader, writer, step, infer):
    """
    Computes the loss on the validation set and logs it to tensorboard \n
    The loss is computed on a fixed subset with the first [val_batches] batches, defined in the config file \n
    :param model: a PyTorch NN to evaluate
    :param val_loader: a PyTorch Dataloader
    :param writer: a tensorboard writer object
    :param step: the current training step
    :param infer: inference function (see above)
    :return:
    """

    print('\n')
    model.eval()
    val_losses = []
    n = len(val_loader)

    with torch.no_grad():
        for batch_idx, batch in enumerate(val_loader):

            # run only on a subset
            if batch_idx >= cfg['val_batches']:
                break

            batch_val_loss = infer(model, batch).item()

            # log
            printProgressBar(batch_idx,
                             min(n, cfg['val_batches']),
                             suffix='\tValidation ...')

            val_losses.append(batch_val_loss)

        val_loss = sum(val_losses) / len(val_losses)
    writer.add_scalar('Steps/val_loss', val_loss, step)
    print('\n')
    print('Finished validation with loss {:.4f}'.format(val_loss))
    return val_loss
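
# The infer argument of val (and of train below) is not defined in this excerpt.
# A minimal sketch of what such a function could look like, assuming the model
# maps (text embedding, numeric features) to one value per tweet and is trained
# with an MSE loss on log(1 + retweet_count), which would be consistent with the
# torch.exp(...) - 1 used at export time; the exact loss and target transform
# are assumptions.
def infer_sketch(model, batch):
    numeric = batch['numeric'].cuda()
    text = batch['embedding'].cuda()
    target = torch.log1p(batch['target'].float().cuda())
    prediction = model(text, numeric).squeeze(-1)
    return torch.nn.functional.mse_loss(prediction, target)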

# Example #10
def train(model, infer_train, infer_val, load_checkpoint=None):
    """
    Train the RNN model using the parameters defined in the config file \n
    :param model: a pytorch NN
    :param infer_train: the inference function used for training (see above)
    :param infer_val: the inference function used for validating (see above)
    :param load_checkpoint: if None, does nothing, otherwise starts training from the given path to a .pth checkpoint
    :return:
    """

    global checkpoint_name
    print('Initialising {}'.format(cfg['experiment_name']))
    checkpoint_folder = 'checkpoints/{}/'.format(cfg['experiment_name'])

    if not os.path.exists(checkpoint_folder):
        os.makedirs(checkpoint_folder)

    tb_folder = 'tb/{}/'.format(cfg['experiment_name'])
    if not os.path.exists(tb_folder):
        os.makedirs(tb_folder)

    writer = SummaryWriter(logdir=tb_folder, flush_secs=30)
    optimiser = Adam(model.parameters(),
                     lr=cfg['learning_rate'],
                     weight_decay=cfg['weight_decay'])

    train_dataset = TweetDataset(dataset_type='train')
    train_loader = DataLoader(train_dataset,
                              batch_size=cfg['batch_size'],
                              num_workers=cfg['workers'],
                              collate_fn=collate_function,
                              shuffle=True,
                              pin_memory=True)

    val_dataset = TweetDataset(dataset_type='val')
    val_loader = DataLoader(val_dataset,
                            batch_size=cfg['batch_size'],
                            num_workers=cfg['workers'],
                            collate_fn=collate_function,
                            shuffle=False,
                            pin_memory=True)

    if load_checkpoint:
        checkpoint = torch.load(load_checkpoint)
        assert model.config == checkpoint['net_config'], \
            "The provided checkpoint has a different configuration, loading is impossible"
        start_epoch = checkpoint['epoch'] + 1
        epochs = cfg['epochs'] + start_epoch
        step = checkpoint['step']
        model.load_state_dict(checkpoint['model'])
        optimiser.load_state_dict(checkpoint['optimiser'])
        print("Loaded the checkpoint at {}".format(load_checkpoint))
    else:
        start_epoch, step = 0, 0
        epochs = cfg['epochs']

    init_loss = 0.
    avg_loss = AverageMeter()
    best_mae = 1e10

    print('Sanity val')
    val(model, val_loader, writer, 0, infer_val)
    model.train()

    print('Starting training')
    for epoch in range(start_epoch, epochs):
        loader_length = len(train_loader)
        epoch_start = time.time()

        for batch_idx, batch in enumerate(train_loader):
            optimiser.zero_grad()

            loss = infer_train(model, batch)
            loss.backward()

            if epoch == 0 and batch_idx == 0:
                init_loss = loss.item()

            # logging
            elapsed = time.time() - epoch_start
            progress = batch_idx / loader_length
            est = datetime.timedelta(
                seconds=int(elapsed / progress)) if progress > 0.001 else '-'
            avg_loss.update(loss.item())
            suffix = '\tloss {:.4f}/{:.4f}\tETA [{}/{}]'.format(
                avg_loss.avg, init_loss,
                datetime.timedelta(seconds=int(elapsed)), est)
            printProgressBar(batch_idx,
                             loader_length,
                             suffix=suffix,
                             prefix='Epoch [{}/{}]\tStep [{}/{}]'.format(
                                 epoch, epochs - 1, batch_idx, loader_length))

            writer.add_scalar('Steps/train_loss', loss, step)

            # saving the model
            if step % cfg['checkpoint_every'] == 0:
                checkpoint_name = '{}/epoch_{}.pth'.format(
                    checkpoint_folder, epoch)
                torch.save(
                    {
                        'model': model.state_dict(),
                        'epoch': epoch,
                        'batch_idx': batch_idx,
                        'step': step,
                        'optimiser': optimiser.state_dict(),
                        'train_config': cfg,
                        'net_config': model.config,
                        'dataset_config': DATASET_CONFIG
                    }, checkpoint_name)
            step += 1
            optimiser.step()

            # validating
            if step % cfg['val_every'] == 0:
                mae = val(model, val_loader, writer, step, infer_val)
                if mae < best_mae:
                    best_mae = mae
                    print('Best model with val loss {:.2f}'.format(best_mae))
                    torch.save(
                        {
                            'model': model.state_dict(),
                            'epoch': epoch,
                            'batch_idx': batch_idx,
                            'step': step,
                            'optimiser': optimiser.state_dict(),
                            'train_config': cfg,
                            'net_config': model.config,
                            'dataset_config': DATASET_CONFIG
                        }, '{}/best.pth'.format(checkpoint_folder))
                model.train()

        # end of epoch
        print('')
        writer.add_scalar('Epochs/train_loss', avg_loss.avg, epoch)
        avg_loss.reset()
        checkpoint_name = '{}/epoch_{}.pth'.format(checkpoint_folder, epoch)
        torch.save(
            {
                'model': model.state_dict(),
                'epoch': epoch,
                'batch_idx': loader_length,
                'step': step,
                'optimiser': optimiser.state_dict(),
                'train_config': cfg,
                'net_config': model.config,
                'dataset_config': DATASET_CONFIG
            }, checkpoint_name)

    # finished training
    writer.close()
    print('Training finished :)')
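
# A hypothetical driver tying the pieces together; RNN and RNN_CONFIG come from
# the rest of the project, and infer_sketch refers to the sketch above rather
# than to the original infer_train / infer_val helpers.
# model = RNN(RNN_CONFIG).cuda()
# train(model, infer_train=infer_sketch, infer_val=infer_sketch)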