Example #1
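
# Imports needed to run this snippet; the transformer_rankers module paths are assumed
# from the project's layout and are not part of the original (truncated) listing.
import logging

from transformers import BertTokenizer, BertForSequenceClassification

from transformer_rankers.trainers import transformer_trainer
from transformer_rankers.datasets import dataset
from transformer_rankers.negative_samplers import negative_sampling
from transformer_rankers.eval import results_analyses_tools
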
# NOTE: the opening of this example was truncated in the original listing; the surviving
# fragment was the tail of the ns_val construction. A plausible reconstruction, following
# the pattern of the other examples, is shown below. It assumes `train` and `valid`
# DataFrames with a "passage" column were loaded earlier.
ns_train = negative_sampling.RandomNegativeSampler(
    list(train["passage"].values), 9)
ns_val = negative_sampling.RandomNegativeSampler(
    list(valid["passage"].values) + list(train["passage"].values), 9)

logging.info("Initializing the Tokenizer")
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
special_tokens_dict = {
    'additional_special_tokens': ['[UTTERANCE_SEP]', '[TURN_SEP]']
}
tokenizer.add_special_tokens(special_tokens_dict)
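# With these tokens registered, a multi-turn dialogue context can be flattened into a
# single input string; an illustrative (hypothetical) formatting would be:
#   "how do I mount a drive ? [UTTERANCE_SEP] which release ? [TURN_SEP] 18.04"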

#Create the loaders for the datasets, with the respective negative samplers
dataloader = dataset.QueryDocumentDataLoader(train_df=train,
                                             val_df=valid,
                                             test_df=valid,
                                             tokenizer=tokenizer,
                                             negative_sampler_train=ns_train,
                                             negative_sampler_val=ns_val,
                                             task_type='classification',
                                             train_batch_size=24,
                                             val_batch_size=24,
                                             max_seq_len=512,
                                             sample_data=-1,
                                             cache_path="../data")
logging.info("Initializing the DataLoader")
train_loader, val_loader, test_loader = dataloader.get_pytorch_dataloaders()

#Use BERT to rank responses
model = BertForSequenceClassification.from_pretrained('bert-base-cased')
# we added [UTTERANCE_SEP] and [TURN_SEP] to the vocabulary so we need to resize the token embeddings
model.resize_token_embeddings(len(dataloader.tokenizer))

#Instantiate trainer that handles fitting.
trainer = transformer_trainer.TransformerTrainer(model=model,
    # (the original snippet is truncated here; the remaining keyword arguments are
    #  illustrative, following the pattern of Example #4)
    train_loader=train_loader, val_loader=val_loader, test_loader=test_loader,
    num_ns_eval=9, task_type="classification", tokenizer=tokenizer,
    validate_every_epochs=1, num_validation_batches=-1,
    num_epochs=1, lr=0.0005, sacred_ex=None)
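
# A minimal follow-up sketch (not part of the truncated original), mirroring Examples #2
# and #4: fit the model, then evaluate on the loader used as the test split.
trainer.fit()
preds, labels = trainer.test()  # Example #2 unpacks two return values; later examples return three
res = results_analyses_tools.evaluate_and_aggregate(preds, labels, ['R_10@1'])
logging.info("Test R_10@1 : {:4f}".format(res['R_10@1']))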
Example #2
def run_experiment(args):
    args.run_id = str(ex.current_run._id)

    tokenizer = BertTokenizer.from_pretrained(args.transformer_model)
    #Load datasets
    ## Conversation Response Ranking
    if args.task in ["mantis", "msdialog", "ubuntu_dstc8"]: 
        add_turn_separator = (args.task != "ubuntu_dstc8") # Ubuntu data has several utterances from the same user in the context.
        train = preprocess_crr.read_crr_tsv_as_df(args.data_folder+args.task+"/train.tsv", args.sample_data, add_turn_separator)
        valid = preprocess_crr.read_crr_tsv_as_df(args.data_folder+args.task+"/valid.tsv", args.sample_data, add_turn_separator)
        special_tokens_dict = {'additional_special_tokens': ['[UTTERANCE_SEP]', '[TURN_SEP]'] }
        tokenizer.add_special_tokens(special_tokens_dict)
    ## Similar Question Retrieval and Passage Retrieval
    elif args.task in ["qqp", "linkso", "trec2020pr"]:
        if args.sample_data == -1: args.sample_data=None            
        train = pd.read_csv(args.data_folder+args.task+"/train.tsv", sep="\t", nrows=args.sample_data)
        valid = pd.read_csv(args.data_folder+args.task+"/valid.tsv", sep="\t", nrows=args.sample_data)
    elif args.task=="scisumm":
        train, valid = preprocess_scisumm.transform_to_dfs("../data/Training-Set-2019/Task1/From-Training-Set-2018/")

    #Choose the negative candidate sampler
    document_col = train.columns[1]
    if args.train_negative_sampler == 'random':
        ns_train = negative_sampling.RandomNegativeSampler(list(train[document_col].values), args.num_ns_train)
    elif args.train_negative_sampler == 'bm25':
        ns_train = negative_sampling.BM25NegativeSamplerPyserini(list(train[document_col].values), args.num_ns_train, 
                    args.data_folder+"/"+args.task+"/anserini_train/", args.sample_data, args.anserini_folder)
    elif args.train_negative_sampler == 'sentenceBERT':
        ns_train = negative_sampling.SentenceBERTNegativeSampler(list(train[document_col].values), args.num_ns_train, 
                    args.data_folder+"/"+args.task+"/train_sentenceBERTembeds", args.sample_data, args.bert_sentence_model)

    if args.test_negative_sampler == 'random':
        ns_val = negative_sampling.RandomNegativeSampler(list(valid[document_col].values) + list(train[document_col].values), args.num_ns_eval)
    elif args.test_negative_sampler == 'bm25':
        ns_val = negative_sampling.BM25NegativeSamplerPyserini(list(valid[document_col].values) + list(train[document_col].values),
                    args.num_ns_eval, args.data_folder+"/"+args.task+"/anserini_valid/", args.sample_data, args.anserini_folder)
    elif args.test_negative_sampler == 'sentenceBERT':
        ns_val = negative_sampling.SentenceBERTNegativeSampler(list(valid[document_col].values) + list(train[document_col].values),
                    args.num_ns_eval, args.data_folder+"/"+args.task+"/valid_sentenceBERTembeds", args.sample_data, args.bert_sentence_model)

    #Create the loaders for the datasets, with the respective negative samplers
    dataloader = dataset.QueryDocumentDataLoader(train, valid, valid,
                                tokenizer, ns_train, ns_val,
                                'classification', args.train_batch_size, 
                                args.val_batch_size, args.max_seq_len, 
                                args.sample_data, args.data_folder + args.task)

    train_loader, val_loader, test_loader = dataloader.get_pytorch_dataloaders()


    #Instantiate transformer model to be used
    model = BertForSequenceClassification.from_pretrained(args.transformer_model)
    model.resize_token_embeddings(len(dataloader.tokenizer))

    #Instantiate trainer that handles fitting.
    trainer = transformer_trainer.TransformerTrainer(model, train_loader, val_loader, test_loader, 
                                 args.num_ns_eval, "classification", tokenizer,
                                 args.validate_every_epochs, args.num_validation_instances,
                                 args.num_epochs, args.lr, args.sacred_ex)

    #Train
    model_name = model.__class__.__name__
    logging.info("Fitting {} for {}{}".format(model_name, args.data_folder, args.task))
    trainer.fit()

    #Predict for test
    logging.info("Predicting")
    preds, labels = trainer.test()
    res = results_analyses_tools.evaluate_and_aggregate(preds, labels, ['R_10@1',
                    'R_10@2',
                    'R_10@5',
                    'R_2@1'])
    for metric, v in res.items():
        logging.info("Test {} : {:4f}".format(metric, v))

    #Saving predictions and labels to a file
    max_preds_column = max([len(l) for l in preds])
    preds_df = pd.DataFrame(preds, columns=["prediction_"+str(i) for i in range(max_preds_column)])
    preds_df.to_csv(args.output_dir+"/"+args.run_id+"/predictions.csv", index=False)

    labels_df = pd.DataFrame(labels, columns=["label_"+str(i) for i in range(max_preds_column)])
    labels_df.to_csv(args.output_dir+"/"+args.run_id+"/labels.csv", index=False)

    #Saving model to a file
    if args.save_model:
        torch.save(model.state_dict(), args.output_dir+"/"+args.run_id+"/model")

    #In case we want to get uncertainty estimations at prediction time
    if args.predict_with_uncertainty_estimation:  
        logging.info("Predicting with dropout.")      
        preds, uncertainties, labels, foward_passes_preds = trainer.test_with_dropout(args.num_foward_prediction_passes)
        res = results_analyses_tools.evaluate_and_aggregate(preds, labels, ['R_10@1'])
        for metric, v in res.items():
            logging.info("Test (w. dropout and {} forward passes) {} : {:4f}".format(args.num_foward_prediction_passes, metric, v))
        
        max_preds_column = max([len(l) for l in preds])
        preds_df = pd.DataFrame(preds, columns=["prediction_"+str(i) for i in range(max_preds_column)])
        preds_df.to_csv(args.output_dir+"/"+args.run_id+"/predictions_with_dropout.csv", index=False)

        for i, f_pass_preds in enumerate(foward_passes_preds):
            preds_df = pd.DataFrame(f_pass_preds, columns=["prediction_"+str(i) for i in range(max_preds_column)])
            preds_df.to_csv(args.output_dir+"/"+args.run_id+"/predictions_with_dropout_f_pass_{}.csv".format(i), index=False)

        labels_df = pd.DataFrame(labels, columns=["label_"+str(i) for i in range(max_preds_column)])
        labels_df.to_csv(args.output_dir+"/"+args.run_id+"/labels.csv", index=False)
        
        uncertainties_df = pd.DataFrame(uncertainties, columns=["uncertainty_"+str(i) for i in range(max_preds_column)])
        uncertainties_df.to_csv(args.output_dir+"/"+args.run_id+"/uncertainties.csv", index=False)

    return trainer.best_ndcg
Example #3
def run_experiment(args):
    args.run_id = str(ex.current_run._id)

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    tokenizer = BertTokenizer.from_pretrained(args.transformer_model)
    # Conversation Response Ranking datasets need special tokens
    if args.task in ["mantis", "msdialog", "ubuntu_dstc8"]:
        special_tokens_dict = {
            'additional_special_tokens': ['[UTTERANCE_SEP]', '[TURN_SEP]']
        }
        tokenizer.add_special_tokens(special_tokens_dict)

    #Load datasets
    train = pd.read_csv(
        args.data_folder + args.task + "/train.tsv",
        sep="\t",
        nrows=args.sample_data if args.sample_data != -1 else None)
    valid = pd.read_csv(
        args.data_folder + args.task + "/valid.tsv",
        sep="\t",
        nrows=args.sample_data if args.sample_data != -1 else None)

    #Choose the negative candidate sampler
    document_col = train.columns[1]
    if args.train_negative_sampler == 'random':
        ns_train = negative_sampling.RandomNegativeSampler(
            list(train[document_col].values), args.num_ns_train)
    elif args.train_negative_sampler == 'bm25':
        ns_train = negative_sampling.BM25NegativeSamplerPyserini(
            list(train[document_col].values), args.num_ns_train,
            args.data_folder + args.task + "/anserini_train/",
            args.sample_data, args.anserini_folder)
    elif args.train_negative_sampler == 'sentenceBERT':
        ns_train = negative_sampling.SentenceBERTNegativeSampler(
            list(train[document_col].values), args.num_ns_train,
            args.data_folder + args.task + "/train_sentenceBERTembeds",
            args.sample_data, args.bert_sentence_model)

    if args.test_negative_sampler == 'random':
        ns_val = negative_sampling.RandomNegativeSampler(
            list(valid[document_col].values) +
            list(train[document_col].values), args.num_ns_eval)
    elif args.test_negative_sampler == 'bm25':
        ns_val = negative_sampling.BM25NegativeSamplerPyserini(
            list(valid[document_col].values) +
            list(train[document_col].values), args.num_ns_eval,
            args.data_folder + args.task + "/anserini_valid/",
            args.sample_data, args.anserini_folder)
    elif args.test_negative_sampler == 'sentenceBERT':
        ns_val = negative_sampling.SentenceBERTNegativeSampler(
            list(valid[document_col].values) +
            list(train[document_col].values), args.num_ns_eval,
            args.data_folder + args.task + "/valid_sentenceBERTembeds",
            args.sample_data, args.bert_sentence_model)

    #Create the loaders for the datasets, with the respective negative samplers
    dataloader = dataset.QueryDocumentDataLoader(
        train, valid, valid, tokenizer, ns_train, ns_val, 'classification',
        args.train_batch_size, args.val_batch_size, args.max_seq_len,
        args.sample_data, args.data_folder + args.task)

    train_loader, val_loader, test_loader = dataloader.get_pytorch_dataloaders(
    )

    #Instantiate transformer model to be used
    model = pointwise_bert.BertForPointwiseLearning.from_pretrained(
        args.transformer_model,
        loss_function=args.loss_function,
        smoothing=args.smoothing)
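    # Note: loss_function and smoothing above are custom keyword arguments of this project's
    # BertForPointwiseLearning wrapper (e.g. selecting a label-smoothed loss); they are not
    # part of the stock Hugging Face BertForSequenceClassification API.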

    model.resize_token_embeddings(len(dataloader.tokenizer))

    #Instantiate trainer that handles fitting.
    trainer = transformer_trainer.TransformerTrainer(
        model,
        train_loader,
        val_loader,
        test_loader,
        args.num_ns_eval,
        "classification",
        tokenizer,
        args.validate_every_epochs,
        args.num_validation_batches,
        args.num_epochs,
        args.lr,
        args.sacred_ex,
        args.validate_every_steps,
        validation_metric='R_10@1',
        num_training_instances=args.num_training_instances)

    #Train
    model_name = model.__class__.__name__
    logging.info("Fitting {} for {}{}".format(model_name, args.data_folder,
                                              args.task))
    trainer.fit()

    #Predict for test
    logging.info("Predicting for the validation set.")
    preds, labels, softmax_logits = trainer.test()
    res = results_analyses_tools.evaluate_and_aggregate(
        preds, labels, ['R_10@1'])
    for metric, v in res.items():
        logging.info("Test {} : {:3f}".format(metric, v))
        wandb.log({'step': 0, "dev_" + metric: v})

    #Saving predictions and labels to a file
    max_preds_column = max([len(l) for l in preds])
    preds_df = pd.DataFrame(
        preds,
        columns=["prediction_" + str(i) for i in range(max_preds_column)])
    preds_df.to_csv(args.output_dir + "/" + args.run_id + "/predictions.csv",
                    index=False)

    softmax_df = pd.DataFrame(
        softmax_logits,
        columns=["prediction_" + str(i) for i in range(max_preds_column)])
    softmax_df.to_csv(args.output_dir + "/" + args.run_id +
                      "/predictions_softmax.csv",
                      index=False)

    labels_df = pd.DataFrame(
        labels, columns=["label_" + str(i) for i in range(max_preds_column)])
    labels_df.to_csv(args.output_dir + "/" + args.run_id + "/labels.csv",
                     index=False)

    #Saving model to a file
    if args.save_model:
        torch.save(model.state_dict(),
                   args.output_dir + "/" + args.run_id + "/model")

    #In case we want to get uncertainty estimations at prediction time
    if args.predict_with_uncertainty_estimation:
        logging.info("Predicting with MC dropout for the validation set.")
        preds, labels, softmax_logits, foward_passes_preds, uncertainties = trainer.test_with_dropout(
            args.num_foward_prediction_passes)
        res = results_analyses_tools.evaluate_and_aggregate(
            preds, labels, ['R_10@1'])
        for metric, v in res.items():
            logging.info(
                "Test (w. dropout and {} forward passes) {} : {:3f}".format(
                    args.num_foward_prediction_passes, metric, v))

        max_preds_column = max([len(l) for l in preds])
        preds_df = pd.DataFrame(
            preds,
            columns=["prediction_" + str(i) for i in range(max_preds_column)])
        preds_df.to_csv(args.output_dir + "/" + args.run_id +
                        "/predictions_with_dropout.csv",
                        index=False)

        softmax_df = pd.DataFrame(
            softmax_logits,
            columns=["prediction_" + str(i) for i in range(max_preds_column)])
        softmax_df.to_csv(args.output_dir + "/" + args.run_id +
                          "/predictions_with_dropout_softmax.csv",
                          index=False)

        for i, f_pass_preds in enumerate(foward_passes_preds):
            preds_df = pd.DataFrame(f_pass_preds,
                                    columns=[
                                        "prediction_" + str(i)
                                        for i in range(max_preds_column)
                                    ])
            preds_df.to_csv(
                args.output_dir + "/" + args.run_id +
                "/predictions_with_dropout_f_pass_{}.csv".format(i),
                index=False)

        labels_df = pd.DataFrame(
            labels,
            columns=["label_" + str(i) for i in range(max_preds_column)])
        labels_df.to_csv(args.output_dir + "/" + args.run_id + "/labels.csv",
                         index=False)

        uncertainties_df = pd.DataFrame(
            uncertainties,
            columns=["uncertainty_" + str(i) for i in range(max_preds_column)])
        uncertainties_df.to_csv(args.output_dir + "/" + args.run_id +
                                "/uncertainties.csv",
                                index=False)

    return trainer.best_eval_metric
Example #4
def main():
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s [%(levelname)s] %(message)s",
                        handlers=[logging.StreamHandler()])
    task = 'qqp'
    data_folder = "../../data/"
    logging.info("Starting downloader for task {}".format(task))

    dataDownloader = downloader.DataDownloader(task, data_folder)
    dataDownloader.download_and_preprocess()

    train = pd.read_csv("{}/{}/train.tsv".format(data_folder, task), sep="\t")
    valid = pd.read_csv("{}/{}/valid.tsv".format(data_folder, task), sep="\t")

    # Random negative samplers
    ns_train = negative_sampling.RandomNegativeSampler(
        list(train["question1"].values), 1)
    ns_val = negative_sampling.RandomNegativeSampler(list(valid["question1"].values) + \
       list(train["question1"].values), 1)

    tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

    #Create the loaders for the datasets, with the respective negative samplers
    dataloader = dataset.QueryDocumentDataLoader(
        train_df=train,
        val_df=valid,
        test_df=valid,
        tokenizer=tokenizer,
        negative_sampler_train=ns_train,
        negative_sampler_val=ns_val,
        task_type='classification',
        train_batch_size=6,
        val_batch_size=6,
        max_seq_len=100,
        sample_data=-1,
        cache_path="{}/{}".format(data_folder, task))

    train_loader, val_loader, test_loader = dataloader.get_pytorch_dataloaders(
    )

    model = pointwise_bert.BertForPointwiseLearning.from_pretrained(
        'bert-base-cased')

    #Instantiate trainer that handles fitting.
    trainer = transformer_trainer.TransformerTrainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        test_loader=test_loader,
        num_ns_eval=9,
        task_type="classification",
        tokenizer=tokenizer,
        validate_every_epochs=1,
        num_validation_batches=-1,
        num_epochs=1,
        lr=0.0005,
        sacred_ex=None,
        validate_every_steps=100)

    #Train the model
    logging.info("Fitting pointwise BERT for {}".format(task))
    trainer.fit()

    #Predict for test (in our example the validation set)
    logging.info("Predicting")
    preds, labels, _ = trainer.test()
    res = results_analyses_tools.\
       evaluate_and_aggregate(preds, labels, ['ndcg_cut_10'])

    for metric, v in res.items():
        logging.info("Test {} : {:4f}".format(metric, v))
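
# Entry-point guard (not in the original listing) so this example can be run as a script.
if __name__ == "__main__":
    main()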

Example #5

def run_experiment(args):
    args.run_id = str(ex.current_run._id)

    tokenizer = BertTokenizer.from_pretrained(args.transformer_model)
    # Load datasets
    ## Conversation Response Ranking
    if args.task in ["mantis", "msdialog", "ubuntu_dstc8"]:
        add_turn_separator = (
            args.task != "ubuntu_dstc8"
        )  # Ubuntu data has several utterances from the same user in the context.
        train = preprocess_crr.read_crr_tsv_as_df(
            args.data_folder + args.task + "/train.tsv", args.sample_data,
            add_turn_separator)
        valid = preprocess_crr.read_crr_tsv_as_df(
            args.data_folder + args.task + "/valid.tsv", args.sample_data,
            add_turn_separator)
        special_tokens_dict = {
            'additional_special_tokens': ['[UTTERANCE_SEP]', '[TURN_SEP]']
        }
        tokenizer.add_special_tokens(special_tokens_dict)
    ## Similar Question Retrieval and Passage Retrieval
    elif args.task in ["qqp", "linkso", "trec2020pr"]:
        if args.sample_data == -1: args.sample_data = None
        train = pd.read_csv(args.data_folder + args.task + "/train.tsv",
                            sep="\t",
                            nrows=args.sample_data)
        valid = pd.read_csv(args.data_folder + args.task + "/valid.tsv",
                            sep="\t",
                            nrows=args.sample_data)
    elif args.task == "scisumm":
        train, valid = preprocess_scisumm.transform_to_dfs(
            "../data/Training-Set-2019/Task1/From-Training-Set-2018/")
    elif args.task == "scisumm_ranked":
        train, valid, test = preprocess_scisumm_ranked.transform_to_dfs(
            args.path_to_ranked_file, args.path_to_ranked_test,
            args.path_to_ranked_dev)

    if args.task != "scisumm_ranked":
        test = valid  # only scisumm_ranked provides a separate test split; the other tasks reuse the validation set below

    # Choose the negative candidate sampler
    document_col = train.columns[1]
    ns_train = None
    ns_val = None
    if args.train_negative_sampler == 'random':
        ns_train = negative_sampling.RandomNegativeSampler(
            list(train[document_col].values), args.num_ns_train)
    elif args.train_negative_sampler == 'bm25':
        ns_train = negative_sampling.BM25NegativeSamplerPyserini(
            list(train[document_col].values), args.num_ns_train,
            args.data_folder + "/" + args.task + "/anserini_train/",
            args.sample_data, args.anserini_folder)
    elif args.train_negative_sampler == 'sentenceBERT':
        ns_train = negative_sampling.SentenceBERTNegativeSampler(
            list(train[document_col].values), args.num_ns_train,
            args.data_folder + "/" + args.task + "/train_sentenceBERTembeds",
            args.sample_data, args.bert_sentence_model)
    if args.test_negative_sampler == 'random':
        ns_val = negative_sampling.RandomNegativeSampler(
            list(valid[document_col].values) +
            list(train[document_col].values), args.num_ns_eval)
    elif args.test_negative_sampler == 'bm25':
        ns_val = negative_sampling.BM25NegativeSamplerPyserini(
            list(valid[document_col].values) +
            list(train[document_col].values), args.num_ns_eval,
            args.data_folder + "/" + args.task + "/anserini_valid/",
            args.sample_data, args.anserini_folder)
    elif args.test_negative_sampler == 'sentenceBERT':
        ns_val = negative_sampling.SentenceBERTNegativeSampler(
            list(valid[document_col].values) +
            list(train[document_col].values), args.num_ns_eval,
            args.data_folder + "/" + args.task + "/valid_sentenceBERTembeds",
            args.sample_data, args.bert_sentence_model)

    # Create the loaders for the datasets, with the respective negative samplers
    dataloader = dataset.QueryDocumentDataLoader(
        train, valid, test, tokenizer, ns_train, ns_val, 'classification',
        args.train_batch_size, args.val_batch_size, args.max_seq_len,
        args.sample_data, args.data_folder + "/" + args.task)
    if args.task == "scisumm_ranked":
        with_ranked_list = True
    else:
        with_ranked_list = False
    train_loader, val_loader, test_loader = dataloader.get_pytorch_dataloaders(
        with_ranked_list)

    # Instantiate transformer model to be used
    model = BertForSequenceClassification.from_pretrained(
        args.transformer_model)
    model.resize_token_embeddings(len(dataloader.tokenizer))

    # Instantiate trainer that handles fitting.
    trainer = transformer_trainer.TransformerTrainer(
        model, train_loader, val_loader, test_loader, args.num_ns_eval,
        "classification", tokenizer, args.validate_every_epochs,
        args.num_validation_instances, args.num_epochs, args.lr,
        args.sacred_ex)

    # Train
    model_name = model.__class__.__name__
    logging.info("Fitting {} for {}{}".format(model_name, args.data_folder,
                                              args.task))
    trainer.fit()

    # Predict for test
    logging.info("Predicting")
    preds, labels, doc_ids, all_queries, preds_without_acc = trainer.validate()
    res = results_analyses_tools.evaluate_and_aggregate(
        preds, labels, [
            'R_10@1', 'R_10@2', 'R_10@5', 'R_2@1', 'accuracy_0.3',
            'accuracy_0.3_upto_1', 'precision_0.3', 'recall_0.3',
            'f_score_0.3', 'accuracy_0.4', 'accuracy_0.4_upto_1',
            'precision_0.4', 'recall_0.4', 'f_score_0.4', 'accuracy_0.5',
            'accuracy_0.5_upto_1', 'precision_0.5', 'recall_0.5', 'f_score_0.5'
        ])
    for metric, v in res.items():
        logging.info("Test {} : {:4f}".format(metric, v))

    # Saving predictions and labels to a file
    max_preds_column = max([len(l) for l in preds])
    preds_df = pd.DataFrame(
        preds,
        columns=["prediction_" + str(i) for i in range(max_preds_column)])
    preds_df.to_csv(args.output_dir + "/" + args.run_id + "/predictions.csv",
                    index=False)

    labels_df = pd.DataFrame(
        labels, columns=["label_" + str(i) for i in range(max_preds_column)])
    labels_df.to_csv(args.output_dir + "/" + args.run_id + "/labels.csv",
                     index=False)

    new_preds = list((np.array(preds_without_acc) > 0.3).astype(int))
    d = {
        'query': all_queries,
        'doc_id': doc_ids,
        'label': new_preds,
        'similiarity': preds_without_acc
    }

    df_doc_ids = pd.DataFrame(d)
    df_doc_ids_ones = df_doc_ids[df_doc_ids['label'] == 1]
    df_doc_ids_ones = df_doc_ids_ones.groupby('query').agg(list).reset_index()
    df_doc_ids_non_ones = df_doc_ids.groupby('query').agg(list).reset_index()
    new_df = []
    for i, row in df_doc_ids_non_ones.iterrows():
        if all([v == 0 for v in row['label']]):
            # sort descending so index 0 is the most similar document; the original sorted
            # ascending, which would pick the least similar one despite the variable names
            highest_value = [
                x for _, x in sorted(zip(row['similiarity'], row['doc_id']),
                                     key=lambda pair: pair[0],
                                     reverse=True)
            ]
            highest_value_sim = sorted(row['similiarity'], reverse=True)

            row['label'] = [1]
            row['doc_id'] = [highest_value[0]]
            row['similiarity'] = [highest_value_sim[0]]

            new_df.append(row)

    result = pd.concat([df_doc_ids_ones, pd.DataFrame(new_df)])
    result.to_csv(args.output_dir + "/" + args.run_id + "/doc_ids_dev.csv",
                  index=False,
                  sep='\t')

    # predict on the test set
    preds, labels, doc_ids, all_queries, preds_without_acc = trainer.test()

    new_preds = list((np.array(preds_without_acc) > 0.3).astype(int))
    d = {
        'query': all_queries,
        'doc_id': doc_ids,
        'label': new_preds,
        'similiarity': preds_without_acc
    }

    df_doc_ids = pd.DataFrame(d)
    df_doc_ids_ones = df_doc_ids[df_doc_ids['label'] == 1]
    df_doc_ids_ones = df_doc_ids_ones.groupby('query').agg(list).reset_index()
    df_doc_ids_non_ones = df_doc_ids.groupby('query').agg(list).reset_index()
    new_df = []
    for i, row in df_doc_ids_non_ones.iterrows():
        if all([v == 0 for v in row['label']]):
            # sort descending so index 0 is the most similar document (see note above)
            highest_value = [
                x for _, x in sorted(zip(row['similiarity'], row['doc_id']),
                                     key=lambda pair: pair[0],
                                     reverse=True)
            ]
            highest_value_sim = sorted(row['similiarity'], reverse=True)

            row['label'] = [1]
            row['doc_id'] = [highest_value[0]]
            row['similiarity'] = [highest_value_sim[0]]

            new_df.append(row)

    result = pd.concat([df_doc_ids_ones, pd.DataFrame(new_df)])
    result.to_csv(args.output_dir + "/" + args.run_id + "/doc_ids_test.csv",
                  index=False,
                  sep='\t')

    # Saving model to a file
    if args.save_model:
        torch.save(model.state_dict(),
                   args.output_dir + "/" + args.run_id + "/model")

    # In case we want to get uncertainty estimations at prediction time
    if args.predict_with_uncertainty_estimation:
        logging.info("Predicting with dropout.")
        preds, uncertainties, labels, foward_passes_preds = trainer.test_with_dropout(
            args.num_foward_prediction_passes)
        res = results_analyses_tools.evaluate_and_aggregate(
            preds, labels, ['R_10@1'])
        for metric, v in res.items():
            logging.info(
                "Test (w. dropout and {} forward passes) {} : {:4f}".format(
                    args.num_foward_prediction_passes, metric, v))

        max_preds_column = max([len(l) for l in preds])
        preds_df = pd.DataFrame(
            preds,
            columns=["prediction_" + str(i) for i in range(max_preds_column)])
        preds_df.to_csv(args.output_dir + "/" + args.run_id +
                        "/predictions_with_dropout.csv",
                        index=False)

        for i, f_pass_preds in enumerate(foward_passes_preds):
            preds_df = pd.DataFrame(f_pass_preds,
                                    columns=[
                                        "prediction_" + str(i)
                                        for i in range(max_preds_column)
                                    ])
            preds_df.to_csv(
                args.output_dir + "/" + args.run_id +
                "/predictions_with_dropout_f_pass_{}.csv".format(i),
                index=False)

        labels_df = pd.DataFrame(
            labels,
            columns=["label_" + str(i) for i in range(max_preds_column)])
        labels_df.to_csv(args.output_dir + "/" + args.run_id + "/labels.csv",
                         index=False)

        uncertainties_df = pd.DataFrame(
            uncertainties,
            columns=["uncertainty_" + str(i) for i in range(max_preds_column)])
        uncertainties_df.to_csv(args.output_dir + "/" + args.run_id +
                                "/uncertainties.csv",
                                index=False)

    return trainer.best_ndcg

Example #6

def run_experiment(args):
    args.run_id = str(ex.current_run._id)

    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

    train, valid, test = preprocess_scisumm_ranked.transform_to_dfs(
        args.path_to_ranked_file, args.path_to_ranked_test,
        args.path_to_ranked_dev)

    # Choose the negative candidate sampler
    ns_train = None
    ns_val = None

    # Create the loaders for the datasets, with the respective negative samplers
    dataloader = dataset.QueryDocumentDataLoader(
        train, valid, test, tokenizer, ns_train, ns_val, 'classification',
        args.val_batch_size, args.val_batch_size, 512, 0,
        args.data_folder + "/scisumm_ranked")
    with_ranked_list = True
    train_loader, val_loader, test_loader = dataloader.get_pytorch_dataloaders(
        with_ranked_list)

    # Instantiate transformer model to be used
    model = BertForSequenceClassification.from_pretrained('bert-base-cased')
    model.resize_token_embeddings(len(dataloader.tokenizer))
    e = torch.load(args.model_dir)
    model.load_state_dict(e)

    model.eval()
    # Instantiate trainer that handles fitting.
    trainer = transformer_trainer.TransformerTrainer(model, train_loader,
                                                     val_loader, test_loader,
                                                     0, "classification",
                                                     tokenizer, False, 0, 0, 0,
                                                     0)
    # Predict for test
    logging.info("Predicting")
    preds, labels, doc_ids, all_queries, preds_without_acc = trainer.test()
    # res = results_analyses_tools.evaluate_and_aggregate(preds, labels, ['R_10@1',
    #                                                                     'R_10@2',
    #                                                                     'R_10@5',
    #                                                                     'R_2@1',
    #                                                                     'accuracy_0.3',
    #                                                                     'accuracy_0.3_upto_1',
    #                                                                     'precision_0.3',
    #                                                                     'recall_0.3',
    #                                                                     'f_score_0.3',
    #                                                                     'accuracy_0.4',
    #                                                                     'accuracy_0.4_upto_1',
    #                                                                     'precision_0.4',
    #                                                                     'recall_0.4',
    #                                                                     'f_score_0.4',
    #                                                                     'accuracy_0.5',
    #                                                                     'accuracy_0.5_upto_1',
    #                                                                     'precision_0.5',
    #                                                                     'recall_0.5',
    #                                                                     'f_score_0.5'
    #                                                                     ])
    # for metric, v in res.items():
    #     logging.info("Test {} : {:4f}".format(metric, v))

    # # Saving predictions and labels to a file
    # max_preds_column = max([len(l) for l in preds])
    # preds_df = pd.DataFrame(preds, columns=["prediction_" + str(i) for i in range(max_preds_column)])
    # preds_df.to_csv(args.output_dir + "/" + args.run_id + "/predictions.csv", index=False)
    #
    # labels_df = pd.DataFrame(labels, columns=["label_" + str(i) for i in range(max_preds_column)])
    # labels_df.to_csv(args.output_dir + "/" + args.run_id + "/labels.csv", index=False)

    # # predict on the test set
    # preds, labels, doc_ids, all_queries, preds_without_acc = trainer.test()

    new_preds = list((np.array(preds_without_acc) > 0.4).astype(int))
    d = {
        'query': all_queries,
        'doc_id': doc_ids,
        'label': new_preds,
        'similiarity': preds_without_acc
    }

    df_doc_ids = pd.DataFrame(d)
    df_doc_ids = df_doc_ids.groupby('query').agg(list).reset_index()
    # df_doc_ids_ones = df_doc_ids[df_doc_ids['label']==1]
    # df_doc_ids_ones = df_doc_ids_ones.groupby('query').agg(list).reset_index()
    # df_doc_ids_non_ones = df_doc_ids.groupby('query').agg(list).reset_index()
    # new_df=[]
    # for i,row in df_doc_ids_non_ones.iterrows():
    #     if all([v == 0 for v in row['label']]):
    #         highest_value=[x for _, x in sorted(zip(row['similiarity'], row['doc_id']), key=lambda pair: pair[0])]
    #         highest_value_sim=[x for x in sorted(row['similiarity'])]
    #
    #         row['label'] = [1]
    #         row[ 'doc_id'] = [highest_value[0]]
    #         row[ 'similiarity'] = [highest_value_sim[0]]
    #
    #         new_df.append(row)

    # result = pd.concat([df_doc_ids,pd.DataFrame(new_df)])

    df_doc_ids.to_csv(args.output_dir + "/" + args.run_id +
                      "/doc_ids_test_all_results.csv",
                      index=False,
                      sep='\t')

    return trainer.best_ndcg

Example #7

def run_experiment(args):
    args.run_id = str(ex.current_run._id)

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    tokenizer = BertTokenizer.from_pretrained(args.transformer_model)
    #Load datasets
    train = pd.read_csv(
        args.data_folder + args.task + "/train_test.tsv",
        sep="\t",
        nrows=args.sample_data if args.sample_data != -1 else None)
    valid = pd.read_csv(
        args.data_folder + args.task + "/valid_test.tsv",
        sep="\t",
        nrows=args.sample_data if args.sample_data != -1 else None)
    special_tokens_dict = {
        'additional_special_tokens': ['[UTTERANCE_SEP]', '[TURN_SEP]']
    }
    tokenizer.add_special_tokens(special_tokens_dict)

    #Choose the negative candidate sampler
    document_col = train.columns[1]
    ns_train = negative_sampling.BM25NegativeSamplerPyserini(
        list(train[document_col].values), args.num_ns_train,
        args.data_folder + args.task + "/anserini_train/", args.sample_data,
        args.anserini_folder)

    ns_val_random = negative_sampling.RandomNegativeSampler(
        list(valid[document_col].values) + list(train[document_col].values),
        args.num_ns_eval)
    ns_val_bm25 = negative_sampling.BM25NegativeSamplerPyserini(
        list(valid[document_col].values) + list(train[document_col].values),
        args.num_ns_eval, args.data_folder + args.task + "/anserini_valid/",
        args.sample_data, args.anserini_folder)
    ns_val_bert_sentence = negative_sampling.SentenceBERTNegativeSampler(
        list(valid[document_col].values) + list(train[document_col].values),
        args.num_ns_eval,
        args.data_folder + args.task + "/valid_sentenceBERTembeds",
        args.sample_data, args.bert_sentence_model)

    #Create the loaders for the datasets, with the respective negative samplers
    cross_ns_val = {}
    cross_ns_train = {}
    for (ns_name, ns_val) in [("random", ns_val_random), ("bm25", ns_val_bm25),
                              ("sentenceBERT", ns_val_bert_sentence)]:
        dataloader = dataset.QueryDocumentDataLoader(
            train, valid, valid, tokenizer, ns_train, ns_val, 'classification',
            args.train_batch_size, args.val_batch_size, args.max_seq_len,
            args.sample_data, args.data_folder + args.task)
        train_loader, val_loader, test_loader = dataloader.get_pytorch_dataloaders(
        )
        cross_ns_val[ns_name] = val_loader
        cross_ns_train[ns_name] = train_loader

    #Instantiate transformer model to be used
    model = BertForSequenceClassification.from_pretrained(
        args.transformer_model)
    model.resize_token_embeddings(len(dataloader.tokenizer))

    #Instantiate trainer that handles fitting.
    trainer = transformer_trainer.TransformerTrainer(
        model, cross_ns_train["bm25"], cross_ns_val["bm25"],
        cross_ns_val["bm25"], args.num_ns_eval, "classification", tokenizer,
        args.validate_every_epochs, args.num_validation_batches,
        args.num_epochs, args.lr, args.sacred_ex)

    #Train
    model_name = model.__class__.__name__
    logging.info("Fitting {} for {}{}".format(model_name, args.data_folder,
                                              args.task))
    trainer.fit()

    #Cross-NS predictions
    for ns_index, ns_name in enumerate(["random", "bm25", "sentenceBERT"]):
        logging.info("Predicting for NS {}".format(ns_name))
        os.makedirs(args.output_dir + "/" + str(int(args.run_id) + ns_index),
                    exist_ok=True)
        with open(
                args.output_dir + "/" + str(int(args.run_id) + ns_index) +
                "/config.json", "w") as f:
            config_w = {'args': vars(args)}
            config_w['args']['test_dataset'] = args.task
            config_w['args']['train_negative_sampler'] = 'bm25'
            config_w['args']['test_negative_sampler'] = ns_name
            if 'sacred_ex' in config_w['args']:
                del config_w['args']['sacred_ex']
            json.dump(config_w, f, indent=4)
        # preds, labels, softmax_logits = trainer.test()
        trainer.num_validation_batches = -1  # no sample
        preds, labels, softmax_logits = trainer.predict(cross_ns_val[ns_name])

        #Saving predictions and labels to a file
        max_preds_column = max([len(l) for l in preds])
        preds_df = pd.DataFrame(
            preds,
            columns=["prediction_" + str(i) for i in range(max_preds_column)])
        preds_df.to_csv(args.output_dir + "/" +
                        str(int(args.run_id) + ns_index) + "/predictions.csv",
                        index=False)

        softmax_df = pd.DataFrame(
            softmax_logits,
            columns=["prediction_" + str(i) for i in range(max_preds_column)])
        softmax_df.to_csv(args.output_dir + "/" +
                          str(int(args.run_id) + ns_index) +
                          "/predictions_softmax.csv",
                          index=False)

        labels_df = pd.DataFrame(
            labels,
            columns=["label_" + str(i) for i in range(max_preds_column)])
        labels_df.to_csv(args.output_dir + "/" +
                         str(int(args.run_id) + ns_index) + "/labels.csv",
                         index=False)

        #Saving model to a file
        if args.save_model:
            torch.save(
                model.state_dict(), args.output_dir + "/" +
                str(int(args.run_id) + ns_index) + "/model")

        #In case we want to get uncertainty estimations at prediction time
        if args.predict_with_uncertainty_estimation:
            logging.info("Predicting with dropout.")
            trainer.num_validation_batches = -1  # no sample
            preds, labels, softmax_logits, foward_passes_preds, uncertainties = \
                trainer.predict_with_uncertainty(cross_ns_val[ns_name], args.num_foward_prediction_passes)

            max_preds_column = max([len(l) for l in preds])
            preds_df = pd.DataFrame(preds,
                                    columns=[
                                        "prediction_" + str(i)
                                        for i in range(max_preds_column)
                                    ])
            preds_df.to_csv(args.output_dir + "/" +
                            str(int(args.run_id) + ns_index) +
                            "/predictions_with_dropout.csv",
                            index=False)

            softmax_df = pd.DataFrame(softmax_logits,
                                      columns=[
                                          "prediction_" + str(i)
                                          for i in range(max_preds_column)
                                      ])
            softmax_df.to_csv(args.output_dir + "/" +
                              str(int(args.run_id) + ns_index) +
                              "/predictions_with_dropout_softmax.csv",
                              index=False)

            for i, f_pass_preds in enumerate(foward_passes_preds):
                preds_df = pd.DataFrame(f_pass_preds,
                                        columns=[
                                            "prediction_" + str(i)
                                            for i in range(max_preds_column)
                                        ])
                preds_df.to_csv(
                    args.output_dir + "/" + str(int(args.run_id) + ns_index) +
                    "/predictions_with_dropout_f_pass_{}.csv".format(i),
                    index=False)

            labels_df = pd.DataFrame(
                labels,
                columns=["label_" + str(i) for i in range(max_preds_column)])
            labels_df.to_csv(args.output_dir + "/" +
                             str(int(args.run_id) + ns_index) + "/labels.csv",
                             index=False)

            uncertainties_df = pd.DataFrame(
                uncertainties,
                columns=[
                    "uncertainty_" + str(i) for i in range(max_preds_column)
                ])
            uncertainties_df.to_csv(args.output_dir + "/" +
                                    str(int(args.run_id) + ns_index) +
                                    "/uncertainties.csv",
                                    index=False)

    #Cross-dataset predictions
    cross_datasets = set(["msdialog", "ubuntu_dstc8", "mantis"]) - set(
        [args.task])
    cross_datasets = sorted(list(cross_datasets))
    cross_data_val_dataloader = {}
    for cross_task in cross_datasets:
        # add_turn_separator is not defined earlier in this function; derive it per task as in
        # the other examples (Ubuntu data has several utterances from the same user in the context)
        add_turn_separator = (cross_task != "ubuntu_dstc8")
        train_cross = preprocess_crr.read_crr_tsv_as_df(
            args.data_folder + cross_task + "/train.tsv", args.sample_data,
            add_turn_separator)
        valid_cross = preprocess_crr.read_crr_tsv_as_df(
            args.data_folder + cross_task + "/valid.tsv", args.sample_data,
            add_turn_separator)
        ns_train_cross = negative_sampling.BM25NegativeSamplerPyserini(
            list(train_cross[document_col].values), args.num_ns_train,
            args.data_folder + cross_task + "/anserini_train/",
            args.sample_data, args.anserini_folder)
        ns_val_bm25_cross = negative_sampling.BM25NegativeSamplerPyserini(
            list(valid_cross[document_col].values) +
            list(train_cross[document_col].values), args.num_ns_eval,
            args.data_folder + cross_task + "/anserini_valid/",
            args.sample_data, args.anserini_folder)
        dataloader = dataset.QueryDocumentDataLoader(
            train_cross, valid_cross, valid_cross, tokenizer, ns_train_cross,
            ns_val_bm25_cross, 'classification', args.train_batch_size,
            args.val_batch_size, args.max_seq_len, args.sample_data,
            args.data_folder + cross_task)
        _, val_loader, _ = dataloader.get_pytorch_dataloaders()
        cross_data_val_dataloader[cross_task] = val_loader

    for task_index, cross_task in enumerate(cross_datasets):
        logging.info("Predicting for dataset {}".format(cross_task))
        os.makedirs(args.output_dir + "/" +
                    str(int(args.run_id) + ns_index + task_index + 1),
                    exist_ok=True)
        with open(
                args.output_dir + "/" +
                str(int(args.run_id) + ns_index + task_index + 1) +
                "/config.json", "w") as f:
            config_w = {'args': vars(args)}
            config_w['args']['test_dataset'] = cross_task
            config_w['args']['train_negative_sampler'] = 'bm25'
            config_w['args']['test_negative_sampler'] = 'bm25'
            if 'sacred_ex' in config_w['args']:
                del config_w['args']['sacred_ex']
            json.dump(config_w, f, indent=4)
        # preds, labels, softmax_logits = trainer.test()
        trainer.num_validation_batches = -1  # no sample
        preds, labels, softmax_logits = trainer.predict(
            cross_data_val_dataloader[cross_task])

        #Saving predictions and labels to a file
        max_preds_column = max([len(l) for l in preds])
        preds_df = pd.DataFrame(
            preds,
            columns=["prediction_" + str(i) for i in range(max_preds_column)])
        preds_df.to_csv(args.output_dir + "/" +
                        str(int(args.run_id) + ns_index + task_index + 1) +
                        "/predictions.csv",
                        index=False)

        softmax_df = pd.DataFrame(
            softmax_logits,
            columns=["prediction_" + str(i) for i in range(max_preds_column)])
        softmax_df.to_csv(args.output_dir + "/" +
                          str(int(args.run_id) + ns_index + task_index + 1) +
                          "/predictions_softmax.csv",
                          index=False)

        labels_df = pd.DataFrame(
            labels,
            columns=["label_" + str(i) for i in range(max_preds_column)])
        labels_df.to_csv(args.output_dir + "/" +
                         str(int(args.run_id) + ns_index + task_index + 1) +
                         "/labels.csv",
                         index=False)

        #Saving model to a file
        if args.save_model:
            torch.save(
                model.state_dict(), args.output_dir + "/" +
                str(int(args.run_id) + ns_index + task_index + 1) + "/model")

        #In case we want to get uncertainty estimations at prediction time
        if args.predict_with_uncertainty_estimation:
            logging.info("Predicting with dropout.")
            preds, labels, softmax_logits, foward_passes_preds, uncertainties = \
                trainer.predict_with_uncertainty(cross_data_val_dataloader[cross_task], args.num_foward_prediction_passes)

            max_preds_column = max([len(l) for l in preds])
            preds_df = pd.DataFrame(preds,
                                    columns=[
                                        "prediction_" + str(i)
                                        for i in range(max_preds_column)
                                    ])
            preds_df.to_csv(args.output_dir + "/" +
                            str(int(args.run_id) + ns_index + task_index + 1) +
                            "/predictions_with_dropout.csv",
                            index=False)

            softmax_df = pd.DataFrame(softmax_logits,
                                      columns=[
                                          "prediction_" + str(i)
                                          for i in range(max_preds_column)
                                      ])
            softmax_df.to_csv(
                args.output_dir + "/" +
                str(int(args.run_id) + ns_index + task_index + 1) +
                "/predictions_with_dropout_softmax.csv",
                index=False)

            for i, f_pass_preds in enumerate(foward_passes_preds):
                preds_df = pd.DataFrame(f_pass_preds,
                                        columns=[
                                            "prediction_" + str(i)
                                            for i in range(max_preds_column)
                                        ])
                preds_df.to_csv(
                    args.output_dir + "/" +
                    str(int(args.run_id) + ns_index + task_index + 1) +
                    "/predictions_with_dropout_f_pass_{}.csv".format(i),
                    index=False)

            labels_df = pd.DataFrame(
                labels,
                columns=["label_" + str(i) for i in range(max_preds_column)])
            labels_df.to_csv(
                args.output_dir + "/" +
                str(int(args.run_id) + ns_index + task_index + 1) +
                "/labels.csv",
                index=False)

            uncertainties_df = pd.DataFrame(
                uncertainties,
                columns=[
                    "uncertainty_" + str(i) for i in range(max_preds_column)
                ])
            uncertainties_df.to_csv(
                args.output_dir + "/" +
                str(int(args.run_id) + ns_index + task_index + 1) +
                "/uncertainties.csv",
                index=False)
    return 0.0