Example #1
def bert4ir_score(model, dataset, batch_size=32):
    import warnings
    import numpy as np
    import torch
    from torch.utils.data import DataLoader
    from tqdm import tqdm
    if torch.cuda.is_available():
        # Assign the model to GPUs, specifying to use Data parallelism.
        model = torch.nn.DataParallel(model, device_ids=GPUS_TO_USE)
        parallel = len(GPUS_TO_USE)
        # The main model should be on the first GPU
        device = torch.device(f"cuda:{GPUS_TO_USE[0]}")
        model.to(device)
        # For a 1080Ti, 16 samples fit comfortably on a GPU, so the train batch size
        # would be 16 * the number of GPUs. (Note: the DataLoader below uses the
        # batch_size argument, not this value.)
        train_batch_size = parallel * 16
        print(f"running on {parallel} GPUS, on {train_batch_size}-sized batches")
    else:
        print("Are you sure about this? We will try to run this on the CPU, but it's a BAD idea...")
        device = torch.device("cpu")
        train_batch_size = 16
        model.to(device)
        parallel = number_of_cpus

    preds = None
    nb_eval_steps = 0
    data_loader = DataLoader(dataset,
                             batch_size=batch_size,
                             num_workers=number_of_cpus,
                             shuffle=False)
    for batch in tqdm(data_loader, desc="Scoring batch"):
        model.eval()

        with torch.no_grad():  # Avoid computing gradients here
            inputs = {
                'input_ids': batch[0].to(device),
                'attention_mask': batch[1].to(device),
                #'labels': batch[3].to(device)
            }

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                outputs = model(**inputs)
            # Logits are the model's raw output scores.
            logits = outputs[0]
            # Take the first column of the logits as the score for each sample.
            logits = logits[:, 0]
            nb_eval_steps += 1
            # Concatenate all outputs to one big score array.
            if preds is None:
                # Move the predictions to a flat numpy array.
                preds = logits.detach().cpu().numpy().flatten()
            else:
                batch_predictions = logits.detach().cpu().numpy().flatten()
                preds = np.append(preds, batch_predictions, axis=0)
    return preds
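A minimal usage sketch for bert4ir_score. The module-level names GPUS_TO_USE and number_of_cpus are read by the function itself; the model name, queries and documents below are illustrative assumptions, not part of the original code, and the pairs are packed into a plain TensorDataset rather than the Dataset class from Example #9.

import torch
from torch.utils.data import TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Globals that bert4ir_score expects to find in the enclosing module.
GPUS_TO_USE = [0]
number_of_cpus = 2

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

# Two toy (query, document) pairs; batch[0] is input_ids, batch[1] is attention_mask.
enc = tokenizer(["chemical reactions"] * 2,
                ["text of document one", "text of document two"],
                padding=True, truncation=True, return_tensors="pt")
dataset = TensorDataset(enc["input_ids"], enc["attention_mask"])

scores = bert4ir_score(model, dataset, batch_size=2)
print(scores)  # one score per (query, document) pair, in dataset order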
Example #2
    def our_rerank_with_embeddings(self, qembs, pids, weightsQ=None, gpu=True):
        """
        input: qid,query, docid, query_tokens, query_embeddings, query_weights 
        
        output: qid, query, docid, score
        """
        colbert = self.args.colbert
        inference = self.inference
        # default is uniform weight for all query embeddings
        if weightsQ is None:
            weightsQ = torch.ones(len(qembs))
        # make into a 3D tensor (1 x query_len x dim)
        Q = torch.unsqueeze(qembs, 0)
        if gpu:
            Q = Q.cuda()
        #weightE = weightE(Q,E) # calculate the mean_cos score of each expansion term with all query term, the softmax normalised as the weight of the expansion term

        if self.verbose:
            pid_iter = tqdm(pids, desc="lookups", unit="d")
        else:
            pid_iter = pids

        D_ = torch.zeros(len(pids), self.doc_maxlen, self.dim)
        for offset, pid in enumerate(pid_iter):
            self.get_embedding_copy(pid, D_, offset)
        if gpu:
            D_ = D_.cuda()
        # MaxSim: for each query embedding, take the maximum similarity over all
        # document token embeddings, then a weighted sum over the query embeddings.
        maxscoreQ = (Q @ D_.permute(0, 2, 1)).max(2).values.cpu()
        scores = (weightsQ * maxscoreQ).sum(1).cpu()
        return scores.tolist()
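The last two lines above are ColBERT's MaxSim scoring: each query embedding is matched to its most similar document token embedding, and the per-query maxima are combined with the weights. A toy sketch with random tensors, purely illustrative of the tensor shapes involved (the sizes below are assumptions):

import torch

# 1 query with 4 token embeddings, 3 documents with 5 token embeddings each, dim 8.
num_docs, doc_maxlen, dim, qlen = 3, 5, 8, 4
Q = torch.nn.functional.normalize(torch.randn(1, qlen, dim), dim=-1)
D = torch.nn.functional.normalize(torch.randn(num_docs, doc_maxlen, dim), dim=-1)
weightsQ = torch.ones(qlen)

# (num_docs x qlen x doc_maxlen) similarities; max over document tokens,
# then a weighted sum over query tokens gives one score per document.
maxsim = (Q @ D.permute(0, 2, 1)).max(2).values
scores = (weightsQ * maxsim).sum(1)
print(scores.tolist())  # 3 scores, one per document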
Example #3
def train_neg_sampling(res_with_labels, neg_ratio):
    '''
        This implements the negative sampling found in Arthur's ECIR 2020 paper:
        We fine-tuned our BERT10 model with 10 negative samples for each positive
        sample from the training dataset, randomly picked from the top-100 retrieved
        from QL.
    '''
    import pandas as pd

    qid_groups = res_with_labels.groupby("qid")

    keeping_dfs = []
    for qid, queryDf in tqdm(qid_groups,
                             desc="Negative sampling",
                             total=qid_groups.ngroups,
                             unit="q"):

        pos = queryDf[queryDf["label"] >= 1]
        neg = queryDf[queryDf["label"] < 1]
        num_pos = len(pos)
        num_neg = len(neg)
        num_neg_needed = num_pos * neg_ratio
        #print("qid %s num_pos %d num_neg %d num_neg needed %d" % (qid, num_pos, num_neg, num_neg_needed))

        if num_neg > num_neg_needed:
            neg = neg.sample(num_neg_needed)
        keeping_dfs.append(pos)
        keeping_dfs.append(neg)

    # we keep all positives and the sampled negatives
    rtr = pd.concat(keeping_dfs)
    # ensure labels are ints
    rtr["label"] = rtr["label"].astype(int)
    return rtr
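A small usage sketch for train_neg_sampling, assuming it is defined in a module where tqdm and pandas are available; the toy DataFrame below is an illustrative stand-in for a PyTerrier results frame with relevance labels:

import pandas as pd

# One query with two relevant (label 1) and four non-relevant (label 0) documents.
res_with_labels = pd.DataFrame({
    "qid":   ["q1"] * 6,
    "docno": ["d%d" % i for i in range(6)],
    "label": [1, 1, 0, 0, 0, 0],
})

# Keep all positives and sample at most neg_ratio negatives per positive,
# i.e. 2 positives plus 2 sampled negatives here.
sampled = train_neg_sampling(res_with_labels, neg_ratio=1)
print(sampled[["qid", "docno", "label"]])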
Example #4
 def _single_retrieve(queries_df):
     rtr = []
     iter = queries_df.itertuples()
     iter = tqdm(iter, unit="q") if verbose else iter
     for row in iter:
         qid = row.qid
         with torch.no_grad():
             Q, ids, masks = self.args.inference.queryFromText(
                 [row.query], bsize=512, with_ids=True)
         Q_f = Q[0:1, :, :]
         all_pids = faiss_index.retrieve(faiss_depth,
                                         Q_f,
                                         verbose=verbose)
         for passage_ids in all_pids:
             if verbose:
                 print("qid %s retrieved docs %d" %
                       (qid, len(passage_ids)))
             for pid in passage_ids:
                 rtr.append(
                     [qid, row.query, pid, ids[0], Q[0, :, :].cpu()])
     return self._add_docnos(
         pd.DataFrame(rtr,
                      columns=[
                          "qid", "query", 'docid', 'query_toks',
                          'query_embs'
                      ]))
Example #5
 def _single_retrieve_qembs(queries_df):
     rtr = []
     iter = queries_df.itertuples()
     iter = tqdm(iter, unit="q") if verbose else iter
     for row in iter:
         qid = row.qid
         embs = row.query_embs
         Q_f = torch.unsqueeze(embs, 0)
         all_pids = faiss_index.retrieve(faiss_depth,
                                         Q_f,
                                         verbose=verbose)
         for passage_ids in all_pids:
             if verbose:
                 print("qid %s retrieved docs %d" %
                       (qid, len(passage_ids)))
             for pid in passage_ids:
                 rtr.append([
                     qid, row.query, pid, row.query_toks, row.query_embs
                 ])
     return self._add_docnos(
         pd.DataFrame(rtr,
                      columns=[
                          "qid", "query", 'docid', 'query_toks',
                          'query_embs'
                      ]))
Example #6
    def __init__(self,
                 checkpoint_path=None,
                 index_path=None,
                 cpu_index=None,
                 passage_embedding2id=None,
                 docid2docno=None,
                 num_results=100,
                 **kwargs):
        self.args = type('', (), {})()  # simple empty namespace to hold arguments
        args = self.args
        args.local_rank = -1
        args.model_type = 'rdot_nll'
        args.cache_dir = None
        args.no_cuda = False
        args.max_query_length = 64
        args.max_seq_length = 128
        args.per_gpu_eval_batch_size = 128
        args.device = torch.device("cuda" if torch.cuda.is_available()
                                   and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
        args.__dict__.update(kwargs)

        self.checkpoint_path = checkpoint_path
        self.num_results = num_results
        from pyterrier import tqdm

        #faiss.omp_set_num_threads(16)

        config, tokenizer, model = _load_model(self.args, self.checkpoint_path)
        self.model = model
        self.tokenizer = tokenizer
        if index_path is not None:
            print("Loading shard metadata")
            shards_files = os.path.join(index_path, "shards.pkl")
            with pt.io.autoopen(shards_files) as f:
                self.shard_sizes = pickle.load(f)
                self.docid2docno = pickle.load(f)
            self.segments = len(self.shard_sizes)
            self.cpu_index = []
            self.shard_offsets = []
            self.passage_embedding2id = []
            offset = 0
            for i, shard_size in enumerate(
                    tqdm(self.shard_sizes, desc="Loading shards",
                         unit="shard")):
                faiss_file = os.path.join(index_path, str(i) + ".faiss")
                lookup_file = os.path.join(index_path, str(i) + ".docids.pkl")
                index = faiss.read_index(faiss_file)
                self.cpu_index.append(index)
                self.shard_offsets.append(offset)
                offset += shard_size
                with pt.io.autoopen(lookup_file) as f:
                    self.passage_embedding2id.append(pickle.load(f))
        else:
            self.cpu_index = cpu_index
            self.passage_embedding2id = passage_embedding2id
            self.docid2docno = docid2docno
Example #7
    def applyPassaging(self, df, labels=True):
        newRows=[]
        labelCount=defaultdict(int)
        p = re.compile(r"\s+")
        currentQid=None
        rank=0
        copy_columns=[]
        for col in ["score", "rank"]:
            if col in df.columns:
                copy_columns.append(col)

        if len(df) == 0:
            return pd.DataFrame(columns=['qid', 'query', 'docno', self.text_attr, 'score', 'rank'])
    
        from pyterrier import tqdm
        with tqdm(total=len(df), ncols=80, desc='passaging', leave=False) as pbar:
            for index, row in df.iterrows():
                pbar.update(1)
                qid = row['qid']
                if currentQid is None or currentQid != qid:
                    rank=0
                    currentQid = qid
                rank+=1
                toks = p.split(row[self.text_attr])
                if len(toks) < self.passage_length:
                    newRow = row.copy()
                    newRow['docno'] = row['docno'] + "%p0"
                    newRow[self.text_attr] = ' '.join(toks)
                    if self.prepend_title:
                        newRow.drop(labels=[self.title_attr], inplace=True)
                        newRow[self.text_attr] = str(row[self.title_attr]) + self.join + newRow[self.text_attr]
                    if labels:
                        labelCount[row['label']] += 1
                    for col in copy_columns:
                        newRow[col] = row[col]
                    newRows.append(newRow)
                else:
                    passageCount=0
                    for i, passage in enumerate( slidingWindow(toks, self.passage_length, self.passage_stride)):
                        newRow = row.copy()
                        newRow['docno'] = row['docno'] + "%p" + str(i)
                        newRow[self.text_attr] = ' '.join(passage)
                        if self.prepend_title:
                            newRow.drop(labels=[self.title_attr], inplace=True)
                            newRow[self.text_attr] = str(row[self.title_attr]) + self.join + newRow[self.text_attr]
                        for col in copy_columns:
                            newRow[col] = row[col]
                        if labels:
                            labelCount[row['label']] += 1
                        newRows.append(newRow)
                        passageCount+=1
        newDF = pd.DataFrame(newRows)
        newDF['query'] = newDF['query'].fillna('')
        newDF[self.text_attr] = newDF[self.text_attr].fillna('')
        newDF['qid'] = newDF['qid'].fillna('')
        newDF.reset_index(inplace=True,drop=True)
        return newDF
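applyPassaging relies on a slidingWindow helper that is not shown in this snippet. A minimal sketch of the behaviour it is assumed to have, producing overlapping windows of passage_length tokens that advance by passage_stride tokens (the real implementation may differ in how it handles the final partial window):

def slidingWindow(sequence, winsize, step):
    # Return successive windows of `winsize` tokens, advancing `step` tokens at a
    # time, so consecutive passages overlap whenever step < winsize.
    return [sequence[i:i + winsize] for i in range(0, len(sequence), step)]

# slidingWindow(list("abcdefgh"), 4, 2) ->
# [['a','b','c','d'], ['c','d','e','f'], ['e','f','g','h'], ['g','h']]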
Example #8
 def _add_attr(df):
   iter = chunked(df.itertuples(), self.batch_size)
   if self.verbose:
     iter = pt.tqdm(iter, total=len(df)/self.batch_size, unit='d')
   output=[]
   for batch_rows in iter:
     docs = [getattr(row, self.doc_attr) for row in batch_rows]
     gens = self._doc2query(docs)
     output.extend(gens)
   df[self.out_attr] = output
   return df
Example #9
 def __init__(self,
              df,
              tokenizer,
              *args,
              split,
              get_doc_fn,
              tokenizer_batch=8000):
      '''Initialize a Dataset object.
      Arguments:
          df: a pandas DataFrame of samples, with columns qid, query, docno and (optionally) label
          tokenizer: a tokenizer object from Hugging Face's tokenizers lib (it needs to implement encode_batch())
          split: a name for this dataset
          get_doc_fn: a function that maps a row to the text of the document
          tokenizer_batch: how many samples to tokenize at once with the tokenizer object
      '''
     self.tokenizer = tokenizer
     tokenizer.padding_side = "right"
     print("Loading and tokenizing %s dataset of %d rows ..." %
           (split, len(df)))
     assert len(df) > 0
     self.labels_present = "label" in df.columns
     query_batch = []
     doc_batch = []
     sample_ids_batch = []
     labels_batch = []
     self.store = {}
     self.processed_samples = 0
     number_of_batches = math.ceil(len(df) / tokenizer_batch)
     assert number_of_batches > 0
     with tqdm(total=len(df), desc="Tokenizer input",
               unit="d") as batch_pbar:
         i = 0
         for indx, row in df.iterrows():
             query_batch.append(row["query"])
             doc_batch.append(get_doc_fn(row))
             sample_ids_batch.append(row["qid"] + "_" + row["docno"])
             if self.labels_present:
                 labels_batch.append(row["label"])
             else:
                  # we don't have a label, so append 0 to avoid special-casing it elsewhere.
                 labels_batch.append(0)
             if len(query_batch) == tokenizer_batch or i == len(df) - 1:
                 self._tokenize_and_dump_batch(doc_batch, query_batch,
                                               labels_batch,
                                               sample_ids_batch)
                 query_batch = []
                 doc_batch = []
                 sample_ids_batch = []
                 labels_batch = []
             i += 1
             batch_pbar.update()
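A hypothetical construction of this dataset, to make the expected inputs concrete. The class name TokenizedDataset is a placeholder (the real class name is not shown in the snippet), the vocabulary file path is illustrative, and any tokenizer exposing encode_batch() as per the docstring would do:

import pandas as pd
from tokenizers import BertWordPieceTokenizer

df = pd.DataFrame({
    "qid":   ["q1", "q1"],
    "query": ["chemical reactions", "chemical reactions"],
    "docno": ["d1", "d2"],
    "text":  ["text of document one", "text of document two"],
    "label": [1, 0],
})

# Any tokenizer implementing encode_batch(); the vocab file path is a placeholder.
tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)

dataset = TokenizedDataset(df, tokenizer,
                           split="train",
                           get_doc_fn=lambda row: row["text"],
                           tokenizer_batch=1000)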
Example #10
 def convert_gen(iterator):
     import pyterrier as pt
     nonlocal docnos
     nonlocal docid
     if self.num_docs is not None:
         iterator = pt.tqdm(iterator,
                            total=self.num_docs,
                            desc="encoding",
                            unit="d")
      for doc in iterator:
          doc["docid"] = docid
          docnos.append(doc['docno'])
          docid += 1
          yield doc
Example #11
 def _text_scorer(queries_and_docs):
     groupby = queries_and_docs.groupby("qid")
     rtr = []
     with torch.no_grad():
         for qid, group in tqdm(groupby, total=len(groupby),
                                unit="q") if verbose else groupby:
             query = group["query"].values[0]
             ranking = slow_rerank(self.args, query,
                                   group["docno"].values,
                                   group[doc_attr].values.tolist())
             for rank, (score, pid, passage) in enumerate(ranking):
                 rtr.append([qid, query, pid, score, rank])
     return pd.DataFrame(
         rtr, columns=["qid", "query", "docno", "score", "rank"])
Example #12
 def _add_attr(df):
   iter = chunked(df.itertuples(), self.batch_size)
   if self.verbose:
     iter = pt.tqdm(iter, total=len(df)/self.batch_size, unit='d')
   output=[]
   for batch_rows in iter:
     docs = [getattr(row, self.doc_attr) for row in batch_rows]
     gens = self._doc2query(docs)
     if self.append:
       gens = [f'{getattr(row, self.doc_attr)} {gen}' for row, gen in zip(batch_rows, gens)]
     output.extend(gens)
   if self.append:
     df[self.doc_attr] = output # replace doc content
   else:
     df[self.out_attr] = output # add new column
   return df
Example #13
    def transform(self, topics):
        from pyterrier import tqdm
        queries = []
        qid2q = {}
        for q, qid in zip(topics["query"].to_list(), topics["qid"].to_list()):
            passage = self.tokenizer.encode(
                q,
                add_special_tokens=True,
                max_length=self.args.max_seq_length,
            )

            passage_len = min(len(passage), self.args.max_query_length)
            input_id_b = pad_input_ids(passage, self.args.max_query_length)
            queries.append([passage_len, input_id_b])
            qid2q[qid] = q

        print("***** inference of %d queries *****" % len(queries))
        dev_query_embedding, dev_query_embedding2id = StreamInferenceDoc(
            self.args,
            self.model,
            GetProcessingFn(self.args, query=True),
            "transform",
            queries,
            is_query_inference=True)

        print("***** faiss search for %d queries on %d shards *****" %
              (len(queries), self.segments))
        rtr = []
        for i, offset in enumerate(tqdm(self.shard_offsets, unit="shard")):
            scores, neighbours = self.cpu_index[i].search(
                dev_query_embedding, self.num_results)
            res = self._calc_scores(topics["qid"].values,
                                    self.passage_embedding2id[i],
                                    neighbours,
                                    scores,
                                    num_results=self.num_results,
                                    offset=offset,
                                    qid2q=qid2q)
            rtr.append(res)
        rtr = pd.concat(rtr)
        rtr = add_ranks(rtr)
        rtr = rtr[rtr["rank"] < self.num_results]
        rtr = rtr.sort_values(by=["qid", "score", "docno"],
                              ascending=[True, False, True])
        return rtr
Example #14
 def transform(self, queries_and_docs):
     groupby = queries_and_docs.groupby("qid")
     rtr = []
     with torch.no_grad():
         for qid, group in tqdm(
                 groupby, total=len(groupby), desc='colbert',
                 unit="q") if self.verbose else groupby:
             query = group["query"].values[0]
             ranking = rerank(self.args,
                              query,
                              group["docno"].values,
                              group[self.doc_attr].values,
                              index=None)
             for rank, (score, pid, passage) in enumerate(ranking):
                 rtr.append([qid, query, pid, score, rank])
     return add_ranks(
         pd.DataFrame(rtr,
                      columns=["qid", "query", "docno", "score", "rank"]))
Example #15
        def gen_tokenize():
            text_attr = self.text_attr
            kwargs = {}
            if self.num_docs is not None:
                kwargs['total'] = self.num_docs
            for doc in (pt.tqdm(generator, desc="Indexing", unit="d", **kwargs)
                        if self.verbose else generator):
                contents = doc[text_attr]
                docid2docno.append(doc["docno"])

                passage = tokenizer.encode(
                    contents,
                    add_special_tokens=True,
                    max_length=self.args.max_seq_length,
                )
                passage_len = min(len(passage), self.args.max_seq_length)
                input_id_b = pad_input_ids(passage, self.args.max_seq_length)
                yield passage_len, input_id_b
Example #16
    def our_rerank(self, query, pids, gpu=True):
        colbert = self.args.colbert
        inference = self.inference

        Q = inference.queryFromText([query])
        if self.verbose:
            pid_iter = tqdm(pids, desc="lookups", unit="d")
        else:
            pid_iter = pids

        D_ = torch.zeros(len(pids), self.doc_maxlen, self.dim)
        for offset, pid in enumerate(pid_iter):
            self.get_embedding_copy(pid, D_, offset)

        if gpu:
            D_ = D_.cuda()

        scores = colbert.score(Q, D_).cpu()
        del D_
        return scores.tolist()
Example #17
    def transform(self, topics_and_res):
        import pandas as pd
        rtr = []
        grouper = topics_and_res.groupby("qid")
        from pyterrier import tqdm, started
        assert started()

        #for each query, get the results, and pass to _for_each_query
        for qid, group in tqdm(grouper, desc="BERTQE",
                               unit="q") if self.verbose else grouper:
            query = group["query"].iloc[0]
            scores = self._for_each_query(qid, query,
                                          group[["docno", self.body_attr]])

            # assign the scores to the input documents
            for i, s in enumerate(scores.tolist()):
                rtr.append([qid, query, group.iloc[i]["docno"], s])

        # returns the final dataframe
        df = pd.DataFrame(rtr, columns=["qid", "query", "docno", "score"])
        return add_ranks(df)
Example #18
    def _load_parts(index_path, part_doclens, memtype="mmap"):
        # Every pt file is loaded and managed independently, with local pids
        _, all_parts_paths, _ = get_parts(index_path)

        if memtype == "mmap":
            all_parts_paths = [
                file.replace(".pt", ".store") for file in all_parts_paths
            ]
            mmaps = [
                file_part_mmap(path, doclens)
                for path, doclens in zip(all_parts_paths, part_doclens)
            ]
        elif memtype == "mem":
            mmaps = [
                file_part_mem(path, doclens) for path, doclens in tqdm(
                    zip(all_parts_paths, part_doclens),
                    total=len(all_parts_paths),
                    desc="Loading index shards to memory",
                    unit="shard")
            ]
        else:
            assert False, "Unknown memtype %s" % memtype
        return mmaps
Example #19
def train_bert4ir(model, train_dataset, dev_dataset):
    import torch
    from torch.utils.data import DataLoader
    from tqdm import tqdm
    # AdamW and get_linear_schedule_with_warmup are assumed to come from
    # transformers here (torch.optim.AdamW would also work for the optimizer).
    from transformers import AdamW, get_linear_schedule_with_warmup

    if torch.cuda.is_available():
        # Assign the model to GPUs, specifying to use Data parallelism.
        model = torch.nn.DataParallel(model, device_ids=GPUS_TO_USE)
        parallel = len(GPUS_TO_USE)
        # The main model should be on the first GPU
        device = torch.device(f"cuda:{GPUS_TO_USE[0]}")
        model.to(device)
        # For a 1080Ti, 16 samples fit on a GPU comfortably. So, the train batch size will be 16*the number of GPUS
        train_batch_size = parallel * 16
        print(
            f"running on {parallel} GPUS, on {train_batch_size}-sized batches")
    else:
        print("Are you sure about this? We will try to run this on the CPU, but it's a BAD idea...")
        device = torch.device("cpu")
        train_batch_size = 16
        model.to(device)
        # On CPU there is a single copy of the model, so no per-device loss averaging is needed.
        parallel = 1

    # A DataLoader conveniently generates batches for you.
    # It accepts any object that implements __getitem__(self, idx) and __len__(self).
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=train_batch_size,
                                   num_workers=number_of_cpus,
                                   shuffle=True)
    dev_data_loader = DataLoader(dev_dataset,
                                 batch_size=32,
                                 num_workers=number_of_cpus,
                                 shuffle=True)

    # How many optimization steps to run in total: the length of the DataLoader is the number of batches per epoch.
    num_train_optimization_steps = len(train_data_loader) * n_epochs

    # parameters that will not have weight decay applied during training
    no_decay = ['bias', 'LayerNorm.weight']

    # all parameters to be optimized during fine-tuning, grouped by weight-decay treatment
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': weight_decay
        },
        {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]

    # We use the AdamW optimizer here.
    optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=1e-8)

    # Number of warm-up steps before the learning rate starts to decay linearly.
    warmup_steps = num_train_optimization_steps * warmup_proportion
    # A scheduler to take care of the above.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=num_train_optimization_steps)
    print(
        f"*********Total optimization steps: {num_train_optimization_steps}*********"
    )

    import warnings
    import numpy as np
    import datetime

    from sklearn.metrics import f1_score, average_precision_score, accuracy_score, roc_auc_score

    global_step = 0  # Number of steps performed so far
    tr_loss = 0.0  # Training loss
    model.zero_grad()  # Initialize gradients to 0

    for _ in tqdm(range(n_epochs), desc="Epochs"):
        for step, batch in tqdm(enumerate(train_data_loader),
                                desc="Batches",
                                total=len(train_data_loader)):
            model.train()
            # get the batch inputs
            inputs = {
                'input_ids': batch[0].to(device),
                'attention_mask': batch[1].to(device),
                'labels': batch[3].to(device)
            }
            # Run through the network.

            with warnings.catch_warnings():
                # There is a very annoying warning here when we are using multiple GPUS,
                # As described here: https://github.com/huggingface/transformers/issues/852.
                # We can safely ignore this.
                warnings.simplefilter("ignore")
                outputs = model(**inputs)
            loss = outputs[0]

            loss = loss.sum() / parallel  # Average over all GPUs/CPUs.

            # Backward pass on the network
            loss.backward()
            tr_loss += loss.item()
            # Clip gradients to avoid gradient explosion when the gradient is too large.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Run the optimizer with the gradients
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            if step % steps_to_print == 0:
                # Logits are the raw outputs of the network: one score per class
                # (relevant / non-relevant). You can check the shape with logits.shape.
                logits = outputs[1]
                # Move the logits to the CPU as a numpy array; easier to inspect.
                preds = logits.detach().cpu().numpy()

                tqdm.write(
                    f"Training loss: {loss.item()} Learning Rate: {scheduler.get_last_lr()[0]}"
                )
            global_step += 1

            # Run an evaluation step over the dev dataset. Let's see how we are doing.
            if global_step % steps_to_eval == 0:
                eval_loss = 0.0
                nb_eval_steps = 0
                preds = None
                out_label_ids = None
                for batch in tqdm(dev_data_loader, desc="Valid batch"):
                    model.eval()
                    with torch.no_grad():  # Avoid computing gradients here
                        inputs = {
                            'input_ids': batch[0].to(device),
                            'attention_mask': batch[1].to(device),
                            'labels': batch[3].to(device)
                        }
                        with warnings.catch_warnings():
                            warnings.simplefilter("ignore")
                            outputs = model(**inputs)
                        # The loss and the logits are the first two model outputs.
                        tmp_eval_loss, logits = outputs[:2]
                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_steps += 1
                        # Concatenate all outputs to evaluate in the end.
                        if preds is None:
                            # Predictions as a numpy array.
                            preds = logits.detach().cpu().numpy()
                            # Ground-truth labels for this batch.
                            out_label_ids = inputs['labels'].detach().cpu().numpy().flatten()
                        else:
                            batch_predictions = logits.detach().cpu().numpy()
                            preds = np.append(preds, batch_predictions, axis=0)
                            out_label_ids = np.append(
                                out_label_ids,
                                inputs['labels'].detach().cpu().numpy().flatten(),
                                axis=0)
                # Average the evaluation loss over all batches.
                eval_loss = eval_loss / nb_eval_steps
                results = {}
                results["ROC Dev"] = roc_auc_score(out_label_ids, preds[:, 1])
                preds = np.argmax(preds, axis=1)
                results["Accuracy Dev"] = accuracy_score(out_label_ids, preds)
                results["F1 Dev"] = f1_score(out_label_ids, preds)
                results["AP Dev"] = average_precision_score(
                    out_label_ids, preds)
                tqdm.write("***** Eval results *****")
                for key in sorted(results.keys()):
                    tqdm.write(f"  {key} = {str(results[key])}")
                save_model(model, global_step)
    model_to_save = save_model(model, global_step)
    return model_to_save
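train_bert4ir (like bert4ir_score in Example #1) reads its hyper-parameters from module-level globals rather than taking them as arguments, and calls a save_model helper that is not shown. A sketch of the configuration it expects; the names come from the function bodies above, but the values are illustrative assumptions, and model, train_dataset and dev_dataset are assumed to be a sequence-classification model plus two datasets like the one built in Example #9.

GPUS_TO_USE = [0, 1]     # CUDA devices for torch.nn.DataParallel
number_of_cpus = 4       # DataLoader worker processes
n_epochs = 2             # training epochs
lr = 1e-5                # AdamW learning rate
weight_decay = 0.01      # weight decay for parameters outside the no_decay group
warmup_proportion = 0.1  # fraction of the optimization steps used for LR warm-up
steps_to_print = 100     # log the training loss every N batches
steps_to_eval = 1000     # run the dev-set evaluation every N global steps

# save_model(model, global_step) must also be defined by the caller; it should
# persist a checkpoint and return the saved model.

fine_tuned_model = train_bert4ir(model, train_dataset, dev_dataset)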