def bert4ir_score(model, dataset, batch_size=32):
    import warnings
    import numpy as np
    if torch.cuda.is_available():
        # Assign the model to GPUs, specifying to use data parallelism.
        model = torch.nn.DataParallel(model, device_ids=GPUS_TO_USE)
        parallel = len(GPUS_TO_USE)
        # The main model should be on the first GPU
        device = torch.device(f"cuda:{GPUS_TO_USE[0]}")
        model.to(device)
        # For a 1080Ti, 16 samples fit on a GPU comfortably, so the train batch size is 16 * the number of GPUs
        train_batch_size = parallel * 16
        print(f"running on {parallel} GPUs, with {train_batch_size}-sized batches")
    else:
        print("Are you sure about this? We will try to run on the CPU, but it's a BAD idea...")
        device = torch.device("cpu")
        train_batch_size = 16
        model.to(device)
        parallel = number_of_cpus
    preds = None
    nb_eval_steps = 0
    data_loader = DataLoader(dataset,
                             batch_size=batch_size,
                             num_workers=number_of_cpus,
                             shuffle=False)
    for batch in tqdm(data_loader, desc="Scoring batch"):
        model.eval()
        with torch.no_grad():  # No gradient computation needed while scoring
            inputs = {
                'input_ids': batch[0].to(device),
                'attention_mask': batch[1].to(device),
                #'labels': batch[3].to(device)
            }
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                outputs = model(**inputs)
            logits = outputs[:2][0]
            # logits is the raw model output: one score per class for each sample.
            # We keep the first column as the relevance score.
            logits = logits[:, 0]
            nb_eval_steps += 1
            # Concatenate all outputs into one big score array.
            if preds is None:
                preds = logits.detach().cpu().numpy().flatten()  # predictions as a numpy array
            else:
                batch_predictions = logits.detach().cpu().numpy().flatten()
                preds = np.append(preds, batch_predictions, axis=0)
    return preds
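# Usage sketch (illustrative, not part of the original code): attach the scores returned
# by bert4ir_score() back onto the results dataframe that the dataset was built from.
# Assumes `res_df` has one row per dataset sample, in the same order, and that add_ranks()
# (used elsewhere in this codebase) is importable.
def example_rescore_dataframe(model, dataset, res_df):
    scores = bert4ir_score(model, dataset, batch_size=32)
    assert len(scores) == len(res_df), "expected one score per input row"
    rescored = res_df.copy()
    rescored["score"] = scores
    return add_ranks(rescored)  # recompute the rank column from the new scores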
def our_rerank_with_embeddings(self, qembs, pids, weightsQ=None, gpu=True):
    """
    input: qid, query, docid, query_tokens, query_embeddings, query_weights
    output: qid, query, docid, score
    """
    colbert = self.args.colbert
    inference = self.inference
    # default is uniform weight for all query embeddings
    if weightsQ is None:
        weightsQ = torch.ones(len(qembs))
    # make into a 3d tensor
    Q = torch.unsqueeze(qembs, 0)
    if gpu:
        Q = Q.cuda()
    #weightE = weightE(Q,E)  # calculate the mean cosine score of each expansion term with all query terms, then softmax-normalise it as the weight of the expansion term
    if self.verbose:
        pid_iter = tqdm(pids, desc="lookups", unit="d")
    else:
        pid_iter = pids
    D_ = torch.zeros(len(pids), self.doc_maxlen, self.dim)
    for offset, pid in enumerate(pid_iter):
        self.get_embedding_copy(pid, D_, offset)
    if gpu:
        D_ = D_.cuda()
    maxscoreQ = (Q @ D_.permute(0, 2, 1)).max(2).values.cpu()
    scores = (weightsQ * maxscoreQ).sum(1).cpu()
    return scores.tolist()
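# Worked toy example of the MaxSim scoring performed in our_rerank_with_embeddings():
# each query embedding takes its maximum dot product over all document embeddings, and
# the weighted maxima are summed to give one score per document. Shapes are illustrative only.
def maxsim_toy_example():
    import torch
    Q = torch.randn(1, 4, 8)                                # 1 query, 4 query embeddings, dim 8
    D_ = torch.randn(3, 10, 8)                              # 3 documents, 10 doc embeddings each, dim 8
    weightsQ = torch.ones(4)                                # uniform weight per query embedding
    maxscoreQ = (Q @ D_.permute(0, 2, 1)).max(2).values    # shape (3, 4): best match per query embedding
    scores = (weightsQ * maxscoreQ).sum(1)                  # shape (3,): one score per document
    return scores.tolist()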
def train_neg_sampling(res_with_labels, neg_ratio):
    '''
    This implements the negative sampling found in Arthur's ECIR 2020 paper:
    "We fine-tuned our BERT10 model with 10 negative samples for each positive sample
    from the training dataset, randomly picked from the top-100 retrieved from QL."
    '''
    import pandas as pd
    qid_groups = res_with_labels.groupby("qid")
    keeping_dfs = []
    for qid, queryDf in tqdm(qid_groups,
                             desc="Negative sampling",
                             total=qid_groups.ngroups,
                             unit="q"):
        pos = queryDf[queryDf["label"] >= 1]
        neg = queryDf[queryDf["label"] < 1]
        num_pos = len(pos)
        num_neg = len(neg)
        num_neg_needed = num_pos * neg_ratio
        #print("qid %s num_pos %d num_neg %d num_neg_needed %d" % (qid, num_pos, num_neg, num_neg_needed))
        if num_neg > num_neg_needed:
            neg = neg.sample(num_neg_needed)
        # we keep all positives
        keeping_dfs.append(pos)
        keeping_dfs.append(neg)
    rtr = pd.concat(keeping_dfs)
    # ensure labels are ints
    rtr["label"] = rtr["label"].astype(int)
    return rtr
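# Toy example of train_neg_sampling() (illustrative): with neg_ratio=2, all positives are
# kept for each query, plus at most 2 randomly sampled negatives per positive.
def neg_sampling_toy_example():
    import pandas as pd
    df = pd.DataFrame({
        "qid": ["q1"] * 6,
        "query": ["test query"] * 6,
        "docno": ["d%d" % i for i in range(6)],
        "label": [1, 0, 0, 0, 0, 0],
    })
    sampled = train_neg_sampling(df, neg_ratio=2)
    return sampled  # 3 rows for q1: the positive d0 and two randomly chosen negatives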
def _single_retrieve(queries_df):
    rtr = []
    iter = queries_df.itertuples()
    iter = tqdm(iter, unit="q") if verbose else iter
    for row in iter:
        qid = row.qid
        with torch.no_grad():
            Q, ids, masks = self.args.inference.queryFromText([row.query],
                                                              bsize=512,
                                                              with_ids=True)
        Q_f = Q[0:1, :, :]
        all_pids = faiss_index.retrieve(faiss_depth, Q_f, verbose=verbose)
        for passage_ids in all_pids:
            if verbose:
                print("qid %s retrieved docs %d" % (qid, len(passage_ids)))
            for pid in passage_ids:
                rtr.append([qid, row.query, pid, ids[0], Q[0, :, :].cpu()])
    return self._add_docnos(
        pd.DataFrame(rtr,
                     columns=["qid", "query", 'docid', 'query_toks', 'query_embs']))
def _single_retrieve_qembs(queries_df):
    rtr = []
    iter = queries_df.itertuples()
    iter = tqdm(iter, unit="q") if verbose else iter
    for row in iter:
        qid = row.qid
        embs = row.query_embs
        Q_f = torch.unsqueeze(embs, 0)
        all_pids = faiss_index.retrieve(faiss_depth, Q_f, verbose=verbose)
        for passage_ids in all_pids:
            if verbose:
                print("qid %s retrieved docs %d" % (qid, len(passage_ids)))
            for pid in passage_ids:
                rtr.append([qid, row.query, pid, row.query_toks, row.query_embs])
    return self._add_docnos(
        pd.DataFrame(rtr,
                     columns=["qid", "query", 'docid', 'query_toks', 'query_embs']))
def __init__(self,
             checkpoint_path=None,
             index_path=None,
             cpu_index=None,
             passage_embedding2id=None,
             docid2docno=None,
             num_results=100,
             **kwargs):
    self.args = type('', (), {})()
    args = self.args
    args.local_rank = -1
    args.model_type = 'rdot_nll'
    args.cache_dir = None
    args.no_cuda = False
    args.max_query_length = 64
    args.max_seq_length = 128
    args.per_gpu_eval_batch_size = 128
    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.__dict__.update(kwargs)
    self.checkpoint_path = checkpoint_path
    self.num_results = num_results
    from pyterrier import tqdm
    #faiss.omp_set_num_threads(16)
    config, tokenizer, model = _load_model(self.args, self.checkpoint_path)
    self.model = model
    self.tokenizer = tokenizer
    if index_path is not None:
        print("Loading shard metadata")
        shards_files = os.path.join(index_path, "shards.pkl")
        with pt.io.autoopen(shards_files) as f:
            self.shard_sizes = pickle.load(f)
            self.docid2docno = pickle.load(f)
        self.segments = len(self.shard_sizes)
        self.cpu_index = []
        self.shard_offsets = []
        self.passage_embedding2id = []
        offset = 0
        for i, shard_size in enumerate(
                tqdm(self.shard_sizes, desc="Loading shards", unit="shard")):
            faiss_file = os.path.join(index_path, str(i) + ".faiss")
            lookup_file = os.path.join(index_path, str(i) + ".docids.pkl")
            index = faiss.read_index(faiss_file)
            self.cpu_index.append(index)
            self.shard_offsets.append(offset)
            offset += shard_size
            with pt.io.autoopen(lookup_file) as f:
                self.passage_embedding2id.append(pickle.load(f))
    else:
        self.cpu_index = cpu_index
        self.passage_embedding2id = passage_embedding2id
        self.docid2docno = docid2docno
def applyPassaging(self, df, labels=True):
    newRows = []
    labelCount = defaultdict(int)
    p = re.compile(r"\s+")
    currentQid = None
    rank = 0
    copy_columns = []
    for col in ["score", "rank"]:
        if col in df.columns:
            copy_columns.append(col)
    if len(df) == 0:
        return pd.DataFrame(columns=['qid', 'query', 'docno', self.text_attr, 'score', 'rank'])
    from pyterrier import tqdm
    with tqdm(total=len(df), ncols=80, desc='passaging', leave=False) as pbar:
        for index, row in df.iterrows():
            pbar.update(1)
            qid = row['qid']
            if currentQid is None or currentQid != qid:
                rank = 0
                currentQid = qid
            rank += 1
            toks = p.split(row[self.text_attr])
            if len(toks) < self.passage_length:
                newRow = row.copy()
                newRow['docno'] = row['docno'] + "%p0"
                newRow[self.text_attr] = ' '.join(toks)
                if self.prepend_title:
                    newRow.drop(labels=[self.title_attr], inplace=True)
                    newRow[self.text_attr] = str(row[self.title_attr]) + self.join + newRow[self.text_attr]
                if labels:
                    labelCount[row['label']] += 1
                for col in copy_columns:
                    newRow[col] = row[col]
                newRows.append(newRow)
            else:
                passageCount = 0
                for i, passage in enumerate(
                        slidingWindow(toks, self.passage_length, self.passage_stride)):
                    newRow = row.copy()
                    newRow['docno'] = row['docno'] + "%p" + str(i)
                    newRow[self.text_attr] = ' '.join(passage)
                    if self.prepend_title:
                        newRow.drop(labels=[self.title_attr], inplace=True)
                        newRow[self.text_attr] = str(row[self.title_attr]) + self.join + newRow[self.text_attr]
                    for col in copy_columns:
                        newRow[col] = row[col]
                    if labels:
                        labelCount[row['label']] += 1
                    newRows.append(newRow)
                    passageCount += 1
    newDF = pd.DataFrame(newRows)
    newDF['query'].fillna('', inplace=True)
    newDF[self.text_attr].fillna('', inplace=True)
    newDF['qid'].fillna('', inplace=True)
    newDF.reset_index(inplace=True, drop=True)
    return newDF
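# For reference, a minimal sketch of the slidingWindow() helper that applyPassaging() relies on.
# This is an assumption about its behaviour (overlapping token windows of `size` tokens, advanced
# by `stride` tokens until the end of the document is covered), not the repository's actual code.
def sliding_window_sketch(sequence, size, stride):
    for start in range(0, len(sequence), stride):
        yield sequence[start:start + size]
        if start + size >= len(sequence):
            break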
def _add_attr(df):
    iter = chunked(df.itertuples(), self.batch_size)
    if self.verbose:
        iter = pt.tqdm(iter, total=len(df) / self.batch_size, unit='d')
    output = []
    for batch_rows in iter:
        docs = [getattr(row, self.doc_attr) for row in batch_rows]
        gens = self._doc2query(docs)
        output.extend(gens)
    df[self.out_attr] = output
    return df
def __init__(self, df, tokenizer, *args, split, get_doc_fn, tokenizer_batch=8000):
    '''Initialise a Dataset object.

    Arguments:
        df: A dataframe of samples. Each row should have qid, query and docno columns,
            plus an optional label column
        tokenizer: A tokenizer object from Hugging Face's Tokenizer lib (it needs to
            implement encode_batch())
        split: a name for this dataset
        get_doc_fn: a function that maps a row to the text of the document
        tokenizer_batch: how many samples are tokenized at once by the tokenizer object
    '''
    self.tokenizer = tokenizer
    tokenizer.padding_side = "right"
    print("Loading and tokenizing %s dataset of %d rows ..." % (split, len(df)))
    assert len(df) > 0
    self.labels_present = "label" in df.columns
    query_batch = []
    doc_batch = []
    sample_ids_batch = []
    labels_batch = []
    self.store = {}
    self.processed_samples = 0
    number_of_batches = math.ceil(len(df) / tokenizer_batch)
    assert number_of_batches > 0
    with tqdm(total=len(df), desc="Tokenizer input", unit="d") as batch_pbar:
        i = 0
        for indx, row in df.iterrows():
            query_batch.append(row["query"])
            doc_batch.append(get_doc_fn(row))
            sample_ids_batch.append(row["qid"] + "_" + row["docno"])
            if self.labels_present:
                labels_batch.append(row["label"])
            else:
                # no label available, so append 0 to avoid special-casing elsewhere
                labels_batch.append(0)
            if len(query_batch) == tokenizer_batch or i == len(df) - 1:
                self._tokenize_and_dump_batch(doc_batch, query_batch, labels_batch,
                                              sample_ids_batch)
                query_batch = []
                doc_batch = []
                sample_ids_batch = []
                labels_batch = []
            i += 1
            batch_pbar.update()
def convert_gen(iterator):
    import pyterrier as pt
    nonlocal docnos
    nonlocal docid
    if self.num_docs is not None:
        iterator = pt.tqdm(iterator, total=self.num_docs, desc="encoding", unit="d")
    for l in iterator:
        l["docid"] = docid
        docnos.append(l['docno'])
        docid += 1
        yield l
def _text_scorer(queries_and_docs):
    groupby = queries_and_docs.groupby("qid")
    rtr = []
    with torch.no_grad():
        for qid, group in tqdm(groupby, total=len(groupby), unit="q") if verbose else groupby:
            query = group["query"].values[0]
            ranking = slow_rerank(self.args, query, group["docno"].values,
                                  group[doc_attr].values.tolist())
            for rank, (score, pid, passage) in enumerate(ranking):
                rtr.append([qid, query, pid, score, rank])
    return pd.DataFrame(rtr, columns=["qid", "query", "docno", "score", "rank"])
def _add_attr(df):
    iter = chunked(df.itertuples(), self.batch_size)
    if self.verbose:
        iter = pt.tqdm(iter, total=len(df) / self.batch_size, unit='d')
    output = []
    for batch_rows in iter:
        docs = [getattr(row, self.doc_attr) for row in batch_rows]
        gens = self._doc2query(docs)
        if self.append:
            gens = [f'{getattr(row, self.doc_attr)} {gen}' for row, gen in zip(batch_rows, gens)]
        output.extend(gens)
    if self.append:
        df[self.doc_attr] = output  # replace document content
    else:
        df[self.out_attr] = output  # add a new column
    return df
def transform(self, topics):
    from pyterrier import tqdm
    queries = []
    qid2q = {}
    for q, qid in zip(topics["query"].to_list(), topics["qid"].to_list()):
        passage = self.tokenizer.encode(
            q,
            add_special_tokens=True,
            max_length=self.args.max_seq_length,
        )
        passage_len = min(len(passage), self.args.max_query_length)
        input_id_b = pad_input_ids(passage, self.args.max_query_length)
        queries.append([passage_len, input_id_b])
        qid2q[qid] = q
    print("***** inference of %d queries *****" % len(queries))
    dev_query_embedding, dev_query_embedding2id = StreamInferenceDoc(
        self.args,
        self.model,
        GetProcessingFn(self.args, query=True),
        "transform",
        queries,
        is_query_inference=True)
    print("***** faiss search for %d queries on %d shards *****" % (len(queries), self.segments))
    rtr = []
    for i, offset in enumerate(tqdm(self.shard_offsets, unit="shard")):
        scores, neighbours = self.cpu_index[i].search(dev_query_embedding, self.num_results)
        res = self._calc_scores(topics["qid"].values,
                                self.passage_embedding2id[i],
                                neighbours,
                                scores,
                                num_results=self.num_results,
                                offset=offset,
                                qid2q=qid2q)
        rtr.append(res)
    rtr = pd.concat(rtr)
    rtr = add_ranks(rtr)
    rtr = rtr[rtr["rank"] < self.num_results]
    rtr = rtr.sort_values(by=["qid", "score", "docno"], ascending=[True, False, True])
    return rtr
def transform(self, queries_and_docs):
    groupby = queries_and_docs.groupby("qid")
    rtr = []
    with torch.no_grad():
        for qid, group in tqdm(groupby, total=len(groupby), desc='colbert', unit="q") if self.verbose else groupby:
            query = group["query"].values[0]
            ranking = rerank(self.args, query, group["docno"].values,
                             group[self.doc_attr].values, index=None)
            for rank, (score, pid, passage) in enumerate(ranking):
                rtr.append([qid, query, pid, score, rank])
    return add_ranks(
        pd.DataFrame(rtr, columns=["qid", "query", "docno", "score", "rank"]))
def gen_tokenize():
    text_attr = self.text_attr
    kwargs = {}
    if self.num_docs is not None:
        kwargs['total'] = self.num_docs
    for doc in pt.tqdm(generator, desc="Indexing", unit="d", **kwargs) if self.verbose else generator:
        contents = doc[text_attr]
        docid2docno.append(doc["docno"])
        passage = tokenizer.encode(
            contents,
            add_special_tokens=True,
            max_length=self.args.max_seq_length,
        )
        passage_len = min(len(passage), self.args.max_seq_length)
        input_id_b = pad_input_ids(passage, self.args.max_seq_length)
        yield passage_len, input_id_b
def our_rerank(self, query, pids, gpu=True):
    colbert = self.args.colbert
    inference = self.inference
    Q = inference.queryFromText([query])
    if self.verbose:
        pid_iter = tqdm(pids, desc="lookups", unit="d")
    else:
        pid_iter = pids
    D_ = torch.zeros(len(pids), self.doc_maxlen, self.dim)
    for offset, pid in enumerate(pid_iter):
        self.get_embedding_copy(pid, D_, offset)
    if gpu:
        D_ = D_.cuda()
    scores = colbert.score(Q, D_).cpu()
    del D_
    return scores.tolist()
def transform(self, topics_and_res):
    import pandas as pd
    rtr = []
    grouper = topics_and_res.groupby("qid")
    from pyterrier import tqdm, started
    assert started()
    # for each query, get its results and pass them to _for_each_query
    for qid, group in tqdm(grouper, desc="BERTQE", unit="q") if self.verbose else grouper:
        query = group["query"].iloc[0]
        scores = self._for_each_query(qid, query, group[["docno", self.body_attr]])
        # assign the scores to the input documents
        for i, s in enumerate(scores.tolist()):
            rtr.append([qid, query, group.iloc[i]["docno"], s])
    # return the final dataframe
    df = pd.DataFrame(rtr, columns=["qid", "query", "docno", "score"])
    return add_ranks(df)
def _load_parts(index_path, part_doclens, memtype="mmap"):
    # Every .pt file is loaded and managed independently, with local pids
    _, all_parts_paths, _ = get_parts(index_path)
    if memtype == "mmap":
        all_parts_paths = [file.replace(".pt", ".store") for file in all_parts_paths]
        mmaps = [
            file_part_mmap(path, doclens)
            for path, doclens in zip(all_parts_paths, part_doclens)
        ]
    elif memtype == "mem":
        mmaps = [
            file_part_mem(path, doclens)
            for path, doclens in tqdm(zip(all_parts_paths, part_doclens),
                                      total=len(all_parts_paths),
                                      desc="Loading index shards to memory",
                                      unit="shard")
        ]
    else:
        assert False, "Unknown memtype %s" % memtype
    return mmaps
def train_bert4ir(model, train_dataset, dev_dataset):
    if torch.cuda.is_available():
        # Assign the model to GPUs, specifying to use data parallelism.
        model = torch.nn.DataParallel(model, device_ids=GPUS_TO_USE)
        parallel = len(GPUS_TO_USE)
        # The main model should be on the first GPU
        device = torch.device(f"cuda:{GPUS_TO_USE[0]}")
        model.to(device)
        # For a 1080Ti, 16 samples fit on a GPU comfortably, so the train batch size is 16 * the number of GPUs
        train_batch_size = parallel * 16
        print(f"running on {parallel} GPUs, with {train_batch_size}-sized batches")
    else:
        print("Are you sure about this? We will try to run on the CPU, but it's a BAD idea...")
        device = torch.device("cpu")
        train_batch_size = 16
        model.to(device)
        parallel = number_of_cpus
    # A data loader generates batches for you easily.
    # It accepts any object that implements __getitem__(self, idx) and __len__(self).
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=train_batch_size,
                                   num_workers=number_of_cpus,
                                   shuffle=True)
    dev_data_loader = DataLoader(dev_dataset,
                                 batch_size=32,
                                 num_workers=number_of_cpus,
                                 shuffle=True)
    # How many optimization steps to run, given the NUMBER OF BATCHES
    # (the len of the dataloader is the number of batches).
    num_train_optimization_steps = len(train_data_loader) * n_epochs
    # Which parameters will not have linear weight decay when training
    no_decay = ['bias', 'LayerNorm.weight']
    # All parameters to be optimized by our fine-tuning.
    optimizer_grouped_parameters = [{
        'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        'weight_decay': weight_decay
    }, {
        'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    # We use the AdamW optimizer here.
    optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=1e-8)
    # How many steps to wait before we start to decrease the learning rate
    warmup_steps = num_train_optimization_steps * warmup_proportion
    # A scheduler to take care of the above.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=num_train_optimization_steps)
    print(f"*********Total optimization steps: {num_train_optimization_steps}*********")

    import warnings
    import numpy as np
    import datetime
    from sklearn.metrics import f1_score, average_precision_score, accuracy_score, roc_auc_score

    global_step = 0  # Number of steps performed so far
    tr_loss = 0.0  # Training loss
    model.zero_grad()  # Initialize gradients to 0
    for _ in tqdm(range(n_epochs), desc="Epochs"):
        for step, batch in tqdm(enumerate(train_data_loader),
                                desc="Batches",
                                total=len(train_data_loader)):
            model.train()
            # get the batch inputs
            inputs = {
                'input_ids': batch[0].to(device),
                'attention_mask': batch[1].to(device),
                'labels': batch[3].to(device)
            }
            # Run through the network.
            with warnings.catch_warnings():
                # There is a very annoying warning here when using multiple GPUs,
                # as described in https://github.com/huggingface/transformers/issues/852.
                # We can safely ignore it.
                warnings.simplefilter("ignore")
                outputs = model(**inputs)
            loss = outputs[0]
            loss = loss.sum() / parallel  # Average over all GPUs/CPUs.
            # Backward pass on the network
            loss.backward()
            tr_loss += loss.item()
            # Clip gradients to avoid gradient explosion if the gradient is too large.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Run the optimizer with the gradients
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            if step % steps_to_print == 0:
                # logits is the actual output from the network: the raw score for being
                # relevant or not. You can check its shape (batch_size x 2) with logits.shape
                logits = outputs[1]
                # Send the logits to the CPU in numpy form; easier to check what is going on.
                preds = logits.detach().cpu().numpy()
                tqdm.write(f"Training loss: {loss.item()} Learning Rate: {scheduler.get_last_lr()[0]}")
            global_step += 1
            # Run an evaluation step over the dev dataset. Let's see how we are doing.
            if global_step % steps_to_eval == 0:
                eval_loss = 0.0
                nb_eval_steps = 0
                preds = None
                out_label_ids = None
                for batch in tqdm(dev_data_loader, desc="Valid batch"):
                    model.eval()
                    with torch.no_grad():  # No gradient computation needed during evaluation
                        inputs = {
                            'input_ids': batch[0].to(device),
                            'attention_mask': batch[1].to(device),
                            'labels': batch[3].to(device)
                        }
                        with warnings.catch_warnings():
                            warnings.simplefilter("ignore")
                            outputs = model(**inputs)
                        # logits is the raw model output (one score per class).
                        tmp_eval_loss, logits = outputs[:2]
                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_steps += 1
                        # Concatenate all outputs to evaluate at the end.
                        if preds is None:
                            preds = logits.detach().cpu().numpy()  # predictions as numpy
                            out_label_ids = inputs['labels'].detach().cpu().numpy().flatten()  # ground-truth labels
                        else:
                            batch_predictions = logits.detach().cpu().numpy()
                            preds = np.append(preds, batch_predictions, axis=0)
                            out_label_ids = np.append(
                                out_label_ids,
                                inputs['labels'].detach().cpu().numpy().flatten(),
                                axis=0)
                eval_loss = eval_loss / nb_eval_steps
                results = {}
                results["ROC Dev"] = roc_auc_score(out_label_ids, preds[:, 1])
                preds = np.argmax(preds, axis=1)
                results["Accuracy Dev"] = accuracy_score(out_label_ids, preds)
                results["F1 Dev"] = f1_score(out_label_ids, preds)
                results["AP Dev"] = average_precision_score(out_label_ids, preds)
                tqdm.write("***** Eval results *****")
                for key in sorted(results.keys()):
                    tqdm.write(f"  {key} = {str(results[key])}")
                save_model(model, global_step)
    model_to_save = save_model(model, global_step)
    return model_to_save
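# The training and scoring functions above rely on module-level configuration defined elsewhere
# in the codebase. The sketch below shows one plausible set of values purely for illustration;
# the specific numbers are assumptions, not the repository's defaults.
def example_training_config():
    return dict(
        GPUS_TO_USE=[0, 1],       # CUDA device ids handed to DataParallel
        number_of_cpus=4,         # DataLoader workers / CPU fallback parallelism
        n_epochs=2,               # training epochs
        lr=3e-6,                  # AdamW learning rate
        weight_decay=0.01,        # weight decay for the non-excluded parameter group
        warmup_proportion=0.1,    # fraction of steps used for learning-rate warm-up
        steps_to_print=100,       # log the training loss every N batches
        steps_to_eval=1000,       # run a dev-set evaluation every N global steps
    )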