# Shared dependencies for the variants below. `data` refers to the CEDR-style
# data module that provides iter_valid_records; BATCH_SIZE is assumed to be
# module-level for the variants that do not define it locally.
from collections import defaultdict

import torch
from tqdm import tqdm

import data

BATCH_SIZE = 16


def run_model(model, dataset, run, runf, desc='valid'):
    rerank_run = {}
    with torch.no_grad(), tqdm(total=sum(len(r) for r in run.values()), ncols=80,
                               desc=desc, leave=False) as pbar:
        model.eval()
        for records in data.iter_valid_records(model, dataset, run, BATCH_SIZE):
            scores = model(records['query_tok'], records['query_mask'],
                           records['doc_tok'], records['doc_mask'])
            for qid, did, score in zip(records['query_id'], records['doc_id'],
                                       scores.detach().cpu().numpy()):
                rerank_run.setdefault(qid, {})[did] = score.item()
            pbar.update(len(records['query_id']))
    # Write the reranked run in TREC format: qid 0 did rank score tag.
    with open(runf, 'wt') as runfile:
        for qid in rerank_run:
            ranked = sorted(rerank_run[qid].items(),
                            key=lambda x: (x[1], x[0]), reverse=True)
            for i, (did, score) in enumerate(ranked):
                runfile.write(f'{qid} 0 {did} {i+1} {score} run\n')
def run_model(model, dataset, run, runf, contentid2entity, embed):
    BATCH_SIZE = 16
    rerank_run = {}
    with torch.no_grad(), tqdm(total=sum(len(r) for r in run.values()), ncols=80,
                               desc='valid', leave=False) as pbar:
        model.eval()
        for records in data.iter_valid_records(model, dataset, run, BATCH_SIZE,
                                               contentid2entity):
            # Entity ids are offset by 1 before the embedding lookup (likely to
            # reserve index 0 for missing entities), then moved back to the GPU.
            query_entity = embed(records['query_entity'].cpu() + 1).cuda()
            doc_entity = embed(records['doc_entity'].cpu() + 1).cuda()
            scores = model(records['query_tok'], records['query_mask'],
                           records['doc_tok'], records['doc_mask'],
                           query_entity, doc_entity)
            for qid, did, score in zip(records['query_id'], records['doc_id'], scores):
                rerank_run.setdefault(qid, {})[did] = score.item()
            pbar.update(len(records['query_id']))
    with open(runf, 'wt') as runfile:
        for qid in rerank_run:
            ranked = sorted(rerank_run[qid].items(),
                            key=lambda x: (x[1], x[0]), reverse=True)
            for i, (did, score) in enumerate(ranked):
                runfile.write(f'{qid} 0 {did} {i+1} {score} run\n')
def run_model(model, dataset, run, runf, desc='valid'):
    BATCH_SIZE = 16
    rerank_run = {}
    model_name = type(model).__name__
    with torch.no_grad(), tqdm(total=sum(len(r) for r in run.values()), ncols=80,
                               desc=desc, leave=False) as pbar:
        model.eval()
        for records in data.iter_valid_records(model, dataset, run, BATCH_SIZE):
            if model_name.startswith("Duet"):
                # Duet models also return the two sub-scores separately;
                # only the combined score is used here.
                # Alternative: scores = v_scores + c_scores
                scores, v_scores, c_scores = model(records['query_tok'], records['query_mask'],
                                                   records['doc_tok'], records['doc_mask'])
            else:
                scores = model(records['query_tok'], records['query_mask'],
                               records['doc_tok'], records['doc_mask'])
            for qid, did, score in zip(records['query_id'], records['doc_id'], scores):
                rerank_run.setdefault(qid, {})[did] = score.item()
            pbar.update(len(records['query_id']))
    with open(runf, 'wt') as runfile:
        for qid in rerank_run:
            ranked = sorted(rerank_run[qid].items(),
                            key=lambda x: (x[1], x[0]), reverse=True)
            for i, (did, score) in enumerate(ranked):
                runfile.write(f'{qid} 0 {did} {i+1} {score} run\n')
def run_model(model, dataset, run, desc='valid'):
    rerank_run = defaultdict(dict)
    with torch.no_grad(), tqdm(total=sum(len(r) for r in run.values()), ncols=80,
                               desc=desc, leave=False) as pbar:
        model.eval()
        for records in data.iter_valid_records(model, dataset, run, BATCH_SIZE):
            scores = model(records['query_tok'], records['query_mask'],
                           records['doc_tok'], records['doc_mask'])
            for qid, did, score in zip(records['query_id'], records['doc_id'], scores):
                rerank_run[qid][did] = score.item()
            pbar.update(len(records['query_id']))
    return rerank_run
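# The variant above returns the reranked run instead of writing it out. A
# minimal hypothetical helper (the name write_run is not from the source) that
# persists such a run in the same TREC format the file-writing variants use:
def write_run(rerank_run, runf):
    with open(runf, 'wt') as runfile:
        for qid in rerank_run:
            # Rank by descending score, breaking ties on document id.
            ranked = sorted(rerank_run[qid].items(),
                            key=lambda x: (x[1], x[0]), reverse=True)
            for i, (did, score) in enumerate(ranked):
                runfile.write(f'{qid} 0 {did} {i+1} {score} run\n')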
def interpret_model(model, dataset, run, runf, outdir, layer_n, desc='valid'):
    target_qid = "303"
    target_did = ["FT944-128", "FT934-5418"]  # true, false
    BATCH_SIZE = 1  # must be 1: each batch is inspected as a single (qid, did) pair
    cnt = 0
    with open(runf, "w") as runfile, \
         tqdm(total=sum(len(r) for r in run.values()), ncols=80,
              desc=desc, leave=False) as pbar:
        model.eval()
        out = {}
        for records in data.iter_valid_records(model, dataset, run, BATCH_SIZE):
            # No torch.no_grad() here: grad_forward needs the graph to
            # return gradients at layer_n.
            scores, grads = model.grad_forward(records['query_tok'], records['query_mask'],
                                               records['doc_tok'], records['doc_mask'],
                                               layer_n=layer_n)
            qid = records['query_id'][0]
            did = records['doc_id'][0]
            score = scores[0, 0].item()
            grad = torch.mean(grads, dim=0)
            grad_sumabs = torch.sum(torch.abs(grad), dim=1)
            # The hard-coded slices assume the layout
            # [CLS] + 20 query tokens + [SEP] (index 21) + document tokens + [SEP].
            cls_sum = grad_sumabs[0]
            query_sum = torch.sum(grad_sumabs[1:21])
            document_sum = torch.sum(grad_sumabs[22:-1])
            query_avg = torch.mean(grad_sumabs[1:21])
            document_avg = torch.mean(grad_sumabs[22:-1])
            ratio = query_sum.item() / document_sum.item()
            print(qid, did, score, ratio, cls_sum.item(),
                  query_sum.item(), document_sum.item(), file=runfile)
            if qid == target_qid and did in target_did:
                print("found!!")
                qtk = model.tokenizer.tokenize(dataset[0].get(qid))
                dtk = model.tokenizer.tokenize(dataset[1].get(did))
                out['qtok'] = qtk
                out['dtok'] = dtk
                out['attention'] = scores
                out['grad'] = grad
                torch.save(out, "./models/" + outdir + "/" + qid + did + ".pt")
            pbar.update(len(records['query_id']))
            cnt += 1
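# A minimal sketch of loading one of the tensors saved by interpret_model
# above; the path and keys follow its torch.save call, everything else is
# illustrative:
saved = torch.load("./models/<outdir>/303FT944-128.pt")
print(saved['qtok'], saved['dtok'])  # query / document tokens
print(saved['grad'].shape)           # mean gradient at layer_n, one row per input token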
def score_model(model, dataset, run, passageAgg, desc='valid'):
    BATCH_SIZE = 16
    # qid -> did -> score; inner values default to 0.0 for unseen documents.
    rerank_run = defaultdict(lambda: defaultdict(float))
    with torch.no_grad(), tqdm(total=sum(len(r) for r in run.values()), ncols=80,
                               desc=desc, leave=False) as pbar:
        model.eval()
        for records in data.iter_valid_records(model, dataset, run, BATCH_SIZE):
            scores = model(records['query_tok'], records['query_mask'],
                           records['doc_tok'], records['doc_mask'])
            if passageAgg == 'first':
                # Keep the score of the first passage seen for each document.
                for qid, pid, score in zip(records['query_id'], records['doc_id'], scores):
                    did = pid.split("%p")[0]
                    if did not in rerank_run[qid]:
                        rerank_run[qid][did] = score.item()
            elif passageAgg == 'sum':
                for qid, pid, score in zip(records['query_id'], records['doc_id'], scores):
                    did = pid.split("%p")[0]
                    rerank_run[qid][did] += score.item()
            elif passageAgg == 'max':
                for qid, pid, score in zip(records['query_id'], records['doc_id'], scores):
                    did = pid.split("%p")[0]
                    # The first occurrence always wins; afterwards keep the max.
                    # (Comparing only against the defaultdict's 0.0 would
                    # silently drop documents whose passages all score negative.)
                    if did not in rerank_run[qid] or score.item() > rerank_run[qid][did]:
                        rerank_run[qid][did] = score.item()
            pbar.update(len(records['query_id']))
    return rerank_run
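# Usage sketch for the passage-aggregation variant above (illustrative ids):
# passage ids are expected to look like "<did>%p<idx>", e.g. "D414820%p3",
# so every aggregation mode collapses passages back onto document "D414820".
agg_run = score_model(model, dataset, run, passageAgg='max')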
def computeScoresFromRawOverride(self, query, docs):
    if self.debugPrint:
        print('getScores', query.id, query.text)
    queryData = {query.id: query.text}
    # A run maps query ids to lists of document ids; see iter_valid_records (train.py).
    run = {query.id: [e.id for e in docs]}
    docData = {}
    for e in docs:
        docData[e.id] = e.text
    sampleRet = {}
    if docData:
        # Based on the code of the run_model function (train.py).
        dataSet = queryData, docData
        for records in data.iter_valid_records(self.model, dataSet, run, self.batchSize):
            scores = self.model(records['query_tok'], records['query_mask'],
                                records['doc_tok'], records['doc_mask'])
            for qid, did, score in zip(records['query_id'], records['doc_id'], scores):
                score = score.item()  # from tensor to Python value
                if self.debugPrint:
                    print(score, did, docData[did])
                # Each value must be a list, because we can generate
                # more than one feature per document.
                sampleRet[did] = [score]
    if self.debugPrint:
        print('All scores:', sampleRet)
    return sampleRet
def run_model(model, dataset, run, desc='valid'):
    rerank_run = defaultdict(dict)
    # Vocabulary ids of the "true" / "false" tokens used for relevance scoring.
    true_id = model.tokenizer.get_vocab()[model.tokenizer.tokenize("true")[0]]
    false_id = model.tokenizer.get_vocab()[model.tokenizer.tokenize("false")[0]]
    with torch.no_grad(), tqdm(total=sum(len(r) for r in run.values()), ncols=80,
                               desc=desc) as pbar:
        model.eval()
        for records in data.iter_valid_records(model, dataset, run, BATCH_SIZE):
            logits = model.generate(records['query_tok'], records['query_mask'],
                                    records['doc_tok'], records['doc_mask'])
            # Score = probability of "true" under a softmax restricted to the
            # "true" and "false" logits, rather than over the full vocabulary.
            # Alternative: scores = logits.softmax(dim=-1)[:, true_id]
            true_logits = logits[:, true_id].unsqueeze(dim=-1)
            false_logits = logits[:, false_id].unsqueeze(dim=-1)
            tf_logits = torch.cat((true_logits, false_logits), dim=-1)
            scores = tf_logits.softmax(dim=-1)[:, 0]
            for qid, did, score in zip(records['query_id'], records['doc_id'], scores):
                rerank_run[qid][did] = score.item()
            pbar.update(len(records['query_id']))
    return rerank_run
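# Usage sketch for the generation-based variant above, reusing the hypothetical
# write_run helper sketched earlier; model, dataset, and run are assumed to be
# set up as in the other variants, and the output filename is illustrative:
rerank = run_model(model, dataset, run)
write_run(rerank, 'rerank.run')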