def id2vec(self, qid, posid, negid=None, **kwargs):
    query = self.qid2toks[qid]

    # TODO find a way to calculate qlen/doclen stats earlier, so we can log them and check sanity of our values
    qlen, doclen = self.config["maxqlen"], self.config["maxdoclen"]

    posdoc = self.get_doc_tokens(posid)
    if not posdoc:
        raise MissingDocError(qid, posid)

    idfs = padlist(self._get_idf(query), qlen, 0)
    query = self._tok2vec(padlist(query, qlen, self.pad_tok))
    posdoc = self._tok2vec(padlist(posdoc, doclen, self.pad_tok))

    # TODO determine whether pin_memory is happening. may not be, because we don't place the strings in a np or torch object
    data = {
        "qid": qid,
        "posdocid": posid,
        "idfs": np.array(idfs, dtype=np.float32),
        "query": np.array(query, dtype=np.int64),
        "posdoc": np.array(posdoc, dtype=np.int64),
        "query_idf": np.array(idfs, dtype=np.float32),
        "negdocid": "",
        "negdoc": np.zeros(self.config["maxdoclen"], dtype=np.int64),
    }

    if negid:
        negdoc = self.get_doc_tokens(negid)
        if not negdoc:
            raise MissingDocError(qid, negid)

        negdoc = self._tok2vec(padlist(negdoc, doclen, self.pad_tok))
        data["negdocid"] = negid
        data["negdoc"] = np.array(negdoc, dtype=np.int64)

    return data
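
# A minimal sketch of the padlist helper used throughout this file (the exact
# implementation is not shown here; this assumes it truncates to the target
# length and right-pads with pad_token, with 0 as the default pad value).
def padlist(lst, padlen, pad_token=0):
    lst = lst[:padlen]  # drop anything beyond the target length
    return lst + [pad_token] * (padlen - len(lst))  # right-pad the remainder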
def id2vec(self, qid, posid, negid=None, label=None):
    """
    See parent class for docstring
    """
    assert label is not None
    maxseqlen = self.config["maxseqlen"]
    numpassages = self.config["numpassages"]

    query_toks = self.qid2toks[qid]
    pos_bert_inputs, pos_bert_masks, pos_bert_segs = [], [], []

    # N.B: The passages in self.docid2passages are not bert tokenized
    pos_passages = self.docid2passages[posid]
    for tokenized_passage in pos_passages:
        inp, mask, seg = self._prepare_bert_input(query_toks, tokenized_passage)
        pos_bert_inputs.append(inp)
        pos_bert_masks.append(mask)
        pos_bert_segs.append(seg)

    data = {
        "qid": qid,
        "posdocid": posid,
        "pos_bert_input": np.array(pos_bert_inputs, dtype=np.int64),
        "pos_mask": np.array(pos_bert_masks, dtype=np.int64),
        "pos_seg": np.array(pos_bert_segs, dtype=np.int64),
        "negdocid": "",
        "neg_bert_input": np.zeros((numpassages, maxseqlen), dtype=np.int64),
        "neg_mask": np.zeros((numpassages, maxseqlen), dtype=np.int64),
        "neg_seg": np.zeros((numpassages, maxseqlen), dtype=np.int64),
        "label": np.array(label, dtype=np.float32),
    }

    if not negid:
        return data

    neg_bert_inputs, neg_bert_masks, neg_bert_segs = [], [], []
    neg_passages = self.docid2passages[negid]
    for tokenized_passage in neg_passages:
        inp, mask, seg = self._prepare_bert_input(query_toks, tokenized_passage)
        neg_bert_inputs.append(inp)
        neg_bert_masks.append(mask)
        neg_bert_segs.append(seg)

    if not neg_bert_inputs:
        raise MissingDocError(qid, negid)

    data["negdocid"] = negid
    data["neg_bert_input"] = np.array(neg_bert_inputs, dtype=np.int64)
    data["neg_mask"] = np.array(neg_bert_masks, dtype=np.int64)
    data["neg_seg"] = np.array(neg_bert_segs, dtype=np.int64)
    return data
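
# A hedged sketch of what _prepare_bert_input is assumed to produce for each
# (query, passage) pair: a standard BERT pair encoding, [CLS] query [SEP]
# passage [SEP], padded to maxseqlen, plus the matching attention mask and
# segment ids. The self.tokenizer attribute, the truncation policy, and the
# assumption that the query fits within maxseqlen are all assumptions here,
# not the confirmed implementation.
def _prepare_bert_input(self, query_toks, passage_toks):
    maxseqlen = self.config["maxseqlen"]
    # reserve three slots for [CLS] and the two [SEP] tokens
    passage_toks = passage_toks[: maxseqlen - len(query_toks) - 3]
    toks = ["[CLS]"] + query_toks + ["[SEP]"] + passage_toks + ["[SEP]"]

    inp = padlist(self.tokenizer.convert_tokens_to_ids(toks), maxseqlen, 0)
    mask = [1] * len(toks) + [0] * (maxseqlen - len(toks))  # 1 = real token
    # segment 0 covers [CLS] query [SEP]; segment 1 covers passage [SEP]
    seg = [0] * (len(query_toks) + 2) + [1] * (len(toks) - len(query_toks) - 2)
    seg = padlist(seg, maxseqlen, 0)
    return inp, mask, seg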
def id2vec(self, qid, posid, negid=None, query=None):
    if query is not None:
        if qid is not None:
            raise RuntimeError("received both a qid and query, but only one can be passed")
        query = self["tokenizer"].tokenize(query)
    else:
        query = self.qid2toks[qid]

    # TODO find a way to calculate qlen/doclen stats earlier, so we can log them and check sanity of our values
    qlen, doclen = self.cfg["maxqlen"], self.cfg["maxdoclen"]

    posdoc = self.docid2toks.get(posid, None)
    if not posdoc:
        raise MissingDocError(qid, posid)

    idfs = padlist(self._get_idf(query), qlen, 0)
    query = self._tok2vec(padlist(query, qlen, self.pad_tok))
    posdoc = self._tok2vec(padlist(posdoc, doclen, self.pad_tok))

    # TODO determine whether pin_memory is happening. may not be, because we don't place the strings in a np or torch object
    data = {
        "qid": qid,
        "posdocid": posid,
        "idfs": np.array(idfs, dtype=np.float32),
        "query": np.array(query, dtype=np.int64),
        "posdoc": np.array(posdoc, dtype=np.int64),
        "query_idf": np.array(idfs, dtype=np.float32),
    }

    if not negid:
        logger.debug(f"missing negative doc id for qid {qid}")
        return data

    negdoc = self.docid2toks.get(negid, None)
    if not negdoc:
        raise MissingDocError(qid, negid)

    negdoc = self._tok2vec(padlist(negdoc, doclen, self.pad_tok))
    data["negdocid"] = negid
    data["negdoc"] = np.array(negdoc, dtype=np.int64)
    return data
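
# Minimal sketches of the _get_idf and _tok2vec helpers used above, under the
# assumption that the extractor keeps an idf lookup (self.idf) and a
# token-to-index vocabulary (self.stoi) with index 0 reserved for padding;
# both attribute names are assumptions, not confirmed fields.
def _get_idf(self, toks):
    # unseen tokens fall back to an idf weight of 0
    return [self.idf.get(tok, 0) for tok in toks]

def _tok2vec(self, toks):
    # map each token (including self.pad_tok) to its embedding index
    return [self.stoi.get(tok, 0) for tok in toks]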
def id2vec(self, qid, posid, negid=None):
    tokenizer = self.tokenizer
    qlen, doclen = self.config["maxqlen"], self.config["maxdoclen"]

    query_toks = tokenizer.convert_tokens_to_ids(self.qid2toks[qid])
    query_mask = self.get_mask(query_toks, qlen)
    query = padlist(query_toks, qlen)

    posdoc_toks = tokenizer.convert_tokens_to_ids(self.docid2toks[posid])
    posdoc_mask = self.get_mask(posdoc_toks, doclen)
    posdoc = padlist(posdoc_toks, doclen)

    data = {
        "qid": qid,
        "posdocid": posid,
        "idfs": np.zeros(qlen, dtype=np.float32),
        "query": np.array(query, dtype=np.int64),
        "query_mask": np.array(query_mask, dtype=np.int64),
        "posdoc": np.array(posdoc, dtype=np.int64),
        "posdoc_mask": np.array(posdoc_mask, dtype=np.int64),
        # this extractor does not compute idf weights, so query_idf mirrors the zeroed "idfs" entry
        "query_idf": np.zeros(qlen, dtype=np.float32),
        "negdocid": "",
        "negdoc": np.zeros(doclen, dtype=np.int64),
        "negdoc_mask": np.zeros(doclen, dtype=np.int64),
    }

    if negid:
        negdoc_toks = self.docid2toks.get(negid, None)
        if not negdoc_toks:
            raise MissingDocError(qid, negid)

        negdoc_toks = tokenizer.convert_tokens_to_ids(negdoc_toks)
        negdoc_mask = self.get_mask(negdoc_toks, doclen)
        negdoc = padlist(negdoc_toks, doclen)
        data["negdocid"] = negid
        data["negdoc"] = np.array(negdoc, dtype=np.int64)
        data["negdoc_mask"] = np.array(negdoc_mask, dtype=np.int64)

    return data
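
# A sketch of the get_mask helper used above, with its behavior inferred from
# the call sites rather than a confirmed implementation: it marks which of the
# padlen positions hold real tokens, with 1 for tokens that survive truncation
# and 0 for padding, so masked models can ignore the padded tail.
def get_mask(self, toks, padlen):
    n_real = min(len(toks), padlen)
    return [1] * n_real + [0] * (padlen - n_real)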