def get_positions(self, input_ids: T, tensorizer: Tensorizer, model: torch.nn.Module = None):
    # Lazily resolve the special token's id on first use.
    if not self.token_id:
        self.token_id = tensorizer.get_token_id(self.token)
    token_indexes = (input_ids == self.token_id).nonzero()

    # Check whether every sample in input_ids contains the special token;
    # fall back to a default position for any sample that doesn't.
    bsz = input_ids.size(0)
    if bsz == token_indexes.size(0):
        return token_indexes

    token_indexes_result = []
    found_idx_cnt = 0
    for i in range(bsz):
        if (
            found_idx_cnt < token_indexes.size(0)
            and token_indexes[found_idx_cnt][0] == i
        ):
            # This sample has the special token.
            token_indexes_result.append(token_indexes[found_idx_cnt])
            found_idx_cnt += 1
        else:
            logger.warning("missing special token %s", input_ids[i])
            # Fall back to the 0-th token, i.e. CLS for BERT, as the special one.
            token_indexes_result.append(
                torch.tensor([i, 0]).to(input_ids.device)
            )

    token_indexes_result = torch.stack(token_indexes_result, dim=0)
    return token_indexes_result
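
# Illustrative sketch (not part of the original module): how the (row, col)
# positions returned by get_positions are typically consumed -- selecting one
# representation vector per sample from the encoder's hidden states. The
# function name and shape conventions here are assumptions for demonstration.
def _gather_rep_vectors_sketch(sequence_output: T, positions: T) -> T:
    # sequence_output: (bsz, seq_len, hidden); positions: (bsz, 2), where
    # positions[i] = (i, column_of_special_token). Advanced indexing picks
    # exactly one hidden vector per sample.
    return sequence_output[positions[:, 0], positions[:, 1], :]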
def _select_span_with_token(
    text: str, tensorizer: Tensorizer, token_str: str = "[START_ENT]"
) -> T:
    token_id = tensorizer.get_token_id(token_str)
    query_tensor = tensorizer.text_to_tensor(text)

    if token_id not in query_tensor:
        # The marker token was truncated away; re-tokenize without the
        # max-length cap and cut a window around the marker instead.
        query_tensor_full = tensorizer.text_to_tensor(text, apply_max_len=False)
        token_indexes = (query_tensor_full == token_id).nonzero()
        if token_indexes.size(0) > 0:
            start_pos = token_indexes[0, 0].item()
            # Add some randomization to avoid overfitting to a specific token position.
            left_shift = int(tensorizer.max_length / 2)
            rnd_shift = int((rnd.random() - 0.5) * left_shift / 2)
            left_shift += rnd_shift

            query_tensor = query_tensor_full[start_pos - left_shift :]
            cls_id = tensorizer.tokenizer.cls_token_id
            if query_tensor[0] != cls_id:
                query_tensor = torch.cat([torch.tensor([cls_id]), query_tensor], dim=0)

            from dpr.models.reader import _pad_to_len

            query_tensor = _pad_to_len(
                query_tensor, tensorizer.get_pad_id(), tensorizer.max_length
            )
            query_tensor[-1] = tensorizer.tokenizer.sep_token_id

            assert token_id in query_tensor, "query_tensor={}".format(query_tensor)
            return query_tensor
        else:
            raise RuntimeError(
                "[START_ENT] token not found for Entity Linking sample query={}".format(
                    text
                )
            )
    else:
        return query_tensor
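
# Illustrative sketch (an assumption, not the original dpr.models.reader
# helper): _pad_to_len imported above is expected to truncate a 1-D id tensor
# to max_len or right-pad it with pad_id, roughly as follows.
def _pad_to_len_sketch(seq: T, pad_id: int, max_len: int) -> T:
    s_len = seq.size(0)
    if s_len > max_len:
        # Truncate over-long sequences.
        return seq[:max_len]
    # Right-pad short sequences with pad_id up to max_len.
    return torch.cat(
        [seq, torch.full((max_len - s_len,), pad_id, dtype=torch.long)], dim=0
    )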