def __init__(self, contexts=None, fill_context_embeddings=True,
             device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
    super(LongQAModel, self).__init__()
    self.device = device
    self.c_model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base').to(device)
    self.c_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
    self.q_model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base').to(device)
    self.q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
    self.r_model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base').to(device)
    self.r_tokenizer = DPRReaderTokenizerFast.from_pretrained('facebook/dpr-reader-single-nq-base')
    self.contexts = contexts
    # Not enough time to load context embeddings in AWS SageMaker,
    # but can fill weights from saved state dict after loading model.
    # NOTE: `fill_context_embeddings` is currently unused; the embeddings
    # are always computed below.
    if not self.contexts:
        with open('code/contexts.json') as f:
            self.contexts = json.load(f)
    # Encode every context once up front so that retrieval at inference
    # time is a single matrix product against the cached embeddings.
    context_embeddings = []
    with torch.no_grad():
        for context in self.contexts:
            input_ids = self.c_tokenizer(context, return_tensors='pt').to(device)['input_ids']
            output = self.c_model(input_ids)
            context_embeddings.append(output.pooler_output)
    # Move the tensor to the device *before* wrapping it in nn.Parameter:
    # calling .to(device) on a Parameter returns a plain Tensor, which
    # silently drops it from the module's registered parameters.
    self.context_embeddings = nn.Parameter(torch.cat(context_embeddings, dim=0).to(device))
    print('cwd!:', os.getcwd())
    print(os.listdir('code'))
    self.noise_remover = joblib.load('code/filter_model.sav')
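# --- Usage sketch (not part of the original snippet): the method name
# `retrieve` and the `top_k` parameter are hypothetical, but illustrate how
# the cached `context_embeddings` support retrieval: embed the question,
# then rank contexts by inner product.
def retrieve(self, question, top_k=5):
    with torch.no_grad():
        input_ids = self.q_tokenizer(question, return_tensors='pt').to(self.device)['input_ids']
        q_embedding = self.q_model(input_ids).pooler_output   # shape (1, 768)
    scores = q_embedding @ self.context_embeddings.T          # shape (1, num_contexts)
    top = torch.topk(scores.squeeze(0), k=min(top_k, len(self.contexts)))
    return [(self.contexts[i.item()], s.item()) for i, s in zip(top.indices, top.values)]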
def get_model(
    pretrained_model_name_or_path: str = 'facebook/dpr-reader-single-nq-base',
    device: Optional[str] = None,
) -> DPRReader:
    device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
    device = torch.device(device)
    return DPRReader.from_pretrained(pretrained_model_name_or_path).to(device).eval()
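# --- Usage sketch (assumed, not from the original snippet): the default
# call picks CUDA when available; an explicit device string overrides it.
reader = get_model()                  # on GPU if one is present
assert not reader.training           # .eval() was applied
cpu_reader = get_model(device='cpu')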
def test_reader_inference(self):
    tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
    model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")
    model.to(torch_device)
    encoded_inputs = tokenizer(
        questions="What is love ?",
        titles="Haddaway",
        texts="What Is Love is a song recorded by the artist Haddaway",
        padding=True,
        return_tensors="pt",
    )
    encoded_inputs.to(torch_device)
    outputs = model(**encoded_inputs)
    # compare the actual values for a slice.
    expected_start_logits = torch.tensor(
        [[-10.3005, -10.7765, -11.4872, -11.6841, -11.9312, -10.3002, -9.8544, -11.7378, -12.0821, -10.2975]],
        dtype=torch.float,
        device=torch_device,
    )
    expected_end_logits = torch.tensor(
        [[-11.0684, -11.7041, -11.5397, -10.3465, -10.8791, -6.8443, -11.9959, -11.0364, -10.0096, -6.8405]],
        dtype=torch.float,
        device=torch_device,
    )
    self.assertTrue(torch.allclose(outputs.start_logits[:, :10], expected_start_logits, atol=1e-4))
    self.assertTrue(torch.allclose(outputs.end_logits[:, :10], expected_end_logits, atol=1e-4))
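# --- Illustration (assumed, not part of the original test): the same kind
# of inputs and outputs can be decoded into an answer string with the
# tokenizer's decode_best_spans helper.
tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")
encoded_inputs = tokenizer(
    questions="What is love ?",
    titles="Haddaway",
    texts="What Is Love is a song recorded by the artist Haddaway",
    padding=True,
    return_tensors="pt",
)
outputs = model(**encoded_inputs)
best_span = tokenizer.decode_best_spans(encoded_inputs, outputs, num_spans=1, max_answer_length=10)[0]
print(best_span.text, best_span.relevance_score)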
def __init__(
    self,
    dpr_fn: str,
    tokenizer_fn: str,
    tokenizer_max_len: int,
):
    self.dpr = DPRReader.from_pretrained(dpr_fn)
    self.tokenizer_max_len = tokenizer_max_len
    self.tokenizer = DPRReaderTokenizer.from_pretrained(
        tokenizer_fn, max_len=tokenizer_max_len)
    device = 'cuda' if cuda_is_available() else 'cpu'
    self.dpr.to(device)
    self.device = device
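# --- Usage sketch (assumed; the enclosing class is not shown, so
# `ReaderWrapper` below is a hypothetical name):
# wrapper = ReaderWrapper(
#     dpr_fn='facebook/dpr-reader-single-nq-base',
#     tokenizer_fn='facebook/dpr-reader-single-nq-base',
#     tokenizer_max_len=512,
# )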
def __init__(self):
    self.context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
        'facebook/dpr-ctx_encoder-single-nq-base')
    self.context_model = DPRContextEncoder.from_pretrained(
        'facebook/dpr-ctx_encoder-single-nq-base', return_dict=True)
    self.query_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
        'facebook/dpr-question_encoder-single-nq-base')
    self.query_encoder = DPRQuestionEncoder.from_pretrained(
        'facebook/dpr-question_encoder-single-nq-base')
    self.reader_tokenizer = DPRReaderTokenizer.from_pretrained(
        'facebook/dpr-reader-single-nq-base')
    self.reader_model = DPRReader.from_pretrained(
        'facebook/dpr-reader-single-nq-base', return_dict=True)
    # Dimensionality of DPR's BERT-base pooled output.
    self.vector_length = 768
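# --- Pipeline sketch (assumed; the method name `embed` is hypothetical,
# not from the original snippet). Both encoders map text to vectors of
# length `self.vector_length` (768); the inner product of a query vector
# with context vectors is what ranks passages for the reader.
def embed(self, text, is_query=False):
    tokenizer = self.query_tokenizer if is_query else self.context_tokenizer
    model = self.query_encoder if is_query else self.context_model
    input_ids = tokenizer(text, return_tensors='pt', truncation=True)['input_ids']
    with torch.no_grad():
        return model(input_ids).pooler_output.squeeze(0)  # shape (768,)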
def test_model_from_pretrained(self):
    # The original repeated the context-encoder loop twice; once is enough.
    for model_name in DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        model = DPRContextEncoder.from_pretrained(model_name)
        self.assertIsNotNone(model)
    for model_name in DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        model = DPRQuestionEncoder.from_pretrained(model_name)
        self.assertIsNotNone(model)
    for model_name in DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        model = DPRReader.from_pretrained(model_name)
        self.assertIsNotNone(model)
def __init__(self,
             model_name: str,
             tokenizer_name: str = None,
             span_selection_rules=None,
             num_spans: int = 1,
             max_answer_length: int = 10,
             num_spans_per_passage: int = 10,
             batch_size: int = 16,
             device: str = 'cuda:0'):
    # Avoid a mutable default argument: fall back to plain DPR span selection.
    if span_selection_rules is None:
        span_selection_rules = [DprSelection()]
    self.device = device
    self.model = DPRReader.from_pretrained(model_name).to(self.device).eval()
    if tokenizer_name:
        self.tokenizer = DPRReaderTokenizer.from_pretrained(tokenizer_name)
    else:
        self.tokenizer = DPRReaderTokenizer.from_pretrained(model_name)
    self.span_selection_rules = span_selection_rules
    self.num_spans = num_spans
    self.max_answer_length = max_answer_length
    self.num_spans_per_passage = num_spans_per_passage
    self.batch_size = batch_size
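# --- Sketch (assumed; `read` is a hypothetical method, not from the
# original snippet) of how the stored settings map onto
# DPRReaderTokenizer.decode_best_spans, which wrappers like this one
# typically delegate to.
def read(self, question, titles, texts):
    inputs = self.tokenizer(questions=question, titles=titles, texts=texts,
                            padding=True, return_tensors='pt').to(self.device)
    with torch.no_grad():
        outputs = self.model(**inputs)
    return self.tokenizer.decode_best_spans(
        inputs, outputs,
        num_spans=self.num_spans,
        max_answer_length=self.max_answer_length,
        num_spans_per_passage=self.num_spans_per_passage,
    )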
class DPRReader:
    # NOTE: this class reuses the name of transformers' DPRReader; inside
    # the class body the name still resolves to the imported model class.
    reader_tokenizer = DPRReaderTokenizer.from_pretrained(
        'facebook/dpr-reader-single-nq-base')
    reader_model = DPRReader.from_pretrained(
        'facebook/dpr-reader-single-nq-base', return_dict=True)

    MAX_TOKENS = 512
    MAX_TOKENS_QUESTION = 30
    MAX_TOKENS_DOCUMENT = MAX_TOKENS - MAX_TOKENS_QUESTION - 2  # [CLS] and [SEP]

    def __init__(self):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        if self.device == 'cuda':
            self.reader_model = self.reader_model.cuda()

    def _reconstruct_tokens(self, bert_tokens):
        '''Reassemble WordPiece tokens into a human-readable string.'''
        output_string = ''
        for token in bert_tokens:
            if token[:2] == '##':
                output_string += token[2:]
            else:
                output_string += ' ' + token
        return output_string[1:]

    def get_token_length(self, string):
        tokens = self.reader_tokenizer.encode(string)
        return len(tokens)

    def chunk_document(self, document, re_consolidate=True):
        '''Chunks up a long document into optimally large pieces so that
        they can be passed to BERT. Activating `re_consolidate` will put
        the chunks back together to make them as large as possible for
        improved performance.
        '''
        document_length = self.get_token_length(document)
        if document_length <= self.MAX_TOKENS_DOCUMENT:
            return [document]
        approved_chunks = []
        paragraphs = [par for par in document.split('\n') if par]
        for paragraph in paragraphs:
            paragraph_length = self.get_token_length(paragraph)
            if paragraph_length > self.MAX_TOKENS_DOCUMENT:
                sentences = [sen for sen in paragraph.split('.') if sen]
                for sentence in sentences:
                    sentence_length = self.get_token_length(sentence)
                    if sentence_length > self.MAX_TOKENS_DOCUMENT:
                        print('Ignoring overlong sentence.')
                    else:
                        approved_chunks.append(sentence)
            else:
                approved_chunks.append(paragraph)
        if not re_consolidate:
            return approved_chunks
        lengths = [self.get_token_length(chunk) for chunk in approved_chunks]
        consolidated_chunks = []
        running_length = 0
        current_chunk = ''
        for chunk, length in zip(approved_chunks, lengths):
            if (running_length + length) < self.MAX_TOKENS_DOCUMENT:
                # Join with a space so the last word of one chunk doesn't
                # fuse with the first word of the next.
                current_chunk = f'{current_chunk} {chunk}'.strip()
                running_length += length
            else:
                consolidated_chunks.append(current_chunk)
                current_chunk = chunk
                running_length = length
        # Flush the final chunk; the original loop silently dropped it.
        if current_chunk:
            consolidated_chunks.append(current_chunk)
        return consolidated_chunks

    def read_documents(self, question: str, documents: List[str],
                       titles: List[str]):
        encoded_inputs = self.reader_tokenizer(questions=question,
                                               titles=titles,
                                               texts=documents,
                                               return_tensors='pt',
                                               padding=True)
        input_ids = encoded_inputs['input_ids']
        encoded_inputs = encoded_inputs.to(self.device)
        outputs = self.reader_model(**encoded_inputs)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits
        relevance_logits = outputs.relevance_logits
        responses = []
        for i in range(len(documents)):
            title = titles[i]
            document = documents[i]
            start = start_logits[i]
            end = end_logits[i]
            relevance = relevance_logits[i]
            inp_ids = input_ids[i]
            input_tokens = self.reader_tokenizer.convert_ids_to_tokens(inp_ids)
            answer_start = int(start.argmax())
            answer_end = int(end.argmax())
            relevance = float(relevance.max())
            answer_tokens = input_tokens[answer_start:answer_end + 1]
            answer_str = self._reconstruct_tokens(answer_tokens)
            response = {
                'answer': answer_str,
                'relevance': relevance,
                'title': title,
                'document': document
            }
            responses.append(response)
        # list.sort() sorts in place and returns None, so sort first and
        # then return the list itself (the original assigned the None).
        responses.sort(key=lambda x: -x['relevance'])
        return responses

    def read_chunked_document(self, question: str, document: str, title: str):
        chunked_docs = self.chunk_document(document)
        titles_list = [title for i in range(len(chunked_docs))]
        return self.read_documents(question, chunked_docs, titles_list)
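# --- Usage sketch (not part of the original class; the two-paragraph
# document below is a stand-in for a real long article).
reader = DPRReader()
document = ('What Is Love is a song recorded by the artist Haddaway.\n'
            'It was released in May 1993 as the lead single from his debut album.')
responses = reader.read_chunked_document(
    question='who recorded what is love?',
    document=document,
    title='What Is Love',
)
print(responses[0]['answer'], responses[0]['relevance'])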
class DPRReader(DocumentChunker):
    '''
    Class for "reading" retrieved documents with DPR, which performs two
    functions: re-ranking them and providing candidate answers to the
    question.
    '''
    reader_tokenizer = DPRReaderTokenizer.from_pretrained(
        'facebook/dpr-reader-single-nq-base')
    reader_model = DPRReader.from_pretrained(
        'facebook/dpr-reader-single-nq-base', return_dict=True)

    def __init__(self):
        # The original called super(DocumentChunker).__init__(), which
        # builds an unbound super object and never runs the parent
        # initializer; plain super() does what was intended.
        super().__init__()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        if self.device == 'cuda':
            self.reader_model = self.reader_model.cuda()

    def _reconstruct_tokens(self, bert_tokens: List[str]):
        '''
        Utility function for reassembling WordPiece tokens into
        human-readable strings.
        '''
        output_string = ''
        for token in bert_tokens:
            if token[:2] == '##':
                output_string += token[2:]
            else:
                output_string += ' ' + token
        return output_string[1:]

    def read_documents(self, question: str, documents: List[str],
                       titles: List[str]):
        '''
        Reads a series of `documents` and `titles`, rates their relevance
        to the `question`, and proposes an answer from each.

        Args:
            question (str): The question string (e.g. `who is bill gates?`)
            documents (List[str]): List of documents to rate/propose an answer from.
            titles (List[str]): List of the titles of those documents.
        '''
        assert len(documents) == len(titles)
        encoded_inputs = self.reader_tokenizer(questions=question,
                                               titles=titles,
                                               texts=documents,
                                               return_tensors='pt',
                                               padding=True)
        input_ids = encoded_inputs['input_ids']
        encoded_inputs = encoded_inputs.to(self.device)
        outputs = self.reader_model(**encoded_inputs)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits
        relevance_logits = outputs.relevance_logits
        responses = []
        for i in range(len(documents)):
            title = titles[i]
            document = documents[i]
            start = start_logits[i]
            end = end_logits[i]
            relevance = relevance_logits[i]
            inp_ids = input_ids[i]
            input_tokens = self.reader_tokenizer.convert_ids_to_tokens(inp_ids)
            answer_start = int(start.argmax())
            answer_end = int(end.argmax())
            relevance = float(relevance.max())
            answer_tokens = input_tokens[answer_start:answer_end + 1]
            answer_str = self._reconstruct_tokens(answer_tokens)
            response = {
                'answer': answer_str,
                'relevance': relevance,
                'title': title,
                'document': document
            }
            responses.append(response)
        return responses

    def read_chunked_document(self, question: str, document: str, title: str):
        '''
        Read a single document that may exceed the maximum length BERT can
        handle by chunking it into pieces. For args see
        DPRReader.read_documents().
        '''
        chunked_docs = self.chunk_document(document)
        titles_list = [title for i in range(len(chunked_docs))]
        return self.read_documents(question, chunked_docs, titles_list)
def __init__(self):
    self.r_encoder = DPRReader.from_pretrained(
        "facebook/dpr-reader-single-nq-base").to(Config.device)
    self.r_tokenizer = DPRReaderTokenizerFast.from_pretrained(
        "facebook/dpr-reader-single-nq-base")
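# --- Usage sketch (assumed; `Wrapper` stands in for the unnamed enclosing
# class, and Config.device comes from the snippet's own configuration).
# DPRReaderTokenizerFast provides the same decode_best_spans helper as the
# slow tokenizer.
wrapper = Wrapper()
encoded = wrapper.r_tokenizer(
    questions="who wrote hamlet?",
    titles="Hamlet",
    texts="Hamlet is a tragedy written by William Shakespeare.",
    return_tensors="pt",
).to(Config.device)
with torch.no_grad():
    outputs = wrapper.r_encoder(**encoded)
best_span = wrapper.r_tokenizer.decode_best_spans(encoded, outputs, num_spans=1)[0]
print(best_span.text)  # expected to name Shakespeare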