def __init__(self, contexts=None, fill_context_embeddings=True, device=None):
    """Build the DPR-based long-form QA model.

    Args:
        contexts: Optional list of context passages. When falsy, the
            passages are loaded from 'code/contexts.json'.
        fill_context_embeddings: Accepted for backward compatibility.
            NOTE(review): this flag is never read in the current code
            path — confirm whether callers still pass it.
        device: torch.device for all sub-models. Defaults to CUDA when
            available, resolved at call time (the original resolved it
            once at import time via the default-argument expression).
    """
    super(LongQAModel, self).__init__()
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.device = device
    # Context (passage) encoder + tokenizer.
    self.c_model = DPRContextEncoder.from_pretrained(
        'facebook/dpr-ctx_encoder-single-nq-base').to(device)
    self.c_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
        'facebook/dpr-ctx_encoder-single-nq-base')
    # Question (query) encoder + tokenizer.
    self.q_model = DPRQuestionEncoder.from_pretrained(
        'facebook/dpr-question_encoder-single-nq-base').to(device)
    self.q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
        'facebook/dpr-question_encoder-single-nq-base')
    # Reader that scores/extracts answer spans from retrieved passages.
    self.r_model = DPRReader.from_pretrained(
        'facebook/dpr-reader-single-nq-base').to(device)
    self.r_tokenizer = DPRReaderTokenizerFast.from_pretrained(
        'facebook/dpr-reader-single-nq-base')
    self.contexts = contexts
    # Not enough time to load context embeddings in AWS SageMaker,
    # but can fill weights from saved state dict after loading model.
    if not self.contexts:
        with open('code/contexts.json') as f:
            self.contexts = json.load(f)
    # Pre-compute one pooled embedding per context; inference only.
    context_embeddings = []
    with torch.no_grad():
        for context in self.contexts:
            input_ids = self.c_tokenizer(
                context, return_tensors='pt').to(device)["input_ids"]
            output = self.c_model(input_ids)
            context_embeddings.append(output.pooler_output)
    # Bug fix: the original wrapped the tensor in nn.Parameter and then
    # called .to(device) on the Parameter; Tensor.to() returns a plain
    # Tensor whenever a copy is made, silently dropping the Parameter
    # registration. Move the data first, then wrap it.
    self.context_embeddings = nn.Parameter(
        torch.cat(context_embeddings, dim=0).to(device))
    print('cwd!:', os.getcwd())
    print(os.listdir('code'))
    # NOTE(review): joblib.load is pickle-based — only load trusted files.
    self.noise_remover = joblib.load('code/filter_model.sav')
def test_inference_no_head(self):
    """The encoder's pooled output must reproduce the reference slice."""
    encoder = DPRQuestionEncoder.from_pretrained(
        "facebook/dpr-question_encoder-single-nq-base", return_dict=False)
    encoder.to(torch_device)
    # [CLS] hello, is my dog cute? [SEP]
    question_ids = torch.tensor(
        [[101, 7592, 1010, 2003, 2026, 3899, 10140, 1029, 102]],
        dtype=torch.long, device=torch_device)
    embedding = encoder(question_ids)[0]  # embedding shape = (1, 768)
    # compare the actual values for a slice.
    reference = torch.tensor(
        [[0.03236253, 0.12753335, 0.16818509, 0.00279786, 0.3896933,
          0.24264945, 0.2178971, -0.02335227, -0.08481959, -0.14324117]],
        dtype=torch.float, device=torch_device)
    self.assertTrue(
        torch.allclose(embedding[:, :10], reference, atol=1e-4))
def __init__(self, model_name: str, tokenizer_name: str = None, device: str = 'cpu'):
    """Load a DPR question encoder and matching tokenizer onto `device`.

    Args:
        model_name: Hub name or local path of the question encoder.
        tokenizer_name: Optional separate tokenizer; falls back to
            `model_name` when not provided.
        device: Target device string, e.g. 'cpu' or 'cuda:0'.
    """
    self.device = device
    encoder = DPRQuestionEncoder.from_pretrained(model_name)
    encoder.to(self.device)
    self.model = encoder
    self.tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
        tokenizer_name or model_name)
def __init__(self):
    """Load the multiset-base DPR encoder/tokenizer pairs; encoders go to Config.device."""
    ctx_name = "facebook/dpr-ctx_encoder-multiset-base"
    q_name = "facebook/dpr-question_encoder-multiset-base"
    self.ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(ctx_name)
    self.q_tokenizer = DPRQuestionEncoderTokenizerFast.from_pretrained(q_name)
    self.ctx_encoder = DPRContextEncoder.from_pretrained(ctx_name).to(Config.device)
    self.q_encoder = DPRQuestionEncoder.from_pretrained(q_name).to(Config.device)
def __init__(self, encoder_dir: str = None, encoded_query_dir: str = None, device: str = 'cpu'):
    """Query encoder that works from a model directory, pre-encoded queries, or both.

    Args:
        encoder_dir: Path to a saved DPRQuestionEncoder; when given, the
            model and tokenizer are loaded and moved to `device`.
        encoded_query_dir: Passed through to the parent initializer,
            which (per the check below) sets `self.has_encoded_query`.
        device: Target device string for the encoder.

    Raises:
        Exception: when neither an encoder nor encoded queries are available.
    """
    super().__init__(encoded_query_dir)
    # Bug fix: has_model must exist even when encoder_dir is not given;
    # previously the final check raised AttributeError instead of the
    # intended Exception (the sibling class in this file initializes the
    # flag up front).
    self.has_model = False
    if encoder_dir:
        self.device = device
        self.model = DPRQuestionEncoder.from_pretrained(encoder_dir)
        self.model.to(self.device)
        self.tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(encoder_dir)
        self.has_model = True
    if (not self.has_model) and (not self.has_encoded_query):
        raise Exception('Neither query encoder model nor encoded queries provided. Please provide at least one')
def __init__(self):
    """Load single-nq-base DPR question and context encoders and move both to DEVICE."""
    q_name = 'facebook/dpr-question_encoder-single-nq-base'
    d_name = 'facebook/dpr-ctx_encoder-single-nq-base'
    self.tokenizer_q = DPRQuestionEncoderTokenizer.from_pretrained(q_name)
    self.model_q = DPRQuestionEncoder.from_pretrained(q_name)
    self.tokenizer_d = DPRContextEncoderTokenizer.from_pretrained(d_name)
    self.model_d = DPRContextEncoder.from_pretrained(d_name)
    self.model_q.to(DEVICE)
    self.model_d.to(DEVICE)
def test_init_changed_config(self):
    """A saved encoder must reload even when projection_dim is overridden."""
    config = self.model_tester.prepare_config_and_inputs()[0]
    encoder = DPRQuestionEncoder(config=config)
    encoder.to(torch_device)
    encoder.eval()
    with tempfile.TemporaryDirectory() as save_dir:
        encoder.save_pretrained(save_dir)
        reloaded = DPRQuestionEncoder.from_pretrained(save_dir, projection_dim=512)
    self.assertIsNotNone(reloaded)
def __init__(self):
    """Load the full single-nq-base DPR stack: context/question encoders plus reader."""
    ctx_name = 'facebook/dpr-ctx_encoder-single-nq-base'
    q_name = 'facebook/dpr-question_encoder-single-nq-base'
    reader_name = 'facebook/dpr-reader-single-nq-base'
    self.context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(ctx_name)
    self.context_model = DPRContextEncoder.from_pretrained(ctx_name, return_dict=True)
    self.query_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(q_name)
    self.query_encoder = DPRQuestionEncoder.from_pretrained(q_name)
    self.reader_tokenizer = DPRReaderTokenizer.from_pretrained(reader_name)
    self.reader_model = DPRReader.from_pretrained(reader_name, return_dict=True)
    # Pooled DPR embeddings are 768-dimensional (BERT-base hidden size).
    self.vector_length = 768
def test_model_from_pretrained(self):
    """Smoke-test that each DPR model class loads from its first hub checkpoint."""
    # Bug fix: the context-encoder loop appeared twice verbatim in the
    # original; the duplicate only re-downloaded the same checkpoint.
    for model_name in DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        model = DPRContextEncoder.from_pretrained(model_name)
        self.assertIsNotNone(model)
    for model_name in DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        model = DPRQuestionEncoder.from_pretrained(model_name)
        self.assertIsNotNone(model)
    for model_name in DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        model = DPRReader.from_pretrained(model_name)
        self.assertIsNotNone(model)
def __init__(self, encoder_dir: str = None, tokenizer_name: str = None,
             encoded_query_dir: str = None, device: str = 'cpu'):
    """Query encoder backed by a model directory, pre-encoded queries, or both.

    Args:
        encoder_dir: Path to a saved DPRQuestionEncoder.
        tokenizer_name: Optional tokenizer override; defaults to `encoder_dir`.
        encoded_query_dir: Directory of pre-computed query embeddings.
        device: Target device string for the encoder.

    Raises:
        Exception: when neither an encoder nor encoded queries are given.
    """
    self.has_model = False
    self.has_encoded_query = False
    if encoded_query_dir:
        self.embedding = self._load_embeddings(encoded_query_dir)
        self.has_encoded_query = True
    if encoder_dir:
        self.device = device
        self.model = DPRQuestionEncoder.from_pretrained(encoder_dir)
        self.model.to(self.device)
        self.tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
            tokenizer_name or encoder_dir)
        self.has_model = True
    if not (self.has_model or self.has_encoded_query):
        raise Exception('Neither query encoder model nor encoded queries provided. Please provide at least one')
def download_model(outputdir_question_tokenizer: str,
                   outputdir_question_encoder: str,
                   outputdir_ctx_tokenizer: str,
                   outputdir_ctx_encoder: str):
    """Fetch the single-nq-base DPR tokenizers/encoders and save each locally.

    Args:
        outputdir_question_tokenizer: Destination for the question tokenizer.
        outputdir_question_encoder: Destination for the question encoder.
        outputdir_ctx_tokenizer: Destination for the context tokenizer.
        outputdir_ctx_encoder: Destination for the context encoder.
    """
    # (loader class, hub checkpoint, progress message, save directory);
    # messages are kept byte-identical to the original prints.
    steps = (
        (DPRQuestionEncoderTokenizer,
         "facebook/dpr-question_encoder-single-nq-base",
         "Save question tokenizer to ", outputdir_question_tokenizer),
        (DPRQuestionEncoder,
         "facebook/dpr-question_encoder-single-nq-base",
         "Save question encoder to ", outputdir_question_encoder),
        (DPRContextEncoderTokenizer,
         "facebook/dpr-ctx_encoder-single-nq-base",
         "Save context tokenizer to ", outputdir_ctx_tokenizer),
        (DPRContextEncoder,
         "facebook/dpr-ctx_encoder-single-nq-base",
         "Save context encoder to", outputdir_ctx_encoder),
    )
    for loader, checkpoint, message, target_dir in steps:
        artifact = loader.from_pretrained(checkpoint)
        print(message, target_dir)
        artifact.save_pretrained(target_dir)
# Service bootstrap: wires an ELECTRA reader and a DPR retriever into a
# Flask app. Everything below runs once at import time, CPU-only.
from reader import get_answer

# Reader: ELECTRA QA head restored from a locally fine-tuned checkpoint.
model = ElectraForQuestionAnswering.from_pretrained("Reader/electra_QA").to(
    device=torch.device('cpu'))
model.load_state_dict(
    torch.load('Reader/weight_electra/weights_3.pth',
               map_location=torch.device('cpu')))
model.eval()
# WordPiece tokenizer matching the ELECTRA vocabulary (lowercased input).
tokenizer = BertWordPieceTokenizer("Reader/electra_base_uncased/vocab.txt",
                                   lowercase=True)
# Inference-only process: disable autograd globally.
torch.set_grad_enabled(False)

# Retriever, question side: stock DPR tokenizer + locally fine-tuned encoder.
q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base")
q_encoder = DPRQuestionEncoder.from_pretrained(
    "Retrieval/question_encoder").to(device=torch.device('cpu'))
q_encoder.eval()

# ctx_tokenizer = BertWordPieceTokenizer("ctx_tokenizer/vocab.txt", lowercase=True)
# Retriever, context side: stock DPR tokenizer + locally fine-tuned encoder.
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base")
ctx_encoder = DPRContextEncoder.from_pretrained("Retrieval/ctx_encoder").to(
    device=torch.device('cpu'))
ctx_encoder.eval()

app = Flask(__name__)


@app.route('/')
def home():
    # Landing page; QA endpoints are presumably registered elsewhere in
    # this file — not visible in this chunk.
    return render_template('home.html')
# Definition-lookup web API: a DPR question encoder plus a FAISS-indexed
# datasets knowledge base, served through Flask. Runs once at import time.
from flask import request, jsonify
import torch
from torch import nn
import pandas as pd
import numpy as np
from datasets import Features, Sequence, Value, load_dataset, load_from_disk
from transformers import (DPRContextEncoder, DPRQuestionEncoder,
                          DPRQuestionEncoderTokenizerFast)
import faiss
import operator

# Stock multiset-base tokenizer paired with a locally fine-tuned encoder
# loaded from the 'Cencoder' directory; CPU inference only.
tokenizer = DPRQuestionEncoderTokenizerFast.from_pretrained(
    'facebook/dpr-question_encoder-multiset-base')
model = DPRQuestionEncoder.from_pretrained('Cencoder')
model.to(torch.device('cpu'))
model.eval()
print('--- Model Loaded ---')

# Knowledge base: a datasets Dataset with a prebuilt HNSW index over its
# 'embeddings' column, both restored from disk.
dataset = load_from_disk('def_index/my_knowledge_dataset/')
dataset.load_faiss_index("embeddings",
                         'def_index/my_knowledge_dataset_hnsw_index.faiss')

app = Flask(__name__)
# NOTE(review): debug mode enabled at module level — disable in production.
app.debug = True


@app.route("/", methods=['GET'])
def home():
    # Plain-HTML landing page describing the API.
    return "<center><h1>Flask web API</h1><p>Returns definitions for queries</center>"
help='qas file format', default='json', required=False) parser.add_argument('--output', type=str, help='path to store query embeddings', required=True) parser.add_argument('--device', type=str, help='device cpu or cuda [cuda:0, cuda:1...]', default='cpu', required=False) args = parser.parse_args() tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(args.encoder) model = DPRQuestionEncoder.from_pretrained(args.encoder) model.to(args.device) embeddings = {'id': [], 'text': [], 'embedding': []} qa_parser = None if args.format == 'csv': qa_parser = parse_qa_csv_file elif args.format == 'json': qa_parser = parse_qa_json_file if qa_parser is None: print( f'No QA parser defined for file format: {args.format}, or format not match' ) for qid, (question, answers) in enumerate(tqdm(list(qa_parser(args.input)))): embeddings['id'].append(qid)
# Training setup for a DPR bi-encoder: loads the multiset-base context and
# question encoders onto the GPU and prepares hyperparameters and data.
import torch
import torch.nn as nn
import torch.nn.functional as F
from CustomDPRDataset import CustomDPRDataset
from tqdm import tqdm
import sys
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer, AdamW, get_linear_schedule_with_warmup

# initialize tokenizers and models for context encoder and question encoder
context_name = 'facebook/dpr-ctx_encoder-multiset-base'  # set to what context encoder we want to use
question_name = 'facebook/dpr-question_encoder-multiset-base'  # set to what question encoder we want to use
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(context_name)
context_model = DPRContextEncoder.from_pretrained(context_name).cuda()
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(question_name)
question_model = DPRQuestionEncoder.from_pretrained(question_name).cuda()
# Negative log-likelihood criterion; presumably applied to log-softmaxed
# in-batch similarity scores in the training loop below — not visible here.
nll = nn.NLLLoss()
# question_model.half()
# context_model.half()
# params
batch_size = 256  # examples per effective optimization step
grad_accum = 8  # micro-batches accumulated per optimizer step
lr = 1e-5  # learning rate (for AdamW, per the import above)
text_descrip = "batchsize256_gradaccum8_v2"  # run tag; presumably names checkpoints/logs
print("intialized models/tokenizers")
# initialize dataset
train_dataset = CustomDPRDataset()
class DPRIndex(DocumentChunker):
    '''
    Class for indexing and searching documents, using a combination of
    vectors produced by DPR and keyword matching from Elastic TF-IDF. As a
    subclass of DocumentChunker, this class automatically handles document
    chunking as well.
    '''
    INDEX_NAME = 'dense-passage-retrieval'
    # Dimensionality of DPR pooled embeddings (BERT-base hidden size).
    D = 768
    # Encoders/tokenizers live on the class so they load once per process.
    context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
        'facebook/dpr-ctx_encoder-single-nq-base')
    context_model = DPRContextEncoder.from_pretrained(
        'facebook/dpr-ctx_encoder-single-nq-base', return_dict=True)
    question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
        'facebook/dpr-question_encoder-single-nq-base')
    question_model = DPRQuestionEncoder.from_pretrained(
        'facebook/dpr-question_encoder-single-nq-base', return_dict=True)

    def __init__(self, documents: List[DPRDocument]):
        '''Build the FAISS and Elasticsearch indices over `documents`.'''
        # Bug fix: `super(DocumentChunker).__init__()` built an unbound
        # super object and never ran the parent initializer.
        # NOTE(review): confirm DocumentChunker.__init__ takes no arguments.
        super().__init__()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        if self.device == 'cuda':
            # Bug fix: the original moved `self.reader_model`, an attribute
            # this class never defines (AttributeError on CUDA hosts). The
            # encoders used below are what actually need to be on the GPU.
            self.context_model = self.context_model.cuda()
            self.question_model = self.question_model.cuda()
        self.faiss_index = faiss.IndexFlatIP(self.D)
        self._setup_elastic_index()
        self._build_index(documents)

    def _setup_elastic_index(self):
        '''Sets up the Elastic Index. Deletes old ones if needed.'''
        self.es = Elasticsearch()
        if self.es.indices.exists(self.INDEX_NAME):
            logging.warning(f'Deleting old index for {self.INDEX_NAME}.')
            self.es.indices.delete(self.INDEX_NAME)
        self.es.indices.create(index=self.INDEX_NAME)

    def _build_index(self, documents):
        '''
        Chunk every document, add each chunk's embedding to FAISS, mirror
        the chunk text into Elasticsearch, and record the chunk<->document
        mappings used by the search methods.
        '''
        self.documents = documents
        self.doc_bodies = [doc.body for doc in self.documents]
        self.chunks = []
        self.chunk_index = {}  # {chunk: document}
        self.inverse_chunk_index = {}  # {document: [chunks]}
        chunk_counter = 0
        for doc_counter, doc_body in tqdm(enumerate(self.doc_bodies),
                                          total=len(self.doc_bodies)):
            self.inverse_chunk_index[doc_counter] = []
            chunked_docs = self.chunk_document(doc_body)
            self.chunks.extend(chunked_docs)
            for chunked_doc in chunked_docs:
                chunk_embedding = self.embed_context(chunked_doc)
                self.faiss_index.add(chunk_embedding)
                # Chunk ids are shared between FAISS order and ES doc ids.
                self.es.create(self.INDEX_NAME, id=chunk_counter,
                               body={'chunk': chunked_doc})
                self.chunk_index[chunk_counter] = doc_counter
                self.inverse_chunk_index[doc_counter].append(chunk_counter)
                chunk_counter += 1
        self.total_docs = len(self.documents)
        self.total_chunks = len(self.chunks)

    def embed_question(self, question: str):
        '''Embed the question in vector space with the question encoder.'''
        input_ids = self.question_tokenizer(
            question, return_tensors='pt')['input_ids']
        # Bug fix: keep inputs on the model's device, and bring the output
        # back to CPU before .numpy(), which raises on CUDA tensors.
        input_ids = input_ids.to(next(self.question_model.parameters()).device)
        embeddings = self.question_model(
            input_ids).pooler_output.detach().cpu().numpy()
        return embeddings

    def embed_context(self, context: str):
        '''Embed the context (doc) in vector space with the context encoder.'''
        input_ids = self.context_tokenizer(
            context, return_tensors='pt')['input_ids']
        # Same device handling as embed_question.
        input_ids = input_ids.to(next(self.context_model.parameters()).device)
        embeddings = self.context_model(
            input_ids).pooler_output.detach().cpu().numpy()
        return embeddings

    def search_dense_index(self, question: str, k: int = 5):
        '''
        Search the vector index by encoding the question and then performing
        nearest neighbor on the FAISS index of context vectors.

        Args:
            question (str): The natural language question, e.g. `who is
                bill gates?`
            k (int): The number of documents to return from the index;
                clamped to the number of indexed chunks.
        '''
        if k > self.total_chunks:
            k = self.total_chunks
        question_embedding = self.embed_question(question)
        dists, chunk_ids = self.faiss_index.search(question_embedding, k=k)
        dists, chunk_ids = list(dists[0]), list(chunk_ids[0])
        dists = list(map(float, dists))  # For Flask
        structured_response = []
        for dist, chunk_id in zip(dists, chunk_ids):
            chunk = self.chunks[chunk_id]
            document_id = self.chunk_index[chunk_id]
            document = self.documents[document_id]
            blob = {
                'document': document,
                'document_id': document_id,
                'chunk': chunk,
                'chunk_id': int(chunk_id),  # For Flask
                'faiss_dist': dist
            }
            structured_response.append(blob)
        return structured_response

    def search_sparse_index(self, query):
        '''Keyword (TF-IDF) search against the Elasticsearch index; top 10 hits.'''
        body = {'size': 10, 'query': {'match': {'chunk': query}}}
        results = self.es.search(index=self.INDEX_NAME, body=body)
        hits = results['hits']['hits']
        return hits

    def _merge_results(self, sparse_results, dense_results):
        '''Merges the results of sparse and dense retrieval.'''
        results_index = {}
        # Collect per-chunk scores; a chunk may appear in either or both.
        for sparse_result in sparse_results:
            chunk_id = int(sparse_result['_id'])  # renamed from `id` (builtin)
            results_index[chunk_id] = {'elastic_score': sparse_result['_score']}
        for dense_result in dense_results:
            chunk_id = dense_result['chunk_id']
            score = dense_result['faiss_dist']
            if chunk_id in results_index:
                results_index[chunk_id]['faiss_dist'] = score
            else:
                results_index[chunk_id] = {'faiss_dist': score}
        results = []
        for chunk_id, scores in results_index.items():
            document_id = self.chunk_index[chunk_id]
            document = self.documents[document_id]
            chunk = self.chunks[chunk_id]
            doc_profile = document.to_dict()
            result = {
                'chunk_id': chunk_id,
                'chunk': chunk,
                'document_id': document_id,
                'document': doc_profile,
                'scores': scores
            }
            results.append(result)
        return results

    def search_dual_index(self, query: str):
        '''Search both the sparse and dense indices and merge the results.'''
        sparse_result = self.search_sparse_index(query)
        dense_result = self.search_dense_index(query)
        merged_results = self._merge_results(sparse_result, dense_result)
        return merged_results