def preload_weights():
    models = {
        "Camembert_Q_A": "illuin/camembert-large-fquad",
        "Camembert": "camembert/camembert-large",
        "Bert": "bert-large-uncased",
        "Bert_Q_A": "bert-large-uncased-whole-word-masking-finetuned-squad"
    }
    for folder in models.keys():
        p = f'{WEIGHTS_PATH}/{folder}'
        if not os.path.exists(p):
            os.makedirs(p)

    if not os.path.exists(f'{WEIGHTS_PATH}/Camembert_Q_A/pytorch_model.bin'):
        QA_MODEL_NAME_FR = "illuin/camembert-large-fquad"
        QA_TOK_FR = AutoTokenizer.from_pretrained(QA_MODEL_NAME_FR)
        QA_MODEL_FR = CamembertForQuestionAnswering.from_pretrained(
            QA_MODEL_NAME_FR)
        QA_FR = QuestionAnsweringPipeline(model=QA_MODEL_FR, tokenizer=QA_TOK_FR)
        QA_FR.save_pretrained(f'{WEIGHTS_PATH}/Camembert_Q_A')
        del QA_FR
        del QA_TOK_FR
        del QA_MODEL_FR

    if not os.path.exists(f'{WEIGHTS_PATH}/Camembert/pytorch_model.bin'):
        EMB_MODEL_NAME_FR = "camembert/camembert-large"
        EMB_TOK_FR = AutoTokenizer.from_pretrained(EMB_MODEL_NAME_FR)
        EMB_FR = AutoModel.from_pretrained(EMB_MODEL_NAME_FR)
        EMB_FR.save_pretrained(f'{WEIGHTS_PATH}/Camembert')
        EMB_TOK_FR.save_pretrained(f'{WEIGHTS_PATH}/Camembert')
        del EMB_TOK_FR
        del EMB_FR

    if not os.path.exists(f'{WEIGHTS_PATH}/Bert_Q_A/pytorch_model.bin'):
        QA_MODEL_NAME_EN = "bert-large-uncased-whole-word-masking-finetuned-squad"
        QA_TOK_EN = AutoTokenizer.from_pretrained(QA_MODEL_NAME_EN)
        QA_MODEL_EN = AutoModelForQuestionAnswering.from_pretrained(
            QA_MODEL_NAME_EN)
        QA_EN = QuestionAnsweringPipeline(model=QA_MODEL_EN, tokenizer=QA_TOK_EN)
        QA_EN.save_pretrained(f'{WEIGHTS_PATH}/Bert_Q_A')
        del QA_EN
        del QA_MODEL_EN
        del QA_TOK_EN

    if not os.path.exists(f'{WEIGHTS_PATH}/Bert/pytorch_model.bin'):
        EMB_MODEL_NAME_EN = "bert-large-uncased"
        EMB_TOK_EN = AutoTokenizer.from_pretrained(EMB_MODEL_NAME_EN)
        EMB_EN = AutoModel.from_pretrained(EMB_MODEL_NAME_EN)
        EMB_EN.save_pretrained(f'{WEIGHTS_PATH}/Bert')
        EMB_TOK_EN.save_pretrained(f'{WEIGHTS_PATH}/Bert')
        del EMB_TOK_EN
        del EMB_EN
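# Hedged usage sketch (not from the original source): after preload_weights()
# has populated WEIGHTS_PATH, the cached French QA weights can be reloaded from
# the local folder instead of the Hub. The helper name is hypothetical, it
# reuses the same imports/globals as the function above, and it assumes that
# pipeline.save_pretrained() wrote both the model and tokenizer files into
# WEIGHTS_PATH/Camembert_Q_A.
def load_cached_fr_qa():
    tok = AutoTokenizer.from_pretrained(f'{WEIGHTS_PATH}/Camembert_Q_A')
    mdl = CamembertForQuestionAnswering.from_pretrained(f'{WEIGHTS_PATH}/Camembert_Q_A')
    return QuestionAnsweringPipeline(model=mdl, tokenizer=tok)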
def get_test_pipeline(self, model, tokenizer, feature_extractor):
    if isinstance(model.config, LxmertConfig):
        # This is a bimodal model, we need to find a more consistent way
        # to switch on those models.
        return None, None
    question_answerer = QuestionAnsweringPipeline(model, tokenizer)
    examples = [
        {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."},
        {"question": "In what field is HuggingFace ?", "context": "HuggingFace is an AI startup."},
    ]
    return question_answerer, examples
def load_models():
    print(f"Loading models ")
    print(":floppy_disk: [yellow]Loading FR model [/yellow]", end="")
    print(f"on [blue]{DEVICE}[/blue]...")
    QA_TOK_FR = AutoTokenizer.from_pretrained(QA_MODEL_NAME_FR)
    QA_MODEL_FR = CamembertForQuestionAnswering.from_pretrained(
        QA_MODEL_NAME_FR)
    _LOADED_MODELS['FR'] = {
        'QNA': QuestionAnsweringPipeline(model=QA_MODEL_FR,
                                         tokenizer=QA_TOK_FR,
                                         device=DEVICE_PIPELINE),
        'TOK': AutoTokenizer.from_pretrained(EMB_MODEL_NAME_FR),
        'EMB': AutoModel.from_pretrained(EMB_MODEL_NAME_FR).to(DEVICE)
    }
    del QA_TOK_FR
    del QA_MODEL_FR
    print(":floppy_disk: [green]Loaded FR models [/green]")

    print(":floppy_disk: [yellow]Loading EN model [/yellow]", end="")
    print(f"on [blue]{DEVICE}[/blue]...")
    QA_TOK_EN = AutoTokenizer.from_pretrained(QA_MODEL_NAME_EN)
    QA_MODEL_EN = AutoModelForQuestionAnswering.from_pretrained(
        QA_MODEL_NAME_EN)
    _LOADED_MODELS['EN'] = {
        'QNA': QuestionAnsweringPipeline(model=QA_MODEL_EN,
                                         tokenizer=QA_TOK_EN,
                                         device=DEVICE_PIPELINE),
        'TOK': AutoTokenizer.from_pretrained(EMB_MODEL_NAME_EN),
        'EMB': AutoModel.from_pretrained(EMB_MODEL_NAME_EN).to(DEVICE)
    }
    del QA_TOK_EN
    del QA_MODEL_EN
    print(":floppy_disk: [green]Loaded EN models[/green]")
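# Hedged usage sketch (not in the original): once load_models() has run, the
# per-language QA pipeline can be fetched from _LOADED_MODELS and queried. The
# helper name is hypothetical; the 'FR'/'EN' keys and the 'QNA' entry mirror
# the dict built above.
def answer(question, context, lang='EN'):
    qa = _LOADED_MODELS[lang]['QNA']
    return qa(question=question, context=context)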
def __init__(self, model_type="DISTILBERT",
             model_name="distilbert-base-cased-distilled-squad"):
    self.adaptor = get_adaptor(model_type)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    super().__init__(model_type, model_name, model)

    device_number = detect_cuda_device_number()
    self._pipeline = QuestionAnsweringPipeline(model=self.model,
                                               tokenizer=self.tokenizer,
                                               device=device_number)
    self._trainer = QATrainer(self.model, model_type, self.tokenizer,
                              self._device, self.logger)
TOK_QA_EN = BertTokenizer.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad")
QA_EN = BertForQuestionAnswering.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad")

EMB_EN = EMB_EN.to(DEVICE)
QA_EN = QA_EN.to(DEVICE)

# TOK_QA_FR = CamembertTokenizer.from_pretrained("illuin/camembert-large-fquad")
# QA_FR = CamembertForQuestionAnswering.from_pretrained(
#     "illuin/camembert-large-fquad")
# PIP_Q_A_FR = pipeline("question-answering", model=QA_FR, tokenizer=TOK_QA_FR)

PIP_Q_A_EN = QuestionAnsweringPipeline(model=QA_EN, tokenizer=TOK_QA_EN,
                                       device=dev_pipeline)

# TOK_EMB_FR = TOK_EMB_FR.to(DEVICE)
# EMB_FR = EMB_FR.to(DEVICE)
# PIP_Q_A_FR = PIP_Q_A_FR.to(DEVICE)


@app.post("/embeddings")
async def get_embedding(request):
    lang = request.json.get('lang')
    text = request.json.get('text')
    if lang == "fr":
        embedder = EMB_FR
        tokenizer = TOK_EMB_FR
bertBaseUncased = "/home/sabur/Downloads/TweetQAexperiments/bert-base-uncased-vocab.txt"
bertLargeCased = "/home/sabur/Downloads/TweetQAexperiments/bert-large-cased-vocab.txt"
bertLargeUncased = "/home/sabur/Downloads/TweetQAexperiments/bert-large-uncased-vocab.txt"

# GPT-2 vocabularies
gpt2Vocab = "gpt2-vocab.json"
gpt2LargeVocab = "gpt2-large-vocab.json"

# Instantiate a BERT WordPiece tokenizer
WordPiece = BertWordPieceTokenizer(bertLargeUncased)
WordPieceEncoder = WordPiece.encode(sentence)

# Print the ids, tokens and offsets
print(WordPieceEncoder.ids)
print(WordPieceEncoder.tokens)
print(WordPieceEncoder.offsets)

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
# QuestionAnsweringPipeline has no from_pretrained(); a TF question-answering
# head model is the likely intent, since start/end logits are unpacked below.
model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
input_ids = tf.constant(
    tokenizer.encode("Hello, my dog is cute",
                     add_special_tokens=True))[None, :]  # Batch size 1
outputs = model(input_ids)
start_scores, end_scores = outputs[:2]

trans = BertTokenizer(bertLargeUncased, do_lower_case=True,
                      do_basic_tokenize=True, never_split=None,
                      unk_token='[UNK]', sep_token='[SEP]',
                      pad_token='[PAD]', cls_token='[CLS]',
                      mask_token='[MASK]',
def __init__(self, tokenizer, model):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer)
    model = AutoModelForQuestionAnswering.from_pretrained(model)
    self.nlp = QuestionAnsweringPipeline(model=model, tokenizer=tokenizer)
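# Hedged usage sketch (not from the original source). The wrapper's class name
# is not shown above, so this builds the same pipeline directly; the checkpoint
# name is borrowed from another snippet in this collection.
from transformers import (AutoModelForQuestionAnswering, AutoTokenizer,
                          QuestionAnsweringPipeline)

tok = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
qa_model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")
nlp = QuestionAnsweringPipeline(model=qa_model, tokenizer=tok)
print(nlp(question="Where was HuggingFace founded ?",
          context="HuggingFace was founded in Paris."))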
def run_pipeline_test(self, model, tokenizer, feature_extractor):
    if isinstance(model.config, LxmertConfig):
        # This is a bimodal model, we need to find a more consistent way
        # to switch on those models.
        return
    question_answerer = QuestionAnsweringPipeline(model, tokenizer)

    outputs = question_answerer(
        question="Where was HuggingFace founded ?",
        context="HuggingFace was founded in Paris.")
    self.assertEqual(
        outputs,
        {"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)})

    outputs = question_answerer(
        question=[
            "In what field is HuggingFace working ?",
            "In what field is HuggingFace working ?"
        ],
        context="HuggingFace was founded in Paris.",
    )
    self.assertEqual(
        outputs,
        [
            {"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)},
            {"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)},
        ],
    )

    outputs = question_answerer(
        question=[
            "What field is HuggingFace working ?",
            "In what field is HuggingFace ?"
        ],
        context=[
            "HuggingFace is a startup based in New-York",
            "HuggingFace is a startup founded in Paris",
        ],
    )
    self.assertEqual(
        outputs,
        [
            {"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)},
            {"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)},
        ],
    )

    with self.assertRaises(ValueError):
        question_answerer(question="", context="HuggingFace was founded in Paris.")
    with self.assertRaises(ValueError):
        question_answerer(question=None, context="HuggingFace was founded in Paris.")
    with self.assertRaises(ValueError):
        question_answerer(question="In what field is HuggingFace working ?", context="")
    with self.assertRaises(ValueError):
        question_answerer(question="In what field is HuggingFace working ?", context=None)

    outputs = question_answerer(
        question="Where was HuggingFace founded ?",
        context="HuggingFace was founded in Paris.",
        topk=20)
    self.assertEqual(
        outputs,
        [{"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)}
         for i in range(20)])

    # Very long contexts require multiple features
    outputs = question_answerer(
        question="Where was HuggingFace founded ?",
        context="HuggingFace was founded in Paris." * 20)
    self.assertEqual(
        outputs,
        {"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)})
    'dbert-s2': 'twmkn9/distilbert-base-uncased-squad2',
    'sbert-s2': 'mrm8488/bert-small-finetuned-squadv2',
    'dbert-s1': 'distilbert-base-uncased-distilled-squad',
    '_bert-s2': 'twmkn9/bert-base-uncased-squad2',
}

models = {
    k: {'model': AutoModelForQuestionAnswering.from_pretrained(v),
        'tokenizer': AutoTokenizer.from_pretrained(v)}
    for k, v in model_names.items()
}

for k, m in models.items():
    m['model'].eval()

pipelines = {
    k: QuestionAnsweringPipeline(**v, device=-1)
    for k, v in models.items()
}


def query_all(question, context):
    """Get answer to question given context for all pipelines."""
    if isinstance(context, dict):
        context = context['text']
    ansiprint(h1('question:') + ' ' + h2(question))
    ansiprint(h1('context:'))
    ctx_ = textwrap.fill(context, 60)
    ctx_ = textwrap.indent(ctx_, ' -- ')
    print(ctx_)
    for name, pipeline in pipelines.items():
        ansiprint(h3(name))
        answer = pipeline({'question': question, 'context': context},
def __init__(self,
             sents: Union[str, SentenceStore],
             index: Union[str, faiss.Index],
             encoder: Optional[Union[str, SentenceEncoder]] = None,
             cord19: Optional[CORD19] = None,
             model: Optional[Union[BertForQuestionAnswering, PreTrainedModel]] = None,
             tokenizer: Optional[Union[BertTokenizerFast, PreTrainedTokenizerBase]] = None,
             model_name_or_path: Optional[str] = None,
             max_seq_length: int = 256,
             do_lower_case: Optional[bool] = None,
             nlp_model: str = 'en_core_sci_sm',
             model_device: Optional[str] = None,
             encoder_device: Optional[str] = None,
             **compressor_kwargs) -> None:
    """
    :param summarizer_hidden: Determines the hidden layer to use for
        embeddings (needs to be negative).
    :param summarizer_reduce: Determines the reduction statistic of the
        encoding layer `(mean, median, max)`, i.e., how the results are
        reduced.
    :param summarizer_kwargs: Kwargs to pass to the summarizer along with
        the input texts, or a `coronanlp.summarization.BertSummarizerArguments`
        instance. These arguments can be overridden at any time by updating
        the properties in place, e.g., `self.summarizer_kwargs.ratio = 0.5`.
        Note that the `body` argument can be disregarded or left as None
        since it is always overridden.
    """
    self.max_seq_length = max_seq_length
    self.sents = SentenceStore.from_disk(sents) \
        if isinstance(sents, str) else sents
    assert isinstance(self.sents, SentenceStore)
    self.index = faiss.read_index(index) \
        if isinstance(index, str) else index
    assert isinstance(self.index, faiss.Index)

    sentencizer = None
    if cord19 is None and hasattr(self.sents, 'init_args'):
        cord19 = self.sents.init_cord19_dataset()
        if not cord19.sentencizer_enabled:
            cord19.init_sentencizer()
        sentencizer = cord19.sentencizer
        self.cord19 = cord19
    elif isinstance(cord19, CORD19):
        if not cord19.sentencizer_enabled:
            cord19.init_sentencizer()
        sentencizer = cord19.sentencizer
        self.cord19 = cord19
    else:
        sentencizer = SpacySentenceTokenizer(nlp_model)
    if sentencizer is not None:
        self._sentencizer = sentencizer
        self.nlp = sentencizer.nlp

    if model_name_or_path is None:
        model_name_or_path = self.default_model_name
    if do_lower_case is None:
        do_lower_case = self.do_lower_case

    if model is None or isinstance(model, str):
        self.model = BertForQuestionAnswering \
            .from_pretrained(model_name_or_path)
    elif isinstance(model, (PreTrainedModel, BertForQuestionAnswering)) \
            and self.architecture in model.config.architectures:
        self.model = model
    else:
        raise InvalidModelNameOrPathError
    if model_device is not None:
        device = torch.device(model_device)
        self.model = self.model.to(device)

    if tokenizer is None or isinstance(tokenizer, str):
        self.tokenizer = BertTokenizerFast \
            .from_pretrained(model_name_or_path, do_lower_case=do_lower_case)
    elif isinstance(tokenizer, (BertTokenizerFast, PreTrainedTokenizerBase)):
        self.tokenizer = tokenizer
    else:
        raise InvalidModelNameOrPathError

    if encoder is None:
        base: PreTrainedModel = None
        if hasattr(self.model, 'bert'):
            base = self.model.bert
        elif hasattr(self.model, 'base_model'):
            base = self.model.base_model
        base_device = base.device.type
        self.encoder = SentenceEncoder(transformer=base,
                                       tokenizer=self.tokenizer,
                                       device=base_device)
    elif isinstance(encoder, str):
        if encoder_device is None:
            encoder_device = 'cpu'
        self.encoder = SentenceEncoder \
            .from_pretrained(encoder, device=encoder_device)
    elif isinstance(encoder, SentenceEncoder):
        self.encoder = encoder
    else:
        raise InvalidModelNameOrPathError

    self.compressor = Compressor(model=self.model.base_model,
                                 **compressor_kwargs)

    # The HF QA pipeline takes a device index; -1 is the default for CPU.
    device_index = -1
    if self.model.device.index is not None \
            and self.model.device.type == 'cuda':
        # Model is on CUDA, so use its device index.
        device_index = self.model.device.index
    self.pipeline = QuestionAnsweringPipeline(model=self.model,
                                              tokenizer=self.tokenizer,
                                              device=device_index)
    self._freq_summarizer = frequency_summarizer
    self.device = self.model.device
def __init__(self, model):
    self.tokenizer = AutoTokenizer.from_pretrained(model)
    self.model = AutoModelForQuestionAnswering.from_pretrained(model)
    self.bert = QuestionAnsweringPipeline(model=self.model,
                                          tokenizer=self.tokenizer)
from transformers import QuestionAnsweringPipeline, BertForQuestionAnswering, BertTokenizerFast
from flask import Flask, jsonify, request
from flask_cors import cross_origin, CORS

# model = pickle.load(open('modelo-qa', 'rb'))
name_model = "francoMG/sara-qa"
tokenizer_model = "dccuchile/bert-base-spanish-wwm-uncased"
# name_model = 'ktrapeznikov/scibert_scivocab_uncased_squad_v2'
# name_model = 'mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es'
# tokenizer_model = "mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es"
# name_model = 'amoux/scibert_nli_squad'
model = BertForQuestionAnswering.from_pretrained(name_model)
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_model, do_lower_case=False)
nlp = QuestionAnsweringPipeline(model, tokenizer, framework="pt")

app = Flask(__name__)
CORS(app)


@app.route('/')
@cross_origin(origin='*')
def ServerStatus():
    return "Server Started"


@app.route('/preguntar', methods=['POST'])
@cross_origin(origin='*')
def preguntar():
    _pregunta = request.json['pregunta']
    score = -1
    resp = ""
    for dat in request.json['contexto']:
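# Hedged client-side sketch (separate from, and not part of, the Flask app
# above, whose handler is truncated). It assumes the request body shape implied
# by the handler: a 'pregunta' string and a 'contexto' list that the endpoint
# iterates over; the exact shape of each context entry and the response format
# are not shown, so plain strings and a generic JSON response are assumed,
# along with Flask's default port 5000.
import requests

payload = {
    "pregunta": "¿Quién fundó la empresa?",
    "contexto": ["La empresa fue fundada por Ana en 2010.",
                 "La sede está en Santiago."],
}
r = requests.post("http://localhost:5000/preguntar", json=payload)
print(r.json())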