from typing import Dict

from allennlp.commands.subcommand import Subcommand
from allennlp.service import server_sanic
from allennlp.service.predictors import DemoModel

# This maps from the name of the task
# to the ``DemoModel`` indicating the location of the trained model
# and the type of the ``Predictor``. This is necessary, as you might
# have multiple models (for example, a NER tagger and a POS tagger)
# that have the same ``Predictor`` wrapper. The corresponding model
# will be served at the `/predict/<name-of-task>` API endpoint.
DEFAULT_MODELS = {
    'machine-comprehension': DemoModel(
        'https://s3-us-west-2.amazonaws.com/allennlp/models/bidaf-model-2017.09.15-charpad.tar.gz',  # pylint: disable=line-too-long
        'machine-comprehension'),
    'semantic-role-labeling': DemoModel(
        'https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2017.09.05.tar.gz',  # pylint: disable=line-too-long
        'semantic-role-labeling'),
    'textual-entailment': DemoModel(
        'https://s3-us-west-2.amazonaws.com/allennlp/models/decomposable-attention-2017.09.04.tar.gz',  # pylint: disable=line-too-long
        'textual-entailment'),
    'coreference-resolution': DemoModel(
        'https://s3-us-west-2.amazonaws.com/allennlp/models/coref-model-2017.11.09.tar.gz',  # pylint: disable=line-too-long
        'coreference-resolution'),
    'named-entity-recognition': DemoModel(
import argparse

from allennlp.commands.subcommand import Subcommand
from allennlp.service import server_flask as server
from allennlp.service.predictors import DemoModel

# This maps from the name of the task
# to the ``DemoModel`` indicating the location of the trained model
# and the type of the ``Predictor``. This is necessary, as you might
# have multiple models (for example, a NER tagger and a POS tagger)
# that have the same ``Predictor`` wrapper. The corresponding model
# will be served at the `/predict/<name-of-task>` API endpoint.
DEFAULT_MODELS = {
    'machine-comprehension-hard': DemoModel(
        '/home/ziyaoh/allennlp/models/bidaf/model.tar.gz',  # pylint: disable=line-too-long
        'machine-comprehension'),
    'machine-comprehension-normal': DemoModel(
        '/home/ziyaoh/allennlp/models/bidaf_half/model.tar.gz',  # pylint: disable=line-too-long
        'machine-comprehension'),
    'machine-comprehension-easy': DemoModel(
        '/home/ziyaoh/allennlp/models/bidaf_tenth/model.tar.gz',  # pylint: disable=line-too-long
        'machine-comprehension'),
    'machine-comprehension-elmo-easy': DemoModel(
        '/home/ziyaoh/allennlp/models/bidaf_elmo_tenth/model.tar.gz',  # pylint: disable=line-too-long
        'machine-comprehension'),
    'machine-comprehension-elmo-normal': DemoModel(
def load_trained_model(self, path):
    self.logger.log("Loading model: " + path)
    self.bidaf_model = DemoModel(path, 'machine-comprehension')
    # predictor
    self.predictor = self.bidaf_model.predictor()
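# For reference: a minimal sketch of what ``DemoModel`` does when ``predictor()``
# is called, assuming the AllenNLP 0.x-era archive API (``load_archive`` /
# ``Predictor.from_archive``) that these snippets build on. ``build_predictor``
# is a hypothetical helper, not part of the original code.
from allennlp.models.archival import load_archive
from allennlp.service.predictors import Predictor


def build_predictor(archive_file, predictor_name):
    # Load the trained model archive and wrap it in the named Predictor,
    # e.g. build_predictor('model.tar.gz', 'machine-comprehension').
    archive = load_archive(archive_file)
    return Predictor.from_archive(archive, predictor_name)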
import argparse

from allennlp.service.predictors import DemoModel

print("Start.... test")

# Machine Comprehension (MC) models answer natural language questions by
# selecting an answer span within an evidence text.
# The AllenNLP MC model is a reimplementation of BiDAF (Seo et al, 2017), or
# Bi-Directional Attention Flow, a widely used MC baseline that achieves near
# state-of-the-art accuracies on the SQuAD dataset.
bidaf_model = DemoModel(
    '../../allennlp/train_out/bidaf-model-2017.09.15-charpad.tar.gz',
    'machine-comprehension')
# bidaf_model = DemoModel('../../allennlp/train_out/model01.tar.gz', 'machine-comprehension')
# bidaf_model = DemoModel('../../allennlp/train_out/model02.tar.gz', 'machine-comprehension')
# bidaf_model = DemoModel('../../allennlp/train_out/model04.tar.gz', 'machine-comprehension')

# predictor
predictor = bidaf_model.predictor()

# Example 1
data = {
    "passage":
    "A reusable launch system (RLS, or reusable launch vehicle, RLV) is a launch system which is capable of launching a payload into space more than once. This contrasts with expendable launch systems, where each launch vehicle is launched once and then discarded. No completely reusable orbital launch system has ever been created. Two partially reusable launch systems were developed, the Space Shuttle and Falcon 9. The Space Shuttle was partially reusable: the orbiter (which included the Space Shuttle main engines and the Orbital Maneuvering System engines), and the two solid rocket boosters were reused after several months of refitting work for each launch. The external tank was discarded after each flight.",
    "question": "How many partially reusable launch systems were developed?"
}
prediction = predictor.predict_json(data)
print(prediction)
print(prediction['best_span_str'])

# Example 2
data = {
    "passage":
# Imports inferred from usage in this excerpt.
import re
import string

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from allennlp.service.predictors import DemoModel

import common


class StudyBuddyAI:
    # Logger
    logger = common.Logger()

    bidaf_model = None
    predictor = None

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = None
    context_list = None
    all_tfidf_vectorizer = TfidfVectorizer()
    all_tfidf_matrix = None
    all_context_list = None

    # Trained Models
    trained_models = [{
        'name': 'Base Model (9/15/2017)',
        'path': '../../allennlp/train_out/bidaf-model-2017.09.15-charpad.tar.gz'
    }, {
        'name': 'ReTrained Model 1 (12/9/2017)',
        'path': '../../allennlp/train_out/model01.tar.gz'
    }, {
        'name': 'ReTrained Model 2 (12/10/2017)',
        'path': '../../allennlp/train_out/model02.tar.gz'
    }, {
        'name': 'ReTrained Model 3 (12/11/2017)',
        'path': '../../allennlp/train_out/model03.tar.gz'
    }, {
        'name': 'ReTrained Model 4 (12/12/2017)',
        'path': '../../allennlp/train_out/model04.tar.gz'
    }, {
        'name': 'ReTrained Model 5 (12/13/2017)',
        'path': '../../allennlp/train_out/model05.tar.gz'
    }]

    # Context Memory Settings
    context_memory_time = 1  # in minutes
    context_memory_size = 5
    context_memory = []
    context_qa = []

    # Class StudyBuddyAI
    def __init__(self):
        self.logger.log("StudyBuddyAI ------------------------------------ Init")
        # Load pretrained model
        self.load_trained_model('../../allennlp/train_out/model05.tar.gz')

    def get_trained_model_list(self):
        return self.trained_models

    def load_trained_model(self, path):
        self.logger.log("Loading model: " + path)
        self.bidaf_model = DemoModel(path, 'machine-comprehension')
        # predictor
        self.predictor = self.bidaf_model.predictor()

    def save_in_context_memory(self, context):
        # Save the context
        self.context_memory.insert(0, context)
        if len(self.context_memory) > self.context_memory_size:
            # ensure our context list is limited
            self.context_memory = self.context_memory[:self.context_memory_size]

    def save_qa_in_context_memory(self, qa):
        # Save the question/answer pair
        self.context_qa.insert(0, qa)

    def clear_context_memory(self):
        self.context_memory = []
        self.context_qa = []

    def get_context_memory(self):
        return {
            'context_memory': self.context_memory,
            'context_qa': self.context_qa
        }

    def load_tfidf_vectorizer(self, context_list, all=False):
        corpus = list()
        if all:
            self.all_context_list = context_list
        else:
            self.context_list = context_list

        for context in context_list:
            # Tokenize
            tokens = self.tokenize_text(context)
            cleaned_context_text = ' '.join(tokens)
            corpus.append(cleaned_context_text)

        # Tf–idf term weighting using TfidfVectorizer
        if all:
            self.all_tfidf_matrix = self.all_tfidf_vectorizer.fit_transform(corpus)
        else:
            self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(corpus)

    def predict_from_passage(self, data):
        prediction = self.predictor.predict_json(data)
        self.logger.log(prediction)
        return prediction

    def predict_for_title(self, question, all=False, check_context=False):
        passage = ''
        current_context_list = []
        current_context_start_index = []
        current_context_end_index = []

        # if we need to look at the context only
        if check_context and (len(self.context_memory) > 0):
            # the top context item
            current_context_list = self.context_memory[:1]
        else:
            # Tokenize
            tokens = self.tokenize_text(question)
            cleaned_context_text = ' '.join(tokens)

            if not all:
                question_vector = self.tfidf_vectorizer.transform([cleaned_context_text])
            else:
                question_vector = self.all_tfidf_vectorizer.transform([cleaned_context_text])

            # Find Cosine Similarity of question with the contexts
            if not all:
                cs = cosine_similarity(question_vector, self.tfidf_matrix)
            else:
                cs = cosine_similarity(question_vector, self.all_tfidf_matrix)
            self.logger.log(cs)

            cs_list = cs[0]
            idx = 0
            threshold = 0.25
            values_greater_than_zero = [i for i in cs_list if i > 0.0]
            if len(values_greater_than_zero) == 0:
                return {'status': 0}

            #     for ctx in self.context_memory:
            #         current_context_start_index.append(len(passage))
            #         passage = passage + ctx + ' '
            #         current_context_list.append(ctx)
            #         current_context_end_index.append(len(passage))
            # else:

            min_value = min(values_greater_than_zero)
            max_value = max(cs_list)
            value_range = max_value - min_value  # renamed to avoid shadowing the builtin ``range``
            threshold = max_value - value_range / 3

            for cs_val in cs_list:
                if cs_val >= threshold:
                    if not all:
                        current_context_list.append(self.context_list[idx])
                    else:
                        current_context_list.append(self.all_context_list[idx])
                idx = idx + 1

        # build passage
        for txt in current_context_list:
            current_context_start_index.append(len(passage))
            passage = passage + txt + ' '
            current_context_end_index.append(len(passage))

        data = {}
        data['question'] = question
        data['passage'] = passage

        # Build the return object
        result = {}
        result['status'] = 1
        result['prediction'] = self.predict_from_passage(data)
        result['current_context_list'] = current_context_list

        # print(current_context_start_index)
        # print(current_context_end_index)
        # print(current_context_list)
        # print(passage)

        # Save the context from which the answer was predicted
        # best_span = result['prediction']['best_span']
        # for idx, ctx in enumerate(current_context_end_index):
        #     if (best_span[0] >= current_context_start_index[idx]) and (best_span[1] <= current_context_end_index[idx]):
        #         self.save_in_context_memory(current_context_list[idx], {'question': question, 'answer': result['prediction']['best_span_str']})
        #         result['current_context'] = current_context_list[idx]
        #         continue;
        best_span_str = result['prediction']['best_span_str']
        for ctx in current_context_list:
            if best_span_str in ctx:
                self.save_in_context_memory(ctx)
                self.save_qa_in_context_memory({
                    'question': question,
                    'answer': best_span_str
                })
                result['current_context'] = ctx
                break  # stop at the first context containing the answer span

        # return the current context memory
        result['context_memory'] = self.context_memory
        result['context_qa'] = self.context_qa

        return result

    # Helper Methods

    # Tokenize text using NLTK
    def tokenize_text(self, text, remove_stop_words=True, stem_words=True, filter_short_token=1):
        # split into words
        words = nltk.word_tokenize(text)
        # convert to lower case
        words = [w.lower() for w in words]
        # prepare regex for char filtering
        re_punc = re.compile('[%s]' % re.escape(string.punctuation))
        # remove punctuation from each word
        tokens = [re_punc.sub('', w) for w in words]
        # remove tokens that are not purely alphabetic
        tokens = [word for word in tokens if word.isalpha()]
        # filter out stop words
        if remove_stop_words:
            stop_words = set(nltk.corpus.stopwords.words('english'))
            tokens = [w for w in tokens if w not in stop_words]
        # perform stemming of words
        if stem_words:
            porter = nltk.stem.porter.PorterStemmer()
            tokens = [porter.stem(word) for word in tokens]
        # filter out short tokens
        if filter_short_token > 0:
            tokens = [word for word in tokens if len(word) > filter_short_token]
        return tokens
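# Minimal usage sketch for the class above (not part of the original file):
# it assumes the model paths referenced in ``__init__`` / ``trained_models``
# exist on disk and that the NLTK 'punkt' and 'stopwords' data are installed
# for ``tokenize_text``. The example contexts are illustrative only.
if __name__ == '__main__':
    buddy = StudyBuddyAI()

    # Index a few contexts so predict_for_title can retrieve the best passage
    # via TF-IDF cosine similarity before running the BiDAF predictor.
    contexts = [
        "The Space Shuttle was a partially reusable launch system operated by NASA.",
        "Falcon 9 is a partially reusable two-stage-to-orbit launch vehicle.",
    ]
    buddy.load_tfidf_vectorizer(contexts)

    result = buddy.predict_for_title("Which launch vehicle does NASA operate?")
    if result['status'] == 1:
        print(result['prediction']['best_span_str'])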