def process_inference_input(input_file):
    """Build token-classification examples from a plain-text file.

    Every line of ``input_file`` is stripped of surrounding whitespace and
    tokenized with a ``SpacyInstance`` that has the tagger, parser and NER
    components disabled.

    Args:
        input_file: path of the text file to read, one input text per line.

    Returns:
        A list of ``TokenClsInputExample`` objects, one per line, whose first
        argument is the zero-based line index rendered as a string.
    """
    with io.open(input_file) as handle:
        lines = [raw_line.strip() for raw_line in handle.readlines()]

    spacy_tokenizer = SpacyInstance(disable=["tagger", "parser", "ner"])
    return [
        TokenClsInputExample(str(index), line, spacy_tokenizer.tokenize(line))
        for index, line in enumerate(lines)
    ]
def _parse_json(self, data):
    """Convert parsed JSON sentences into parallel token and tag lists.

    Args:
        data: iterable of sentences; each sentence is an iterable of mappings
            that hold a ``'text'`` entry and optionally an ``'entity'`` entry.

    Returns:
        A list of ``(tokens, tags)`` tuples, one per sentence. Text parts
        without an entity contribute one ``'O'`` tag per token; parts with an
        entity get their tags from ``self._create_tags``.
    """
    tokenizer = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])
    parsed = []
    for sentence in data:
        sentence_tokens, sentence_tags = [], []
        for part in sentence:
            part_tokens = tokenizer.tokenize(part['text'].strip())
            sentence_tokens.extend(part_tokens)
            entity = part.get('entity', None)
            if entity is None:
                sentence_tags.extend(['O'] * len(part_tokens))
            else:
                sentence_tags.extend(self._create_tags(entity, len(part_tokens)))
        parsed.append((sentence_tokens, sentence_tags))
    return parsed
def _parse_json(self, data):
    """Tokenize JSON-style sentence records and attach a tag to every token.

    Args:
        data: iterable of sentences, each an iterable of mappings with a
            ``"text"`` entry and an optional ``"entity"`` entry.

    Returns:
        A list with one ``(tokens, tags)`` tuple per sentence. Tokens of a
        part that carries an entity are tagged via ``self._create_tags``;
        all other tokens are tagged ``"O"``.
    """
    spacy_tok = SpacyInstance(
        disable=["tagger", "ner", "parser", "vectors", "textcat"])

    def tags_for(record, count):
        # Entity-free parts are "outside" tokens.
        label = record.get("entity", None)
        if label is not None:
            return self._create_tags(label, count)
        return ["O"] * count

    result = []
    for records in data:
        words = []
        labels = []
        for record in records:
            pieces = spacy_tok.tokenize(record["text"].strip())
            words.extend(pieces)
            labels.extend(tags_for(record, len(pieces)))
        result.append((words, labels))
    return result
class IntentExtractionApi(AbstractApi):
    """Serving API for the pre-trained intent extraction model.

    On construction the model files are downloaded into ``model_dir`` when
    either of them is missing; ``load_model`` must then be called before
    ``inference``.
    """

    model_dir = str(LIBRARY_OUT / "intent-pretrained")
    pretrained_model_info = path.join(model_dir, "model_info.dat")
    pretrained_model = path.join(model_dir, "model.h5")

    def __init__(self, prompt=False):
        """
        Args:
            prompt: when True, ask the user for confirmation on stdin before
                downloading missing model files.
        """
        self.model = None
        self.model_type = None
        self.word_vocab = None
        self.tags_vocab = None
        self.char_vocab = None
        self.intent_vocab = None
        self._download_pretrained_model(prompt)
        self.nlp = SpacyInstance(
            disable=["tagger", "ner", "parser", "vectors", "textcat"])

    def process_text(self, text):
        """Collapse all whitespace runs in ``text`` to single spaces and tokenize it."""
        input_text = " ".join(text.strip().split())
        return self.nlp.tokenize(input_text)

    @staticmethod
    def _prompt():
        """Ask on stdin whether the model may be downloaded.

        Returns:
            True when the (case-insensitive, stripped) answer is ``yes`` or
            ``y``; False otherwise.
        """
        model_name = "intent_extraction"
        response = input(
            "\nTo download '{}', please enter YES: ".format(model_name))
        res = response.lower().strip()
        if res in ("yes", "y"):
            # The original message announced "ner" here (copy-paste slip).
            print("Downloading {}...".format(model_name))
            return True
        print("Download declined. Response received {} != YES|Y. ".format(res))
        return False

    @staticmethod
    def _download_pretrained_model(prompt=True):
        """Download the pre-trained intent extraction model files if missing.

        Args:
            prompt: when True, ask for confirmation first; a declined prompt
                terminates the process via ``sys.exit(0)``.
        """
        model_info_exists = path.isfile(
            IntentExtractionApi.pretrained_model_info)
        model_exists = path.isfile(IntentExtractionApi.pretrained_model)
        if not model_exists or not model_info_exists:
            print(
                "The pre-trained models to be downloaded for the intent extraction dataset "
                "are licensed under Apache 2.0. By downloading, you accept the terms "
                "and conditions provided by the license")
            makedirs(IntentExtractionApi.model_dir, exist_ok=True)
            if prompt is True:
                agreed = IntentExtractionApi._prompt()
                if agreed is False:
                    sys.exit(0)
            download_unlicensed_file(
                "https://s3-us-west-2.amazonaws.com/nlp-architect-data"
                "/models/intent/",
                "model_info.dat",
                IntentExtractionApi.pretrained_model_info,
            )
            download_unlicensed_file(
                "https://s3-us-west-2.amazonaws.com/nlp-architect-data"
                "/models/intent/",
                "model.h5",
                IntentExtractionApi.pretrained_model,
            )
            print("Done.")

    @staticmethod
    def display_results(text_str, predictions, intent_type):
        """Package tokens, predicted tags and intent into the response dict.

        Args:
            text_str: sequence of token strings.
            predictions: per-token tags passed to ``bio_to_spans``.
            intent_type: detected intent, stored as the document title
                (may be None).

        Returns:
            ``{"doc": {...}, "type": "high_level"}`` where the inner dict
            holds ``doc_text``, ``annotation_set``, ``spans`` and ``title``.
        """
        ret = {
            "annotation_set": [],
            "doc_text": " ".join(text_str),
        }
        spans = []
        available_tags = set()
        for s, e, tag in bio_to_spans(text_str, predictions):
            spans.append({"start": s, "end": e, "type": tag})
            available_tags.add(tag)
        ret["annotation_set"] = list(available_tags)
        ret["spans"] = spans
        ret["title"] = intent_type
        return {"doc": ret, "type": "high_level"}

    def vectorize(self, doc, vocab, char_vocab=None):
        """Map tokens (and optionally their characters) to vocabulary ids.

        Words and characters that are not in the vocabulary map to id 1.

        Args:
            doc: sequence of token strings.
            vocab: word vocabulary, looked up with the lower-cased token.
            char_vocab: optional character vocabulary.

        Returns:
            The word-id array of shape ``(1, len(doc))`` when ``char_vocab``
            is None, otherwise ``[words, sentence_chars]`` where
            ``sentence_chars`` is the padded character-id input with a
            leading batch axis.
        """
        words = np.asarray([
            vocab[w.lower()] if w.lower() in vocab else 1 for w in doc
        ]).reshape(1, -1)
        if char_vocab is not None:
            sentence_chars = [
                [char_vocab[c] if c in char_vocab else 1 for c in w]
                for w in doc
            ]
            sentence_chars = np.expand_dims(pad_sentences(
                sentence_chars, self.model.word_length), axis=0)
            return [words, sentence_chars]
        return words

    def inference(self, doc):
        """Run the loaded model on raw text and return the display dict.

        Prints the detected intent (multi-task model only) and one
        ``token<TAB>tag`` line per token.

        Args:
            doc: raw input text.

        Returns:
            The dict produced by ``display_results``.
        """
        text_arr = self.process_text(doc)
        intent_type = None
        if self.model_type == "mtl":
            doc_vec = self.vectorize(text_arr, self.word_vocab,
                                     self.char_vocab)
            intent, tags = self.model.predict(doc_vec, batch_size=1)
            intent = int(intent.argmax(1).flatten())
            intent_type = self.intent_vocab.get(intent, None)
            print("Detected intent type: {}".format(intent_type))
        else:
            doc_vec = self.vectorize(text_arr, self.word_vocab, None)
            tags = self.model.predict(doc_vec, batch_size=1)
        tags = tags.argmax(2).flatten()
        tag_str = [self.tags_vocab.get(n, None) for n in tags]
        for t, n in zip(text_arr, tag_str):
            print("{}\t{}\t".format(t, n))
        return self.display_results(text_arr, tag_str, intent_type)

    def load_model(self):
        """Load vocabularies and model weights from the downloaded files.

        ``tags_vocab`` and ``intent_vocab`` are stored inverted (id -> label)
        so predictions can be decoded.
        """
        # NOTE(security): pickle.load executes arbitrary code from the file.
        # The file comes from the download URL above; only use trusted copies.
        with open(IntentExtractionApi.pretrained_model_info, "rb") as fp:
            model_info = pickle.load(fp)
        self.model_type = model_info["type"]
        self.word_vocab = model_info["word_vocab"]
        self.tags_vocab = {v: k for k, v in model_info["tags_vocab"].items()}
        if self.model_type == "mtl":
            self.char_vocab = model_info["char_vocab"]
            self.intent_vocab = {
                v: k for k, v in model_info["intent_vocab"].items()
            }
            model = MultiTaskIntentModel()
        else:
            model = Seq2SeqIntentModel()
        model.load(self.pretrained_model)
        self.model = model
class IntentExtractionApi(AbstractApi):
    """Serving API for the pre-trained intent extraction model.

    On construction the model files are downloaded into ``model_dir`` when
    either of them is missing; ``load_model`` must then be called before
    ``inference``.
    """

    model_dir = path.join(LIBRARY_STORAGE_PATH, 'intent-pretrained')
    pretrained_model_info = path.join(model_dir, 'model_info.dat')
    pretrained_model = path.join(model_dir, 'model.h5')

    def __init__(self, prompt=True):
        """
        Args:
            prompt: when True, ask the user for confirmation on stdin before
                downloading missing model files.
        """
        self.model = None
        self.model_type = None
        self.word_vocab = None
        self.tags_vocab = None
        self.char_vocab = None
        self.intent_vocab = None
        self._download_pretrained_model(prompt)
        self.nlp = SpacyInstance(
            disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])

    def process_text(self, text):
        """Collapse all whitespace runs in ``text`` to single spaces and tokenize it."""
        input_text = ' '.join(text.strip().split())
        return self.nlp.tokenize(input_text)

    @staticmethod
    def _prompt():
        """Ask on stdin whether the model may be downloaded.

        Returns:
            True when the (case-insensitive, stripped) answer is ``yes`` or
            ``y``; False otherwise.
        """
        model_name = 'intent_extraction'
        response = input(
            '\nTo download \'{}\', please enter YES: '.format(model_name))
        res = response.lower().strip()
        if res in ('yes', 'y'):
            # The original message announced 'ner' here (copy-paste slip).
            print('Downloading {}...'.format(model_name))
            return True
        print('Download declined. Response received {} != YES|Y. '.format(res))
        return False

    @staticmethod
    def _download_pretrained_model(prompt=True):
        """Download the pre-trained intent extraction model files if missing.

        Args:
            prompt: when True, ask for confirmation first; a declined prompt
                terminates the process via ``sys.exit(0)``.
        """
        model_info_exists = path.isfile(
            IntentExtractionApi.pretrained_model_info)
        model_exists = path.isfile(IntentExtractionApi.pretrained_model)
        if not model_exists or not model_info_exists:
            print(
                'The pre-trained models to be downloaded for the intent extraction dataset '
                'are licensed under Apache 2.0. By downloading, you accept the terms '
                'and conditions provided by the license')
            makedirs(IntentExtractionApi.model_dir, exist_ok=True)
            if prompt is True:
                agreed = IntentExtractionApi._prompt()
                if agreed is False:
                    sys.exit(0)
            download_unlicensed_file(
                'http://nervana-modelzoo.s3.amazonaws.com/NLP/intent/',
                'model_info.dat',
                IntentExtractionApi.pretrained_model_info)
            download_unlicensed_file(
                'http://nervana-modelzoo.s3.amazonaws.com/NLP/intent/',
                'model.h5',
                IntentExtractionApi.pretrained_model)
            print('Done.')

    @staticmethod
    def display_results(text_str, predictions, intent_type):
        """Package tokens, predicted tags and intent into the response dict.

        Args:
            text_str: sequence of token strings.
            predictions: per-token tags passed to ``bio_to_spans``.
            intent_type: detected intent, stored as the document title
                (may be None).

        Returns:
            ``{'doc': {...}, 'type': 'high_level'}`` where the inner dict
            holds ``doc_text``, ``annotation_set``, ``spans`` and ``title``.
        """
        ret = {
            'annotation_set': [],
            'doc_text': ' '.join(text_str),
        }
        spans = []
        available_tags = set()
        for s, e, tag in bio_to_spans(text_str, predictions):
            spans.append({'start': s, 'end': e, 'type': tag})
            available_tags.add(tag)
        ret['annotation_set'] = list(available_tags)
        ret['spans'] = spans
        ret['title'] = intent_type
        return {'doc': ret, 'type': 'high_level'}

    def vectorize(self, doc, vocab, char_vocab=None):
        """Map tokens (and optionally their characters) to vocabulary ids.

        Words and characters that are not in the vocabulary map to id 1.

        Args:
            doc: sequence of token strings.
            vocab: word vocabulary, looked up with the lower-cased token.
            char_vocab: optional character vocabulary.

        Returns:
            The word-id array of shape ``(1, len(doc))`` when ``char_vocab``
            is None, otherwise ``[words, sentence_chars]`` where
            ``sentence_chars`` is the padded character-id input with a
            leading batch axis.
        """
        words = np.asarray([
            vocab[w.lower()] if w.lower() in vocab else 1 for w in doc
        ]).reshape(1, -1)
        if char_vocab is not None:
            sentence_chars = [
                [char_vocab[c] if c in char_vocab else 1 for c in w]
                for w in doc
            ]
            sentence_chars = np.expand_dims(pad_sentences(
                sentence_chars, self.model.word_length), axis=0)
            return [words, sentence_chars]
        return words

    def inference(self, doc):
        """Run the loaded model on raw text and return the display dict.

        Prints the detected intent (multi-task model only) and one
        ``token<TAB>tag`` line per token.

        Args:
            doc: raw input text.

        Returns:
            The dict produced by ``display_results``.
        """
        text_arr = self.process_text(doc)
        intent_type = None
        if self.model_type == 'mtl':
            doc_vec = self.vectorize(text_arr, self.word_vocab,
                                     self.char_vocab)
            intent, tags = self.model.predict(doc_vec, batch_size=1)
            intent = int(intent.argmax(1).flatten())
            intent_type = self.intent_vocab.get(intent, None)
            print('Detected intent type: {}'.format(intent_type))
        else:
            doc_vec = self.vectorize(text_arr, self.word_vocab, None)
            tags = self.model.predict(doc_vec, batch_size=1)
        tags = tags.argmax(2).flatten()
        tag_str = [self.tags_vocab.get(n, None) for n in tags]
        for t, n in zip(text_arr, tag_str):
            print('{}\t{}\t'.format(t, n))
        return self.display_results(text_arr, tag_str, intent_type)

    def load_model(self):
        """Load vocabularies and model weights from the downloaded files.

        ``tags_vocab`` and ``intent_vocab`` are stored inverted (id -> label)
        so predictions can be decoded.
        """
        # NOTE(security): pickle.load executes arbitrary code from the file.
        # The file comes from the download URL above; only use trusted copies.
        with open(IntentExtractionApi.pretrained_model_info, 'rb') as fp:
            model_info = pickle.load(fp)
        self.model_type = model_info['type']
        self.word_vocab = model_info['word_vocab']
        self.tags_vocab = {v: k for k, v in model_info['tags_vocab'].items()}
        if self.model_type == 'mtl':
            self.char_vocab = model_info['char_vocab']
            self.intent_vocab = {
                v: k for k, v in model_info['intent_vocab'].items()
            }
            model = MultiTaskIntentModel()
        else:
            model = Seq2SeqIntentModel()
        model.load(self.pretrained_model)
        self.model = model
class NerApi(AbstractApi):
    """
    NER model API

    On construction the model files are downloaded into ``model_dir`` when
    either of them is missing; ``load_model`` must then be called before
    ``inference``.
    """

    model_dir = str(LIBRARY_OUT / "ner-pretrained")
    pretrained_model = path.join(model_dir, "model_v4.h5")
    pretrained_model_info = path.join(model_dir, "model_info_v4.dat")

    def __init__(self, prompt=True):
        """
        Args:
            prompt: when True, ask the user for confirmation on stdin before
                downloading missing model files.
        """
        self.model = None
        self.model_info = None
        self.word_vocab = None
        self.y_vocab = None
        self.char_vocab = None
        self._download_pretrained_model(prompt)
        self.nlp = SpacyInstance(
            disable=["tagger", "ner", "parser", "vectors", "textcat"])

    @staticmethod
    def _prompt():
        """Ask on stdin whether the model may be downloaded.

        Returns:
            True when the (case-insensitive, stripped) answer is ``yes`` or
            ``y``; False otherwise.
        """
        response = input(
            "\nTo download '{}', please enter YES: ".format("ner"))
        res = response.lower().strip()
        if res in ("yes", "y"):
            print("Downloading {}...".format("ner"))
            return True
        print("Download declined. Response received {} != YES|Y. ".format(res))
        return False

    def _download_pretrained_model(self, prompt=True):
        """Download the pre-trained NER model files if they are missing.

        Args:
            prompt: when True, ask for confirmation first; a declined prompt
                terminates the process via ``sys.exit(0)``.
        """
        model_exists = path.isfile(self.pretrained_model)
        model_info_exists = path.isfile(self.pretrained_model_info)
        if not model_exists or not model_info_exists:
            print(
                "The pre-trained models to be downloaded for the NER dataset "
                "are licensed under Apache 2.0. By downloading, you accept the terms "
                "and conditions provided by the license")
            makedirs(self.model_dir, exist_ok=True)
            if prompt is True:
                agreed = NerApi._prompt()
                if agreed is False:
                    sys.exit(0)
            download_unlicensed_file(
                "https://s3-us-west-2.amazonaws.com/nlp-architect-data"
                "/models/ner/",
                "model_v4.h5",
                self.pretrained_model,
            )
            download_unlicensed_file(
                "https://s3-us-west-2.amazonaws.com/nlp-architect-data"
                "/models/ner/",
                "model_info_v4.dat",
                self.pretrained_model_info,
            )
            print("Done.")

    def load_model(self):
        """Load model weights and vocabularies from the downloaded files.

        ``y_vocab`` is stored inverted (id -> label) so predictions can be
        decoded.
        """
        self.model = NERCRF()
        self.model.load(self.pretrained_model)
        # NOTE(security): pickle.load executes arbitrary code from the file.
        # The file comes from the download URL above; only use trusted copies.
        with open(self.pretrained_model_info, "rb") as fp:
            model_info = pickle.load(fp)
        self.word_vocab = model_info["word_vocab"]
        self.y_vocab = {v: k for k, v in model_info["y_vocab"].items()}
        self.char_vocab = model_info["char_vocab"]

    @staticmethod
    def pretty_print(text, tags):
        """Build, print and return the response dict for tagged tokens.

        Args:
            text: sequence of token strings.
            tags: per-token tags passed to ``bio_to_spans``.

        Returns:
            ``{"doc": {...}, "type": "high_level"}`` where the inner dict
            holds ``doc_text``, ``annotation_set`` (lower-cased entity types
            in first-seen order), ``spans`` and ``title``.
        """
        spans = []
        for s, e, tag in bio_to_spans(text, tags):
            spans.append({"start": s, "end": e, "type": tag})
        # dict.fromkeys de-duplicates while keeping first-seen order.
        ents = dict.fromkeys(obj["type"].lower() for obj in spans)
        ret = {
            "doc_text": " ".join(text),
            "annotation_set": list(ents),
            "spans": spans,
            "title": "None",
        }
        print({"doc": ret, "type": "high_level"})
        return {"doc": ret, "type": "high_level"}

    def process_text(self, text):
        """Collapse all whitespace runs in ``text`` to single spaces and tokenize it."""
        input_text = " ".join(text.strip().split())
        return self.nlp.tokenize(input_text)

    def vectorize(self, doc, vocab, char_vocab):
        """Map tokens and their characters to vocabulary ids.

        Words and characters that are not in the vocabulary map to id 1.

        Args:
            doc: sequence of token strings.
            vocab: word vocabulary, looked up with the lower-cased token.
            char_vocab: character vocabulary.

        Returns:
            ``(words, sentence_chars)``: the word-id array of shape
            ``(1, len(doc))`` and the padded character-id input with a
            leading batch axis.
        """
        words = np.asarray([
            vocab[w.lower()] if w.lower() in vocab else 1 for w in doc
        ]).reshape(1, -1)
        sentence_chars = [
            [char_vocab[c] if c in char_vocab else 1 for c in w]
            for w in doc
        ]
        sentence_chars = np.expand_dims(
            pad_sentences(sentence_chars, self.model.word_length), axis=0)
        return words, sentence_chars

    def inference(self, doc):
        """Tag raw text with the loaded model and return the display dict.

        Args:
            doc: raw input text.

        Returns:
            The dict produced by ``pretty_print``.
        """
        text_arr = self.process_text(doc)
        doc_vec = self.vectorize(text_arr, self.word_vocab, self.char_vocab)
        seq_len = np.array([len(text_arr)]).reshape(-1, 1)
        # The model takes the sequence length as a third input.
        # pylint: disable=no-member
        inputs = list(doc_vec) + [seq_len]
        doc_ner = self.model.predict(inputs, batch_size=1).argmax(2).flatten()
        tags = [self.y_vocab.get(n, None) for n in doc_ner]
        return self.pretty_print(text_arr, tags)
validate_existing_filepath(settings_path) # load model and parameters model = SequenceChunker() model.load(model_path) word_length = model.max_word_len with open(settings_path, 'rb') as fp: model_params = pickle.load(fp) word_vocab = model_params['word_vocab'] chunk_vocab = model_params['chunk_vocab'] char_vocab = model_params.get('char_vocab', None) # parse documents and get tokens nlp = SpacyInstance( disable=['tagger', 'ner', 'parser', 'vectors', 'textcat']) with open(args.input_file) as fp: document_texts = [nlp.tokenize(t.strip()) for t in fp.readlines()] # vectorize input tokens and run inference doc_vecs = vectorize(document_texts, word_vocab, char_vocab) document_annotations = [] for vec in doc_vecs: doc_chunks = model.predict(vec, batch_size=args.b) chunk_a = [ chunk_vocab.id_to_word(l) for l in doc_chunks.argmax(2).flatten() ] document_annotations.append(chunk_a) # print document text and annotations build_annotation(document_texts, document_annotations)
class NerApi(AbstractApi):
    """
    NER model API

    Downloads the pre-trained model files on construction when they are
    missing; call ``load_model`` before ``inference``.
    """

    model_dir = path.join(LIBRARY_STORAGE_PATH, 'ner-pretrained')
    pretrained_model = path.join(model_dir, 'model.h5')
    pretrained_model_info = path.join(model_dir, 'model_info.dat')

    def __init__(self, prompt=True):
        """
        Args:
            prompt: when True, ask for confirmation on stdin before
                downloading missing model files.
        """
        self.model = None
        self.model_info = None
        self.word_vocab = None
        self.y_vocab = None
        self.char_vocab = None
        self._download_pretrained_model(prompt)
        self.nlp = SpacyInstance(
            disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])

    @staticmethod
    def _prompt():
        """Return True when the user answers ``yes``/``y`` (any case) on stdin."""
        answer = input(
            '\nTo download \'{}\', please enter YES: '.format('ner'))
        normalized = answer.lower().strip()
        accepted = normalized in ('yes', 'y')
        if accepted:
            print('Downloading {}...'.format('ner'))
        else:
            print('Download declined. Response received {} != YES|Y. '.format(
                normalized))
        return accepted

    def _download_pretrained_model(self, prompt=True):
        """Fetch the pre-trained NER model files unless both already exist.

        Args:
            prompt: when True, ask for confirmation first; a declined prompt
                terminates the process via ``sys.exit(0)``.
        """
        have_model = path.isfile(self.pretrained_model)
        have_info = path.isfile(self.pretrained_model_info)
        if have_model and have_info:
            return

        print(
            'The pre-trained models to be downloaded for the NER dataset '
            'are licensed under Apache 2.0. By downloading, you accept the terms '
            'and conditions provided by the license')
        makedirs(self.model_dir, exist_ok=True)
        if prompt is True and NerApi._prompt() is False:
            sys.exit(0)

        base_url = 'http://nervana-modelzoo.s3.amazonaws.com/NLP/ner/'
        targets = (
            ('model.h5', self.pretrained_model),
            ('model_info.dat', self.pretrained_model_info),
        )
        for remote_name, local_path in targets:
            download_unlicensed_file(base_url, remote_name, local_path)
        print('Done.')

    def load_model(self):
        """Load the model weights and the pickled vocabularies.

        ``y_vocab`` is stored inverted (id -> label).
        """
        ner_model = NERCRF()
        self.model = ner_model
        ner_model.load(self.pretrained_model)
        with open(self.pretrained_model_info, 'rb') as info_file:
            info = pickle.load(info_file)
        self.word_vocab = info['word_vocab']
        self.y_vocab = {idx: label for label, idx in info['y_vocab'].items()}
        self.char_vocab = info['char_vocab']

    @staticmethod
    def pretty_print(text, tags):
        """Build, print and return the response dict for tagged tokens.

        Args:
            text: sequence of token strings.
            tags: per-token tags passed to ``bio_to_spans``.

        Returns:
            ``{"doc": {...}, 'type': 'high_level'}`` with ``doc_text``,
            ``annotation_set``, ``spans`` and ``title`` in the inner dict.
        """
        spans = [
            {'start': begin, 'end': finish, 'type': label}
            for begin, finish, label in bio_to_spans(text, tags)
        ]
        by_type = {span['type'].lower(): span for span in spans}
        document = {
            'doc_text': ' '.join(text),
            'annotation_set': list(by_type.keys()),
            'spans': spans,
            'title': 'None'
        }
        print({"doc": document, 'type': 'high_level'})
        return {"doc": document, 'type': 'high_level'}

    def process_text(self, text):
        """Collapse whitespace runs in ``text`` to single spaces and tokenize it."""
        normalized = ' '.join(text.strip().split())
        return self.nlp.tokenize(normalized)

    def vectorize(self, doc, vocab, char_vocab):
        """Map tokens and their characters to ids (unknown entries become 1).

        Returns:
            ``(words, sentence_chars)``: word ids reshaped to
            ``(1, len(doc))`` and padded character ids with a leading batch
            axis.
        """
        word_ids = []
        for token in doc:
            lowered = token.lower()
            word_ids.append(vocab[lowered] if lowered in vocab else 1)
        words = np.asarray(word_ids).reshape(1, -1)

        char_ids = [
            [char_vocab[ch] if ch in char_vocab else 1 for ch in token]
            for token in doc
        ]
        padded = pad_sentences(char_ids, self.model.word_length)
        return words, np.expand_dims(padded, axis=0)

    def inference(self, doc):
        """Tag raw text with the loaded model and return the display dict."""
        tokens = self.process_text(doc)
        vectors = self.vectorize(tokens, self.word_vocab, self.char_vocab)
        seq_len = np.array([len(tokens)]).reshape(-1, 1)
        model_inputs = list(vectors)
        # Only the 'pad' CRF mode takes the sequence length as an extra input.
        if self.model.crf_mode == 'pad':
            model_inputs = list(vectors) + [seq_len]
        predicted = self.model.predict(model_inputs, batch_size=1)
        label_ids = predicted.argmax(2).flatten()
        labels = [self.y_vocab.get(label_id, None) for label_id in label_ids]
        return self.pretty_print(tokens, labels)
class MatchLSTMAnswerPointer(object):
    """
    Defines end to end MatchLSTM and Answer_Pointer network for Reading Comprehension
    """

    def __init__(self, params_dict, embeddings):
        """
        Args:
            params_dict: Dictionary containing the following keys-
                         'max_question' : max length of all questions in the dataset
                         'max_para' : max length of all paragraphs in the dataset
                         'hidden_size': number of hidden units in the network
                         'batch_size' : batch size defined by user
                         'inference_only' : when False, the labels placeholder,
                                            loss and optimizer are also created
            embeddings: Glove pretrained embedding matrix
        """

        # Assign Variables:
        self.max_question = params_dict["max_question"]
        self.max_para = params_dict["max_para"]
        self.hidden_size = params_dict["hidden_size"]
        self.batch_size = params_dict["batch_size"]
        self.embeddings = embeddings
        self.inference_only = params_dict["inference_only"]
        self.G_i = None
        self.attn = None
        self.stacked_lists_forward = None
        self.stacked_lists_reverse = None
        self.logits_withsf = None

        # init tokenizer
        self.tokenizer = SpacyInstance(
            disable=["tagger", "ner", "parser", "vectors", "textcat"])

        # Create Placeholders
        # Question ids
        self.question_ids = tf.placeholder(tf.int32,
                                           shape=[None, self.max_question],
                                           name="question_ids")
        # Paragraph ids
        self.para_ids = tf.placeholder(tf.int32,
                                       shape=[None, self.max_para],
                                       name="para_ids")
        # Length of question
        self.question_length = tf.placeholder(tf.int32,
                                              shape=[None],
                                              name="question_len")
        # Length of paragraph
        self.para_length = tf.placeholder(tf.int32,
                                          shape=[None],
                                          name="para_len")
        # Mask for paragraph
        self.para_mask = tf.placeholder(tf.float32,
                                        shape=[None, self.max_para],
                                        name="para_mask")
        # Mask for question
        self.ques_mask = tf.placeholder(tf.float32,
                                        shape=[None, self.max_question],
                                        name="ques_mask")
        # Answer spans
        if self.inference_only is False:
            self.labels = tf.placeholder(tf.int32,
                                         shape=[None, 2],
                                         name="labels")
        # Dropout value
        self.dropout = tf.placeholder(tf.float32, shape=[], name="dropout")
        self.global_step = tf.Variable(0, name="global")

        # Get variables
        self.create_variables()

        # Define model
        self.create_model()

    def create_variables(self):
        """
        Function to create variables used for training

        """
        # define all variables required for training
        self.W_p = tf.get_variable("W_p",
                                   [1, self.hidden_size, self.hidden_size])
        self.W_r = tf.get_variable("W_r",
                                   [1, self.hidden_size, self.hidden_size])
        self.W_q = tf.get_variable("W_q",
                                   [1, self.hidden_size, self.hidden_size])
        self.w_lr = tf.get_variable("w_lr", [1, 1, self.hidden_size])
        self.b_p = tf.get_variable("b_p", [1, self.hidden_size, 1])
        self.c_p = tf.get_variable("c_p", [1])
        self.ones_vector = tf.constant(np.ones([1, self.max_question]),
                                       dtype=tf.float32)
        self.ones_vector_exp = tf.tile(tf.expand_dims(self.ones_vector, 0),
                                       [self.batch_size, 1, 1])
        self.ones_vector_para = tf.constant(np.ones([1, self.max_para]),
                                            dtype=tf.float32)
        self.ones_para_exp = tf.tile(tf.expand_dims(self.ones_vector_para, 0),
                                     [self.batch_size, 1, 1])
        self.ones_embed = tf.tile(
            tf.expand_dims(
                tf.constant(np.ones([1, self.hidden_size]), dtype=tf.float32),
                0),
            [self.batch_size, 1, 1],
        )
        self.V_r = tf.get_variable("V_r",
                                   [1, self.hidden_size, 2 * self.hidden_size])
        self.W_a = tf.get_variable("W_a",
                                   [1, self.hidden_size, self.hidden_size])
        self.b_a = tf.get_variable("b_a", [1, self.hidden_size, 1])
        self.v_a_pointer = tf.get_variable("v_a_pointer",
                                           [1, 1, self.hidden_size])
        self.c_pointer = tf.get_variable("c_pointer", [1, 1, 1])
        self.Wans_q = tf.get_variable("Wans_q",
                                      [1, self.hidden_size, self.hidden_size])
        self.Wans_v = tf.get_variable("Wans_v",
                                      [1, self.hidden_size, self.hidden_size])
        self.Vans_r = tf.get_variable("Vans_r",
                                      [1, self.hidden_size, self.max_question])
        self.mask_ques_mul = tf.matmul(
            tf.transpose(self.ones_embed, [0, 2, 1]),
            tf.expand_dims(self.ques_mask, 1))

    def create_model(self):
        """
        Function to set up the end 2 end reading comprehension model

        """
        # Embedding Layer
        embedding_lookup = tf.Variable(self.embeddings,
                                       name="word_embeddings",
                                       dtype=tf.float32,
                                       trainable=False)
        # Embedding Lookups
        self.question_emb = tf.nn.embedding_lookup(embedding_lookup,
                                                   self.question_ids,
                                                   name="question_embed")
        self.para_emb = tf.nn.embedding_lookup(embedding_lookup,
                                               self.para_ids,
                                               name="para_embed")
        # Apply dropout after embeddings
        self.question = tf.nn.dropout(self.question_emb, self.dropout)
        self.para = tf.nn.dropout(self.para_emb, self.dropout)

        # Encoding Layer
        # Share weights of pre-processing LSTM layer with both para and
        # question
        with tf.variable_scope("encoded_question"):
            self.lstm_cell_question = tf.nn.rnn_cell.BasicLSTMCell(
                self.hidden_size, state_is_tuple=True)
            self.encoded_question, _ = tf.nn.dynamic_rnn(
                self.lstm_cell_question,
                self.question,
                self.question_length,
                dtype=tf.float32)
        with tf.variable_scope("encoded_para"):
            self.encoded_para, _ = tf.nn.dynamic_rnn(self.lstm_cell_question,
                                                     self.para,
                                                     self.para_length,
                                                     dtype=tf.float32)

        # Define Match LSTM and Answer Pointer Cells
        with tf.variable_scope("match_lstm_cell"):
            self.match_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(
                self.hidden_size, state_is_tuple=True)
        with tf.variable_scope("answer_pointer_cell"):
            self.lstm_pointer_cell = tf.nn.rnn_cell.BasicLSTMCell(
                self.hidden_size, state_is_tuple=True)

        print("Match LSTM Pass")
        # Match LSTM Pass in forward direction
        self.unroll_with_attention(reverse=False)
        self.encoded_para_reverse = tf.reverse(self.encoded_para, axis=[1])
        # Match LSTM Pass in reverse direction
        self.unroll_with_attention(reverse=True)

        # Apply dropout
        self.stacked_lists = tf.concat(
            [
                tf.nn.dropout(self.stacked_lists_forward,
                              tf.maximum(self.dropout, 0.8)),
                tf.nn.dropout(self.stacked_lists_reverse,
                              tf.maximum(self.dropout, 0.8)),
            ],
            1,
        )

        # Answer pointer pass
        self.logits = self.answer_pointer_pass()

        if self.inference_only is False:
            print("Settting up Loss")
            # Compute Losses
            loss_1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits[0], labels=self.labels[:, 0])
            loss_2 = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits[1], labels=self.labels[:, 1])
            # Total Loss
            self.loss = tf.reduce_mean(loss_1 + loss_2)
            self.learning_rate = tf.constant(0.002)
            print("Set up optmizer")
            # Optmizer (minimize() also increments global_step on every run)
            self.optimizer = tf.train.AdamOptimizer(
                self.learning_rate).minimize(self.loss,
                                             global_step=self.global_step)

    def unroll_with_attention(self, reverse=False):
        """
        Function to run the match_lstm pass in both forward and reverse directions

        Stores the masked outputs in ``self.stacked_lists_forward`` or
        ``self.stacked_lists_reverse``.

        Args:
            reverse: Boolean indicating whether to unroll in reverse directions

        """
        # Intitialze first hidden_state with zeros
        h_r_old = tf.constant(np.zeros([self.batch_size, self.hidden_size, 1]),
                              dtype=tf.float32)
        final_state_list = []

        # The batch-tiled weights do not depend on the time step, so they are
        # built once instead of once per paragraph position.
        W_p_expanded = tf.tile(self.W_p, [self.batch_size, 1, 1])
        W_q_expanded = tf.tile(self.W_q, [self.batch_size, 1, 1])
        W_r_expanded = tf.tile(self.W_r, [self.batch_size, 1, 1])
        w_lr_expanded = tf.tile(self.w_lr, [self.batch_size, 1, 1])
        b_p_expanded = tf.tile(self.b_p, [self.batch_size, 1, 1])

        for i in range(self.max_para):
            if not reverse:
                encoded_paraslice = tf.gather(self.encoded_para,
                                              indices=i,
                                              axis=1)
            else:
                encoded_paraslice = tf.gather(self.encoded_para_reverse,
                                              indices=i,
                                              axis=1)

            int_sum = (
                tf.matmul(W_p_expanded, tf.expand_dims(encoded_paraslice, 2)) +
                tf.matmul(W_r_expanded, h_r_old) + b_p_expanded)
            int_sum_new = tf.matmul(int_sum,
                                    tf.expand_dims(self.ques_mask, 1))
            int_sum1 = tf.matmul(
                W_q_expanded, tf.transpose(self.encoded_question, [0, 2, 1]))
            self.G_i = tf.nn.tanh(int_sum_new + int_sum1) + tf.expand_dims(
                self.c_p * self.ques_mask, 1)

            # Attention Vector
            self.attn = tf.nn.softmax(tf.matmul(w_lr_expanded, self.G_i))
            z1 = encoded_paraslice
            z2 = tf.squeeze(
                tf.matmul(
                    tf.transpose(self.encoded_question, [0, 2, 1]),
                    tf.transpose(self.attn, [0, 2, 1]),
                ),
                axis=2,
            )
            z_i_stacked = tf.concat([z1, z2], 1)
            if i == 0:
                h_r_old, cell_state_old = self.match_lstm_cell(
                    z_i_stacked,
                    state=self.match_lstm_cell.zero_state(self.batch_size,
                                                          dtype=tf.float32),
                )
            else:
                h_r_old, cell_state_old = self.match_lstm_cell(
                    z_i_stacked, state=cell_state_old)
            final_state_list.append(h_r_old)
            h_r_old = tf.expand_dims(h_r_old, 2)

        stacked_lists = tf.stack(final_state_list, 1)
        if not reverse:
            # Mask Output
            mask_mult_lstm_forward = tf.matmul(
                tf.transpose(self.ones_embed, [0, 2, 1]),
                tf.expand_dims(self.para_mask, 1))
            self.stacked_lists_forward = tf.multiply(
                tf.transpose(stacked_lists, [0, 2, 1]),
                mask_mult_lstm_forward)
        else:
            # Mask Output
            mask_mult_lstm_reverse = tf.matmul(
                tf.transpose(self.ones_embed, [0, 2, 1]),
                tf.expand_dims(tf.reverse(self.para_mask, axis=[1]), 1),
            )
            self.stacked_lists_reverse = tf.reverse(
                tf.multiply(tf.transpose(stacked_lists, [0, 2, 1]),
                            mask_mult_lstm_reverse),
                axis=[2],
            )

    def answer_pointer_pass(self):
        """
        Function to run the answer pointer pass:

        Also stores the softmax of both logits in ``self.logits_withsf``.

        Args:
            None

        Returns:
            List of logits for start and end indices of the answer
        """
        V_r_expanded = tf.tile(self.V_r, [self.batch_size, 1, 1])
        W_a_expanded = tf.tile(self.W_a, [self.batch_size, 1, 1])
        b_a_expanded = tf.tile(self.b_a, [self.batch_size, 1, 1])
        mask_multiplier_1 = tf.expand_dims(self.para_mask, 1)
        mask_multiplier = self.ones_para_exp
        v_apointer_exp = tf.tile(self.v_a_pointer, [self.batch_size, 1, 1])

        # Zero initialization
        h_k_old = tf.constant(np.zeros([self.batch_size, self.hidden_size, 1]),
                              dtype=tf.float32)
        b_k_lists = []
        print("Answer Pointer Pass")
        # Two steps: the first scores the start index, the second the end index
        for i in range(0, 2):
            sum1 = tf.matmul(V_r_expanded, self.stacked_lists)
            sum2 = tf.matmul(W_a_expanded, h_k_old) + b_a_expanded
            F_k = tf.nn.tanh(sum1 + tf.matmul(sum2, mask_multiplier))
            b_k_withoutsf = tf.matmul(v_apointer_exp, F_k)
            # log(mask) pushes masked-out positions to -inf before the softmax
            b_k = tf.nn.softmax(b_k_withoutsf + tf.log(mask_multiplier_1))
            lstm_cell_inp = tf.squeeze(tf.matmul(self.stacked_lists,
                                                 tf.transpose(b_k, [0, 2, 1])),
                                       axis=2)
            with tf.variable_scope("lstm_pointer"):
                if i == 0:
                    h_k_old, cell_state_pointer = self.lstm_pointer_cell(
                        lstm_cell_inp,
                        state=self.lstm_pointer_cell.zero_state(
                            self.batch_size, dtype=tf.float32),
                    )
                else:
                    h_k_old, cell_state_pointer = self.lstm_pointer_cell(
                        lstm_cell_inp, state=cell_state_pointer)

            h_k_old = tf.expand_dims(h_k_old, 2)
            b_k_lists.append(b_k_withoutsf + tf.log(mask_multiplier_1))

        self.logits_withsf = [
            tf.nn.softmax(tf.squeeze(b_k_lists[0], axis=1)),
            tf.nn.softmax(tf.squeeze(b_k_lists[1], axis=1)),
        ]
        return [
            tf.squeeze(b_k_lists[0], axis=1),
            tf.squeeze(b_k_lists[1], axis=1)
        ]

    @staticmethod
    def obtain_indices(preds_start, preds_end, max_answer_len=15):
        """
        Function to get answer indices given the predictions

        For every example the pair (start, end) with start <= end <
        start + max_answer_len that maximizes
        preds_start[start] + preds_end[end] is selected; the first such pair
        wins on ties.

        Args:
            preds_start: predicted start scores, one row per example
            preds_end: predicted end scores, one row per example
            max_answer_len: maximum number of tokens in an answer span

        Returns:
            final start and end indices for the answer
        """
        ans_start = []
        ans_end = []
        for i in range(preds_start.shape[0]):
            max_ans_id = -100000000
            st_idx = 0
            en_idx = 0
            ele1 = preds_start[i]
            ele2 = preds_end[i]
            len_para = len(ele1)
            for j in range(len_para):
                for k in range(max_answer_len):
                    if j + k >= len_para:
                        break
                    ans_start_int = ele1[j]
                    ans_end_int = ele2[j + k]
                    if (ans_start_int + ans_end_int) > max_ans_id:
                        max_ans_id = ans_start_int + ans_end_int
                        st_idx = j
                        en_idx = j + k
            ans_start.append(st_idx)
            ans_end.append(en_idx)
        return (np.array(ans_start), np.array(ans_end))

    def cal_f1_score(self, ground_truths, predictions):
        """
        Function to calculate F-1 and EM scores

        Args:
            ground_truths: labels given in the dataset
            predictions: pair of per-example start and end scores predicted
                         by the network

        Returns:
            F1 score and Exact-Match score
        """
        start_idx, end_idx = self.obtain_indices(predictions[0],
                                                 predictions[1])
        f1 = 0
        exact_match = 0
        for i in range(self.batch_size):
            ele1 = start_idx[i]
            ele2 = end_idx[i]
            # Expand both spans into the token positions they cover
            preds = np.linspace(ele1, ele2, abs(ele2 - ele1 + 1))
            length_gts = abs(ground_truths[i][1] - ground_truths[i][0] + 1)
            gts = np.linspace(ground_truths[i][0], ground_truths[i][1],
                              length_gts)
            common = Counter(preds) & Counter(gts)
            num_same = sum(common.values())
            exact_match += int(np.array_equal(preds, gts))
            if num_same == 0:
                f1 += 0
            else:
                precision = 1.0 * num_same / len(preds)
                recall = 1.0 * num_same / len(gts)
                f1 += (2 * precision * recall) / (precision + recall)
        return 100 * (f1 / self.batch_size), 100 * (exact_match /
                                                    self.batch_size)

    def get_dynamic_feed_params(self, question_str, vocab_reverse):
        """
        Function to get required feed_dict format for user entered questions.
        Used mainly in the demo mode.

        Questions with more than max_question tokens are clipped to
        max_question tokens so that the result always fits the placeholders.

        Args:
            question_str: question string
            vocab_reverse: vocab dictionary with words as keys and indices as values

        Returns:
            question_idx: list of indices representing the question padded to max length
            question_len: actual (clipped) length of the question
            ques_mask: mask for question_idx
        """
        question_words = [
            word.replace("``", '"').replace("''", '"')
            for word in self.tokenizer.tokenize(question_str)
        ]
        # NOTE(review): a word missing from vocab_reverse raises KeyError; the
        # id of an "unknown word" entry is not visible here, so none is assumed.
        question_ids = [vocab_reverse[ele] for ele in question_words]
        # Previously question_idx was only assigned for questions shorter than
        # max_question, so longer (or exactly max-length) questions crashed.
        question_ids = question_ids[:self.max_question]
        question_len = len(question_ids)
        pad_length = self.max_question - question_len
        question_idx = question_ids + [0] * pad_length
        ques_mask = np.zeros([1, self.max_question])
        ques_mask[0, 0:question_len] = 1
        ques_mask = ques_mask.tolist()[0]
        return question_idx, question_len, ques_mask

    def run_loop(self, session, train, mode="train", dropout=0.6):
        """
        Function to run training/validation loop and display training loss, F1 & EM scores

        Args:
            session: tensorflow session
            train: data dictionary for training/validation
            dropout: float value fed to the dropout placeholder
            mode: 'train'/'val'
        """
        # Only full batches are processed; a trailing partial batch is dropped
        nbatches = len(train["para"]) // self.batch_size
        f1_score = 0
        em_score = 0
        for idx in range(nbatches):
            # Train for all batches
            start_batch = self.batch_size * idx
            end_batch = self.batch_size * (idx + 1)
            if end_batch > len(train["para"]):
                break
            # Create feed dictionary
            feed_dict_qa = {
                self.para_ids:
                np.asarray(train["para"][start_batch:end_batch]),
                self.question_ids:
                np.asarray(train["question"][start_batch:end_batch]),
                self.para_length:
                np.asarray(train["para_len"][start_batch:end_batch]),
                self.question_length:
                np.asarray(train["question_len"][start_batch:end_batch]),
                self.labels:
                np.asarray(train["answer"][start_batch:end_batch]),
                self.para_mask:
                np.asarray(train["para_mask"][start_batch:end_batch]),
                self.ques_mask:
                np.asarray(train["question_mask"][start_batch:end_batch]),
                self.dropout:
                dropout,
            }
            # Training Phase
            if mode == "train":
                # Running the optimizer op also advances global_step, so no
                # separate assign op is needed (the one built here before was
                # never run and only grew the graph).
                _, train_loss, _, logits, labels = session.run(
                    [
                        self.optimizer,
                        self.loss,
                        self.learning_rate,
                        self.logits_withsf,
                        self.labels,
                    ],
                    feed_dict=feed_dict_qa,
                )
                if idx % 20 == 0:
                    print("iteration = {}, train loss = {}".format(
                        idx, train_loss))
                    f1_score, em_score = self.cal_f1_score(labels, logits)
                    print("F-1 and EM Scores are", f1_score, em_score)
            else:
                logits, labels = session.run([self.logits_withsf, self.labels],
                                             feed_dict=feed_dict_qa)
                f1_score_int, em_score_int = self.cal_f1_score(labels, logits)
                f1_score += f1_score_int
                em_score += em_score_int
        # Validation Phase
        if mode == "val":
            print("Validation F1 and EM scores are", f1_score / nbatches,
                  em_score / nbatches)

    # pylint: disable=inconsistent-return-statements
    def inference_mode(
        self,
        session,
        valid,
        vocab_tuple,
        num_examples,
        dropout=1.0,
        dynamic_question_mode=False,
        dynamic_usr_question="",
        dynamic_question_index=0,
    ):
        """
        Function to run inference_mode for reading comprehension

        Args:
            session: tensorflow session
            valid: data dictionary for validation set
            vocab_tuple: a tuple containing vocab dictionaries in forward and reverse directions
            num_examples : specify the number of samples to run for inference
            dropout : Float value which is always 1.0 for inference
            dynamic_question_mode : boolean to enable whether or not accept
                                    questions from the user(used in the demo mode)
            dynamic_usr_question : question string entered by the user
                                   (dynamic mode only)
            dynamic_question_index : index into ``valid`` of the paragraph the
                                     user question refers to (dynamic mode only)

        Returns:
            ``{"answer": answer}`` for the first processed example in dynamic
            question mode, otherwise None (answers are only printed).
        """
        vocab_forward = vocab_tuple[0]
        vocab_reverse = vocab_tuple[1]

        for idx in range(num_examples):
            if dynamic_question_mode is True:
                idx = dynamic_question_index
                required_params = self.get_dynamic_feed_params(
                    dynamic_usr_question, vocab_reverse)
                question_ids = required_params[0]
                question_length = required_params[1]
                ques_mask = required_params[2]
                test_paragraph = [
                    vocab_forward[ele] for ele in valid[idx][0] if ele != 0
                ]
                para_string = " ".join(map(str, test_paragraph))
            else:
                # Print Paragraph
                print("\n")
                print("Paragraph Number AA:", idx)
                test_paragraph = [
                    vocab_forward[ele].replace(" ", "")
                    for ele in valid[idx][0] if ele != 0
                ]
                para_string = " ".join(map(str, test_paragraph))
                print(para_string)

                # Print corresponding Question
                test_question = [
                    vocab_forward[ele].replace(" ", "")
                    for ele in valid[idx][1] if ele != 0
                ]
                ques_string = " ".join(map(str, test_question))
                print("Question:", ques_string)
                question_ids = valid[idx][1]
                question_length = valid[idx][3]
                ques_mask = valid[idx][6]

            # Create a feed dictionary
            feed_dict_qa = {
                self.para_ids: np.expand_dims(valid[idx][0], 0),
                self.question_ids: np.expand_dims(question_ids, 0),
                self.para_length: np.expand_dims(valid[idx][2], 0),
                self.question_length: np.expand_dims(question_length, 0),
                self.para_mask: np.expand_dims(valid[idx][5], 0),
                self.ques_mask: np.expand_dims(ques_mask, 0),
                self.dropout: dropout,
            }

            # Run session and obtain indices
            predictions = session.run([self.logits_withsf],
                                      feed_dict=feed_dict_qa)

            # Get the start and end indices of the answer
            start_idx, end_idx = self.obtain_indices(predictions[0][0],
                                                     predictions[0][1])
            answer_ind = valid[idx][0][start_idx[0]:end_idx[0] + 1]

            # Print answer
            req_ans = [
                vocab_forward[ele].replace(" ", "") for ele in answer_ind
                if ele != 0
            ]
            ans_string = " ".join(map(str, req_ans))
            # Remove the space in front of punctuation marks
            answer = re.sub(r'\s([?.!",])', r"\1", ans_string)
            print("Answer:", answer)

            if dynamic_question_mode is True:
                return {"answer": answer}