def _process_input(self, data_raw):
    """Turn raw examples into word vectors, answer indices and input masks.

    Every element of ``data_raw`` is read through its ``"C"`` (context),
    ``"Q"`` (question) and ``"A"`` (answer) keys.  Context and question are
    lower-cased and split on single spaces, empty tokens are dropped, and each
    remaining token is handed to ``utils.process_word`` with
    ``to_return="word2vec"``.  The answer is handed over unchanged with
    ``to_return="index"``.

    The input mask depends on ``self.input_mask_mode``:

    * ``'word'``     -- ``range(len(context_tokens))`` (every token position)
    * ``'sentence'`` -- positions of the tokens that are exactly ``'.'``

    Args:
        data_raw: iterable of examples indexable by "C", "Q" and "A".

    Returns:
        Five parallel lists, in this order:
        ``(inputs, questions, answers, fact_counts, input_masks)`` where
        ``fact_counts[i] == len(input_masks[i])``.

    Raises:
        Exception: when ``self.input_mask_mode`` is neither ``'word'`` nor
            ``'sentence'``.  The check runs per example, so it is never
            reached for empty ``data_raw``.
    """

    def tokenize(text):
        # Lower-case, split on single spaces, discard empty fragments.
        return [piece for piece in text.lower().split(' ') if piece]

    def embed(tokens):
        # One utils.process_word call per token, in token order.
        return [
            utils.process_word(
                word=token,
                word2vec=self.word2vec,
                vocab=self.vocab,
                ivocab=self.ivocab,
                word_vector_size=self.word_vector_size,
                to_return="word2vec",
            )
            for token in tokens
        ]

    inputs, questions, answers = [], [], []
    fact_counts, input_masks = [], []

    for example in data_raw:
        context_tokens = tokenize(example["C"])
        question_tokens = tokenize(example["Q"])

        # Context is embedded before the question; the order of the
        # process_word calls is kept exactly as in the original code.
        context_vectors = embed(context_tokens)
        question_vectors = embed(question_tokens)

        mode = self.input_mask_mode
        if mode == 'word':
            mask = range(len(context_tokens))
        elif mode == 'sentence':
            mask = [
                position
                for position, token in enumerate(context_tokens)
                if token == '.'
            ]
        else:
            raise Exception("unknown input_mask_mode")

        inputs.append(context_vectors)
        questions.append(question_vectors)

        # NOTE: the answer is assumed to be a single word.
        answer_index = utils.process_word(
            word=example["A"],
            word2vec=self.word2vec,
            vocab=self.vocab,
            ivocab=self.ivocab,
            word_vector_size=self.word_vector_size,
            to_return="index",
        )
        answers.append(answer_index)

        fact_counts.append(len(mask))
        input_masks.append(mask)

    return inputs, questions, answers, fact_counts, input_masks
def _process_input(self, data_raw):
    """Encode raw examples as one-hot documents, answer indices and masks.

    For each example in ``data_raw`` the context (``"C"``) and the question
    (``"Q"``) are encoded with ``utils.get_one_hot_doc`` using
    ``self.char_vocab`` and ``max_length=self.max_doc_length``.  The answer
    (``"A"``) is converted with ``utils.process_word(..., to_return="index")``.

    Args:
        data_raw: iterable of examples indexable by "C", "Q" and "A".

    Returns:
        Four parallel Python lists, in this order:
        ``(inputs, questions, answers, input_masks)``.  Each entry of
        ``input_masks`` is an ``np.int32`` array of positions.

    Raises:
        Exception: when ``self.input_mask_mode`` is neither ``'word'`` nor
            ``'sentence'`` (checked per example).
    """
    inputs, questions, answers, input_masks = [], [], [], []

    for example in data_raw:
        context_doc = utils.get_one_hot_doc(
            example["C"], self.char_vocab, max_length=self.max_doc_length)
        question_doc = utils.get_one_hot_doc(
            example["Q"], self.char_vocab, max_length=self.max_doc_length)

        # NOTE: the answer is assumed to be a single word.
        answers.append(
            utils.process_word(
                word=example["A"],
                word2vec=self.word2vec,
                vocab=self.vocab,
                ivocab=self.ivocab,
                word_vector_size=self.word_vector_size,
                to_return="index",
            )
        )

        if self.input_mask_mode == 'word':
            keep_all = True
        elif self.input_mask_mode == 'sentence':
            keep_all = False
        else:
            raise Exception("invalid input_mask_mode")

        # 'word' keeps every position; 'sentence' keeps only positions whose
        # element compares equal to '.'.
        # NOTE(review): context_doc is whatever get_one_hot_doc returns; if
        # its elements are not plain strings the '.' comparison may not
        # behave as intended -- confirm against utils.
        positions = [
            position
            for position, element in enumerate(context_doc)
            if keep_all or element == '.'
        ]
        input_masks.append(np.array(positions, dtype=np.int32))

        inputs.append(context_doc)
        questions.append(question_doc)

    return inputs, questions, answers, input_masks
def _process_input(self, data_raw):
    """Build model inputs from raw examples using one-hot document encoding.

    Per example, ``x["C"]`` and ``x["Q"]`` go through
    ``utils.get_one_hot_doc`` (with ``self.char_vocab`` and
    ``max_length=self.max_doc_length``) and ``x["A"]`` goes through
    ``utils.process_word`` with ``to_return="index"``.

    Args:
        data_raw: iterable of examples indexable by "C", "Q" and "A".

    Returns:
        ``(inputs, questions, answers, input_masks)`` -- four Python lists
        with one entry per example.  Mask entries are ``np.int32`` arrays.

    Raises:
        Exception: if ``self.input_mask_mode`` is not ``"word"`` or
            ``"sentence"``; raised while processing the first example.
    """
    inputs = []
    questions = []
    answers = []
    input_masks = []

    for record in data_raw:
        encoded_context = utils.get_one_hot_doc(
            record["C"], self.char_vocab, max_length=self.max_doc_length
        )
        encoded_question = utils.get_one_hot_doc(
            record["Q"], self.char_vocab, max_length=self.max_doc_length
        )

        # NOTE: the answer is assumed to be a single word.
        answer = utils.process_word(
            word=record["A"],
            word2vec=self.word2vec,
            vocab=self.vocab,
            ivocab=self.ivocab,
            word_vector_size=self.word_vector_size,
            to_return="index",
        )
        answers.append(answer)

        mask_mode = self.input_mask_mode
        if mask_mode == "word":
            # One mask entry per element of the encoded context.
            mask_positions = [idx for idx, _ in enumerate(encoded_context)]
        elif mask_mode == "sentence":
            # Only elements equal to "." contribute a mask entry.
            # NOTE(review): depends on what get_one_hot_doc yields when
            # iterated; verify the comparison against "." is meaningful.
            mask_positions = [
                idx for idx, item in enumerate(encoded_context) if item == "."
            ]
        else:
            raise Exception("invalid input_mask_mode")
        input_masks.append(np.array(mask_positions, dtype=np.int32))

        inputs.append(encoded_context)
        questions.append(encoded_question)

    return inputs, questions, answers, input_masks
def _process_input(self, data_raw):
    """Convert raw examples into vectorised inputs, questions, answers, masks.

    For each example the context (``"C"``) and question (``"Q"``) are
    lower-cased, split on single spaces (empty tokens dropped) and passed to
    ``utils.process_sent``.  The answer (``"A"``) is passed to
    ``utils.process_word`` with ``to_return="bool"``.

    The input mask depends on ``self.input_mask_mode``:

    * ``'word'``     -- every token position of the context.
    * ``'sentence'`` -- positions of the tokens ``'.'``, ``'?'`` or ``'!'``.
      An example whose context has none of these tokens is skipped entirely
      (a message is printed) and contributes to none of the returned lists.

    Previously the mask of a skipped example was still appended, which left
    ``input_masks`` longer than, and misaligned with, the other three lists.

    Args:
        data_raw: iterable of examples indexable by "C", "Q" and "A".

    Returns:
        ``(inputs, questions, answers, input_masks)`` -- four Python lists of
        equal length, one entry per kept example.  ``inputs`` / ``questions``
        entries are ``np.vstack``-ed arrays cast to ``floatX``; mask entries
        are ``np.int32`` arrays.

    Raises:
        Exception: if ``self.input_mask_mode`` is neither ``'word'`` nor
            ``'sentence'`` (checked per example).
    """
    # Tokens that terminate a sentence; invariant across examples.
    sentence_delimiters = ('.', '?', '!')

    inputs = []
    questions = []
    answers = []
    input_masks = []

    for x in data_raw:
        inp = [w for w in x["C"].lower().split(' ') if w]
        q = [w for w in x["Q"].lower().split(' ') if w]

        if self.input_mask_mode == 'word':
            mask = np.arange(len(inp), dtype=np.int32)
        elif self.input_mask_mode == 'sentence':
            mask = np.array(
                [index for index, w in enumerate(inp)
                 if w in sentence_delimiters],
                dtype=np.int32)
            if mask.size == 0:
                # No sentence boundary found: this would cause an error
                # later, so drop the whole example -- including its mask.
                print("Passing over data: ", x["C"])
                continue
        else:
            raise Exception("invalid input_mask_mode")

        # Vectorise the documents (context first, then question).
        inp_vector = utils.process_sent(
            inp,
            word2vec=self.word2vec,
            vocab=self.vocab,
            ivocab=self.ivocab,
            word_vector_size=self.word_vector_size,
            to_return=self.to_return,
            silent=True,
            encoder_decoder=self.encoder_decoder,
            vocab_dict=self.vocab_dict)
        q_vector = utils.process_sent(
            q,
            word2vec=self.word2vec,
            vocab=self.vocab,
            ivocab=self.ivocab,
            word_vector_size=self.word_vector_size,
            to_return=self.to_return,
            silent=True,
            encoder_decoder=self.encoder_decoder,
            vocab_dict=self.vocab_dict)

        input_masks.append(mask)
        inputs.append(np.vstack(inp_vector).astype(floatX))
        questions.append(np.vstack(q_vector).astype(floatX))

        # NOTE: here we assume the answer is one word!
        answers.append(
            utils.process_word(
                word=x["A"],
                word2vec=self.word2vec,
                vocab=self.vocab,
                ivocab=self.ivocab,
                word_vector_size=self.word_vector_size,
                to_return="bool"))

    return inputs, questions, answers, input_masks
def _process_input(self, data_raw):
    """Process raw examples and compute the input mask for each kept example.

    Context (``"C"``) and question (``"Q"``) are lower-cased, split on single
    spaces with empty tokens removed, and vectorised by
    ``utils.process_sent``.  The answer (``"A"``) is converted by
    ``utils.process_word`` with ``to_return="bool"``.

    Mask modes (``self.input_mask_mode``):

    * ``'word'``     -- all context token positions.
    * ``'sentence'`` -- positions of ``'.'``, ``'?'`` and ``'!'`` tokens.
      When a context contains none of them the example is passed over: a
      message is printed and nothing is added to any output list.

    The mask is appended only for examples that are kept, so all four
    returned lists always have the same length and stay index-aligned (the
    earlier code appended the empty mask before skipping the example).

    Args:
        data_raw: iterable of examples indexable by "C", "Q" and "A".

    Returns:
        ``(inputs, questions, answers, input_masks)`` as Python lists;
        ``inputs`` / ``questions`` hold ``floatX`` arrays and ``input_masks``
        holds ``np.int32`` arrays.

    Raises:
        Exception: if ``self.input_mask_mode`` is not ``'word'`` or
            ``'sentence'`` (raised while processing an example).
    """
    # Sentence punctuation delimiters (constant for the whole call).
    punkt = ('.', '?', '!')

    inputs, questions, answers, input_masks = [], [], [], []

    for x in data_raw:
        inp = [w for w in x["C"].lower().split(' ') if w]
        q = [w for w in x["Q"].lower().split(' ') if w]

        mode = self.input_mask_mode
        if mode == 'word':
            current_mask = np.arange(len(inp), dtype=np.int32)
        elif mode == 'sentence':
            boundaries = [index for index, w in enumerate(inp) if w in punkt]
            if not boundaries:
                # Without any sentence boundary the example would cause an
                # error later, so it is dropped before anything is stored.
                print("Passing over data: ", x["C"])
                continue
            current_mask = np.array(boundaries, dtype=np.int32)
        else:
            raise Exception("invalid input_mask_mode")

        # Options shared by both process_sent calls.
        sent_options = dict(
            word2vec=self.word2vec,
            vocab=self.vocab,
            ivocab=self.ivocab,
            word_vector_size=self.word_vector_size,
            to_return=self.to_return,
            silent=True,
            encoder_decoder=self.encoder_decoder,
            vocab_dict=self.vocab_dict,
        )
        inp_vector = utils.process_sent(inp, **sent_options)
        q_vector = utils.process_sent(q, **sent_options)

        input_masks.append(current_mask)
        inputs.append(np.vstack(inp_vector).astype(floatX))
        questions.append(np.vstack(q_vector).astype(floatX))

        # NOTE: here we assume the answer is one word!
        answers.append(
            utils.process_word(
                word=x["A"],
                word2vec=self.word2vec,
                vocab=self.vocab,
                ivocab=self.ivocab,
                word_vector_size=self.word_vector_size,
                to_return="bool",
            )
        )

    return inputs, questions, answers, input_masks