def test(self, ds, qa_pair):
    """Reload the best weights and predict on the QA pairs built from ``qa_pair``.

    Every pair returned by ``ds.build_qa_pairs(qa_pair)`` is turned into three
    model inputs: the zero-padded question word-vector matrix, the zero-padded
    answer word-vector matrix, and the value of
    ``self.proc_third_input(question, answer)``.

    Args:
        ds: object exposing ``build_qa_pairs``; it yields items with ``q`` and
            ``a`` attributes.
        qa_pair: forwarded unchanged to ``ds.build_qa_pairs``.

    Returns:
        Whatever ``self.model.predict`` returns for the three stacked inputs.
        Labels are not read: this method only predicts, it does not evaluate.
    """
    # Reload best weights before predicting.
    self.model.load_weights(self.best_params)

    def _embed(text):
        # Word vectors for `text`, zero-padded along the first axis up to
        # self.timesteps rows.
        vectors = self.w2vutil.transform2Word2Vect(
            nlp_utils.data_preprocess(text, self.prep_step))
        if len(vectors) == 0:
            # Without this fallback np.array([]) is 1-D and the 2-D pad
            # specification below fails. Mirrors the 300-dim zero vector
            # generateXYBatches uses for the same situation.
            vectors = [np.zeros(300)]
        matrix = np.array(vectors)
        # NOTE(review): a text with more than self.timesteps vectors yields a
        # negative pad width, which np.pad is expected to reject -- confirm
        # whether truncation is wanted here.
        return np.pad(matrix,
                      ((0, self.timesteps - matrix.shape[0]), (0, 0)),
                      mode='constant')

    # Construct the test dataset.
    test_X1 = []
    test_X2 = []
    test_X3 = []
    for qa in ds.build_qa_pairs(qa_pair):
        test_X1.append(_embed(qa.q))
        test_X2.append(_embed(qa.a))
        test_X3.append(self.proc_third_input(qa.q, qa.a))

    return self.model.predict(
        [np.array(test_X1), np.array(test_X2), np.array(test_X3)])
def buildCosineSimMatrix(self, questions_answer_pairs, max_terms=20):
    """Build one zero-padded question-by-answer cosine-distance matrix per pair.

    For each pair, entry ``[i][k]`` is ``spatial.distance.cosine`` between the
    i-th question word vector and the k-th answer word vector. Only the first
    ``max_terms`` vectors of each side are used and the matrix is zero-padded
    to ``(max_terms, max_terms)``.

    Args:
        questions_answer_pairs: iterable of items with ``q``, ``a`` and ``l``
            attributes.
        max_terms: maximum number of question / answer terms kept.

    Returns:
        ``(x, y)``: ``x`` is a list of arrays of shape
        ``(1, max_terms, max_terms)``; ``y`` is the list of ``pair.l`` values.
    """
    # Construct question/answer matrix pairs.
    x = []
    y = []
    for pair in questions_answer_pairs:
        q_terms = self.w2vutil.transform2Word2Vect(
            nlp_utils.data_preprocess(pair.q, self.prep_step))[:max_terms]
        # The answer vectors do not depend on the question term, so they are
        # computed once per pair instead of once per question term.
        a_terms = self.w2vutil.transform2Word2Vect(
            nlp_utils.data_preprocess(pair.a, self.prep_step))[:max_terms]

        if len(q_terms) == 0 or len(a_terms) == 0:
            # An empty question used to produce a 1-D array and crash on
            # shape[1]; an empty answer already padded out to all zeros.
            cos_matrix = np.zeros((max_terms, max_terms))
        else:
            cos_matrix = np.array(
                [[spatial.distance.cosine(q_i, a_k) for a_k in a_terms]
                 for q_i in q_terms])
            n_rows, n_cols = cos_matrix.shape
            cos_matrix = np.pad(
                cos_matrix,
                ((0, max_terms - n_rows), (0, max_terms - n_cols)),
                mode='constant')

        if np.isnan(cos_matrix).any():
            # Single-argument print() behaves the same as a Python 2 print
            # statement; the extra ' ' reproduces the separator the original
            # two-item print statement emitted.
            print('ERROR IS NAN: ' + ' ' + str(pair))

        x.append(np.expand_dims(cos_matrix, 0))
        y.append(pair.l)
    return x, y
def generateXYBatches(self, qadata, dataset, samples, prep_step, proc_func, positive_rate=0.5):
    """Endlessly yield the same ``([X1, X2, X3], Y)`` batch.

    The samples are drawn once, via
    ``qadata.get_random_samples(dataset, samples, positive_rate)``, before the
    loop starts; every iteration rebuilds the batch from that same draw.

    Args:
        qadata: object exposing ``get_random_samples``.
        dataset: forwarded to ``get_random_samples``.
        samples: forwarded to ``get_random_samples``.
        prep_step: forwarded to ``nlp_utils.data_preprocess``.
        proc_func: callable ``(q, a)`` producing the third model input.
        positive_rate: forwarded to ``get_random_samples``.

    Yields:
        ``([questions, answers, extras], labels)`` where each element is a
        numpy array built from the per-sample values.
    """
    drawn = qadata.get_random_samples(dataset, samples, positive_rate)

    def _padded_vectors(text):
        # Word vectors of `text`, padded with zero rows up to self.timesteps.
        vectors = self.w2vutil.transform2Word2Vect(
            nlp_utils.data_preprocess(text, prep_step))
        if len(vectors) == 0:
            # Nothing could be embedded: stand in a single 300-dim zero vector.
            vectors = vectors + [np.zeros(300)]
        matrix = np.array(vectors)
        missing_rows = self.timesteps - matrix.shape[0]
        return np.pad(matrix, ((0, missing_rows), (0, 0)), mode='constant')

    while True:
        questions, answers, extras, labels = [], [], [], []
        for sample in drawn:
            questions.append(_padded_vectors(sample.q))
            answers.append(_padded_vectors(sample.a))
            extras.append(proc_func(sample.q, sample.a))
            labels.append(np.array(sample.l))
        inputs = [np.array(questions), np.array(answers), np.array(extras)]
        yield (inputs, np.array(labels))
def proc_third_input(self, q, a):
    """Return the cosine distance between the summed word vectors of q and a.

    Each text is preprocessed with ``self.prep_step`` and embedded through
    ``self.w2vutil.transform2Word2Vect``; its vectors are summed and the two
    sums are compared with ``spatial.distance.cosine``.

    Args:
        q: question text.
        a: answer text.

    Returns:
        The value returned by ``spatial.distance.cosine`` for the two sums.
    """
    def _vectors_or_zero(text):
        # Embed once. The original ran the preprocess + embedding pipeline
        # twice per text: once for the length check and once for the value.
        vectors = self.w2vutil.transform2Word2Vect(
            nlp_utils.data_preprocess(text, self.prep_step))
        return vectors if len(vectors) > 0 else [np.zeros(300)]

    q_w2v = _vectors_or_zero(q)
    a_w2v = _vectors_or_zero(a)
    # NOTE(review): when either side falls back to the zero vector the cosine
    # distance is presumably undefined (NaN) -- confirm and decide whether a
    # fixed fallback value is wanted.
    return spatial.distance.cosine(sum(np.array(q_w2v)), sum(np.array(a_w2v)))
def proc_third_input(self, q, a):
    """Return the summed question vectors concatenated with the summed answer vectors.

    Each text is preprocessed with ``self.prep_step`` and embedded through
    ``self.w2vutil.transform2Word2Vect``; a text with no vectors is replaced
    by a single 300-dim zero vector.

    Args:
        q: question text.
        a: answer text.

    Returns:
        The ``np.concatenate`` of the two summed vectors, or ``None`` when
        building it raises (the exception is printed, not re-raised).
    """
    def _vectors_or_zero(text):
        # Embed once. The original ran the preprocess + embedding pipeline
        # twice per text: once for the length check and once for the value.
        vectors = self.w2vutil.transform2Word2Vect(
            nlp_utils.data_preprocess(text, self.prep_step))
        return vectors if len(vectors) > 0 else [np.zeros(300)]

    q_w2v = _vectors_or_zero(q)
    a_w2v = _vectors_or_zero(a)
    try:
        return np.concatenate((sum(np.array(q_w2v)), sum(np.array(a_w2v))))
    except Exception as e:
        # Best-effort behaviour kept from the original: report and carry on.
        # The None result is now explicit so the fall-through is visible.
        print(e)
        return None
def buildCosineSimMatrix(self, questions_answer_pairs, ordered_matrix=1, salience_weight=0, max_terms=40):
    """Build one ``(1, max_terms, max_terms)`` similarity matrix per QA pair.

    The pairwise cosine distance ``d`` between the first ``max_terms``
    question and answer word vectors is rescaled to ``1 - d / 2`` and
    zero-padded to ``(max_terms, max_terms)``. A pair with no question or no
    answer vectors gets an all-zero matrix.

    Args:
        questions_answer_pairs: iterable of items with ``q``, ``a`` and ``l``
            attributes.
        ordered_matrix: when equal to 1, each row is sorted and the matrix is
            then reversed along both axes.
        salience_weight: when equal to 1, the matrix is multiplied
            element-wise by ``self.getSalienceScore(q_list, a_list, max_terms)``.
        max_terms: maximum number of terms kept per side.

    Returns:
        ``(np.array(matrices), np.array(labels))``.
    """
    matrices = []
    labels = []
    for pair in questions_answer_pairs:
        # Question processing.
        q_list = nlp_utils.data_preprocess(pair.q, self.prep_step)
        q_vect = self.w2vutil.transform2Word2Vect(q_list)
        # Answer processing.
        a_list = nlp_utils.data_preprocess(pair.a, self.prep_step)
        a_vect = self.w2vutil.transform2Word2Vect(a_list)

        # Experiment notes kept from earlier revisions:
        #   distance = np.absolute(cdist(q, a, 'cosine'))
        #   With cos_matrix = 1 - (1/(1+np.exp(-distance*3))) the MAP and loss
        #   are highly correlated and the improvement in MAP is very fast, but
        #   in test the results are almost equal. It changes a bit with
        #   cos_matrix = 1 - (1/(1+np.exp(-distance*2))).
        #   Also tried cdist with a custom metric returning
        #   abs(dot(u, v) / (norm(u) * norm(v))).
        #   Also tried cos_matrix = 1 - distance.
        if len(q_vect) >= 1 and len(a_vect) >= 1:
            distance = spatial.distance.cdist(
                q_vect[:max_terms], a_vect[:max_terms], 'cosine')
            sim = 1 - distance / 2
        else:
            sim = np.zeros((max_terms, max_terms))

        n_rows, n_cols = sim.shape
        sim = np.pad(sim,
                     ((0, max_terms - n_rows), (0, max_terms - n_cols)),
                     mode='constant')

        if np.isnan(sim).any():
            # Same output as the two-item print statement it replaces.
            print('ERROR IS NAN: ' + ' ' + str(pair))

        if salience_weight == 1:
            # Weight every cell by its salience score.
            salience = self.getSalienceScore(q_list, a_list, max_terms)
            sim = np.multiply(sim, salience)

        if ordered_matrix == 1:
            # Sort each row in place, then flip the row order and the column
            # order.
            sim.sort()
            sim = sim[::-1, ::-1]

        labels.append(pair.l)
        matrices.append(np.expand_dims(sim, 0))
    return np.array(matrices), np.array(labels)
def buildCosineSimMatrix(self, questions_answer_pairs, max_terms=20):
    """Build one POS-weighted similarity matrix per question/answer pair.

    Entry ``[i][k]`` is ``1 - cosine_distance(q_i, a_k)``, kept as is when the
    POS tag of either term is in the salient tag set and multiplied by 0.2
    otherwise. Only the first ``max_terms`` terms of each side are used and
    the matrix is zero-padded to ``(max_terms, max_terms)``.

    Args:
        questions_answer_pairs: iterable of items with ``q``, ``a`` and ``l``
            attributes.
        max_terms: maximum number of question / answer terms kept.

    Returns:
        ``(np.array(x), np.array(y))`` where each element of ``x`` has shape
        ``(1, max_terms, max_terms)`` and ``y`` holds the ``pair.l`` values.
    """
    # Tags that keep the full similarity weight.
    # NOTE(review): the original literal listed 'WP' twice; possibly 'WP$'
    # was intended -- confirm.
    salient_tags = frozenset(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
                              'WDT', 'WP', 'WRB',
                              'NN', 'NNS', 'NNP', 'NNPS', 'MD'])

    def _pos_tag_of(token):
        # nltk.pos_tag expects a sequence of tokens. A bare string would be
        # iterated character by character and only its first character
        # tagged, so a single token is wrapped in a list first.
        tokens = token if isinstance(token, (list, tuple)) else [token]
        return nltk.pos_tag(tokens)[0][1]

    # Construct question/answer matrix pairs.
    x = []
    y = []
    for pair in questions_answer_pairs:
        q_list = nlp_utils.data_preprocess(pair.q, self.prep_step)
        q_terms = self.w2vutil.transform2Word2Vect(q_list)[:max_terms]
        # Answer preprocessing and embedding do not depend on the question
        # term, so they run once per pair rather than once per question term.
        a_list = nlp_utils.data_preprocess(pair.a, self.prep_step)
        a_terms = self.w2vutil.transform2Word2Vect(a_list)[:max_terms]

        if len(q_terms) == 0 or len(a_terms) == 0:
            # An empty question used to produce a 1-D array and crash on
            # shape[1]; an empty answer already padded out to all zeros.
            cos_matrix = np.zeros((max_terms, max_terms))
        else:
            # Tag each term once instead of once per matrix cell.
            # Assumes q_terms[i] / a_terms[k] correspond to q_list[i] /
            # a_list[k], as the original indexing did -- TODO confirm.
            q_tags = [_pos_tag_of(q_list[i]) for i in range(len(q_terms))]
            a_tags = [_pos_tag_of(a_list[k]) for k in range(len(a_terms))]

            rows = []
            for i, q_i in enumerate(q_terms):
                row = []
                for k, a_k in enumerate(a_terms):
                    similarity = 1 - spatial.distance.cosine(q_i, a_k)
                    if q_tags[i] in salient_tags or a_tags[k] in salient_tags:
                        weight = 1
                    else:
                        weight = 0.2
                    row.append(similarity * weight)
                rows.append(row)

            cos_matrix = np.array(rows)
            n_rows, n_cols = cos_matrix.shape
            cos_matrix = np.pad(
                cos_matrix,
                ((0, max_terms - n_rows), (0, max_terms - n_cols)),
                mode='constant')

        if np.isnan(cos_matrix).any():
            # Same output as the two-item print statement it replaces.
            print('ERROR IS NAN: ' + ' ' + str(pair))

        x.append(np.expand_dims(cos_matrix, 0))
        y.append(pair.l)
    return np.array(x), np.array(y)
def buildCosineSimMatrix(self, questions_answer_pairs, max_terms=20):
    """Build one zero-padded question-by-answer cosine-distance matrix per pair.

    For each pair, entry ``[i][k]`` is ``spatial.distance.cosine`` between the
    i-th question word vector and the k-th answer word vector. Only the first
    ``max_terms`` vectors of each side are used and the matrix is zero-padded
    to ``(max_terms, max_terms)``.

    Args:
        questions_answer_pairs: iterable of items with ``q``, ``a`` and ``l``
            attributes.
        max_terms: maximum number of question / answer terms kept.

    Returns:
        ``(np.array(x), np.array(y))`` where each element of ``x`` has shape
        ``(1, max_terms, max_terms)`` and ``y`` holds the ``pair.l`` values.
    """
    # Construct question/answer matrix pairs.
    x = []
    y = []
    for pair in questions_answer_pairs:
        q_list = nlp_utils.data_preprocess(pair.q, self.prep_step)
        q_terms = self.w2vutil.transform2Word2Vect(q_list)[:max_terms]
        # Answer preprocessing and embedding do not depend on the question
        # term, so they run once per pair rather than once per question term.
        a_list = nlp_utils.data_preprocess(pair.a, self.prep_step)
        a_terms = self.w2vutil.transform2Word2Vect(a_list)[:max_terms]

        if len(q_terms) == 0 or len(a_terms) == 0:
            # An empty question used to produce a 1-D array and crash on
            # shape[1]; an empty answer already padded out to all zeros.
            cos_matrix = np.zeros((max_terms, max_terms))
        else:
            cos_matrix = np.array(
                [[spatial.distance.cosine(q_i, a_k) for a_k in a_terms]
                 for q_i in q_terms])
            n_rows, n_cols = cos_matrix.shape
            cos_matrix = np.pad(
                cos_matrix,
                ((0, max_terms - n_rows), (0, max_terms - n_cols)),
                mode='constant')

        if np.isnan(cos_matrix).any():
            # Same output as the two-item print statement it replaces.
            print('ERROR IS NAN: ' + ' ' + str(pair))

        x.append(np.expand_dims(cos_matrix, 0))
        y.append(pair.l)
    return np.array(x), np.array(y)
def buildCosineSimMatrix(self, questions_answer_pairs, ordered_matrix=1, salience_weight=0, max_terms=40):
    """Build one ``(1, max_terms, max_terms)`` composed-similarity matrix per pair.

    The matrix comes from ``self.composed_similarity`` (called with
    ``wg_wordnet=1``, ``wg_levenshtein=1`` and ``maxterms=max_terms``) and is
    zero-padded to ``(max_terms, max_terms)``.

    Args:
        questions_answer_pairs: iterable of items with ``q``, ``a`` and ``l``
            attributes.
        ordered_matrix: when equal to 1, each row is sorted and the matrix is
            then reversed along both axes.
        salience_weight: when equal to 1, the matrix is multiplied
            element-wise by ``self.getSalienceScore(q_list, a_list, max_terms)``.
        max_terms: target size of both matrix axes.

    Returns:
        ``(np.array(matrices), np.array(labels))``.
    """
    matrices, labels = [], []
    for pair in questions_answer_pairs:
        # Question, then answer, preprocessing.
        q_list = nlp_utils.data_preprocess(pair.q, self.prep_step)
        a_list = nlp_utils.data_preprocess(pair.a, self.prep_step)

        # Composed similarity matrix.
        sim = self.composed_similarity(
            self.w2vutil.w2v_model, q_list, a_list,
            wg_wordnet=1, wg_levenshtein=1, maxterms=max_terms)

        # Pad up to the fixed (max_terms, max_terms) shape.
        original_shape = sim.shape
        rows_missing = max_terms - original_shape[0]
        cols_missing = max_terms - original_shape[1]
        sim = np.pad(sim, ((0, rows_missing), (0, cols_missing)),
                     mode='constant')

        if np.isnan(sim).any():
            # Same output as the two-item print statement it replaces.
            print('ERROR IS NAN: ' + ' ' + str(pair))

        if salience_weight == 1:
            # Weight every cell by its salience score.
            salience = self.getSalienceScore(q_list, a_list, max_terms)
            sim = np.multiply(sim, salience)

        if ordered_matrix == 1:
            # Sort each row in place, then flip the row order and the column
            # order.
            sim.sort()
            sim = sim[::-1, ::-1]

        labels.append(pair.l)
        matrices.append(np.expand_dims(sim, 0))
    return np.array(matrices), np.array(labels)