def test_simple_train_compose_intercept(self):
    """Train a single lexical function with an intercept, then compose.

    Three stages are exercised: training followed by composition of the
    training phrases, composition through a second model built directly
    from the learned function space, and recursive application of that
    second model to its own output.

    Uses the ``self.n_space``, ``self.an_space`` and ``self.ft`` fixtures,
    which are defined outside this method.
    """
    # TODO test a1_car twice in the phrase list
    train_data = [("a1", "car", "a1_car"),
                  ("a1", "man", "a1_man")]

    # model with train and then compose
    learner_ = LstsqRegressionLearner(intercept=True)
    model = LexicalFunction(learner=learner_)
    # presumably lowers the minimum number of training phrases required
    # per function word -- TODO confirm against LexicalFunction
    model._MIN_SAMPLES = 1
    model.train(train_data, self.n_space, self.an_space)

    new_space = model.function_space

    # one learned function ("a1"), stored as a flattened (2, 3) element
    np.testing.assert_array_almost_equal(
        new_space.cooccurrence_matrix.mat,
        np.mat([[0.66666667, 0.33333333, -0.33333333,
                 0.33333333, 0.66666667, 0.33333333]]),
        7)
    self.assertTupleEqual(new_space.element_shape, (2, 3))
    self.assertListEqual(new_space.id2row, ["a1"])
    self.assertListEqual(new_space.id2column, [])

    comp_space = model.compose(train_data, self.n_space)

    np.testing.assert_array_almost_equal(
        comp_space.cooccurrence_matrix.mat,
        self.an_space.cooccurrence_matrix.mat,
        10)
    self.assertListEqual(comp_space.id2row, ["a1_car", "a1_man"])
    self.assertListEqual(comp_space.id2column, self.ft)

    # new model, without training
    model2 = LexicalFunction(function_space=new_space, intercept=True)
    model2._MIN_SAMPLES = 1
    comp_space = model2.compose(train_data, self.n_space)

    self.assertListEqual(comp_space.id2row, ["a1_car", "a1_man"])
    self.assertListEqual(comp_space.id2column, [])
    np.testing.assert_array_almost_equal(
        comp_space.cooccurrence_matrix.mat,
        self.n_space.cooccurrence_matrix.mat,
        8)

    # recursive application
    comp_space2 = model2.compose([("a1", "a1_car", "a1_a1_car"),
                                  ("a1", "a1_man", "a1_a1_man")],
                                 comp_space)

    self.assertListEqual(comp_space2.id2row, ["a1_a1_car", "a1_a1_man"])
    # check the columns of the recursive result itself; the first-level
    # comp_space columns were already verified above
    self.assertListEqual(comp_space2.id2column, [])
    np.testing.assert_array_almost_equal(
        comp_space2.cooccurrence_matrix.mat,
        self.n_space.cooccurrence_matrix.mat,
        8)
    self.assertEqual(comp_space.element_shape, (2,))
    self.assertEqual(comp_space2.element_shape, (2,))
def test_simple_train_compose_intercept(self):
    """Train a single lexical function with an intercept, then compose.

    Three stages are exercised: training followed by composition of the
    training phrases, composition through a second model built directly
    from the learned function space, and recursive application of that
    second model to its own output.

    Uses the ``self.n_space``, ``self.an_space`` and ``self.ft`` fixtures,
    which are defined outside this method.
    """
    # TODO test a1_car twice in the phrase list
    train_data = [("a1", "car", "a1_car"),
                  ("a1", "man", "a1_man")]

    # model with train and then compose
    learner_ = LstsqRegressionLearner(intercept=True)
    model = LexicalFunction(learner=learner_)
    model.train(train_data, self.n_space, self.an_space)

    new_space = model.function_space

    # one learned function ("a1"), stored as a flattened (2, 3) element
    np.testing.assert_array_almost_equal(
        new_space.cooccurrence_matrix.mat,
        np.mat([[0.66666667, 0.33333333, -0.33333333,
                 0.33333333, 0.66666667, 0.33333333]]),
        7)
    self.assertTupleEqual(new_space.element_shape, (2, 3))
    self.assertListEqual(new_space.id2row, ["a1"])
    self.assertListEqual(new_space.id2column, [])

    comp_space = model.compose(train_data, self.n_space)

    np.testing.assert_array_almost_equal(
        comp_space.cooccurrence_matrix.mat,
        self.an_space.cooccurrence_matrix.mat,
        10)
    self.assertListEqual(comp_space.id2row, ["a1_car", "a1_man"])
    self.assertListEqual(comp_space.id2column, self.ft)

    # new model, without training
    model2 = LexicalFunction(function_space=new_space, intercept=True)
    comp_space = model2.compose(train_data, self.n_space)

    self.assertListEqual(comp_space.id2row, ["a1_car", "a1_man"])
    self.assertListEqual(comp_space.id2column, [])
    np.testing.assert_array_almost_equal(
        comp_space.cooccurrence_matrix.mat,
        self.n_space.cooccurrence_matrix.mat,
        8)

    # recursive application
    comp_space2 = model2.compose([("a1", "a1_car", "a1_a1_car"),
                                  ("a1", "a1_man", "a1_a1_man")],
                                 comp_space)

    self.assertListEqual(comp_space2.id2row, ["a1_a1_car", "a1_a1_man"])
    # check the columns of the recursive result itself; the first-level
    # comp_space columns were already verified above
    self.assertListEqual(comp_space2.id2column, [])
    np.testing.assert_array_almost_equal(
        comp_space2.cooccurrence_matrix.mat,
        self.n_space.cooccurrence_matrix.mat,
        8)
    self.assertEqual(comp_space.element_shape, (2,))
    self.assertEqual(comp_space2.element_shape, (2,))
class LexfunModel(Model):
    """Model that maps a base form to a derived form with one lexical function.

    A single ``LexicalFunction`` is trained under a dummy function word, so
    all training pairs contribute to the same learned function.
    """

    # Replaced by a LexicalFunction instance in __init__.
    lexfun = None

    def __init__(self, space, learner='LeastSquares', intercept=True, param=None):
        """Create the model and the regression learner backing it.

        Args:
            space: semantic space handed to ``Model.__init__``; ``fit`` and
                ``predict`` read it back as ``self.space`` (presumably set
                by the base class -- TODO confirm).
            learner: ``'LeastSquares'`` or ``'Ridge'``.
            intercept: whether the regression learner fits an intercept.
            param: parameter forwarded to the ridge learner only.

        Raises:
            NameError: if ``learner`` is not one of the supported names.
        """
        Model.__init__(self, space)
        if learner == 'Ridge':
            # If param==None, generalized CV will be performed within standard param range
            learner = RidgeRegressionLearner(intercept=intercept, param=param)
        elif learner == 'LeastSquares':
            # Forward the caller's choice; previously `intercept` was
            # silently dropped for the least-squares learner.
            learner = LstsqRegressionLearner(intercept=intercept)
        else:
            raise NameError("No such learner: %s" % learner)
        self.lexfun = LexicalFunction(learner=learner)

    def fit(self, train_pairs, verbose=False):
        """Train the lexical function on ``(base, derived)`` pairs.

        Args:
            train_pairs: sequence of ``(base, derived)`` tuples.
            verbose: when true, print the number of training pairs.

        Raises:
            NameError: if ``train_pairs`` is empty.
        """
        if len(train_pairs) == 0:
            raise NameError('Error: Train set is empty')
        if verbose:
            # single parenthesised argument: same output on Python 2 and 3
            print('fit: Fitting a lexfun model on %d pairs' % (len(train_pairs)))
        # LexicalFunction class is designed to be run on a dataset with
        # different function words (==patterns).
        # We use a dummy function word here.
        train_pairs_ext = [('dummy', base, derived) for (base, derived) in train_pairs]
        self.lexfun.train(train_pairs_ext, self.space, self.space)

    def predict(self, base, verbose=False):
        """Compose the derived representation for ``base``.

        Args:
            base: identifier of the base form to apply the function to.
            verbose: accepted for interface symmetry with ``fit``; unused.

        Returns:
            The row stored under ``'derived'`` in the composed space.

        Raises:
            NameError: if no lexical function object is available.
        """
        if self.lexfun is None:
            raise NameError('Error: Model has not yet been trained')
        composed_space = self.lexfun.compose([('dummy', base, 'derived')], self.space)
        return composed_space.get_row('derived')
def get_vector(self, df):
    """Compose a vector for ``df`` in two lexical-function steps.

    ``df`` is indexed at positions 0, 1 and 2 and sliced with ``[1:]``;
    presumably it is a (subject, verb, object) feature -- TODO confirm.

    Returns:
        The ``cooccurrence_matrix.mat`` of the space produced by the
        second composition.
    """
    # 3. use the trained models to compose new SVO sentences
    # 3.1 use the V model to create new VO combinations,
    #     e.g. ("take/V", "place/N", "take/V_place/N")
    verb_object_phrase = (str(df[1]), str(df[2]), str(df[1:]))
    vo_space = self.v_model.compose([verb_object_phrase], self.n_space)
    # todo how do we get VO vectors? these are (100x100)+100 dimensional (intercept).
    # todo do we allow document features of different dimensionality

    # 3.2 the new VO combinations will be used as functions:
    #     load them into a new composition model
    vo_as_function = LexicalFunction(
        function_space=vo_space,
        intercept=self.v_model._has_intercept,
    )

    # 3.3 compose the VO functions with the subject noun to obtain the
    #     SVO sentence
    sentence_phrase = (str(df[1:]), str(df[0]), str(df))
    sentence_space = vo_as_function.compose([sentence_phrase], self.n_space)

    # get vectors out. these are 100-dimensional
    return sentence_space.cooccurrence_matrix.mat
def test_train_intercept(self):
    """Train two lexical functions with an intercept and compose with them.

    Phrase vectors are generated by multiplying two known 2x2 matrices
    with the noun vectors. The test then checks the ids and element shape
    of the learned function space, and that composing the training
    phrases reproduces the generated phrase vectors.

    Uses the ``self.ft`` fixture, which is defined outside this method.
    """
    a1_mat = DenseMatrix(np.mat([[3, 4], [5, 6]]))
    a2_mat = DenseMatrix(np.mat([[1, 2], [3, 4]]))

    train_data = [("a1", "man", "a1_man"),
                  ("a2", "car", "a2_car"),
                  ("a1", "boy", "a1_boy"),
                  ("a2", "boy", "a2_boy")]

    n_mat = DenseMatrix(np.mat([[13, 21], [3, 4], [5, 6]]))
    n_space = Space(n_mat, ["man", "car", "boy"], self.ft)

    an1_mat = (a1_mat * n_mat.transpose()).transpose()
    an2_mat = (a2_mat * n_mat.transpose()).transpose()
    an_mat = an1_mat.vstack(an2_mat)

    an_space = Space(an_mat,
                     ["a1_man", "a1_car", "a1_boy",
                      "a2_man", "a2_car", "a2_boy"],
                     self.ft)

    # test train
    model = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
    # presumably lowers the minimum number of training phrases required
    # per function word -- TODO confirm against LexicalFunction
    model._MIN_SAMPLES = 1
    model.train(train_data, n_space, an_space)

    a_space = model.function_space

    # Only ids and shape are checked here. The learned elements are (2, 3),
    # so they are not compared value-by-value with the 2x2 generators.
    self.assertListEqual(a_space.id2row, ["a1", "a2"])
    self.assertTupleEqual(a_space.element_shape, (2, 3))

    # test compose: rebuild a function space around the learned matrix
    a_space = Space(a_space.cooccurrence_matrix, ["a1", "a2"], [],
                    element_shape=(2, 3))
    model = LexicalFunction(function_space=a_space, intercept=True)
    model._MIN_SAMPLES = 1
    comp_space = model.compose(train_data, n_space)

    self.assertListEqual(comp_space.id2row,
                         ["a1_man", "a2_car", "a1_boy", "a2_boy"])
    self.assertListEqual(comp_space.id2column, [])

    self.assertEqual(comp_space.element_shape, (2,))

    # rows of an_mat that correspond to the four training phrases, in order
    np.testing.assert_array_almost_equal(comp_space.cooccurrence_matrix.mat,
                                         an_mat[[0, 4, 2, 5]].mat, 8)
def test_train_intercept(self):
    """Train two lexical functions with an intercept and compose with them.

    Phrase vectors are generated by multiplying two known 2x2 matrices
    with the noun vectors. The test then checks the ids and element shape
    of the learned function space, and that composing the training
    phrases reproduces the generated phrase vectors.

    Uses the ``self.ft`` fixture, which is defined outside this method.
    """
    a1_mat = DenseMatrix(np.mat([[3, 4], [5, 6]]))
    a2_mat = DenseMatrix(np.mat([[1, 2], [3, 4]]))

    train_data = [("a1", "man", "a1_man"),
                  ("a2", "car", "a2_car"),
                  ("a1", "boy", "a1_boy"),
                  ("a2", "boy", "a2_boy")]

    n_mat = DenseMatrix(np.mat([[13, 21], [3, 4], [5, 6]]))
    n_space = Space(n_mat, ["man", "car", "boy"], self.ft)

    an1_mat = (a1_mat * n_mat.transpose()).transpose()
    an2_mat = (a2_mat * n_mat.transpose()).transpose()
    an_mat = an1_mat.vstack(an2_mat)

    an_space = Space(an_mat,
                     ["a1_man", "a1_car", "a1_boy",
                      "a2_man", "a2_car", "a2_boy"],
                     self.ft)

    # test train
    model = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
    model.train(train_data, n_space, an_space)

    a_space = model.function_space

    # Only ids and shape are checked here. The learned elements are (2, 3),
    # so they are not compared value-by-value with the 2x2 generators.
    self.assertListEqual(a_space.id2row, ["a1", "a2"])
    self.assertTupleEqual(a_space.element_shape, (2, 3))

    # test compose: rebuild a function space around the learned matrix
    a_space = Space(a_space.cooccurrence_matrix, ["a1", "a2"], [],
                    element_shape=(2, 3))
    model = LexicalFunction(function_space=a_space, intercept=True)
    comp_space = model.compose(train_data, n_space)

    self.assertListEqual(comp_space.id2row,
                         ["a1_man", "a2_car", "a1_boy", "a2_boy"])
    self.assertListEqual(comp_space.id2column, [])

    self.assertEqual(comp_space.element_shape, (2,))

    # rows of an_mat that correspond to the four training phrases, in order
    np.testing.assert_array_almost_equal(comp_space.cooccurrence_matrix.mat,
                                         an_mat[[0, 4, 2, 5]].mat, 8)
def predict_using_TENSOR(compound, TENSOR_matrix, unigram_space):
    """Predict one ``adj_noun`` compound in two composition steps.

    Step 1 applies the ``"tens_adj"`` function of ``TENSOR_matrix`` to the
    adjective, giving an entry named ``predicted_ADJ_<adj>``. Step 2 loads
    that result as a function space and applies it to the noun.

    Args:
        compound: string whose first two ``'_'``-separated fields are the
            adjective and the noun, e.g. ``"good_boy"``.
        TENSOR_matrix: object providing ``compose`` and ``_has_intercept``.
        unigram_space: argument space used for both the adjective and the
            noun lookups.

    Returns:
        The composed space holding the prediction under the id ``compound``.

    Raises:
        IndexError: if ``compound`` contains no ``'_'``.
    """
    fields = compound.split('_')
    adj = fields[0]
    noun = fields[1]
    adj_function_id = "predicted_ADJ_" + adj

    # e.g. ("tens_adj", "good", "predicted_ADJ_good"):
    # tens_adj -> tensor matrix, good -> unigram,
    # predicted_ADJ_good -> to compute (tens_adj * good)
    adj_space = TENSOR_matrix.compose([("tens_adj", adj, adj_function_id)],
                                      unigram_space)

    adj_model = LexicalFunction(function_space=adj_space,
                                intercept=TENSOR_matrix._has_intercept)

    # e.g. ("predicted_ADJ_good", "boy", "good_boy"):
    # predicted_ADJ_good -> matrix computed above, boy -> unigram,
    # good_boy -> to compute (predicted_ADJ_good * boy)
    return adj_model.compose([(adj_function_id, noun, compound)],
                             unigram_space)
def compose_space_TENSOR():
    """Predict every usable bigram via TENSOR * adj * noun and save the result.

    Loads the bigram space and the tensor model from ``args.function[2]``
    and ``args.function[3]``. It predicts one adjective matrix per
    adjective returned by ``extract_adj``, then applies those matrices to
    the nouns of the bigrams. Bigrams whose adjective was not extracted or
    whose noun is missing from ``unigram_space`` are skipped.

    Reads the names ``args`` and ``unigram_space``, which are defined
    outside this function. Prints the number of composed rows and stores
    the composed space through ``save_space``.

    Raises:
        IndexError: if a bigram row id contains no ``'_'``.
    """
    bigram_space = load_space(args.function[2])
    TENSOR_matrix = load_space(args.function[3])

    adj_list = extract_adj(bigram_space)

    # e.g. ("tens_adj", "good", "predicted_ADJ_good"):
    # tens_adj -> tensor matrix, good -> unigram,
    # predicted_ADJ_good -> to compute (tens_adj * good)
    predicted_ADJs = [("tens_adj", adj, "predicted_ADJ_" + adj)
                      for adj in adj_list]

    # Obtain the ADJ matrices using => TENSOR * adj
    composed_space_1 = TENSOR_matrix.compose(predicted_ADJs, unigram_space)

    expanded_model = LexicalFunction(function_space=composed_space_1,
                                     intercept=TENSOR_matrix._has_intercept)

    # Build the lookup sets once so each membership test below is O(1)
    # rather than a linear scan per bigram.
    known_adjs = set(entry[1] for entry in predicted_ADJs)
    known_nouns = set(unigram_space.id2row)

    predicted_bigrams = []
    for bigram in bigram_space.id2row:
        fields = bigram.split('_')
        adj, noun = fields[0], fields[1]
        if adj not in known_adjs or noun not in known_nouns:
            continue
        # e.g. ("predicted_ADJ_good", "boy", "predicted_good_boy"):
        # predicted_ADJ_good -> matrix computed above, boy -> unigram,
        # predicted_good_boy -> to compute (predicted_ADJ_good * boy)
        predicted_bigrams.append(("predicted_ADJ_" + adj, noun,
                                  "predicted_" + bigram))

    # Predicted composition = predicted_ADJ * noun
    # (where predicted_ADJ = TENSOR * adj)
    composed_space_2 = expanded_model.compose(predicted_bigrams, unigram_space)

    # The two spaces after ':' reproduce the output of the former
    # two-argument print statement; this form runs on Python 2 and 3.
    print("Number of elements in the space :  %d" % len(composed_space_2.id2row))

    save_space(composed_space_2, "composed_space_TENSOR", "composed_space")
data=data_path + "per.raw.SV.sm", cols=data_path + "per.raw.SV.cols", format="sm") #reading in train data train_data_file = data_path + "ML08_SV_train.txt" train_data = io_utils.read_tuple_list(train_data_file, fields=[0, 1, 2]) print "Training Lexical Function composition model..." comp_model = LexicalFunction(learner=RidgeRegressionLearner(param=2)) comp_model.train(train_data, space, per_space) print "Composing phrases..." test_phrases_file = data_path + "ML08nvs_test.txt" test_phrases = io_utils.read_tuple_list(test_phrases_file, fields=[0, 1, 2]) composed_space = comp_model.compose(test_phrases, space) print "Reading similarity test data..." test_similarity_file = data_path + "ML08data_new.txt" test_pairs = io_utils.read_tuple_list(test_similarity_file, fields=[0, 1]) gold = io_utils.read_list(test_similarity_file, field=2) print "Computing similarity with lexical function..." pred = composed_space.get_sims(test_pairs, CosSimilarity()) #use this composed space to assign similarities print "Scoring lexical function..." print scoring_utils.score(gold, pred, "spearman") print "Training Full Additive composition model..." comp_model = FullAdditive(learner=RidgeRegressionLearner(param=2))
# 2. train a model to learn V functions on train data: V N -> VO,
#    where the VO space is the function space learned in step 1
print("\nStep 2 training")
vo_space = vo_model.function_space
v_model = LexicalFunction(learner=LstsqRegressionLearner())
v_model.train(train_v_data, n_space, vo_space)

# show the learned model
print("\n3D Verb space")
print(v_model.function_space.id2row)
print(v_model.function_space.cooccurrence_matrix)

# 3. use the trained models to compose new SVO sentences

# 3.1 use the V model to create new VO combinations
vo_composed_space = v_model.compose(
    [
        ("hate", "woman", "hate_woman"),
        ("hate", "man", "hate_man"),
    ],
    n_space,
)

# 3.2 the new VO combinations will be used as functions:
#     load the new VO combinations obtained through composition into
#     a new composition model
expanded_vo_model = LexicalFunction(
    function_space=vo_composed_space,
    intercept=v_model._has_intercept,
)

# 3.3 use the new VO combinations by composing them with subject nouns
#     in order to obtain new SVO sentences
svo_composed_space = expanded_vo_model.compose(
    [
        ("hate_woman", "woman", "woman_hates_woman"),
        ("hate_man", "man", "man_hates_man"),
    ],
    n_space,
)

# show the composed spaces:
print("\nVO composed space:")
def test_3d(self):
    """Learn verb-object matrices, then verb tensors, then recompose.

    Sentence vectors are generated from four known 2x2 verb-object
    matrices. A first model must recover those matrices from the
    sentences, a second model must recover the flattened (2, 2, 2) verb
    elements from the recovered matrices, and composing with the second
    model must give back the verb-object space.

    Uses the ``self.ft`` fixture, which is defined outside this method.
    """
    # setting up
    expected_verbs = DenseMatrix(np.mat([[0, 0, 1, 1, 2, 2, 3, 3],    # hate
                                         [0, 1, 2, 4, 5, 6, 8, 9]]))  # love

    hate_boy = DenseMatrix(np.mat([[0, 11], [22, 33]]))
    hate_man = DenseMatrix(np.mat([[0, 7], [14, 21]]))
    love_boy = DenseMatrix(np.mat([[6, 34], [61, 94]]))
    love_car = DenseMatrix(np.mat([[2, 10], [17, 26]]))
    # same order as the sorted function ids asserted below
    verb_object_mats = [hate_boy, hate_man, love_boy, love_car]

    train_vo_data = [("hate_boy", "man", "man_hate_boy"),
                     ("hate_man", "man", "man_hate_man"),
                     ("hate_boy", "boy", "boy_hate_boy"),
                     ("hate_man", "boy", "boy_hate_man"),
                     ("love_car", "boy", "boy_love_car"),
                     ("love_boy", "man", "man_love_boy"),
                     ("love_boy", "boy", "boy_love_boy"),
                     ("love_car", "man", "man_love_car")]

    # if do not find a phrase
    # what to do?
    train_v_data = [("love", "boy", "love_boy"),
                    ("hate", "man", "hate_man"),
                    ("hate", "boy", "hate_boy"),
                    ("love", "car", "love_car")]

    sentences = ["man_hate_boy", "car_hate_boy", "boy_hate_boy",
                 "man_hate_man", "car_hate_man", "boy_hate_man",
                 "man_love_boy", "car_love_boy", "boy_love_boy",
                 "man_love_car", "car_love_car", "boy_love_car"]

    noun_mat = DenseMatrix(np.mat([[3, 4], [1, 2], [5, 6]]))
    noun_space = Space(noun_mat, ["man", "car", "boy"], self.ft)

    # one block of sentence rows per verb-object matrix
    sentence_blocks = [(vo_mat * noun_mat.transpose()).transpose()
                       for vo_mat in verb_object_mats]
    sentence_mat = hate_boy.nary_vstack(sentence_blocks)
    sentence_space = Space(sentence_mat, sentences, self.ft)

    # test train 2d
    vo_model = LexicalFunction(learner=LstsqRegressionLearner(intercept=False))
    vo_model._MIN_SAMPLES = 1
    vo_model.train(train_vo_data, noun_space, sentence_space)
    vo_space = vo_model.function_space

    self.assertListEqual(vo_space.id2row,
                         ["hate_boy", "hate_man", "love_boy", "love_car"])
    self.assertTupleEqual(vo_space.element_shape, (2, 2))

    # flatten each generator and compare it with the learned row
    for row_index, generator in enumerate(verb_object_mats):
        generator.reshape((1, 4))
        np.testing.assert_array_almost_equal(
            generator.mat,
            vo_space.cooccurrence_matrix.mat[row_index])

    # test train 3d
    verb_model = LexicalFunction(learner=LstsqRegressionLearner(intercept=False))
    verb_model._MIN_SAMPLES = 1
    verb_model.train(train_v_data, noun_space, vo_space)
    verb_space = verb_model.function_space

    np.testing.assert_array_almost_equal(expected_verbs.mat,
                                         verb_space.cooccurrence_matrix.mat)
    self.assertListEqual(verb_space.id2row, ["hate", "love"])
    self.assertTupleEqual(verb_space.element_shape, (2, 2, 2))

    # test compose 3d
    recomposed_space = verb_model.compose(train_v_data, noun_space)
    self.assertListEqual(list(vo_space.id2row),
                         sorted(recomposed_space.id2row))

    row_ids = vo_space.id2row
    learned_rows = vo_space.get_rows(row_ids)
    recomposed_rows = recomposed_space.get_rows(row_ids)

    np.testing.assert_array_almost_equal(learned_rows.mat,
                                         recomposed_rows.mat, 7)
    self.assertTupleEqual(vo_space.element_shape,
                          recomposed_space.element_shape)
def test_simple_3d_intercept(self):
    """Two-stage training and composition with intercepts.

    Stage one learns verb-object functions from subject/sentence data.
    Stage two learns a verb function from noun/verb-object data. The test
    then composes a verb-object function from the verb and applies it to a
    subject, checking values, element shapes and ids at every step.
    """
    svo_train = [("drive_car", "I", "I_drive_car"),
                 ("read_man", "You", "You_read_man"),
                 ("read_man", "I", "I_read_man"),
                 ("drive_car", "You", "You_drive_car"),
                 ("drive_man", "You", "You_drive_man"),
                 ("drive_man", "I", "I_drive_man")]
    vo_train = [("drive", "car", "drive_car"),
                ("drive", "man", "drive_man")]

    noun_mat = DenseMatrix(np.mat([[1, 2], [3, 4], [5, 6], [7, 8]]))
    sentence_mat = DenseMatrix(np.mat([[1, 2], [3, 4], [1, 2],
                                       [3, 4], [3, 4], [1, 2]]))

    noun_space = Space(noun_mat, ["I", "You", "man", "car"], [])
    sentence_space = Space(sentence_mat,
                           ["I_drive_car", "You_read_man", "I_read_man",
                            "You_drive_car", "You_drive_man", "I_drive_man"],
                           ["f1", "f2"])

    # every verb-object function is expected to learn this flattened element
    vo_element = [0.6666, 0.3333, -0.3333, 0.3333, 0.6666, 0.3333]

    # test first stage train
    stage_model = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
    stage_model.train(svo_train, noun_space, sentence_space)
    vo_space = stage_model.function_space

    np.testing.assert_array_almost_equal(vo_space.cooccurrence_matrix.mat,
                                         np.mat([vo_element] * 3), 4)
    self.assertTupleEqual(vo_space.element_shape, (2, 3))
    self.assertListEqual(vo_space.id2row,
                         ["drive_car", "drive_man", "read_man"])
    self.assertListEqual(vo_space.id2column, [])

    # test first stage compose
    composed = stage_model.compose([svo_train[0]], noun_space)
    np.testing.assert_array_almost_equal(composed.cooccurrence_matrix.mat,
                                         np.mat([[1, 2]]), 8)
    self.assertTupleEqual(composed.element_shape, (2,))
    self.assertListEqual(composed.id2row, ["I_drive_car"])
    self.assertListEqual(composed.id2column, ["f1", "f2"])

    # test second stage train
    stage_model = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
    stage_model.train(vo_train, noun_space, vo_space)
    verb_space = stage_model.function_space

    np.testing.assert_array_almost_equal(
        verb_space.cooccurrence_matrix.mat,
        np.mat([[-0.2222, 0.2222, 0.4444,
                 -0.1111, 0.1111, 0.2222,
                 0.1111, -0.1111, -0.2222,
                 -0.1111, 0.1111, 0.2222,
                 -0.2222, 0.2222, 0.4444,
                 -0.1111, 0.1111, 0.2222]]),
        4)
    self.assertTupleEqual(verb_space.element_shape, (2, 3, 3))
    self.assertListEqual(verb_space.id2row, ["drive"])
    self.assertListEqual(verb_space.id2column, [])

    # test compose1
    composed = stage_model.compose([vo_train[0]], noun_space)
    np.testing.assert_array_almost_equal(composed.cooccurrence_matrix.mat,
                                         np.mat([vo_element]), 4)
    self.assertTupleEqual(composed.element_shape, (2, 3))
    self.assertListEqual(composed.id2row, ["drive_car"])
    self.assertListEqual(composed.id2column, [])

    # test compose2, then the recursive application
    # (write a wrapper around it!!!)
    composed_model = LexicalFunction(function_space=composed, intercept=True)
    for phrases in ([svo_train[0]], [("drive_car", "I", "I_drive_car")]):
        sentence = composed_model.compose(phrases, noun_space)
        np.testing.assert_array_almost_equal(sentence.cooccurrence_matrix.mat,
                                             np.mat([[1, 2]]), 8)
        self.assertTupleEqual(sentence.element_shape, (2,))
        self.assertListEqual(sentence.id2row, ["I_drive_car"])
        self.assertListEqual(sentence.id2column, [])
def test_3d(self):
    """Learn verb-object matrices, then verb tensors, then recompose.

    Sentence vectors are generated from four known 2x2 verb-object
    matrices. A first model must recover those matrices from the
    sentences, a second model must recover the flattened (2, 2, 2) verb
    elements from the recovered matrices, and composing with the second
    model must give back the verb-object space.

    Uses the ``self.ft`` fixture, which is defined outside this method.
    """
    # setting up
    expected_verbs = DenseMatrix(np.mat([[0, 0, 1, 1, 2, 2, 3, 3],    # hate
                                         [0, 1, 2, 4, 5, 6, 8, 9]]))  # love

    hate_boy = DenseMatrix(np.mat([[0, 11], [22, 33]]))
    hate_man = DenseMatrix(np.mat([[0, 7], [14, 21]]))
    love_boy = DenseMatrix(np.mat([[6, 34], [61, 94]]))
    love_car = DenseMatrix(np.mat([[2, 10], [17, 26]]))
    # same order as the sorted function ids asserted below
    verb_object_mats = [hate_boy, hate_man, love_boy, love_car]

    train_vo_data = [("hate_boy", "man", "man_hate_boy"),
                     ("hate_man", "man", "man_hate_man"),
                     ("hate_boy", "boy", "boy_hate_boy"),
                     ("hate_man", "boy", "boy_hate_man"),
                     ("love_car", "boy", "boy_love_car"),
                     ("love_boy", "man", "man_love_boy"),
                     ("love_boy", "boy", "boy_love_boy"),
                     ("love_car", "man", "man_love_car")]

    # if do not find a phrase
    # what to do?
    train_v_data = [("love", "boy", "love_boy"),
                    ("hate", "man", "hate_man"),
                    ("hate", "boy", "hate_boy"),
                    ("love", "car", "love_car")]

    sentences = ["man_hate_boy", "car_hate_boy", "boy_hate_boy",
                 "man_hate_man", "car_hate_man", "boy_hate_man",
                 "man_love_boy", "car_love_boy", "boy_love_boy",
                 "man_love_car", "car_love_car", "boy_love_car"]

    noun_mat = DenseMatrix(np.mat([[3, 4], [1, 2], [5, 6]]))
    noun_space = Space(noun_mat, ["man", "car", "boy"], self.ft)

    # one block of sentence rows per verb-object matrix
    sentence_blocks = [(vo_mat * noun_mat.transpose()).transpose()
                       for vo_mat in verb_object_mats]
    sentence_mat = hate_boy.nary_vstack(sentence_blocks)
    sentence_space = Space(sentence_mat, sentences, self.ft)

    # test train 2d
    vo_model = LexicalFunction(learner=LstsqRegressionLearner(intercept=False))
    vo_model.train(train_vo_data, noun_space, sentence_space)
    vo_space = vo_model.function_space

    self.assertListEqual(vo_space.id2row,
                         ["hate_boy", "hate_man", "love_boy", "love_car"])
    self.assertTupleEqual(vo_space.element_shape, (2, 2))

    # flatten each generator and compare it with the learned row
    for row_index, generator in enumerate(verb_object_mats):
        generator.reshape((1, 4))
        np.testing.assert_array_almost_equal(
            generator.mat,
            vo_space.cooccurrence_matrix.mat[row_index])

    # test train 3d
    verb_model = LexicalFunction(learner=LstsqRegressionLearner(intercept=False))
    verb_model.train(train_v_data, noun_space, vo_space)
    verb_space = verb_model.function_space

    np.testing.assert_array_almost_equal(expected_verbs.mat,
                                         verb_space.cooccurrence_matrix.mat)
    self.assertListEqual(verb_space.id2row, ["hate", "love"])
    self.assertTupleEqual(verb_space.element_shape, (2, 2, 2))

    # test compose 3d
    recomposed_space = verb_model.compose(train_v_data, noun_space)
    self.assertListEqual(list(vo_space.id2row),
                         sorted(recomposed_space.id2row))

    row_ids = vo_space.id2row
    learned_rows = vo_space.get_rows(row_ids)
    recomposed_rows = recomposed_space.get_rows(row_ids)

    np.testing.assert_array_almost_equal(learned_rows.mat,
                                         recomposed_rows.mat, 7)
    self.assertTupleEqual(vo_space.element_shape,
                          recomposed_space.element_shape)
def test_simple_3d_intercept(self):
    """Two-stage training and composition with intercepts.

    Stage one learns verb-object functions from subject/sentence data.
    Stage two learns a verb function from noun/verb-object data. The test
    then composes a verb-object function from the verb and applies it to a
    subject, checking values, element shapes and ids at every step.

    Every model gets ``_MIN_SAMPLES = 1`` before it is used; presumably
    this lowers the minimum number of training phrases per function word
    -- TODO confirm against LexicalFunction.
    """
    svo_train = [("drive_car", "I", "I_drive_car"),
                 ("read_man", "You", "You_read_man"),
                 ("read_man", "I", "I_read_man"),
                 ("drive_car", "You", "You_drive_car"),
                 ("drive_man", "You", "You_drive_man"),
                 ("drive_man", "I", "I_drive_man")]
    vo_train = [("drive", "car", "drive_car"),
                ("drive", "man", "drive_man")]

    noun_mat = DenseMatrix(np.mat([[1, 2], [3, 4], [5, 6], [7, 8]]))
    sentence_mat = DenseMatrix(np.mat([[1, 2], [3, 4], [1, 2],
                                       [3, 4], [3, 4], [1, 2]]))

    noun_space = Space(noun_mat, ["I", "You", "man", "car"], [])
    sentence_space = Space(sentence_mat,
                           ["I_drive_car", "You_read_man", "I_read_man",
                            "You_drive_car", "You_drive_man", "I_drive_man"],
                           ["f1", "f2"])

    # every verb-object function is expected to learn this flattened element
    vo_element = [0.6666, 0.3333, -0.3333, 0.3333, 0.6666, 0.3333]

    # test first stage train
    stage_model = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
    stage_model._MIN_SAMPLES = 1
    stage_model.train(svo_train, noun_space, sentence_space)
    vo_space = stage_model.function_space

    np.testing.assert_array_almost_equal(vo_space.cooccurrence_matrix.mat,
                                         np.mat([vo_element] * 3), 4)
    self.assertTupleEqual(vo_space.element_shape, (2, 3))
    self.assertListEqual(vo_space.id2row,
                         ["drive_car", "drive_man", "read_man"])
    self.assertListEqual(vo_space.id2column, [])

    # test first stage compose
    composed = stage_model.compose([svo_train[0]], noun_space)
    np.testing.assert_array_almost_equal(composed.cooccurrence_matrix.mat,
                                         np.mat([[1, 2]]), 8)
    self.assertTupleEqual(composed.element_shape, (2,))
    self.assertListEqual(composed.id2row, ["I_drive_car"])
    self.assertListEqual(composed.id2column, ["f1", "f2"])

    # test second stage train
    stage_model = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
    stage_model._MIN_SAMPLES = 1
    stage_model.train(vo_train, noun_space, vo_space)
    verb_space = stage_model.function_space

    np.testing.assert_array_almost_equal(
        verb_space.cooccurrence_matrix.mat,
        np.mat([[-0.2222, 0.2222, 0.4444,
                 -0.1111, 0.1111, 0.2222,
                 0.1111, -0.1111, -0.2222,
                 -0.1111, 0.1111, 0.2222,
                 -0.2222, 0.2222, 0.4444,
                 -0.1111, 0.1111, 0.2222]]),
        4)
    self.assertTupleEqual(verb_space.element_shape, (2, 3, 3))
    self.assertListEqual(verb_space.id2row, ["drive"])
    self.assertListEqual(verb_space.id2column, [])

    # test compose1
    composed = stage_model.compose([vo_train[0]], noun_space)
    np.testing.assert_array_almost_equal(composed.cooccurrence_matrix.mat,
                                         np.mat([vo_element]), 4)
    self.assertTupleEqual(composed.element_shape, (2, 3))
    self.assertListEqual(composed.id2row, ["drive_car"])
    self.assertListEqual(composed.id2column, [])

    # test compose2, then the recursive application
    # (write a wrapper around it!!!)
    composed_model = LexicalFunction(function_space=composed, intercept=True)
    composed_model._MIN_SAMPLES = 1
    for phrases in ([svo_train[0]], [("drive_car", "I", "I_drive_car")]):
        sentence = composed_model.compose(phrases, noun_space)
        np.testing.assert_array_almost_equal(sentence.cooccurrence_matrix.mat,
                                             np.mat([[1, 2]]), 8)
        self.assertTupleEqual(sentence.element_shape, (2,))
        self.assertListEqual(sentence.id2row, ["I_drive_car"])
        self.assertListEqual(sentence.id2column, [])
from composes.composition.lexical_function import LexicalFunction

# training data: one function word ("good_function") observed with two
# arguments (original note: trying to learn a "book" function)
train_data = [
    ("good_function", "car", "good_car"),
    ("good_function", "book", "good_book"),
]

# load argument and phrase space
arg_space = io_utils.load("./data/out/ex10.pkl")
phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl")

# train a lexical function model on the data
my_comp = LexicalFunction()
my_comp.train(train_data, arg_space, phrase_space)

# apply the trained model
comp_sp1 = my_comp.compose(
    [("good_function", "car", "good_car")],
    arg_space,
)

# apply the trained model a second time, to the output of the first pass
comp_sp2 = my_comp.compose(
    [("good_function", "good_car", "good_good_car")],
    comp_sp1,
)

# show the composed spaces:
print("\nComposed space 1:")
print(comp_sp1.id2row)
print(comp_sp1.cooccurrence_matrix)

print("\nComposed space 2:")
print(comp_sp2.id2row)
print(comp_sp2.cooccurrence_matrix)
print("\nStep 2 training")
vo_space = vo_model.function_space
v_model = LexicalFunction(learner=LstsqRegressionLearner())
v_model.train(train_v_data, n_space, vo_space)

# show the learned model
print("\n3D Verb space")
print(v_model.function_space.id2row)
print(v_model.function_space.cooccurrence_matrix)

# 3. use the trained models to compose new SVO sentences

# 3.1 use the V model to create new VO combinations
vo_composed_space = v_model.compose(
    [
        ("hate", "woman", "hate_woman"),
        ("hate", "man", "hate_man"),
    ],
    n_space,
)

# 3.2 the new VO combinations will be used as functions:
#     load the new VO combinations obtained through composition into
#     a new composition model
expanded_vo_model = LexicalFunction(
    function_space=vo_composed_space,
    intercept=v_model._has_intercept,
)

# 3.3 use the new VO combinations by composing them with subject nouns
#     in order to obtain new SVO sentences
svo_composed_space = expanded_vo_model.compose(
    [
        ("hate_woman", "woman", "woman_hates_woman"),
        ("hate_man", "man", "man_hates_man"),
    ],
    n_space,
)

# print the composed spaces:
# one function word ("good_function") observed with two arguments
# (original note: trying to learn a "book" function)
train_data = [
    ("good_function", "car", "good_car"),
    ("good_function", "book", "good_book"),
]

# load argument and phrase space
arg_space = io_utils.load("./data/out/ex10.pkl")
phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl")

# train a lexical function model on the data
my_comp = LexicalFunction()
my_comp.train(train_data, arg_space, phrase_space)

# apply the trained model
comp_sp1 = my_comp.compose(
    [("good_function", "car", "good_car")],
    arg_space,
)

# apply the trained model a second time, to the output of the first pass
comp_sp2 = my_comp.compose(
    [("good_function", "good_car", "good_good_car")],
    comp_sp1,
)

# show the composed spaces:
print("\nComposed space 1:")
print(comp_sp1.id2row)
print(comp_sp1.cooccurrence_matrix)

print("\nComposed space 2:")
print(comp_sp2.id2row)
train_data.append(tup) ''' try: with open('temp_func.pkl', 'rb') as file: print("Loading model...") my_comp = pickle.load(file) except FileNotFoundError: ''' print("Training LexicalFunction...") my_comp = LexicalFunction() my_comp.train(train_data, my_space, my_space) with open('temp_func.pkl', 'wb') as file: pickle.dump(my_comp, file) print("Building composed space...") composed_space = my_comp.compose(train_data, my_space) # print(composed_space.id2row) # print(composed_space.cooccurrence_matrix) # compute similarity between two words in the space cos_sim = {} for pair in train_data: cos = my_space.get_sim(pair[1], pair[2], CosSimilarity(), space2=composed_space) if pair[0] in cos_sim: cos_sim[pair[0]].append(cos) else: cos_sim[pair[0]] = [cos]
cols = data_path + "per.raw.SV.cols", format = "sm" ) #reading in train data train_data_file = data_path + "ML08_SV_train.txt" train_data = io_utils.read_tuple_list(train_data_file, fields=[0,1,2]) print "Training Lexical Function composition model..." comp_model = LexicalFunction(learner = RidgeRegressionLearner(param=2)) comp_model.train(train_data, space, per_space) print "Composing phrases..." test_phrases_file = data_path + "ML08nvs_test.txt" test_phrases = io_utils.read_tuple_list(test_phrases_file, fields=[0,1,2]) composed_space = comp_model.compose(test_phrases, space) print "Reading similarity test data..." test_similarity_file = data_path + "ML08data_new.txt" test_pairs = io_utils.read_tuple_list(test_similarity_file, fields=[0,1]) gold = io_utils.read_list(test_similarity_file, field=2) print "Computing similarity with lexical function..." pred = composed_space.get_sims(test_pairs, CosSimilarity()) #use this composed space to assign similarities print "Scoring lexical function..." print scoring_utils.score(gold, pred, "spearman") print "Training Full Additive composition model..."