def test_min_samples1(self):
    """Train with ``_MIN_SAMPLES`` raised to 2 and inspect the function space.

    Five triples are supplied, but ``a1`` is the only function word that
    appears more than once. The assertions expect ``a1`` to be the sole
    row of the learned function space, stored with element shape (2, 3).
    """
    # TODO test a1_car twice in the phrase list
    phrase_triples = [
        ("bla3", "man", "a1_car"),
        ("a1", "car", "a1_car"),
        ("bla2", "man", "a1_car"),
        ("a1", "man", "a1_man"),
        ("bla1", "man", "a1_car"),
    ]

    # model with train and then compose
    lex_fn = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
    # presumably function words with fewer samples than this are skipped
    # during training -- confirm against LexicalFunction.train
    lex_fn._MIN_SAMPLES = 2
    lex_fn.train(phrase_triples, self.n_space, self.an_space)
    learned = lex_fn.function_space

    expected_weights = np.mat([[0.66666667, 0.33333333, -0.33333333,
                                0.33333333, 0.66666667, 0.33333333]])
    np.testing.assert_array_almost_equal(learned.cooccurrence_matrix.mat,
                                         expected_weights, 7)

    self.assertTupleEqual(learned.element_shape, (2, 3))
    self.assertListEqual(learned.id2row, ["a1"])
    self.assertListEqual(learned.id2column, [])
class LexfunModel(Model):
    """Wrap a single ``LexicalFunction`` behind a fit/predict interface.

    ``LexicalFunction`` learns one function per function word. This wrapper
    trains exactly one function by tagging every training pair with the
    same placeholder function word (``'dummy'``).
    """

    # Underlying LexicalFunction; replaced per instance in __init__.
    lexfun = None

    def __init__(self, space, learner='LeastSquares', intercept=True, param=None):
        """Create the wrapper and its regression learner.

        :param space: space handed to ``Model.__init__`` and later used as
            both argument space and phrase space.
        :param learner: ``'LeastSquares'`` or ``'Ridge'``.
        :param intercept: forwarded to the chosen regression learner.
        :param param: forwarded to the ridge learner only; the original
            note says that ``None`` triggers generalized CV over the
            learner's standard parameter range.
        :raises NameError: if ``learner`` names neither supported learner.
        """
        Model.__init__(self, space)

        if learner == 'Ridge':
            regressor = RidgeRegressionLearner(intercept=intercept, param=param)
        elif learner == 'LeastSquares':
            # Fix: ``intercept`` used to be dropped here, so the learner's
            # own default was used regardless of what the caller asked for.
            regressor = LstsqRegressionLearner(intercept=intercept)
        else:
            raise NameError("No such learner: %s" % learner)

        self.lexfun = LexicalFunction(learner=regressor)

    def fit(self, train_pairs, verbose=False):
        """Train the wrapped lexical function on ``(base, derived)`` pairs.

        :param train_pairs: sized collection of ``(base, derived)`` pairs.
        :param verbose: when true, print the number of training pairs.
        :raises NameError: if ``train_pairs`` is empty.
        """
        if len(train_pairs) == 0:
            raise NameError('Error: Train set is empty')

        if verbose:
            # Single-argument parenthesised form prints the same text under
            # both the print statement and the print function.
            print('fit: Fitting a lexfun model on %d pairs' % (len(train_pairs)))

        # LexicalFunction is designed for datasets with several function
        # words (== patterns); a dummy function word is used here.
        tagged_pairs = [('dummy', base, derived) for (base, derived) in train_pairs]
        self.lexfun.train(tagged_pairs, self.space, self.space)

    def predict(self, base, verbose=False):
        """Compose the dummy function with ``base`` and return the result row.

        :param base: identifier of the argument in ``self.space``.
        :param verbose: accepted for interface symmetry; unused.
        :returns: the row labelled ``'derived'`` of the composed space.
        :raises NameError: if no ``LexicalFunction`` is attached.
        """
        # NOTE(review): __init__ always assigns ``self.lexfun``, so this
        # guard cannot detect an untrained model -- confirm what
        # ``compose`` raises when called before ``fit``.
        if self.lexfun is None:
            raise NameError('Error: Model has not yet been trained')

        composed_space = self.lexfun.compose([('dummy', base, 'derived')], self.space)
        return composed_space.get_row('derived')
def test_min_samples1(self):
    """Train with ``min_samples=2`` and inspect the learned function space.

    ``a1`` is the only function word occurring more than once in the
    training triples. The assertions expect it to be the single row of the
    function space, with element shape (2, 3) and no column labels.
    """
    # TODO test a1_car twice in the phrase list
    phrase_triples = [
        ("bla3", "man", "a1_car"),
        ("a1", "car", "a1_car"),
        ("bla2", "man", "a1_car"),
        ("a1", "man", "a1_man"),
        ("bla1", "man", "a1_car"),
    ]

    # model with train and then compose
    regressor = LstsqRegressionLearner(intercept=True)
    lex_fn = LexicalFunction(learner=regressor, min_samples=2)
    lex_fn.train(phrase_triples, self.n_space, self.an_space)
    learned = lex_fn.function_space

    expected_weights = np.mat([[0.66666667, 0.33333333, -0.33333333,
                                0.33333333, 0.66666667, 0.33333333]])
    np.testing.assert_array_almost_equal(learned.cooccurrence_matrix.mat,
                                         expected_weights, 7)

    self.assertTupleEqual(learned.element_shape, (2, 3))
    self.assertListEqual(learned.id2row, ["a1"])
    self.assertListEqual(learned.id2column, [])
def train_one_space(core_space, per_space, func_pos, number_of_lambdas):
    """Train a ridge-regression lexical function and return its function space.

    :param core_space: argument space passed to ``LexicalFunction.train``.
    :param per_space: phrase space; the training list is derived from it
        before it is row-normalized.
    :param func_pos: forwarded to ``get_training_list``.
    :param number_of_lambdas: number of ridge parameters, log-spaced
        between 10**-1 and 10**1.
    :returns: the ``function_space`` of the trained model.
    """
    # The training list is built from the un-normalized space, as before.
    triples = get_training_list(per_space, 1, func_pos)
    normalized_space = per_space.apply(RowNormalization())

    ridge = RidgeRegressionLearner(
        param_range=np.logspace(-1, 1, number_of_lambdas),
        intercept=False)
    model = LexicalFunction(learner=ridge)
    model.train(triples, core_space, normalized_space)
    return model.function_space
def test_simple_train_compose_intercept(self):
    """Train with an intercept, then compose three ways.

    Stages exercised:
      1. train on two ``a1`` triples and check the learned function space;
      2. compose with the trained model;
      3. compose with a fresh model built from the learned function space;
      4. feed that composed space back in (recursive application).
    """
    # TODO test a1_car twice in the phrase list
    train_data = [("a1", "car", "a1_car"),
                  ("a1", "man", "a1_man"),
                  ]

    # model with train and then compose
    learner_ = LstsqRegressionLearner(intercept=True)
    model = LexicalFunction(learner=learner_)
    model._MIN_SAMPLES = 1
    model.train(train_data, self.n_space, self.an_space)

    new_space = model.function_space
    np.testing.assert_array_almost_equal(
        new_space.cooccurrence_matrix.mat,
        np.mat([[0.66666667, 0.33333333, -0.33333333,
                 0.33333333, 0.66666667, 0.33333333]]),
        7)
    self.assertTupleEqual(new_space.element_shape, (2, 3))
    self.assertListEqual(new_space.id2row, ["a1"])
    self.assertListEqual(new_space.id2column, [])

    comp_space = model.compose(train_data, self.n_space)
    np.testing.assert_array_almost_equal(
        comp_space.cooccurrence_matrix.mat,
        self.an_space.cooccurrence_matrix.mat,
        10)
    self.assertListEqual(comp_space.id2row, ["a1_car", "a1_man"])
    self.assertListEqual(comp_space.id2column, self.ft)

    # new model, without training
    model2 = LexicalFunction(function_space=new_space, intercept=True)
    model2._MIN_SAMPLES = 1
    comp_space = model2.compose(train_data, self.n_space)
    self.assertListEqual(comp_space.id2row, ["a1_car", "a1_man"])
    self.assertListEqual(comp_space.id2column, [])
    np.testing.assert_array_almost_equal(
        comp_space.cooccurrence_matrix.mat,
        self.n_space.cooccurrence_matrix.mat,
        8)

    # recursive application
    comp_space2 = model2.compose([("a1", "a1_car", "a1_a1_car"),
                                  ("a1", "a1_man", "a1_a1_man")],
                                 comp_space)
    self.assertListEqual(comp_space2.id2row, ["a1_a1_car", "a1_a1_man"])
    # Fix: this used to re-check ``comp_space`` (already asserted above),
    # leaving the recursively composed space's columns unverified.
    self.assertListEqual(comp_space2.id2column, [])
    np.testing.assert_array_almost_equal(
        comp_space2.cooccurrence_matrix.mat,
        self.n_space.cooccurrence_matrix.mat,
        8)
    self.assertEqual(comp_space.element_shape, (2,))
    self.assertEqual(comp_space2.element_shape, (2,))
def test_train_intercept(self):
    """Train two functions with an intercept, then compose from their space.

    Phrase vectors are generated by applying two known 2x2 matrices to the
    noun vectors. After training, the learned function space is wrapped in
    a new ``Space`` and used by an untrained model to compose the training
    phrases again.
    """
    fn1 = DenseMatrix(np.mat([[3, 4], [5, 6]]))
    fn2 = DenseMatrix(np.mat([[1, 2], [3, 4]]))

    triples = [
        ("a1", "man", "a1_man"),
        ("a2", "car", "a2_car"),
        ("a1", "boy", "a1_boy"),
        ("a2", "boy", "a2_boy"),
    ]

    nouns = DenseMatrix(np.mat([[13, 21], [3, 4], [5, 6]]))
    noun_space = Space(nouns, ["man", "car", "boy"], self.ft)

    phrases1 = (fn1 * nouns.transpose()).transpose()
    phrases2 = (fn2 * nouns.transpose()).transpose()
    phrases = phrases1.vstack(phrases2)
    phrase_labels = ["a1_man", "a1_car", "a1_boy", "a2_man", "a2_car", "a2_boy"]
    phrase_space = Space(phrases, phrase_labels, self.ft)

    # test train
    trained = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
    trained._MIN_SAMPLES = 1
    trained.train(triples, noun_space, phrase_space)
    fn_space = trained.function_space

    # The reshapes are kept from the original; the comparisons they
    # prepared for are disabled:
    #   assert_array_almost_equal(fn1.mat, fn_space.cooccurrence_matrix.mat[0])
    #   assert_array_almost_equal(fn2.mat, fn_space.cooccurrence_matrix.mat[1])
    fn1.reshape((1, 4))
    fn2.reshape((1, 4))

    self.assertListEqual(fn_space.id2row, ["a1", "a2"])
    self.assertTupleEqual(fn_space.element_shape, (2, 3))

    # test compose
    # These two matrices are built as in the original but never read.
    fn1 = DenseMatrix(np.mat([[3, 4, 5, 6]]))
    fn2 = DenseMatrix(np.mat([[1, 2, 3, 4]]))

    rebuilt_space = Space(fn_space.cooccurrence_matrix, ["a1", "a2"], [],
                          element_shape=(2, 3))
    composer = LexicalFunction(function_space=rebuilt_space, intercept=True)
    composer._MIN_SAMPLES = 1
    composed = composer.compose(triples, noun_space)

    self.assertListEqual(composed.id2row,
                         ["a1_man", "a2_car", "a1_boy", "a2_boy"])
    self.assertListEqual(composed.id2column, [])
    self.assertEqual(composed.element_shape, (2,))
    # Rows 0, 4, 2, 5 of the phrase matrix match the training triple order.
    np.testing.assert_array_almost_equal(composed.cooccurrence_matrix.mat,
                                         phrases[[0, 4, 2, 5]].mat, 8)
def test_simple_train_compose_intercept(self):
    """Train with an intercept, then compose three ways.

    Stages exercised:
      1. train on two ``a1`` triples and check the learned function space;
      2. compose with the trained model;
      3. compose with a fresh model built from the learned function space;
      4. feed that composed space back in (recursive application).
    """
    # TODO test a1_car twice in the phrase list
    train_data = [("a1", "car", "a1_car"),
                  ("a1", "man", "a1_man"),
                  ]

    # model with train and then compose
    learner_ = LstsqRegressionLearner(intercept=True)
    model = LexicalFunction(learner=learner_)
    model.train(train_data, self.n_space, self.an_space)

    new_space = model.function_space
    np.testing.assert_array_almost_equal(
        new_space.cooccurrence_matrix.mat,
        np.mat([[0.66666667, 0.33333333, -0.33333333,
                 0.33333333, 0.66666667, 0.33333333]]),
        7)
    self.assertTupleEqual(new_space.element_shape, (2, 3))
    self.assertListEqual(new_space.id2row, ["a1"])
    self.assertListEqual(new_space.id2column, [])

    comp_space = model.compose(train_data, self.n_space)
    np.testing.assert_array_almost_equal(
        comp_space.cooccurrence_matrix.mat,
        self.an_space.cooccurrence_matrix.mat,
        10)
    self.assertListEqual(comp_space.id2row, ["a1_car", "a1_man"])
    self.assertListEqual(comp_space.id2column, self.ft)

    # new model, without training
    model2 = LexicalFunction(function_space=new_space, intercept=True)
    comp_space = model2.compose(train_data, self.n_space)
    self.assertListEqual(comp_space.id2row, ["a1_car", "a1_man"])
    self.assertListEqual(comp_space.id2column, [])
    np.testing.assert_array_almost_equal(
        comp_space.cooccurrence_matrix.mat,
        self.n_space.cooccurrence_matrix.mat,
        8)

    # recursive application
    comp_space2 = model2.compose([("a1", "a1_car", "a1_a1_car"),
                                  ("a1", "a1_man", "a1_a1_man")],
                                 comp_space)
    self.assertListEqual(comp_space2.id2row, ["a1_a1_car", "a1_a1_man"])
    # Fix: this used to re-check ``comp_space`` (already asserted above),
    # leaving the recursively composed space's columns unverified.
    self.assertListEqual(comp_space2.id2column, [])
    np.testing.assert_array_almost_equal(
        comp_space2.cooccurrence_matrix.mat,
        self.n_space.cooccurrence_matrix.mat,
        8)
    self.assertEqual(comp_space.element_shape, (2,))
    self.assertEqual(comp_space2.element_shape, (2,))
def test_train_intercept(self):
    """Train two functions with an intercept, then compose from their space.

    Phrase vectors are generated by applying two known 2x2 matrices to the
    noun vectors. After training, the learned function space is wrapped in
    a new ``Space`` and used by an untrained model to compose the training
    phrases again.
    """
    fn1 = DenseMatrix(np.mat([[3, 4], [5, 6]]))
    fn2 = DenseMatrix(np.mat([[1, 2], [3, 4]]))

    triples = [
        ("a1", "man", "a1_man"),
        ("a2", "car", "a2_car"),
        ("a1", "boy", "a1_boy"),
        ("a2", "boy", "a2_boy"),
    ]

    nouns = DenseMatrix(np.mat([[13, 21], [3, 4], [5, 6]]))
    noun_space = Space(nouns, ["man", "car", "boy"], self.ft)

    phrases1 = (fn1 * nouns.transpose()).transpose()
    phrases2 = (fn2 * nouns.transpose()).transpose()
    phrases = phrases1.vstack(phrases2)
    phrase_labels = ["a1_man", "a1_car", "a1_boy", "a2_man", "a2_car", "a2_boy"]
    phrase_space = Space(phrases, phrase_labels, self.ft)

    # test train
    trained = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
    trained.train(triples, noun_space, phrase_space)
    fn_space = trained.function_space

    # The reshapes are kept from the original; the comparisons they
    # prepared for are disabled:
    #   assert_array_almost_equal(fn1.mat, fn_space.cooccurrence_matrix.mat[0])
    #   assert_array_almost_equal(fn2.mat, fn_space.cooccurrence_matrix.mat[1])
    fn1.reshape((1, 4))
    fn2.reshape((1, 4))

    self.assertListEqual(fn_space.id2row, ["a1", "a2"])
    self.assertTupleEqual(fn_space.element_shape, (2, 3))

    # test compose
    # These two matrices are built as in the original but never read.
    fn1 = DenseMatrix(np.mat([[3, 4, 5, 6]]))
    fn2 = DenseMatrix(np.mat([[1, 2, 3, 4]]))

    rebuilt_space = Space(fn_space.cooccurrence_matrix, ["a1", "a2"], [],
                          element_shape=(2, 3))
    composer = LexicalFunction(function_space=rebuilt_space, intercept=True)
    composed = composer.compose(triples, noun_space)

    self.assertListEqual(composed.id2row,
                         ["a1_man", "a2_car", "a1_boy", "a2_boy"])
    self.assertListEqual(composed.id2column, [])
    self.assertEqual(composed.element_shape, (2,))
    # Rows 0, 4, 2, 5 of the phrase matrix match the training triple order.
    np.testing.assert_array_almost_equal(composed.cooccurrence_matrix.mat,
                                         phrases[[0, 4, 2, 5]].mat, 8)
def test_lexical_function(self):
    """Export must fail before training and succeed afterwards.

    Builds a two-row argument space and a two-row phrase space on the test
    instance, checks that exporting an untrained ``LexicalFunction`` raises
    ``IllegalStateError``, then trains and exports again.
    """
    # Fixtures are stored on the instance, as in the original test.
    self.row = ["a", "b"]
    self.ft = ["f1", "f2"]
    self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
    self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
    self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))

    self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
    self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)

    lex_fn = LexicalFunction()
    lex_fn._MIN_SAMPLES = 1

    # Nothing has been trained yet, so there is nothing to export.
    self.assertRaises(IllegalStateError, lex_fn.export, self.prefix + ".lf1")

    triples = [("a", "b", "a_b"), ("a", "a", "a_a")]
    lex_fn.train(triples, self.space1, self.space2)
    lex_fn.export(self.prefix + ".lf2")
def test_lexical_function(self):
    """Export must fail before training and succeed afterwards.

    Builds a two-row argument space and a two-row phrase space on the test
    instance, checks that exporting an untrained ``LexicalFunction`` raises
    ``IllegalStateError``, then trains and exports again.
    """
    # Fixtures are stored on the instance, as in the original test.
    self.row = ["a", "b"]
    self.ft = ["f1", "f2"]
    self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
    self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
    self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))

    self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
    self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)

    lex_fn = LexicalFunction()
    lex_fn._MIN_SAMPLES = 1

    # Nothing has been trained yet, so there is nothing to export.
    untrained_target = self.prefix + ".lf1"
    self.assertRaises(IllegalStateError, lex_fn.export, untrained_target)

    lex_fn.train([("a", "b", "a_b"), ("a", "a", "a_a")],
                 self.space1, self.space2)
    lex_fn.export(self.prefix + ".lf2")
def learn_TENSOR_matrix():
    """Learn one matrix per adjective, then a tensor mapping adjectives to them.

    Step 1 trains a separate ``LexicalFunction`` for every adjective returned
    by ``extract_adj`` and collects the resulting matrices into one space.
    Step 2 trains a second ``LexicalFunction`` (function word ``tens_adj``)
    from the unigram vectors to those matrices, and saves it with
    ``save_space``. Reads the module-level ``args`` and ``unigram_space``.
    """
    bigram_space = load_space(args.function[2])
    adj_list = extract_adj(bigram_space)

    adj_matrices = []
    adj_row_labels = []
    for adj in adj_list:
        matrix_label = "ADJ" + "_" + adj
        # e.g. ("ADJ_good", "boy", "good_boy"): "ADJ_good" -> matrix to
        # learn, "boy" -> unigram, "good_boy" -> bigram
        train_data = []
        for bigram in bigram_space.id2row:
            pair = bigram.split('_')
            if pair[0] == adj:
                train_data.append((matrix_label, pair[1], bigram))

        # 1) learn the ADJ matrix for this adjective
        my_comp = LexicalFunction()
        my_comp.train(train_data, unigram_space, bigram_space)
        adj_matrices.append(my_comp.function_space.cooccurrence_matrix)
        adj_row_labels.append(my_comp.function_space.id2row)

    # Seed the combined space with the last adjective's matrix; the label
    # list is extended in place as the remaining matrices are stacked on.
    my_mat_id2row = adj_row_labels.pop()
    my_mat_space = Space(adj_matrices.pop(), my_mat_id2row, [])
    for labels, matrix in zip(adj_row_labels, adj_matrices):
        my_mat_id2row.extend(labels)
        stacked = my_mat_space.cooccurrence_matrix.vstack(matrix)
        my_mat_space = Space(stacked, my_mat_id2row, [])
    my_mat_space._element_shape = my_comp.function_space.element_shape

    # 2) use the ADJ matrices space to learn the tensor matrix
    # e.g. ("tens_adj", "good", "ADJ_good"): "tens_adj" -> tensor to learn,
    # "good" -> unigram, "ADJ_good" -> matrix learnt in step 1
    train_data = [('tens_adj', adj, "ADJ" + "_" + adj) for adj in adj_list]
    my_tens_adj = LexicalFunction()
    # unigram_space supplies "good"; my_mat_space supplies "ADJ_good"
    my_tens_adj.train(train_data, unigram_space, my_mat_space)
    save_space(my_tens_adj, "TENSOR_matrix", "matrices")
def learn_ADJ_matrices():
    """Learn one matrix per adjective in a single ``LexicalFunction`` and save it.

    Every bigram whose first ``_``-separated part is one of the adjectives
    returned by ``extract_adj`` contributes one training triple. Reads the
    module-level ``args`` and ``unigram_space``.
    """
    bigram_space = load_space(args.function[2])
    adj_list = extract_adj(bigram_space)

    train_data = []
    for bigram in bigram_space.id2row:
        pair = bigram.split('_')
        if pair[0] not in adj_list:
            continue
        # e.g. ("ADJ_good", "boy", "good_boy"): "ADJ_good" -> matrix to
        # learn, "boy" -> unigram, "good_boy" -> bigram
        train_data.append(("ADJ" + "_" + pair[0], pair[1], bigram))

    my_comp = LexicalFunction()
    # unigram_space supplies "boy"; bigram_space supplies "good_boy"
    my_comp.train(train_data, unigram_space, bigram_space)
    save_space(my_comp, "ADJ_matrices", "matrices")
# ex16.py
# -------
# Train a lexical function for "good" and inspect the learned function space.
from composes.utils import io_utils
from composes.composition.lexical_function import LexicalFunction
from composes.similarity.cos import CosSimilarity

# training data: triples of (function word, argument, observed phrase)
train_data = [
    ("good_function", "car", "good_car"),
    ("good_function", "book", "good_book"),
]

# load the argument space and the phrase space
arg_space = io_utils.load("./data/out/ex10.pkl")
phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl")

# train a lexical function model on the data
my_comp = LexicalFunction()
my_comp.train(train_data, arg_space, phrase_space)

# print its parameters
print("\nLexical function space:")
print(my_comp.function_space.id2row)
cooc_mat = my_comp.function_space.cooccurrence_matrix
# reshape to the element shape of the function space before printing
cooc_mat.reshape(my_comp.function_space.element_shape)
print(cooc_mat)

# similarity within the learned function space
print("\nSimilarity between good and good in the function space:")
print(my_comp.function_space.get_sim("good_function", "good_function",
                                     CosSimilarity()))
# Reduce the core space, attach the peripheral rows, train a ridge-based
# lexical function and score composed phrases against the similarity data.
# ``space`` and ``data_path`` are defined earlier in the script.
print("Applying SVD...")
space = space.apply(Svd(100))

print("Creating peripheral space..")
per_space = PeripheralSpace.build(
    space,
    data=data_path + "per.raw.SV.sm",
    cols=data_path + "per.raw.SV.cols",
    format="sm")

# read in the training triples
train_data_file = data_path + "ML08_SV_train.txt"
train_data = io_utils.read_tuple_list(train_data_file, fields=[0, 1, 2])

print("Training Lexical Function composition model...")
ridge_learner = RidgeRegressionLearner(param=2)
comp_model = LexicalFunction(learner=ridge_learner)
comp_model.train(train_data, space, per_space)

print("Composing phrases...")
test_phrases_file = data_path + "ML08nvs_test.txt"
test_phrases = io_utils.read_tuple_list(test_phrases_file, fields=[0, 1, 2])
composed_space = comp_model.compose(test_phrases, space)

print("Reading similarity test data...")
test_similarity_file = data_path + "ML08data_new.txt"
test_pairs = io_utils.read_tuple_list(test_similarity_file, fields=[0, 1])
gold = io_utils.read_list(test_similarity_file, field=2)

print("Computing similarity with lexical function...")
pred = composed_space.get_sims(test_pairs, CosSimilarity())

# use this composed space to assign similarities
# Two-step training: VO functions first, then V functions on top of them.
# ``train_vo_data`` and ``train_v_data`` are defined earlier in the script.
n_space = Space.build(data="./data/in/ex19-n.sm",
                      cols="./data/in/ex19-n.cols",
                      format="sm")
svo_space = Space.build(data="./data/in/ex19-svo.sm",
                        cols="./data/in/ex19-svo.cols",
                        format="sm")

print("\nInput SVO training space:")
print(svo_space.id2row)
print(svo_space.cooccurrence_matrix)

# 1. train a model to learn VO functions on train data: VO N -> SVO
print("\nStep 1 training")
vo_model = LexicalFunction(learner=LstsqRegressionLearner())
vo_model.train(train_vo_data, n_space, svo_space)

# 2. train a model to learn V functions on train data: V N -> VO,
#    where the VO space is the function space learned in step 1
print("\nStep 2 training")
vo_space = vo_model.function_space
v_model = LexicalFunction(learner=LstsqRegressionLearner())
v_model.train(train_v_data, n_space, vo_space)

# print the learned model
print("\n3D Verb space")
print(v_model.function_space.id2row)
print(v_model.function_space.cooccurrence_matrix)

# 3. use the trained models to compose new SVO sentences
def test_3d(self):
    """Learn VO matrices from sentences, then a verb tensor from the VO matrices.

    Sentence vectors are generated from four known VO matrices, so the first
    training stage is expected to recover those matrices and the second
    stage the verb tensor ``verb_tensor``. Composing with the second model
    must reproduce the VO space.
    """
    # setting up
    verb_tensor = DenseMatrix(np.mat([[0, 0, 1, 1, 2, 2, 3, 3],    # hate
                                      [0, 1, 2, 4, 5, 6, 8, 9]]))  # love

    hate_boy = DenseMatrix(np.mat([[0, 11], [22, 33]]))
    hate_man = DenseMatrix(np.mat([[0, 7], [14, 21]]))
    love_boy = DenseMatrix(np.mat([[6, 34], [61, 94]]))
    love_car = DenseMatrix(np.mat([[2, 10], [17, 26]]))
    vo_matrices = (hate_boy, hate_man, love_boy, love_car)

    sentence_triples = [
        ("hate_boy", "man", "man_hate_boy"),
        ("hate_man", "man", "man_hate_man"),
        ("hate_boy", "boy", "boy_hate_boy"),
        ("hate_man", "boy", "boy_hate_man"),
        ("love_car", "boy", "boy_love_car"),
        ("love_boy", "man", "man_love_boy"),
        ("love_boy", "boy", "boy_love_boy"),
        ("love_car", "man", "man_love_car"),
    ]

    # if do not find a phrase
    # what to do?
    vo_triples = [
        ("love", "boy", "love_boy"),
        ("hate", "man", "hate_man"),
        ("hate", "boy", "hate_boy"),
        ("love", "car", "love_car"),
    ]

    sentences = [
        "man_hate_boy", "car_hate_boy", "boy_hate_boy",
        "man_hate_man", "car_hate_man", "boy_hate_man",
        "man_love_boy", "car_love_boy", "boy_love_boy",
        "man_love_car", "car_love_car", "boy_love_car",
    ]

    nouns = DenseMatrix(np.mat([[3, 4], [1, 2], [5, 6]]))
    noun_space = Space(nouns, ["man", "car", "boy"], self.ft)

    # Apply every VO matrix to every noun to obtain the sentence vectors.
    sentence_blocks = []
    for vo_matrix in vo_matrices:
        sentence_blocks.append((vo_matrix * nouns.transpose()).transpose())
    sentence_space = Space(hate_boy.nary_vstack(sentence_blocks),
                           sentences, self.ft)

    # test train 2d
    vo_model = LexicalFunction(learner=LstsqRegressionLearner(intercept=False))
    vo_model.train(sentence_triples, noun_space, sentence_space)
    vo_space = vo_model.function_space

    self.assertListEqual(vo_space.id2row,
                         ["hate_boy", "hate_man", "love_boy", "love_car"])
    self.assertTupleEqual(vo_space.element_shape, (2, 2))

    # Each known matrix, flattened to one row, must match the learned row.
    for row_index, vo_matrix in enumerate(vo_matrices):
        vo_matrix.reshape((1, 4))
        np.testing.assert_array_almost_equal(
            vo_matrix.mat, vo_space.cooccurrence_matrix.mat[row_index])

    # test train 3d
    verb_model = LexicalFunction(learner=LstsqRegressionLearner(intercept=False))
    verb_model.train(vo_triples, noun_space, vo_space)
    verb_space = verb_model.function_space

    np.testing.assert_array_almost_equal(verb_tensor.mat,
                                         verb_space.cooccurrence_matrix.mat)
    self.assertListEqual(verb_space.id2row, ["hate", "love"])
    self.assertTupleEqual(verb_space.element_shape, (2, 2, 2))

    # test compose 3d
    composed_vo_space = verb_model.compose(vo_triples, noun_space)

    expected_rows = list(vo_space.id2row)
    composed_rows = sorted(composed_vo_space.id2row)
    self.assertListEqual(expected_rows, composed_rows)

    row_list = vo_space.id2row
    learned_block = vo_space.get_rows(row_list)
    composed_block = composed_vo_space.get_rows(row_list)
    np.testing.assert_array_almost_equal(learned_block.mat,
                                         composed_block.mat, 7)

    self.assertTupleEqual(vo_space.element_shape,
                          composed_vo_space.element_shape)
def test_simple_3d_intercept(self):
    """Two-stage training with intercepts, followed by chained composition.

    Stage 1 learns VO functions from SVO sentences; stage 2 learns the verb
    ``drive`` from the stage-1 function space. The composed ``drive_car``
    function is then wrapped in a new model and applied to a subject.
    """
    svo_triples = [
        ("drive_car", "I", "I_drive_car"),
        ("read_man", "You", "You_read_man"),
        ("read_man", "I", "I_read_man"),
        ("drive_car", "You", "You_drive_car"),
        ("drive_man", "You", "You_drive_man"),
        ("drive_man", "I", "I_drive_man"),
    ]
    vo_triples = [
        ("drive", "car", "drive_car"),
        ("drive", "man", "drive_man"),
    ]

    nouns = DenseMatrix(np.mat([[1, 2], [3, 4], [5, 6], [7, 8]]))
    sentences = DenseMatrix(np.mat([[1, 2], [3, 4], [1, 2],
                                    [3, 4], [3, 4], [1, 2]]))

    noun_space = Space(nouns, ["I", "You", "man", "car"], [])
    sentence_space = Space(sentences,
                           ["I_drive_car", "You_read_man", "I_read_man",
                            "You_drive_car", "You_drive_man", "I_drive_man"],
                           ["f1", "f2"])

    # Every learned VO function is expected to have these same weights.
    vo_weights = [0.6666, 0.3333, -0.3333, 0.3333, 0.6666, 0.3333]

    # test first stage train
    stage1 = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
    stage1.train(svo_triples, noun_space, sentence_space)
    vo_space = stage1.function_space

    np.testing.assert_array_almost_equal(
        vo_space.cooccurrence_matrix.mat,
        np.mat([vo_weights, vo_weights, vo_weights]),
        4)
    self.assertTupleEqual(vo_space.element_shape, (2, 3))
    self.assertListEqual(vo_space.id2row,
                         ["drive_car", "drive_man", "read_man"])
    self.assertListEqual(vo_space.id2column, [])

    # test first stage compose
    composed = stage1.compose([svo_triples[0]], noun_space)
    np.testing.assert_array_almost_equal(composed.cooccurrence_matrix.mat,
                                         np.mat([[1, 2]]), 8)
    self.assertTupleEqual(composed.element_shape, (2,))
    self.assertListEqual(composed.id2row, ["I_drive_car"])
    self.assertListEqual(composed.id2column, ["f1", "f2"])

    # test second stage train
    stage2 = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
    stage2.train(vo_triples, noun_space, vo_space)
    verb_space = stage2.function_space

    verb_weights = np.mat([[-0.2222, 0.2222, 0.4444,
                            -0.1111, 0.1111, 0.2222,
                            0.1111, -0.1111, -0.2222,
                            -0.1111, 0.1111, 0.2222,
                            -0.2222, 0.2222, 0.4444,
                            -0.1111, 0.1111, 0.2222]])
    np.testing.assert_array_almost_equal(verb_space.cooccurrence_matrix.mat,
                                         verb_weights, 4)
    self.assertTupleEqual(verb_space.element_shape, (2, 3, 3))
    self.assertListEqual(verb_space.id2row, ["drive"])
    self.assertListEqual(verb_space.id2column, [])

    # test compose1
    composed = stage2.compose([vo_triples[0]], noun_space)
    np.testing.assert_array_almost_equal(composed.cooccurrence_matrix.mat,
                                         np.mat([vo_weights]), 4)
    self.assertTupleEqual(composed.element_shape, (2, 3))
    self.assertListEqual(composed.id2row, ["drive_car"])
    self.assertListEqual(composed.id2column, [])

    # test compose2
    chained = LexicalFunction(function_space=composed, intercept=True)
    sentence = chained.compose([svo_triples[0]], noun_space)
    np.testing.assert_array_almost_equal(sentence.cooccurrence_matrix.mat,
                                         np.mat([[1, 2]]), 8)
    self.assertTupleEqual(sentence.element_shape, (2,))
    self.assertListEqual(sentence.id2row, ["I_drive_car"])
    self.assertListEqual(sentence.id2column, [])

    # recursive application, write a wrapper around it!!!
    sentence = chained.compose([("drive_car", "I", "I_drive_car")], noun_space)
    np.testing.assert_array_almost_equal(sentence.cooccurrence_matrix.mat,
                                         np.mat([[1, 2]]), 8)
    self.assertTupleEqual(sentence.element_shape, (2,))
    self.assertListEqual(sentence.id2row, ["I_drive_car"])
    self.assertListEqual(sentence.id2column, [])
def test_simple_3d_intercept(self):
    """Two-stage training with intercepts, followed by chained composition.

    Stage 1 learns VO functions from SVO sentences; stage 2 learns the verb
    ``drive`` from the stage-1 function space. The composed ``drive_car``
    function is then wrapped in a new model and applied to a subject.
    Every model has ``_MIN_SAMPLES`` set to 1 before use.
    """
    svo_triples = [
        ("drive_car", "I", "I_drive_car"),
        ("read_man", "You", "You_read_man"),
        ("read_man", "I", "I_read_man"),
        ("drive_car", "You", "You_drive_car"),
        ("drive_man", "You", "You_drive_man"),
        ("drive_man", "I", "I_drive_man"),
    ]
    vo_triples = [
        ("drive", "car", "drive_car"),
        ("drive", "man", "drive_man"),
    ]

    nouns = DenseMatrix(np.mat([[1, 2], [3, 4], [5, 6], [7, 8]]))
    sentences = DenseMatrix(np.mat([[1, 2], [3, 4], [1, 2],
                                    [3, 4], [3, 4], [1, 2]]))

    noun_space = Space(nouns, ["I", "You", "man", "car"], [])
    sentence_space = Space(sentences,
                           ["I_drive_car", "You_read_man", "I_read_man",
                            "You_drive_car", "You_drive_man", "I_drive_man"],
                           ["f1", "f2"])

    # Every learned VO function is expected to have these same weights.
    vo_weights = [0.6666, 0.3333, -0.3333, 0.3333, 0.6666, 0.3333]

    # test first stage train
    stage1 = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
    stage1._MIN_SAMPLES = 1
    stage1.train(svo_triples, noun_space, sentence_space)
    vo_space = stage1.function_space

    np.testing.assert_array_almost_equal(
        vo_space.cooccurrence_matrix.mat,
        np.mat([vo_weights, vo_weights, vo_weights]),
        4)
    self.assertTupleEqual(vo_space.element_shape, (2, 3))
    self.assertListEqual(vo_space.id2row,
                         ["drive_car", "drive_man", "read_man"])
    self.assertListEqual(vo_space.id2column, [])

    # test first stage compose
    composed = stage1.compose([svo_triples[0]], noun_space)
    np.testing.assert_array_almost_equal(composed.cooccurrence_matrix.mat,
                                         np.mat([[1, 2]]), 8)
    self.assertTupleEqual(composed.element_shape, (2,))
    self.assertListEqual(composed.id2row, ["I_drive_car"])
    self.assertListEqual(composed.id2column, ["f1", "f2"])

    # test second stage train
    stage2 = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
    stage2._MIN_SAMPLES = 1
    stage2.train(vo_triples, noun_space, vo_space)
    verb_space = stage2.function_space

    verb_weights = np.mat([[-0.2222, 0.2222, 0.4444,
                            -0.1111, 0.1111, 0.2222,
                            0.1111, -0.1111, -0.2222,
                            -0.1111, 0.1111, 0.2222,
                            -0.2222, 0.2222, 0.4444,
                            -0.1111, 0.1111, 0.2222]])
    np.testing.assert_array_almost_equal(verb_space.cooccurrence_matrix.mat,
                                         verb_weights, 4)
    self.assertTupleEqual(verb_space.element_shape, (2, 3, 3))
    self.assertListEqual(verb_space.id2row, ["drive"])
    self.assertListEqual(verb_space.id2column, [])

    # test compose1
    composed = stage2.compose([vo_triples[0]], noun_space)
    np.testing.assert_array_almost_equal(composed.cooccurrence_matrix.mat,
                                         np.mat([vo_weights]), 4)
    self.assertTupleEqual(composed.element_shape, (2, 3))
    self.assertListEqual(composed.id2row, ["drive_car"])
    self.assertListEqual(composed.id2column, [])

    # test compose2
    chained = LexicalFunction(function_space=composed, intercept=True)
    chained._MIN_SAMPLES = 1
    sentence = chained.compose([svo_triples[0]], noun_space)
    np.testing.assert_array_almost_equal(sentence.cooccurrence_matrix.mat,
                                         np.mat([[1, 2]]), 8)
    self.assertTupleEqual(sentence.element_shape, (2,))
    self.assertListEqual(sentence.id2row, ["I_drive_car"])
    self.assertListEqual(sentence.id2column, [])

    # recursive application, write a wrapper around it!!!
    sentence = chained.compose([("drive_car", "I", "I_drive_car")], noun_space)
    np.testing.assert_array_almost_equal(sentence.cooccurrence_matrix.mat,
                                         np.mat([[1, 2]]), 8)
    self.assertTupleEqual(sentence.element_shape, (2,))
    self.assertListEqual(sentence.id2row, ["I_drive_car"])
    self.assertListEqual(sentence.id2column, [])
def test_3d(self):
    """Learn VO matrices from sentences, then a verb tensor from the VO matrices.

    Sentence vectors are generated from four known VO matrices, so the first
    training stage is expected to recover those matrices and the second
    stage the verb tensor ``verb_tensor``. Composing with the second model
    must reproduce the VO space. Both models run with ``_MIN_SAMPLES`` = 1.
    """
    # setting up
    verb_tensor = DenseMatrix(np.mat([[0, 0, 1, 1, 2, 2, 3, 3],    # hate
                                      [0, 1, 2, 4, 5, 6, 8, 9]]))  # love

    hate_boy = DenseMatrix(np.mat([[0, 11], [22, 33]]))
    hate_man = DenseMatrix(np.mat([[0, 7], [14, 21]]))
    love_boy = DenseMatrix(np.mat([[6, 34], [61, 94]]))
    love_car = DenseMatrix(np.mat([[2, 10], [17, 26]]))
    vo_matrices = (hate_boy, hate_man, love_boy, love_car)

    sentence_triples = [
        ("hate_boy", "man", "man_hate_boy"),
        ("hate_man", "man", "man_hate_man"),
        ("hate_boy", "boy", "boy_hate_boy"),
        ("hate_man", "boy", "boy_hate_man"),
        ("love_car", "boy", "boy_love_car"),
        ("love_boy", "man", "man_love_boy"),
        ("love_boy", "boy", "boy_love_boy"),
        ("love_car", "man", "man_love_car"),
    ]

    # if do not find a phrase
    # what to do?
    vo_triples = [
        ("love", "boy", "love_boy"),
        ("hate", "man", "hate_man"),
        ("hate", "boy", "hate_boy"),
        ("love", "car", "love_car"),
    ]

    sentences = [
        "man_hate_boy", "car_hate_boy", "boy_hate_boy",
        "man_hate_man", "car_hate_man", "boy_hate_man",
        "man_love_boy", "car_love_boy", "boy_love_boy",
        "man_love_car", "car_love_car", "boy_love_car",
    ]

    nouns = DenseMatrix(np.mat([[3, 4], [1, 2], [5, 6]]))
    noun_space = Space(nouns, ["man", "car", "boy"], self.ft)

    # Apply every VO matrix to every noun to obtain the sentence vectors.
    sentence_blocks = []
    for vo_matrix in vo_matrices:
        sentence_blocks.append((vo_matrix * nouns.transpose()).transpose())
    sentence_space = Space(hate_boy.nary_vstack(sentence_blocks),
                           sentences, self.ft)

    # test train 2d
    vo_model = LexicalFunction(learner=LstsqRegressionLearner(intercept=False))
    vo_model._MIN_SAMPLES = 1
    vo_model.train(sentence_triples, noun_space, sentence_space)
    vo_space = vo_model.function_space

    self.assertListEqual(vo_space.id2row,
                         ["hate_boy", "hate_man", "love_boy", "love_car"])
    self.assertTupleEqual(vo_space.element_shape, (2, 2))

    # Each known matrix, flattened to one row, must match the learned row.
    for row_index, vo_matrix in enumerate(vo_matrices):
        vo_matrix.reshape((1, 4))
        np.testing.assert_array_almost_equal(
            vo_matrix.mat, vo_space.cooccurrence_matrix.mat[row_index])

    # test train 3d
    verb_model = LexicalFunction(learner=LstsqRegressionLearner(intercept=False))
    verb_model._MIN_SAMPLES = 1
    verb_model.train(vo_triples, noun_space, vo_space)
    verb_space = verb_model.function_space

    np.testing.assert_array_almost_equal(verb_tensor.mat,
                                         verb_space.cooccurrence_matrix.mat)
    self.assertListEqual(verb_space.id2row, ["hate", "love"])
    self.assertTupleEqual(verb_space.element_shape, (2, 2, 2))

    # test compose 3d
    composed_vo_space = verb_model.compose(vo_triples, noun_space)

    expected_rows = list(vo_space.id2row)
    composed_rows = sorted(composed_vo_space.id2row)
    self.assertListEqual(expected_rows, composed_rows)

    row_list = vo_space.id2row
    learned_block = vo_space.get_rows(row_list)
    composed_block = composed_vo_space.get_rows(row_list)
    np.testing.assert_array_almost_equal(learned_block.mat,
                                         composed_block.mat, 7)

    self.assertTupleEqual(vo_space.element_shape,
                          composed_vo_space.element_shape)
train_data = [] vocab = set(my_space.id2row) for tup in pair_data: if tup[1] in vocab and tup[2] in vocab: train_data.append(tup) ''' try: with open('temp_func.pkl', 'rb') as file: print("Loading model...") my_comp = pickle.load(file) except FileNotFoundError: ''' print("Training LexicalFunction...") my_comp = LexicalFunction() my_comp.train(train_data, my_space, my_space) with open('temp_func.pkl', 'wb') as file: pickle.dump(my_comp, file) print("Building composed space...") composed_space = my_comp.compose(train_data, my_space) # print(composed_space.id2row) # print(composed_space.cooccurrence_matrix) # compute similarity between two words in the space cos_sim = {} for pair in train_data: cos = my_space.get_sim(pair[1], pair[2], CosSimilarity(),
def train_grefenstette_multistep_composer(all_vectors_file, root_dir):
    """
    Train Grefenstette et al's multistep regression VO/SVO model.
    Adapted from dissect's ex19.py

    The noun and SVO entries of the input are written to temporary files
    under `root_dir`, loaded back as spaces, and used to train two
    LexicalFunction models that are saved as 'vo_comp.pkl' and
    'svo_comp.pkl'.

    :param all_vectors_file: file containing N, V, VO and SVO vectors
    :param root_dir: where to write temp files and output
    """
    mkdirs_if_not_exists(root_dir)
    vo_composer_output_file = join(root_dir, 'vo_comp.pkl')
    svo_composer_output_file = join(root_dir, 'svo_comp.pkl')

    filename = basename(all_vectors_file)
    noun_events_file = join(root_dir, '%s-onlyN.tmp' % filename)
    svo_events_file = join(root_dir, '%s-onlySVO.tmp' % filename)
    # (the original also sketched separate V-only and VO-only dumps, but
    # left them disabled; only nouns and SVOs are written out)

    def is_noun_unigram(entry):
        return entry.type == '1-GRAM' and entry.tokens[0].pos == 'N'

    def is_svo(entry):
        return entry.type == 'SVO'

    # this has unigrams and observed phrases
    thes = Vectors.from_tsv(all_vectors_file)
    thes.to_tsv(noun_events_file, entry_filter=is_noun_unigram)
    _translate_byblo_to_dissect(noun_events_file)
    thes.to_tsv(svo_events_file, entry_filter=is_svo)
    _translate_byblo_to_dissect(svo_events_file)

    # Training triples are (function, argument, phrase).
    train_vo_data = []
    train_v_data = []
    for phrase in thes.keys():
        df = DocumentFeature.from_string(phrase)
        if df.type == 'SVO':
            # the VO part is the function, the subject is the argument
            train_vo_data.append((str(df[1:]), str(df[0]), str(df)))
        if df.type == 'VO':
            # the verb is the function, the object is the argument
            train_v_data.append((str(df[0]), str(df[1]), str(df)))

    # load N and SVO spaces
    n_space = Space.build(data=noun_events_file + '.sm',
                          cols=noun_events_file + '.cols',
                          format="sm")
    svo_space = Space.build(data=svo_events_file + '.sm',
                            cols=svo_events_file + '.cols',
                            format="sm")

    logging.info("Input SVO training space:")
    logging.info(svo_space.id2row)

    # 1. train a model to learn VO functions on train data: VO N -> SVO
    logging.info("Step 1 training")
    # Gref et al 2013, section 5 says 3
    vo_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)
    vo_model.train(train_vo_data, n_space, svo_space)
    io_utils.save(vo_model, vo_composer_output_file)

    # 2. train a model to learn V functions on train data: V N -> VO
    # where VO space: function space learned in step 1
    logging.info("Step 2 training")
    vo_space = vo_model.function_space
    v_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)
    v_model.train(train_v_data, n_space, vo_space)
    io_utils.save(v_model, svo_composer_output_file)
# Two-step training: VO functions first, then V functions on top of them.
# ``train_vo_data`` and ``train_v_data`` are defined earlier in the script.
n_space = Space.build(data="./data/in/ex19-n.sm",
                      cols="./data/in/ex19-n.cols",
                      format="sm")
svo_space = Space.build(data="./data/in/ex19-svo.sm",
                        cols="./data/in/ex19-svo.cols",
                        format="sm")

print("\nInput SVO training space:")
print(svo_space.id2row)
print(svo_space.cooccurrence_matrix)

# 1. train a model to learn VO functions on train data: VO N -> SVO
print("\nStep 1 training")
vo_model = LexicalFunction(learner=LstsqRegressionLearner())
vo_model.train(train_vo_data, n_space, svo_space)

# 2. train a model to learn V functions on train data: V N -> VO,
#    where the VO space is the function space learned in step 1
print("\nStep 2 training")
vo_space = vo_model.function_space
v_model = LexicalFunction(learner=LstsqRegressionLearner())
v_model.train(train_v_data, n_space, vo_space)

# print the learned model
print("\n3D Verb space")
print(v_model.function_space.id2row)
print(v_model.function_space.cooccurrence_matrix)

# 3. use the trained models to compose new SVO sentences
# Train a lexical function for "good" and apply it twice in a row.
from composes.utils import io_utils
from composes.composition.lexical_function import LexicalFunction

# training data: triples of (function word, argument, observed phrase)
train_data = [
    ("good_function", "car", "good_car"),
    ("good_function", "book", "good_book"),
]

# load the argument space and the phrase space
arg_space = io_utils.load("./data/out/ex10.pkl")
phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl")

# train a lexical function model on the data
my_comp = LexicalFunction()
my_comp.train(train_data, arg_space, phrase_space)

# apply the trained model
first_application = [("good_function", "car", "good_car")]
comp_sp1 = my_comp.compose(first_application, arg_space)

# apply the trained model a second time, on the output of the first pass
second_application = [("good_function", "good_car", "good_good_car")]
comp_sp2 = my_comp.compose(second_application, comp_sp1)

# print the composed spaces:
print("\nComposed space 1:")
print(comp_sp1.id2row)
# Reduce the core space, attach the peripheral rows, train a ridge-based
# lexical function and score composed phrases against the similarity data.
# ``space`` and ``data_path`` are defined earlier in the script.
space = space.apply(Svd(100))

print("Creating peripheral space..")
per_space = PeripheralSpace.build(
    space,
    data=data_path + "per.raw.SV.sm",
    cols=data_path + "per.raw.SV.cols",
    format="sm")

# read in the training triples
train_data_file = data_path + "ML08_SV_train.txt"
train_data = io_utils.read_tuple_list(train_data_file, fields=[0, 1, 2])

print("Training Lexical Function composition model...")
ridge_learner = RidgeRegressionLearner(param=2)
comp_model = LexicalFunction(learner=ridge_learner)
comp_model.train(train_data, space, per_space)

print("Composing phrases...")
test_phrases_file = data_path + "ML08nvs_test.txt"
test_phrases = io_utils.read_tuple_list(test_phrases_file, fields=[0, 1, 2])
composed_space = comp_model.compose(test_phrases, space)

print("Reading similarity test data...")
test_similarity_file = data_path + "ML08data_new.txt"
test_pairs = io_utils.read_tuple_list(test_similarity_file, fields=[0, 1])
gold = io_utils.read_list(test_similarity_file, field=2)

print("Computing similarity with lexical function...")
pred = composed_space.get_sims(test_pairs, CosSimilarity())

# use this composed space to assign similarities