def test_init_svd(self):
    """Check peripheral spaces built on an SVD-reduced core space.

    The core space is reduced with ``Svd(2)`` and then additionally with
    ``Svd(1)``.  After each reduction a peripheral space is built from the
    same raw data and compared against the expected matrix, the core row
    indices, empty column indices and the expected number of operations.
    """
    scenarios = [(self.space2, self.us, self.us2, self.x, self.row3)]
    svd_two = Svd(2)
    svd_one = Svd(1)

    for core, want_first, want_second, raw, row_labels in scenarios:
        # Reductions are stacked: the second stage is applied on top of
        # the already reduced space, hence the growing operation count.
        stages = ((svd_two, want_first, 1), (svd_one, want_second, 2))
        for reduction, wanted, n_operations in stages:
            core = core.apply(reduction)
            peripheral = PeripheralSpace(core, DenseMatrix(raw), row_labels)

            np.testing.assert_array_almost_equal(
                wanted, peripheral.cooccurrence_matrix.mat, 2)
            self.assertListEqual(peripheral.id2row, core.id2row)
            self.assertListEqual(peripheral.id2column, [])
            self.assertDictEqual(peripheral.row2id, core.row2id)
            self.assertDictEqual(peripheral.column2id, {})
            self.assertEqual(n_operations, len(peripheral.operations))
def transform_raw_per_space(raw_per_space, in_file_prefix, out_dir,
                            out_format, core_space_file):
    """Rebuild a raw peripheral space on top of a pickled core space and save it.

    The core space is loaded from ``core_space_file``; a ``PeripheralSpace``
    is then constructed from it and from the matrix and row indices of
    ``raw_per_space``.  The result is pickled to
    ``<out_dir>/PER_SS.<input name>.<core name>.pkl`` and, when
    ``out_format`` is given, additionally exported in that format.

    :param raw_per_space: space providing ``cooccurrence_matrix``,
        ``id2row`` and ``row2id`` for the peripheral rows
    :param in_file_prefix: input file prefix; only the part after the last
        ``/`` is used in the output name
    :param out_dir: directory prefix of the output files
    :param out_format: export format, or ``None`` to skip the export
    :param core_space_file: path of the pickled core space
    """
    in_file_descr = "PER_SS." + in_file_prefix.split("/")[-1]
    core_space = io_utils.load(core_space_file, Space)
    # File name of the core space without directory and last extension.
    core_descr = ".".join(core_space_file.split("/")[-1].split(".")[0:-1])

    space = PeripheralSpace(core_space,
                            raw_per_space.cooccurrence_matrix,
                            raw_per_space.id2row,
                            raw_per_space.row2id)

    # Call form of print: valid on Python 2 and Python 3 alike.
    print("Printing...")
    out_file_prefix = "%s/%s.%s" % (out_dir, in_file_descr, core_descr)
    io_utils.save(space, out_file_prefix + ".pkl")
    if out_format is not None:
        space.export(out_file_prefix, format=out_format)
def test_init(self):
    """Check PeripheralSpace construction on plain and weighted core spaces.

    A peripheral space is built on the untouched core space, on the core
    space after ``EpmiWeighting`` and on the core space after an additional
    ``PlogWeighting``.  Each time the peripheral matrix, the columns, the
    row labels and the inherited operations are verified.  The test also
    verifies that the original core matrix is left unchanged.
    """
    test_cases = [(self.space1, self.m2, self.row2,
                   np.array([[2, 0.5, 1]]),
                   np.array([[0.69314718, 0, 0]]))]
    w1 = EpmiWeighting()
    w2 = PlogWeighting()

    for core_s, per_mat, per_row, per_mat_out1, per_mat_out2 in test_cases:
        tmp_mat = per_mat.copy()
        # Snapshot the core matrix by value.  Holding only a reference
        # would make the "core unchanged" checks below compare the matrix
        # with itself, so in-place modifications would go unnoticed.
        tmp_core_mat = core_s.cooccurrence_matrix.mat.copy()

        # No operations applied to the core space.
        per_s1 = PeripheralSpace(core_s, DenseMatrix(per_mat), per_row)
        np.testing.assert_array_equal(per_s1.cooccurrence_matrix.mat,
                                      tmp_mat)
        self.assert_column_identical(per_s1, core_s)
        self.assertListEqual(per_s1.id2row, per_row)
        self.assertListEqual(per_s1.operations, core_s.operations)

        # One weighting applied to the core space.
        core_s1 = core_s.apply(w1)
        per_s2 = PeripheralSpace(core_s1, DenseMatrix(per_mat), per_row)
        np.testing.assert_array_almost_equal(
            per_s2.cooccurrence_matrix.mat, per_mat_out1)
        self.assert_column_identical(per_s2, core_s1)
        self.assertListEqual(per_s2.id2row, per_row)
        self.assertListEqual(per_s2.operations, core_s1.operations)
        self.assertEqual(len(per_s2.operations), 1)

        # Two weightings applied to the core space.
        core_s2 = core_s1.apply(w2)
        per_s3 = PeripheralSpace(core_s2, DenseMatrix(per_mat), per_row)
        np.testing.assert_array_almost_equal(
            per_s3.cooccurrence_matrix.mat, per_mat_out2)
        self.assert_column_identical(per_s3, core_s2)
        self.assertListEqual(per_s3.id2row, per_row)
        self.assertListEqual(per_s3.operations, core_s2.operations)
        self.assertEqual(len(per_s3.operations), 2)
        np.testing.assert_array_equal(tmp_core_mat,
                                      core_s.cooccurrence_matrix.mat)

        # Building a second peripheral space on the same weighted core
        # must give the same result.
        core_s3 = core_s2
        per_s4 = PeripheralSpace(core_s3, DenseMatrix(per_mat), per_row)
        np.testing.assert_array_almost_equal(
            per_s4.cooccurrence_matrix.mat, per_mat_out2)
        self.assert_column_identical(per_s4, core_s2)
        self.assertListEqual(per_s4.id2row, per_row)
        self.assertListEqual(per_s4.operations, core_s3.operations)
        self.assertEqual(len(per_s4.operations), 2)
        np.testing.assert_array_equal(tmp_core_mat,
                                      core_s.cooccurrence_matrix.mat)
def transform_raw_per_space(raw_per_space, in_file_prefix, out_dir,
                            out_format, core_space_file):
    """Attach raw peripheral rows to a stored core space and write the result.

    Loads the core space from ``core_space_file``, wraps the matrix and row
    indices of ``raw_per_space`` in a ``PeripheralSpace`` based on it, and
    pickles that space under a name derived from the input prefix and the
    core space file name.  If ``out_format`` is not ``None`` the space is
    also exported in that format under the same prefix.
    """
    # Last path component of the input prefix.
    input_name = in_file_prefix.rpartition("/")[2]
    in_file_descr = "PER_SS." + input_name

    core_space = io_utils.load(core_space_file, Space)

    # Core file name without directory and without its last extension
    # (empty when the name contains no dot).
    core_file_name = core_space_file.rpartition("/")[2]
    core_descr = core_file_name.rpartition(".")[0]

    space = PeripheralSpace(
        core_space,
        raw_per_space.cooccurrence_matrix,
        raw_per_space.id2row,
        raw_per_space.row2id,
    )

    print("Printing...")
    out_file_prefix = "%s/%s.%s" % (out_dir, in_file_descr, core_descr)
    io_utils.save(space, out_file_prefix + ".pkl")

    if out_format is None:
        return
    space.export(out_file_prefix, format=out_format)
def build_bigram_space():
    """Build the bigram peripheral space and save it as ``"bigrams_space"``.

    The space is built on top of ``unigrams_space``; the data and column
    files are taken from ``args.function[3]`` and ``args.function[1]``.
    Both names come from the enclosing scope.
    """
    data_source = args.function[3]
    column_source = args.function[1]
    bigrams_space = PeripheralSpace.build(
        unigrams_space,
        data=data_source,
        cols=column_source,
        format="sm",
    )
    save_space(bigrams_space, "bigrams_space")
def test_add_rows_svd(self):
    """Check ``add_rows`` on a peripheral space over a twice SVD-reduced core.

    The core space is reduced with ``Svd(2)`` followed by ``Svd(1)``.  The
    same raw data is inserted once at construction (row ``"e"``) and once
    via ``add_rows`` (row ``"f"``), so the expected matrix is the first row
    of ``self.us2`` stacked twice.
    """
    scenarios = [(self.space2,
                  np.vstack([self.us2[0], self.us2[0]]),
                  self.m1,
                  ["e"],
                  ["f"],
                  {"e": 0, "f": 1})]
    reductions = (Svd(2), Svd(1))

    for core, wanted, raw, first_rows, extra_rows, wanted_row2id in scenarios:
        for reduction in reductions:
            core = core.apply(reduction)

        peripheral = PeripheralSpace(core, DenseMatrix(raw), first_rows)
        peripheral.add_rows(DenseMatrix(raw), extra_rows)

        np.testing.assert_array_almost_equal(
            wanted, peripheral.cooccurrence_matrix.mat, 2)
        self.assertListEqual(peripheral.id2row, first_rows + extra_rows)
        self.assertListEqual(peripheral.id2column, [])
        self.assertDictEqual(peripheral.row2id, wanted_row2id)
        self.assertDictEqual(peripheral.column2id, {})
        self.assertEqual(2, len(peripheral.operations))
def test_add_rows_svd(self):
    """Verify rows added to a peripheral space built on an SVD-reduced core.

    Two SVD reductions (to 2 and then to 1 dimension) are applied to the
    core space before the peripheral space is created.  One row is given
    at construction and a second one through ``add_rows``; matrix, row
    indices, (empty) column indices and operation count are then checked.
    """
    cases = [(
        self.space2,
        np.vstack([self.us2[0], self.us2[0]]),
        self.m1,
        ["e"],
        ["f"],
        {"e": 0, "f": 1},
    )]
    to_two_dims = Svd(2)
    to_one_dim = Svd(1)

    for space, target, matrix, rows_a, rows_b, target_row2id in cases:
        reduced = space.apply(to_two_dims).apply(to_one_dim)

        per_space = PeripheralSpace(reduced, DenseMatrix(matrix), rows_a)
        per_space.add_rows(DenseMatrix(matrix), rows_b)

        np.testing.assert_array_almost_equal(
            target, per_space.cooccurrence_matrix.mat, 2)
        self.assertListEqual(per_space.id2row, rows_a + rows_b)
        self.assertListEqual(per_space.id2column, [])
        self.assertDictEqual(per_space.row2id, target_row2id)
        self.assertDictEqual(per_space.column2id, {})
        self.assertEqual(2, len(per_space.operations))
def test_add_rows(self):
    """Check ``PeripheralSpace.add_rows`` on a plain and a PPMI-weighted core.

    On the plain core space the added row must appear unchanged and the
    row/column indices must be updated or inherited as expected.  On the
    PPMI-weighted core space the stored values must match the weighted
    expectation.  Finally, ``add_rows`` is expected to raise ``ValueError``
    for labels already in the space and for a label list that is longer
    than the one-row matrix.
    """
    scenarios = [(self.space1, self.m2, self.row2,
                  np.array([[4, 2, 6]]),
                  ["c"],
                  np.array([[4, 2, 6], [4, 2, 6]]),
                  np.array([[0.69314718, 0, 0], [0.69314718, 0, 0]]),
                  {"b": 0, "c": 1},
                  ["b", "c"])]

    for scenario in scenarios:
        (core, first_mat, first_rows, extra_mat, extra_rows,
         want_plain, want_weighted, want_row2id, want_id2row) = scenario

        # Plain core space.
        plain = PeripheralSpace(core, DenseMatrix(first_mat), first_rows)
        plain.add_rows(DenseMatrix(extra_mat), extra_rows)

        np.testing.assert_array_almost_equal(
            plain.cooccurrence_matrix.mat, want_plain, 7)
        self.assertDictEqual(plain.row2id, want_row2id)
        self.assertListEqual(plain.id2row, want_id2row)
        self.assertDictEqual(plain.column2id, core.column2id)
        self.assertListEqual(plain.id2column, core.id2column)

        # PPMI-weighted core space.
        weighted_core = core.apply(PpmiWeighting())
        weighted = PeripheralSpace(weighted_core, DenseMatrix(first_mat),
                                   first_rows)
        weighted.add_rows(DenseMatrix(extra_mat), extra_rows)

        np.testing.assert_array_almost_equal(
            weighted.cooccurrence_matrix.mat, want_weighted, 7)

        # Row label lists that must be rejected.
        for rejected_rows in (first_rows, extra_rows, ["d", "e"]):
            self.assertRaises(ValueError, weighted.add_rows,
                              DenseMatrix(extra_mat), rejected_rows)
def test_add_rows(self):
    """Exercise ``add_rows`` of a peripheral space, including rejected input.

    A peripheral space is created with one row and extended by a second
    one, first on the unweighted core space and then on the core space
    after ``PpmiWeighting``.  Matrix contents and indices are compared with
    the expected values.  Three further ``add_rows`` calls are expected to
    raise ``ValueError``.
    """
    cases = [(
        self.space1,
        self.m2,
        self.row2,
        np.array([[4, 2, 6]]),
        ["c"],
        np.array([[4, 2, 6], [4, 2, 6]]),
        np.array([[0.69314718, 0, 0], [0.69314718, 0, 0]]),
        {"b": 0, "c": 1},
        ["b", "c"],
    )]

    for (base, mat_a, rows_a, mat_b, rows_b,
         out_plain, out_ppmi, out_row2id, out_id2row) in cases:

        def filled(core):
            # Peripheral space holding the initial row plus the added one.
            built = PeripheralSpace(core, DenseMatrix(mat_a), rows_a)
            built.add_rows(DenseMatrix(mat_b), rows_b)
            return built

        on_plain = filled(base)
        np.testing.assert_array_almost_equal(
            on_plain.cooccurrence_matrix.mat, out_plain, 7)
        self.assertDictEqual(on_plain.row2id, out_row2id)
        self.assertListEqual(on_plain.id2row, out_id2row)
        self.assertDictEqual(on_plain.column2id, base.column2id)
        self.assertListEqual(on_plain.id2column, base.id2column)

        on_ppmi = filled(base.apply(PpmiWeighting()))
        np.testing.assert_array_almost_equal(
            on_ppmi.cooccurrence_matrix.mat, out_ppmi, 7)

        # Labels used at construction, labels added above, and a label
        # list with two entries for a one-row matrix.
        self.assertRaises(ValueError, on_ppmi.add_rows,
                          DenseMatrix(mat_b), rows_a)
        self.assertRaises(ValueError, on_ppmi.add_rows,
                          DenseMatrix(mat_b), rows_b)
        self.assertRaises(ValueError, on_ppmi.add_rows,
                          DenseMatrix(mat_b), ["d", "e"])
def test_per_space_top_feat_selection(self):
    """Check peripheral spaces built on a feature-selected core space.

    For several numbers of selected dimensions the test verifies that a
    peripheral space built from ``self.a`` keeps only the selected columns
    in the selected order.  This is checked for direct construction, for
    construction followed by ``add_rows``, and with ``PlogWeighting``
    applied to the core space before or after the feature selection.
    """
    # np.asmatrix is used instead of its alias np.mat, which was removed
    # in NumPy 2.0; both construct the same matrix.
    test_cases = [(self.space_d, 1, ["f3"], {"f3": 0},
                   np.asmatrix([[3], [5]])),
                  (self.space_d, 2, ["f3", "f1"], {"f3": 0, "f1": 1},
                   np.asmatrix([[3, 1], [5, 4]])),
                  (self.space_d, 4, ["f3", "f1", "f2"],
                   {"f3": 0, "f1": 1, "f2": 2},
                   np.asmatrix([[3, 1, 2], [5, 4, 0]]))]

    for space_d, no_dim, id2col, col2id, mat in test_cases:
        trans = TopFeatureSelection(no_dim)
        new_space = space_d.apply(trans)

        # peripheral test, simple case
        per_sp = PeripheralSpace(new_space, DenseMatrix(self.a),
                                 ["c", "d"])
        self.assertListEqual(per_sp.id2row, ["c", "d"])
        self.assertListEqual(per_sp.id2column, id2col)
        self.assertDictEqual(per_sp.column2id, col2id)
        np.testing.assert_array_equal(per_sp.cooccurrence_matrix.mat, mat)

        # peripheral test with add rows
        per_sp = PeripheralSpace(new_space, DenseMatrix(self.a[0, :]),
                                 ["c"])
        per_sp.add_rows(DenseMatrix(self.a[1, :]), ["d"])
        self.assertListEqual(per_sp.id2row, ["c", "d"])
        self.assertListEqual(per_sp.id2column, id2col)
        self.assertDictEqual(per_sp.column2id, col2id)
        np.testing.assert_array_equal(per_sp.cooccurrence_matrix.mat, mat)

        # peripheral test, with plog applied to core BEFORE feat selection
        # Expected values: log of the counts, with zeros mapped to log(1).
        plogmat = mat.copy()
        plogmat[plogmat == 0] = 1
        plogmat = np.log(plogmat)
        new_space = space_d.apply(PlogWeighting())
        trans = TopFeatureSelection(no_dim)
        new_space = new_space.apply(trans)
        per_sp = PeripheralSpace(new_space, DenseMatrix(self.a),
                                 ["c", "d"])
        self.assertListEqual(per_sp.id2row, ["c", "d"])
        self.assertListEqual(per_sp.id2column, id2col)
        self.assertDictEqual(per_sp.column2id, col2id)
        np.testing.assert_array_almost_equal(
            per_sp.cooccurrence_matrix.mat, plogmat, 7)

        # peripheral test, with plog applied to core AFTER feat selection
        trans = TopFeatureSelection(no_dim)
        new_space = space_d.apply(trans)
        new_space = new_space.apply(PlogWeighting())
        per_sp = PeripheralSpace(new_space, DenseMatrix(self.a),
                                 ["c", "d"])
        self.assertListEqual(per_sp.id2row, ["c", "d"])
        self.assertListEqual(per_sp.id2column, id2col)
        self.assertDictEqual(per_sp.column2id, col2id)
        np.testing.assert_array_almost_equal(
            per_sp.cooccurrence_matrix.mat, plogmat, 7)
# Train a lexical function composition model on a PPMI-weighted,
# feature-selected and SVD-reduced core space, then compose test phrases.
# ``data_path`` is expected to be defined earlier in the script.
space_file = data_path + "CORE_SS.verbnoun.core.pkl"
space = io_utils.load(space_file)

# Progress messages use the call form of print, which is valid on both
# Python 2 and Python 3 for a single argument.
print("Applying PPMI...")
space = space.apply(PpmiWeighting())

print("Applying feature selection...")
space = space.apply(TopFeatureSelection(2000))

print("Applying SVD...")
space = space.apply(Svd(100))

print("Creating peripheral space..")
per_space = PeripheralSpace.build(space,
                                  data=data_path + "per.raw.SV.sm",
                                  cols=data_path + "per.raw.SV.cols",
                                  format="sm")

# reading in train data
train_data_file = data_path + "ML08_SV_train.txt"
train_data = io_utils.read_tuple_list(train_data_file, fields=[0, 1, 2])

print("Training Lexical Function composition model...")
comp_model = LexicalFunction(learner=RidgeRegressionLearner(param=2))
comp_model.train(train_data, space, per_space)

print("Composing phrases...")
test_phrases_file = data_path + "ML08nvs_test.txt"
test_phrases = io_utils.read_tuple_list(test_phrases_file, fields=[0, 1, 2])
composed_space = comp_model.compose(test_phrases, space)
# ex05.py
# -------
# Build a peripheral space on top of a PPMI-weighted core space.
from composes.utils import io_utils
from composes.semantic_space.peripheral_space import PeripheralSpace
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting

# Load the core space and weight it with PPMI in one step.
my_space = io_utils.load("./data/out/ex01.pkl").apply(PpmiWeighting())

print(my_space.cooccurrence_matrix)
print(my_space.id2row)

# Create the peripheral space from the raw peripheral data, based on the
# weighted core space.
my_per_space = PeripheralSpace.build(
    my_space,
    data="./data/in/ex05.sm",
    cols="./data/in/ex05.cols",
    format="sm",
)

print(my_per_space.cooccurrence_matrix)
print(my_per_space.id2row)

# Persist the peripheral space.
io_utils.save(my_per_space, "./data/out/PER_SS.ex05.pkl")
# Pipeline: load the core space, apply PPMI weighting, top feature
# selection and SVD, build the peripheral space, train a lexical function
# model and compose the test phrases.  ``data_path`` is expected to be
# defined earlier in the script.
space_file = data_path + "CORE_SS.verbnoun.core.pkl"
space = io_utils.load(space_file)

# print is written as a call so the script also parses on Python 3; with
# a single argument the output on Python 2 is unchanged.
print("Applying PPMI...")
space = space.apply(PpmiWeighting())

print("Applying feature selection...")
space = space.apply(TopFeatureSelection(2000))

print("Applying SVD...")
space = space.apply(Svd(100))

print("Creating peripheral space..")
per_space = PeripheralSpace.build(space,
                                  data=data_path + "per.raw.SV.sm",
                                  cols=data_path + "per.raw.SV.cols",
                                  format="sm")

# reading in train data
train_data_file = data_path + "ML08_SV_train.txt"
train_data = io_utils.read_tuple_list(train_data_file, fields=[0, 1, 2])

print("Training Lexical Function composition model...")
comp_model = LexicalFunction(learner=RidgeRegressionLearner(param=2))
comp_model.train(train_data, space, per_space)

print("Composing phrases...")
test_phrases_file = data_path + "ML08nvs_test.txt"
test_phrases = io_utils.read_tuple_list(test_phrases_file, fields=[0, 1, 2])
composed_space = comp_model.compose(test_phrases, space)
# ex05.py
# -------
# Build a peripheral space on top of a PPMI-weighted core space.
from composes.utils import io_utils
from composes.semantic_space.peripheral_space import PeripheralSpace
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting

# load a space and apply ppmi on it
my_space = io_utils.load("./data/out/ex01.pkl")
my_space = my_space.apply(PpmiWeighting())

# print is written as a call: valid on Python 2 and Python 3, and with a
# single argument the output is the same on both.
print(my_space.cooccurrence_matrix)
print(my_space.id2row)

# create a peripheral space
my_per_space = PeripheralSpace.build(my_space,
                                     data="./data/in/ex05.sm",
                                     cols="./data/in/ex05.cols",
                                     format="sm")

print(my_per_space.cooccurrence_matrix)
print(my_per_space.id2row)

# save the space
io_utils.save(my_per_space, "./data/out/PER_SS.ex05.pkl")
def train_baroni_guevara_composers(all_vectors, ROOT_DIR, baroni_output_path, guevara_output_path,
                                   baroni_threshold=10):
    """
    Train a LexicalFunction ("Baroni") and a FullAdditive ("Guevara") composer
    on the same data and save each of them.

    Noun vectors and phrase vectors are written from `all_vectors` into two
    temporary files under `ROOT_DIR`, converted for Dissect, and loaded as a
    core space and a peripheral space respectively.

    :type all_vectors: str; path to vectors file containing both N and observed AN vectors
    :type ROOT_DIR: str; where to write temp files
    :type baroni_output_path: str; where to write pickled baroni composer
    :type guevara_output_path: str; where to write the trained guevara composer
    :type baroni_threshold: int; passed to LexicalFunction as `min_samples`
    """
    SVD_DIMS = 100
    # what kind of NPs to train Baroni composer for
    baroni_training_phrase_types = {'AN', 'NN'}

    def is_noun_unigram(entry):
        return entry.type == '1-GRAM' and entry.tokens[0].pos == 'N'

    def is_training_phrase(entry):
        return entry.type in baroni_training_phrase_types

    def underscore_joined(entry):
        return str(entry).replace(' ', '_')

    # prepare the input files to be fed into Dissect
    mkdirs_if_not_exists(ROOT_DIR)
    filename = basename(all_vectors)
    noun_events_file = join(ROOT_DIR, '%s-onlyN-SVD%d.tmp' % (filename, SVD_DIMS))
    NPs_events_file = join(ROOT_DIR, '%s-onlyPhrases-SVD%d.tmp' % (filename, SVD_DIMS))

    thes = Vectors.from_tsv(all_vectors, lowercasing=False)
    thes.to_tsv(noun_events_file, entry_filter=is_noun_unigram)
    _translate_byblo_to_dissect(noun_events_file)
    thes.to_tsv(NPs_events_file,
                entry_filter=is_training_phrase,
                row_transform=underscore_joined)
    _translate_byblo_to_dissect(NPs_events_file)

    # core space: the unigram (noun) vectors
    my_space = Space.build(
        data="{}.sm".format(noun_events_file),
        rows="{}.rows".format(noun_events_file),
        cols="{}.cols".format(noun_events_file),
        format="sm",
    )
    logging.info('Each unigram vector has dimensionality %r', my_space.element_shape)

    # peripheral space: the observed phrase vectors.
    # The columns of the peripheral space have to be identical to those
    # in the core space (including their order)!
    my_per_space = PeripheralSpace.build(
        my_space,
        data="{}.sm".format(NPs_events_file),
        rows="{}.rows".format(NPs_events_file),
        cols="{}.cols".format(NPs_events_file),
        format="sm",
    )
    logging.info('Each phrase vector has dimensionality %r', my_per_space.element_shape)

    # collect (first word, second word, phrase) training triples
    all_data = []
    for row_label in my_per_space._row2id:
        feature = DocumentFeature.from_string(row_label.replace(' ', '_'))
        # make sure there are only NPs here
        if feature.type not in baroni_training_phrase_types:
            continue
        first_word, second_word = row_label.split('_')
        all_data.append((first_word, second_word, '%s_%s' % (first_word, second_word)))

    # train a composition model on the data and save it
    baroni = LexicalFunction(min_samples=baroni_threshold, learner=RidgeRegressionLearner())
    guevara = FullAdditive(learner=RidgeRegressionLearner())
    targets = ((baroni, baroni_output_path), (guevara, guevara_output_path))
    for composer, out_path in targets:
        composer.train(all_data, my_space, my_per_space)
        io_utils.save(composer, out_path)
        logging.info('Saved trained composer to %s', out_path)
def test_per_space_top_feat_selection(self):
    """Check that peripheral spaces follow the core's top feature selection.

    Each case selects a number of top features on ``self.space_d`` and
    states the expected column order, column index and peripheral matrix
    for the data in ``self.a``.  The expectations are checked for a
    directly built peripheral space, for one filled via ``add_rows``, and
    for core spaces that were ``PlogWeighting``-weighted before or after
    the selection.
    """
    # np.mat no longer exists in NumPy 2.0; np.asmatrix is the same
    # function under its surviving name.
    test_cases = [(self.space_d, 1, ["f3"], {"f3": 0},
                   np.asmatrix([[3], [5]])),
                  (self.space_d, 2, ["f3", "f1"], {"f3": 0, "f1": 1},
                   np.asmatrix([[3, 1], [5, 4]])),
                  (self.space_d, 4, ["f3", "f1", "f2"],
                   {"f3": 0, "f1": 1, "f2": 2},
                   np.asmatrix([[3, 1, 2], [5, 4, 0]]))
                  ]

    for space_d, no_dim, id2col, col2id, mat in test_cases:
        trans = TopFeatureSelection(no_dim)
        new_space = space_d.apply(trans)

        # peripheral test simple test
        per_sp = PeripheralSpace(new_space, DenseMatrix(self.a), ["c", "d"])
        self.assertListEqual(per_sp.id2row, ["c", "d"])
        self.assertListEqual(per_sp.id2column, id2col)
        self.assertDictEqual(per_sp.column2id, col2id)
        np.testing.assert_array_equal(per_sp.cooccurrence_matrix.mat, mat)

        # peripheral test with add rows
        per_sp = PeripheralSpace(new_space, DenseMatrix(self.a[0, :]), ["c"])
        per_sp.add_rows(DenseMatrix(self.a[1, :]), ["d"])
        self.assertListEqual(per_sp.id2row, ["c", "d"])
        self.assertListEqual(per_sp.id2column, id2col)
        self.assertDictEqual(per_sp.column2id, col2id)
        np.testing.assert_array_equal(per_sp.cooccurrence_matrix.mat, mat)

        # peripheral test, with plog applied to core BEFORE feat selection
        # Expected matrix: element-wise log, zeros replaced by 1 first so
        # that they map to 0.
        plogmat = mat.copy()
        plogmat[plogmat == 0] = 1
        plogmat = np.log(plogmat)
        new_space = space_d.apply(PlogWeighting())
        trans = TopFeatureSelection(no_dim)
        new_space = new_space.apply(trans)
        per_sp = PeripheralSpace(new_space, DenseMatrix(self.a), ["c", "d"])
        self.assertListEqual(per_sp.id2row, ["c", "d"])
        self.assertListEqual(per_sp.id2column, id2col)
        self.assertDictEqual(per_sp.column2id, col2id)
        np.testing.assert_array_almost_equal(per_sp.cooccurrence_matrix.mat,
                                             plogmat, 7)

        # peripheral test, with plog applied to core AFTER feat selection
        trans = TopFeatureSelection(no_dim)
        new_space = space_d.apply(trans)
        new_space = new_space.apply(PlogWeighting())
        per_sp = PeripheralSpace(new_space, DenseMatrix(self.a), ["c", "d"])
        self.assertListEqual(per_sp.id2row, ["c", "d"])
        self.assertListEqual(per_sp.id2column, id2col)
        self.assertDictEqual(per_sp.column2id, col2id)
        np.testing.assert_array_almost_equal(per_sp.cooccurrence_matrix.mat,
                                             plogmat, 7)
# Flush stderr before the (potentially slow) build of the core space below.
sys.stderr.flush()
# Build the core space from the corpus data, row and column files.
gastrovec = Space.build(data = "../corpus_collection/corpus.sm",
                        rows = "../corpus_collection/corpus.rows",
                        cols = "../corpus_collection/corpus.cols",
                        format = "sm")
print("done.", file=sys.stderr)
# Save the unweighted core space.
io_utils.save(gastrovec, "gastrovec.pkl")
# Progress messages are printed without a trailing newline and flushed
# right away so they are visible while the following step runs.
print("Applying PPMI... ",end="", file=sys.stderr)
sys.stderr.flush()
gastrovec = gastrovec.apply(PpmiWeighting())
# NOTE(review): no "done." is printed after the PPMI step, so the SVD
# message continues on the same output line -- confirm this is intended.
print("Applying SVD (20)... ",end="",file=sys.stderr)
sys.stderr.flush()
gastrovec = gastrovec.apply(Svd(20))
print("done.", file=sys.stderr)
# Save the core space after PPMI weighting and SVD.
io_utils.save(gastrovec, "gastrovec.ppmi.svd20.pkl")
print("Loading recipe peripheral space...",end="",file=sys.stderr)
sys.stderr.flush()
# Build the recipe peripheral space on top of the transformed core space.
recipes = PeripheralSpace.build(gastrovec,
                                data = "../corpus_collection/recipes.sm",
                                rows = "../corpus_collection/recipes.rows",
                                cols = "../corpus_collection/recipes.cols",
                                format = "sm")
print("done.", file=sys.stderr)
io_utils.save(recipes, "recipes.ppmi.svd20.pkl")