def test_simple_sparse_zipped(self):
    """Run build_core_space on ``mat1`` with ``--gz True`` and compare spaces.

    The space built from ``mat1.sm.gz`` must equal the exported
    ``CORE_SS.mat1.sm``, the pickled ``CORE_SS.mat1.pkl`` and the space
    built from the plain ``mat1.sm``.
    """
    base = self.dir_
    bcs.main(["build_core_space.py",
              "-l", base + "log1.txt",
              "-i", base + "mat1",
              "-o", base,
              "--input_format", "sm",
              "--output_format", "sm",
              "--gz", "True"])

    zipped = Space.build(data=base + "mat1.sm.gz",
                         cols=base + "mat1.cols",
                         format="sm")
    exported = Space.build(data=base + "CORE_SS.mat1.sm",
                           cols=base + "CORE_SS.mat1.cols",
                           format="sm")
    pickled = io_utils.load(base + "CORE_SS.mat1.pkl", Space)
    plain = Space.build(data=base + "mat1.sm",
                        cols=base + "mat1.cols",
                        format="sm")

    # The gzipped input is the reference for all three comparisons.
    for other in (exported, pickled, plain):
        self._test_equal_spaces_sparse(zipped, other)
def test_build_data(self):
    """Build spaces from ``.sparse`` / ``.dense`` fixtures and check them.

    Each case is ``(name, rows, cols, sparse_matrix, dense_matrix)``.
    The sparse fixture is read together with ``<name>.cols``; the dense
    fixture is read without a column file, so its ``id2column`` is expected
    to be empty. A dense matrix of ``None`` skips the dense check.
    """
    cases = [
        ("data1", ["red", "blue"], ["car", "man"],
         np.mat([[3, 5], [0, 10]]), np.mat([[3, 5], [0, 10]])),
        ("data2", ["red"], ["car"],
         np.mat([[3]]), np.mat([[3]])),
        ("data3", ["red", "blue"], ["car", "man"],
         np.mat([[15, 0], [0, 6]]), np.mat([[5, 0], [0, 6]])),
        ("data7", ["red"], ["car"],
         np.mat([[0]]), np.mat([[0]])),
        ("data9", ["man"], ["car"],
         np.mat([[4]]), None),
    ]

    for name, exp_rows, exp_cols, exp_sparse, exp_dense in cases:
        prefix = self.dir_ + name

        sparse_space = Space.build(data=prefix + ".sparse",
                                   cols=prefix + ".cols",
                                   format="sm")
        self.assertListEqual(exp_rows, sparse_space.id2row)
        self.assertListEqual(exp_cols, sparse_space.id2column)
        self.assertIsInstance(sparse_space.cooccurrence_matrix, SparseMatrix)
        np.testing.assert_array_equal(
            exp_sparse, sparse_space.cooccurrence_matrix.mat.todense())

        if exp_dense is None:
            continue

        dense_space = Space.build(data=prefix + ".dense", format="dm")
        self.assertListEqual(exp_rows, dense_space.id2row)
        self.assertListEqual([], dense_space.id2column)
        self.assertIsInstance(dense_space.cooccurrence_matrix, DenseMatrix)
        np.testing.assert_array_equal(
            exp_dense, dense_space.cooccurrence_matrix.mat)
def eval_on_file(path_composed_emb, path_observed_emb, save_path):
    """Rank composed embeddings against observed ones and write the results.

    Both spaces are loaded in "dm" format and row-normalised by length.
    Every composed word must also be a row of the observed space (checked
    with ``assert``). Three files prefixed with ``save_path`` are written:
    ``_rankedCompounds.txt``, ``_ranks.txt`` (sorted rank values) and
    ``_quartiles.txt``.

    Returns:
        tuple: ``(q1, q2, q3, ranks)`` as produced by ``evaluateRank``.
    """
    def load_normalized(path):
        # Same two steps for both inputs: read "dm" data, normalise rows.
        return Space.build(data=path, format='dm').apply(RowNormalization('length'))

    observed_space = load_normalized(path_observed_emb)
    observed_words = observed_space.get_id2row()
    print("Observed words, size: " + str(len(observed_words)) + ", first:")
    print(observed_words[:10])
    observed_words_set = set(observed_words)

    composed_space = load_normalized(path_composed_emb)
    composed_words = composed_space.get_id2row()
    print("Composed words, size: " + str(len(composed_words)) + ", first:")
    print(composed_words[:10])

    # all composed words should be in the initial space
    for word in composed_words:
        assert word in observed_words_set

    quartiles_and_ranks = evaluateRank(composed_words, composed_space, observed_space)
    q1, q2, q3, ranks = quartiles_and_ranks
    print("Q1: " + str(q1) + ", Q2: " + str(q2) + ", Q3: " + str(q3))

    printDictToFile(ranks, save_path + '_rankedCompounds.txt')
    printListToFile(sorted(ranks.values()), save_path + '_ranks.txt')
    logResult(q1, q2, q3, save_path + '_quartiles.txt')
    return q1, q2, q3, ranks
def test_simple_dense(self):
    """Check build_core_space on the dense ``mat2`` input, then on its pickle.

    First run ("dm" in, "dm" out): the source space must equal both the
    exported ``CORE_SS.mat2.dm`` and the pickled ``CORE_SS.mat2.pkl``.
    Second run ("pkl" in): the resulting ``CORE_SS.CORE_SS.mat2.pkl`` must
    equal ``CORE_SS.mat2.pkl``.
    """
    base = self.dir_

    def run_core(input_name, input_format):
        bcs.main(["build_core_space.py",
                  "-l", base + "log1.txt",
                  "-i", base + input_name,
                  "-o", base,
                  "--input_format", input_format,
                  "--output_format", "dm"])

    run_core("mat2", "dm")
    source = Space.build(data=base + "mat2.dm", format="dm")
    exported = Space.build(data=base + "CORE_SS.mat2.dm", format="dm")
    pickled = io_utils.load(base + "CORE_SS.mat2.pkl", Space)
    self._test_equal_spaces_dense(source, exported)
    self._test_equal_spaces_dense(source, pickled)

    # Feed the pickle produced above back into the tool.
    run_core("CORE_SS.mat2", "pkl")
    rebuilt = io_utils.load(base + "CORE_SS.CORE_SS.mat2.pkl", Space)
    pickled = io_utils.load(base + "CORE_SS.mat2.pkl", Space)
    self._test_equal_spaces_dense(rebuilt, pickled)
def train_from_core(lexical_space_file, an_dn_file, pn_file, sv_file, vo_file,
                    output_file_prefix):
    """Train on the core and phrase spaces and export the result as "dm".

    All five input files are loaded with ``Space.build(..., format="dm")``,
    passed to ``train_all_spaces`` and the returned space is exported to
    ``output_file_prefix``.

    Args:
        lexical_space_file: path of the core (lexical) space.
        an_dn_file, pn_file, sv_file, vo_file: paths of the phrase spaces.
        output_file_prefix: prefix handed to ``export``.

    Raises:
        IOError: if any of the five input files does not exist. Previously
            the function only printed a message and then went on to load
            the missing file anyway.
    """
    input_files = [lexical_space_file, an_dn_file, pn_file, sv_file, vo_file]
    missing = [path for path in input_files if not exists(path)]
    if missing:
        # Keep the original console report, then stop instead of falling
        # through to Space.build on a file that is known to be absent.
        print("some file doesn't exist")
        print(" ".join(input_files))
        raise IOError("missing input file(s): " + ", ".join(missing))

    print("load core")
    core_space = Space.build(data=lexical_space_file, format="dm")
    print("load an dn")
    an_dn_space = Space.build(data=an_dn_file, format="dm")
    print("load pn")
    pn_space = Space.build(data=pn_file, format="dm")
    print("load sv")
    sv_space = Space.build(data=sv_file, format="dm")
    print("load vo")
    vo_space = Space.build(data=vo_file, format="dm")

    print("start training")
    all_mat_space_normed = train_all_spaces(core_space, an_dn_space, pn_space,
                                            sv_space, vo_space)
    print("exporting trained file")
    all_mat_space_normed.export(output_file_prefix, format="dm")
    print("DONE")
def test_simple_dense(self):
    """Dense round trip through build_core_space for ``mat2``.

    Run 1 reads ``mat2`` as "dm" and writes "dm"; the exported matrix and
    the pickle must both match the input. Run 2 reads the pickle from run 1
    ("pkl" input) and its pickle must match the first one.
    """
    out_dir = self.dir_

    def core_args(in_prefix, in_format):
        # Only the input prefix and input format differ between the runs.
        return ["build_core_space.py",
                "-l", out_dir + "log1.txt",
                "-i", out_dir + in_prefix,
                "-o", out_dir,
                "--input_format", in_format,
                "--output_format", "dm"]

    bcs.main(core_args("mat2", "dm"))
    reference = Space.build(data=out_dir + "mat2.dm", format="dm")
    from_dm = Space.build(data=out_dir + "CORE_SS.mat2.dm", format="dm")
    from_pkl = io_utils.load(out_dir + "CORE_SS.mat2.pkl", Space)
    for candidate in (from_dm, from_pkl):
        self._test_equal_spaces_dense(reference, candidate)

    bcs.main(core_args("CORE_SS.mat2", "pkl"))
    second_pkl = io_utils.load(out_dir + "CORE_SS.CORE_SS.mat2.pkl", Space)
    first_pkl = io_utils.load(out_dir + "CORE_SS.mat2.pkl", Space)
    self._test_equal_spaces_dense(second_pkl, first_pkl)
def eval_on_file(path_composed_emb, path_observed_emb, save_path):
    """Evaluate composed embeddings by their rank among observed embeddings.

    The observed and the composed space are read as "dm" files and each is
    row-normalised by length. The function asserts that every composed word
    is also present in the observed space, calls ``evaluateRank`` and writes
    ``<save_path>_rankedCompounds.txt``, ``<save_path>_ranks.txt`` and
    ``<save_path>_quartiles.txt``.

    Returns:
        tuple: ``(q1, q2, q3, ranks)`` from ``evaluateRank``.
    """
    length_norm = RowNormalization('length')
    observed_space = Space.build(data=path_observed_emb, format='dm').apply(length_norm)
    observed_words = observed_space.get_id2row()
    print("Observed words, size: " + str(len(observed_words)) + ", first:")
    print(observed_words[:10])
    observed_words_set = set(observed_words)

    length_norm = RowNormalization('length')
    composed_space = Space.build(data=path_composed_emb, format='dm').apply(length_norm)
    composed_words = composed_space.get_id2row()
    print("Composed words, size: " + str(len(composed_words)) + ", first:")
    print(composed_words[:10])

    # all composed words should be in the initial space
    for composed_word in composed_words:
        assert composed_word in observed_words_set

    q1, q2, q3, ranks = evaluateRank(composed_words, composed_space, observed_space)
    print("Q1: " + str(q1) + ", Q2: " + str(q2) + ", Q3: " + str(q3))

    printDictToFile(ranks, save_path + '_rankedCompounds.txt')
    sorted_ranks = sorted(ranks.values())
    printListToFile(sorted_ranks, save_path + '_ranks.txt')
    logResult(q1, q2, q3, save_path + '_quartiles.txt')

    return q1, q2, q3, ranks
def test_as_conversion_tool(self):
    """Use build_core_space purely to convert ``mat3`` between formats.

    Three conversions are run (sm -> sm, sm -> dm, dm -> dm). After each one
    the exported files and the pickle are loaded and compared.
    """
    d = self.dir_
    core = d + "CORE_SS.mat3"

    def convert(in_format, out_format):
        bcs.main(["build_core_space.py",
                  "-i", d + "mat3",
                  "-o", d,
                  "--input_format", in_format,
                  "--output_format", out_format])

    # sparse -> sparse
    convert("sm", "sm")
    original = Space.build(data=d + "mat3.sm", cols=d + "mat3.cols", format="sm")
    exported = Space.build(data=core + ".sm", rows=core + ".rows",
                           cols=core + ".cols", format="sm")
    pickled = io_utils.load(core + ".pkl", Space)
    self._test_equal_spaces_sparse(original, exported)
    self._test_equal_spaces_sparse(original, pickled)

    # sparse -> dense
    convert("sm", "dm")
    original = Space.build(data=d + "mat3.dm", cols=core + ".cols", format="dm")
    exported = Space.build(data=core + ".dm", rows=core + ".rows",
                           cols=core + ".cols", format="dm")
    pickled = io_utils.load(core + ".pkl", Space)
    self._test_equal_spaces_dense(original, exported)
    # The pickled space is converted to dense before the dense comparison.
    pickled.to_dense()
    self._test_equal_spaces_dense(original, pickled)

    # dense -> dense
    convert("dm", "dm")
    exported = Space.build(data=core + ".dm", cols=core + ".cols", format="dm")
    pickled = io_utils.load(core + ".pkl", Space)
    pickled.to_dense()
    self._test_equal_spaces_dense(exported, pickled)
def test_simple_lstsq_no_inter(self):
    """Train a lexical function without intercept, with lstsq and with ridge.

    Both regressions (ridge uses lambda 0 and no cross-validation) are run
    through train_composition; each time the learned function space must be
    ``[1, 0, 0, 1]`` for the single row "big", with element shape (2, 2),
    and must equal the exported ``.params.dm`` file.
    """
    d = self.dir_
    model_prefix = d + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt"
    regressions = (
        ["-r", "lstsq"],
        ["-r", "ridge", "--lambda", "0", "--crossvalidation", "False"],
    )

    for regression_args in regressions:
        tc.main(["train_composition.py",
                 "-l", d + "log1.txt",
                 "-i", d + "an_train_data.txt",
                 "-o", d,
                 "-m", "lexical_func",
                 "-p", d + "CORE_SS.AN_mat.pkl",
                 "-a", d + "CORE_SS.N_mat.pkl"]
                + regression_args
                + ["--intercept", "False",
                   "--export_params", "True"])

        function_space = io_utils.load(model_prefix + ".pkl").function_space

        np.testing.assert_array_almost_equal(
            function_space.cooccurrence_matrix.mat, np.mat([1, 0, 0, 1]), 10)
        self.assertTupleEqual(function_space.element_shape, (2, 2))
        self.assertListEqual(function_space.id2row, ["big"])
        self.assertListEqual(function_space.id2column, [])

        exported_params = Space.build(data=model_prefix + ".params.dm", format="dm")
        self._test_equal_spaces_dense(exported_params, function_space)
def test_simple_ops(self):
    """Peripheral spaces built on reduced core spaces must equal those cores.

    build_core_space is run once on ``mat3`` with three selections and two
    SVD reductions. For each resulting core space, build_peripheral_space is
    run on ``mat3`` twice (dense, then sparse input) and the exported
    peripheral space is compared with the core space.
    """
    d = self.dir_
    bcs.main(["build_core_space.py",
              "-l", d + "log1.txt",
              "-i", d + "mat3",
              "-w", "raw",
              "-s", "top_sum_3,top_length_3,top_sum_4",
              "-r", "svd_2,svd_1",
              "-o", d,
              "--input_format", "dm",
              "--output_format", "dm"])

    core_mats = ["CORE_SS.mat3.raw.top_sum_3.svd_2",
                 "CORE_SS.mat3.raw.top_sum_3.svd_1",
                 "CORE_SS.mat3.raw.top_length_3.svd_2",
                 "CORE_SS.mat3.raw.top_length_3.svd_1",
                 "CORE_SS.mat3.raw.top_sum_4.svd_2",
                 "CORE_SS.mat3.raw.top_sum_4.svd_1"]

    # All core spaces are loaded before any peripheral run, as before.
    core_spaces = []
    for core_name in core_mats:
        core_spaces.append(Space.build(data=d + core_name + ".dm", format="dm"))

    for core_name, core_space in zip(core_mats, core_spaces):
        for input_format in ("dm", "sm"):
            bps.main(["build_peripheral_space.py",
                      "-l", d + "log1.txt",
                      "-i", d + "mat3",
                      "-o", d,
                      "-c", d + core_name + ".pkl",
                      "--input_format", input_format,
                      "--output_format", "dm"])
            peripheral_file = d + "PER_SS.mat3." + core_name + ".dm"
            peripheral = Space.build(data=peripheral_file, format="dm")
            self._test_equal_spaces_dense(core_space, peripheral)
def test_simple_dense(self):
    """Build a dense peripheral space for ``mat2`` on the ``mat2`` core.

    The exported ``PER_SS.mat2.CORE_SS.mat2.dm`` must equal the space read
    directly from ``mat2.dm``.
    """
    workdir = self.dir_
    cli_args = [
        "build_peripheral_space.py",
        "-l", workdir + "log1.txt",
        "-i", workdir + "mat2",
        "-o", workdir,
        "-c", workdir + "CORE_SS.mat2.pkl",
        "--input_format", "dm",
        "--output_format", "dm",
    ]
    bps.main(cli_args)

    expected = Space.build(data=workdir + "mat2.dm", format="dm")
    peripheral = Space.build(data=workdir + "PER_SS.mat2.CORE_SS.mat2.dm",
                             format="dm")
    self._test_equal_spaces_dense(expected, peripheral)
def test_simple_lstsq_no_inter(self):
    """Lexical-function training without intercept: lstsq, then ridge.

    The ridge run uses ``--lambda 0`` and ``--crossvalidation False``. After
    each run the trained function space is checked against the expected
    values and against the exported ``.params.dm`` file.
    """
    d = self.dir_
    trained_pkl = d + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl"
    params_dm = d + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm"

    def train(regression_args):
        """Run train_composition and return the trained function space."""
        cli_args = [
            "train_composition.py",
            "-l", d + "log1.txt",
            "-i", d + "an_train_data.txt",
            "-o", d,
            "-m", "lexical_func",
            "-p", d + "CORE_SS.AN_mat.pkl",
            "-a", d + "CORE_SS.N_mat.pkl",
        ]
        cli_args.extend(regression_args)
        cli_args.extend(["--intercept", "False", "--export_params", "True"])
        tc.main(cli_args)
        return io_utils.load(trained_pkl).function_space

    def check(function_space):
        np.testing.assert_array_almost_equal(
            function_space.cooccurrence_matrix.mat, np.mat([1, 0, 0, 1]), 10)
        self.assertTupleEqual(function_space.element_shape, (2, 2))
        self.assertListEqual(function_space.id2row, ["big"])
        self.assertListEqual(function_space.id2column, [])
        exported = Space.build(data=params_dm, format="dm")
        self._test_equal_spaces_dense(exported, function_space)

    check(train(["-r", "lstsq"]))
    check(train(["-r", "ridge", "--lambda", "0", "--crossvalidation", "False"]))
def test_as_conversion_tool(self):
    """Convert ``mat3`` with build_core_space: sm -> sm, sm -> dm, dm -> dm.

    Each conversion is followed by loading the exported matrix and the
    pickle and comparing them; pickled spaces are converted with
    ``to_dense()`` before a dense comparison.
    """
    workdir = self.dir_
    out_prefix = workdir + "CORE_SS.mat3"
    rows_file = out_prefix + ".rows"
    cols_file = out_prefix + ".cols"
    pkl_file = out_prefix + ".pkl"

    def run(in_format, out_format):
        bcs.main([
            "build_core_space.py",
            "-i", workdir + "mat3",
            "-o", workdir,
            "--input_format", in_format,
            "--output_format", out_format,
        ])

    run("sm", "sm")
    s1 = Space.build(data=workdir + "mat3.sm",
                     cols=workdir + "mat3.cols",
                     format="sm")
    s2 = Space.build(data=out_prefix + ".sm", rows=rows_file, cols=cols_file,
                     format="sm")
    s3 = io_utils.load(pkl_file, Space)
    for other in (s2, s3):
        self._test_equal_spaces_sparse(s1, other)

    run("sm", "dm")
    s1 = Space.build(data=workdir + "mat3.dm", cols=cols_file, format="dm")
    s2 = Space.build(data=out_prefix + ".dm", rows=rows_file, cols=cols_file,
                     format="dm")
    s3 = io_utils.load(pkl_file, Space)
    self._test_equal_spaces_dense(s1, s2)
    s3.to_dense()
    self._test_equal_spaces_dense(s1, s3)

    run("dm", "dm")
    s1 = Space.build(data=out_prefix + ".dm", cols=cols_file, format="dm")
    s3 = io_utils.load(pkl_file, Space)
    s3.to_dense()
    self._test_equal_spaces_dense(s1, s3)
def test_to_dissect_sparse_files(vectors_c, tmpdir):
    """
    Export ``vectors_c`` to sparse files and check they read back unchanged.

    :type vectors_c: Thesaurus
    :type tmpdir: py.path.local
    """
    from composes.semantic_space.space import Space

    prefix = str(tmpdir.join('output'))
    vectors_c.to_dissect_sparse_files(prefix)

    # check that files are there
    outfiles = ['{}.{}'.format(prefix, suffix) for suffix in ['sm', 'rows', 'cols']]
    for outfile in outfiles:
        assert os.path.exists(outfile)
        assert os.path.isfile(outfile)

    # check that reading the files in results in the same matrix
    data_file, rows_file, cols_file = outfiles
    space = Space.build(data=data_file, rows=rows_file, cols=cols_file,
                        format="sm")
    matrix = space.cooccurrence_matrix.mat
    rows = space.id2row
    cols = space.id2column

    exp_matrix, exp_cols, exp_rows = vectors_c.to_sparse_matrix()

    assert exp_cols == cols
    assert exp_rows == rows
    assert_array_equal(exp_matrix.A, matrix.A)

    _assert_matrix_of_thesaurus_c_is_as_expected(matrix.A, rows, cols)
    _assert_matrix_of_thesaurus_c_is_as_expected(exp_matrix.A, exp_rows, exp_cols)
def build_raw_per_space(in_file_prefix, in_format, is_gz):
    """Load the raw input space from files sharing ``in_file_prefix``.

    Args:
        in_file_prefix: path prefix; the data file is
            ``<prefix>.<in_format>`` (plus ``.gz`` when ``is_gz`` is true),
            with optional ``<prefix>.rows`` and ``<prefix>.cols`` files.
        in_format: one of "sm", "dm" or "pkl".
        is_gz: whether the "sm"/"dm" data file is gzipped. Ignored for
            "pkl" input.

    Returns:
        The space loaded with ``io_utils.load`` ("pkl") or built with
        ``Space.build`` ("sm"/"dm").

    Raises:
        ValueError: for an unknown ``in_format``, or when the column file
            is missing for "sm" input.
    """
    if in_format not in ("sm", "dm", "pkl"):
        raise ValueError("Invalid input format:%s" % in_format)

    data_file = '%s.%s' % (in_file_prefix, in_format)
    if in_format == "pkl":
        return io_utils.load(data_file, Space)

    if is_gz:
        data_file = '%s.gz' % data_file

    # A missing row file is passed on as None.
    row_file = '%s.rows' % (in_file_prefix)
    if not os.path.exists(row_file):
        row_file = None

    # A missing column file is an error for "sm" input, None for "dm".
    column_file = '%s.cols' % (in_file_prefix)
    if not os.path.exists(column_file):
        if in_format == "sm":
            raise ValueError("Column file: %s needs to be provided!"
                             % column_file)
        column_file = None

    print("Building matrix...")
    return Space.build(data=data_file, rows=row_file, cols=column_file,
                       format=in_format)
def build_raw_per_space(in_file_prefix, in_format, is_gz):
    """Load the raw input space from files sharing ``in_file_prefix``.

    Args:
        in_file_prefix: path prefix of the input files. The data file is
            ``<prefix>.<in_format>`` (``.gz`` appended when ``is_gz``);
            ``<prefix>.rows`` and ``<prefix>.cols`` are used if they exist.
        in_format: "sm", "dm" or "pkl".
        is_gz: whether the "sm"/"dm" data file is gzipped; not used for
            "pkl" input.

    Returns:
        The loaded (``io_utils.load``) or built (``Space.build``) space.

    Raises:
        ValueError: if ``in_format`` is not supported, or if the column
            file does not exist for "sm" input.
    """
    if in_format not in ("sm", "dm", "pkl"):
        raise ValueError("Invalid input format:%s" % in_format)

    data_file = "%s.%s" % (in_file_prefix, in_format)

    if in_format == "pkl":
        space = io_utils.load(data_file, Space)
    else:
        if is_gz:
            data_file = "%s.gz" % data_file

        row_file = "%s.rows" % (in_file_prefix)
        column_file = "%s.cols" % (in_file_prefix)

        # A missing row file is passed on as None.
        if not os.path.exists(row_file):
            row_file = None

        # A missing column file is only tolerated for dense input.
        if not os.path.exists(column_file):
            if in_format == "sm":
                raise ValueError("Column file: %s needs to be provided!"
                                 % column_file)
            column_file = None

        # Single-argument print call: same output on Python 2 and 3.
        print("Building matrix...")
        space = Space.build(data=data_file, rows=row_file, cols=column_file,
                            format=in_format)

    return space
def test_build_data_row_col(self):
    """Build spaces with explicit row and column files and check the result.

    Each case is ``(data, row_file, col_file, rows, cols, sparse, dense)``.
    An expected matrix of ``None`` means ``Space.build`` must raise
    ``ValueError`` for that format.
    """
    cases = [
        ("data1", "row1.row", "col1.col", ["red"], ["man", "car"],
         np.mat([[5, 3]]), np.mat([[3, 5]])),
        ("data1", "row1.row", "col5.col", ["red"], ["man", "car"],
         np.mat([[5, 3]]), np.mat([[3, 5]])),
        ("data3", "row2.row", "col2.col", ["blue", "red"], ["car"],
         np.mat([[0], [15]]), None),
        ("data2", "row1.row", "col1.col", ["red"], ["man", "car"],
         np.mat([[0, 3]]), None),
        ("data3", "row3.row", "col3.col", ["blue", "red"], ["man", "car"],
         np.mat([[6, 0], [0, 15]]), np.mat([[0, 6], [5, 0]])),
        ("data7", "row2.row", "col3.col", ["blue", "red"], ["man", "car"],
         np.mat([[0, 0], [0, 0]]), None),
        ("data3", "row2.row", "col4.col", ["blue", "red"], ["airplane"],
         np.mat([[0], [0]]), None),
    ]

    def check(data_path, row_path, col_path, fmt, exp_rows, exp_cols,
              exp_matrix, matrix_type):
        build_kwargs = dict(data=data_path, rows=row_path, cols=col_path,
                            format=fmt)
        if exp_matrix is None:
            self.assertRaises(ValueError, Space.build, **build_kwargs)
            return

        sp = Space.build(**build_kwargs)
        self.assertListEqual(exp_rows, sp.id2row)
        self.assertListEqual(exp_cols, sp.id2column)
        self.assertIsInstance(sp.cooccurrence_matrix, matrix_type)
        actual = sp.cooccurrence_matrix.mat
        if fmt == "sm":
            # Sparse content is compared in its dense form.
            actual = actual.todense()
        np.testing.assert_array_equal(exp_matrix, actual)

    for name, row_name, col_name, exp_rows, exp_cols, smat, dmat in cases:
        row_path = self.dir_ + row_name
        col_path = self.dir_ + col_name
        check(self.dir_ + name + ".sparse", row_path, col_path, "sm",
              exp_rows, exp_cols, smat, SparseMatrix)
        check(self.dir_ + name + ".dense", row_path, col_path, "dm",
              exp_rows, exp_cols, dmat, DenseMatrix)
def test_simple_sparse(self):
    """Run build_core_space on sparse ``mat1`` and compare the outputs.

    The input space must equal the exported ``CORE_SS.mat1.sm`` and the
    pickled ``CORE_SS.mat1.pkl``.
    """
    d = self.dir_
    cli_args = ["build_core_space.py"]
    cli_args += ["-l", d + "log1.txt"]
    cli_args += ["-i", d + "mat1"]
    cli_args += ["-o", d]
    cli_args += ["--input_format", "sm"]
    cli_args += ["--output_format", "sm"]
    bcs.main(cli_args)

    source = Space.build(data=d + "mat1.sm",
                         cols=d + "mat1.cols",
                         format="sm")
    exported = Space.build(data=d + "CORE_SS.mat1.sm",
                           cols=d + "CORE_SS.mat1.cols",
                           format="sm")
    pickled = io_utils.load(d + "CORE_SS.mat1.pkl", Space)

    for other in (exported, pickled):
        self._test_equal_spaces_sparse(source, other)
def test_simple_sparse(self):
    """Build a sparse peripheral space for ``mat1`` on the ``mat1`` core.

    The exported ``PER_SS.mat1.CORE_SS.mat1.sm`` must equal the space read
    directly from ``mat1.sm``.
    """
    d = self.dir_
    peripheral_prefix = d + "PER_SS.mat1.CORE_SS.mat1"

    bps.main(["build_peripheral_space.py",
              "-l", d + "log1.txt",
              "-i", d + "mat1",
              "-o", d,
              "-c", d + "CORE_SS.mat1.pkl",
              "--input_format", "sm",
              "--output_format", "sm"])

    expected = Space.build(data=d + "mat1.sm",
                           cols=d + "mat1.cols",
                           format="sm")
    peripheral = Space.build(data=peripheral_prefix + ".sm",
                             cols=peripheral_prefix + ".cols",
                             format="sm")
    self._test_equal_spaces_sparse(expected, peripheral)
def test_simple_nmf(self):
    """Reduce ``mat3`` with ``nmf_2`` and check the exported matrix is 3 x 2."""
    workdir = self.dir_
    cli_args = [
        "build_core_space.py",
        "-l", workdir + "log_nmf.txt",
        "-i", workdir + "mat3",
        "-w", "raw",
        "-r", "nmf_2",
        "-o", workdir,
        "--input_format", "dm",
        "--output_format", "dm",
    ]
    bcs.main(cli_args)

    reduced = Space.build(data=workdir + "CORE_SS.mat3.raw.nmf_2.dm",
                          format="dm")
    actual_shape = reduced.cooccurrence_matrix.mat.shape
    self.assertEqual(actual_shape, (3, 2))
def test_simple_load(self):
    """Apply a previously saved model and compare with ``AN_mat.dm``.

    apply_composition is run with ``--load_model`` pointing at
    ``TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl``; its "dm"
    output must equal the space read from ``AN_mat.dm``.
    """
    d = self.dir_
    model_file = d + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl"

    ac.main(["apply_composition.py",
             "-l", d + "log1.txt",
             "-i", d + "an_train_data.txt",
             "-o", d,
             "--load_model", model_file,
             "-a", d + "CORE_SS.N_mat.pkl",
             "--output_format", "dm"])

    composed = Space.build(
        data=d + "COMPOSED_SS.LexicalFunction.an_train_data.txt.dm",
        format="dm")
    expected = Space.build(data=d + "AN_mat.dm", format="dm")
    self._test_equal_spaces_dense(composed, expected)
def build_unigram_space():
    """Build, weight, normalise, reduce and save the unigram space.

    The sparse input files come from the module-level ``args.function``:
    index 3 is the data file, index 2 the rows file, index 1 the columns
    file. The space is passed through ``ppmi``, ``norm`` and ``svd`` in that
    order and saved under the name "unigrams_space".

    Returns:
        The space returned by ``svd``.
    """
    sources = args.function
    raw_space = Space.build(data=sources[3],
                            rows=sources[2],
                            cols=sources[1],
                            format="sm")
    reduced_space = svd(norm(ppmi(raw_space)))
    save_space(reduced_space, "unigrams_space")
    return reduced_space
def write_pkl(self):
    """
    Create spaces from co-occurrence counts in sparse format (.sm)
    and save them as pickle files.

    Two spaces are built, one per direction, both sharing the same column
    file. Each is saved with ``io_utils.save`` and a confirmation line is
    written to ``stderr``.
    """
    # For direction DE-EN
    my_space_1 = Space.build(
        data=OUTPUT_FILE_DE_DE_EN_SM,
        rows=OUTPUT_FILE_DE_WORDS_ROW,
        cols=OUTPUT_FILE_DE_EN_WORDS_COL,
        format="sm"
    )

    # For direction EN-DE
    my_space_2 = Space.build(
        data=OUTPUT_FILE_EN_EN_DE_SM,
        rows=OUTPUT_FILE_EN_WORDS_ROW,
        cols=OUTPUT_FILE_DE_EN_WORDS_COL,
        format="sm"
    )

    # Save the space objects in pickle format
    io_utils.save(my_space_1, OUTPUT_FILE_DE_DE_EN_PKL)
    io_utils.save(my_space_2, OUTPUT_FILE_EN_EN_DE_PKL)

    # stderr.write replaces the Python-2-only "print >> stderr" form; the
    # text written is the same (label, one space, path, newline).
    stderr.write("Pickle file 1 written out: %s\n" % (OUTPUT_FILE_DE_DE_EN_PKL,))
    stderr.write("Pickle file 2 written out: %s\n" % (OUTPUT_FILE_EN_EN_DE_PKL,))
def inspect_representations(path_composed_emb, output_path):
    """Write the 10 nearest neighbours of up to 1000 rows to a text file.

    Args:
        path_composed_emb: path of the space, read in "dm" format.
        output_path: UTF-8 text file to write. For each inspected word it
            receives a header line, one ``"<neighbour> <similarity>"`` line
            per neighbour (cosine similarity, 6 decimals) and a separator.
    """
    print('Inspecting representations...')
    composed_space = Space.build(data=path_composed_emb, format='dm')

    # The with-block closes the file even if a neighbour lookup fails.
    with codecs.open(output_path, 'w', 'utf8') as f:
        # Only the first 1000 words of the row index are inspected, so the
        # rest of the list is not walked at all.
        word_list = list(composed_space.get_row2id())[:1000]
        for w in word_list:
            neighbours = composed_space.get_neighbours(w, 10, CosSimilarity())
            f.write('Neighbours for ' + w + '\n')
            f.write("\n".join('%s %.6f' % x for x in neighbours))
            f.write('\n----------------------------\n')
def test_simple_load(self):
    """Compose with a model loaded from disk and compare with ``AN_mat.dm``."""
    workdir = self.dir_
    cli_args = [
        "apply_composition.py",
        "-l", workdir + "log1.txt",
        "-i", workdir + "an_train_data.txt",
        "-o", workdir,
        "--load_model",
        workdir + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl",
        "-a", workdir + "CORE_SS.N_mat.pkl",
        "--output_format", "dm",
    ]
    ac.main(cli_args)

    composed_file = workdir + "COMPOSED_SS.LexicalFunction.an_train_data.txt.dm"
    sp1 = Space.build(data=composed_file, format="dm")
    sp2 = Space.build(data=workdir + "AN_mat.dm", format="dm")

    self._test_equal_spaces_dense(sp1, sp2)
def inspect_representations(path_composed_emb, output_path):
    """Dump nearest neighbours for (at most) the first 1000 rows of a space.

    Args:
        path_composed_emb: "dm" file the space is built from.
        output_path: destination text file, written as UTF-8. Each
            inspected word gets a ``Neighbours for <word>`` line, its 10
            neighbours by cosine similarity (``"%s %.6f"`` per line) and a
            dashed separator line.
    """
    print('Inspecting representations...')
    composed_space = Space.build(data=path_composed_emb, format='dm')

    # Context manager: the file handle is released on errors as well.
    with codecs.open(output_path, 'w', 'utf8') as out:
        # Slice instead of testing the index on every iteration.
        inspected_words = list(composed_space.get_row2id())[:1000]
        for word in inspected_words:
            neighbours = composed_space.get_neighbours(word, 10, CosSimilarity())
            out.write('Neighbours for ' + word + '\n')
            out.write("\n".join('%s %.6f' % pair for pair in neighbours))
            out.write('\n----------------------------\n')
def test_simple_nmf(self):
    """NMF reduction of ``mat3`` to 2 dimensions yields a 3 x 2 matrix."""
    d = self.dir_
    options = [
        ("-l", d + "log_nmf.txt"),
        ("-i", d + "mat3"),
        ("-w", "raw"),
        ("-r", "nmf_2"),
        ("-o", d),
        ("--input_format", "dm"),
        ("--output_format", "dm"),
    ]
    # Flatten the (flag, value) pairs back into an argv-style list.
    argv = ["build_core_space.py"]
    for flag, value in options:
        argv.extend([flag, value])
    bcs.main(argv)

    s1 = Space.build(data=d + "CORE_SS.mat3.raw.nmf_2.dm", format="dm")
    self.assertEqual(s1.cooccurrence_matrix.mat.shape, (3, 2))
def test_build_data_row(self):
    """Build spaces restricted by a row file and check their content.

    Each case is ``(data, row_file, rows, cols, sparse, dense)``. The sparse
    fixture is read with ``<data>.cols``; the dense fixture is read without
    a column file, so its ``id2column`` is expected to be empty.
    """
    cases = [
        ("data1", "row1.row", ["red"], ["car", "man"],
         np.mat([[3, 5]]), np.mat([[3, 5]])),
        ("data2", "row1.row", ["red"], ["car"],
         np.mat([[3]]), np.mat([[3]])),
        ("data3", "row2.row", ["blue", "red"], ["car", "man"],
         np.mat([[0, 6], [15, 0]]), np.mat([[0, 6], [5, 0]])),
        ("data3", "row3.row", ["blue", "red"], ["car", "man"],
         np.mat([[0, 6], [15, 0]]), np.mat([[0, 6], [5, 0]])),
        ("data7", "row2.row", ["blue", "red"], ["car"],
         np.mat([[0], [0]]), np.mat([[0], [0]])),
    ]

    for name, row_name, exp_rows, exp_cols, exp_sparse, exp_dense in cases:
        rows_path = self.dir_ + row_name
        prefix = self.dir_ + name

        sparse_space = Space.build(data=prefix + ".sparse",
                                   rows=rows_path,
                                   cols=prefix + ".cols",
                                   format="sm")
        self.assertListEqual(exp_rows, sparse_space.id2row)
        self.assertListEqual(exp_cols, sparse_space.id2column)
        self.assertIsInstance(sparse_space.cooccurrence_matrix, SparseMatrix)
        np.testing.assert_array_equal(
            exp_sparse, sparse_space.cooccurrence_matrix.mat.todense())

        dense_space = Space.build(data=prefix + ".dense",
                                  rows=rows_path,
                                  format="dm")
        self.assertListEqual(exp_rows, dense_space.id2row)
        self.assertListEqual([], dense_space.id2column)
        self.assertIsInstance(dense_space.cooccurrence_matrix, DenseMatrix)
        np.testing.assert_array_equal(
            exp_dense, dense_space.cooccurrence_matrix.mat)
def build_spaces(in_file_prefix, in_format, out_dir, out_format, weightings,
                 selections, reductions, normalizations, is_gz):
    """Load a space and print every weighting/selection/reduction/normalization variant.

    Args:
        in_file_prefix: path prefix of the input; the data file is
            ``<prefix>.<in_format>`` (``.gz`` appended when ``is_gz``),
            with optional ``<prefix>.rows`` / ``<prefix>.cols`` files.
        in_format: "sm", "dm" or "pkl".
        out_dir: output directory passed to ``print_space``.
        out_format: output format passed to ``print_space``.
        weightings, selections, reductions, normalizations: iterables of
            operation specifications; every combination is applied in that
            order and printed.
        is_gz: whether the "sm"/"dm" data file is gzipped.

    Raises:
        ValueError: for an unknown ``in_format``, or when the column file
            is missing for "sm" input.
    """
    # Validate before deriving any file names.
    if in_format not in ("sm", "dm", "pkl"):
        raise ValueError("Invalid input format:%s" % in_format)

    # Output names start with "CORE_SS." plus the last "/"-separated
    # component of the input prefix.
    in_file_descr = "CORE_SS." + in_file_prefix.split("/")[-1]
    data_file = '%s.%s' % (in_file_prefix, in_format)

    if in_format == "pkl":
        space = io_utils.load(data_file, Space)
    else:
        if is_gz:
            data_file = '%s.gz' % data_file
        row_file = '%s.rows' % (in_file_prefix)
        column_file = '%s.cols' % (in_file_prefix)

        if not os.path.exists(row_file):
            row_file = None
        if not os.path.exists(column_file):
            # Only dense input may omit the column file.
            if in_format == "sm":
                raise ValueError("Column file: %s needs to be provided!"
                                 % column_file)
            column_file = None

        print("Building matrix...")
        space = Space.build(data=data_file, rows=row_file, cols=column_file,
                            format=in_format)

    # Each intermediate space is computed once and reused by the inner loops.
    for w in weightings:
        w_space = apply_weighting(space, w)
        for s in selections:
            s_space = apply_selection(w_space, s)
            for r in reductions:
                r_space = apply_reduction(s_space, r)
                for n in normalizations:
                    n_space = apply_normalization(r_space, n)
                    print("Printing...")
                    print_space(n_space, out_dir,
                                [in_file_descr, w, s, r, n], out_format)
def main():
    """
    Compute the FREQ/PPMI/PLMI matrix from a co-occurrence matrix, as default pickle the raw matrix
    """

    # Get the arguments
    args = docopt('''Compute the FREQ/PPMI/PLMI matrix from a co-occurrence matrix, as default pickle the raw matrix

    Usage:
        create_dsm.py <dsm_prefix> [-p | -l]

        <dsm_prefix> = the prefix for the input files (.sm for the matrix, .rows and .cols) and output files (.ppmi)

    Options:
        <none>      weight the matrice entries via FREQUENCY
        -p, --ppmi  weight the matrice entries via PPMI
        -l, --plmi  weight the matrice entries via PLMI

    ''')

    dsm_prefix = args['<dsm_prefix>']
    is_ppmi = args['--ppmi']
    is_plmi = args['--plmi']

    # Create a space from co-occurrence counts in sparse format
    dsm = Space.build(data=dsm_prefix + '.sm',
                      rows=dsm_prefix + '.rows',
                      cols=dsm_prefix + '.cols',
                      format='sm')

    # Pick the weighting; --ppmi takes precedence over --plmi, and with
    # neither flag the raw counts are kept.
    if is_ppmi:
        weighting, postfix = PpmiWeighting(), "_ppmi"
    elif is_plmi:
        weighting, postfix = PlmiWeighting(), "_plmi"
    else:
        weighting, postfix = None, "_freq"

    if weighting is not None:
        dsm = dsm.apply(weighting)

    # Save the Space object in pickle format
    save_pkl_files(dsm_prefix + postfix, dsm)
def build_spaces(in_file_prefix, in_format, out_dir, out_format, weightings,
                 selections, reductions, normalizations, is_gz):
    """Build the input space and print one output per operation combination.

    Args:
        in_file_prefix: prefix of the input files (``<prefix>.<in_format>``,
            optionally ``.gz``; ``<prefix>.rows`` and ``<prefix>.cols`` are
            used when present).
        in_format: "sm", "dm" or "pkl".
        out_dir: directory handed to ``print_space``.
        out_format: format handed to ``print_space``.
        weightings, selections, reductions, normalizations: iterables of
            operation specifications, applied in this order for every
            combination.
        is_gz: whether the "sm"/"dm" data file is gzipped.

    Raises:
        ValueError: if ``in_format`` is unknown or the column file is
            missing for "sm" input.
    """
    # Reject unsupported formats before any path is derived.
    if in_format not in ("sm", "dm", "pkl"):
        raise ValueError("Invalid input format:%s" % in_format)

    in_file_descr = "CORE_SS." + in_file_prefix.split("/")[-1]
    data_file = '%s.%s' % (in_file_prefix, in_format)

    if in_format == "pkl":
        space = io_utils.load(data_file, Space)
    else:
        if is_gz:
            data_file = '%s.gz' % data_file

        row_file = '%s.rows' % (in_file_prefix)
        if not os.path.exists(row_file):
            row_file = None

        column_file = '%s.cols' % (in_file_prefix)
        if not os.path.exists(column_file):
            if in_format == "sm":
                raise ValueError("Column file: %s needs to be provided!"
                                 % column_file)
            column_file = None

        # Single-argument print calls behave identically on Python 2 and 3.
        print("Building matrix...")
        space = Space.build(data=data_file, rows=row_file, cols=column_file,
                            format=in_format)

    for w in weightings:
        w_space = apply_weighting(space, w)

        for s in selections:
            s_space = apply_selection(w_space, s)

            for r in reductions:
                r_space = apply_reduction(s_space, r)

                for n in normalizations:
                    n_space = apply_normalization(r_space, n)

                    print("Printing...")
                    print_space(n_space, out_dir,
                                [in_file_descr, w, s, r, n], out_format)
def getThesaurus(word):
    """Return up to nine words similar to ``word``.

    Candidates are the rows of ``words.rows`` whose cosine similarity to
    ``word`` exceeds 0.3 (``word`` itself excluded). Candidates that share
    a line with ``word`` in ``chilin-zh-TW.csv`` come first; the rest
    follow in order of decreasing similarity.

    Args:
        word: ``unicode`` (encoded to UTF-8 here) or a UTF-8 byte string.

    Returns:
        list: at most nine words.

    Raises:
        UnicodeDecodeError: if a byte-string ``word`` is not valid UTF-8.
    """
    if isinstance(word, unicode):
        word = word.encode('utf-8')
    else:
        # Validation only: an invalid byte string raises here.
        word.decode('utf-8')

    # find synonyms in chilin
    # If no line contains the word, the synonym set stays empty; before,
    # the words of the last line read were silently used instead (and an
    # empty file left the name undefined).
    synonyms = []
    with open(THES_PATH + 'chilin-zh-TW.csv') as thesaurus_file:
        for line in thesaurus_file:
            candidates = line.split()
            if word in candidates:
                synonyms = candidates
                break

    # calculate word similarity
    word_sim_dict = {}
    my_space = Space.build(data=THES_PATH + 'sm',
                           rows=THES_PATH + 'words.rows',
                           cols=THES_PATH + 'cols',
                           format='sm')
    similarity = CosSimilarity()
    with open(THES_PATH + 'words.rows') as rows_file:
        for row in rows_file:
            word1 = row.strip()
            sim = my_space.get_sim(word1, word, similarity)
            if sim > .3:
                word_sim_dict[word1] = sim

    # The query word itself is never a suggestion.
    word_sim_dict.pop(word, None)

    # rank first those overlapping with chilin synonyms
    synonym_set = set(synonyms)
    word_sim_list = [key for key in word_sim_dict if key in synonym_set]
    # Removal happens after the scan, so the dict is not modified while it
    # is being iterated.
    for key in word_sim_list:
        del word_sim_dict[key]

    # sort the rest of words
    ranked = sorted(word_sim_dict.items(), key=lambda x: x[1], reverse=True)
    word_sim_list += [other for other, _ in ranked]

    return word_sim_list[:9]
from composes.semantic_space.space import Space
from composes.composition.lexical_function import LexicalFunction
from composes.utils.regression_learner import LstsqRegressionLearner

# Training data 1: VO N -> SVO
train_vo_data = [
    ("hate_boy", "man", "man_hate_boy"),
    ("hate_man", "man", "man_hate_man"),
    ("hate_boy", "boy", "boy_hate_boy"),
    ("hate_man", "boy", "boy_hate_man"),
]

# Training data 2: V N -> VO
train_v_data = [
    ("hate", "man", "hate_man"),
    ("hate", "boy", "hate_boy"),
]

# Load the N and SVO spaces (sparse format, each with its column file)
n_space = Space.build(data="./data/in/ex19-n.sm",
                      cols="./data/in/ex19-n.cols",
                      format="sm")
svo_space = Space.build(data="./data/in/ex19-svo.sm",
                        cols="./data/in/ex19-svo.cols",
                        format="sm")

print("\nInput SVO training space:")
print(svo_space.id2row)
print(svo_space.cooccurrence_matrix)

# 1. Train a model to learn VO functions on train data: VO N -> SVO
print("\nStep 1 training")
vo_model = LexicalFunction(learner=LstsqRegressionLearner())
vo_model.train(train_vo_data, n_space, svo_space)
import sys
import os

# Make the DISSECT sources importable before any composes import.
folder = os.path.expandvars('/home/luka/Downloads/dissect-master/src')
if folder not in sys.path:
    sys.path.append(folder)

from composes.semantic_space.space import Space
from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting

# Input and output locations of the ex01 example data.
data_in = "/home/luka/Downloads/dissect-master/src/examples/data/in/"
data_out = "/home/luka/Downloads/dissect-master/src/examples/data/out/"

# Build the space from the sparse example files ...
my_space = Space.build(data=data_in + "ex01.sm",
                       rows=data_in + "ex01.rows",
                       cols=data_in + "ex01.cols",
                       format="sm")

# ... then replace it with the pickled space before weighting.
my_space = io_utils.load(data_out + "ex01.pkl")
print(my_space.cooccurrence_matrix)

my_space = my_space.apply(PpmiWeighting())
print(my_space.cooccurrence_matrix)
def test_simple_define(self):
    """Run apply_composition with several models and check the outputs.

    Covers a loaded lexical-function model, weighted addition, dilation
    (both argument orders) and multiplication, always writing dense ("dm")
    output into ``self.dir_``.
    """
    base = self.dir_
    a_core = base + "CORE_SS.A_mat.pkl"
    n_core = base + "CORE_SS.N_mat.pkl"

    def compose(input_name, method_args, arg_spaces):
        # Shared command line: log file, input, output dir, then the
        # model-specific options, the argument space(s) and the format.
        ac.main(["apply_composition.py",
                 "-l", base + "log1.txt",
                 "-i", base + input_name,
                 "-o", base]
                + method_args
                + ["-a", ",".join(arg_spaces),
                   "--output_format", "dm"])

    def composed_dm(model_name, input_name):
        # Load the dense space written by the preceding compose() call.
        return Space.build(
            data=base + "COMPOSED_SS." + model_name + "." + input_name + ".dm",
            format="dm")

    # compose with lexical function
    compose("an_train_data.txt",
            ["--load_model",
             base + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl"],
            [n_core])
    sp2 = composed_dm("LexicalFunction", "an_train_data.txt")

    # compose with weighted addition
    compose("an_train_data.txt",
            ["-m", "weighted_add", "--alpha", "0.5", "--beta", "0.5"],
            [a_core, n_core])
    sp1 = composed_dm("WeightedAdditive", "an_train_data.txt")
    sp3 = io_utils.load(
        base + "COMPOSED_SS.WeightedAdditive.an_train_data.txt.pkl")

    np.testing.assert_array_equal(sp1.cooccurrence_matrix.mat,
                                  np.mat([[3, 4], [4, 5]]))
    self._test_equal_spaces_structs(sp1, sp2)

    # the dm and pkl outputs have to contain identical data,
    # both as sparse and as dense matrices
    sp1.to_sparse()
    sp3.to_sparse()
    self._test_equal_spaces_sparse(sp1, sp3)
    sp1.to_dense()
    sp3.to_dense()
    self._test_equal_spaces_dense(sp1, sp3)

    # compose with dilation
    dilation_args = ["-m", "dilation", "--lambda", "1"]
    compose("an_train_data.txt", list(dilation_args), [a_core, n_core])
    sp1 = composed_dm("Dilation", "an_train_data.txt")

    n_space = io_utils.load(n_core)
    sp1.to_dense()
    n_space.to_dense()
    np.testing.assert_array_almost_equal(
        sp1.cooccurrence_matrix.mat,
        n_space.cooccurrence_matrix.mat * 25)
    self._test_equal_spaces_structs(sp1, sp2)

    # compose with dilation, change the order of the arguments
    compose("na_train_data.txt", list(dilation_args), [n_core, a_core])
    sp1 = composed_dm("Dilation", "na_train_data.txt")
    sp1.to_dense()
    np.testing.assert_array_almost_equal(sp1.cooccurrence_matrix.mat,
                                         np.mat([[75, 100], [183, 244]]), 5)
    self._test_equal_spaces_structs(sp1, sp2)

    # compose with multiplicative
    compose("aan_train_data.txt",
            ["-m", "mult"],
            [a_core, base + "COMPOSED_SS.Dilation.an_train_data.txt.pkl"])
    sp1 = composed_dm("Multiplicative", "aan_train_data.txt")
def create_space(dmFile, rowsFile):
    """Build and return a Space from a dense-format ("dm") file and a rows file."""
    return Space.build(format="dm", data=dmFile, rows=rowsFile)
#ex02.py
#-------
from composes.semantic_space.space import Space
from composes.utils import io_utils

_input_prefix = "./data/in/ex01"
_pickle_path = "./data/out/ex01.pkl"

# Create a space from co-occurrence counts in sparse format.
my_space = Space.build(data=_input_prefix + ".sm",
                       rows=_input_prefix + ".rows",
                       cols=_input_prefix + ".cols",
                       format="sm")

# Print the co-occurrence matrix of the space.
print(my_space.cooccurrence_matrix)

# Save the Space object in pickle format, then load it back.
io_utils.save(my_space, _pickle_path)
my_space2 = io_utils.load(_pickle_path)

# Print the co-occurrence matrix of the loaded space.
print(my_space2.cooccurrence_matrix)
def test_export(self):
    """Export spaces in "dm" and "sm" format and check they reload unchanged.

    Each case pairs a dense-backed space with a sparse-backed space holding
    the same data; every export is reloaded with Space.build and compared
    against the counterpart of the matching representation.
    """
    out_file = self.dir_ + "tmp"

    full_mat = np.mat([[1,2],[3,0]])
    full_rows, full_cols = ["a","b"], ["f1","f2"]
    zero_mat = np.mat([[0,0]])
    zero_rows, no_cols = ["a"], []

    def reload(fmt, with_cols):
        # Rebuild a space from the files the last export() wrote.
        build_args = {"data": out_file + "." + fmt,
                      "rows": out_file + ".rows",
                      "format": fmt}
        if with_cols:
            build_args["cols"] = out_file + ".cols"
        return Space.build(**build_args)

    # 3 cases allowed at the moment; the first two carry column labels.
    labelled_cases = [
        (Space(DenseMatrix(full_mat), full_rows, full_cols),
         Space(SparseMatrix(full_mat), full_rows, full_cols)),
        (Space(DenseMatrix(zero_mat), zero_rows, full_cols),
         Space(SparseMatrix(zero_mat), zero_rows, full_cols)),
    ]
    for dense_sp, sparse_sp in labelled_cases:
        exports = ((dense_sp, "dm"), (dense_sp, "sm"),
                   (sparse_sp, "sm"), (sparse_sp, "dm"))
        for source, fmt in exports:
            self.reset_export_files(out_file)
            source.export(out_file, format=fmt)
            new_sp = reload(fmt, True)
            if fmt == "dm":
                self._test_equal_spaces_dense(dense_sp, new_sp)
            else:
                self._test_equal_spaces_sparse(sparse_sp, new_sp)

    # Third case: no column labels, so only "dm" is exercised and no
    # cols file is passed when reloading.
    unlabelled_cases = [
        (Space(DenseMatrix(zero_mat), zero_rows, no_cols),
         Space(SparseMatrix(zero_mat), zero_rows, no_cols)),
    ]
    for dense_sp, sparse_sp in unlabelled_cases:
        for source in (dense_sp, sparse_sp):
            self.reset_export_files(out_file)
            source.export(out_file, format="dm")
            new_sp = reload("dm", False)
            self._test_equal_spaces_dense(dense_sp, new_sp)
import sys

if __name__ == '__main__':
    # set constants
    # NOTE(review): sys.argv[0] is the path of this script itself, so
    # data_path becomes "<script path>/<argv[1]>_".  Confirm this is the
    # intended directory and not a shifted argument index.
    data_path = sys.argv[0] + "/" + sys.argv[1] + "_"

    log_file = data_path + "all.log"
    core_cooccurrence_file = data_path + "GemmaData_sm"
    core_row_file = data_path + "GemmaData_rows"
    core_col_file = data_path + "GemmaData_cols"
    core_space_file = data_path + "core.pkl"

    # Pipeline parameters.  The SVD size is named once so the progress
    # message cannot drift from the value actually applied (it used to
    # announce 500 while reducing to 100 dimensions).
    top_features = 5000
    svd_dims = 100

    # config log file
    log_utils.config_logging(log_file)

    print("Building semantic space from co-occurrence counts")
    core_space = Space.build(data=core_cooccurrence_file,
                             rows=core_row_file,
                             cols=core_col_file,
                             format="sm")

    print("Applying ppmi weighting")
    core_space = core_space.apply(PpmiWeighting())

    print("Applying feature selection")
    core_space = core_space.apply(TopFeatureSelection(top_features))

    print("Applying svd %d" % svd_dims)
    core_space = core_space.apply(Svd(svd_dims))

    print("Saving the semantic space")
    io_utils.save(core_space, core_space_file)

    #print "Finding 10 neighbors of " + sys.argv[1]
    #neighbors = core_space.get_neighbours(sys.argv[1], 10, CosSimilarity())
    #print neighbors
def test_simple_ops(self):
    """Run build_core_space with selection + SVD and check the dense output.

    The pipeline is run twice (short options with "dm" input, long options
    with "sm" input); both runs must give the same reduced matrices up to
    sign, to two decimals.
    """
    # Expected reduced matrices, compared up to sign.  Presumably derived
    # from x = [[1,2,3],[2,4,6],[4,675,43]] -- TODO confirm.
    us = np.mat([[2.19272110e+00, 3.03174768e+00],
                 [4.38544220e+00, 6.06349536e+00],
                 [6.76369708e+02, -4.91431927e-02]])
    us2 = np.mat([[2.19426495e+00, 3.16751379e+00],
                  [4.38703714e+00, 6.14112794e+00],
                  [6.76380808e+02, -5.01074549e-02]])

    out_dir = self.dir_
    variants = ["top_sum_3.svd_2", "top_sum_3.svd_1",
                "top_length_3.svd_2", "top_length_3.svd_1",
                "top_sum_4.svd_2", "top_sum_4.svd_1"]

    def load(variant):
        # Read one dense output space produced by build_core_space.
        return Space.build(
            data=out_dir + "CORE_SS.mat3.raw." + variant + ".dm",
            format="dm")

    def check_dense_pairs(pairs):
        for left, right in pairs:
            self._test_equal_spaces_dense(left, right)

    def check_values(s1, s2, s3, s5, s6):
        expected = ((s1, us), (s2, us[:, 0:1]),
                    (s5, us2), (s6, us2[:, 0:1]))
        for space, reference in expected:
            np.testing.assert_array_almost_equal(
                abs(space.cooccurrence_matrix.mat), abs(reference), 2)
        self._test_equal_spaces_structs(s3, s5)
        self._test_equal_spaces_structs(s2, s6)

    # First run: short options, dense input, with normalization variants.
    bcs.main([
        "build_core_space.py",
        "-l", out_dir + "log1.txt",
        "-i", out_dir + "mat3",
        "-w", "raw",
        "-s", "top_sum_3,top_length_3,top_sum_4",
        "-r", "svd_2,svd_1",
        "-n", "none,all,row",
        "-o", out_dir,
        "--input_format", "dm",
        "--output_format", "dm",
    ])

    s1, s2, s3, s4, s5, s6 = [load(variant) for variant in variants]
    s7 = load("top_sum_4.svd_1.all")
    s8 = load("top_sum_4.svd_1.row")

    # Normalizing in memory must match the normalized files on disk.
    s9 = s6.apply(Normalization())
    s10 = s6.apply(RowNormalization())

    check_dense_pairs([(s1, s3), (s2, s4), (s7, s9), (s8, s10)])
    check_values(s1, s2, s3, s5, s6)

    # Second run: long options, sparse input, no normalization.
    bcs.main([
        "build_core_space.py",
        "-l", out_dir + "log1.txt",
        "-i", out_dir + "mat3",
        "--weighting", "raw",
        "--selection", "top_sum_3,top_length_3,top_sum_4",
        "--reduction", "svd_2,svd_1",
        "-o", out_dir,
        "--input_format", "sm",
        "--output_format", "dm",
    ])

    s1, s2, s3, s4, s5, s6 = [load(variant) for variant in variants]

    check_dense_pairs([(s1, s3), (s2, s4)])
    check_values(s1, s2, s3, s5, s6)
from subprocess import Popen, PIPE
import os
import time

usage = """ Usage: python dissect.py dissect_format_file_name dissect_format_file_name: path to a file containing dissect format """

CMD_EXTRACTOR_SCRIPT = '~/Programming/terminology_extractor/extract_patterns.py'

# The single command-line argument is the common prefix of the
# <prefix>.sm / <prefix>.rows / <prefix>.cols input files.
file_name = sys.argv[1]
_matrix_files = {
    "data": file_name + ".sm",
    "rows": file_name + ".rows",
    "cols": file_name + ".cols",
}

# Build the sparse space and apply PpmiWeighting in one step.
my_space = Space.build(format="sm", **_matrix_files).apply(PpmiWeighting())

# print my_space.get_sim("spain", "netherlands", CosSimilarity())
# print my_space.get_neighbours('parenchymopbouw', 4, CosSimilarity())
# print my_space.get_neighbours('pension-n', 4, CosSimilarity())
# print my_space.id2row


def prettify(elem):
    """Pretty-printing helper for an XML Element.

    Serializes *elem* as UTF-8 and re-parses it with minidom.
    NOTE(review): the rest of this function lies outside this excerpt.
    """
    rough_string = ElementTree.tostring(elem, 'utf-8')
    reparsed = minidom.parseString(rough_string)
# Imports the sparse matrix produced by sort-cooccur-matrix with the dissect
# toolkit, applies ppmi weighting and saves/exports the weighted space.
# Note that this file is in python 2, not 3.
import sys
import os

# Make the local DISSECT checkout importable.
folder = os.path.expandvars('/home/luka/Downloads/dissect-master/src')
if folder not in sys.path:
    sys.path.append(folder)

from composes.semantic_space.space import Space

# pathnames
path = '/home/luka/ThLi/cooccurrence/'
_matrix_files = {
    "data": path + "spm1.sm",
    "rows": path + "rows1.rows",
    "cols": path + "cols1.cols",
}

# import matrix
holspace = Space.build(format="sm", **_matrix_files)

# apply ppmi weighting
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
holspace = holspace.apply(PpmiWeighting())

# export matrix: once through io_utils.save, once as a sparse-format export
from composes.utils import io_utils
io_utils.save(holspace, path + "weighted")
holspace.export(path + "weighted_sm", format="sm")
#ex02.py
#-------
from composes.semantic_space.space import Space
from composes.utils import io_utils

# Input files of the sparse-format example space.
_build_args = {
    "data": "./data/in/ex01.sm",
    "rows": "./data/in/ex01.rows",
    "cols": "./data/in/ex01.cols",
    "format": "sm",
}

# create a space from co-occurrence counts in sparse format
my_space = Space.build(**_build_args)

# print the co-occurrence matrix of the space
print(my_space.cooccurrence_matrix)

# round-trip the Space object through a pickle file
io_utils.save(my_space, "./data/out/ex01.pkl")
my_space2 = io_utils.load("./data/out/ex01.pkl")

# print the co-occurrence matrix of the loaded space
print(my_space2.cooccurrence_matrix)
def train_baroni_guevara_composers(all_vectors, ROOT_DIR, baroni_output_path,
                                   guevara_output_path, baroni_threshold=10):
    """
    Train a LexicalFunction ("Baroni") and a FullAdditive ("Guevara") composer
    on the AN/NN phrases found in a vectors file and save both.

    :param all_vectors: str; path to vectors file containing both N and
        observed AN vectors
    :param ROOT_DIR: str; where to write temp files
    :param baroni_output_path: str; where to write the saved Baroni composer
    :param guevara_output_path: str; where to write the saved Guevara composer
    :param baroni_threshold: int; passed as min_samples to LexicalFunction
    """
    SVD_DIMS = 100
    # what kind of NPs to train Baroni composer for
    baroni_training_phrase_types = {'AN', 'NN'}

    def is_noun_unigram(entry):
        return entry.type == '1-GRAM' and entry.tokens[0].pos == 'N'

    def is_training_phrase(entry):
        return entry.type in baroni_training_phrase_types

    def underscore_joined(entry):
        return str(entry).replace(' ', '_')

    def dissect_files(prefix):
        # Keyword arguments naming the .sm/.rows/.cols triple for a prefix.
        return dict(data="{}.sm".format(prefix),
                    rows="{}.rows".format(prefix),
                    cols="{}.cols".format(prefix),
                    format="sm")

    # prepare the input files to be fed into Dissect
    mkdirs_if_not_exists(ROOT_DIR)
    filename = basename(all_vectors)
    noun_events_file = join(ROOT_DIR,
                            '%s-onlyN-SVD%d.tmp' % (filename, SVD_DIMS))
    NPs_events_file = join(ROOT_DIR,
                           '%s-onlyPhrases-SVD%d.tmp' % (filename, SVD_DIMS))

    thes = Vectors.from_tsv(all_vectors, lowercasing=False)
    thes.to_tsv(noun_events_file, entry_filter=is_noun_unigram)
    _translate_byblo_to_dissect(noun_events_file)
    thes.to_tsv(NPs_events_file,
                entry_filter=is_training_phrase,
                row_transform=underscore_joined)
    _translate_byblo_to_dissect(NPs_events_file)

    # core space: noun unigrams
    my_space = Space.build(**dissect_files(noun_events_file))
    logging.info('Each unigram vector has dimensionality %r',
                 my_space.element_shape)

    # create a peripheral space
    # The columns of the peripheral space have to be identical to those
    # in the core space (including their order)!
    my_per_space = PeripheralSpace.build(my_space,
                                         **dissect_files(NPs_events_file))
    logging.info('Each phrase vector has dimensionality %r',
                 my_per_space.element_shape)

    # collect (modifier, head, phrase) training triples from the phrase rows
    all_data = []
    for phrase in my_per_space._row2id:
        feature = DocumentFeature.from_string(phrase.replace(' ', '_'))
        # make sure there are only NPs here
        if feature.type not in baroni_training_phrase_types:
            continue
        adj, noun = phrase.split('_')
        all_data.append((adj, noun, '%s_%s' % (adj, noun)))

    # train a composition model on the data and save it
    baroni = LexicalFunction(min_samples=baroni_threshold,
                             learner=RidgeRegressionLearner())
    guevara = FullAdditive(learner=RidgeRegressionLearner())
    trained_outputs = ((baroni, baroni_output_path),
                       (guevara, guevara_output_path))
    for composer, out_path in trained_outputs:
        composer.train(all_data, my_space, my_per_space)
        io_utils.save(composer, out_path)
        logging.info('Saved trained composer to %s', out_path)
def train_grefenstette_multistep_composer(all_vectors_file, root_dir):
    """
    Train Grefenstette et al's multistep regression VO/SVO model.
    Adapted from dissect's ex19.py.

    Two LexicalFunction models are trained and saved under root_dir:
    step 1 (VO N -> SVO) goes to vo_comp.pkl and step 2 (V N -> VO, using
    the function space learned in step 1) goes to svo_comp.pkl.

    :param all_vectors_file: file containing N, V, VO and SVO vectors
    :param root_dir: where to write temp files and output
    """
    mkdirs_if_not_exists(root_dir)
    vo_composer_output_file = join(root_dir, 'vo_comp.pkl')
    svo_composer_output_file = join(root_dir, 'svo_comp.pkl')

    # Only the noun and SVO subsets are exported to temp files.
    filename = basename(all_vectors_file)
    noun_events_file = join(root_dir, '%s-onlyN.tmp' % filename)
    svo_events_file = join(root_dir, '%s-onlySVO.tmp' % filename)

    def is_noun_unigram(entry):
        return entry.type == '1-GRAM' and entry.tokens[0].pos == 'N'

    def is_svo(entry):
        return entry.type == 'SVO'

    def sm_space(prefix):
        # Sparse-format space from <prefix>.sm and <prefix>.cols.
        return Space.build(data=prefix + '.sm', cols=prefix + '.cols',
                           format="sm")

    # this has unigrams and observed phrases
    thes = Vectors.from_tsv(all_vectors_file)
    thes.to_tsv(noun_events_file, entry_filter=is_noun_unigram)
    _translate_byblo_to_dissect(noun_events_file)
    thes.to_tsv(svo_events_file, entry_filter=is_svo)
    _translate_byblo_to_dissect(svo_events_file)

    # Training triples are (function word, argument word, phrase).
    train_vo_data, train_v_data = [], []
    for phrase in thes.keys():
        df = DocumentFeature.from_string(phrase)
        phrase_type = df.type
        if phrase_type == 'SVO':
            train_vo_data.append((str(df[1:]), str(df[0]), str(df)))
        elif phrase_type == 'VO':
            train_v_data.append((str(df[0]), str(df[1]), str(df)))

    # load N and SVO spaces
    n_space = sm_space(noun_events_file)
    svo_space = sm_space(svo_events_file)

    logging.info("Input SVO training space:")
    logging.info(svo_space.id2row)

    # 1. train a model to learn VO functions on train data: VO N -> SVO
    logging.info("Step 1 training")
    # Gref et al 2013, section 5 says 3
    vo_model = LexicalFunction(learner=RidgeRegressionLearner(),
                               min_samples=2)
    vo_model.train(train_vo_data, n_space, svo_space)
    io_utils.save(vo_model, vo_composer_output_file)

    # 2. train a model to learn V functions on train data: V N -> VO
    # where VO space: function space learned in step 1
    logging.info("Step 2 training")
    v_model = LexicalFunction(learner=RidgeRegressionLearner(),
                              min_samples=2)
    v_model.train(train_v_data, n_space, vo_model.function_space)
    io_utils.save(v_model, svo_composer_output_file)
from tree.semantic_node import SemanticNode
from tree.syntactic_tree import SyntacticTree
from composes.semantic_space.space import Space
from examples import test_vector_file_prefix, test_matrix_file_prefix


def _dense_space(prefix):
    """Build a dense-format space from <prefix>.dm and <prefix>.rows."""
    return Space.build(data=prefix + ".dm", rows=prefix + ".rows",
                       format="dm")


# FIRST TEST
# A single-leaf CCG parse for the word "dog".
xml_string = ''' <ccg> <lf start="0" span="1" word="dog" lemma="dog" pos="NN" chunk="I-NP" entity="O" cat="N" /> </ccg> '''
syntactic_tree = SyntacticTree.parse_tree_from_xml_string(xml_string)

vecfilepref = test_vector_file_prefix
matfilepref = test_matrix_file_prefix
vecspace = _dense_space(vecfilepref)
matspace = _dense_space(matfilepref)

semnode = SemanticNode.create_semantic_node(syntactic_tree.root, None)
# NOTE(review): Papfunc_SemanticNode is not imported in the visible part of
# this script -- confirm where it comes from.
papnode = Papfunc_SemanticNode.create_papfunc_node(semnode, vecspace, matspace)

print("*****")
print("Syntactic tree: %s" % (semnode,))
print("Symbolic representation: %s" % (papnode._matrep,))
print("Numeric representation:")
for x in papnode._numrep:
    print(x)

# SECOND TEST
def test_simple_ops(self):
    """Check build_core_space output for feature selection followed by SVD.

    build_core_space is invoked once with short options on "dm" input
    (including normalization variants) and once with long options on "sm"
    input; the dense outputs are compared with reference values up to sign.
    """
    # Reference matrices; presumably computed for
    # x = [[1,2,3],[2,4,6],[4,675,43]] -- TODO confirm.
    us = np.mat([[ 2.19272110e+00, 3.03174768e+00],
                 [ 4.38544220e+00, 6.06349536e+00],
                 [ 6.76369708e+02, -4.91431927e-02]])
    us2 = np.mat([[ 2.19426495e+00, 3.16751379e+00],
                  [ 4.38703714e+00, 6.14112794e+00],
                  [ 6.76380808e+02, -5.01074549e-02]])

    # Output name fragment for each numbered space s1..s6.
    fragments = {1: "top_sum_3.svd_2", 2: "top_sum_3.svd_1",
                 3: "top_length_3.svd_2", 4: "top_length_3.svd_1",
                 5: "top_sum_4.svd_2", 6: "top_sum_4.svd_1"}

    def read_space(fragment):
        return Space.build(
            data=self.dir_ + "CORE_SS.mat3.raw." + fragment + ".dm",
            format="dm")

    def read_numbered():
        # Load s1..s6 in ascending order.
        return dict((idx, read_space(fragments[idx]))
                    for idx in sorted(fragments))

    def assert_abs_close(space, reference):
        np.testing.assert_array_almost_equal(
            abs(space.cooccurrence_matrix.mat), abs(reference), 2)

    def assert_reference_values(sp):
        assert_abs_close(sp[1], us)
        assert_abs_close(sp[2], us[:,0:1])
        assert_abs_close(sp[5], us2)
        assert_abs_close(sp[6], us2[:,0:1])
        self._test_equal_spaces_structs(sp[3], sp[5])
        self._test_equal_spaces_structs(sp[2], sp[6])

    short_option_args = ["build_core_space.py",
                         "-l", self.dir_ + "log1.txt",
                         "-i", self.dir_ + "mat3",
                         "-w", "raw",
                         "-s", "top_sum_3,top_length_3,top_sum_4",
                         "-r", "svd_2,svd_1",
                         "-n", "none,all,row",
                         "-o", self.dir_,
                         "--input_format", "dm",
                         "--output_format", "dm"]
    bcs.main(short_option_args)

    sp = read_numbered()
    s7 = read_space("top_sum_4.svd_1.all")
    s8 = read_space("top_sum_4.svd_1.row")
    # Applying the normalizations here must reproduce the files on disk.
    s9 = sp[6].apply(Normalization())
    s10 = sp[6].apply(RowNormalization())

    self._test_equal_spaces_dense(sp[1], sp[3])
    self._test_equal_spaces_dense(sp[2], sp[4])
    self._test_equal_spaces_dense(s7, s9)
    self._test_equal_spaces_dense(s8, s10)
    assert_reference_values(sp)

    long_option_args = ["build_core_space.py",
                        "-l", self.dir_ + "log1.txt",
                        "-i", self.dir_ + "mat3",
                        "--weighting", "raw",
                        "--selection", "top_sum_3,top_length_3,top_sum_4",
                        "--reduction", "svd_2,svd_1",
                        "-o", self.dir_,
                        "--input_format", "sm",
                        "--output_format", "dm"]
    bcs.main(long_option_args)

    sp = read_numbered()
    self._test_equal_spaces_dense(sp[1], sp[3])
    self._test_equal_spaces_dense(sp[2], sp[4])
    assert_reference_values(sp)
from composes.semantic_space.space import Space
from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.scaling.row_normalization import RowNormalization
import sys

# The first command-line argument names the data set; inputs are read from
# ../data/<name>.* and outputs are written to ../spaces/<name>*.
_source_prefix = "../data/" + sys.argv[1]
_target_prefix = "../spaces/" + sys.argv[1]

# create a space from co-occurrence counts in sparse format
my_space = Space.build(data=_source_prefix + ".sm",
                       rows=_source_prefix + ".rows",
                       cols=_source_prefix + ".cols",
                       format="sm")

# apply PpmiWeighting, then RowNormalization
for _transformation in (PpmiWeighting(), RowNormalization()):
    my_space = my_space.apply(_transformation)

# export the space in dense format and pkl format
my_space.export(_target_prefix, format="dm")
io_utils.save(my_space, _target_prefix + ".pkl")
# Training data 1: VO N -> SVO
# (VO function word, noun argument, resulting SVO phrase)
train_vo_data = [
    ("hate_boy", "man", "man_hate_boy"),
    ("hate_man", "man", "man_hate_man"),
    ("hate_boy", "boy", "boy_hate_boy"),
    ("hate_man", "boy", "boy_hate_man"),
]

# Training data 2: V N -> VO
# (V function word, noun argument, resulting VO phrase)
train_v_data = [
    ("hate", "man", "hate_man"),
    ("hate", "boy", "hate_boy"),
]

# Load N and SVO spaces (sparse format; data and cols files only).
_ex19_spaces = dict(
    (stem, Space.build(data="./data/in/" + stem + ".sm",
                       cols="./data/in/" + stem + ".cols",
                       format="sm"))
    for stem in ("ex19-n", "ex19-svo"))
n_space = _ex19_spaces["ex19-n"]
svo_space = _ex19_spaces["ex19-svo"]

print("\nInput SVO training space:")
print(svo_space.id2row)
print(svo_space.cooccurrence_matrix)

# 1. train a model to learn VO functions on train data: VO N -> SVO
print("\nStep 1 training")
vo_model = LexicalFunction(learner=LstsqRegressionLearner())
vo_model.train(train_vo_data, n_space, svo_space)
import sys
import os

# Make the local DISSECT checkout importable.
folder = os.path.expandvars('/home/luka/Downloads/dissect-master/src')
if folder not in sys.path:
    sys.path.append(folder)

from composes.semantic_space.space import Space

# Sparse co-occurrence matrix with its row and column label files.
_cooccurrence_dir = "/home/luka/ThLi/cooccurrence/"
lassy_space = Space.build(data=_cooccurrence_dir + "spm1.sm",
                          rows=_cooccurrence_dir + "rows1.rows",
                          cols=_cooccurrence_dir + "cols1.cols",
                          format="sm")

#%%
from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting

# Replace the raw space by its PpmiWeighting-transformed version.
lassy_space = lassy_space.apply(PpmiWeighting())
from composes.semantic_space.space import Space
from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.scaling.row_normalization import RowNormalization
from composes.transformation.dim_reduction.svd import Svd
import sys

# The first command-line argument names the data set.
_in_prefix = "../data/" + sys.argv[1]
_out_prefix = "../spaces/" + sys.argv[1]

# create a space from co-occurrence counts in sparse format
_inputs = {"data": _in_prefix + ".sm",
           "rows": _in_prefix + ".rows",
           "cols": _in_prefix + ".cols"}
my_space = Space.build(format="sm", **_inputs)

# weighting and normalization, then svd reduction to 1500 dimensions
my_space = my_space.apply(PpmiWeighting())
my_space = my_space.apply(RowNormalization())
my_space = my_space.apply(Svd(1500))

# export the space in dense format and pkl format
my_space.export(_out_prefix, format="dm")
io_utils.save(my_space, _out_prefix + ".pkl")
#Convert .dm file to .pkl
#Usage: python dm2pkl bnc.dm
import os
import sys

from composes.semantic_space.space import Space
from composes.utils import io_utils

# Fail with the usage line instead of an IndexError when no file is given.
if len(sys.argv) < 2:
    sys.exit("Usage: python dm2pkl bnc.dm")

dm_path = sys.argv[1]
space = Space.build(data=dm_path, format='dm')

# Drop the real extension rather than blindly cutting three characters, so
# inputs that do not end in ".dm" still get a sensible "<name>.pkl".
# For "<name>.dm" the result is the same as before.
name = os.path.splitext(dm_path)[0]
io_utils.save(space, name + ".pkl")