Example #1
    def test_simple_sparse_zipped(self):

        bcs.main(["build_core_space.py",
                  "-l", self.dir_ + "log1.txt",
                  "-i", self.dir_ + "mat1",
                  "-o", self.dir_,
                  "--input_format", "sm",
                  "--output_format", "sm",
                  "--gz", "True"
                  ])

        s1 = Space.build(data=self.dir_ + "mat1.sm.gz",
                         cols=self.dir_ + "mat1.cols",
                         format="sm")

        s2 = Space.build(data=self.dir_ + "CORE_SS.mat1.sm",
                         cols=self.dir_ + "CORE_SS.mat1.cols",
                         format="sm")
        s3 = io_utils.load(self.dir_ + "CORE_SS.mat1.pkl", Space)
        s4 = Space.build(data=self.dir_ + "mat1.sm",
                         cols=self.dir_ + "mat1.cols",
                         format="sm")

        self._test_equal_spaces_sparse(s1, s2)
        self._test_equal_spaces_sparse(s1, s3)
        self._test_equal_spaces_sparse(s1, s4)
Example #2
    def test_build_data(self):

        test_cases = [("data1",["red", "blue"], ["car", "man"],
                       np.mat([[3,5],[0,10]]), np.mat([[3,5],[0,10]])),
                      ("data2",["red"], ["car"],
                       np.mat([[3]]), np.mat([[3]])),
                      ("data3",["red", "blue"], ["car", "man"],
                       np.mat([[15,0],[0,6]]), np.mat([[5,0],[0,6]])),
                      ("data7",["red"], ["car"], np.mat([[0]]), np.mat([[0]])),
                      ("data9",["man"], ["car"], np.mat([[4]]), None),
                      ]
        for data_file, rows, cols, smat, dmat in test_cases:
            data_file1 = self.dir_ + data_file + ".sparse"

            sp = Space.build(data=data_file1,
                             cols= self.dir_ + data_file + ".cols",
                             format="sm")
            self.assertListEqual(rows, sp.id2row)
            self.assertListEqual(cols, sp.id2column)

            self.assertIsInstance(sp.cooccurrence_matrix, SparseMatrix)
            np.testing.assert_array_equal(smat,
                                          sp.cooccurrence_matrix.mat.todense())

            data_file2 = self.dir_ + data_file + ".dense"
            if not dmat is None:
                sp = Space.build(data=data_file2, format="dm")
                self.assertListEqual(rows, sp.id2row)
                self.assertListEqual([], sp.id2column)

                self.assertIsInstance(sp.cooccurrence_matrix, DenseMatrix)
                np.testing.assert_array_equal(dmat, sp.cooccurrence_matrix.mat)
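The test above exercises DISSECT's two plain-text input formats. For orientation, here is a minimal, self-contained sketch (file names are illustrative, delimiters follow the DISSECT tutorial data) that writes tiny inputs in both formats and rebuilds them with Space.build: a sparse .sm file holds one tab-separated "row column count" triple per line and needs a companion .cols label file, while a dense .dm file holds a row label followed by its vector values and needs no label files.

from composes.semantic_space.space import Space

# sparse format: tab-separated "row column count" triples + .cols labels
with open("toy.sm", "w") as f:
    f.write("red\tcar\t3\nred\tman\t5\nblue\tman\t10\n")
with open("toy.cols", "w") as f:
    f.write("car\nman\n")
sparse_space = Space.build(data="toy.sm", cols="toy.cols", format="sm")

# dense format: row label followed by its vector; no .cols file required
with open("toy.dm", "w") as f:
    f.write("red\t3\t5\nblue\t0\t10\n")
dense_space = Space.build(data="toy.dm", format="dm")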
Example #3
def eval_on_file(path_composed_emb, path_observed_emb, save_path):
    raw_observed_space = Space.build(data=path_observed_emb, format='dm')
    observed_space = raw_observed_space.apply(RowNormalization('length'))
    observed_words = observed_space.get_id2row()
    print("Observed words, size: " + str(len(observed_words)) + ", first:")
    print(observed_words[:10])
    observed_words_set = set(observed_words)

    raw_composed_space = Space.build(data=path_composed_emb, format='dm')
    composed_space = raw_composed_space.apply(RowNormalization('length'))
    composed_words = composed_space.get_id2row()
    print("Composed words, size: " + str(len(composed_words)) + ", first:")
    print(composed_words[:10])

    # all composed words should be in the initial space
    for idx, word in enumerate(composed_words):
        assert (word in observed_words_set)

    q1, q2, q3, ranks = evaluateRank(composed_words, composed_space,
                                     observed_space)
    print("Q1: " + str(q1) + ", Q2: " + str(q2) + ", Q3: " + str(q3))

    printDictToFile(ranks, save_path + '_rankedCompounds.txt')

    sortedRanks = sorted(ranks.values())
    printListToFile(sortedRanks, save_path + '_ranks.txt')
    logResult(q1, q2, q3, save_path + '_quartiles.txt')

    return q1, q2, q3, ranks
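evaluateRank, printDictToFile, printListToFile, and logResult are helpers local to this project. Judging from the call site, evaluateRank presumably ranks each word's own observed vector among the observed-space neighbours of its composed vector and returns the rank quartiles; the following is a hypothetical sketch under that assumption, not the project's actual code:

import numpy as np
from composes.similarity.cos import CosSimilarity

def evaluate_rank_sketch(words, composed_space, observed_space):
    # Hypothetical: rank of each word's own observed vector among the
    # observed-space neighbours of its composed vector (rank 1 = best).
    ranks = {}
    n = len(observed_space.get_id2row())
    for word in words:
        neighbours = composed_space.get_neighbours(word, n, CosSimilarity(),
                                                   space2=observed_space)
        ranks[word] = [w for w, _ in neighbours].index(word) + 1
    q1, q2, q3 = np.percentile(sorted(ranks.values()), [25, 50, 75])
    return q1, q2, q3, ranks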
Example #4
    def test_simple_dense(self):

        bcs.main([
            "build_core_space.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "mat2", "-o", self.dir_, "--input_format", "dm",
            "--output_format", "dm"
        ])

        s1 = Space.build(data=self.dir_ + "mat2.dm", format="dm")
        s2 = Space.build(data=self.dir_ + "CORE_SS.mat2.dm", format="dm")
        s3 = io_utils.load(self.dir_ + "CORE_SS.mat2.pkl", Space)

        self._test_equal_spaces_dense(s1, s2)
        self._test_equal_spaces_dense(s1, s3)

        bcs.main([
            "build_core_space.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "CORE_SS.mat2", "-o", self.dir_, "--input_format",
            "pkl", "--output_format", "dm"
        ])

        s1 = io_utils.load(self.dir_ + "CORE_SS.CORE_SS.mat2.pkl", Space)
        s3 = io_utils.load(self.dir_ + "CORE_SS.mat2.pkl", Space)

        self._test_equal_spaces_dense(s1, s3)
Example #5
def train_from_core(lexical_space_file, an_dn_file, pn_file, sv_file, vo_file, output_file_prefix):
    
    if (not exists(lexical_space_file) or not exists(pn_file) or not exists(sv_file)
        or not exists(vo_file) or not exists(an_dn_file)):
        print "some file doesn't exist"
        print lexical_space_file, an_dn_file, pn_file, sv_file, vo_file
    
    print "load core"
    core_space = Space.build(data=lexical_space_file, format="dm")
    print "load an dn"
    
    an_dn_space = Space.build(data=an_dn_file, format="dm")
    print "load pn"
    pn_space = Space.build(data=pn_file, format="dm")
    print "load sv"
    sv_space = Space.build(data=sv_file, format="dm")
    print "load vo"
    vo_space = Space.build(data=vo_file, format="dm")
    
    print "start training"
    all_mat_space_normed = train_all_spaces(core_space, an_dn_space,
                                            pn_space, sv_space, vo_space)
    print "exporting trained file"
    all_mat_space_normed.export(output_file_prefix, format="dm")
    del all_mat_space_normed
    print "DONE"
Example #6
    def test_simple_ops(self):

        bcs.main(["build_core_space.py",
                  "-l", self.dir_ + "log1.txt",
                  "-i", self.dir_ + "mat3",
                  "-w", "raw",
                  "-s", "top_sum_3,top_length_3,top_sum_4",
                  "-r", "svd_2,svd_1",
                  "-o", self.dir_,
                  "--input_format", "dm",
                  "--output_format", "dm"
                  ])

        core_mats = ["CORE_SS.mat3.raw.top_sum_3.svd_2",
                     "CORE_SS.mat3.raw.top_sum_3.svd_1",
                     "CORE_SS.mat3.raw.top_length_3.svd_2",
                     "CORE_SS.mat3.raw.top_length_3.svd_1",
                     "CORE_SS.mat3.raw.top_sum_4.svd_2",
                     "CORE_SS.mat3.raw.top_sum_4.svd_1"
                     ]

        core_spaces = [Space.build(data=self.dir_ + suffix + ".dm", format="dm") for suffix in core_mats]

        for i, core_mat in enumerate(core_mats):
            bps.main(["build_peripheral_space.py",
                      "-l", self.dir_ + "log1.txt",
                      "-i", self.dir_ + "mat3",
                      "-o", self.dir_,
                      "-c", self.dir_ + core_mat + ".pkl",
                      "--input_format", "dm",
                      "--output_format", "dm"
                      ])

            s1 = core_spaces[i]
            data_file = self.dir_ + "PER_SS.mat3." + core_mats[i] + ".dm"
            s2 = Space.build(data=data_file, format="dm")
            self._test_equal_spaces_dense(s1, s2)

            bps.main(["build_peripheral_space.py",
                      "-l", self.dir_ + "log1.txt",
                      "-i", self.dir_ + "mat3",
                      "-o", self.dir_,
                      "-c", self.dir_ + core_mat + ".pkl",
                      "--input_format", "sm",
                      "--output_format", "dm"
                      ])

            s1 = core_spaces[i]
            data_file = self.dir_ + "PER_SS.mat3." + core_mats[i] + ".dm"
            s2 = Space.build(data=data_file, format="dm")

            self._test_equal_spaces_dense(s1, s2)
Example #7
    def test_simple_dense(self):
        bps.main(["build_peripheral_space.py",
                  "-l", self.dir_ + "log1.txt",
                  "-i", self.dir_ + "mat2",
                  "-o", self.dir_,
                  "-c", self.dir_ + "CORE_SS.mat2.pkl",
                  "--input_format", "dm",
                  "--output_format", "dm"
                  ])
        s1 = Space.build(data=self.dir_ + "mat2.dm", format="dm")
        s2 = Space.build(data=self.dir_ + "PER_SS.mat2.CORE_SS.mat2.dm", format="dm")

        self._test_equal_spaces_dense(s1, s2)
Example #8
    def test_simple_lstsq_no_inter(self):
        tc.main([
            "train_composition.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "an_train_data.txt", "-o", self.dir_, "-m",
            "lexical_func", "-p", self.dir_ + "CORE_SS.AN_mat.pkl", "-a",
            self.dir_ + "CORE_SS.N_mat.pkl", "-r", "lstsq", "--intercept",
            "False", "--export_params", "True"
        ])

        trained = io_utils.load(
            self.dir_ +
            "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
        new_space = trained.function_space
        np.testing.assert_array_almost_equal(new_space.cooccurrence_matrix.mat,
                                             np.mat([1, 0, 0, 1]), 10)
        self.assertTupleEqual(new_space.element_shape, (2, 2))
        self.assertListEqual(new_space.id2row, ["big"])
        self.assertListEqual(new_space.id2column, [])

        a_space = Space.build(
            data=self.dir_ +
            "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm",
            format="dm")

        self._test_equal_spaces_dense(a_space, new_space)

        tc.main([
            "train_composition.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "an_train_data.txt", "-o", self.dir_, "-m",
            "lexical_func", "-p", self.dir_ + "CORE_SS.AN_mat.pkl", "-a",
            self.dir_ + "CORE_SS.N_mat.pkl", "-r", "ridge", "--lambda", "0",
            "--crossvalidation", "False", "--intercept", "False",
            "--export_params", "True"
        ])

        trained = io_utils.load(
            self.dir_ +
            "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
        new_space2 = trained.function_space
        np.testing.assert_array_almost_equal(
            new_space2.cooccurrence_matrix.mat, np.mat([1, 0, 0, 1]), 10)
        self.assertTupleEqual(new_space2.element_shape, (2, 2))
        self.assertListEqual(new_space2.id2row, ["big"])
        self.assertListEqual(new_space2.id2column, [])

        a_space = Space.build(
            data=self.dir_ +
            "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm",
            format="dm")

        self._test_equal_spaces_dense(a_space, new_space2)
Example #9
    def test_as_conversion_tool(self):

        bcs.main([
            "build_core_space.py", "-i", self.dir_ + "mat3", "-o", self.dir_,
            "--input_format", "sm", "--output_format", "sm"
        ])

        s1 = Space.build(data=self.dir_ + "mat3.sm",
                         cols=self.dir_ + "mat3.cols",
                         format="sm")
        s2 = Space.build(data=self.dir_ + "CORE_SS.mat3.sm",
                         rows=self.dir_ + "CORE_SS.mat3.rows",
                         cols=self.dir_ + "CORE_SS.mat3.cols",
                         format="sm")
        s3 = io_utils.load(self.dir_ + "CORE_SS.mat3.pkl", Space)

        self._test_equal_spaces_sparse(s1, s2)
        self._test_equal_spaces_sparse(s1, s3)

        bcs.main([
            "build_core_space.py", "-i", self.dir_ + "mat3", "-o", self.dir_,
            "--input_format", "sm", "--output_format", "dm"
        ])

        s1 = Space.build(data=self.dir_ + "mat3.dm",
                         cols=self.dir_ + "CORE_SS.mat3.cols",
                         format="dm")
        s2 = Space.build(data=self.dir_ + "CORE_SS.mat3.dm",
                         rows=self.dir_ + "CORE_SS.mat3.rows",
                         cols=self.dir_ + "CORE_SS.mat3.cols",
                         format="dm")
        s3 = io_utils.load(self.dir_ + "CORE_SS.mat3.pkl", Space)

        self._test_equal_spaces_dense(s1, s2)
        s3.to_dense()
        self._test_equal_spaces_dense(s1, s3)

        bcs.main([
            "build_core_space.py", "-i", self.dir_ + "mat3", "-o", self.dir_,
            "--input_format", "dm", "--output_format", "dm"
        ])

        s1 = Space.build(data=self.dir_ + "CORE_SS.mat3.dm",
                         cols=self.dir_ + "CORE_SS.mat3.cols",
                         format="dm")
        s3 = io_utils.load(self.dir_ + "CORE_SS.mat3.pkl", Space)

        s3.to_dense()
        self._test_equal_spaces_dense(s1, s3)
Example #10
def test_to_dissect_sparse_files(vectors_c, tmpdir):
    """

    :type vectors_c: Thesaurus
    :type tmpdir: py.path.local
    """
    from composes.semantic_space.space import Space

    prefix = str(tmpdir.join('output'))
    vectors_c.to_dissect_sparse_files(prefix)
    # check that files are there
    for suffix in ['sm', 'rows', 'cols']:
        outfile = '{}.{}'.format(prefix, suffix)
        assert os.path.exists(outfile)
        assert os.path.isfile(outfile)

    # check that reading the files in results in the same matrix
    space = Space.build(data="{}.sm".format(prefix),
                        rows="{}.rows".format(prefix),
                        cols="{}.cols".format(prefix),
                        format="sm")

    matrix, rows, cols = space.cooccurrence_matrix.mat, space.id2row, space.id2column
    exp_matrix, exp_cols, exp_rows = vectors_c.to_sparse_matrix()

    assert exp_cols == cols
    assert exp_rows == rows
    assert_array_equal(exp_matrix.A, matrix.A)
    _assert_matrix_of_thesaurus_c_is_as_expected(matrix.A, rows, cols)
    _assert_matrix_of_thesaurus_c_is_as_expected(exp_matrix.A, exp_rows, exp_cols)
Example #11
def build_raw_per_space(in_file_prefix, in_format, is_gz):

    if in_format not in ("sm", "dm", "pkl"):
        raise ValueError("Invalid input format: %s" % in_format)

    data_file = '%s.%s' % (in_file_prefix, in_format)

    if in_format == "pkl":
        space = io_utils.load(data_file, Space)

    else:
        if is_gz:
            data_file = '%s.gz' % data_file
        row_file = '%s.rows' % (in_file_prefix)
        column_file = '%s.cols' % (in_file_prefix)
        if not os.path.exists(row_file):
            row_file = None
        if not os.path.exists(column_file):
            if in_format == "sm":
                raise ValueError("Column file: %s needs to be provided!" %
                                 column_file)
            column_file = None
        print("Building matrix...")
        space = Space.build(data=data_file,
                            rows=row_file,
                            cols=column_file,
                            format=in_format)

    return space
Example #12
    def test_build_data_row_col(self):
        test_cases = [("data1", "row1.row", "col1.col", ["red"], ["man", "car"],
                       np.mat([[5,3]]), np.mat([[3,5]])),
                      ("data1", "row1.row", "col5.col", ["red"], ["man", "car"],
                       np.mat([[5,3]]), np.mat([[3,5]])),
                      ("data3", "row2.row", "col2.col", ["blue", "red"], ["car"],
                       np.mat([[0],[15]]), None),
                      ("data2", "row1.row","col1.col", ["red"], ["man","car"],
                       np.mat([[0,3]]), None),
                      ("data3", "row3.row", "col3.col", ["blue", "red"], ["man", "car"],
                       np.mat([[6,0],[0,15]]), np.mat([[0,6],[5,0]])),
                      ("data7", "row2.row", "col3.col", ["blue", "red"], ["man", "car"],
                       np.mat([[0,0],[0,0]]), None),
                      ("data3", "row2.row", "col4.col", ["blue", "red"], ["airplane"],
                       np.mat([[0],[0]]), None)
                      ]

        for data_file, row_file, col_file, rows, cols, smat, dmat in test_cases:
            row_file = self.dir_ + row_file
            col_file = self.dir_ + col_file

            data_file1 = self.dir_ + data_file + ".sparse"

            if smat is None:
                self.assertRaises(ValueError, Space.build, data=data_file1,
                                  rows=row_file, cols=col_file, format="sm")

            else:
                sp = Space.build(data=data_file1, rows=row_file,
                                 cols=col_file, format="sm")
                self.assertListEqual(rows, sp.id2row)
                self.assertListEqual(cols, sp.id2column)

                self.assertIsInstance(sp.cooccurrence_matrix, SparseMatrix)
                np.testing.assert_array_equal(smat,
                                              sp.cooccurrence_matrix.mat.todense())

            data_file2 = self.dir_ + data_file + ".dense"

            if dmat is None:
                self.assertRaises(ValueError, Space.build, data=data_file2,
                                  rows=row_file, cols=col_file, format="dm")

            else:
                sp = Space.build(data=data_file2, rows=row_file,
                                 cols=col_file, format="dm")
                self.assertListEqual(rows, sp.id2row)
                self.assertListEqual(cols, sp.id2column)

                self.assertIsInstance(sp.cooccurrence_matrix, DenseMatrix)
                np.testing.assert_array_equal(dmat, sp.cooccurrence_matrix.mat)
Example #13
    def test_simple_sparse(self):

        bcs.main([
            "build_core_space.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "mat1", "-o", self.dir_, "--input_format", "sm",
            "--output_format", "sm"
        ])

        s1 = Space.build(data=self.dir_ + "mat1.sm",
                         cols=self.dir_ + "mat1.cols",
                         format="sm")
        s2 = Space.build(data=self.dir_ + "CORE_SS.mat1.sm",
                         cols=self.dir_ + "CORE_SS.mat1.cols",
                         format="sm")
        s3 = io_utils.load(self.dir_ + "CORE_SS.mat1.pkl", Space)

        self._test_equal_spaces_sparse(s1, s2)
        self._test_equal_spaces_sparse(s1, s3)
Example #14
    def test_simple_sparse(self):

        bps.main(["build_peripheral_space.py",
                  "-l", self.dir_ + "log1.txt",
                  "-i", self.dir_ + "mat1",
                  "-o", self.dir_,
                  "-c", self.dir_ + "CORE_SS.mat1.pkl",
                  "--input_format", "sm",
                  "--output_format", "sm"
                  ])

        s1 = Space.build(data=self.dir_ + "mat1.sm",
                         cols=self.dir_ + "mat1.cols",
                         format="sm")
        s2 = Space.build(data=self.dir_ + "PER_SS.mat1.CORE_SS.mat1.sm",
                         cols=self.dir_ + "PER_SS.mat1.CORE_SS.mat1.cols",
                         format="sm")

        self._test_equal_spaces_sparse(s1, s2)
Example #15
    def test_simple_nmf(self):

        bcs.main([
            "build_core_space.py", "-l", self.dir_ + "log_nmf.txt", "-i",
            self.dir_ + "mat3", "-w", "raw", "-r", "nmf_2", "-o", self.dir_,
            "--input_format", "dm", "--output_format", "dm"
        ])

        s1 = Space.build(data=self.dir_ + "CORE_SS.mat3.raw.nmf_2.dm",
                         format="dm")
        self.assertEqual(s1.cooccurrence_matrix.mat.shape, (3, 2))
Example #16
    def test_simple_load(self):

        #trained = io_utils.load(self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
        #new_space = trained.function_space

        ac.main([
            "apply_composition.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "an_train_data.txt", "-o", self.dir_, "--load_model",
            self.dir_ +
            "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl", "-a",
            self.dir_ + "CORE_SS.N_mat.pkl", "--output_format", "dm"
        ])

        sp1 = Space.build(data=self.dir_ +
                          "COMPOSED_SS.LexicalFunction.an_train_data.txt.dm",
                          format="dm")

        sp2 = Space.build(data=self.dir_ + "AN_mat.dm", format="dm")

        self._test_equal_spaces_dense(sp1, sp2)
Example #17
def build_unigram_space():
    unigram_space = Space.build(data=args.function[3],
                                rows=args.function[2],
                                cols=args.function[1],
                                format="sm")

    ppmi_space = ppmi(unigram_space)
    ppmi_norm_space = norm(ppmi_space)
    ppmi_norm_svd_space = svd(ppmi_norm_space)

    save_space(ppmi_norm_svd_space, "unigrams_space")
    return ppmi_norm_svd_space
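ppmi, norm, svd, and save_space are helpers defined elsewhere in this script. A plausible minimal sketch of them in terms of DISSECT's standard transformations (an assumption for illustration, not the project's actual code):

from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.scaling.normalization import Normalization
from composes.transformation.dim_reduction.svd import Svd
from composes.utils import io_utils

def ppmi(space):
    return space.apply(PpmiWeighting())

def norm(space):
    return space.apply(Normalization())

def svd(space, dims=300):  # the dimensionality used here is a guess
    return space.apply(Svd(dims))

def save_space(space, name):
    io_utils.save(space, name + ".pkl")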
Example #18
    def write_pkl(self):
        """
        Create spaces from co-occurrence counts in sparse format (.sm)
        """

        # For direction DE-EN
        my_space_1 = Space.build(
            data=OUTPUT_FILE_DE_DE_EN_SM, rows=OUTPUT_FILE_DE_WORDS_ROW, cols=OUTPUT_FILE_DE_EN_WORDS_COL, format="sm"
        )

        # For direction EN-DE
        my_space_2 = Space.build(
            data=OUTPUT_FILE_EN_EN_DE_SM, rows=OUTPUT_FILE_EN_WORDS_ROW, cols=OUTPUT_FILE_DE_EN_WORDS_COL, format="sm"
        )

        # Save the space objects in pickle format
        io_utils.save(my_space_1, OUTPUT_FILE_DE_DE_EN_PKL)
        io_utils.save(my_space_2, OUTPUT_FILE_EN_EN_DE_PKL)

        print >> stderr, "Pickle file 1 written out:", OUTPUT_FILE_DE_DE_EN_PKL
        print >> stderr, "Pickle file 2 written out:", OUTPUT_FILE_EN_EN_DE_PKL
Example #19
def inspect_representations(path_composed_emb, output_path):
    print('Inspecting representations...')
    composed_space = Space.build(data=path_composed_emb, format='dm')
    f = codecs.open(output_path, 'w', 'utf8')
    word_list = [w for w in composed_space.get_row2id()]
    for j, w in enumerate(word_list):
        if j < 1000:
            neighbours = composed_space.get_neighbours(w, 10, CosSimilarity())

            f.write('Neighbours for ' + w + '\n')
            f.write("\n".join('%s %.6f' % x for x in neighbours))
            f.write('\n----------------------------\n')
    f.close()
Example #20
    def test_build_data_row(self):
        test_cases = [("data1", "row1.row", ["red"], ["car", "man"],
                       np.mat([[3,5]]), np.mat([[3,5]])),
                      ("data2", "row1.row",["red"], ["car"],
                       np.mat([[3]]), np.mat([[3]])),
                      ("data3", "row2.row", ["blue", "red"], ["car", "man"],
                       np.mat([[0,6],[15,0]]), np.mat([[0,6],[5,0]])),
                      ("data3", "row3.row", ["blue", "red"], ["car", "man"],
                       np.mat([[0,6],[15,0]]), np.mat([[0,6],[5,0]])),
                      ("data7", "row2.row", ["blue", "red"], ["car"],
                       np.mat([[0],[0]]), np.mat([[0],[0]])),
                      ]

        for data_file, row_file, rows, cols, smat, dmat in test_cases:
            row_file = self.dir_ + row_file

            data_file1 = self.dir_ + data_file + ".sparse"

            sp = Space.build(data=data_file1,
                             rows=row_file,
                             cols=self.dir_ + data_file + ".cols",
                             format="sm")
            self.assertListEqual(rows, sp.id2row)
            self.assertListEqual(cols, sp.id2column)

            self.assertIsInstance(sp.cooccurrence_matrix, SparseMatrix)
            np.testing.assert_array_equal(smat,
                                          sp.cooccurrence_matrix.mat.todense())

            data_file2 = self.dir_ + data_file + ".dense"

            sp = Space.build(data=data_file2, rows=row_file, format="dm")
            self.assertListEqual(rows, sp.id2row)
            self.assertListEqual([], sp.id2column)

            self.assertIsInstance(sp.cooccurrence_matrix, DenseMatrix)
            np.testing.assert_array_equal(dmat, sp.cooccurrence_matrix.mat)
Example #21
def build_spaces(in_file_prefix, in_format, out_dir, out_format, weightings,
                 selections, reductions, normalizations, is_gz):

    in_file_descr = "CORE_SS." + in_file_prefix.split("/")[-1]
    data_file = '%s.%s' % (in_file_prefix, in_format)

    if in_format not in ("sm", "dm", "pkl"):
        raise ValueError("Invalid input format: %s" % in_format)

    if in_format == "pkl":
        space = io_utils.load(data_file, Space)
    else:
        if is_gz:
            data_file = '%s.gz' % data_file
        row_file = '%s.rows' % (in_file_prefix)
        column_file = '%s.cols' % (in_file_prefix)
        if not os.path.exists(row_file):
            row_file = None
        if not os.path.exists(column_file):
            if in_format == "sm":
                raise ValueError("Column file: %s needs to be provided!" %
                                 column_file)
            column_file = None

        print("Building matrix...")
        space = Space.build(data=data_file,
                            rows=row_file,
                            cols=column_file,
                            format=in_format)

    for w in weightings:
        w_space = apply_weighting(space, w)

        for s in selections:
            s_space = apply_selection(w_space, s)

            for r in reductions:
                r_space = apply_reduction(s_space, r)

                for n in normalizations:
                    n_space = apply_normalization(r_space, n)

                    print("Printing...")
                    print_space(n_space, out_dir, [in_file_descr, w, s, r, n],
                                out_format)
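build_spaces composes the four transformation stages as nested loops, writing one space per combination. A hypothetical invocation, with parameter values borrowed from the command lines in the test examples above (the file prefix and output directory are illustrative):

build_spaces(in_file_prefix="/tmp/mat3", in_format="dm",
             out_dir="/tmp/out", out_format="dm",
             weightings=["raw"],
             selections=["top_sum_3", "top_length_3"],
             reductions=["svd_2", "svd_1"],
             normalizations=["none"],
             is_gz=False)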
Example #22
def main():
    """
    Compute the FREQ/PPMI/PLMI matrix from a co-occurrence matrix, as default pickle the raw matrix
    """

    # Get the arguments
    args = docopt(
        '''Compute the FREQ/PPMI/PLMI matrix from a co-occurrence matrix, as default pickle the raw matrix

    Usage:
        create_dsm.py <dsm_prefix> [-p | -l]

        <dsm_prefix> = the prefix for the input files (.sm for the matrix, .rows and .cols) and output files (.ppmi)
    
    Options:
    <none>      weight the matrice entries via FREQUENCY
    -p, --ppmi  weight the matrice entries via PPMI
    -l, --plmi  weight the matrice entries via PLMI
    
    ''')

    dsm_prefix = args['<dsm_prefix>']
    is_ppmi = args['--ppmi']
    is_plmi = args['--plmi']

    postfix = "_freq"

    # Create a space from co-occurrence counts in sparse format
    dsm = Space.build(data=dsm_prefix + '.sm',
                      rows=dsm_prefix + '.rows',
                      cols=dsm_prefix + '.cols',
                      format='sm')

    if is_ppmi:
        # Apply ppmi weighting
        dsm = dsm.apply(PpmiWeighting())
        postfix = "_ppmi"
    elif is_plmi:
        # Apply plmi weighting
        dsm = dsm.apply(PlmiWeighting())
        postfix = "_plmi"

    # Save the Space object in pickle format
    save_pkl_files(dsm_prefix + postfix, dsm)
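For reference, the PPMI weighting applied above sets each cell to max(0, log(p(w, c) / (p(w) p(c)))). A minimal NumPy sketch of that computation (illustrative only; DISSECT's PpmiWeighting is the authoritative implementation):

import numpy as np

def ppmi_sketch(counts):
    # counts: co-occurrence matrix, rows = targets, cols = contexts
    counts = np.asarray(counts, dtype=float)
    total = counts.sum()
    p_wc = counts / total                             # joint probabilities
    p_w = counts.sum(axis=1, keepdims=True) / total   # row marginals
    p_c = counts.sum(axis=0, keepdims=True) / total   # column marginals
    with np.errstate(divide="ignore", invalid="ignore"):
        pmi = np.log(p_wc / (p_w * p_c))
    pmi[~np.isfinite(pmi)] = 0.0                      # zero counts -> 0
    return np.maximum(pmi, 0.0)                       # positive PMI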
Example #23
def getThesaurus(word):
    if isinstance(word, unicode):
        word = word.encode('utf-8')
    else:
        try:
            word.decode('utf-8')
        except:
            raise

    # find synonyms in chilin
    for line in open(THES_PATH + 'chilin-zh-TW.csv'):
        synonyms = line.split()
        if word in synonyms:
            break

    # calculate word similarity
    word_sim_dict = {}
    my_space = Space.build(data=THES_PATH + 'sm',
                           rows=THES_PATH + 'words.rows',
                           cols=THES_PATH + 'cols',
                           format='sm')
    for row in open(THES_PATH + 'words.rows'):
        word1 = row.strip()
        sim = my_space.get_sim(word1, word, CosSimilarity())
        if sim > .3:
            word_sim_dict[word1] = sim

    # rank first those overlapping with chilin synonyms
    word_sim_list = []
    if word_sim_dict.get(word):
        word_sim_dict.pop(word)
        for key in word_sim_dict.keys():
            if key in synonyms:
                word_sim_dict.pop(key)
                word_sim_list += [key]

        # sort the rest of words
        d = sorted(word_sim_dict.items(), key=lambda x: x[1], reverse=True)
        word_sim_list += [word for word, sim in d]

        word_sim_list = word_sim_list[:9]
    return word_sim_list
Example #24
File: ex19.py Project: totonac/dissect
from composes.semantic_space.space import Space
from composes.composition.lexical_function import LexicalFunction
from composes.utils.regression_learner import LstsqRegressionLearner

#training data1: VO N -> SVO
train_vo_data = [("hate_boy", "man", "man_hate_boy"),
                 ("hate_man", "man", "man_hate_man"),
                 ("hate_boy", "boy", "boy_hate_boy"),
                 ("hate_man", "boy", "boy_hate_man")]

#training data2: V N -> VO
train_v_data = [("hate", "man", "hate_man"), ("hate", "boy", "hate_boy")]

#load N and SVO spaces
n_space = Space.build(data="./data/in/ex19-n.sm",
                      cols="./data/in/ex19-n.cols",
                      format="sm")

svo_space = Space.build(data="./data/in/ex19-svo.sm",
                        cols="./data/in/ex19-svo.cols",
                        format="sm")

print "\nInput SVO training space:"
print svo_space.id2row
print svo_space.cooccurrence_matrix

#1. train a model to learn VO functions on train data: VO N -> SVO
print "\nStep 1 training"
vo_model = LexicalFunction(learner=LstsqRegressionLearner())
vo_model.train(train_vo_data, n_space, svo_space)
Example #25
import sys
import os

folder = os.path.expandvars('/home/luka/Downloads/dissect-master/src')
if folder not in sys.path:
    sys.path.append(folder)

from composes.semantic_space.space import Space

my_space = Space.build(
    data="/home/luka/Downloads/dissect-master/src/examples/data/in/ex01.sm",
    rows="/home/luka/Downloads/dissect-master/src/examples/data/in/ex01.rows",
    cols="/home/luka/Downloads/dissect-master/src/examples/data/in/ex01.cols",
    format="sm")

from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting

my_space = io_utils.load(
    "/home/luka/Downloads/dissect-master/src/examples/data/out/ex01.pkl")
print my_space.cooccurrence_matrix

my_space = my_space.apply(PpmiWeighting())
print my_space.cooccurrence_matrix
Example #26
    def test_simple_define(self):

        #trained = io_utils.load(self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
        #new_space = trained.function_space

        #compose with lexical function
        ac.main([
            "apply_composition.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "an_train_data.txt", "-o", self.dir_, "--load_model",
            self.dir_ +
            "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl", "-a",
            self.dir_ + "CORE_SS.N_mat.pkl", "--output_format", "dm"
        ])

        sp2 = Space.build(data=self.dir_ +
                          "COMPOSED_SS.LexicalFunction.an_train_data.txt.dm",
                          format="dm")

        #compose with weighted addition
        ac.main([
            "apply_composition.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "an_train_data.txt", "-o", self.dir_, "-m",
            "weighted_add", "--alpha", "0.5", "--beta", "0.5", "-a",
            self.dir_ + "CORE_SS.A_mat.pkl" + "," + self.dir_ +
            "CORE_SS.N_mat.pkl", "--output_format", "dm"
        ])

        sp1 = Space.build(data=self.dir_ +
                          "COMPOSED_SS.WeightedAdditive.an_train_data.txt.dm",
                          format="dm")
        sp3 = io_utils.load(
            self.dir_ + "COMPOSED_SS.WeightedAdditive.an_train_data.txt.pkl")

        np.testing.assert_array_equal(sp1.cooccurrence_matrix.mat,
                                      np.mat([[3, 4], [4, 5]]))
        self._test_equal_spaces_structs(sp1, sp2)
        sp1.to_sparse()
        sp3.to_sparse()
        self._test_equal_spaces_sparse(sp1, sp3)

        #the two output format have to contain identical data
        sp1.to_dense()
        sp3.to_dense()
        self._test_equal_spaces_dense(sp1, sp3)

        #compose with dilation
        ac.main([
            "apply_composition.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "an_train_data.txt", "-o", self.dir_, "-m", "dilation",
            "--lambda", "1", "-a", self.dir_ + "CORE_SS.A_mat.pkl" + "," +
            self.dir_ + "CORE_SS.N_mat.pkl", "--output_format", "dm"
        ])

        sp1 = Space.build(data=self.dir_ +
                          "COMPOSED_SS.Dilation.an_train_data.txt.dm",
                          format="dm")
        n_space = io_utils.load(self.dir_ + "CORE_SS.N_mat.pkl")
        sp1.to_dense()
        n_space.to_dense()
        np.testing.assert_array_almost_equal(
            sp1.cooccurrence_matrix.mat, n_space.cooccurrence_matrix.mat * 25)
        self._test_equal_spaces_structs(sp1, sp2)

        #compose with dilation, change the order of the arguments
        ac.main([
            "apply_composition.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "na_train_data.txt", "-o", self.dir_, "-m", "dilation",
            "--lambda", "1", "-a", self.dir_ + "CORE_SS.N_mat.pkl" + "," +
            self.dir_ + "CORE_SS.A_mat.pkl", "--output_format", "dm"
        ])

        sp1 = Space.build(data=self.dir_ +
                          "COMPOSED_SS.Dilation.na_train_data.txt.dm",
                          format="dm")

        sp1.to_dense()
        np.testing.assert_array_almost_equal(sp1.cooccurrence_matrix.mat,
                                             np.mat([[75, 100], [183, 244]]),
                                             5)
        self._test_equal_spaces_structs(sp1, sp2)

        #compose with multiplicative
        ac.main([
            "apply_composition.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "aan_train_data.txt", "-o", self.dir_, "-m", "mult",
            "-a", self.dir_ + "CORE_SS.A_mat.pkl" + "," + self.dir_ +
            "COMPOSED_SS.Dilation.an_train_data.txt.pkl", "--output_format",
            "dm"
        ])

        sp1 = Space.build(data=self.dir_ +
                          "COMPOSED_SS.Multiplicative.aan_train_data.txt.dm",
                          format="dm")
Example #27
File: dissect.py Project: DariaRyzhova/phd
def create_space(dmFile, rowsFile):
    space = Space.build(data=dmFile, rows=rowsFile, format="dm")
    return space
Example #28
#ex02.py
#-------
from composes.semantic_space.space import Space
from composes.utils import io_utils

#create a space from co-occurrence counts in sparse format
my_space = Space.build(data="./data/in/ex01.sm",
                       rows="./data/in/ex01.rows",
                       cols="./data/in/ex01.cols",
                       format="sm")

#print the co-occurrence matrix of the space
print my_space.cooccurrence_matrix

#save the Space object in pickle format
io_utils.save(my_space, "./data/out/ex01.pkl")

#load the saved object
my_space2 = io_utils.load("./data/out/ex01.pkl")

#print the co-occurrence matrix of the loaded space
print my_space2.cooccurrence_matrix
Example #29
    def test_export(self):

        out_file = self.dir_ + "tmp"
        mat1 = np.mat([[1,2],[3,0]])
        mat1row, mat1col = ["a","b"], ["f1","f2"]

        mat2 = np.mat([[0,0]])
        mat2row, mat2col = ["a"], []

        test_cases = [(Space(DenseMatrix(mat1), mat1row, mat1col),
                       Space(SparseMatrix(mat1), mat1row, mat1col)),
                       (Space(DenseMatrix(mat2), mat2row, mat1col),
                       Space(SparseMatrix(mat2), mat2row, mat1col))]

        #3 cases allowed at the moment
        for sp_d, sp_s in test_cases:

            self.reset_export_files(out_file)
            sp_d.export(out_file, format="dm")
            new_sp = Space.build(data=out_file + ".dm",
                                 rows=out_file + ".rows",
                                 cols=out_file + ".cols", format="dm")
            self._test_equal_spaces_dense(sp_d, new_sp)

            self.reset_export_files(out_file)
            sp_d.export(out_file, format="sm")
            new_sp = Space.build(data=out_file + ".sm",
                                 rows=out_file + ".rows",
                                 cols=out_file + ".cols", format="sm")
            self._test_equal_spaces_sparse(sp_s, new_sp)

            self.reset_export_files(out_file)
            sp_s.export(out_file, format="sm")
            new_sp = Space.build(data=out_file + ".sm",
                                 rows=out_file + ".rows",
                                 cols=out_file + ".cols", format="sm")
            self._test_equal_spaces_sparse(sp_s, new_sp)

            self.reset_export_files(out_file)
            sp_s.export(out_file, format="dm")
            new_sp = Space.build(data=out_file + ".dm",
                                 rows=out_file + ".rows",
                                 cols=out_file + ".cols", format="dm")

            self._test_equal_spaces_dense(sp_d, new_sp)

        test_cases = [(Space(DenseMatrix(mat2), mat2row, mat2col),
                       Space(SparseMatrix(mat2), mat2row, mat2col))]

        for sp_d, sp_s in test_cases:

            self.reset_export_files(out_file)
            sp_d.export(out_file, format="dm")
            new_sp = Space.build(data=out_file + ".dm",
                                 rows=out_file + ".rows",
                                 format="dm")
            self._test_equal_spaces_dense(sp_d, new_sp)

            self.reset_export_files(out_file)
            sp_s.export(out_file, format="dm")
            new_sp = Space.build(data=out_file + ".dm",
                                 rows=out_file + ".rows",
                                 format="dm")

            self._test_equal_spaces_dense(sp_d, new_sp)
Example #30
import sys

from composes.semantic_space.space import Space
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.feature_selection.top_feature_selection import TopFeatureSelection
from composes.transformation.dim_reduction.svd import Svd
from composes.utils import io_utils, log_utils
if __name__ == '__main__':
    # set constants
    data_path = sys.argv[0] + "/" + sys.argv[1] + "_"

    log_file = data_path + "all.log"
    core_cooccurrence_file = data_path + "GemmaData_sm"
    core_row_file = data_path + "GemmaData_rows"
    core_col_file = data_path + "GemmaData_cols"
    core_space_file = data_path + "core.pkl"
    
    # config log file
    log_utils.config_logging(log_file)
    
    print "Building semantic space from co-occurrence counts"
    core_space = Space.build(data=core_cooccurrence_file, rows=core_row_file,
                             cols=core_col_file, format="sm")
    
    print "Applying ppmi weighting"
    core_space = core_space.apply(PpmiWeighting())
    print "Applying feature selection"
    core_space = core_space.apply(TopFeatureSelection(5000))
    print "Applying svd 500"
    core_space = core_space.apply(Svd(100))
    
    print "Saving the semantic space"
    io_utils.save(core_space, core_space_file)
    
    #print "Finding 10 neighbors of " + sys.argv[1]
    #neighbors = core_space.get_neighbours(sys.argv[1], 10, CosSimilarity())
    #print neighbors
Example #31
    def test_simple_ops(self):

        #x = matrix([[ -2.19426495e+00,   3.16751379e+00,  -3.89945798e-01],
        #x = np.mat([[1,2,3],[2,4,6],[4,675,43]])

        us = np.mat([[2.19272110e+00, 3.03174768e+00],
                     [4.38544220e+00, 6.06349536e+00],
                     [6.76369708e+02, -4.91431927e-02]])
        us2 = np.mat([[2.19426495e+00, 3.16751379e+00],
                      [4.38703714e+00, 6.14112794e+00],
                      [6.76380808e+02, -5.01074549e-02]])

        bcs.main([
            "build_core_space.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "mat3", "-w", "raw", "-s",
            "top_sum_3,top_length_3,top_sum_4", "-r", "svd_2,svd_1", "-n",
            "none,all,row", "-o", self.dir_, "--input_format", "dm",
            "--output_format", "dm"
        ])

        s1 = Space.build(data=self.dir_ +
                         "CORE_SS.mat3.raw.top_sum_3.svd_2.dm",
                         format="dm")
        s2 = Space.build(data=self.dir_ +
                         "CORE_SS.mat3.raw.top_sum_3.svd_1.dm",
                         format="dm")
        s3 = Space.build(data=self.dir_ +
                         "CORE_SS.mat3.raw.top_length_3.svd_2.dm",
                         format="dm")
        s4 = Space.build(data=self.dir_ +
                         "CORE_SS.mat3.raw.top_length_3.svd_1.dm",
                         format="dm")
        s5 = Space.build(data=self.dir_ +
                         "CORE_SS.mat3.raw.top_sum_4.svd_2.dm",
                         format="dm")
        s6 = Space.build(data=self.dir_ +
                         "CORE_SS.mat3.raw.top_sum_4.svd_1.dm",
                         format="dm")
        s7 = Space.build(data=self.dir_ +
                         "CORE_SS.mat3.raw.top_sum_4.svd_1.all.dm",
                         format="dm")
        s8 = Space.build(data=self.dir_ +
                         "CORE_SS.mat3.raw.top_sum_4.svd_1.row.dm",
                         format="dm")
        s9 = s6.apply(Normalization())
        s10 = s6.apply(RowNormalization())

        self._test_equal_spaces_dense(s1, s3)
        self._test_equal_spaces_dense(s2, s4)
        self._test_equal_spaces_dense(s7, s9)
        self._test_equal_spaces_dense(s8, s10)

        np.testing.assert_array_almost_equal(abs(s1.cooccurrence_matrix.mat),
                                             abs(us), 2)
        np.testing.assert_array_almost_equal(abs(s2.cooccurrence_matrix.mat),
                                             abs(us[:, 0:1]), 2)
        np.testing.assert_array_almost_equal(abs(s5.cooccurrence_matrix.mat),
                                             abs(us2), 2)
        np.testing.assert_array_almost_equal(abs(s6.cooccurrence_matrix.mat),
                                             abs(us2[:, 0:1]), 2)

        self._test_equal_spaces_structs(s3, s5)
        self._test_equal_spaces_structs(s2, s6)

        bcs.main([
            "build_core_space.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "mat3", "--weighting", "raw", "--selection",
            "top_sum_3,top_length_3,top_sum_4", "--reduction", "svd_2,svd_1",
            "-o", self.dir_, "--input_format", "sm", "--output_format", "dm"
        ])

        s1 = Space.build(data=self.dir_ +
                         "CORE_SS.mat3.raw.top_sum_3.svd_2.dm",
                         format="dm")
        s2 = Space.build(data=self.dir_ +
                         "CORE_SS.mat3.raw.top_sum_3.svd_1.dm",
                         format="dm")
        s3 = Space.build(data=self.dir_ +
                         "CORE_SS.mat3.raw.top_length_3.svd_2.dm",
                         format="dm")
        s4 = Space.build(data=self.dir_ +
                         "CORE_SS.mat3.raw.top_length_3.svd_1.dm",
                         format="dm")
        s5 = Space.build(data=self.dir_ +
                         "CORE_SS.mat3.raw.top_sum_4.svd_2.dm",
                         format="dm")
        s6 = Space.build(data=self.dir_ +
                         "CORE_SS.mat3.raw.top_sum_4.svd_1.dm",
                         format="dm")

        self._test_equal_spaces_dense(s1, s3)
        self._test_equal_spaces_dense(s2, s4)

        np.testing.assert_array_almost_equal(abs(s1.cooccurrence_matrix.mat),
                                             abs(us), 2)
        np.testing.assert_array_almost_equal(abs(s2.cooccurrence_matrix.mat),
                                             abs(us[:, 0:1]), 2)
        np.testing.assert_array_almost_equal(abs(s5.cooccurrence_matrix.mat),
                                             abs(us2), 2)
        np.testing.assert_array_almost_equal(abs(s6.cooccurrence_matrix.mat),
                                             abs(us2[:, 0:1]), 2)

        self._test_equal_spaces_structs(s3, s5)
        self._test_equal_spaces_structs(s2, s6)
Example #32
File: dissect.py Project: SBelkaid/NEDMED
import os
import sys
import time
from subprocess import Popen, PIPE
from xml.dom import minidom
from xml.etree import ElementTree

from composes.semantic_space.space import Space
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting


usage = """
Usage: python dissect.py dissect_format_file_name

dissect_format_file_name: path to a file containing dissect format
"""

CMD_EXTRACTOR_SCRIPT = '~/Programming/terminology_extractor/extract_patterns.py'
file_name = sys.argv[1]

my_space = Space.build(data=file_name + ".sm",
                       rows=file_name + ".rows",
                       cols=file_name + ".cols",
                       format="sm")

my_space = my_space.apply(PpmiWeighting())
# print my_space.get_sim("spain", "netherlands", CosSimilarity())
# print my_space.get_neighbours('parenchymopbouw', 4, CosSimilarity())
# print my_space.get_neighbours('pension-n', 4, CosSimilarity())
# print my_space.id2row


def prettify(elem):
    """
    Return a pretty-printed XML string for the Element.
    """
    rough_string = ElementTree.tostring(elem, 'utf-8')
    reparsed = minidom.parseString(rough_string)
    return reparsed.toprettyxml(indent="  ")
Example #33
# Uses dissect toolkit to import the sparse matrix from sort-cooccur-matrix.
# Applies ppmi weighting and exports the result to ./cooccurence/weighted/
# Note that this file is in python 2, not 3.

import sys
import os
folder = os.path.expandvars('/home/luka/Downloads/dissect-master/src')
if folder not in sys.path:
    sys.path.append(folder)
from composes.semantic_space.space import Space

#pathnames
path = '/home/luka/ThLi/cooccurrence/'

#import matrix
holspace = Space.build(data=path + "spm1.sm",
                       rows=path + "rows1.rows",
                       cols=path + "cols1.cols",
                       format="sm")

#apply ppmi weighting
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
holspace = holspace.apply(PpmiWeighting())

#export matrix
from composes.utils import io_utils
io_utils.save(holspace, path + "weighted")
holspace.export(path + "weighted_sm", format="sm")
Example #43
0
File: ex02.py Project: Aliases/dissect
#ex02.py
#-------
from composes.semantic_space.space import Space
from composes.utils import io_utils

#create a space from co-occurrence counts in sparse format
my_space = Space.build(data = "./data/in/ex01.sm",
                       rows = "./data/in/ex01.rows",
                       cols = "./data/in/ex01.cols",
                       format = "sm")

#print the co-occurrence matrix of the space
print my_space.cooccurrence_matrix

#save the Space object in pickle format
io_utils.save(my_space, "./data/out/ex01.pkl")

#load the saved object
my_space2 = io_utils.load("./data/out/ex01.pkl")

#print the co-occurrence matrix of the loaded space
print my_space2.cooccurrence_matrix
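A quick check that the save/load round trip preserved the counts is to compare the two co-occurrence matrices directly. A minimal sketch:

import numpy as np

np.testing.assert_array_equal(my_space.cooccurrence_matrix.mat.todense(),
                              my_space2.cooccurrence_matrix.mat.todense())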

Example #44
0
import logging
from os.path import basename, join

from composes.composition.full_additive import FullAdditive
from composes.composition.lexical_function import LexicalFunction
from composes.semantic_space.peripheral_space import PeripheralSpace
from composes.semantic_space.space import Space
from composes.utils import io_utils
from composes.utils.regression_learner import RidgeRegressionLearner

# Vectors, DocumentFeature, mkdirs_if_not_exists and _translate_byblo_to_dissect
# are project-specific helpers, assumed importable from the surrounding code.


def train_baroni_guevara_composers(all_vectors,
                                   ROOT_DIR,
                                   baroni_output_path, guevara_output_path,
                                   baroni_threshold=10):
    """

    :type all_vectors: str; path to vectors file containing both N and observed AN vectors
    :type ROOT_DIR: str; where to write temp files
    :type baroni_output_path: str; where to write pickled baroni composer
    :type guevara_output_path: str
    :type baroni_threshold: int
    """
    SVD_DIMS = 100
    baroni_training_phrase_types = {'AN', 'NN'}  # what kind of NPs to train Baroni composer for

    # prepare the input files to be fed into Dissect
    mkdirs_if_not_exists(ROOT_DIR)

    filename = basename(all_vectors)
    noun_events_file = join(ROOT_DIR, '%s-onlyN-SVD%d.tmp' % (filename, SVD_DIMS))
    NPs_events_file = join(ROOT_DIR, '%s-onlyPhrases-SVD%d.tmp' % (filename, SVD_DIMS))

    thes = Vectors.from_tsv(all_vectors, lowercasing=False)
    thes.to_tsv(noun_events_file,
                entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'N')
    _translate_byblo_to_dissect(noun_events_file)

    thes.to_tsv(NPs_events_file,
                entry_filter=lambda x: x.type in baroni_training_phrase_types,
                row_transform=lambda x: str(x).replace(' ', '_'))
    _translate_byblo_to_dissect(NPs_events_file)

    my_space = Space.build(data="{}.sm".format(noun_events_file),
                           rows="{}.rows".format(noun_events_file),
                           cols="{}.cols".format(noun_events_file),
                           format="sm")
    logging.info('Each unigram vector has dimensionality %r', my_space.element_shape)

    # create a peripheral space
    my_per_space = PeripheralSpace.build(my_space,
                                         data="{}.sm".format(NPs_events_file),
                                         rows="{}.rows".format(NPs_events_file),
                                         # The columns of the peripheral space have to be identical to those
                                         # in the core space (including their order)!
                                         cols="{}.cols".format(NPs_events_file),
                                         format="sm")
    logging.info('Each phrase vector has dimensionality %r', my_per_space.element_shape)

    # collect (modifier, head, phrase) training triples from the peripheral space rows
    all_data = []
    for phrase in my_per_space._row2id:
        # make sure there are only NPs here
        if DocumentFeature.from_string(phrase.replace(' ', '_')).type in baroni_training_phrase_types:
            adj, noun = phrase.split('_')
            all_data.append((adj, noun, '%s_%s' % (adj, noun)))

    # train a composition model on the data and save it
    baroni = LexicalFunction(min_samples=baroni_threshold, learner=RidgeRegressionLearner())
    guevara = FullAdditive(learner=RidgeRegressionLearner())
    for composer, out_path in zip([baroni, guevara],
                                  [baroni_output_path, guevara_output_path]):
        composer.train(all_data, my_space, my_per_space)
        io_utils.save(composer, out_path)
        logging.info('Saved trained composer to %s', out_path)
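A hedged usage sketch for the trainer above; every path here is hypothetical:

train_baroni_guevara_composers('all_vectors.tsv',         #hypothetical vectors file
                               '/tmp/composer_training',  #hypothetical temp dir
                               '/tmp/baroni_composer.pkl',
                               '/tmp/guevara_composer.pkl')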
Example #45
0
# Requires the same composes imports and project-specific helpers as Example #44.
def train_grefenstette_multistep_composer(all_vectors_file, root_dir):
    """
    Train Grefenstette et al's multistep regression VO/SVO model
    Adapted from dissect's ex19.py
    :param all_vectors_file: file containing N, V, VO and SVO vectors
    :param root_dir: where to write temp files and output
    """
    mkdirs_if_not_exists(root_dir)
    vo_composer_output_file = join(root_dir, 'vo_comp.pkl')
    svo_composer_output_file = join(root_dir, 'svo_comp.pkl')

    filename = basename(all_vectors_file)
    noun_events_file = join(root_dir, '%s-onlyN.tmp' % filename)
    # verb_events_file = join(root_dir, '%s-onlyV.tmp' % filename)
    # vo_events_file = join(root_dir, '%s-onlyVO.tmp' % filename)
    svo_events_file = join(root_dir, '%s-onlySVO.tmp' % filename)

    # this has unigrams and observed phrases
    thes = Vectors.from_tsv(all_vectors_file)
    thes.to_tsv(noun_events_file,
                entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'N')
    _translate_byblo_to_dissect(noun_events_file)
    # thes.to_tsv(verb_events_file,
    # entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'V')
    # _translate_byblo_to_dissect(verb_events_file)
    # thes.to_tsv(vo_events_file,
    #             entry_filter=lambda x: x.type == 'VO')
    # _translate_byblo_to_dissect(vo_events_file)
    thes.to_tsv(svo_events_file,
                entry_filter=lambda x: x.type == 'SVO')
    _translate_byblo_to_dissect(svo_events_file)

    train_vo_data, train_v_data = [], []
    for phrase in thes.keys():
        df = DocumentFeature.from_string(phrase)
        if df.type == 'SVO':
            train_vo_data.append((str(df[1:]), str(df[0]), str(df)))
        if df.type == 'VO':
            train_v_data.append((str(df[0]), str(df[1]), str(df)))

    # logging.info('train_vo_data %r', len(train_vo_data))
    # logging.info('train_v_data %r', len(train_v_data))

    # load N and SVO spaces
    n_space = Space.build(data=noun_events_file + '.sm',
                          cols=noun_events_file + '.cols',
                          format="sm")

    svo_space = Space.build(data=svo_events_file + '.sm',
                            cols=svo_events_file + '.cols',
                            format="sm")

    logging.info("Input SVO training space:")
    logging.info(svo_space.id2row)
    # logging.info(svo_space.cooccurrence_matrix)

    # 1. train a model to learn VO functions on train data: VO N -> SVO
    logging.info("Step 1 training")
    vo_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)  # Gref et al 2013, §5 says 3
    vo_model.train(train_vo_data, n_space, svo_space)
    io_utils.save(vo_model, vo_composer_output_file)

    # 2. train a model to learn V functions on train data: V N -> VO
    # where VO space: function space learned in step 1
    logging.info("Step 2 training")
    vo_space = vo_model.function_space
    v_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)
    v_model.train(train_v_data, n_space, vo_space)
    io_utils.save(v_model, svo_composer_output_file)
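Restoring the pickled composers later uses the same io_utils module. A minimal sketch (root_dir as passed to the trainer above):

from os.path import join
from composes.utils import io_utils

vo_model = io_utils.load(join(root_dir, 'vo_comp.pkl'))
v_model = io_utils.load(join(root_dir, 'svo_comp.pkl'))  #note: the V model is saved under the SVO file name above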
Example #46
0
from tree.semantic_node import SemanticNode
from tree.syntactic_tree import SyntacticTree
from composes.semantic_space.space import Space
from examples import test_vector_file_prefix, test_matrix_file_prefix
# Papfunc_SemanticNode is used below but never imported in this fragment;
# it is assumed to come from the surrounding papfunc project.

# FIRST TEST
xml_string = '''
<ccg>
<lf start="0" span="1" word="dog" lemma="dog" pos="NN" chunk="I-NP" entity="O" cat="N" />
</ccg>
'''
syntactic_tree = SyntacticTree.parse_tree_from_xml_string(xml_string)
vecfilepref = test_vector_file_prefix
matfilepref = test_matrix_file_prefix
vecspace = Space.build(data = vecfilepref + ".dm",
                       rows = vecfilepref + ".rows",
                       format = "dm")
matspace = Space.build(data = matfilepref + ".dm",
                       rows = matfilepref + ".rows",
                       format = "dm")

semnode = SemanticNode.create_semantic_node(syntactic_tree.root,None)
papnode = Papfunc_SemanticNode.create_papfunc_node(semnode,vecspace,matspace)

print "*****"
print "Syntactic tree:", semnode
print "Symbolic representation:", papnode._matrep
print "Numeric representation:"
for x in papnode._numrep: print x

# SECOND TEST
Example #47
0
    def test_simple_ops(self):
        
        #x = matrix([[ -2.19426495e+00,   3.16751379e+00,  -3.89945798e-01],
        #x = np.mat([[1,2,3],[2,4,6],[4,675,43]])
        
        us = np.mat([[  2.19272110e+00,   3.03174768e+00],
                               [  4.38544220e+00,   6.06349536e+00],
                               [  6.76369708e+02,  -4.91431927e-02]])
        us2 = np.mat([[ 2.19426495e+00,   3.16751379e+00],
                      [ 4.38703714e+00,   6.14112794e+00],
                      [ 6.76380808e+02,  -5.01074549e-02]])
        

        bcs.main(["build_core_space.py", 
                  "-l", self.dir_ + "log1.txt",
                  "-i", self.dir_ + "mat3",
                  "-w", "raw",
                  "-s", "top_sum_3,top_length_3,top_sum_4",
                  "-r", "svd_2,svd_1",
                  "-n", "none,all,row",
                  "-o", self.dir_,
                  "--input_format", "dm",
                  "--output_format", "dm"
                  ])        
        
        s1 = Space.build(data = self.dir_ + "CORE_SS.mat3.raw.top_sum_3.svd_2.dm", format="dm")
        s2 = Space.build(data = self.dir_ + "CORE_SS.mat3.raw.top_sum_3.svd_1.dm", format="dm")
        s3 = Space.build(data = self.dir_ + "CORE_SS.mat3.raw.top_length_3.svd_2.dm", format="dm")
        s4 = Space.build(data = self.dir_ + "CORE_SS.mat3.raw.top_length_3.svd_1.dm", format="dm")
        s5 = Space.build(data = self.dir_ + "CORE_SS.mat3.raw.top_sum_4.svd_2.dm", format="dm")
        s6 = Space.build(data = self.dir_ + "CORE_SS.mat3.raw.top_sum_4.svd_1.dm", format="dm")
        s7 = Space.build(data = self.dir_ + "CORE_SS.mat3.raw.top_sum_4.svd_1.all.dm", format="dm")
        s8 = Space.build(data = self.dir_ + "CORE_SS.mat3.raw.top_sum_4.svd_1.row.dm", format="dm")
        s9 = s6.apply(Normalization())
        s10 = s6.apply(RowNormalization())
        
        self._test_equal_spaces_dense(s1, s3)            
        self._test_equal_spaces_dense(s2, s4)
        self._test_equal_spaces_dense(s7, s9)
        self._test_equal_spaces_dense(s8, s10)
        
        np.testing.assert_array_almost_equal(abs(s1.cooccurrence_matrix.mat), abs(us), 2)
        np.testing.assert_array_almost_equal(abs(s2.cooccurrence_matrix.mat), abs(us[:,0:1]), 2)
        np.testing.assert_array_almost_equal(abs(s5.cooccurrence_matrix.mat), abs(us2), 2)
        np.testing.assert_array_almost_equal(abs(s6.cooccurrence_matrix.mat), abs(us2[:,0:1]), 2)

        self._test_equal_spaces_structs(s3, s5)
        self._test_equal_spaces_structs(s2, s6)
        
        
        bcs.main(["build_core_space.py", 
          "-l", self.dir_ + "log1.txt",
          "-i", self.dir_ + "mat3",
          "--weighting", "raw",
          "--selection", "top_sum_3,top_length_3,top_sum_4",
          "--reduction", "svd_2,svd_1",
          "-o", self.dir_,
          "--input_format", "sm",
          "--output_format", "dm"
          ])        
        
        s1 = Space.build(data = self.dir_ + "CORE_SS.mat3.raw.top_sum_3.svd_2.dm", format="dm")
        s2 = Space.build(data = self.dir_ + "CORE_SS.mat3.raw.top_sum_3.svd_1.dm", format="dm")
        s3 = Space.build(data = self.dir_ + "CORE_SS.mat3.raw.top_length_3.svd_2.dm", format="dm")
        s4 = Space.build(data = self.dir_ + "CORE_SS.mat3.raw.top_length_3.svd_1.dm", format="dm")
        s5 = Space.build(data = self.dir_ + "CORE_SS.mat3.raw.top_sum_4.svd_2.dm", format="dm")
        s6 = Space.build(data = self.dir_ + "CORE_SS.mat3.raw.top_sum_4.svd_1.dm", format="dm")
            
        self._test_equal_spaces_dense(s1, s3)            
        self._test_equal_spaces_dense(s2, s4)
        
        np.testing.assert_array_almost_equal(abs(s1.cooccurrence_matrix.mat), abs(us), 2)
        np.testing.assert_array_almost_equal(abs(s2.cooccurrence_matrix.mat), abs(us[:,0:1]), 2)
        np.testing.assert_array_almost_equal(abs(s5.cooccurrence_matrix.mat), abs(us2), 2)
        np.testing.assert_array_almost_equal(abs(s6.cooccurrence_matrix.mat), abs(us2[:,0:1]), 2)

        self._test_equal_spaces_structs(s3, s5)
        self._test_equal_spaces_structs(s2, s6)
Example #48
0
from composes.semantic_space.space import Space
from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.scaling.row_normalization import RowNormalization

import sys

#create a space from co-occurrence counts in sparse format
my_space = Space.build(data="../data/" + sys.argv[1] + ".sm",
                       rows="../data/" + sys.argv[1] + ".rows",
                       cols="../data/" + sys.argv[1] + ".cols",
                       format="sm")

my_space = my_space.apply(PpmiWeighting())
my_space = my_space.apply(RowNormalization())

#export the space in dense format and pkl format
my_space.export("../spaces/" + sys.argv[1], format="dm")
io_utils.save(my_space, "../spaces/" + sys.argv[1] + ".pkl")
Example #49
0
from composes.semantic_space.space import Space
from composes.composition.lexical_function import LexicalFunction
from composes.utils.regression_learner import LstsqRegressionLearner

#training data1: VO N -> SVO
train_vo_data = [("hate_boy", "man", "man_hate_boy"),
                 ("hate_man", "man", "man_hate_man"),
                 ("hate_boy", "boy", "boy_hate_boy"),
                 ("hate_man", "boy", "boy_hate_man")
                 ]

#training data2: V N -> VO
train_v_data = [("hate", "man", "hate_man"),
                ("hate", "boy", "hate_boy")
                ]
        
#load N and SVO spaces
n_space = Space.build(data = "./data/in/ex19-n.sm",
                      cols = "./data/in/ex19-n.cols",
                      format = "sm")

svo_space = Space.build(data = "./data/in/ex19-svo.sm",
                        cols = "./data/in/ex19-svo.cols",
                        format = "sm")

print "\nInput SVO training space:" 
print svo_space.id2row
print svo_space.cooccurrence_matrix

#1. train a model to learn VO functions on train data: VO N -> SVO
print "\nStep 1 training"
vo_model = LexicalFunction(learner=LstsqRegressionLearner())
vo_model.train(train_vo_data, n_space, svo_space)
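The fragment stops after step 1; the full ex19.py continues with the same second step as the Grefenstette trainer in Example #45, learning V functions against the VO function space from step 1. A minimal sketch:

#2. train a model to learn V functions on train data: V N -> VO
#   where the VO space is the function space learned in step 1
print "\nStep 2 training"
vo_space = vo_model.function_space
v_model = LexicalFunction(learner=LstsqRegressionLearner())
v_model.train(train_v_data, n_space, vo_space)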
Example #50
0
import sys
import os

folder = os.path.expandvars('/home/luka/Downloads/dissect-master/src')
if folder not in sys.path:
    sys.path.append(folder)

from composes.semantic_space.space import Space

lassy_space = Space.build(data="/home/luka/ThLi/cooccurrence/spm1.sm",
                          rows="/home/luka/ThLi/cooccurrence/rows1.rows",
                          cols="/home/luka/ThLi/cooccurrence/cols1.cols",
                          format="sm")

#%%

from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting

lassy_space = lassy_space.apply(PpmiWeighting())
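The cell ends after weighting; persisting the result follows the same pattern as the weighted-export script in Example #42. A minimal sketch (io_utils is already imported above):

io_utils.save(lassy_space, '/home/luka/ThLi/cooccurrence/weighted')
lassy_space.export('/home/luka/ThLi/cooccurrence/weighted_sm', format="sm")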
Example #51
0
from composes.semantic_space.space import Space
from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.scaling.row_normalization import RowNormalization
from composes.transformation.dim_reduction.svd import Svd

import sys

#create a space from co-occurrence counts in sparse format
my_space = Space.build(data = "../data/"+sys.argv[1]+".sm",
                       rows = "../data/"+sys.argv[1]+".rows",
                       cols = "../data/"+sys.argv[1]+".cols",
                       format = "sm")
                       
my_space = my_space.apply(PpmiWeighting())
my_space = my_space.apply(RowNormalization())

#apply svd reduction
my_space = my_space.apply(Svd(1500))

#export the space in dense format and pkl format
my_space.export("../spaces/"+sys.argv[1], format = "dm")
io_utils.save(my_space, "../spaces/"+sys.argv[1]+".pkl")
Example #52
0
#Convert .dm file to .pkl
#Usage: python dm2pkl.py bnc.dm

from composes.semantic_space.space import Space
from composes.utils import io_utils
import sys

space = Space.build(data=sys.argv[1], format='dm')
name = sys.argv[1][0:-3]
io_utils.save(space, name+".pkl")