Example #1
def compute_neighbours(in_file, no_neighbours, out_dir, sim_measure,
                       space_files):
    sim_dict = {
        "cos": CosSimilarity(),
        "lin": LinSimilarity(),
        "dot_prod": DotProdSimilarity(),
        "euclidean": EuclideanSimilarity()
    }

    if not sim_measure in sim_dict:
        raise ValueError("Similarity measure:%s not defined" % sim_measure)

    space = io_utils.load(space_files[0], Space)
    space2 = None
    space_descr = ".".join(space_files[0].split("/")[-1].split(".")[0:-1])
    if len(space_files) == 2:
        space2 = io_utils.load(space_files[1], Space)
        space_descr = ".".join([space_descr] +
                               space_files[1].split("/")[-1].split(".")[0:-1])

    sim = sim_dict[sim_measure]

    descr = ".".join(["NEIGHBOURS", in_file.split("/")[-1], space_descr])
    out_file = '%s/%s.%s' % (out_dir, descr, sim_measure)
    io_utils.create_parent_directories(out_file)

    data = io_utils.read_list(in_file)

    print("Computing neighbours: %s" % sim_measure)
    with open(out_file, "w") as out_stream:
        for word in data:
            out_stream.write("%s\n" % word)
            result = space.get_neighbours(word, no_neighbours, sim, space2)
            for neighbour, neighbour_sim in result:
                out_stream.write("\t%s %s\n" % (neighbour, neighbour_sim))
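A minimal invocation sketch for the function above; the word-list path, the space pickle and the output directory are hypothetical placeholders.

# hypothetical inputs: a file with one query word per line and a pickled core space
compute_neighbours(in_file="data/in/queries.txt",
                   no_neighbours=10,
                   out_dir="data/out",
                   sim_measure="cos",
                   space_files=["data/out/CORE_SS.mat1.pkl"])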
Example #2
def compute_neighbours(in_file, no_neighbours, out_dir, sim_measure, space_files):
    sim_dict = {"cos": CosSimilarity(),
                "lin": LinSimilarity(),
                "dot_prod": DotProdSimilarity(),
                "euclidean": EuclideanSimilarity()}
    
    if not sim_measure in sim_dict:
        raise ValueError("Similarity measure:%s not defined" % sim_measure)
    
    space = io_utils.load(space_files[0], Space)
    space2 = None
    space_descr = ".".join(space_files[0].split("/")[-1].split(".")[0:-1])
    if len(space_files) == 2:
        space2 = io_utils.load(space_files[1], Space)
        space_descr = ".".join([space_descr] + space_files[1].split("/")[-1].split(".")[0:-1])
        
    sim = sim_dict[sim_measure]
    
    descr = ".".join(["NEIGHBOURS", in_file.split("/")[-1], space_descr])
    out_file = '%s/%s.%s' % (out_dir, descr, sim_measure)
    io_utils.create_parent_directories(out_file)
        
    data = io_utils.read_list(in_file)

    print "Computing neighbours: %s" % sim_measure 
    with open(out_file,"w") as out_stream:
        for word in data:
            out_stream.write("%s\n" % word)
            result = space.get_neighbours(word, no_neighbours, sim, space2)
            for neighbour, neighbour_sim in result:
                out_stream.write("\t%s %s\n" % (neighbour, neighbour_sim)) 
Example #3
def apply_model(in_file, out_dir, model, trained_model, arg_space_files, alpha,
                beta, lambda_, out_format):

    print("Reading in data...")
    in_descr = in_file.split("/")[-1]

    if not model is None:
        model_obj = create_model(model, alpha, beta, lambda_)
    else:
        model_obj = io_utils.load(trained_model, CompositionModel)

    model_descr = type(model_obj).__name__

    arg_space = io_utils.load(arg_space_files[0], Space)
    arg_space2 = None
    if len(arg_space_files) == 2:
        arg_space2 = io_utils.load(arg_space_files[1], Space)

    data = io_utils.read_tuple_list(in_file, fields=[0, 1, 2])

    print("Applying composition model:%s" % model_descr)
    if arg_space2 is None or type(model_obj) is LexicalFunction:
        composed_space = model_obj.compose(data, arg_space)
    else:
        composed_space = model_obj.compose(data, (arg_space, arg_space2))

    print("Printing...")
    out_file = ".".join([out_dir + "/COMPOSED_SS", model_descr, in_descr])
    io_utils.save(composed_space, "%s.pkl" % out_file)

    if not out_format is None:
        composed_space.export(out_file, format=out_format)
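A hedged usage sketch of the function above: when model is None, a previously trained, pickled CompositionModel is loaded from trained_model instead of being created; all paths below are placeholders.

# apply a previously trained composition model; paths are hypothetical
apply_model(in_file="data/in/an_test_data.txt",
            out_dir="data/out",
            model=None,
            trained_model="data/out/TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl",
            arg_space_files=["data/out/CORE_SS.N_mat.pkl"],
            alpha=None, beta=None, lambda_=None,
            out_format="dm")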
Example #4
def train_model(in_file, out_dir, model, arg_space_files, phrase_space_file,
                regression, crossvalid, intercept, param, param_range,
                export_params):

    print "Reading in data..."
    in_descr = in_file.split("/")[-1]

    model_dict = {
        "weighted_add": WeightedAdditive,
        "full_add": FullAdditive,
        "lexical_func": LexicalFunction,
        "dilation": Dilation
    }
    learner_dict = {
        "ridge": RidgeRegressionLearner,
        "lstsq": LstsqRegressionLearner
    }

    arg_space = io_utils.load(arg_space_files[0], Space)
    arg_space2 = None
    if len(arg_space_files) == 2:
        arg_space2 = io_utils.load(arg_space_files[1], Space)

    phrase_space = io_utils.load(phrase_space_file, Space)

    if not model in model_dict:
        raise ValueError("Invalid model:%s for training" % model)

    model_cls = model_dict[model]
    if model_cls in (WeightedAdditive, Dilation):
        model_obj = model_cls()
    else:
        if regression == "ridge":
            regression_obj = learner_dict[regression](
                crossvalidation=crossvalid,
                intercept=intercept,
                param=param,
                param_range=param_range)
            model_obj = model_cls(learner=regression_obj)
        elif regression == "lstsq":
            regression_obj = learner_dict[regression](intercept=intercept)
            model_obj = model_cls(learner=regression_obj)

        else:
            model_obj = model_cls()

    train_data = io_utils.read_tuple_list(in_file, fields=[0, 1, 2])

    print "Training %s model" % model
    if arg_space2 is None or model == "lexical_func":
        model_obj.train(train_data, arg_space, phrase_space)
    else:
        model_obj.train(train_data, (arg_space, arg_space2), phrase_space)

    print "Printing..."
    out_file = ".".join([out_dir + "/TRAINED_COMP_MODEL", model, in_descr])
    io_utils.save(model_obj, "%s.pkl" % out_file)

    if export_params:
        model_obj.export("%s.params" % out_file)
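A usage sketch, assuming a training file whose first three whitespace-separated columns are the two argument words and the phrase (the fields read by io_utils.read_tuple_list above); all paths are placeholders.

# train a lexical function model with least-squares regression; paths are hypothetical
train_model(in_file="data/in/an_train_data.txt",
            out_dir="data/out",
            model="lexical_func",
            arg_space_files=["data/out/CORE_SS.N_mat.pkl"],
            phrase_space_file="data/out/CORE_SS.AN_mat.pkl",
            regression="lstsq",
            crossvalid=False,
            intercept=False,
            param=None,
            param_range=None,
            export_params=True)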
Example #5
    def test_simple_dense(self):
            
        bcs.main(["build_core_space.py", 
                  "-l", self.dir_ + "log1.txt",
                  "-i", self.dir_ + "mat2", 
                  "-o", self.dir_,
                  "--input_format", "dm",
                  "--output_format", "dm"
                  ])
        
        s1 = Space.build(data = self.dir_ + "mat2.dm", format = "dm")
        s2 = Space.build(data = self.dir_ + "CORE_SS.mat2.dm", format="dm")
        s3 = io_utils.load(self.dir_ + "CORE_SS.mat2.pkl", Space)
        
        self._test_equal_spaces_dense(s1, s2)
        self._test_equal_spaces_dense(s1, s3)        
 
        bcs.main(["build_core_space.py", 
                  "-l", self.dir_ + "log1.txt",
                  "-i", self.dir_ + "CORE_SS.mat2", 
                  "-o", self.dir_,
                  "--input_format", "pkl",
                  "--output_format", "dm"
                  ])
        
        s1 = io_utils.load(self.dir_ + "CORE_SS.CORE_SS.mat2.pkl", Space)
        s3 = io_utils.load(self.dir_ + "CORE_SS.mat2.pkl", Space)
        
        self._test_equal_spaces_dense(s1, s3)  
Example #6
def apply_model(in_file, out_dir, model, trained_model, arg_space_files,
                alpha, beta, lambda_, out_format):

    print "Reading in data..."
    in_descr = in_file.split("/")[-1] 
    
    if not model is None: 
        model_obj = create_model(model, alpha, beta, lambda_)
    else:
        model_obj = io_utils.load(trained_model, CompositionModel)
        
    model_descr = type(model_obj).__name__
     
    arg_space = io_utils.load(arg_space_files[0], Space)
    arg_space2 = None
    if len(arg_space_files) == 2:
        arg_space2 = io_utils.load(arg_space_files[1], Space)
    
    data = io_utils.read_tuple_list(in_file, fields=[0, 1, 2])
    
    print "Applying composition model:%s" % model_descr
    if arg_space2 is None or type(model_obj) is LexicalFunction:
        composed_space = model_obj.compose(data, arg_space)
    else:
        composed_space = model_obj.compose(data, (arg_space, arg_space2))
    
    print "Printing..."
    out_file = ".".join([out_dir + "/COMPOSED_SS", model_descr, in_descr])    
    io_utils.save(composed_space, "%s.pkl" % out_file)
    
    if not out_format is None:
        composed_space.export(out_file, format=out_format)
Example #7
    def test_simple_dense(self):

        bcs.main([
            "build_core_space.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "mat2", "-o", self.dir_, "--input_format", "dm",
            "--output_format", "dm"
        ])

        s1 = Space.build(data=self.dir_ + "mat2.dm", format="dm")
        s2 = Space.build(data=self.dir_ + "CORE_SS.mat2.dm", format="dm")
        s3 = io_utils.load(self.dir_ + "CORE_SS.mat2.pkl", Space)

        self._test_equal_spaces_dense(s1, s2)
        self._test_equal_spaces_dense(s1, s3)

        bcs.main([
            "build_core_space.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "CORE_SS.mat2", "-o", self.dir_, "--input_format",
            "pkl", "--output_format", "dm"
        ])

        s1 = io_utils.load(self.dir_ + "CORE_SS.CORE_SS.mat2.pkl", Space)
        s3 = io_utils.load(self.dir_ + "CORE_SS.mat2.pkl", Space)

        self._test_equal_spaces_dense(s1, s3)
Example #8
def functionneighbours(words, number):
    # load a space
    if sys.argv[2] == 'full':
        my_space = io_utils.load("./data/out/thesisfull.pkl")
    if sys.argv[2] == 'nonzero':
        my_space = io_utils.load("./data/out/thesis.pkl")

    return my_space.get_neighbours(words, number, CosSimilarity())
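The same lookup can be done directly on a loaded space; a short sketch, reusing a pickle path from the snippet above (the query word is a placeholder).

from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity

my_space = io_utils.load("./data/out/thesis.pkl")
# top 3 cosine neighbours of a (hypothetical) query word
print(my_space.get_neighbours("car-n", 3, CosSimilarity()))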
Example #9
def train_model(in_file, out_dir, model, arg_space_files, phrase_space_file, regression,
                crossvalid, intercept, param, param_range, export_params):

    print "Reading in data..."
    in_descr = in_file.split("/")[-1]

    model_dict = {"weighted_add": WeightedAdditive,
                  "full_add": FullAdditive,
                  "lexical_func": LexicalFunction,
                  "dilation": Dilation
                  }
    learner_dict = {"ridge": RidgeRegressionLearner,
                    "lstsq": LstsqRegressionLearner
                    }

    arg_space = io_utils.load(arg_space_files[0], Space)
    arg_space2 = None
    if len(arg_space_files) == 2:
        arg_space2 = io_utils.load(arg_space_files[1], Space)

    phrase_space = io_utils.load(phrase_space_file, Space)

    if not model in model_dict:
        raise ValueError("Invalid model:%s for training" % model)

    model_cls = model_dict[model]
    if model_cls in (WeightedAdditive, Dilation):
        model_obj = model_cls()
    else:
        if regression == "ridge":
            regression_obj = learner_dict[regression](crossvalidation=crossvalid,
                                                       intercept=intercept,
                                                       param=param,
                                                       param_range=param_range)
            model_obj = model_cls(learner=regression_obj)
        elif regression == "lstsq":
            regression_obj = learner_dict[regression](intercept=intercept)
            model_obj = model_cls(learner=regression_obj)

        else:
            model_obj = model_cls()

    train_data = io_utils.read_tuple_list(in_file, fields=[0, 1, 2])

    print "Training %s model" % model
    if arg_space2 is None or model == "lexical_func":
        model_obj.train(train_data, arg_space, phrase_space)
    else:
        model_obj.train(train_data, (arg_space, arg_space2), phrase_space)

    print "Printing..."
    out_file = ".".join([out_dir + "/TRAINED_COMP_MODEL", model, in_descr])
    io_utils.save(model_obj, "%s.pkl" % out_file)

    if export_params:
        model_obj.export("%s.params" % out_file)
Example #10
    def test_as_conversion_tool(self):
        
        bcs.main(["build_core_space.py", 
                  "-i", self.dir_ + "mat3", 
                  "-o", self.dir_,
                  "--input_format", "sm",
                  "--output_format", "sm"
                  ])        
        
        s1 = Space.build(data=self.dir_ + "mat3.sm",
                         cols= self.dir_ + "mat3.cols",
                         format = "sm")
        s2 = Space.build(data=self.dir_ + "CORE_SS.mat3.sm",
                         rows=self.dir_ + "CORE_SS.mat3.rows",
                         cols=self.dir_ + "CORE_SS.mat3.cols", 
                         format="sm")
        s3 = io_utils.load(self.dir_ + "CORE_SS.mat3.pkl", Space)
        
        self._test_equal_spaces_sparse(s1, s2)
        self._test_equal_spaces_sparse(s1, s3)
        
        bcs.main(["build_core_space.py", 
                  "-i", self.dir_ + "mat3", 
                  "-o", self.dir_,
                  "--input_format", "sm",
                  "--output_format", "dm"
                  ])
        
        s1 = Space.build(data=self.dir_ + "mat3.dm",
                         cols=self.dir_ + "CORE_SS.mat3.cols",
                         format = "dm")
        s2 = Space.build(data=self.dir_ + "CORE_SS.mat3.dm",
                         rows=self.dir_ + "CORE_SS.mat3.rows",
                         cols=self.dir_ + "CORE_SS.mat3.cols",
                         format = "dm")                 
        s3 = io_utils.load(self.dir_ + "CORE_SS.mat3.pkl", Space)

        self._test_equal_spaces_dense(s1, s2)
        s3.to_dense()
        self._test_equal_spaces_dense(s1, s3)
        
        bcs.main(["build_core_space.py", 
                  "-i", self.dir_ + "mat3", 
                  "-o", self.dir_,
                  "--input_format", "dm",
                  "--output_format", "dm"
                  ])        
       
        s1 = Space.build(data=self.dir_ + "CORE_SS.mat3.dm",
                         cols=self.dir_ + "CORE_SS.mat3.cols",
                         format = "dm")                 
        s3 = io_utils.load(self.dir_ + "CORE_SS.mat3.pkl", Space)
        
        s3.to_dense()
        self._test_equal_spaces_dense(s1, s3)
Example #11
 def test_simple_lstsq_no_inter(self):
     tc.main(["train_composition.py", 
               "-l", self.dir_ + "log1.txt",
               "-i", self.dir_ + "an_train_data.txt", 
               "-o", self.dir_,
               "-m", "lexical_func",
               "-p", self.dir_ + "CORE_SS.AN_mat.pkl",
               "-a", self.dir_ + "CORE_SS.N_mat.pkl",
               "-r", "lstsq",
               "--intercept", "False",
               "--export_params", "True"
               ]) 
     
     trained = io_utils.load(self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
     new_space = trained.function_space
     np.testing.assert_array_almost_equal(new_space.cooccurrence_matrix.mat, 
                                          np.mat([1,0,0,1]), 10)
     self.assertTupleEqual(new_space.element_shape, (2,2))
     self.assertListEqual(new_space.id2row, ["big"])
     self.assertListEqual(new_space.id2column, [])
     
     a_space = Space.build(data=self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm", 
                           format="dm")
     
     self._test_equal_spaces_dense(a_space, new_space)
     
     tc.main(["train_composition.py", 
               "-l", self.dir_ + "log1.txt",
               "-i", self.dir_ + "an_train_data.txt", 
               "-o", self.dir_,
               "-m", "lexical_func",
               "-p", self.dir_ + "CORE_SS.AN_mat.pkl",
               "-a", self.dir_ + "CORE_SS.N_mat.pkl",
               "-r", "ridge",
               "--lambda", "0",
               "--crossvalidation", "False",
               "--intercept", "False",
               "--export_params", "True"
               ]) 
     
     trained = io_utils.load(self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
     new_space2 = trained.function_space
     np.testing.assert_array_almost_equal(new_space2.cooccurrence_matrix.mat, 
                                          np.mat([1,0,0,1]), 10)
     self.assertTupleEqual(new_space2.element_shape, (2,2))
     self.assertListEqual(new_space2.id2row, ["big"])
     self.assertListEqual(new_space2.id2column, [])
     
     a_space = Space.build(data=self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm", 
                           format="dm")
     
     self._test_equal_spaces_dense(a_space, new_space2)
Example #12
    def test_simple_lstsq_no_inter(self):
        tc.main([
            "train_composition.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "an_train_data.txt", "-o", self.dir_, "-m",
            "lexical_func", "-p", self.dir_ + "CORE_SS.AN_mat.pkl", "-a",
            self.dir_ + "CORE_SS.N_mat.pkl", "-r", "lstsq", "--intercept",
            "False", "--export_params", "True"
        ])

        trained = io_utils.load(
            self.dir_ +
            "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
        new_space = trained.function_space
        np.testing.assert_array_almost_equal(new_space.cooccurrence_matrix.mat,
                                             np.mat([1, 0, 0, 1]), 10)
        self.assertTupleEqual(new_space.element_shape, (2, 2))
        self.assertListEqual(new_space.id2row, ["big"])
        self.assertListEqual(new_space.id2column, [])

        a_space = Space.build(
            data=self.dir_ +
            "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm",
            format="dm")

        self._test_equal_spaces_dense(a_space, new_space)

        tc.main([
            "train_composition.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "an_train_data.txt", "-o", self.dir_, "-m",
            "lexical_func", "-p", self.dir_ + "CORE_SS.AN_mat.pkl", "-a",
            self.dir_ + "CORE_SS.N_mat.pkl", "-r", "ridge", "--lambda", "0",
            "--crossvalidation", "False", "--intercept", "False",
            "--export_params", "True"
        ])

        trained = io_utils.load(
            self.dir_ +
            "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
        new_space2 = trained.function_space
        np.testing.assert_array_almost_equal(
            new_space2.cooccurrence_matrix.mat, np.mat([1, 0, 0, 1]), 10)
        self.assertTupleEqual(new_space2.element_shape, (2, 2))
        self.assertListEqual(new_space2.id2row, ["big"])
        self.assertListEqual(new_space2.id2column, [])

        a_space = Space.build(
            data=self.dir_ +
            "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm",
            format="dm")

        self._test_equal_spaces_dense(a_space, new_space2)
Example #13
def compute_sim(in_file, columns, out_dir, sim_measures, space_files):

    sim_dict = {
        "cos": CosSimilarity(),
        "lin": LinSimilarity(),
        "dot_prod": DotProdSimilarity(),
        "euclidean": EuclideanSimilarity()
    }

    if not len(columns) == 2:
        raise ValueError("Column description unrecognized!")
    col0 = int(columns[0]) - 1
    col1 = int(columns[1]) - 1

    try:
        space = io_utils.load(space_files[0], Space)
    except TypeError:
        warn("Not a Space instance in file: %s" % space_files[0])
        return

    space2 = None
    space_descr = ".".join(space_files[0].split("/")[-1].split(".")[0:-1])

    if len(space_files) == 2:
        space2 = io_utils.load(space_files[1], Space)
        space_descr = ".".join([space_descr] +
                               space_files[1].split("/")[-1].split(".")[0:-1])

    descr = ".".join(["SIMS", in_file.split("/")[-1], space_descr])

    for sim_measure in sim_measures:
        print("Computing similarities: %s" % sim_measure)
        if not sim_measure in sim_dict:
            warn("Similarity measure:%s not defined" % sim_measure)
            continue

        sim = sim_dict[sim_measure]
        out_file = '%s/%s.%s' % (out_dir, descr, sim_measure)
        io_utils.create_parent_directories(out_file)

        with open(in_file) as in_stream, open(out_file, "w") as out_stream:
            for line in in_stream:
                if not line.strip() == "":
                    elems = line.strip().split()
                    word1 = elems[col0]
                    word2 = elems[col1]

                    predicted_sim = space.get_sim(word1, word2, sim, space2)
                    out_stream.write("%s %s\n" %
                                     (line.strip(), str(predicted_sim)))
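A minimal call sketch for the function above; the pair file is assumed to hold the two words in its first two columns (column indices are 1-based here), and all paths are placeholders.

# score word pairs with cosine and Lin similarity; paths are hypothetical
compute_sim(in_file="data/in/word_sims.txt",
            columns=["1", "2"],
            out_dir="data/out",
            sim_measures=["cos", "lin"],
            space_files=["data/out/CORE_SS.mat1.pkl"])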
Example #14
    def test_as_conversion_tool(self):

        bcs.main([
            "build_core_space.py", "-i", self.dir_ + "mat3", "-o", self.dir_,
            "--input_format", "sm", "--output_format", "sm"
        ])

        s1 = Space.build(data=self.dir_ + "mat3.sm",
                         cols=self.dir_ + "mat3.cols",
                         format="sm")
        s2 = Space.build(data=self.dir_ + "CORE_SS.mat3.sm",
                         rows=self.dir_ + "CORE_SS.mat3.rows",
                         cols=self.dir_ + "CORE_SS.mat3.cols",
                         format="sm")
        s3 = io_utils.load(self.dir_ + "CORE_SS.mat3.pkl", Space)

        self._test_equal_spaces_sparse(s1, s2)
        self._test_equal_spaces_sparse(s1, s3)

        bcs.main([
            "build_core_space.py", "-i", self.dir_ + "mat3", "-o", self.dir_,
            "--input_format", "sm", "--output_format", "dm"
        ])

        s1 = Space.build(data=self.dir_ + "mat3.dm",
                         cols=self.dir_ + "CORE_SS.mat3.cols",
                         format="dm")
        s2 = Space.build(data=self.dir_ + "CORE_SS.mat3.dm",
                         rows=self.dir_ + "CORE_SS.mat3.rows",
                         cols=self.dir_ + "CORE_SS.mat3.cols",
                         format="dm")
        s3 = io_utils.load(self.dir_ + "CORE_SS.mat3.pkl", Space)

        self._test_equal_spaces_dense(s1, s2)
        s3.to_dense()
        self._test_equal_spaces_dense(s1, s3)

        bcs.main([
            "build_core_space.py", "-i", self.dir_ + "mat3", "-o", self.dir_,
            "--input_format", "dm", "--output_format", "dm"
        ])

        s1 = Space.build(data=self.dir_ + "CORE_SS.mat3.dm",
                         cols=self.dir_ + "CORE_SS.mat3.cols",
                         format="dm")
        s3 = io_utils.load(self.dir_ + "CORE_SS.mat3.pkl", Space)

        s3.to_dense()
        self._test_equal_spaces_dense(s1, s3)
Example #15
    def test_simple_sparse_zipped(self):
            
        bcs.main(["build_core_space.py", 
                  "-l", self.dir_ + "log1.txt",
                  "-i", self.dir_ + "mat1", 
                  "-o", self.dir_,
                  "--input_format", "sm",
                  "--output_format", "sm",
                  "--gz", "True"
                  ])
        
        s1 = Space.build(data=self.dir_ + "mat1.sm.gz",
                         cols= self.dir_ + "mat1.cols",
                         format = "sm")

        s2 = Space.build(data=self.dir_ + "CORE_SS.mat1.sm",
                         cols=self.dir_ + "CORE_SS.mat1.cols",
                         format="sm")
        s3 = io_utils.load(self.dir_ + "CORE_SS.mat1.pkl", Space)
        s4 = Space.build(data=self.dir_ + "mat1.sm",
                         cols= self.dir_ + "mat1.cols",
                         format = "sm")
                
        self._test_equal_spaces_sparse(s1, s2)
        self._test_equal_spaces_sparse(s1, s3)
        self._test_equal_spaces_sparse(s1, s4)
Example #16
def main():

    pairs_file = sys.argv[1]
    model_id = sys.argv[2]
    space_id = sys.argv[3]
    results_dir = sys.argv[4]

    pairs_df = pd.read_csv(pairs_file, sep=' ')

    space_file = {
        'cbow-w2': 'cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.w2.vsm.pkl',
        'cbow-w5': 'cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.w5.vsm.pkl',
        'cbow-w10': 'cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.w10.vsm.pkl',
        'ppmi': 'count-based/sdewac_2015-11-23/sdewac-mst.prepro.bow-c10k-w5.ppmi.matrix.pkl'
    }

    space = io_utils.load(data_path + space_file[space_id]).apply(RowNormalization(criterion='length'))

    models = {
        'baseline-' + space_id: BaselineModel(space),
        'add' + space_id: AdditiveModel(space),
        'lexfun' + space_id: LexfunModel(space, learner='Ridge')
    }

    split = [0.5, 0.3, 0.2]
    partitioned_pairs_df = partition_pairs(pairs_df, split, random_state=42)

    df = evaluate(partitioned_pairs_df, {model_id: models[model_id]}, verbose=False)

    df.to_pickle(results_dir + model_id + '-' + space_id + '.pkl')

    writer = pd.ExcelWriter(results_dir + model_id + '-' + space_id + '.xlsx')
    df.to_excel(writer, space_id)
    writer.save()
Example #17
def load_pkl_files(dsm_prefix):
    """
    Load the space from either a single pkl file or numerous files.
    :param dsm_prefix:
    :param dsm:
    """
    
    # Check whether there is a single pickle file for the Space object
    if os.path.isfile(dsm_prefix + '.pkl'):
        return io_utils.load(dsm_prefix + '.pkl')

    # Load the multiple files: npz for the matrix and pkl for the other data members of Space
    with np.load(dsm_prefix + 'cooc.npz') as loader:
        coo = coo_matrix((loader['data'], (loader['row'], loader['col'])), shape=loader['shape'])

    cooccurrence_matrix = SparseMatrix(csr_matrix(coo))

    with open(dsm_prefix + '_row2id.pkl', 'rb') as f_in:
        row2id = pickle.load(f_in)

    with open(dsm_prefix + '_id2row.pkl', 'rb') as f_in:
        id2row = pickle.load(f_in)

    with open(dsm_prefix + '_column2id.pkl', 'rb') as f_in:
        column2id = pickle.load(f_in)

    with open(dsm_prefix + '_id2column.pkl', 'rb') as f_in:
        id2column = pickle.load(f_in)

    return Space(cooccurrence_matrix, id2row, id2column, row2id=row2id, column2id=column2id)
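A short usage sketch, assuming a prefix for which either <prefix>.pkl or the cooc.npz plus *_row2id.pkl family of files exists (the path itself is hypothetical).

# the loader decides between the single-pickle and multi-file layouts
space = load_pkl_files("/path/to/dsm/sdewac_ppmi")
print(len(space.id2row))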
Example #18
def test():
    #syntactic_tree1 = SyntacticTree.read_tree("VP (VBZ play-v) (NP (N guitar-n))")
    #syntactic_tree2 = SyntacticTree.read_tree("VP (VBZ play-v) (NP (N instrument-n))")
    
    xml_string1 = '''
    <ccg>
      <rule type="fa" cat="S[dcl]\NP">
        <lf start="1" span="1" word="play-v" lemma="play" pos="VBZ" chunk="I-VP" entity="O" cat="(S[dcl]\NP)/NP" />
        <rule type="lex" cat="NP">
          <lf start="2" span="1" word="guitar-n" lemma="guitar" pos="NN" chunk="I-NP" entity="O" cat="N" />
        </rule>
      </rule>
    </ccg>'''
    xml_string2 = '''
    <ccg>
      <rule type="fa" cat="S[dcl]\NP">
        <lf start="1" span="1" word="play-v" lemma="play" pos="VBZ" chunk="I-VP" entity="O" cat="(S[dcl]\NP)/NP" />
        <rule type="lex" cat="NP">
          <lf start="2" span="1" word="instrument-n" lemma="instrument" pos="NN" chunk="I-NP" entity="O" cat="N" />
        </rule>
      </rule>
    </ccg>'''
    syntactic_tree1 = SyntacticTree.parse_tree_from_xml_string(xml_string1)
    syntactic_tree2 = SyntacticTree.parse_tree_from_xml_string(xml_string2)
    lexical_space = io_utils.load("/home/thenghiapham/work/project/tree_kernel/spaces/lexical_ppmi_svd300.pkl")
    kernel = SemanticTreeKernel(1.0)
    composition_model = WeightedAdditive()
    semantic_tree1 = syntactic_tree_2_semantic_tree(syntactic_tree1, lexical_space, composition_model)
    semantic_tree2 = syntactic_tree_2_semantic_tree(syntactic_tree2, lexical_space, composition_model)
    print semantic_tree1
    print semantic_tree2
    print [node._label for node in semantic_tree1.get_nodes()] 
Example #19
def main():

    data_path = "/data/dsm/sdewac/"

    model = sys.argv[1]
    pairs = sys.argv[2]
    pattern_set = sys.argv[3]
    results_dir = sys.argv[4]

    pairs_df = pd.read_csv(pairs, sep=" ")

    model_file = {
        "cbow-w2": "cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.w2.vsm.pkl",
        "cbow-w5": "cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.w5.vsm.pkl",
        "cbow-w10": "cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.w10.vsm.pkl",
        "ppmi": "count-based/sdewac_2015-11-23/sdewac-mst.prepro.bow-c10k-w5.ppmi.matrix.pkl",
    }

    space = io_utils.load(data_path + model_file[model])
    space = space.apply(RowNormalization(criterion="length"))

    patterns = pd.unique(pairs_df["pattern"])

    writer = pd.ExcelWriter(results_dir + "/eval-" + model + "-" + pattern_set + ".xlsx")
    for pattern in patterns:
        df = eval_pattern(space, pairs_df, pattern, folds=10, random_state=42, verbose=True)
        df.to_excel(writer, pattern)
        writer.save()
Example #20
def build_raw_per_space(in_file_prefix, in_format, is_gz):

    if not in_format in ("sm", "dm", "pkl"):
        raise ValueError("Invalid input format:%s" % in_format)

    data_file = "%s.%s" % (in_file_prefix, in_format)

    if in_format == "pkl":
        space = io_utils.load(data_file, Space)

    else:
        if is_gz:
            data_file = "%s.gz" % data_file
        row_file = "%s.rows" % (in_file_prefix)
        column_file = "%s.cols" % (in_file_prefix)
        if not os.path.exists(row_file):
            row_file = None
        if not os.path.exists(column_file):
            if in_format == "sm":
                raise ValueError("Column file: %s needs to be provided!" % column_file)
            column_file = None
        print "Building matrix..."
        space = Space.build(data=data_file, rows=row_file, cols=column_file, format=in_format)

    return space
Example #21
def build_raw_per_space(in_file_prefix, in_format, is_gz):

    if not in_format in ("sm", "dm", "pkl"):
        raise ValueError("Invalid input format:%s" % in_format)

    data_file = '%s.%s' % (in_file_prefix, in_format)

    if in_format == "pkl":
        space = io_utils.load(data_file, Space)

    else:
        if is_gz:
            data_file = '%s.gz' % data_file
        row_file = '%s.rows' % (in_file_prefix)
        column_file = '%s.cols' % (in_file_prefix)
        if not os.path.exists(row_file):
            row_file = None
        if not os.path.exists(column_file):
            if in_format == "sm":
                raise ValueError("Column file: %s needs to be provided!" %
                                 column_file)
            column_file = None
        print("Building matrix...")
        space = Space.build(data=data_file,
                            rows=row_file,
                            cols=column_file,
                            format=in_format)

    return space
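A call sketch for the helper above; the prefix is a placeholder and the .sm/.rows/.cols files are expected to sit next to it.

# build a raw peripheral space from sparse-matrix text files; prefix is hypothetical
raw_space = build_raw_per_space("data/in/per_mat", "sm", is_gz=False)
print(len(raw_space.id2row))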
Example #22
def compute_sim(in_file, columns, out_dir, sim_measures, space_files):

    sim_dict = {"cos": CosSimilarity(),
                "lin": LinSimilarity(),
                "dot_prod": DotProdSimilarity(),
                "euclidean": EuclideanSimilarity()}

    if not len(columns) == 2:
        raise ValueError("Column description unrecognized!")
    col0 = int(columns[0]) - 1
    col1 = int(columns[1]) - 1

    try:
        space = io_utils.load(space_files[0], Space)
    except TypeError:
        warn("Not a Space instance in file: %s" % space_files[0])
        return

    space2 = None
    space_descr = ".".join(space_files[0].split("/")[-1].split(".")[0:-1])

    if len(space_files) == 2:
        space2 = io_utils.load(space_files[1], Space)
        space_descr = ".".join([space_descr] + space_files[1].split("/")[-1].split(".")[0:-1])

    descr = ".".join(["SIMS", in_file.split("/")[-1], space_descr])

    for sim_measure in sim_measures:
        print "Computing similarities: %s" % sim_measure
        if not sim_measure in sim_dict:
            warn("Similarity measure:%s not defined" % sim_measure)
            continue

        sim = sim_dict[sim_measure]
        out_file = '%s/%s.%s' % (out_dir, descr, sim_measure)
        io_utils.create_parent_directories(out_file)

        with open(in_file) as in_stream, open(out_file,"w") as out_stream:
            for line in in_stream:
                if not line.strip() == "":
                    elems = line.strip().split()
                    word1 = elems[col0]
                    word2 = elems[col1]

                    predicted_sim = space.get_sim(word1, word2, sim, space2)
                    out_stream.write("%s %s\n" % (line.strip(), str(predicted_sim)))
Example #23
def test():
    print "hello"
    syntactic_tree1 = SyntacticTree.read_tree("VP (VBZ play-v) (NP (N guitar-n))")
    syntactic_tree2 = SyntacticTree.read_tree("VP (VBZ play-v) (NP (N instrument-n))")
    lexical_space = io_utils.load("/home/thenghiapham/work/project/tree_kernel/spaces/lexical_ppmi_svd300.pkl")
    kernel = SemanticSyntacticTreeKernel(1.0, lexical_space)
    print syntactic_tree1
    print syntactic_tree2
    print [node._label for node in syntactic_tree1.get_nodes()] 
Example #24
def main():

    partitioned_pairs_file = sys.argv[1]
    patterns_file = sys.argv[2]
    model_id = sys.argv[3]
    space_id = sys.argv[4]
    pattern_map_file = sys.argv[5]
    results_file = sys.argv[6]

    partitioned_pairs_df = pd.read_csv(partitioned_pairs_file, index_col=0)

    space_file = {
        'cbow-w2': 'cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.w2.vsm.pkl',
        'cbow-w5': 'cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.w5.vsm.pkl',
        'cbow-w10': 'cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.w10.vsm.pkl',
        'ppmi': 'count-based/sdewac_2015-11-23/sdewac-mst.prepro.bow-c10k-w5.ppmi.matrix.pkl'
    }

    space = io_utils.load(data_path + space_file[space_id]).apply(RowNormalization(criterion='length'))

    models = {
        'baseline' : BaselineModel(space),
        'add' : AdditiveModel(space),
        'lexfun' : LexfunModel(space, learner='Ridge'),
        'wadd' : WeightedAdditiveModel(space),
        'mul': MultiplicativeModel(space),
        'waddx': WeightedAdditiveModel(space, no_diff=True)
    }

    model = models[model_id]

    if patterns_file == 'None':
        patterns = None
    else:
        patterns = []
        with open(patterns_file) as f:
            for l in f.read().splitlines():
                patterns += l.split(' ')

    if pattern_map_file == 'None':
        pattern_map = {}
    else:
        pattern_map = {}
        with open(pattern_map_file) as f:
            for l in f.read().splitlines():
                xs = l.split(' ')
                superpattern = xs[0]
                for p in xs[1:]:
                    pattern_map[p] = superpattern

    df = prediction_features(partitioned_pairs_df, model, patterns, verbose=False, pattern_map=pattern_map)

    df.to_pickle(results_file + '.pkl')

    df.to_csv(results_file + '.csv')
Example #25
def transform_raw_per_space(raw_per_space, in_file_prefix, out_dir, out_format, core_space_file):

    in_file_descr = "PER_SS." + in_file_prefix.split("/")[-1]
    core_space = io_utils.load(core_space_file, Space)
    core_descr = ".".join(core_space_file.split("/")[-1].split(".")[0:-1])

    space = PeripheralSpace(core_space, raw_per_space.cooccurrence_matrix, raw_per_space.id2row, raw_per_space.row2id)

    print "Printing..."
    out_file_prefix = "%s/%s.%s" % (out_dir, in_file_descr, core_descr)
    io_utils.save(space, out_file_prefix + ".pkl")
    if not out_format is None:
        space.export(out_file_prefix, format=out_format)
Example #26
def transform_raw_per_space(raw_per_space, in_file_prefix, out_dir, out_format,
                            core_space_file):

    in_file_descr = "PER_SS." + in_file_prefix.split("/")[-1]
    core_space = io_utils.load(core_space_file, Space)
    core_descr = ".".join(core_space_file.split("/")[-1].split(".")[0:-1])

    space = PeripheralSpace(core_space, raw_per_space.cooccurrence_matrix,
                            raw_per_space.id2row, raw_per_space.row2id)

    print("Printing...")
    out_file_prefix = "%s/%s.%s" % (out_dir, in_file_descr, core_descr)
    io_utils.save(space, out_file_prefix + ".pkl")
    if not out_format is None:
        space.export(out_file_prefix, format=out_format)
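Chaining this helper with build_raw_per_space from the earlier examples gives the usual peripheral-space pipeline; a sketch with placeholder paths, assuming a core space pickle produced earlier by build_core_space.py.

# raw counts -> peripheral space aligned with an existing core space; paths are hypothetical
raw_space = build_raw_per_space("data/in/per_mat", "sm", is_gz=False)
transform_raw_per_space(raw_space,
                        in_file_prefix="data/in/per_mat",
                        out_dir="data/out",
                        out_format="dm",
                        core_space_file="data/out/CORE_SS.mat1.pkl")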
Example #27
def Load_Semantic_Space(space_file):
    # Load the semantic space
    global my_space
    my_space = io_utils.load(space_file)
    # Normalize the space
    # my_space = my_space.apply(RowNormalization())

    global space_dim
    space_dim = my_space.element_shape[0]

    # Get the rows in the space
    keyset = set([])
    for x in my_space.get_row2id():
        keyset.add(x)
    global key_set
    key_set = keyset
Example #28
def load_context_vocab(context_filename, spaces_dir):
    logging.info('Using {0} contents as context words to build a comparable'
    ' space'.format(context_filename))
    if not os.path.isfile(context_filename):
        logging.info('{0} not found: building...'.format(context_filename))
        space_filenames = glob.glob(os.path.join(spaces_dir, '*.pkl'))
        words = []
        for space_filename in space_filenames:
            sp = io_utils.load(space_filename)
            words.append(set(sp.id2row))
        context_words = set.intersection(*words)
        with open(context_filename, 'w') as f:
            for w in context_words:
                f.write('{0}\n'.format(w))

        logging.info('File {0} created'.format(context_filename))
    return [l.strip() for l in file(context_filename)]
Пример #29
0
def build_spaces(in_file_prefix, in_format, out_dir, out_format, weightings,
                 selections, reductions, normalizations, is_gz):

    in_file_descr = "CORE_SS." + in_file_prefix.split("/")[-1]
    data_file = '%s.%s' % (in_file_prefix, in_format)

    if not in_format in ("sm", "dm", "pkl"):
        raise ValueError("Invalid input format:%s" % in_format)

    if in_format == "pkl":
        space = io_utils.load(data_file, Space)
    else:
        if is_gz:
            data_file = '%s.gz' % data_file
        row_file = '%s.rows' % (in_file_prefix)
        column_file = '%s.cols' % (in_file_prefix)
        if not os.path.exists(row_file):
            row_file = None
        if not os.path.exists(column_file):
            if in_format == "sm":
                raise ValueError("Column file: %s needs to be provided!" %
                                 column_file)
            column_file = None

        print("Building matrix...")
        space = Space.build(data=data_file,
                            rows=row_file,
                            cols=column_file,
                            format=in_format)

    for w in weightings:
        w_space = apply_weighting(space, w)

        for s in selections:
            s_space = apply_selection(w_space, s)

            for r in reductions:
                r_space = apply_reduction(s_space, r)

                for n in normalizations:
                    n_space = apply_normalization(r_space, n)

                    print("Printing...")
                    print_space(n_space, out_dir, [in_file_descr, w, s, r, n],
                                out_format)
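A sketch of a single call covering one point of the weighting/selection/reduction/normalization grid; the option strings are assumptions about what apply_weighting and the other helpers in this script accept, and the file prefix is a placeholder.

build_spaces(in_file_prefix="data/in/mat1",
             in_format="sm",
             out_dir="data/out",
             out_format="dm",
             weightings=["ppmi"],       # assumed weighting label
             selections=["none"],       # assumed "no selection" label
             reductions=["svd_100"],    # assumed reduction label
             normalizations=["none"],   # assumed "no normalization" label
             is_gz=False)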
Example #30
    def test_simple_sparse(self):

        bcs.main([
            "build_core_space.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "mat1", "-o", self.dir_, "--input_format", "sm",
            "--output_format", "sm"
        ])

        s1 = Space.build(data=self.dir_ + "mat1.sm",
                         cols=self.dir_ + "mat1.cols",
                         format="sm")
        s2 = Space.build(data=self.dir_ + "CORE_SS.mat1.sm",
                         cols=self.dir_ + "CORE_SS.mat1.cols",
                         format="sm")
        s3 = io_utils.load(self.dir_ + "CORE_SS.mat1.pkl", Space)

        self._test_equal_spaces_sparse(s1, s2)
        self._test_equal_spaces_sparse(s1, s3)
Example #31
def load_pkl_files(dsm_prefix):
    """
    Load the space from either a single pkl file or numerous files.
    :param dsm_prefix: the prefix of the input files (.pkl, .rows, .cols)
    """

    # Check whether there is a single pickle file for the Space object
    if os.path.isfile(dsm_prefix + '.pkl'):
        return io_utils.load(dsm_prefix + '.pkl')

    # Load the multiple files: npz for the matrix and pkl for the other data members of Space
    if os.path.isfile(dsm_prefix + '.npz'):
        with np.load(dsm_prefix + '.npz') as loader:
            coo = coo_matrix((loader['data'], (loader['row'], loader['col'])), shape=loader['shape'])

        cooccurrence_matrix = SparseMatrix(csr_matrix(coo))

        with open(dsm_prefix + '_row2id.pkl', 'rb') as f_in:
            row2id = pickle.load(f_in)

        with open(dsm_prefix + '_id2row.pkl', 'rb') as f_in:
            id2row = pickle.load(f_in)

        with open(dsm_prefix + '_column2id.pkl', 'rb') as f_in:
            column2id = pickle.load(f_in)

        with open(dsm_prefix + '_id2column.pkl', 'rb') as f_in:
            id2column = pickle.load(f_in)

        return Space(cooccurrence_matrix, id2row, id2column, row2id=row2id, column2id=column2id)

    if os.path.isfile(dsm_prefix + '.tsv'):
        values = np.loadtxt(dsm_prefix + '.tsv', dtype=float, delimiter='\t', skiprows=0, comments=None, encoding='utf-8')
        targets = np.loadtxt(dsm_prefix + '.rows', dtype=str, skiprows=0, comments=None, encoding='utf-8')
        # Convert to space in sparse matrix format        
        return Space(SparseMatrix(values), list(targets), [])
    
    # If everything fails try to load it as single w2v file
    space_array = np.loadtxt(dsm_prefix + '.w2v', dtype=object, delimiter=' ', skiprows=1, comments=None, encoding='utf-8')
    targets = space_array[:,0].flatten()
    values = space_array[:,1:].astype(np.float)
    # Convert to space and sparse matrix format        
    return Space(SparseMatrix(values), list(targets), [])
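The multi-file layout read by the npz branch above can be written back with a matching saver; a sketch, assuming cooccurrence_matrix.mat is a scipy sparse matrix (as the SparseMatrix wrapper suggests) and that row2id/column2id are exposed as attributes of Space.

import pickle
import numpy as np
from scipy.sparse import coo_matrix

def save_pkl_files(space, dsm_prefix):
    # hypothetical counterpart to load_pkl_files: write the .npz + *_row2id.pkl family of files
    coo = coo_matrix(space.cooccurrence_matrix.mat)
    np.savez(dsm_prefix + '.npz',
             data=coo.data, row=coo.row, col=coo.col, shape=coo.shape)
    for suffix, obj in [('_row2id', space.row2id), ('_id2row', space.id2row),
                        ('_column2id', space.column2id), ('_id2column', space.id2column)]:
        with open(dsm_prefix + suffix + '.pkl', 'wb') as f_out:
            pickle.dump(obj, f_out)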
Example #32
def build_spaces(in_file_prefix, in_format, out_dir, out_format, weightings, 
                 selections, reductions, normalizations, is_gz):

    in_file_descr = "CORE_SS." + in_file_prefix.split("/")[-1]
    data_file = '%s.%s' % (in_file_prefix, in_format)
    
    if not in_format in ("sm", "dm", "pkl"):
        raise ValueError("Invalid input format:%s" % in_format) 
    
    if in_format == "pkl":
        space = io_utils.load(data_file, Space)
    else:
        if is_gz:
            data_file = '%s.gz' % data_file    
        row_file = '%s.rows' % (in_file_prefix)
        column_file = '%s.cols' % (in_file_prefix)
        if not os.path.exists(row_file):
            row_file = None
        if not os.path.exists(column_file):
            if in_format == "sm":
                raise ValueError("Column file: %s needs to be provided!" 
                                 % column_file)
            column_file = None
            
        print "Building matrix..."   
        space = Space.build(data=data_file, rows=row_file, cols=column_file, 
                            format=in_format)
 
    for w in weightings:
        w_space = apply_weighting(space, w)
                
        for s in selections:
            s_space = apply_selection(w_space, s)

            for r in reductions:
                r_space = apply_reduction(s_space, r)
                
                for n in normalizations:
                    n_space = apply_normalization(r_space, n)
                    
                    print "Printing..."
                    print_space(n_space, out_dir, [in_file_descr, w, s, r, n], out_format)
Example #33
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument('spaces_dir')
    ap.add_argument('words_list_dir')
    args = ap.parse_args()
    spaces_dir = args.spaces_dir
    words_list_dir = args.words_list_dir
    #    '/mnt/8tera/shareclic/lucaNgrams/5grams/ITA_5grams/matrices/pkl_matrices/'
    #space_filename = '../spaces/cbow1_wind5_hs0_neg10_size400_smpl1e-05.pkl'

    output_dir = os.path.join('output', os.path.basename(words_list_dir))
    mkdir_p(output_dir)
    all_words = set(l.strip() for words_filename in glob.glob(os.path.join(words_list_dir, '*'))
        for l in file(words_filename))

    for words_filename in glob.glob(os.path.join(words_list_dir, '*')):
        space_filename = os.path.join(spaces_dir,
            os.path.splitext(os.path.basename(words_filename))[0] + '.pkl')
        if not os.path.isfile(space_filename):
            logging.error('{0} not found: ignoring'.format(space_filename ))
            continue

        context_filename = hashlib.md5(spaces_dir).hexdigest() + '.txt'
        context_words = load_context_vocab(context_filename, spaces_dir)

        logging.debug('Processing {0}'.format(space_filename))
        sp = io_utils.load(space_filename)

        #words = [l.strip() for l in file(words_filename)]
        filtered_words = [w for w in all_words if w in sp.row2id]
        words_vectors = sp.get_rows(filtered_words)
        context_vectors = sp.get_rows(context_words)

        m = words_vectors * context_vectors.transpose()

        sp2 = Space(m, filtered_words, context_words)

        io_utils.save(sp2,
            os.path.join(output_dir,os.path.basename(space_filename)))
Example #34
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument('--words', nargs='*')
    ap.add_argument('--spaces', nargs='*')
    ap.add_argument('-m', '--min-occurrences', type=int, default=1)
    args = ap.parse_args()

    words = set(l.strip() for words_filename in args.words for l in
        file(words_filename))

    word_reps = defaultdict(lambda: 0)
    for sp_filename in args.spaces:
        logging.info('Counting words in {0}'.format(sp_filename))
        sp = io_utils.load(sp_filename)
        for w in words:
            if w in sp.row2id:
                word_reps[w] += 1

    for filename in args.words:
        file_words = [l.strip() for l in file(filename)]
        with open(filename, 'w') as f:
            for w in file_words:
                if word_reps[w] >= args.min_occurrences:
                    f.write('{0}\n'.format(w))
Example #35
def run_many_clusters(algo):
    verbs_filename = 'verbs3.txt'

    filelist = ['output_1000BCto500BC_vocab10000_window1_withoutprep',
                'output_499BCto250BC_vocab10000_window1_withoutprep',
                'output_249BCto0AD_vocab10000_window1_withoutprep',
                'output_1ADto250AD_vocab10000_window1_withoutprep',
                'output_251ADto500AD_vocab10000_window1_withoutprep'
            ]

    if 'kmean' in algo:
        f_log = io.open('clusters/kmeans/generallog.csv','w',encoding='utf8')
    else:
        f_log = io.open('clusters/log2.csv', 'w', encoding='utf8')
        f_log.write('n_verb_clusters, n_total_clusters, n_verbs, min_cluster_size, min_samples, metric, dimensions, n_words, n_plot\n'.decode('utf8'))

    for filebase in filelist:
        pickles = [f for f in os.listdir(filebase) if f.endswith('.pkl')]

        if(len(pickles) == 0):
            print 'No pickles found in directory ' + FLAGS.filebase + '!'
            exit()

        pickle = pickles[0]
        fname = filebase + '/' + pickle
        sp = io_utils.load(fname)

        f_verbs = io.open(verbs_filename, 'r', encoding='utf8')
        verbs = list(f_verbs)
        f_verbs.close()

        verbs_bytes = []
        for verb in verbs:
            verb_byte = verb.strip().encode('utf8')
            if verb_byte in sp.id2row:
                verbs_bytes.append(verb_byte)

        print str(len(verbs_bytes)) + ' verbs in main list\n'

        verb_ids = [sp.row2id[verb] for verb in verbs_bytes]

        n_plot = 400

        if 'kmean' in algo:
            for n_words in xrange(3000,3001,1000):
                for dimensions in xrange(2,3):
                    for n_clusters in xrange(8,9):
                        start_time = time.time()
                        [clusterer,
                        mat,
                        clusterids,
                        verb_clusters,
                        ids_for_shortened_mat,
                        cluster_words_plot,
                        cluster_words_all] = cluster_and_plot(sp,
                            'clusters/kmeans/' + verbs_filename[:-4] + filebase[7:-31] + 'Rectangular25',
                            'kmean',
                            n_clusters,
                            dimensions,
                            n_words,
                            verb_ids,
                            n_plot,
                            0,
                            0,
                            0)

                        n_verbs = len(verb_clusters)
                        n_verb_clusters = len(set(verb_clusters))
                        n_total_clusters = len(set(clusterer.labels_)) - 1
                        print str(n_verbs) + ' verbs specified, split into ' + str(n_verb_clusters) + ' clusters.'

                        out_list_str = ['{:4d}'.format(n_verb_clusters),
                                        '{:4d}'.format(n_total_clusters),
                                        '{:3d}'.format(n_verbs),
                                        '{:3d}'.format(dimensions),
                                        '{:5d}'.format(n_words),
                                        '{:4d}'.format(n_plot)]
                        out_str = ','.join(out_list_str)

                        f_log.write(out_str.decode('utf8')+'\n')
                        end_time = time.time()
                        print 'Clustering took {:.2f} s'.format(end_time - start_time)


        else:
            for n_words in xrange(2000,4001,1000):
                for dimensions in xrange(2,8):
                    for min_cluster_size in xrange(15,30):
                        for min_samples in xrange(1,15):
                            for metric in xrange(0,1):
                                start_time = time.time()
                                [clusterer,
                                 mat,
                                 clusterids,
                                 verb_clusters,
                                 ids_for_shortened_mat] = cluster_and_plot(sp,
                                     dimensions,
                                     n_words,
                                     verb_ids,
                                     n_plot,
                                     min_cluster_size,
                                     min_samples,
                                     metric
                                     )

                                n_verbs = len(verb_clusters)
                                n_verb_clusters = len(set(verb_clusters)) - 1
                                n_total_clusters = len(set(clusterer.labels_)) - 1
                                print str(n_verbs) + ' verbs specified, split into ' + str(n_verb_clusters) + ' clusters.'

                                out_list_str = ['{:4d}'.format(n_verb_clusters),
                                                '{:4d}'.format(n_total_clusters),
                                                '{:3d}'.format(n_verbs),
                                                '{:3d}'.format(min_cluster_size),
                                                '{:3d}'.format(min_samples),
                                                '{:2d}'.format(metric),
                                                '{:3d}'.format(dimensions),
                                                '{:5d}'.format(n_words),
                                                '{:4d}'.format(n_plot)]
                                out_str = ','.join(out_list_str)

                                f_log.write(out_str.decode('utf8')+'\n')
                                end_time = time.time()
                                print 'Clustering took {:.2f} s'.format(end_time - start_time)

    f_log.close()
Example #36

    FLAGS, unparsed = parser.parse_known_args()

    FLAGS.filebase = 'output_499BCto250BC_vocab10000_window1_withoutprep'
    FLAGS.verbs_filename = 'verbs_fewer_compounds.txt'

    pickles = [f for f in os.listdir(FLAGS.filebase) if f.endswith('.pkl')]

    if(len(pickles) == 0):
        print 'No pickles found in directory ' + FLAGS.filebase + '!'
        exit()

    pickle = pickles[0]
    fname = FLAGS.filebase + '/' + pickle
    sp = io_utils.load(fname)

    f_verbs = io.open(FLAGS.verbs_filename, 'r', encoding='utf8')
    verbs = list(f_verbs)
    f_verbs.close()

    verbs_bytes = []
    for verb in verbs:
        verb_byte = verb.strip().encode('utf8')
        if verb_byte in sp.id2row:
            verbs_bytes.append(verb_byte)

    print str(len(verbs_bytes)) + ' verbs in main list\n'

    verb_ids = [sp.row2id[verb] for verb in verbs_bytes]
Example #37
#ex07.py
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity

#load two spaces
my_space = io_utils.load("./data/out/ex01.pkl")
my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl")

print my_space.id2row
print my_per_space.id2row

#compute similarity between a word and a phrase in the two spaces
print my_space.get_sim("car", "sports_car", CosSimilarity(),
                       space2 = my_per_space)
Example #38
    #
    #    FLAGS.filebase = 'output_2000BCto2000AD_vocab20000_window5'
    #    FLAGS.verbs_filename = 'verbs1.txt'
    #    FLAGS.min_similarity = 0.7
    #    FLAGS.number_neighbours = 30
    #    FLAGS.number_mean_neighbours = 500

    pickles = [f for f in os.listdir(FLAGS.filebase) if f.endswith('.pkl')]

    if (len(pickles) == 0):
        print 'No pickles found in directory ' + FLAGS.filebase + '!'
        exit()

    pickle = pickles[0]
    fname = FLAGS.filebase + '/' + pickle
    this_space = io_utils.load(fname)

    f_verbs = io.open(FLAGS.verbs_filename, 'r', encoding='utf8')
    verbs = list(f_verbs)
    f_verbs.close()

    f_verbs_for_mean = io.open(FLAGS.verbs_for_mean_filename,
                               'r',
                               encoding='utf8')
    verbs_for_mean = list(f_verbs_for_mean)
    f_verbs_for_mean.close()

    verbs_bytes = []
    for verb in verbs:
        verb_byte = verb.strip().encode('utf8')
        if verb_byte in this_space.id2row:
Example #39
#ex12.py
#-------
from composes.utils import io_utils

#load a previously saved weighted additive model
my_comp = io_utils.load("./data/out/model01.pkl")

#print its parameters
print "alpha:", my_comp.alpha
print "beta:", my_comp.beta

#load two spaces
my_space = io_utils.load("./data/out/ex10.pkl")
my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl")

#apply the composition model to them
composed_space = my_comp.compose([("good", "history_book", "good_history_book")],
                                 (my_space, my_per_space))

print composed_space.id2row
print composed_space.cooccurrence_matrix



Example #40
#ex13.py
#-------
from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive

#training data
train_data = [("good", "car", "good_car"), ("good", "book", "good_book")]

#load an argument space
arg_space = io_utils.load("./data/out/ex10.pkl")
print arg_space.id2row
print arg_space.cooccurrence_matrix

#load a phrase space
phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl")
print phrase_space.id2row
print phrase_space.cooccurrence_matrix

#train a weighted additive model on the data
my_comp = WeightedAdditive()
my_comp.train(train_data, arg_space, phrase_space)

#print its parameters
print "alpha:", my_comp.alpha
print "beta:", my_comp.beta
Example #41
#ex09.py
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity

#load two spaces
my_space = io_utils.load("./data/out/ex01.pkl")
my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl")

print(my_space.id2row)
print(my_space.cooccurrence_matrix)
print(my_per_space.id2row)
print(my_per_space.cooccurrence_matrix)

#get the top two neighbours of "car" in a peripheral space
print(my_space.get_neighbours("car", 2, CosSimilarity(), space2=my_per_space))
Example #42
from __future__ import print_function
from composes.utils import io_utils

gastrovec = io_utils.load("gastrovec.ppmi.svd20.pkl")

gastrovec.export(file_prefix="fullexport", format="dm")
'''
with open("export3.csv","w") as f:
    # f.write("INGREDIENT " + " ".join(gastrovec.id2column) + "\n")
    with open("export.dm") as f_in:
        for line in f_in:
            f.write(line)
'''
Example #43
#ex20.py
#-------
from composes.utils import io_utils
from composes.utils import scoring_utils
from composes.similarity.cos import CosSimilarity

#read in a space
my_space = io_utils.load("data/out/ex01.pkl")

#compute similarities of a list of word pairs
fname = "data/in/word_sims.txt"
word_pairs = io_utils.read_tuple_list(fname, fields=[0, 1])
predicted = my_space.get_sims(word_pairs, CosSimilarity())

#compute correlations
gold = io_utils.read_list(fname, field=2)
print "Spearman"
print scoring_utils.score(gold, predicted, "spearman")
print "Pearson"
print scoring_utils.score(gold, predicted, "pearson")
Example #44
import sys
import os

folder = os.path.expandvars('/home/luka/Downloads/dissect-master/src')
if folder not in sys.path:
    sys.path.append(folder)

from composes.semantic_space.space import Space

my_space = Space.build(
    data="/home/luka/Downloads/dissect-master/src/examples/data/in/ex01.sm",
    rows="/home/luka/Downloads/dissect-master/src/examples/data/in/ex01.rows",
    cols="/home/luka/Downloads/dissect-master/src/examples/data/in/ex01.cols",
    format="sm")

from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting

my_space = io_utils.load(
    "/home/luka/Downloads/dissect-master/src/examples/data/out/ex01.pkl")
print(my_space.cooccurrence_matrix)

my_space = my_space.apply(PpmiWeighting())
print(my_space.cooccurrence_matrix)
Пример #45
0
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument('--export-only', action='store_true', default=False)
    ap.add_argument('space_dir', help='Directory where the DISSECT spaces are '
    'located')
    ap.add_argument('spaces_order', help='Order in time of the spaces (no '
    'relevant effect when exporting)')
    ap.add_argument('target_word', help='This is the word that we want to '
    'highlight in the animation (no effect when exporting)')
    args = ap.parse_args()


    center_word = args.target_word #'cane'
    space_dir = args.space_dir
    # get the spaces filenames
    space_filenames = [os.path.join(space_dir, os.path.basename(l.strip())) for l in open(args.spaces_order)]
    def guess_year(space_filename):
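        # the year is assumed to be the second "_"-separated field of the file
        # name; three-digit years get a leading "1" prepended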
        try:
            basename = os.path.basename(space_filename)
            year = basename.split("_")[1]
            if len(year) == 3:
                return "1" + year
            else:
                return year
        except:
            #We don't want to take any chances with this feature: if it doesn't
            #work, tough luck
            return ""
    # guess the years
    years = [guess_year(f) for f in space_filenames]
    # load the spaces
    spaces = [io_utils.load(f) for f in space_filenames]

    # put together all the spaces adding the year to each of the words
    # to avoid repetitions (the words are unique)
    stacked = None
    for sp,space_filename in zip(spaces, space_filenames):
        stacked = vstack(stacked, add_year(sp,
        os.path.basename(space_filename)))

    # Find a mapping to 2D (in this case we are finding the mapping to 2D by
    # actually finding the 2D coordinates of the vectors, but one could 
    # find such a mapping by other means, and then apply it to get the 
    # vector coordinates)
    stacked = stacked.apply(Svd(2))


    # Apply the mapping (given by the stacked space) to obtain the 2D vectors.
    # As explained below, now this is redundant, but it does not necessarily
    # need to be the case
    transformed_spaces = [PeripheralSpace(stacked, sp.cooccurrence_matrix,
    sp.id2row, sp.row2id) for sp in spaces]

    if args.export_only:
        #print the coordinates
        print ",".join(["year,word,x,y"])
        for year, sp in zip(years, transformed_spaces):
            for w in sp.id2row:
                v = sp.get_row(w).mat
                print ",".join([year,w,str(v[0,0]), str(v[0,1])])

    else:
        #produce animation
        anim = AnimatedScatter(center_word,years, transformed_spaces, stacked,
        scale_factor=40)
        mkdir_p('output')
        anim.save('output/{0}.mp4'.format(center_word))
Пример #46
0
def main():
    global input_is_tokenized, use_lemmatization, space_cols_file, \
           loaded_space_file_s, loaded_space_file_t, source_lang, \
           target_lang, input_file, output_file, tag_cutoff, \
           no_stopword_print, number_of_translations, \
           number_of_neighbours, different_pos_punishment, \
           treetagger_path
    
    parser = argparse.ArgumentParser(description="Word translations" + \
                                     " that fit best to the sentence")
    parser.add_argument("-k", "--tokenized", 
           help="use pretokenized input", action="store_true")
    parser.add_argument("-l", "--lemmatized", 
           help="use lemmatization", action="store_true")
    parser.add_argument("-p", "--returntag", 
           help="return language tag", action="store_true")
    parser.add_argument("-d", "--dimensions", type=str,
           help="column file for the input matrix")
    parser.add_argument("-m", "--sourcematrix", type=str,
           help="pickled input matrix for source language")
    parser.add_argument("-y", "--targetmatrix", type=str,
           help="pickled input matrix for target language")
    parser.add_argument("-s", "--sourcelang", type=str, 
           help="input language")
    parser.add_argument("-t", "--targetlang", type=str,
           help="output language")
    parser.add_argument("-i", "--infile", type=str, 
           help="input file")
    parser.add_argument("-o", "--outfile", type=str, 
           help="output file")
    parser.add_argument("-nsp", "--no-stopword-print", 
           action="store_true", 
           help="Omit to print words without candidates -- usually " + \
                 "stop words.")
    parser.add_argument("-nt", "--number-of-translations", type=float,
           help="The number of candidates to show for each input word.")
    parser.add_argument("-nn", "--number-of-neighbours", type=int,
           help="The number of neighbours for each input word to " + \
                "consider in the similarity space constructed.")
    parser.add_argument("-dpp", "--different-pos-punishment", 
           type=float, help="The score's fraction to punish a " + \
                             "candidate word which is there, but " + \
                             "has not the same POS as its input peer.")
    args = parser.parse_args()
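
    # Resolve configuration: explicit command-line options take precedence;
    # otherwise fall back to defaults derived from the source/target language pair.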
    
    if args.sourcelang:
        source_lang = args.sourcelang
    if args.targetlang:
        target_lang = args.targetlang
    if args.tokenized:
        input_is_tokenized = True
    if args.lemmatized:
        use_lemmatization = True
    if args.dimensions:
        space_cols_file = args.dimensions
    elif source_lang == target_lang:
        space_cols_file = DATA_DIR_OUT + source_lang + '-words.col'
    else:
        space_cols_file = DATA_DIR_OUT \
                        + '_'.join(sorted([source_lang,target_lang])) \
                        + '-words.col'
    if args.sourcematrix:
        loaded_space_file_s = args.sourcematrix
    elif source_lang == target_lang:
        loaded_space_file_s = DATA_DIR_OUT + source_lang + '.pkl'
    else:
        loaded_space_file_s = DATA_DIR_OUT + source_lang \
                            + '_' + source_lang + '-' + target_lang \
                            + '.pkl'
    if args.targetmatrix:
        loaded_space_file_t = args.targetmatrix
    elif source_lang == target_lang and loaded_space_file_t == "":
        loaded_space_file_t = DATA_DIR_OUT + target_lang + '.pkl'
    else:
        loaded_space_file_t = DATA_DIR_OUT + target_lang \
                            + '_' + target_lang + '-' + source_lang \
                            + '.pkl'
    if args.infile:
        input_file = open(args.infile, "r")
    if args.outfile:
        output_file = open(args.outfile, "w")
    if args.returntag:
        tag_cutoff = 0
    else:
        if args.lemmatized:
            tag_cutoff = 5
        else:
            tag_cutoff = 3
    if args.no_stopword_print:
        no_stopword_print = args.no_stopword_print

    # vector dimension/columns for input matrix and matrix per sentence
    space_cols_fileobject = open(space_cols_file, "r")
    # space_cols = space_cols_fileobject.readlines()
    space_cols = space_cols_fileobject.read().split("\n")[:-1] 
    space_cols_fileobject.close()

    # load the space
    loaded_space = {}
    loaded_space[source_lang] = io_utils.load(loaded_space_file_s)
    # only load it once for similarity queries in the same language
    if not loaded_space.get(target_lang):
        loaded_space[target_lang] = io_utils.load(loaded_space_file_t)

    # Initialize TreeTagger only once (for later use)
    treetagger = TreeTagger(TAGLANG=source_lang, TAGDIR=treetagger_path,
                            TAGINENC=ENC, TAGOUTENC=ENC)
    
    # work on input file
    while True:
        line = input_file.readline()
        words = [] # words in sentence
        lemmas = [] # lemmas in sentence
        pos = [] # part-of-speech tags per word in sentence
        formatted = []
        # matrix for sentence
        freq = defaultdict(lambda: defaultdict(int))

        # Stop when file is entirely read
        if not line:
            break

        # For pre-treetagged text
        if input_is_tokenized:
            while not re.match(r'[.:?!]', line):
                t = line.rstrip()
                w = t.split("\t")[0]
                p = helpers.getTag(t.split("\t")[1], source_lang)
                l = t.split("\t")[2]
                words.append(w)
                lemmas.append(l)
                pos.append(p)
                formatted.append(helpers.dimensionformat(w, p, l, 
                                 source_lang, use_lemmatization))
                line = input_file.readline()
                if not line:
                    break

        # Use tree-tagger as lemmatizer and/or tokenizer
        else:
            treetagger_sentence = treetagger.TagText(line)
            for t in treetagger_sentence:
                try:
                    w = t.split("\t")[0]
                    p = helpers.getTag(t.split("\t")[1], source_lang)
                    l = t.split("\t")[2]
                except:
                    print("Caution: TreeTagger token cannot "
                          "be processed:", t, file=sys.stderr)
                    continue # Skip it
                words.append(w)
                lemmas.append(l)
                pos.append(p)
                formatted.append(helpers.dimensionformat(w, p, l, 
                                 source_lang, use_lemmatization))

        # fill matrix for sentence
        for i in formatted:
            for j in formatted:
                freq[i][j] += 1

        # build unique list of the words in this sentence for the rows
        uniqwords = set()
        for l in formatted:
            uniqwords.add(l)
        query_rows = list(uniqwords) # rows for sentence matrix

        # dissect compatible matrix
        m = np.mat(np.zeros(shape=(len(query_rows), len(space_cols))))

        # convert sentence matrix to compatible matrix
        for i in range(len(query_rows)):
            for j in range(len(space_cols)):
                m[i, j] = freq[query_rows[i]][space_cols[j]]

        # build dissect matrix
        query_space = Space(DenseMatrix(m), query_rows, space_cols)

        # for every word print neighbours with similarity
        for i in range(len(words)):
            best_translations = get_best_translations(words[i], pos[i], 
                                lemmas[i], query_space, loaded_space)
            output_file.write(format_best_translations(words[i], pos[i], 
                              lemmas[i], best_translations))

        if input_is_tokenized:
            output_file.write(line.split("\t")[0] + "\n")

            
    if args.infile:
        input_file.close()
    if args.outfile:
        output_file.close()
Пример #47
0
from composes.composition.lexical_function import LexicalFunction
from composes.composition.full_additive import FullAdditive
from composes.composition.weighted_additive import WeightedAdditive
from composes.composition.multiplicative import Multiplicative
from composes.composition.dilation import Dilation
from composes.utils.regression_learner import RidgeRegressionLearner
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.feature_selection.top_feature_selection import TopFeatureSelection
from composes.transformation.dim_reduction.svd import Svd
from composes.semantic_space.peripheral_space import PeripheralSpace

import composes.utils.io_utils as io_utils
import composes.utils.scoring_utils as scoring_utils

#load a core space
print "Loading the data..."
data_path = "/mnt/cimec-storage-sata/users/thenghia.pham/shared/tutorial/"

space_file = data_path + "CORE_SS.verbnoun.core.pkl"
space = io_utils.load(space_file)

print "Applying PPMI..."
space = space.apply(PpmiWeighting())

print "Applying feature selection..."
space = space.apply(TopFeatureSelection(2000))

print "Applying SVD..."
space = space.apply(Svd(100))

print "Creating peripheral space.."
per_space = PeripheralSpace.build(space,
                                  data=data_path + "per.raw.SV.sm",
                                  cols=data_path + "per.raw.SV.cols",
                                  format="sm")
Пример #48
0
            help='Number of dimensions used for SVD')
    parser.add_argument(
            '--normalisation',
            type=str,
            default = '0',
            help = 'Type of normalisation'
        )

    FLAGS, unparsed = parser.parse_known_args()

    #This needs pointing to the location that DISSECT is installed to.
    build_core_str = 'python C:/Users/Rachel/Documents/dissect-master/dissect-master/src/pipelines/build_core_space.py '
    build_core_str += ' -i ' + FLAGS.filebase + '/sparsematrix'
    build_core_str += ' --input_format sm --w ppmi -r svd_' + str(FLAGS.nSVD)  + ' -o ' + FLAGS.filebase
    if FLAGS.normalisation != '0':
        build_core_str += ' -n all'

    print(build_core_str)
    os.system(build_core_str)
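
    # build_core_space.py is expected to name its output
    # CORE_SS.<matrix name>.<weighting>.<reduction>[.all].pkl;
    # reconstruct that filename here so the saved space can be loaded back.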

    saved_space_filename = FLAGS.filebase + "/CORE_SS.sparsematrix.ppmi.svd_" \
            + str(FLAGS.nSVD)

    if FLAGS.normalisation != '0':
        saved_space_filename += ".all"

    saved_space_filename += ".pkl"

    this_space = io_utils.load(saved_space_filename)

    plot_space(this_space, FLAGS.nplot, FLAGS.filebase + ".png")
Пример #49
0
#ex02.py
#-------
from composes.semantic_space.space import Space
from composes.utils import io_utils

#create a space from co-occurrence counts in sparse format
my_space = Space.build(data="./data/in/ex01.sm",
                       rows="./data/in/ex01.rows",
                       cols="./data/in/ex01.cols",
                       format="sm")

#print the co-occurrence matrix of the space
print(my_space.cooccurrence_matrix)

#save the Space object in pickle format
io_utils.save(my_space, "./data/out/ex01.pkl")

#load the saved object
my_space2 = io_utils.load("./data/out/ex01.pkl")

#print the co-occurrence matrix of the loaded space
print(my_space2.cooccurrence_matrix)
from composes.utils import io_utils
from composes.semantic_space.space import Space
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.dim_reduction.svd import Svd
from composes.composition.lexical_function import LexicalFunction
from composes.similarity.cos import CosSimilarity
import pickle
from composes.utils import scoring_utils

import os
path = os.getcwd()

print("Building space...")
# create a space from co-occurrence counts in sparse format

try:
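    # reuse a previously saved space if one exists; otherwise build and cache it below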
    my_space = io_utils.load("my_space.pkl")
except FileNotFoundError:

    my_space = Space.build(data="./data/in/spacew.sm",
                           rows="./data/in/spacew.rows",
                           cols="./data/in/spacew.cols",
                           format="sm")

    print("Applying PPMI...")
    my_space = my_space.apply(PpmiWeighting())

    print("Applying SVD...")
    my_space = my_space.apply(Svd(350))
    io_utils.save(my_space, "my_space.pkl")

print("Loading pairs...")
Пример #51
0
#similarity.py
#USAGE: python similarity [space file] [word1] [word2]
#EXAMPLE: python similarity ~/UkWac/dissect/ANs/ANs.pkl car_n dog_n
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity
import sys

#load a space
my_space = io_utils.load(sys.argv[1])

#print my_space.cooccurrence_matrix
#print my_space.id2row

#compute similarity between two words in the space
print "The similarity of", sys.argv[2], "and", sys.argv[
    3], "is:", my_space.get_sim(sys.argv[2], sys.argv[3], CosSimilarity())
Пример #52
0
#ex05.py
#-------
from composes.utils import io_utils
from composes.semantic_space.peripheral_space import PeripheralSpace
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting


#load a space and apply ppmi on it
my_space = io_utils.load("./data/out/ex01.pkl")
my_space = my_space.apply(PpmiWeighting())

print(my_space.cooccurrence_matrix)
print(my_space.id2row)

#create a peripheral space 
my_per_space = PeripheralSpace.build(my_space,
                                     data="./data/in/ex05.sm",
                                     cols="./data/in/ex05.cols",
                                     format="sm")

print(my_per_space.cooccurrence_matrix)
print(my_per_space.id2row)

#save the space
io_utils.save(my_per_space, "./data/out/PER_SS.ex05.pkl")

Пример #53
0
    def test_simple_define(self):

        #trained = io_utils.load(self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
        #new_space = trained.function_space

        #compose with lexical function
        ac.main([
            "apply_composition.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "an_train_data.txt", "-o", self.dir_, "--load_model",
            self.dir_ +
            "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl", "-a",
            self.dir_ + "CORE_SS.N_mat.pkl", "--output_format", "dm"
        ])

        sp2 = Space.build(data=self.dir_ +
                          "COMPOSED_SS.LexicalFunction.an_train_data.txt.dm",
                          format="dm")

        #compose with weighted addition
        ac.main([
            "apply_composition.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "an_train_data.txt", "-o", self.dir_, "-m",
            "weighted_add", "--alpha", "0.5", "--beta", "0.5", "-a",
            self.dir_ + "CORE_SS.A_mat.pkl" + "," + self.dir_ +
            "CORE_SS.N_mat.pkl", "--output_format", "dm"
        ])

        sp1 = Space.build(data=self.dir_ +
                          "COMPOSED_SS.WeightedAdditive.an_train_data.txt.dm",
                          format="dm")
        sp3 = io_utils.load(
            self.dir_ + "COMPOSED_SS.WeightedAdditive.an_train_data.txt.pkl")

        np.testing.assert_array_equal(sp1.cooccurrence_matrix.mat,
                                      np.mat([[3, 4], [4, 5]]))
        self._test_equal_spaces_structs(sp1, sp2)
        sp1.to_sparse()
        sp3.to_sparse()
        self._test_equal_spaces_sparse(sp1, sp3)

        #the two output formats have to contain identical data
        sp1.to_dense()
        sp3.to_dense()
        self._test_equal_spaces_dense(sp1, sp3)

        #compose with dilation
        ac.main([
            "apply_composition.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "an_train_data.txt", "-o", self.dir_, "-m", "dilation",
            "--lambda", "1", "-a", self.dir_ + "CORE_SS.A_mat.pkl" + "," +
            self.dir_ + "CORE_SS.N_mat.pkl", "--output_format", "dm"
        ])

        sp1 = Space.build(data=self.dir_ +
                          "COMPOSED_SS.Dilation.an_train_data.txt.dm",
                          format="dm")
        n_space = io_utils.load(self.dir_ + "CORE_SS.N_mat.pkl")
        sp1.to_dense()
        n_space.to_dense()
        np.testing.assert_array_almost_equal(
            sp1.cooccurrence_matrix.mat, n_space.cooccurrence_matrix.mat * 25)
        self._test_equal_spaces_structs(sp1, sp2)

        #compose with dilation, change the order of the arguments
        ac.main([
            "apply_composition.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "na_train_data.txt", "-o", self.dir_, "-m", "dilation",
            "--lambda", "1", "-a", self.dir_ + "CORE_SS.N_mat.pkl" + "," +
            self.dir_ + "CORE_SS.A_mat.pkl", "--output_format", "dm"
        ])

        sp1 = Space.build(data=self.dir_ +
                          "COMPOSED_SS.Dilation.na_train_data.txt.dm",
                          format="dm")

        sp1.to_dense()
        np.testing.assert_array_almost_equal(sp1.cooccurrence_matrix.mat,
                                             np.mat([[75, 100], [183, 244]]),
                                             5)
        self._test_equal_spaces_structs(sp1, sp2)

        #compose with multiplicative
        ac.main([
            "apply_composition.py", "-l", self.dir_ + "log1.txt", "-i",
            self.dir_ + "aan_train_data.txt", "-o", self.dir_, "-m", "mult",
            "-a", self.dir_ + "CORE_SS.A_mat.pkl" + "," + self.dir_ +
            "COMPOSED_SS.Dilation.an_train_data.txt.pkl", "--output_format",
            "dm"
        ])

        sp1 = Space.build(data=self.dir_ +
                          "COMPOSED_SS.Multiplicative.aan_train_data.txt.dm",
                          format="dm")
Пример #54
0
def wup_sim(ss1, ss2):  # assumed signature, mirroring lch_sim/res_sim below
    return ss1.wup_similarity(ss2, brown_ic)
def lch_sim(ss1,ss2):
    return ss1.lch_similarity(ss2, brown_ic)
def mean(seq):
    print(sum(seq) / len(seq))
    return sum(seq) / len(seq)
def is_better(ingredients, result, other):
    return mean([sim(x, result) for x in ingredients]) > mean([sim(x, other) for x in ingredients])
def vs_sim(word1,word2,space):
    return space.get_sim(word1,word2,CosSimilarity())
def limit(iterator, num):
    for _ in range(num):
        yield next(iterator)

gastrovec = io_utils.load("../vector_processing/gastrovec.ppmi.svd20.pkl")

wn_scores, vs_scores = [], []
jcn_scores, res_scores, lin_scores, lch_scores, wup_scores = [], [], [], [], []

ingredients = []

with open("../vector_processing/ingredients_in_wordnet") as f:
    for line in limit(f,int(sys.argv[1])):
        l = line.strip()
        ingredients.append(l)

for (a,b) in combinations(ingredients,2):
    a_,b_=getss(a), getss(b)
    wn_scores.append(wn_sim(a_,b_))
    res_scores.append(res_sim(a_,b_))
from __future__ import print_function
import sys
from random import randint
from itertools import count
from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive
from composes.semantic_space.space import Space


stacked_space = io_utils.load("gastrovec.ppmi.svd20.pkl")

WA = WeightedAdditive(alpha = 1, beta = 1)

recipes = {}
max_size = 0
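# each line of composition_counts.txt is read as a recipe identifier followed
# by its ingredient tokens; max_size tracks the largest ingredient count seen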
with open("../corpus_collection/composition_counts.txt") as f:
    for line in f:
        words = line.split()
        recipes[words[0]] = words[1:]
        if len(words)-1 > max_size:
            max_size = len(words)-1

WA = WeightedAdditive(alpha = 1, beta = 1)
last_space = None
number = count()
for size in range(max_size, 1, -1):
    relevant = (rec for rec in recipes if len(recipes[rec]) == size)
    print(size)
    composition = []
    for recipe in relevant:
        old = recipes[recipe]
Пример #57
0
#ex12.py
#-------
from composes.utils import io_utils

#load a previously saved weighted additive model
my_comp = io_utils.load("./data/out/model01.pkl")

#print its parameters
print "alpha:", my_comp.alpha
print "beta:", my_comp.beta

#load two spaces
my_space = io_utils.load("./data/out/ex10.pkl")
my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl")

#apply the composition model to them
composed_space = my_comp.compose(
    [("good", "history_book", "good_history_book")], (my_space, my_per_space))

print(composed_space.id2row)
print(composed_space.cooccurrence_matrix)
Пример #58
0
#ex10.py
#-------
from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive

#load a space
my_space = io_utils.load("./data/out/ex10.pkl")

print(my_space.id2row)
print(my_space.cooccurrence_matrix)

# instantiate a weighted additive model
my_comp = WeightedAdditive(alpha = 1, beta = 1)

# use the model to compose words in my_space
composed_space = my_comp.compose([("good", "book", "good_book"),
                                  ("good", "car", "good_car")],
                                 my_space)

print(composed_space.id2row)
print(composed_space.cooccurrence_matrix)

#save the composed space
io_utils.save(composed_space, "data/out/PHRASE_SS.ex10.pkl")


Пример #59
0
# (Paris is to France what ___ is to Germany)
##########################################################################
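#USAGE (assumed): python <this script> [space file] [base word] [word to subtract] [word to add]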

from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive
from composes.similarity.cos import CosSimilarity
import sys



pkl=sys.argv[1]
base=sys.argv[2]
minus=sys.argv[3]
plus=sys.argv[4]

space = io_utils.load(pkl)

# instantiate an additive and subtractive model
add = WeightedAdditive(alpha = 1, beta = 1)
sub = WeightedAdditive(alpha = 1, beta = -1)


#print space.get_neighbours(base, 10, CosSimilarity())

print "Subtracting",minus,"from",base
composed_space = sub.compose([(base, minus, "step1")], space)
#print composed_space.get_neighbours("step1", 10, CosSimilarity(),space)

print "Adding",plus,"..."
composed_space2 = add.compose([("step1", plus, "step2")], (composed_space,space))
print(composed_space2.get_neighbours("step2", 10, CosSimilarity(), space))