Пример #1
0
def main():
    """
    Build a semantic space from sparse co-occurrence files and save it.

    The space is read from ``<dsm_prefix>.sm``, ``<dsm_prefix>.rows`` and
    ``<dsm_prefix>.cols``. With ``-p`` it is PPMI-weighted, with ``-l`` it is
    PLMI-weighted, otherwise the raw counts are kept. The result is handed to
    ``save_pkl_files`` under ``<dsm_prefix>`` plus ``_ppmi``, ``_plmi`` or
    ``_freq`` respectively.
    """

    # Command-line interface definition (parsed by docopt).
    usage = '''Compute the FREQ/PPMI/PLMI matrix from a co-occurrence matrix, as default pickle the raw matrix

    Usage:
        create_dsm.py <dsm_prefix> [-p | -l]

        <dsm_prefix> = the prefix for the input files (.sm for the matrix, .rows and .cols) and output files (.ppmi)
    
    Options:
    <none>      weight the matrice entries via FREQUENCY
    -p, --ppmi  weight the matrice entries via PPMI
    -l, --plmi  weight the matrice entries via PLMI
    
    '''
    options = docopt(usage)

    prefix = options['<dsm_prefix>']
    use_ppmi = options['--ppmi']
    use_plmi = options['--plmi']

    # Read the raw co-occurrence counts (sparse "sm" format).
    space = Space.build(
        data=prefix + '.sm',
        rows=prefix + '.rows',
        cols=prefix + '.cols',
        format='sm',
    )

    # Pick the weighting scheme and the matching output suffix in one place;
    # PPMI takes precedence over PLMI, and no flag means raw frequencies.
    if use_ppmi:
        weighting, suffix = PpmiWeighting(), "_ppmi"
    elif use_plmi:
        weighting, suffix = PlmiWeighting(), "_plmi"
    else:
        weighting, suffix = None, "_freq"

    if weighting is not None:
        space = space.apply(weighting)

    # Persist the (possibly weighted) space.
    save_pkl_files(prefix + suffix, space)
Пример #2
0
def apply_weighting(space, w):
    """Return ``space`` transformed by the weighting scheme named ``w``.

    ``w`` may be ``None`` or ``"none"`` (the space is returned untouched) or
    one of ``"plog"``, ``"ppmi"``, ``"epmi"``, ``"plmi"``. Any other name
    triggers a warning and the original space is returned unchanged.
    """
    schemes = dict(
        plog=PlogWeighting(),
        ppmi=PpmiWeighting(),
        epmi=EpmiWeighting(),
        plmi=PlmiWeighting(),
    )

    # Nothing requested: hand the space back as is.
    if w in (None, "none"):
        return space

    print("Applying weighting: %s" % w)

    try:
        chosen = schemes[w]
    except KeyError:
        # Unknown scheme name: warn and fall back to the unweighted space.
        warn("Weigthing scheme: %s not defined" % w)
        return space

    return space.apply(chosen)
Пример #3
0
    def test_add_rows(self):
        """Check ``PeripheralSpace.add_rows`` on a raw and a PPMI-weighted core.

        For each scenario the rows are appended to a peripheral space, the
        resulting matrix and row/column mappings are compared with the
        expected values, and further ``add_rows`` calls with the listed row
        labels are expected to raise ``ValueError``.
        """
        appended = np.array([[4, 2, 6]])
        scenarios = [
            (
                self.space1,
                self.m2,
                self.row2,
                appended,
                ["c"],
                np.array([[4, 2, 6], [4, 2, 6]]),
                np.array([[0.69314718, 0, 0], [0.69314718, 0, 0]]),
                {"b": 0, "c": 1},
                ["b", "c"],
            ),
        ]

        for scenario in scenarios:
            (core, base_mat, base_rows, new_mat, new_rows,
             want_raw, want_weighted, want_row2id, want_id2row) = scenario

            # Peripheral space on top of the unweighted core.
            raw_per = PeripheralSpace(core, DenseMatrix(base_mat), base_rows)
            raw_per.add_rows(DenseMatrix(new_mat), new_rows)
            np.testing.assert_array_almost_equal(
                raw_per.cooccurrence_matrix.mat, want_raw, 7)

            self.assertDictEqual(raw_per.row2id, want_row2id)
            self.assertListEqual(raw_per.id2row, want_id2row)

            # Columns must be shared with the core space.
            self.assertDictEqual(raw_per.column2id, core.column2id)
            self.assertListEqual(raw_per.id2column, core.id2column)

            # Same exercise on top of a PPMI-weighted core.
            weighted_core = core.apply(PpmiWeighting())
            weighted_per = PeripheralSpace(weighted_core,
                                           DenseMatrix(base_mat), base_rows)
            weighted_per.add_rows(DenseMatrix(new_mat), new_rows)
            np.testing.assert_array_almost_equal(
                weighted_per.cooccurrence_matrix.mat, want_weighted, 7)

            # Each of these row-label lists is expected to be rejected.
            for rejected_rows in (base_rows, new_rows, ["d", "e"]):
                self.assertRaises(ValueError, weighted_per.add_rows,
                                  DenseMatrix(new_mat), rejected_rows)
Пример #4
0
def get_space(matrice_folder, matrice_name, is_pmi, is_lmi, is_save_weighted):
    """
    Loads semantic space from matrix file.

    The unweighted space is always loaded. When ``is_pmi`` and/or ``is_lmi``
    is set, a previously saved weighted space is loaded if possible and built
    from the unweighted space otherwise. If both flags are set, the PLMI space
    is the one returned as ``mi_space``.

    :param matrice_folder: string, path of matrix folder
    :param matrice_name: string, name of matrix file
    :param is_pmi: boolean, whether to weight matrix with PPMI values
    :param is_lmi: boolean, whether to weight matrix with PLMI values
    :param is_save_weighted: boolean, whether to save a freshly built weighted matrix
    :return cooc_space: unweighted semantic space
    :return mi_space: weighted semantic space (an empty list if no weighting was requested)
    :return vocab_map: dictionary that maps row strings to integer ids
    :return vocab_size: int, number of rows
    :return column_map: dictionary that maps column strings to integer ids
    :return id2column_map: list of strings, the column elements
    :raises IOError: if the unweighted matrix cannot be loaded
    """
    space_path = matrice_folder + matrice_name

    print("Loading frequency matrice...")
    try:
        cooc_space = load_pkl_files(space_path)
        cooc_space.__class__ = Space_extension
    except IOError:
        print("Format not suitable or file does not exist: " + space_path)
        # Nothing below can work without the base space; propagate the real
        # error instead of failing later on an unbound ``cooc_space``.
        raise

    mi_space = []

    if is_pmi:
        mi_space = _load_or_build_weighted(
            cooc_space, space_path, "_ppmi", "Ppmi", PpmiWeighting,
            is_save_weighted)

    if is_lmi:
        mi_space = _load_or_build_weighted(
            cooc_space, space_path, "_plmi", "Plmi", PlmiWeighting,
            is_save_weighted)

    vocab_map = cooc_space.get_vocab()
    vocab_size = len(vocab_map)
    column_map = cooc_space.get_columns()
    id2column_map = cooc_space.get_id2column_map()

    print("The vocabulary has size: " + str(vocab_size))

    return cooc_space, mi_space, vocab_map, vocab_size, column_map, id2column_map


def _load_or_build_weighted(cooc_space, space_path, suffix, label,
                            weighting_cls, is_save_weighted):
    """
    Load the weighted space stored at ``space_path + suffix`` or build it.

    :param cooc_space: unweighted space used when the weighted one must be built
    :param space_path: string, path prefix of the unweighted space
    :param suffix: string appended to ``space_path`` for the weighted files
    :param label: string used in the progress messages ("Ppmi" / "Plmi")
    :param weighting_cls: weighting class, instantiated only when building
    :param is_save_weighted: boolean, whether to save a freshly built space
    :return: the weighted space, re-classed as ``Space_extension``
    """
    weighted_path = space_path + suffix
    try:
        weighted = load_pkl_files(weighted_path)
    except Exception:
        # Any ordinary load failure means "no usable cache"; unlike a bare
        # ``except`` this lets KeyboardInterrupt/SystemExit through.
        print("No %s weighted matrice found." % label)
        print("Building %s weighted matrice..." % label)
        weighted = cooc_space.apply(weighting_cls())
        if is_save_weighted:
            print("Saving %s weighted matrice..." % label)
            save_pkl_files(weighted_path, weighted, False)
    else:
        print("Found %s weighted matrice." % label)

    weighted.__class__ = Space_extension
    return weighted
Пример #5
0
#ex03.py
#-------
from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting

# Load a previously saved space from a pickle file (it is not built from
# sparse co-occurrence counts here).
my_space = io_utils.load("./data/out/ex01.pkl")

# Print the co-occurrence matrix of the loaded space.
print(my_space.cooccurrence_matrix)

# Apply PPMI weighting; apply() returns the transformed space, which
# replaces the original binding.
my_space = my_space.apply(PpmiWeighting())

# Print the co-occurrence matrix of the transformed space.
print(my_space.cooccurrence_matrix)

Пример #6
0
 def apply(self, matrix_, column_marginal=None):
     """Weight ``matrix_`` by multiplying it with its PPMI-weighted version.

     ``column_marginal`` is forwarded unchanged to ``PpmiWeighting.apply``.
     Returns whatever ``matrix_.multiply`` returns (presumably an
     element-wise product; confirm against the matrix class).
     """
     ppmi_weighted = PpmiWeighting().apply(matrix_, column_marginal)
     return matrix_.multiply(ppmi_weighted)
Пример #7
0
from composes.composition.multiplicative import Multiplicative
from composes.composition.dilation import Dilation
from composes.utils.regression_learner import RidgeRegressionLearner

import composes.utils.io_utils as io_utils
import composes.utils.scoring_utils as scoring_utils

# Load a core space.
# NOTE(review): the imports for PpmiWeighting, TopFeatureSelection, Svd and
# PeripheralSpace are not visible in this excerpt; confirm they exist above.
# Progress messages use the single-argument print(...) form, which prints the
# same text under Python 2 and is also valid Python 3.
print("Loading the data...")
data_path = "/mnt/cimec-storage-sata/users/thenghia.pham/shared/tutorial/"

space_file = data_path + "CORE_SS.verbnoun.core.pkl"
space = io_utils.load(space_file)

# Each apply() returns a new space that replaces the previous binding.
print("Applying PPMI...")
space = space.apply(PpmiWeighting())

print("Applying feature selection...")
space = space.apply(TopFeatureSelection(2000))

print("Applying SVD...")
space = space.apply(Svd(100))

# Build the peripheral space against the transformed core space.
print("Creating peripheral space..")
per_space = PeripheralSpace.build(space,
                                  data=data_path + "per.raw.SV.sm",
                                  cols=data_path + "per.raw.SV.cols",
                                  format="sm")

# Path of the training data file.
train_data_file = data_path + "ML08_SV_train.txt"
Пример #8
0
import sys
import os

# Add the local dissect-master/src checkout to the import path, once.
# The literal contains no environment variables, so expandvars() returns it
# unchanged.
folder = os.path.expandvars('/home/luka/Downloads/dissect-master/src')
if folder not in sys.path:
    sys.path.append(folder)

# Imported after the sys.path change above; presumably `composes` is only
# importable from that checkout, so do not hoist this import.
from composes.semantic_space.space import Space

# Build a space from the sparse-matrix ("sm") data, row and column files.
lassy_space = Space.build(data="/home/luka/ThLi/cooccurrence/spm1.sm",
                          rows="/home/luka/ThLi/cooccurrence/rows1.rows",
                          cols="/home/luka/ThLi/cooccurrence/cols1.cols",
                          format="sm")

#%%

# NOTE(review): io_utils is imported but not used in the visible part of
# this script.
from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting

# Replace the space with its PPMI-weighted version.
lassy_space = lassy_space.apply(PpmiWeighting())
Пример #9
0
    els_for_comp = []
    for item in items:
        item = item.split('_')
        element = (item[0], item[1], item[0] + '_' + item[1])
        els_for_comp.append(element)
    return els_for_comp


# Build the two spaces to compare; create_space and the *File names are
# defined outside this excerpt.
typ_space = create_space(TypDmFile, TypRowsFile)
distr_space = create_space(DistrDmFile, DistrRowsFile)

# Alternative (disabled): load a space from a pickle file.
#my_space = io_utils.load("./sharp/lexfunc/lexfunc_Ridge_pract.pkl")

# Distributional vectors processing: PPMI weighting followed by SVD with
# the argument 300.
distr_space = distr_space.apply(PpmiWeighting())
distr_space = distr_space.apply(Svd(300))
#io_utils.save(distr_space, "./spaces/smooth_phrases_ppmi.pkl")

# Turn the item list into (first, second, "first_second") triples.
items = items_from_file(itemsFile)
els_for_comp = elements_for_composition(items)

# Compose the pairs with a weighted additive model (alpha=1, beta=1);
# distr_space is rebound to the space returned by compose().
my_comp = WeightedAdditive(alpha=1, beta=1)
distr_space = my_comp.compose(els_for_comp, distr_space)

# NOTE(review): this rebinds the name `pairs` from the helper function to its
# result, so pairs() cannot be called again after this line.
pairs = pairs(items)

# Cosine similarities for the same pairs in the composed space (predicted)
# and in typ_space (gold).
predicted = distr_space.get_sims(pairs, CosSimilarity())
gold = typ_space.get_sims(pairs, CosSimilarity())
#compute correlations
Пример #10
0
    def test_ppmi(self):
        """Run ``single_case_test`` with PPMI weighting on two fixtures.

        ``self.b`` is paired with an all-zero 1x3 matrix as expected value and
        ``self.c`` is paired with itself.
        """
        weighting = PpmiWeighting()
        inputs = (self.b, self.c)
        wanted = (np.mat([[0, 0, 0]]), self.c)

        for matrix_, expected in zip(inputs, wanted):
            self.single_case_test(matrix_, expected, weighting)
Пример #11
0
    def test_ppmi_raises(self):
        """Run ``single_case_raises_test`` with PPMI weighting.

        The fixture ``self.d`` is paired with ``ValueError`` as the expected
        error type.
        """
        weighting = PpmiWeighting()
        failing_cases = ((self.d, ValueError),)

        for matrix_, error_type in failing_cases:
            self.single_case_raises_test(matrix_, error_type, weighting)
Пример #12
0
import sys
import os

# Add the local dissect-master/src checkout to the import path, once.
# The literal contains no environment variables, so expandvars() returns it
# unchanged.
folder = os.path.expandvars('/home/luka/Downloads/dissect-master/src')
if folder not in sys.path:
    sys.path.append(folder)
    
# Imported after the sys.path change above; presumably `composes` is only
# importable from that checkout, so do not hoist this import.
from composes.semantic_space.space import Space

# Build a space from the sparse-matrix ("sm") data, row and column files.
holspace = Space.build(data = "/home/luka/ThLi/cooccurrence/spm1.sm",
                       rows = "/home/luka/ThLi/cooccurrence/rows1.rows",
                       cols = "/home/luka/ThLi/cooccurrence/cols1.cols",
                       format = "sm")

#%%

# Replace the space with its PPMI-weighted version.
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
holspace = holspace.apply(PpmiWeighting())

#%%

# Save the weighted space via io_utils.
from composes.utils import io_utils
io_utils.save(holspace, "/home/luka/ThLi/cooccurrence/weighted")

#%%

# Additionally export the weighted space in "sm" format.
holspace.export("/home/luka/ThLi/cooccurrence/weighted_sm", format = "sm")