def main():
    """
    Command-line entry point: pickle a raw or re-weighted co-occurrence space.

    Reads ``<dsm_prefix>.sm``, ``<dsm_prefix>.rows`` and ``<dsm_prefix>.cols``,
    optionally applies PPMI (``-p``) or PLMI (``-l``) weighting, and saves the
    resulting space via ``save_pkl_files`` under ``<dsm_prefix>`` plus one of
    the postfixes ``_freq``, ``_ppmi`` or ``_plmi``.
    """

    # Get the arguments
    args = docopt(
        '''Compute the FREQ/PPMI/PLMI matrix from a co-occurrence matrix, as default pickle the raw matrix

    Usage:
        create_dsm.py <dsm_prefix> [-p | -l]

        <dsm_prefix> = the prefix for the input files (.sm for the matrix, .rows and .cols) and output files (.ppmi)

    Options:
        <none>      weight the matrice entries via FREQUENCY
        -p, --ppmi  weight the matrice entries via PPMI
        -l, --plmi  weight the matrice entries via PLMI

    ''')

    dsm_prefix = args['<dsm_prefix>']
    use_ppmi = args['--ppmi']
    use_plmi = args['--plmi']

    # Create a space from co-occurrence counts in sparse format
    dsm = Space.build(data=dsm_prefix + '.sm',
                      rows=dsm_prefix + '.rows',
                      cols=dsm_prefix + '.cols',
                      format='sm')

    # Pick the weighting class and the matching output postfix in one place;
    # PPMI takes precedence over PLMI, and no flag means raw frequencies.
    if use_ppmi:
        weighting_cls, postfix = PpmiWeighting, "_ppmi"
    elif use_plmi:
        weighting_cls, postfix = PlmiWeighting, "_plmi"
    else:
        weighting_cls, postfix = None, "_freq"

    if weighting_cls is not None:
        dsm = dsm.apply(weighting_cls())

    # Save the Space object in pickle format
    save_pkl_files(dsm_prefix + postfix, dsm)
def apply_weighting(space, w):
    """
    Apply the weighting scheme named ``w`` to ``space``.

    :param space: object exposing ``apply(weighting)``
    :param w: one of "plog", "ppmi", "epmi", "plmi"; ``None`` or "none"
        means no weighting
    :return: the result of ``space.apply(...)`` for a known scheme, otherwise
        ``space`` itself (unknown names only trigger a warning)
    """
    # Nothing requested: hand the space back without touching any weighting.
    if w in (None, "none"):
        return space

    print("Applying weighting: %s" % w)

    # Map names to classes so that only the selected scheme is instantiated.
    weighting_classes = {
        "plog": PlogWeighting,
        "ppmi": PpmiWeighting,
        "epmi": EpmiWeighting,
        "plmi": PlmiWeighting,
    }
    if w not in weighting_classes:
        warn("Weigthing scheme: %s not defined" % w)
        return space

    return space.apply(weighting_classes[w]())
def test_add_rows(self):
    """
    Check ``PeripheralSpace.add_rows`` on an unweighted and a PPMI core space.

    Each case builds a peripheral space, appends rows, and compares the
    resulting matrix and row/column indexes with the expected values. It then
    verifies that three further ``add_rows`` calls raise ``ValueError``.
    """
    test_cases = [(self.space1,
                   self.m2,
                   self.row2,
                   np.array([[4, 2, 6]]),
                   ["c"],
                   np.array([[4, 2, 6], [4, 2, 6]]),
                   np.array([[0.69314718, 0, 0], [0.69314718, 0, 0]]),
                   {"b": 0, "c": 1},
                   ["b", "c"])]

    for case in test_cases:
        core, initial_mat, initial_rows, extra_mat, extra_rows = case[:5]
        raw_expected, ppmi_expected, row2id_expected, id2row_expected = case[5:]

        # Peripheral space on top of the unweighted core space.
        raw_per = PeripheralSpace(core, DenseMatrix(initial_mat),
                                  initial_rows)
        raw_per.add_rows(DenseMatrix(extra_mat), extra_rows)

        np.testing.assert_array_almost_equal(
            raw_per.cooccurrence_matrix.mat, raw_expected, 7)
        self.assertDictEqual(raw_per.row2id, row2id_expected)
        self.assertListEqual(raw_per.id2row, id2row_expected)
        self.assertDictEqual(raw_per.column2id, core.column2id)
        self.assertListEqual(raw_per.id2column, core.id2column)

        # Same scenario on top of the PPMI-weighted core space.
        weighted_core = core.apply(PpmiWeighting())
        weighted_per = PeripheralSpace(weighted_core,
                                       DenseMatrix(initial_mat),
                                       initial_rows)
        weighted_per.add_rows(DenseMatrix(extra_mat), extra_rows)

        np.testing.assert_array_almost_equal(
            weighted_per.cooccurrence_matrix.mat, ppmi_expected, 7)

        # Presumably rejected because the first two row lists are already in
        # the space and the third has two labels for one matrix row -- confirm
        # against add_rows.
        for rejected_rows in (initial_rows, extra_rows, ["d", "e"]):
            self.assertRaises(ValueError, weighted_per.add_rows,
                              DenseMatrix(extra_mat), rejected_rows)
def get_space(matrice_folder, matrice_name, is_pmi, is_lmi, is_save_weighted):
    """
    Loads semantic space from matrix file.

    The weighted variants are looked up next to the frequency matrix under
    the postfixes "_ppmi" / "_plmi"; if loading one fails it is rebuilt from
    the frequency space and, optionally, saved. When both ``is_pmi`` and
    ``is_lmi`` are set, the PLMI space is the one returned.

    :param matrice_folder: string, path of matrice folder
    :param matrice_name: string, name of matrice file
    :param is_pmi: boolean, whether to weight matrice with PPMI values
    :param is_lmi: boolean, whether to weight matrice with PLMI values
    :param is_save_weighted: boolean, whether to save weighted matrice
    :return cooc_space: unweighted semantic space
    :return mi_space: weighted semantic space ([] if no weighting requested)
    :return vocab_map: dictionary that maps row strings to integer ids
    :return vocab_size: int, number of rows
    :return column_map: dictionary that maps column strings to integer ids
    :return id2column_map: list of strings, the column elements
    :raises IOError: if the frequency matrix cannot be loaded
    """
    matrice_path = matrice_folder + matrice_name

    print("Loading frequency matrice...")
    try:
        cooc_space = load_pkl_files(matrice_path)
        cooc_space.__class__ = Space_extension
    except IOError:
        print("Format not suitable or file does not exist: " + matrice_path)
        # Nothing below can work without the frequency space; re-raise rather
        # than fail later with an unrelated UnboundLocalError.
        raise

    mi_space = []

    if is_pmi:
        mi_space = _load_or_build_weighted(cooc_space, matrice_path + "_ppmi",
                                           "Ppmi", PpmiWeighting,
                                           is_save_weighted)

    if is_lmi:
        mi_space = _load_or_build_weighted(cooc_space, matrice_path + "_plmi",
                                           "Plmi", PlmiWeighting,
                                           is_save_weighted)

    vocab_map = cooc_space.get_vocab()
    vocab_size = len(vocab_map)
    column_map = cooc_space.get_columns()
    id2column_map = cooc_space.get_id2column_map()

    print("The vocabulary has size: " + str(vocab_size))

    return cooc_space, mi_space, vocab_map, vocab_size, column_map, id2column_map


def _load_or_build_weighted(cooc_space, path, label, weighting_cls,
                            is_save_weighted):
    """
    Load the weighted space stored at ``path`` or build it on failure.

    :param cooc_space: unweighted space the weighting is applied to on a miss
    :param path: string, path handed to load_pkl_files / save_pkl_files
    :param label: string, scheme name used in the progress messages
    :param weighting_cls: weighting class, instantiated only when building
    :param is_save_weighted: boolean, whether to save a freshly built space
    :return mi_space: weighted space with its class set to Space_extension
    """
    try:
        mi_space = load_pkl_files(path)
    except Exception:
        # Any load failure counts as "not there yet"; Exception (not a bare
        # except) so that Ctrl-C and SystemExit still propagate.
        print("No %s weighted matrice found." % label)
        print("Building %s weighted matrice..." % label)
        mi_space = cooc_space.apply(weighting_cls())
        if is_save_weighted:
            print("Saving %s weighted matrice..." % label)
            save_pkl_files(path, mi_space, False)
    else:
        print("Found %s weighted matrice." % label)

    # Both loaded and freshly built spaces are re-classed to the extension.
    mi_space.__class__ = Space_extension
    return mi_space
#ex03.py #------- from composes.utils import io_utils from composes.transformation.scaling.ppmi_weighting import PpmiWeighting #create a space from co-occurrence counts in sparse format my_space = io_utils.load("./data/out/ex01.pkl") #print the co-occurrence matrix of the space print(my_space.cooccurrence_matrix) #apply ppmi weighting my_space = my_space.apply(PpmiWeighting()) #print the co-occurrence matrix of the transformed space print(my_space.cooccurrence_matrix)
def apply(self, matrix_, column_marginal=None):
    """
    Multiply ``matrix_`` with its PPMI-weighted version.

    :param matrix_: matrix object exposing ``multiply``
    :param column_marginal: passed through unchanged to
        ``PpmiWeighting.apply``
    :return: the result of ``matrix_.multiply`` on the PPMI-weighted matrix
    """
    # Bind the multiplier first so evaluation order matches a single
    # chained expression.
    scale_by = matrix_.multiply
    ppmi_weights = PpmiWeighting().apply(matrix_, column_marginal)
    return scale_by(ppmi_weights)
from composes.composition.multiplicative import Multiplicative
from composes.composition.dilation import Dilation
from composes.utils.regression_learner import RidgeRegressionLearner

import composes.utils.io_utils as io_utils
import composes.utils.scoring_utils as scoring_utils

# Load the pickled core space.
print("Loading the data...")
data_path = "/mnt/cimec-storage-sata/users/thenghia.pham/shared/tutorial/"

space_file = data_path + "CORE_SS.verbnoun.core.pkl"
space = io_utils.load(space_file)

# Transformation pipeline: each step replaces `space` with the result of the
# previous one (PPMI weighting, then feature selection, then SVD).
print("Applying PPMI...")
space = space.apply(PpmiWeighting())

print("Applying feature selection...")
space = space.apply(TopFeatureSelection(2000))

print("Applying SVD...")
space = space.apply(Svd(100))

# Peripheral space built on top of the transformed core space.
print("Creating peripheral space..")
per_space = PeripheralSpace.build(
    space,
    data=data_path + "per.raw.SV.sm",
    cols=data_path + "per.raw.SV.cols",
    format="sm")

# Path of the training data file.
train_data_file = data_path + "ML08_SV_train.txt"
import sys
import os

# Put the toolkit sources on sys.path (once) so the `composes` imports below
# resolve. expandvars leaves this path unchanged: it contains no $VARIABLES.
folder = os.path.expandvars('/home/luka/Downloads/dissect-master/src')
if folder not in sys.path:
    sys.path.append(folder)

from composes.semantic_space.space import Space

# Build the space from the matrix file plus its row and column files.
lassy_space = Space.build(data="/home/luka/ThLi/cooccurrence/spm1.sm",
                          rows="/home/luka/ThLi/cooccurrence/rows1.rows",
                          cols="/home/luka/ThLi/cooccurrence/cols1.cols",
                          format="sm")

#%%
# NOTE(review): io_utils is not used in the visible part of this script;
# presumably a later cell needs it -- confirm.
from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting

# Replace the space with its PPMI-weighted version.
lassy_space = lassy_space.apply(PpmiWeighting())
# NOTE(review): the statements up to `return` are the tail of a function whose
# `def` line lies outside this chunk; they are reproduced unchanged.
    els_for_comp = []
    for item in items:
        # "a_b..." -> ("a", "b", "a_b"); fields beyond the second are ignored
        item = item.split('_')
        element = (item[0], item[1], item[0] + '_' + item[1])
        els_for_comp.append(element)
    return els_for_comp


typ_space = create_space(TypDmFile, TypRowsFile)
distr_space = create_space(DistrDmFile, DistrRowsFile)

#load a space from a pickle file
#my_space = io_utils.load("./sharp/lexfunc/lexfunc_Ridge_pract.pkl")

#distributional vectors processing: PPMI weighting followed by SVD
distr_space = distr_space.apply(PpmiWeighting())
distr_space = distr_space.apply(Svd(300))
#io_utils.save(distr_space, "./spaces/smooth_phrases_ppmi.pkl")

items = items_from_file(itemsFile)
els_for_comp = elements_for_composition(items)

# Compose the listed elements; distr_space is replaced by the composed space.
my_comp = WeightedAdditive(alpha=1, beta=1)
distr_space = my_comp.compose(els_for_comp, distr_space)

# NOTE(review): this rebinds the name `pairs` from the callable to its result,
# so `pairs(...)` cannot be called again after this line.
pairs = pairs(items)

# Cosine similarities for the same pairs in the composed and the typ_space spaces.
predicted = distr_space.get_sims(pairs, CosSimilarity())
gold = typ_space.get_sims(pairs, CosSimilarity())

#compute correlations
def test_ppmi(self):
    """
    Run ``PpmiWeighting`` on fixture matrices and compare with expectations.

    ``self.b`` is expected to become an all-zero 1x3 matrix, while ``self.c``
    is expected to come back unchanged.
    """
    w = PpmiWeighting()
    # np.asmatrix instead of np.mat: the latter was only an alias for it and
    # was removed from the NumPy 2.0 namespace.
    test_cases = [
        (self.b, np.asmatrix([[0, 0, 0]])),
        (self.c, self.c),
    ]

    for matrix_, expected in test_cases:
        self.single_case_test(matrix_, expected, w)
def test_ppmi_raises(self):
    """
    Check that ``PpmiWeighting`` rejects the ``self.d`` fixture.

    Each (matrix, exception type) pair is handed to
    ``single_case_raises_test`` together with the weighting instance.
    """
    weighting = PpmiWeighting()
    failing_inputs = [
        (self.d, ValueError),
    ]

    for bad_matrix, expected_error in failing_inputs:
        self.single_case_raises_test(bad_matrix, expected_error, weighting)
import sys
import os

# Put the toolkit sources on sys.path (once) so the `composes` imports below
# resolve. expandvars leaves this path unchanged: it contains no $VARIABLES.
folder = os.path.expandvars('/home/luka/Downloads/dissect-master/src')
if folder not in sys.path:
    sys.path.append(folder)

from composes.semantic_space.space import Space

# Build the space from the matrix file plus its row and column files.
holspace = Space.build(data = "/home/luka/ThLi/cooccurrence/spm1.sm",
                       rows = "/home/luka/ThLi/cooccurrence/rows1.rows",
                       cols = "/home/luka/ThLi/cooccurrence/cols1.cols",
                       format = "sm")

#%%
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting

# Replace the space with its PPMI-weighted version.
holspace = holspace.apply(PpmiWeighting())

#%%
from composes.utils import io_utils

# Save the weighted space with io_utils.save.
io_utils.save(holspace, "/home/luka/ThLi/cooccurrence/weighted")

#%%
# Additionally export the weighted space with format "sm".
holspace.export("/home/luka/ThLi/cooccurrence/weighted_sm", format = "sm")