def setUp(self):
    """Create one sparse-backed and one dense-backed space over the same data."""
    self.a = np.array([[1, 2, 3], [4, 0, 5]])

    # Fresh label lists are built for each space so the two never share them.
    for attribute, matrix_cls in (("space_s", SparseMatrix),
                                  ("space_d", DenseMatrix)):
        space = Space(matrix_cls(np.mat(self.a)),
                      ["a", "b"],
                      ["f1", "f2", "f3"])
        setattr(self, attribute, space)
def setUp(self):
    """Prepare constructor test cases and two small dense spaces."""
    self.dir_ = data_dir + "/space_test_resources/"

    # Each case is (matrix, id2row, id2column, row2id, column2id, operations).
    case_with_columns = (
        DenseMatrix(np.array([[1, 2], [3, 4]])),
        ["car", "man"],
        ["feat1", "feat2"],
        {"man": 1, "car": 0},
        {"feat1": 0, "feat2": 1},
        [ScalingOperation(EpmiWeighting())],
    )
    case_without_columns = (
        DenseMatrix(np.array([[1, 2], [3, 4]])),
        ["car", "man"],
        [],
        {"man": 1, "car": 0},
        {},
        [ScalingOperation(EpmiWeighting())],
    )
    self.init_test_cases = [case_with_columns, case_without_columns]

    # Row and column labels shared by the fixtures below.
    self.row1 = ["a"]
    self.row2 = ["a", "b", "c"]
    self.ft1 = ["f1", "f2", "f3"]

    # Single-row space.
    self.m1 = np.array([[1, 2, 3]])
    self.space1 = Space(DenseMatrix(self.m1), self.row1, self.ft1)

    # Three-row space plus a reference matrix kept on the fixture as ``us``.
    self.x = np.mat([[1, 2, 3],
                     [2, 4, 6],
                     [4, 675, 43]])
    self.us = np.mat([[2.19272110e+00, 3.03174768e+00],
                      [4.38544220e+00, 6.06349536e+00],
                      [6.76369708e+02, -4.91431927e-02]])
    self.space2 = Space(DenseMatrix(self.x), self.row2, self.ft1)
def main():
    """
    Convert temporal referencing matrix to regular (binned) matrix.

    Loads the space stored under ``<spacePrefix>``, keeps only the rows whose
    label is either a plain target (no underscore) or ``target_<ref>``, strips
    the ``_<ref>`` suffix from the kept labels, and saves the reduced space to
    ``<outPath>`` in w2v format (``-w``) or as a sparse-matrix space (``-s``).

    Raises:
        ValueError: if no row of the loaded space matches the selection.
    """

    # Get the arguments
    args = docopt("""Convert temporal referencing matrix to regular (binned) matrix.

    Usage:
        tr2bin.py (-w | -s) <spacePrefix> <ref> <outPath>

        <spacePrefix> = path to pickled space without suffix
        <ref> = reference string
        <outPath> = output path for result file

    Options:
        -w, --w2v  save in w2v format
        -s, --sps  save in sparse matrix format

    """)

    is_w2v = args['--w2v']
    is_sps = args['--sps']
    spacePrefix = args['<spacePrefix>']
    ref = args['<ref>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load spaces
    space = load_pkl_files(spacePrefix)
    matrix = space.get_cooccurrence_matrix().get_mat()
    id2row = space.get_id2row()
    id2column = space.get_id2column()

    # Select rows labelled "word" or "word_<ref>"; labels with any other
    # underscore structure are dropped.
    selected = []
    for index, label in enumerate(id2row):
        parts = label.split('_')
        if len(parts) == 1 or (len(parts) == 2 and parts[1] == ref):
            selected.append((parts[0], index))

    if not selected:
        # zip(*[]) would otherwise fail with an opaque unpacking error.
        raise ValueError(
            "No rows of the space match reference string %r" % (ref,))

    targets, indices = zip(*selected)
    new_matrix = matrix[list(indices), :]

    # Save the Space objects
    if is_w2v:
        new_space = Space(DenseMatrix(new_matrix), list(targets), id2column)
        save_pkl_files(new_space, outPath, save_in_one_file=True,
                       save_as_w2v=True)
    if is_sps:
        new_space = Space(SparseMatrix(new_matrix), list(targets), id2column)
        save_pkl_files(new_space, outPath, save_in_one_file=True,
                       save_as_w2v=False)

    logging.info("--- %s seconds ---", time.time() - start_time)
def setUp(self):
    """Create a noun space and a phrase space holding the same 2x2 values."""
    self.ft = ["f1", "f2"]
    values = [[3, 4], [5, 6]]

    noun_matrix = DenseMatrix(np.mat(values))
    self.n_space = Space(noun_matrix, ["car", "man"], self.ft)

    phrase_matrix = DenseMatrix(np.mat(values))
    self.an_space = Space(phrase_matrix, ["a1_car", "a1_man"], self.ft)
def test_weighted_additive(self):
    """Export a WeightedAdditive model before and after training it."""
    fixture_matrices = (
        ("m12", [[3, 1], [9, 2]]),
        ("m22", [[4, 3], [2, 1]]),
        ("ph2", [[18, 11], [24, 7]]),
    )
    for attribute, values in fixture_matrices:
        setattr(self, attribute, DenseMatrix(np.mat(values)))

    self.row = ["a", "b"]
    self.ft = ["f1", "f2"]
    self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
    self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)

    model = WeightedAdditive()
    untrained_path = self.prefix + ".add1"
    model.export(untrained_path)

    training_triples = [("a", "a", "a_a")]
    model.train(training_triples, self.space1, self.space2)

    trained_path = self.prefix + ".add2"
    model.export(trained_path)
def test_dilation(self):
    """Export a Dilation model before and after training it."""
    fixture_matrices = (
        ("m12", [[3, 1], [9, 2]]),
        ("m22", [[4, 3], [2, 1]]),
        ("ph2", [[18, 11], [24, 7]]),
    )
    for attribute, values in fixture_matrices:
        setattr(self, attribute, DenseMatrix(np.mat(values)))

    self.row = ["a", "b"]
    self.ft = ["f1", "f2"]
    self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
    self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)

    model = Dilation()
    untrained_path = self.prefix + ".dil1"
    model.export(untrained_path)

    training_triples = [("a", "b", "a_b")]
    model.train(training_triples, self.space1, self.space2)

    trained_path = self.prefix + ".dil2"
    model.export(trained_path)
def test_train_intercept(self):
    """Train a lexical function model with an intercept, then compose with it."""
    adj1_mat = DenseMatrix(np.mat([[3, 4], [5, 6]]))
    adj2_mat = DenseMatrix(np.mat([[1, 2], [3, 4]]))

    triples = [
        ("a1", "man", "a1_man"),
        ("a2", "car", "a2_car"),
        ("a1", "boy", "a1_boy"),
        ("a2", "boy", "a2_boy"),
    ]

    noun_mat = DenseMatrix(np.mat([[13, 21], [3, 4], [5, 6]]))
    noun_space = Space(noun_mat, ["man", "car", "boy"], self.ft)

    # Phrase vectors are each adjective matrix applied to every noun vector,
    # stacked with the a1 phrases first.
    adj1_phrases = (adj1_mat * noun_mat.transpose()).transpose()
    adj2_phrases = (adj2_mat * noun_mat.transpose()).transpose()
    phrase_mat = adj1_phrases.vstack(adj2_phrases)
    phrase_labels = ["a1_man", "a1_car", "a1_boy",
                     "a2_man", "a2_car", "a2_boy"]
    phrase_space = Space(phrase_mat, phrase_labels, self.ft)

    # test train
    learner = LstsqRegressionLearner(intercept=True)
    trained_model = LexicalFunction(learner=learner)
    trained_model.train(triples, noun_space, phrase_space)
    learned_space = trained_model.function_space

    # The value comparisons against the learned matrices are disabled:
    adj1_mat.reshape((1, 4))
    # np.testing.assert_array_almost_equal(a1_mat.mat,
    #                                      a_space.cooccurrence_matrix.mat[0])
    adj2_mat.reshape((1, 4))
    # np.testing.assert_array_almost_equal(a2_mat.mat,
    #                                      a_space.cooccurrence_matrix.mat[1])

    self.assertListEqual(learned_space.id2row, ["a1", "a2"])
    self.assertTupleEqual(learned_space.element_shape, (2, 3))

    # test compose
    adj1_mat = DenseMatrix(np.mat([[3, 4, 5, 6]]))
    adj2_mat = DenseMatrix(np.mat([[1, 2, 3, 4]]))

    learned_mat = learned_space.cooccurrence_matrix
    rebuilt_space = Space(learned_mat, ["a1", "a2"], [],
                          element_shape=(2, 3))
    composer = LexicalFunction(function_space=rebuilt_space, intercept=True)
    composed = composer.compose(triples, noun_space)

    expected_rows = ["a1_man", "a2_car", "a1_boy", "a2_boy"]
    self.assertListEqual(composed.id2row, expected_rows)
    self.assertListEqual(composed.id2column, [])
    self.assertEqual(composed.element_shape, (2,))

    expected_values = phrase_mat[[0, 4, 2, 5]].mat
    np.testing.assert_array_almost_equal(composed.cooccurrence_matrix.mat,
                                         expected_values, 8)
def test_vstack_raises(self):
    """vstack must raise ValueError for each of the four mismatching spaces."""
    top_two_rows = self.x[0:2, :]

    # All four candidates are built up front, before any assertion runs.
    candidates = [
        Space(DenseMatrix(self.x[0:2, 0:1]), ["e", "f"], self.ft1[0:1]),
        Space(DenseMatrix(top_two_rows), ["a", "f"], self.ft1),
        Space(DenseMatrix(top_two_rows), ["e", "f"], []),
        Space(DenseMatrix(top_two_rows), ["e", "f"], ["f1", "f2", "f4"]),
    ]

    base = self.space2
    for other in candidates:
        with self.assertRaises(ValueError):
            base.vstack(base, other)
def test_full_additive(self):
    """Exporting an untrained FullAdditive must fail; after training it works."""
    fixture_matrices = (
        ("m12", [[3, 1], [9, 2]]),
        ("m22", [[4, 3], [2, 1]]),
        ("ph2", [[18, 11], [24, 7]]),
    )
    for attribute, values in fixture_matrices:
        setattr(self, attribute, DenseMatrix(np.mat(values)))

    self.row = ["a", "b"]
    self.ft = ["f1", "f2"]
    self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
    self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)

    model = FullAdditive()
    untrained_path = self.prefix + ".full1"
    with self.assertRaises(IllegalStateError):
        model.export(untrained_path)

    training_triples = [("a", "b", "a_b"), ("a", "a", "a_a")]
    model.train(training_triples, self.space1, self.space2)

    trained_path = self.prefix + ".full2"
    model.export(trained_path)
def test_lexical_function(self):
    """Exporting an untrained LexicalFunction must fail; after training it works."""
    fixture_matrices = (
        ("m12", [[3, 1], [9, 2]]),
        ("m22", [[4, 3], [2, 1]]),
        ("ph2", [[18, 11], [24, 7]]),
    )
    for attribute, values in fixture_matrices:
        setattr(self, attribute, DenseMatrix(np.mat(values)))

    self.row = ["a", "b"]
    self.ft = ["f1", "f2"]
    self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
    self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)

    model = LexicalFunction()
    # Lower the per-function-word sample threshold for this tiny fixture.
    model._MIN_SAMPLES = 1

    untrained_path = self.prefix + ".lf1"
    with self.assertRaises(IllegalStateError):
        model.export(untrained_path)

    training_triples = [("a", "b", "a_b"), ("a", "a", "a_a")]
    model.train(training_triples, self.space1, self.space2)

    trained_path = self.prefix + ".lf2"
    model.export(trained_path)
def main():
    """Convert a VW topic output file into a pickled COMPOSES ``Space``.

    Command-line arguments:
        --input / -i:    text file with one line of whitespace-separated
                         topic weights per document.
        --docnames / -d: text file with one document name per line; blank
                         lines are ignored.
        --output / -o:   destination of the pickle (defaults to stdout).

    Row ``i`` of the dense matrix holds the weights read from line ``i`` of
    the input. The document names become the row labels and the column
    labels are left empty.
    """
    parser = argparse.ArgumentParser(
        description='Converts a VW topic output to a COMPOSES pkl file.')
    parser.add_argument('--input', '-i', type=argparse.FileType('r'),
                        required=True, help='Input file')
    parser.add_argument('--docnames', '-d', type=argparse.FileType('r'),
                        required=True, help='Docnames file')
    # Pickle data is binary: open the target in binary mode and fall back to
    # the binary layer of stdout where one exists.
    parser.add_argument('--output', '-o', type=argparse.FileType('wb'),
                        default=getattr(sys.stdout, 'buffer', sys.stdout),
                        help='Output file')
    args = parser.parse_args()

    docnames = [name for name in (raw.strip() for raw in args.docnames)
                if name]

    matrix = None
    for i, line in enumerate(args.input):
        # A real list is needed: len() is taken below and the values are
        # assigned into a matrix row.
        weights = [float(value) for value in line.split()]
        if matrix is None:
            # The first line fixes the number of topics (columns).
            matrix = np.zeros((len(docnames), len(weights)), dtype=float)
        matrix[i] = weights

    dm = DenseMatrix(matrix)
    sp = Space(dm, docnames, [])

    pickle.dump(sp, args.output)
    args.output.close()
def main():
    """
    Transform EPMI matrix in npz format to SPPMI space and save as pickle file.

    Reads ``<spacePrefix>.npz`` (CSR components ``data``, ``indices``,
    ``indptr``, ``shape``) together with ``<spacePrefix>.words.vocab`` and
    ``<spacePrefix>.contexts.vocab``, takes the logarithm of the stored
    values, subtracts ``log(k)``, drops every non-positive entry and saves
    the resulting sparse space under ``<outPath> + 'ppmi.sm'``.

    Raises:
        ValueError: if ``<k>`` is not a positive integer.
    """

    # Get the arguments
    args = docopt('''Transform EPMI matrix in npz format to SPPMI space and save as pickle file.

    Usage:
        transform_matrix_epmi2sppmi.py <spacePrefix> <outPath> <k>

        <spacePrefix> = path to npz without suffix
        <outPath> = output path for space
        <k> = shifting parameter

    ''')

    spacePrefix = args['<spacePrefix>']
    outPath = args['<outPath>']
    k = int(args['<k>'])
    if k <= 0:
        # log(k) is undefined for k <= 0 and would fill the matrix with
        # inf/nan instead of failing.
        raise ValueError("<k> must be a positive integer, got %d" % k)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Get npz matrix
    with np.load(spacePrefix + '.npz') as loader:
        matrix = csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=loader['shape'])

    # Lines read from a file always contain at least the newline, so blank
    # lines have to be detected after stripping.
    with open(spacePrefix + '.words.vocab') as f:
        id2row = [line.strip() for line in f if line.strip()]

    with open(spacePrefix + '.contexts.vocab') as f:
        id2column = [line.strip() for line in f if line.strip()]

    # Apply log weighting
    matrix.data = np.log(matrix.data)

    # Shift values
    matrix.data -= np.log(k)

    # Eliminate negative counts
    matrix.data[matrix.data <= 0] = 0.0

    # Eliminate zero counts
    matrix.eliminate_zeros()

    # Create new space
    sparseSpace = Space(SparseMatrix(matrix), id2row, id2column)

    # Save the Space object in pickle format
    save_pkl_files(sparseSpace, outPath + 'ppmi.sm', save_in_one_file=True)

    logging.info("--- %s seconds ---", time.time() - start_time)
def load_pkl_files(dsm_prefix):
    """
    Load the space from either a single pkl file or numerous files.

    If ``dsm_prefix + '.pkl'`` exists it is loaded through ``io_utils`` and
    returned. Otherwise the matrix is read from ``dsm_prefix + 'cooc.npz'``
    (COO components ``data``, ``row``, ``col``, ``shape``) and the four
    label mappings from ``dsm_prefix + '_<name>.pkl'``.

    :param dsm_prefix: common path prefix of the files making up the space
    :return: the loaded ``Space``
    """
    single_file = dsm_prefix + '.pkl'
    if os.path.isfile(single_file):
        return io_utils.load(single_file)

    # Matrix part: stored as COO triplets, handed to the space as CSR.
    with np.load(dsm_prefix + 'cooc.npz') as loader:
        coordinates = (loader['row'], loader['col'])
        coo = coo_matrix((loader['data'], coordinates), shape=loader['shape'])
    cooccurrence_matrix = SparseMatrix(csr_matrix(coo))

    # Remaining data members of Space, one pickle file each.
    members = {}
    for member in ('row2id', 'id2row', 'column2id', 'id2column'):
        with open(dsm_prefix + '_' + member + '.pkl', 'rb') as f_in:
            members[member] = pickle.load(f_in)

    return Space(cooccurrence_matrix,
                 members['id2row'],
                 members['id2column'],
                 row2id=members['row2id'],
                 column2id=members['column2id'])
def read_mikolov(spacefile):
    """Read a binary word-vector file into a dense ``Space``.

    The expected layout, as consumed by the code below, is a header line
    ``"<vocab size> <dims>"`` followed by one record per word: the word, a
    single space, ``dims`` packed floats of ``FLOAT_SIZE`` bytes each, and
    one trailing byte that is discarded.

    Args:
        spacefile: an open file object positioned at the header. Packed
            floats can only be unpacked from bytes, so on Python 3 the file
            has to be opened in binary mode.

    Returns:
        A ``Space`` whose rows are the words in file order and whose column
        labels are empty.

    Raises:
        ValueError: if a record contains no word/data separator.
    """
    header = spacefile.readline().rstrip()
    # split() without arguments works for both str and bytes headers.
    vocab_s, dims = map(int, header.split())

    vocab = []

    # init matrix
    matrix = np.zeros((vocab_s, dims), dtype=float)

    record_size = FLOAT_SIZE * dims + 1  # packed floats + trailing byte
    unpack_format = "%df" % dims

    i = 0
    while True:
        line = spacefile.readline()
        if not line:
            break

        # Search with a separator of the same type as the data read.
        separator = b" " if isinstance(line, bytes) else " "
        sep = line.find(separator)
        if sep == -1:
            raise ValueError(
                "Couldn't find the vocab/data separation character! Space file corruption?"
            )

        word = line[:sep]
        data = line[sep + 1:]
        # readline() stops at the first newline byte, which may occur inside
        # the packed floats; top the record up to its full size.
        if len(data) < record_size:
            data += spacefile.read(record_size - len(data))
        data = data[:-1]

        if not isinstance(word, str):
            # Binary-mode read on Python 3: store text labels, not bytes.
            word = word.decode("utf-8")
        vocab.append(word)

        matrix[i] = struct.unpack(unpack_format, data)
        i += 1

    dm = DenseMatrix(matrix)
    sp = Space(dm, vocab, [])
    return sp
def main():
    """Convert a vecf text file into a pickled dissect ``Space``.

    Command-line arguments:
        --input / -i:  vecf file. The first line is ``"<vocab size> <dims>"``;
                       every following line is a word followed by its
                       whitespace-separated vector components.
        --output / -o: destination of the pickle.

    The words become the row labels of a dense space; the column labels are
    left empty.
    """
    parser = argparse.ArgumentParser(
        description="Converts a vecf file to dissect pkl format.")
    parser.add_argument('--input', '-i', type=argparse.FileType('r'),
                        required=True, help='Input file')
    # Pickle data is binary, so the target is opened in binary mode.
    parser.add_argument('--output', '-o', type=argparse.FileType('wb'),
                        required=True, help='Output file')
    args = parser.parse_args()

    header = args.input.readline().rstrip()
    vocab_s, dims = map(int, header.split(" "))

    vocab = []

    # init matrix
    matrix = np.zeros((vocab_s, dims), dtype=float)

    for i, line in enumerate(args.input):
        fields = line.split()
        word = fields[0]
        # Build a real list: wrapping a lazy map object in np.array does not
        # produce a numeric vector on Python 3.
        vector = [float(component) for component in fields[1:]]
        vocab.append(word)
        matrix[i] = vector

    dm = DenseMatrix(matrix)
    sp = Space(dm, vocab, [])

    pickle.dump(sp, args.output)
    args.output.close()
def compose(self, data, arg_space):
    """
    Uses a composition model to compose elements.

    The valid data is composed in chunks and the partial results are stacked,
    so that the temporary argument matrices stay bounded (see the chunk size
    computation below).

    Args:
        data: data to be composed. List of tuples, each containing 3
            strings: (arg1, arg2, composed_phrase). arg1 and arg2 are the
            elements to be composed and composed_phrase is the string
            associated to their composition.

        arg_space: argument space(s). Space object or a tuple of two
            Space objects (e.g. my_space, or (my_space1, my_space2)).
            If two spaces are provided, arg1 elements of data are
            interpreted in space1, and arg2 in space2.

    Returns:
        composed space: a new object of type Space, containing the
        phrases obtained through composition.

    """
    start = time.time()

    arg1_space, arg2_space = self.extract_arg_spaces(arg_space)
    arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(
        data, (arg1_space.row2id, arg2_space.row2id, None))

    # we try to achieve at most MAX_MEM_OVERHEAD*phrase_space memory overhead
    # the /3.0 is needed because the composing data needs
    # 3 * len(train_data) memory (arg1 vector, arg2 vector, phrase vector)
    largest_row_count = max(arg1_space.cooccurrence_matrix.shape[0],
                            arg2_space.cooccurrence_matrix.shape[0],
                            len(phrase_list))
    chunk_size = int(largest_row_count * self.MAX_MEM_OVERHEAD / 3.0) + 1

    total = len(arg1_list)
    # Always process at least one (possibly empty) chunk so that a result
    # matrix exists to stack, even when no data item is valid.
    chunk_count = max(1, int(math.ceil(total / float(chunk_size))))

    composed_mats = []
    for i in range(chunk_count):
        beg, end = i * chunk_size, min((i + 1) * chunk_size, total)

        arg1_mat = arg1_space.get_rows(arg1_list[beg:end])
        arg2_mat = arg2_space.get_rows(arg2_list[beg:end])

        [arg1_mat, arg2_mat] = resolve_type_conflict([arg1_mat, arg2_mat],
                                                     DenseMatrix)
        composed_mat = self._compose(arg1_mat, arg2_mat)
        composed_mats.append(composed_mat)

    composed_phrase_mat = composed_mat.nary_vstack(composed_mats)

    if self.composed_id2column is None:
        self.composed_id2column = self._build_id2column(arg1_space,
                                                        arg2_space)

    log.print_name(logger, self, 1, "\nComposed with composition model:")
    # Report the number of items composed over all chunks, not just the
    # size of the last chunk.
    log.print_info(logger, 3, "Composed total data points:%s" % total)
    log.print_matrix_info(logger, composed_phrase_mat, 4,
                          "Resulted (composed) semantic space::")
    log.print_time_info(logger, time.time(), start, 2)

    return Space(composed_phrase_mat, phrase_list, self.composed_id2column)
def setUp(self):
    """Create a one-row and a three-row dense space plus reference matrices."""
    # Labels.
    self.row1 = ["a"]
    self.row2 = ["b"]
    self.row3 = ["a", "b", "c"]
    self.ft1 = ["f1", "f2", "f3"]

    # Raw single-row matrices.
    self.m1 = np.array([[1, 2, 3]])
    self.m2 = np.array([[4, 2, 6]])

    # Three-row matrix and the reference matrices kept next to it.
    self.x = np.mat([[1, 2, 3],
                     [2, 4, 6],
                     [4, 675, 43]])
    self.us = np.mat([[2.19272110e+00, 3.03174768e+00],
                      [4.38544220e+00, 6.06349536e+00],
                      [6.76369708e+02, -4.91431927e-02]])
    self.us2 = np.mat([[2.19272110e+00],
                       [4.38544220e+00],
                       [6.76369708e+02]])

    # Spaces under test.
    self.space1 = Space(DenseMatrix(self.m1), self.row1, self.ft1)
    self.space2 = Space(DenseMatrix(self.x), self.row3, self.ft1)
def compose(self, data, arg_space):
    """
    Uses a lexical function composition model to compose elements.

    Args:
        data: data to be composed. List of tuples, each containing 3
            strings: (function_word, arg, composed_phrase). function_word
            and arg are the elements to be composed and composed_phrase is
            the string associated to their composition. function_word
            elements are interpreted in self.function_space.

        arg_space: argument space, of type Space. arg elements of data are
            interpreted in this space.

    Returns:
        composed space: a new object of type Space, containing the
        phrases obtained through composition.

    Raises:
        ValueError: if none of the data items is valid, i.e. there is
            nothing to compose.

    """
    start = time.time()

    assert_is_instance(arg_space, Space)
    function_space = self._function_space
    arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(
        data, (function_space.row2id, arg_space.row2id, None))

    if len(arg1_list) == 0:
        # Without this check the stacking step below would fail with an
        # UnboundLocalError, because no composed vector is ever produced.
        raise ValueError("No valid data to compose!")

    element_shape = function_space.element_shape

    composed_vec_list = []
    for function_word, arg in zip(arg1_list, arg2_list):
        arg1_vec = function_space.get_row(function_word)
        arg2_vec = arg_space.get_row(arg)

        # Bring both operands to the matrix type of the larger one.
        matrix_type = get_type_of_largest([arg1_vec, arg2_vec])
        [arg1_vec, arg2_vec] = resolve_type_conflict([arg1_vec, arg2_vec],
                                                     matrix_type)

        composed_ph_vec = self._compose(arg1_vec, arg2_vec, element_shape)
        composed_vec_list.append(composed_ph_vec)

    # Composition consumes the last dimension of the function elements.
    result_element_shape = element_shape[0:-1]
    composed_ph_mat = composed_vec_list[-1].nary_vstack(composed_vec_list)

    log.print_name(logger, self, 1, "\nComposed with composition model:")
    log.print_info(logger, 3,
                   "Composed total data points:%s" % len(arg1_list))
    log.print_info(
        logger, 3,
        "Functional shape of the resulted (composed) elements:%s" %
        (result_element_shape, ))
    log.print_matrix_info(logger, composed_ph_mat, 4,
                          "Resulted (composed) semantic space:")
    log.print_time_info(logger, time.time(), start, 2)

    return Space(composed_ph_mat, phrase_list, self.composed_id2column,
                 element_shape=result_element_shape)
def test_init1(self):
    """A space built from matrix and labels only derives its id mappings."""
    for case in self.init_test_cases:
        # The trailing "operations" entry of a case is not used here.
        matrix, rows, columns, expected_row2id, expected_column2id = case[:5]

        built = Space(matrix, rows, columns)

        # The constructor arguments are stored as the very same objects ...
        self.assertIs(matrix, built.cooccurrence_matrix)
        self.assertIs(rows, built.id2row)
        self.assertIs(columns, built.id2column)

        # ... while the reverse mappings are compared by value.
        self.assertDictEqual(expected_row2id, built.row2id)
        self.assertDictEqual(expected_column2id, built.column2id)

        self.assertListEqual([], built.operations)
def test_init4(self):
    """A fully specified space keeps every constructor argument by identity."""
    for case in self.init_test_cases:
        matrix, rows, columns, row_index, column_index, operations = case

        built = Space(matrix, rows, columns, row_index, column_index,
                      operations=operations)

        stored_and_given = (
            (built.cooccurrence_matrix, matrix),
            (built.id2row, rows),
            (built.id2column, columns),
            (built.row2id, row_index),
            (built.column2id, column_index),
            (built.operations, operations),
        )
        for stored, given in stored_and_given:
            self.assertIs(given, stored)
def load_pkl_files(dsm_prefix):
    """
    Load the space from either a single pkl file or numerous files.

    The following layouts are tried in this order:

    1. ``<prefix>.pkl``: a single pickled Space, loaded through ``io_utils``.
    2. ``<prefix>.npz`` (COO components ``data``, ``row``, ``col``,
       ``shape``) plus ``<prefix>_row2id.pkl``, ``<prefix>_id2row.pkl``,
       ``<prefix>_column2id.pkl`` and ``<prefix>_id2column.pkl``.
    3. ``<prefix>.tsv`` (tab-separated values) plus ``<prefix>.rows``.
    4. ``<prefix>.w2v``: a space-separated text file whose first line is
       skipped and whose first column holds the row labels.

    Layouts 3 and 4 produce a space without column labels.

    NOTE(review): layouts 1 and 2 unpickle their input; only use them on
    files from a trusted source.

    :param dsm_prefix: common path prefix of the input files
    :return: the loaded ``Space``
    """
    # Check whether there is a single pickle file for the Space object
    if os.path.isfile(dsm_prefix + '.pkl'):
        return io_utils.load(dsm_prefix + '.pkl')

    # Load the multiple files: npz for the matrix and pkl for the other data
    # members of Space
    if os.path.isfile(dsm_prefix + '.npz'):
        with np.load(dsm_prefix + '.npz') as loader:
            coo = coo_matrix((loader['data'], (loader['row'], loader['col'])),
                             shape=loader['shape'])
        cooccurrence_matrix = SparseMatrix(csr_matrix(coo))

        with open(dsm_prefix + '_row2id.pkl', 'rb') as f_in:
            row2id = pickle.load(f_in)
        with open(dsm_prefix + '_id2row.pkl', 'rb') as f_in:
            id2row = pickle.load(f_in)
        with open(dsm_prefix + '_column2id.pkl', 'rb') as f_in:
            column2id = pickle.load(f_in)
        with open(dsm_prefix + '_id2column.pkl', 'rb') as f_in:
            id2column = pickle.load(f_in)

        return Space(cooccurrence_matrix, id2row, id2column,
                     row2id=row2id, column2id=column2id)

    if os.path.isfile(dsm_prefix + '.tsv'):
        # ndmin keeps the expected rank when a file holds a single row.
        values = np.loadtxt(dsm_prefix + '.tsv', dtype=float, delimiter='\t',
                            skiprows=0, comments=None, encoding='utf-8',
                            ndmin=2)
        targets = np.loadtxt(dsm_prefix + '.rows', dtype=str, skiprows=0,
                             comments=None, encoding='utf-8', ndmin=1)
        # Convert to space in sparse matrix format
        return Space(SparseMatrix(values), list(targets), [])

    # If everything fails try to load it as single w2v file
    space_array = np.loadtxt(dsm_prefix + '.w2v', dtype=object, delimiter=' ',
                             skiprows=1, comments=None, encoding='utf-8',
                             ndmin=2)
    targets = space_array[:, 0].flatten()
    # The builtin float is the dtype that the removed np.float alias stood for.
    values = space_array[:, 1:].astype(float)
    # Convert to space and sparse matrix format
    return Space(SparseMatrix(values), list(targets), [])
def test_vstack(self):
    """Stacking two compatible spaces appends the rows and keeps the columns."""
    extra_rows = self.x[0:2, :]
    bottom = Space(DenseMatrix(extra_rows), ["e", "f"], self.ft1)
    stacked_values = np.vstack((self.x, extra_rows))
    expected = Space(DenseMatrix(stacked_values),
                     ["a", "b", "c", "e", "f"], self.ft1)

    for top, below, wanted in [(self.space2, bottom, expected)]:
        result = top.vstack(top, below)

        np.testing.assert_array_equal(wanted.cooccurrence_matrix.mat,
                                      result.cooccurrence_matrix.mat)

        # Columns must agree with both the first operand and the expectation.
        for reference in (top, wanted):
            self.assertListEqual(result.id2column, reference.id2column)
        for reference in (top, wanted):
            self.assertDictEqual(result.column2id, reference.column2id)

        self.assertListEqual(result.id2row, wanted.id2row)
        self.assertDictEqual(result.row2id, wanted.row2id)

        self.assertListEqual([], result.operations)
def compose(self, data, arg_space):
    """
    Composes the given pairs of elements with this composition model.

    Args:
        data: list of 3-string tuples (arg1, arg2, composed_phrase): the
            two elements to be composed and the string to associate with
            their composition.

        arg_space: a Space, or a tuple of two Spaces. With two spaces, the
            arg1 elements are looked up in the first one and the arg2
            elements in the second one.

    Returns:
        composed space: a new Space holding the composed phrases.

    """
    began_at = time.time()

    first_space, second_space = self.extract_arg_spaces(arg_space)
    lookup_tables = (first_space.row2id, second_space.row2id, None)
    first_words, second_words, phrase_list = self.valid_data_to_lists(
        data, lookup_tables)

    operands = [first_space.get_rows(first_words),
                second_space.get_rows(second_words)]
    left_mat, right_mat = resolve_type_conflict(operands, DenseMatrix)

    result_mat = self._compose(left_mat, right_mat)

    # The column labels are built once and reused by later calls.
    if self.composed_id2column is None:
        self.composed_id2column = self._build_id2column(first_space,
                                                        second_space)

    log.print_name(logger, self, 1, "\nComposed with composition model:")
    log.print_info(logger, 3,
                   "Composed total data points:%s" % left_mat.shape[0])
    log.print_matrix_info(logger, result_mat, 4,
                          "Resulted (composed) semantic space::")
    log.print_time_info(logger, time.time(), began_at, 2)

    return Space(result_mat, phrase_list, self.composed_id2column)
def main():
    """
    Convert txt matrix to w2v matrix and save.

    Reads ``<spacePrefix>.txt``, a space-separated text file whose first
    column holds the row labels and whose remaining columns hold the vector
    components, and saves it as a space in w2v format under ``<outPath>``.
    The resulting space has no column labels.
    """

    # Get the arguments
    args = docopt('''Convert txt matrix to w2v matrix and save.

    Usage:
        convert_matrix_txt2w2v.py <spacePrefix> <outPath>

        <spacePrefix> = path to txt matrix without suffix
        <outPath> = output path for space

    ''')

    spacePrefix = args['<spacePrefix>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # comments=None disables comment handling (an empty string is not
    # accepted by current NumPy); ndmin=2 keeps a single-row file 2-D.
    space_array = np.loadtxt(spacePrefix + '.txt', dtype=object,
                             delimiter=' ', skiprows=0, comments=None,
                             encoding='utf-8', ndmin=2)
    targets = space_array[:, 0].flatten()
    # The builtin float is the dtype that the removed np.float alias stood for.
    values = space_array[:, 1:].astype(float)

    # Create new space
    space = Space(DenseMatrix(coo_matrix(values)), list(targets), [])

    # Save the Space object in w2v format
    save_pkl_files(space, outPath, save_in_one_file=True, save_as_w2v=True)

    logging.info("--- %s seconds ---", time.time() - start_time)
def setUp(self):
    """Create column-vector matrices and sparse/dense argument and phrase spaces."""

    def make_space(matrix_cls, values, rows):
        # Every space gets its own, freshly built column label list.
        return Space(matrix_cls(np.mat(values)), rows, ["f1", "f2"])

    self.m11 = DenseMatrix(np.mat([[3], [9]]))
    self.m21 = DenseMatrix(np.mat([[4], [2]]))
    self.ph1 = DenseMatrix(np.mat([[18], [24]]))

    # Sparse-backed spaces.
    self.space1 = make_space(SparseMatrix, [[3, 9], [4, 2]], ["a", "b"])
    self.space2 = make_space(SparseMatrix, [[7, 11]], ["a_b"])
    self.space3 = make_space(SparseMatrix, [[0, 0]], ["a_b"])

    # Dense-backed counterparts.
    self.space4 = make_space(DenseMatrix, [[3, 9], [4, 2]], ["a", "b"])
    self.space5 = make_space(DenseMatrix, [[7, 11]], ["a_b"])
    self.space6 = make_space(DenseMatrix, [[0, 0]], ["a_b"])

    # Dense phrase space with two identical rows.
    self.space7 = make_space(DenseMatrix, [[7, 11], [7, 11]],
                             ["a_b", "a_a"])
def main():
    """
    Make count-based vector space from corpus.

    The corpus is read twice through ``PathLineSentences_mod``: once to
    collect the vocabulary and once to count, for every word, the words
    occurring within ``<windowSize>`` positions on either side of it. The
    counts are stored in a square sparse matrix over the vocabulary,
    optionally L2-normalized per row (``-l``), and saved under ``<outPath>``.
    """

    # Get the arguments
    args = docopt("""Make count-based vector space from corpus.

    Usage:
        count.py [-l] <windowSize> <corpDir> <outPath> <lowerBound> <upperBound>

    Arguments:

        <corpDir> = path to corpus directory with zipped files, each sentence in form 'year\tword1 word2 word3...'
        <outPath> = output path for vectors
        <windowSize> = the linear distance of context words to consider in each direction
        <lowerBound> = lower bound for time period
        <upperBound> = upper bound for time period

    Options:
        -l, --len  normalize final vectors to unit length

    """)

    is_len = args['--len']
    corpDir = args['<corpDir>']
    outPath = args['<outPath>']
    windowSize = int(args['<windowSize>'])
    lowerBound = int(args['<lowerBound>'])
    upperBound = int(args['<upperBound>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Build vocabulary
    logging.info("Building vocabulary")
    sentences = PathLineSentences_mod(corpDir, lowerBound=lowerBound,
                                      upperBound=upperBound)
    # Skip one-word sentences to avoid zero-vectors
    vocabulary = list({
        word
        for sentence in sentences if len(sentence) > 1
        for word in sentence
    })
    w2i = {w: i for i, w in enumerate(vocabulary)}

    # Initialize co-occurrence matrix as dictionary
    cooc_mat = defaultdict(int)

    # Get counts from corpus
    sentences = PathLineSentences_mod(corpDir, lowerBound=lowerBound,
                                      upperBound=upperBound)
    logging.info("Counting context words")
    for sentence in sentences:
        for i, word in enumerate(sentence):
            lowerWindowSize = max(i - windowSize, 0)
            upperWindowSize = min(i + windowSize, len(sentence))
            window = (sentence[lowerWindowSize:i]
                      + sentence[i + 1:upperWindowSize + 1])
            if not window:
                # Skip one-word sentences
                continue
            windex = w2i[word]
            for contextWord in window:
                cooc_mat[(windex, w2i[contextWord])] += 1

    # Convert dictionary to sparse matrix
    logging.info("Converting dictionary to matrix")
    cooc_mat_sparse = dok_matrix((len(vocabulary), len(vocabulary)),
                                 dtype=float)
    try:
        cooc_mat_sparse.update(cooc_mat)
    except NotImplementedError:
        # Fallback for when the public update() raises NotImplementedError.
        cooc_mat_sparse._update(cooc_mat)

    if is_len:
        # L2-normalize vectors
        l2norm1 = linalg.norm(cooc_mat_sparse, axis=1, ord=2)
        l2norm1[l2norm1 == 0.0] = 1.0  # Convert 0 values to 1
        cooc_mat_sparse /= l2norm1.reshape(len(l2norm1), 1)

    # Make space
    # NOTE(review): on Python 3 this turns the labels into bytes objects;
    # confirm that the consumers of the saved space expect that.
    vocabulary = [v.encode('utf-8') for v in vocabulary]
    countSpace = Space(SparseMatrix(cooc_mat_sparse), vocabulary, vocabulary)

    # Save the Space object in pickle format
    save_pkl_files(countSpace, outPath, save_in_one_file=False)

    logging.info("Corpus has size %d", sentences.corpusSize)
    logging.info("--- %s seconds ---", time.time() - start_time)
def test_export(self):
    """Spaces survive an export/build round trip in "dm" and "sm" format."""
    out_file = self.dir_ + "tmp"

    mat1 = np.mat([[1, 2], [3, 0]])
    mat1row, mat1col = ["a", "b"], ["f1", "f2"]

    mat2 = np.mat([[0, 0]])
    mat2row, mat2col = ["a"], []

    def round_trip(space, file_format, with_cols=True):
        # Export the space to clean output files and read it back in.
        self.reset_export_files(out_file)
        space.export(out_file, format=file_format)
        build_args = {"data": out_file + "." + file_format,
                      "rows": out_file + ".rows",
                      "format": file_format}
        if with_cols:
            build_args["cols"] = out_file + ".cols"
        return Space.build(**build_args)

    # 3 cases allowed at the moment
    labelled_pairs = [
        (Space(DenseMatrix(mat1), mat1row, mat1col),
         Space(SparseMatrix(mat1), mat1row, mat1col)),
        (Space(DenseMatrix(mat2), mat2row, mat1col),
         Space(SparseMatrix(mat2), mat2row, mat1col)),
    ]
    for dense, sparse in labelled_pairs:
        self._test_equal_spaces_dense(dense, round_trip(dense, "dm"))
        self._test_equal_spaces_sparse(sparse, round_trip(dense, "sm"))
        self._test_equal_spaces_sparse(sparse, round_trip(sparse, "sm"))
        self._test_equal_spaces_dense(dense, round_trip(sparse, "dm"))

    # Spaces without column labels are rebuilt without a cols file.
    unlabelled_pairs = [
        (Space(DenseMatrix(mat2), mat2row, mat2col),
         Space(SparseMatrix(mat2), mat2row, mat2col)),
    ]
    for dense, sparse in unlabelled_pairs:
        self._test_equal_spaces_dense(
            dense, round_trip(dense, "dm", with_cols=False))
        self._test_equal_spaces_dense(
            dense, round_trip(sparse, "dm", with_cols=False))
def main():
    """
    Align two sparse matrices by intersecting their columns.

    Both spaces are reduced to the columns whose labels occur in both of
    them. The shared columns are kept in the order in which they appear in
    the first space, so repeated runs produce the same column layout. The
    reduced spaces are optionally L2-normalized per row (``-l``) and saved
    under ``<outPath1> + '.sm'`` and ``<outPath2> + '.sm'``.
    """

    # Get the arguments
    args = docopt('''Align two sparse matrices by intersecting their columns.

    Usage:
        count_alignment_intersect.py [-l] <outPath1> <outPath2> <spacePrefix1> <spacePrefix2>

        <outPath1> = output path for aligned space 1
        <outPath2> = output path for aligned space 2
        <spacePrefix1> = path to pickled space1 without suffix
        <spacePrefix2> = path to pickled space2 without suffix

    Options:
        -l, --len  normalize final vectors to unit length

    ''')

    is_len = args['--len']
    spacePrefix1 = args['<spacePrefix1>']
    spacePrefix2 = args['<spacePrefix2>']
    outPath1 = args['<outPath1>']
    outPath2 = args['<outPath2>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    space1 = load_pkl_files(spacePrefix1)
    space2 = load_pkl_files(spacePrefix2)
    id2row1 = space1.get_id2row()
    id2row2 = space2.get_id2row()
    id2column1 = space1.get_id2column()
    id2column2 = space2.get_id2column()
    column2id1 = space1.get_column2id()
    column2id2 = space2.get_column2id()

    # Order the shared labels by their column index in space 1 instead of
    # relying on set iteration order, which differs between runs.
    shared_columns = set(id2column1).intersection(id2column2)
    intersected_columns = sorted(shared_columns, key=column2id1.__getitem__)
    intersected_columns_id1 = [
        column2id1[item] for item in intersected_columns
    ]
    intersected_columns_id2 = [
        column2id2[item] for item in intersected_columns
    ]
    reduced_matrix1 = space1.get_cooccurrence_matrix(
    )[:, intersected_columns_id1].get_mat()
    reduced_matrix2 = space2.get_cooccurrence_matrix(
    )[:, intersected_columns_id2].get_mat()

    if is_len:
        # L2-normalize vectors
        l2norm1 = linalg.norm(reduced_matrix1, axis=1, ord=2)
        l2norm2 = linalg.norm(reduced_matrix2, axis=1, ord=2)
        l2norm1[l2norm1 == 0.0] = 1.0  # Convert 0 values to 1
        l2norm2[l2norm2 == 0.0] = 1.0  # Convert 0 values to 1
        reduced_matrix1 /= l2norm1.reshape(len(l2norm1), 1)
        reduced_matrix2 /= l2norm2.reshape(len(l2norm2), 1)

    reduced_space1 = Space(SparseMatrix(reduced_matrix1), id2row1,
                           intersected_columns)
    reduced_space2 = Space(SparseMatrix(reduced_matrix2), id2row2,
                           intersected_columns)

    if reduced_space1.get_id2column() != reduced_space2.get_id2column():
        sys.exit('Two spaces not properly aligned!')

    # Save the Space object in pickle format
    save_pkl_files(reduced_space1, outPath1 + '.sm', save_in_one_file=True)
    save_pkl_files(reduced_space2, outPath2 + '.sm', save_in_one_file=True)

    logging.info("--- %s seconds ---", time.time() - start_time)
def main():
    """
    Compute the smoothed and shifted (P)PMI matrix from a co-occurrence matrix.

    Smoothing is performed as described in

      Omer Levy, Yoav Goldberg, and Ido Dagan. 2015. Improving distributional
      similarity with lessons learned from word embeddings. Trans. ACL, 3.

    The rows are scaled by the inverse row sums and the columns by the
    inverse of the smoothed, normalized column sums (column sums raised to
    ``<alpha>``). The logarithm of the result is shifted by ``log(k)`` and
    every non-positive value is dropped. The space is optionally
    L2-normalized per row (``-l``) and saved under ``<outPath> + '.ppmi.sm'``.

    Raises:
        ValueError: if ``<k>`` is not a positive integer.
    """

    # Get the arguments
    args = docopt('''Compute the smoothed and shifted (P)PMI matrix from a co-occurrence matrix and save it in pickle format.

    Usage:
        ppmi.py [-l] <dsm_prefix> <k> <alpha> <outPath>

        <dsm_prefix> = the prefix for the input files (.sm for the matrix, .rows and .cols) and output files (.ppmi)
        <k> = shifting parameter
        <alpha> = smoothing parameter
        <outPath> = output path for space

    Options:
        -l, --len  normalize final vectors to unit length

    ''')

    is_len = args['--len']
    dsm_prefix = args['<dsm_prefix>']
    k = int(args['<k>'])
    alpha = float(args['<alpha>'])
    outPath = args['<outPath>']

    if k <= 0:
        # log(k) is undefined for k <= 0 and would fill the matrix with
        # inf/nan instead of failing.
        raise ValueError("<k> must be a positive integer, got %d" % k)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Get space with sparse matrix
    dsm = load_pkl_files(dsm_prefix)
    id2row = dsm.get_id2row()
    id2column = dsm.get_id2column()

    # Get probabilities
    matrix_ = dsm.cooccurrence_matrix

    matrix_.assert_positive()
    row_sum = matrix_.sum(axis=1)
    col_sum = matrix_.sum(axis=0)

    # Compute smoothed P_alpha(c)
    smooth_col_sum = np.power(col_sum, alpha)
    col_sum = smooth_col_sum / smooth_col_sum.sum()

    # Invert the sums, so that scaling by them below amounts to a division
    row_sum = nonzero_invert(row_sum)
    col_sum = nonzero_invert(col_sum)

    # Apply epmi weighting (without log)
    matrix_ = matrix_.scale_rows(row_sum)
    matrix_ = matrix_.scale_columns(col_sum)

    # Apply log weighting
    matrix_.mat.data = np.log(matrix_.mat.data)

    # Shift values
    matrix_.mat.data -= np.log(k)

    # Eliminate negative counts
    matrix_.mat.data[matrix_.mat.data <= 0] = 0.0

    # Eliminate zero counts
    matrix_.mat.eliminate_zeros()

    matrix_ = matrix_.get_mat()

    if is_len:
        # L2-normalize vectors
        l2norm1 = linalg.norm(matrix_, axis=1, ord=2)
        l2norm1[l2norm1 == 0.0] = 1.0  # Convert 0 values to 1
        matrix_ /= l2norm1.reshape(len(l2norm1), 1)

    dsm = Space(SparseMatrix(matrix_), id2row, id2column)

    # Save the Space object in pickle format
    save_pkl_files(dsm, outPath + ".ppmi.sm", save_in_one_file=False)

    logging.info("--- %s seconds ---", time.time() - start_time)
def train(self, train_data, arg_space, phrase_space):
    """
    Learns one lexical function per function word and stores the result
    as the function space of this model.

    Args:
        train_data: list of 3-string tuples (function_word, arg, phrase).

        arg_space: Space in which the arg elements of the training data
            are looked up.

        phrase_space: Space in which the phrase elements of the training
            data are looked up.

    Tuples containing an arg or phrase that is missing from its space are
    ignored, and so are function words with fewer than _MIN_SAMPLES
    training instances: with _MIN_SAMPLES=2, a function word "red" seen in
    a single phrase is skipped.

    The id2column attribute used for later composed spaces is taken over
    from the phrase space.

    Raises:
        ValueError: if arg_space is not a Space, or if no function word
            has enough valid training data.

    """
    start = time.time()

    self._has_intercept = self._regression_learner.has_intercept()

    if not isinstance(arg_space, Space):
        raise ValueError("expected one input spaces!")

    result_mats = []

    # Sorting by function word makes the samples of each word contiguous.
    train_data = sorted(train_data, key=lambda tup: tup[0])

    lookup_tables = (None, arg_space.row2id, phrase_space.row2id)
    function_word_list, arg_list, phrase_list = self.valid_data_to_lists(
        train_data, lookup_tables)

    # partitions the sorted input data
    keys, key_ranges = get_partitions(function_word_list, self._MIN_SAMPLES)

    if not keys:
        raise ValueError("No valid training data found!")

    assert (len(arg_space.element_shape) == 1)

    # With an intercept, every learned function has one extra input column.
    input_width = arg_space.element_shape[0]
    if self._has_intercept:
        input_width = input_width + 1
    new_element_shape = phrase_space.element_shape + (input_width, )
    flat_shape = (1, np.prod(new_element_shape))

    for position, (idx_beg, idx_end) in enumerate(key_ranges):
        print("Training lexical function...%s with %d samples" %
              (keys[position], idx_end - idx_beg))

        arg_mat = arg_space.get_rows(arg_list[idx_beg:idx_end])
        phrase_mat = phrase_space.get_rows(phrase_list[idx_beg:idx_end])

        # convert them to the same type
        matrix_type = get_type_of_largest([arg_mat, phrase_mat])
        arg_mat, phrase_mat = resolve_type_conflict([arg_mat, phrase_mat],
                                                    matrix_type)

        learned = self._regression_learner.train(arg_mat, phrase_mat)
        result_mat = learned.transpose()
        result_mat.reshape(flat_shape)

        result_mats.append(result_mat)

    new_space_mat = arg_mat.nary_vstack(result_mats)

    self.composed_id2column = phrase_space.id2column

    self._function_space = Space(new_space_mat, keys, [],
                                 element_shape=new_element_shape)

    log.print_composition_model_info(logger, self, 1,
                                     "\nTrained composition model:")
    log.print_info(logger, 3, "Trained: %s lexical functions" % len(keys))
    log.print_info(logger, 3,
                   "With total data points:%s" % len(function_word_list))
    log.print_matrix_info(logger, arg_space.cooccurrence_matrix, 3,
                          "Semantic space of arguments:")
    log.print_info(
        logger, 3,
        "Shape of lexical functions learned:%s" % (new_element_shape, ))
    log.print_matrix_info(logger, new_space_mat, 3,
                          "Semantic space of lexical functions:")
    log.print_time_info(logger, time.time(), start, 2)