Example #1
    def setUp(self):
        self.a = np.array([[1, 2, 3], [4, 0, 5]])
        self.space_s = Space(SparseMatrix(np.mat(self.a)), ["a", "b"],
                             ["f1", "f2", "f3"])

        self.space_d = Space(DenseMatrix(np.mat(self.a)), ["a", "b"],
                             ["f1", "f2", "f3"])
Example #2
    def setUp(self):
        self.dir_ = data_dir + "/space_test_resources/"
        self.init_test_cases = [(DenseMatrix(np.array([[1,2],[3,4]])),
                       ["car", "man"],
                       ["feat1", "feat2"],
                       {"man":1, "car":0},
                       {"feat1":0, "feat2":1},
                       [ScalingOperation(EpmiWeighting())]),
                      (DenseMatrix(np.array([[1,2],[3,4]])),
                       ["car", "man"],
                       [],
                       {"man":1, "car":0},
                       {},
                       [ScalingOperation(EpmiWeighting())])]

        self.m1 = np.array([[1,2,3]])
        self.row1 = ["a"]
        self.row2 = ["a", "b", "c"]
        self.ft1 = ["f1","f2","f3"]
        self.space1 = Space(DenseMatrix(self.m1),self.row1, self.ft1)

        self.x = np.mat([[1,2,3],[2,4,6],[4,675,43]])
        self.us = np.mat([[  2.19272110e+00,   3.03174768e+00],
                               [  4.38544220e+00,   6.06349536e+00],
                               [  6.76369708e+02,  -4.91431927e-02]])
        self.space2 = Space(DenseMatrix(self.x), self.row2, self.ft1)
Example #3
def main():
    """
    Convert temporal referencing matrix to regular (binned) matrix.
    """

    # Get the arguments
    args = docopt(
        """Convert temporal referencing matrix to regular (binned) matrix.

    Usage:
        tr2bin.py (-w | -s) <spacePrefix> <ref> <outPath>

        <spacePrefix> = path to pickled space without suffix
        <ref> = reference string
        <outPath> = output path for result file

    Options:
        -w, --w2v   save in w2v format
        -s, --sps   save in sparse matrix format
        
    """)

    is_w2v = args['--w2v']
    is_sps = args['--sps']
    spacePrefix = args['<spacePrefix>']
    ref = args['<ref>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load spaces
    space = load_pkl_files(spacePrefix)
    matrix = space.get_cooccurrence_matrix().get_mat()
    id2row = space.get_id2row()
    id2column = space.get_id2column()

    ti = [(spl[0], i) for i, w in enumerate(id2row) for spl in [w.split('_')]
          if len(spl) == 1 or (len(spl) == 2 and spl[1] == ref)]
    targets, indices = zip(*ti)

    new_matrix = matrix[list(indices), :]

    # Save the Space objects
    if is_w2v:
        new_space = Space(DenseMatrix(new_matrix), list(targets), id2column)
        save_pkl_files(new_space,
                       outPath,
                       save_in_one_file=True,
                       save_as_w2v=True)
    if is_sps:
        new_space = Space(SparseMatrix(new_matrix), list(targets), id2column)
        save_pkl_files(new_space,
                       outPath,
                       save_in_one_file=True,
                       save_as_w2v=False)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
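
The nested comprehension that selects rows is dense. Here is a minimal sketch on hypothetical row labels: a row survives if its label carries no time suffix at all, or if its suffix matches the requested reference.

# Toy illustration of the row filter above; labels and ref are hypothetical.
id2row = ["house_1920", "house_1990", "time"]
ref = "1990"

ti = [(spl[0], i) for i, w in enumerate(id2row) for spl in [w.split('_')]
      if len(spl) == 1 or (len(spl) == 2 and spl[1] == ref)]
targets, indices = zip(*ti)

print(targets)  # ('house', 'time')
print(indices)  # (1, 2)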
Example #4
    def setUp(self):
        self.ft = ["f1", "f2"]

        self.n_space = Space(DenseMatrix(np.mat([[3, 4], [5, 6]])),
                             ["car", "man"], self.ft)
        self.an_space = Space(DenseMatrix(np.mat([[3, 4], [5, 6]])),
                              ["a1_car", "a1_man"], self.ft)
Example #5
    def test_weighted_additive(self):

        self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
        self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
        self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))
        self.row = ["a", "b"]
        self.ft = ["f1", "f2"]
        self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
        self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)
        m = WeightedAdditive()
        m.export(self.prefix + ".add1")
        m.train([("a", "a", "a_a")], self.space1, self.space2)
        m.export(self.prefix + ".add2")
Example #6
    def test_dilation(self):

        self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
        self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
        self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))
        self.row = ["a", "b"]
        self.ft = ["f1", "f2"]
        self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
        self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)
        m = Dilation()
        m.export(self.prefix + ".dil1")
        m.train([("a", "b", "a_b")], self.space1, self.space2)
        m.export(self.prefix + ".dil2")
Example #7
    def test_train_intercept(self):
        a1_mat = DenseMatrix(np.mat([[3, 4], [5, 6]]))
        a2_mat = DenseMatrix(np.mat([[1, 2], [3, 4]]))

        train_data = [("a1", "man", "a1_man"),
                      ("a2", "car", "a2_car"),
                      ("a1", "boy", "a1_boy"),
                      ("a2", "boy", "a2_boy")
        ]

        n_mat = DenseMatrix(np.mat([[13, 21], [3, 4], [5, 6]]))
        n_space = Space(n_mat, ["man", "car", "boy"], self.ft)

        an1_mat = (a1_mat * n_mat.transpose()).transpose()
        an2_mat = (a2_mat * n_mat.transpose()).transpose()
        an_mat = an1_mat.vstack(an2_mat)

        an_space = Space(an_mat, ["a1_man", "a1_car", "a1_boy", "a2_man", "a2_car", "a2_boy"], self.ft)

        #test train
        model = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
        model.train(train_data, n_space, an_space)
        a_space = model.function_space

        a1_mat.reshape((1, 4))
        #np.testing.assert_array_almost_equal(a1_mat.mat,
        #                                     a_space.cooccurrence_matrix.mat[0])

        a2_mat.reshape((1, 4))
        #np.testing.assert_array_almost_equal(a2_mat.mat,
        #                                     a_space.cooccurrence_matrix.mat[1])

        self.assertListEqual(a_space.id2row, ["a1", "a2"])
        self.assertTupleEqual(a_space.element_shape, (2, 3))

        #test compose
        a1_mat = DenseMatrix(np.mat([[3, 4, 5, 6]]))
        a2_mat = DenseMatrix(np.mat([[1, 2, 3, 4]]))
        a_mat = a_space.cooccurrence_matrix

        a_space = Space(a_mat, ["a1", "a2"], [], element_shape=(2, 3))
        model = LexicalFunction(function_space=a_space, intercept=True)
        comp_space = model.compose(train_data, n_space)

        self.assertListEqual(comp_space.id2row, ["a1_man", "a2_car", "a1_boy", "a2_boy"])
        self.assertListEqual(comp_space.id2column, [])

        self.assertEqual(comp_space.element_shape, (2,))

        np.testing.assert_array_almost_equal(comp_space.cooccurrence_matrix.mat,
                                             an_mat[[0, 4, 2, 5]].mat, 8)
Example #8
    def test_vstack_raises(self):

        space3 = Space(DenseMatrix(self.x[0:2,0:1]), ["e","f"], self.ft1[0:1])
        space4 = Space(DenseMatrix(self.x[0:2,:]), ["a","f"], self.ft1)
        space5 = Space(DenseMatrix(self.x[0:2,:]), ["e","f"], [])
        space6 = Space(DenseMatrix(self.x[0:2,:]), ["e","f"], ["f1","f2","f4"])

        test_cases = [(self.space2, space3),
                      (self.space2, space4),
                      (self.space2, space5),
                      (self.space2, space6)
                      ]

        for space1, space2 in test_cases:
            self.assertRaises(ValueError, space1.vstack, space1, space2)
Example #9
    def test_full_additive(self):

        self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
        self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
        self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))
        self.row = ["a", "b"]
        self.ft = ["f1", "f2"]
        self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
        self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)
        m = FullAdditive()
        self.assertRaises(IllegalStateError, m.export, self.prefix + ".full1")
        m.train([("a", "b", "a_b"), ("a", "a", "a_a")], self.space1,
                self.space2)

        m.export(self.prefix + ".full2")
Example #10
    def test_lexical_function(self):

        self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
        self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
        self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))
        self.row = ["a", "b"]
        self.ft = ["f1", "f2"]
        self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
        self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)
        m = LexicalFunction()
        m._MIN_SAMPLES = 1
        self.assertRaises(IllegalStateError, m.export, self.prefix + ".lf1")
        m.train([("a", "b", "a_b"), ("a", "a", "a_a")], self.space1,
                self.space2)
        m.export(self.prefix + ".lf2")
Example #11
def main():
    parser = argparse.ArgumentParser(
        'Converts a VW topic output to a COMPOSES pkl file.')
    parser.add_argument('--input',
                        '-i',
                        type=argparse.FileType('r'),
                        help='Input file')
    parser.add_argument('--docnames',
                        '-d',
                        type=argparse.FileType('r'),
                        help='Docnames file')
    parser.add_argument('--output',
                        '-o',
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        help='Output file')

    args = parser.parse_args()
    docnames = [l for l in (l.strip() for l in args.docnames) if l]
    matrix = None
    for i, line in enumerate(args.input):
        line = line.strip()
        weights = [float(x) for x in line.split(" ")]
        if matrix is None:
            matrix = np.zeros((len(docnames), len(weights)), dtype=float)
        matrix[i] = np.array(weights)

    dm = DenseMatrix(matrix)
    sp = Space(dm, docnames, [])
    pickle.dump(sp, args.output)
    args.output.close()
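
A minimal, self-contained sketch of the conversion loop above on in-memory toy data; the two "documents" and their topic weights are hypothetical.

import io
import numpy as np

docnames = ["doc1", "doc2"]
vw_output = io.StringIO("0.7 0.3\n0.1 0.9\n")  # one line of topic weights per document

matrix = None
for i, line in enumerate(vw_output):
    weights = [float(x) for x in line.strip().split(" ")]
    if matrix is None:
        matrix = np.zeros((len(docnames), len(weights)), dtype=float)
    matrix[i] = weights

print(matrix)  # [[0.7 0.3]
               #  [0.1 0.9]]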
Example #12
def main():
    """
    Transform EPMI matrix in npz format to SPPMI space and save as pickle file.
    """

    # Get the arguments
    args = docopt(
        '''Transform EPMI matrix in npz format to SPPMI space and save as pickle file.

    Usage:
        transform_matrix_epmi2sppmi.py <spacePrefix> <outPath> <k>

        <spacePrefix> = path to npz without suffix
        <outPath> = output path for space
        <k> = shifting parameter
    
    ''')

    spacePrefix = args['<spacePrefix>']
    outPath = args['<outPath>']
    k = int(args['<k>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Get npz matrix
    with np.load(spacePrefix + '.npz') as loader:
        matrix = csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=loader['shape'])

    with open(spacePrefix + '.words.vocab') as f:
        id2row = [line.strip() for line in f if line.strip()]

    with open(spacePrefix + '.contexts.vocab') as f:
        id2column = [line.strip() for line in f if line.strip()]

    # Apply log weighting
    matrix.data = np.log(matrix.data)

    # Shift values
    matrix.data -= np.log(k)

    # Clip negative values
    matrix.data[matrix.data <= 0] = 0.0

    # Drop the resulting explicit zeros
    matrix.eliminate_zeros()

    # Create new space
    sparseSpace = Space(SparseMatrix(matrix), id2row, id2column)

    #print sparseSpace.get_cooccurrence_matrix()

    # Save the Space object in pickle format
    save_pkl_files(sparseSpace, outPath + 'ppmi.sm', save_in_one_file=True)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
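
The three in-place steps implement SPPMI = max(log(EPMI) - log(k), 0) on the non-zero entries. A minimal sketch with toy values and k = 2:

import numpy as np
from scipy.sparse import csr_matrix

epmi = csr_matrix(np.array([[4.0, 0.5], [1.0, 8.0]]))  # hypothetical EPMI values
k = 2

epmi.data = np.log(epmi.data)     # log weighting
epmi.data -= np.log(k)            # shift by log(k)
epmi.data[epmi.data <= 0] = 0.0   # clip negative values
epmi.eliminate_zeros()            # drop explicit zeros

print(epmi.toarray().round(3))    # [[0.693 0.   ]
                                  #  [0.    1.386]]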
Example #13
def load_pkl_files(dsm_prefix):
    """
    Load the space from either a single pkl file or numerous files.
    :param dsm_prefix: the prefix of the input files
    """
    
    # Check whether there is a single pickle file for the Space object
    if os.path.isfile(dsm_prefix + '.pkl'):
        return io_utils.load(dsm_prefix + '.pkl')

    # Load the multiple files: npz for the matrix and pkl for the other data members of Space
    with np.load(dsm_prefix + 'cooc.npz') as loader:
        coo = coo_matrix((loader['data'], (loader['row'], loader['col'])), shape=loader['shape'])

    cooccurrence_matrix = SparseMatrix(csr_matrix(coo))

    with open(dsm_prefix + '_row2id.pkl', 'rb') as f_in:
        row2id = pickle.load(f_in)

    with open(dsm_prefix + '_id2row.pkl', 'rb') as f_in:
        id2row = pickle.load(f_in)

    with open(dsm_prefix + '_column2id.pkl', 'rb') as f_in:
        column2id = pickle.load(f_in)

    with open(dsm_prefix + '_id2column.pkl', 'rb') as f_in:
        id2column = pickle.load(f_in)

    return Space(cooccurrence_matrix, id2row, id2column, row2id=row2id, column2id=column2id)
Example #14
def read_mikolov(spacefile):
    header = spacefile.readline().rstrip()
    vocab_s, dims = map(int, header.split(" "))

    vocab = []

    # init matrix
    matrix = np.zeros((vocab_s, dims), dtype=float)

    i = 0
    while True:
        line = spacefile.readline()
        if not line:
            break
        sep = line.find(" ")
        if sep == -1:
            raise ValueError(
                "Couldn't find the vocab/data separation character! Space file corruption?"
            )

        word = line[:sep]
        data = line[sep + 1:]
        if len(data) < FLOAT_SIZE * dims + 1:
            data += spacefile.read(FLOAT_SIZE * dims + 1 - len(data))
        data = data[:-1]
        vocab.append(word)
        vector = struct.unpack("%df" % dims, data)
        matrix[i] = vector
        i += 1

    dm = DenseMatrix(matrix)
    sp = Space(dm, vocab, [])

    return sp
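
read_mikolov parses the binary word2vec layout: a word, a space, then dims 32-bit floats; FLOAT_SIZE is assumed to be 4 (bytes per float) in the surrounding module. A minimal pack/unpack round trip shows the struct call used above:

import struct

dims = 3
packed = struct.pack("%df" % dims, 1.0, 2.0, 3.0)  # 12 bytes of raw float data
vector = struct.unpack("%df" % dims, packed)       # same call as in read_mikolov
print(vector)  # (1.0, 2.0, 3.0)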
Example #15
def main():
    parser = argparse.ArgumentParser(
        description="Converts a vecf file to dissect pkl format.")
    parser.add_argument('--input',
                        '-i',
                        type=argparse.FileType('r'),
                        help='Input file')
    parser.add_argument('--output',
                        '-o',
                        type=argparse.FileType('w'),
                        help='Output file')
    args = parser.parse_args()

    header = args.input.readline().rstrip()
    vocab_s, dims = map(int, header.split(" "))

    vocab = []

    # init matrix
    matrix = np.zeros((vocab_s, dims), dtype=float)

    for i, line in enumerate(args.input):
        data = line.split()
        vector = np.array([float(x) for x in data[1:]])
        word = data[0]
        vocab.append(word)
        matrix[i] = vector

    dm = DenseMatrix(matrix)
    sp = Space(dm, vocab, [])
    pickle.dump(sp, args.output)
    args.output.close()
Example #16
    def compose(self, data, arg_space):
        """
        Uses a composition model to compose elements.

        Args:
            data: data to be composed. List of tuples, each containing 3
            strings: (arg1, arg2, composed_phrase). arg1 and arg2 are the
            elements to be composed and composed_phrase is the string associated
            to their composition.

            arg_space: argument space(s). Space object or a tuple of two
            Space objects (e.g. my_space, or (my_space1, my_space2)).
            If two spaces are provided, arg1 elements of data are
            interpreted in space1, and arg2 in space2.

        Returns:
            composed space: a new object of type Space, containing the
            phrases obtained through composition.

        """
        start = time.time()

        arg1_space, arg2_space = self.extract_arg_spaces(arg_space)
        arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(data,
                                                                     (arg1_space.row2id,
                                                                      arg2_space.row2id,
                                                                      None))
        
        # Aim for at most MAX_MEM_OVERHEAD * phrase_space memory overhead.
        # The division by 3.0 is needed because composing holds three vectors
        # per data point (arg1 vector, arg2 vector, phrase vector).
        chunk_size = int(max(arg1_space.cooccurrence_matrix.shape[0],
                             arg2_space.cooccurrence_matrix.shape[0],
                             len(phrase_list))
                         * self.MAX_MEM_OVERHEAD / 3.0) + 1
        
        composed_mats = []
        for i in range(int(math.ceil(len(arg1_list) / float(chunk_size)))):
            beg, end = i*chunk_size, min((i+1)*chunk_size, len(arg1_list))

            arg1_mat = arg1_space.get_rows(arg1_list[beg:end])
            arg2_mat = arg2_space.get_rows(arg2_list[beg:end])

            [arg1_mat, arg2_mat] = resolve_type_conflict([arg1_mat, arg2_mat],
                                                                    DenseMatrix)
            composed_mat = self._compose(arg1_mat, arg2_mat)
            composed_mats.append(composed_mat)
        
        composed_phrase_mat = composed_mat.nary_vstack(composed_mats)
        
        if self.composed_id2column is None:
            self.composed_id2column = self._build_id2column(arg1_space, arg2_space)

        log.print_name(logger, self, 1, "\nComposed with composition model:")
        log.print_info(logger, 3, "Composed total data points:%s" % arg1_mat.shape[0])
        log.print_matrix_info(logger, composed_phrase_mat, 4,
                              "Resulted (composed) semantic space::")
        log.print_time_info(logger, time.time(), start, 2)
        
        return Space(composed_phrase_mat, phrase_list, self.composed_id2column)
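
A hedged end-to-end usage sketch for compose(); the import paths are assumed to match the DISSECT installation used by these examples, and the matrices and labels are toy values.

import numpy as np
from composes.semantic_space.space import Space
from composes.matrix.dense_matrix import DenseMatrix
from composes.composition.weighted_additive import WeightedAdditive

arg_space = Space(DenseMatrix(np.mat([[3, 1], [9, 2]])), ["a", "b"],
                  ["f1", "f2"])

# Additive model with fixed weights: phrase = 0.5 * arg1 + 0.5 * arg2
model = WeightedAdditive(alpha=0.5, beta=0.5)
composed_space = model.compose([("a", "b", "a_b")], arg_space)

print(composed_space.id2row)                   # ['a_b']
print(composed_space.cooccurrence_matrix.mat)  # [[6.  1.5]]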
Example #17
    def setUp(self):
        self.m1 = np.array([[1, 2, 3]])
        self.row1 = ["a"]
        self.ft1 = ["f1", "f2", "f3"]
        self.space1 = Space(DenseMatrix(self.m1), self.row1, self.ft1)

        self.m2 = np.array([[4, 2, 6]])
        self.row2 = ["b"]
        self.row3 = ["a", "b", "c"]

        self.x = np.mat([[1, 2, 3], [2, 4, 6], [4, 675, 43]])
        self.us = np.mat([[2.19272110e+00, 3.03174768e+00],
                          [4.38544220e+00, 6.06349536e+00],
                          [6.76369708e+02, -4.91431927e-02]])
        self.us2 = np.mat([[2.19272110e+00], [4.38544220e+00],
                           [6.76369708e+02]])

        self.space2 = Space(DenseMatrix(self.x), self.row3, self.ft1)
Example #18
    def compose(self, data, arg_space):
        """
        Uses a lexical function composition model to compose elements.

        Args:
            data: data to be composed. List of tuples, each containing 3
            strings: (function_word, arg, composed_phrase). function_word and
            arg are the elements to be composed and composed_phrase is the
            string associated to their composition. function_word elements
            are interpreted in self.function_space.

            arg_space: argument space, of type Space. arg elements of data are
            interpreted in this space.

        Returns:
            composed space: a new object of type Space, containing the
            phrases obtained through composition.

        """
        start = time.time()

        assert_is_instance(arg_space, Space)
        arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(
            data, (self._function_space.row2id, arg_space.row2id, None))

        composed_vec_list = []
        for i in range(len(arg1_list)):
            arg1_vec = self._function_space.get_row(arg1_list[i])
            arg2_vec = arg_space.get_row(arg2_list[i])

            matrix_type = get_type_of_largest([arg1_vec, arg2_vec])
            [arg1_vec, arg2_vec] = resolve_type_conflict([arg1_vec, arg2_vec],
                                                         matrix_type)

            composed_ph_vec = self._compose(arg1_vec, arg2_vec,
                                            self._function_space.element_shape)

            composed_vec_list.append(composed_ph_vec)

        result_element_shape = self._function_space.element_shape[0:-1]
        composed_ph_mat = composed_ph_vec.nary_vstack(composed_vec_list)

        log.print_name(logger, self, 1, "\nComposed with composition model:")
        log.print_info(logger, 3,
                       "Composed total data points:%s" % len(arg1_list))
        log.print_info(
            logger, 3,
            "Functional shape of the resulted (composed) elements:%s" %
            (result_element_shape, ))
        log.print_matrix_info(logger, composed_ph_mat, 4,
                              "Resulted (composed) semantic space:")
        log.print_time_info(logger, time.time(), start, 2)

        return Space(composed_ph_mat,
                     phrase_list,
                     self.composed_id2column,
                     element_shape=result_element_shape)
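
Per pair, the _compose step reshapes the stored function row back into a matrix and multiplies it with the argument vector. A numpy-only sketch of that step, with a hypothetical 2x2 lexical function and no intercept:

import numpy as np

function_row = np.array([3, 4, 5, 6])  # flattened 2x2 lexical function for "a1"
arg_vec = np.array([13, 21])           # argument vector

A = function_row.reshape((2, 2))       # recover the function matrix
composed = A.dot(arg_vec)              # phrase vector
print(composed)                        # [123 191]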
Example #19
    def test_init1(self):
        for (m, id2row, id2col, row2id, col2id, ops) in self.init_test_cases:
            space_ = Space(m, id2row, id2col)
            self.assertIs(m, space_.cooccurrence_matrix)
            self.assertIs(id2row, space_.id2row)
            self.assertIs(id2col, space_.id2column)
            self.assertDictEqual(row2id, space_.row2id)
            self.assertDictEqual(col2id, space_.column2id)
            self.assertListEqual([], space_.operations)
Example #20
    def test_init4(self):
        for (m, id2row, id2col, row2id, col2id, ops) in self.init_test_cases:
            space_ = Space(m, id2row, id2col, row2id, col2id, operations=ops)
            self.assertIs(m, space_.cooccurrence_matrix)
            self.assertIs(id2row, space_.id2row)
            self.assertIs(id2col, space_.id2column)
            self.assertIs(row2id, space_.row2id)
            self.assertIs(col2id, space_.column2id)
            self.assertIs(ops, space_.operations)
Example #21
def load_pkl_files(dsm_prefix):
    """
    Load the space from either a single pkl file or numerous files.
    :param dsm_prefix: the prefix of the input files (.pkl, .rows, .cols)
    """

    # Check whether there is a single pickle file for the Space object
    if os.path.isfile(dsm_prefix + '.pkl'):
        return io_utils.load(dsm_prefix + '.pkl')

    # Load the multiple files: npz for the matrix and pkl for the other data members of Space
    if os.path.isfile(dsm_prefix + '.npz'):
        with np.load(dsm_prefix + '.npz') as loader:
            coo = coo_matrix((loader['data'], (loader['row'], loader['col'])), shape=loader['shape'])

        cooccurrence_matrix = SparseMatrix(csr_matrix(coo))

        with open(dsm_prefix + '_row2id.pkl', 'rb') as f_in:
            row2id = pickle.load(f_in)

        with open(dsm_prefix + '_id2row.pkl', 'rb') as f_in:
            id2row = pickle.load(f_in)

        with open(dsm_prefix + '_column2id.pkl', 'rb') as f_in:
            column2id = pickle.load(f_in)

        with open(dsm_prefix + '_id2column.pkl', 'rb') as f_in:
            id2column = pickle.load(f_in)

        return Space(cooccurrence_matrix, id2row, id2column, row2id=row2id, column2id=column2id)

    if os.path.isfile(dsm_prefix + '.tsv'):
        values = np.loadtxt(dsm_prefix + '.tsv', dtype=float, delimiter='\t', skiprows=0, comments=None, encoding='utf-8')
        targets = np.loadtxt(dsm_prefix + '.rows', dtype=str, skiprows=0, comments=None, encoding='utf-8')
        # Convert to space in sparse matrix format        
        return Space(SparseMatrix(values), list(targets), [])
    
    # If everything else fails, try to load it as a single w2v file
    space_array = np.loadtxt(dsm_prefix + '.w2v', dtype=object, delimiter=' ', skiprows=1, comments=None, encoding='utf-8')
    targets = space_array[:,0].flatten()
    values = space_array[:,1:].astype(float)
    # Convert to space in sparse matrix format
    return Space(SparseMatrix(values), list(targets), [])
Example #22
    def test_vstack(self):
        space3 = Space(DenseMatrix(self.x[0:2,:]), ["e","f"], self.ft1)

        space4 = Space(DenseMatrix(np.vstack((self.x, self.x[0:2,:]))),
                       ["a", "b", "c", "e","f"], self.ft1)

        test_cases = [(self.space2, space3, space4)]
        for space1, space2, expected_space in test_cases:
            outcome = space1.vstack(space1, space2)
            np.testing.assert_array_equal(expected_space.cooccurrence_matrix.mat,
                                          outcome.cooccurrence_matrix.mat)

            self.assertListEqual(outcome.id2column, space1.id2column)
            self.assertListEqual(outcome.id2column, expected_space.id2column)

            self.assertDictEqual(outcome.column2id, space1.column2id)
            self.assertDictEqual(outcome.column2id, expected_space.column2id)

            self.assertListEqual(outcome.id2row, expected_space.id2row)
            self.assertDictEqual(outcome.row2id, expected_space.row2id)

            self.assertListEqual([], outcome.operations)
Example #23
    def compose(self, data, arg_space):
        """
        Uses a composition model to compose elements.

        Args:
            data: data to be composed. List of tuples, each containing 3
            strings: (arg1, arg2, composed_phrase). arg1 and arg2 are the
            elements to be composed and composed_phrase is the string associated
            to their composition.

            arg_space: argument space(s). Space object or a tuple of two
            Space objects (e.g. my_space, or (my_space1, my_space2)).
            If two spaces are provided, arg1 elements of data are
            interpreted in space1, and arg2 in space2.

        Returns:
            composed space: a new object of type Space, containing the
            phrases obtained through composition.

        """
        start = time.time()

        arg1_space, arg2_space = self.extract_arg_spaces(arg_space)
        arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(
            data, (arg1_space.row2id, arg2_space.row2id, None))

        arg1_mat = arg1_space.get_rows(arg1_list)
        arg2_mat = arg2_space.get_rows(arg2_list)

        [arg1_mat, arg2_mat] = resolve_type_conflict([arg1_mat, arg2_mat],
                                                     DenseMatrix)

        composed_phrase_mat = self._compose(arg1_mat, arg2_mat)
        if self.composed_id2column is None:
            self.composed_id2column = self._build_id2column(
                arg1_space, arg2_space)

        log.print_name(logger, self, 1, "\nComposed with composition model:")
        log.print_info(logger, 3,
                       "Composed total data points:%s" % arg1_mat.shape[0])
        log.print_matrix_info(logger, composed_phrase_mat, 4,
                              "Resulted (composed) semantic space::")
        log.print_time_info(logger, time.time(), start, 2)

        return Space(composed_phrase_mat, phrase_list, self.composed_id2column)
Example #24
def main():
    """
    Convert txt matrix to w2v matrix and save.
    """

    # Get the arguments
    args = docopt('''Convert txt matrix to w2v matrix and save.

    Usage:
        convert_matrix_txt2w2v.py <spacePrefix> <outPath>

        <spacePrefix> = path to npz without suffix
        <outPath> = output path for space
    
    ''')

    spacePrefix = args['<spacePrefix>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    space_array = np.loadtxt(spacePrefix + '.txt',
                             dtype=object,
                             delimiter=' ',
                             skiprows=0,
                             comments='',
                             encoding='utf-8')
    targets = space_array[:, 0].flatten()
    values = space_array[:, 1:].astype(float)
    # Create new space
    sparseSpace = Space(DenseMatrix(coo_matrix(values)), list(targets), [])

    #print sparseSpace.get_row('wood').get_mat().toarray()[0].tolist()[id2column.index('inexhaustible')]

    # Save the Space object in pickle format
    save_pkl_files(sparseSpace,
                   outPath,
                   save_in_one_file=True,
                   save_as_w2v=True)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Example #25
    def setUp(self):
        self.m11 = DenseMatrix(np.mat([[3], [9]]))
        self.m21 = DenseMatrix(np.mat([[4], [2]]))
        self.ph1 = DenseMatrix(np.mat([[18], [24]]))

        self.space1 = Space(SparseMatrix(np.mat([[3, 9], [4, 2]])), ["a", "b"],
                            ["f1", "f2"])
        self.space2 = Space(SparseMatrix(np.mat([[7, 11]])), ["a_b"],
                            ["f1", "f2"])
        self.space3 = Space(SparseMatrix(np.mat([[0, 0]])), ["a_b"],
                            ["f1", "f2"])

        self.space4 = Space(DenseMatrix(np.mat([[3, 9], [4, 2]])), ["a", "b"],
                            ["f1", "f2"])
        self.space5 = Space(DenseMatrix(np.mat([[7, 11]])), ["a_b"],
                            ["f1", "f2"])
        self.space6 = Space(DenseMatrix(np.mat([[0, 0]])), ["a_b"],
                            ["f1", "f2"])
        self.space7 = Space(DenseMatrix(np.mat([[7, 11], [7, 11]])),
                            ["a_b", "a_a"], ["f1", "f2"])
Example #26
def main():
    """
    Make count-based vector space from corpus.
    """

    # Get the arguments
    args = docopt("""Make count-based vector space from corpus.

    Usage:
        count.py [-l] <windowSize> <corpDir> <outPath> <lowerBound> <upperBound>
        
    Arguments:
       
        <corpDir> = path to corpus directory with zipped files, each sentence in form 'year\tword1 word2 word3...'
        <outPath> = output path for vectors
        <windowSize> = the linear distance of context words to consider in each direction
        <lowerBound> = lower bound for time period
        <upperBound> = upper bound for time period

    Options:
        -l, --len   normalize final vectors to unit length

    """)

    is_len = args['--len']
    corpDir = args['<corpDir>']
    outPath = args['<outPath>']
    windowSize = int(args['<windowSize>'])
    lowerBound = int(args['<lowerBound>'])
    upperBound = int(args['<upperBound>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Build vocabulary
    logging.info("Building vocabulary")
    sentences = PathLineSentences_mod(corpDir,
                                      lowerBound=lowerBound,
                                      upperBound=upperBound)
    vocabulary = list(
        set([
            word for sentence in sentences for word in sentence
            if len(sentence) > 1
        ]))  # Skip one-word sentences to avoid zero-vectors
    w2i = {w: i for i, w in enumerate(vocabulary)}

    # Initialize co-occurrence matrix as dictionary
    cooc_mat = defaultdict(lambda: 0)

    # Get counts from corpus
    sentences = PathLineSentences_mod(corpDir,
                                      lowerBound=lowerBound,
                                      upperBound=upperBound)
    logging.info("Counting context words")
    for sentence in sentences:
        for i, word in enumerate(sentence):
            lowerWindowSize = max(i - windowSize, 0)
            upperWindowSize = min(i + windowSize, len(sentence))
            window = (sentence[lowerWindowSize:i]
                      + sentence[i + 1:upperWindowSize + 1])
            if len(window) == 0:  # Skip one-word sentences
                continue
            windex = w2i[word]
            for contextWord in window:
                cooc_mat[(windex, w2i[contextWord])] += 1

    # Convert dictionary to sparse matrix
    logging.info("Converting dictionary to matrix")
    cooc_mat_sparse = dok_matrix((len(vocabulary), len(vocabulary)),
                                 dtype=float)
    try:
        cooc_mat_sparse.update(cooc_mat)
    except NotImplementedError:
        cooc_mat_sparse._update(cooc_mat)

    if is_len:
        # L2-normalize vectors
        l2norm1 = linalg.norm(cooc_mat_sparse, axis=1, ord=2)
        l2norm1[l2norm1 == 0.0] = 1.0  # Convert 0 values to 1
        cooc_mat_sparse /= l2norm1.reshape(len(l2norm1), 1)

    # Make space
    vocabulary = [v.encode('utf-8') for v in vocabulary]
    countSpace = Space(SparseMatrix(cooc_mat_sparse), vocabulary, vocabulary)

    # Save the Space object in pickle format
    save_pkl_files(countSpace, outPath, save_in_one_file=False)

    logging.info("Corpus has size %d" % sentences.corpusSize)
    logging.info("--- %s seconds ---" % (time.time() - start_time))
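
The window slicing takes up to windowSize context words on each side of the target, clipped at the sentence boundaries. A minimal sketch on a toy sentence:

sentence = ["the", "cat", "sat", "on", "the", "mat"]
windowSize = 2
i = 2  # target word "sat"

lower = max(i - windowSize, 0)
upper = min(i + windowSize, len(sentence))
window = sentence[lower:i] + sentence[i + 1:upper + 1]
print(window)  # ['the', 'cat', 'on', 'the']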
Example #27
    def test_export(self):

        out_file = self.dir_ + "tmp"
        mat1 = np.mat([[1,2],[3,0]])
        mat1row, mat1col = ["a","b"], ["f1","f2"]

        mat2 = np.mat([[0,0]])
        mat2row, mat2col = ["a"], []

        test_cases = [(Space(DenseMatrix(mat1), mat1row, mat1col),
                       Space(SparseMatrix(mat1), mat1row, mat1col)),
                       (Space(DenseMatrix(mat2), mat2row, mat1col),
                       Space(SparseMatrix(mat2), mat2row, mat1col))]

        # four export/rebuild round-trips per test case
        for sp_d, sp_s in test_cases:

            self.reset_export_files(out_file)
            sp_d.export(out_file, format="dm")
            new_sp = Space.build(data=out_file + ".dm",
                                 rows=out_file + ".rows",
                                 cols=out_file + ".cols", format="dm")
            self._test_equal_spaces_dense(sp_d, new_sp)

            self.reset_export_files(out_file)
            sp_d.export(out_file, format="sm")
            new_sp = Space.build(data=out_file + ".sm",
                                 rows=out_file + ".rows",
                                 cols=out_file + ".cols", format="sm")
            self._test_equal_spaces_sparse(sp_s, new_sp)

            self.reset_export_files(out_file)
            sp_s.export(out_file, format="sm")
            new_sp = Space.build(data=out_file + ".sm",
                                 rows=out_file + ".rows",
                                 cols=out_file + ".cols", format="sm")
            self._test_equal_spaces_sparse(sp_s, new_sp)

            self.reset_export_files(out_file)
            sp_s.export(out_file, format="dm")
            new_sp = Space.build(data=out_file + ".dm",
                                 rows=out_file + ".rows",
                                 cols=out_file + ".cols", format="dm")

            self._test_equal_spaces_dense(sp_d, new_sp)

        test_cases = [(Space(DenseMatrix(mat2), mat2row, mat2col),
                       Space(SparseMatrix(mat2), mat2row, mat2col))]

        for sp_d, sp_s in test_cases:

            self.reset_export_files(out_file)
            sp_d.export(out_file, format="dm")
            new_sp = Space.build(data=out_file + ".dm",
                                 rows=out_file + ".rows",
                                 format="dm")
            self._test_equal_spaces_dense(sp_d, new_sp)

            self.reset_export_files(out_file)
            sp_s.export(out_file, format="dm")
            new_sp = Space.build(data=out_file + ".dm",
                                 rows=out_file + ".rows",
                                 format="dm")

            self._test_equal_spaces_dense(sp_d, new_sp)
Example #28
def main():
    """
    Align two sparse matrices by intersecting their columns.
    """

    # Get the arguments
    args = docopt('''Align two sparse matrices by intersecting their columns.

    Usage:
        count_alignment_intersect.py [-l] <outPath1> <outPath2> <spacePrefix1> <spacePrefix2>

        <outPath1> = output path for aligned space 1
        <outPath2> = output path for aligned space 2
        <spacePrefix1> = path to pickled space1 without suffix
        <spacePrefix2> = path to pickled space2 without suffix

    Options:
        -l, --len   normalize final vectors to unit length
    
    ''')

    is_len = args['--len']
    spacePrefix1 = args['<spacePrefix1>']
    spacePrefix2 = args['<spacePrefix2>']
    outPath1 = args['<outPath1>']
    outPath2 = args['<outPath2>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    space1 = load_pkl_files(spacePrefix1)
    space2 = load_pkl_files(spacePrefix2)
    id2row1 = space1.get_id2row()
    id2row2 = space2.get_id2row()
    id2column1 = space1.get_id2column()
    id2column2 = space2.get_id2column()
    column2id1 = space1.get_column2id()
    column2id2 = space2.get_column2id()
    intersected_columns = list(set(id2column1).intersection(id2column2))
    intersected_columns_id1 = [
        column2id1[item] for item in intersected_columns
    ]
    intersected_columns_id2 = [
        column2id2[item] for item in intersected_columns
    ]
    reduced_matrix1 = space1.get_cooccurrence_matrix(
    )[:, intersected_columns_id1].get_mat()
    reduced_matrix2 = space2.get_cooccurrence_matrix(
    )[:, intersected_columns_id2].get_mat()

    if is_len:
        # L2-normalize vectors
        l2norm1 = linalg.norm(reduced_matrix1, axis=1, ord=2)
        l2norm2 = linalg.norm(reduced_matrix2, axis=1, ord=2)
        l2norm1[l2norm1 == 0.0] = 1.0  # Convert 0 values to 1
        l2norm2[l2norm2 == 0.0] = 1.0  # Convert 0 values to 1
        reduced_matrix1 /= l2norm1.reshape(len(l2norm1), 1)
        reduced_matrix2 /= l2norm2.reshape(len(l2norm2), 1)

    reduced_space1 = Space(SparseMatrix(reduced_matrix1), id2row1,
                           intersected_columns)
    reduced_space2 = Space(SparseMatrix(reduced_matrix2), id2row2,
                           intersected_columns)

    if reduced_space1.get_id2column() != reduced_space2.get_id2column():
        sys.exit('Two spaces not properly aligned!')

    # Save the Space object in pickle format
    save_pkl_files(reduced_space1, outPath1 + '.sm', save_in_one_file=True)
    save_pkl_files(reduced_space2, outPath2 + '.sm', save_in_one_file=True)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
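
The alignment works because each matrix is sliced with its own id list for the same intersected column order. A minimal sketch with hypothetical vocabularies (set iteration order may vary between runs, but it is the same for both slices within one run):

id2column1 = ["cat", "dog", "fish"]
id2column2 = ["dog", "bird", "cat"]
column2id1 = {w: i for i, w in enumerate(id2column1)}
column2id2 = {w: i for i, w in enumerate(id2column2)}

intersected = list(set(id2column1).intersection(id2column2))
ids1 = [column2id1[w] for w in intersected]
ids2 = [column2id2[w] for w in intersected]
# For the order ['cat', 'dog'], ids1 == [0, 1] and ids2 == [2, 0]:
# column j of matrix1[:, ids1] and matrix2[:, ids2] is the same context word.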
Example #29
def main():
    """
    Compute the smoothed and shifted (P)PMI matrix from a co-occurrence matrix. Smoothing is performed as described in

      Omer Levy, Yoav Goldberg, and Ido Dagan. 2015. Improving distributional similarity with lessons learned from word embeddings. Trans. ACL, 3.

    """

    # Get the arguments
    args = docopt(
        '''Compute the smoothed and shifted (P)PMI matrix from a co-occurrence matrix and save it in pickle format.

    Usage:
        ppmi.py [-l] <dsm_prefix> <k> <alpha> <outPath>

        <dsm_prefix> = the prefix for the input files (.sm for the matrix, .rows and .cols) and output files (.ppmi)
        <k> = shifting parameter
        <alpha> = smoothing parameter
        <outPath> = output path for space

    Options:
        -l, --len   normalize final vectors to unit length

    ''')

    is_len = args['--len']
    dsm_prefix = args['<dsm_prefix>']
    k = int(args['<k>'])
    alpha = float(args['<alpha>'])
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Get space with sparse matrix
    dsm = load_pkl_files(dsm_prefix)
    id2row = dsm.get_id2row()
    id2column = dsm.get_id2column()

    # Get probabilities
    matrix_ = dsm.cooccurrence_matrix

    matrix_.assert_positive()
    row_sum = matrix_.sum(axis=1)
    col_sum = matrix_.sum(axis=0)

    # Compute smoothed P_alpha(c)
    smooth_col_sum = np.power(col_sum, alpha)
    col_sum = smooth_col_sum / smooth_col_sum.sum()

    # Compute P(w)
    row_sum = nonzero_invert(row_sum)
    col_sum = nonzero_invert(col_sum)

    # Apply epmi weighting (without log)
    matrix_ = matrix_.scale_rows(row_sum)
    matrix_ = matrix_.scale_columns(col_sum)

    # Apply log weighting
    matrix_.mat.data = np.log(matrix_.mat.data)

    # Shift values
    matrix_.mat.data -= np.log(k)

    # Clip negative values
    matrix_.mat.data[matrix_.mat.data <= 0] = 0.0

    # Drop the resulting explicit zeros
    matrix_.mat.eliminate_zeros()

    matrix_ = matrix_.get_mat()

    if is_len:
        # L2-normalize vectors
        l2norm1 = linalg.norm(matrix_, axis=1, ord=2)
        l2norm1[l2norm1 == 0.0] = 1.0  # Convert 0 values to 1
        matrix_ /= l2norm1.reshape(len(l2norm1), 1)

    dsm = Space(SparseMatrix(matrix_), id2row, id2column)

    # Save the Space object in pickle format
    save_pkl_files(dsm, outPath + ".ppmi.sm", save_in_one_file=False)
    logging.info("--- %s seconds ---" % (time.time() - start_time))
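
Context-distribution smoothing raises every context count to the power alpha before normalizing, which lifts the probability of rare contexts. A worked toy example with alpha = 0.75, the value recommended by Levy et al. (2015); the counts are hypothetical:

import numpy as np

col_sum = np.array([100.0, 10.0, 1.0])  # raw context counts
smooth = np.power(col_sum, 0.75)        # [31.623  5.623  1.   ]
p_alpha = smooth / smooth.sum()         # smoothed context distribution
print(p_alpha.round(3))                 # [0.827 0.147 0.026]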
Example #30
    def train(self, train_data, arg_space, phrase_space):
        """
        Trains a lexical function composition model to learn a function
        space and sets the function_space parameter.

        Args:
            train_data: list of string tuples. Each tuple contains 3
            string elements: (function_word, arg, phrase).

            arg_space: argument space, of type Space. arg elements of
            train data are interpreted in this space.

            phrase_space: phrase space, of type Space. phrase elements of
            the train data are interpreted in this space.

        Training tuples which contain strings not found in their
        respective spaces are ignored. Function words occurring in fewer
        than _MIN_SAMPLES training instances are also ignored. For example,
        if _MIN_SAMPLES=2 and the function word "red" occurs in only one
        phrase, "red" is ignored.

        The id2column attribute of the resulting composed space is set
        equal to that of the phrase space given as input.
        """

        start = time.time()

        self._has_intercept = self._regression_learner.has_intercept()

        if not isinstance(arg_space, Space):
            raise ValueError("expected one input spaces!")

        result_mats = []

        train_data = sorted(train_data, key=lambda tup: tup[0])
        function_word_list, arg_list, phrase_list = self.valid_data_to_lists(
            train_data, (None, arg_space.row2id, phrase_space.row2id))
        #partitions the sorted input data
        keys, key_ranges = get_partitions(function_word_list,
                                          self._MIN_SAMPLES)

        if not keys:
            raise ValueError("No valid training data found!")

        assert (len(arg_space.element_shape) == 1)

        if self._has_intercept:
            new_element_shape = phrase_space.element_shape + (
                arg_space.element_shape[0] + 1, )
        else:
            new_element_shape = phrase_space.element_shape + (
                arg_space.element_shape[0], )

        for i in range(len(key_ranges)):
            idx_beg, idx_end = key_ranges[i]

            print(("Training lexical function...%s with %d samples" %
                   (keys[i], idx_end - idx_beg)))

            arg_mat = arg_space.get_rows(arg_list[idx_beg:idx_end])
            phrase_mat = phrase_space.get_rows(phrase_list[idx_beg:idx_end])

            #convert them to the same type
            matrix_type = get_type_of_largest([arg_mat, phrase_mat])
            [arg_mat,
             phrase_mat] = resolve_type_conflict([arg_mat, phrase_mat],
                                                 matrix_type)

            result_mat = self._regression_learner.train(
                arg_mat, phrase_mat).transpose()

            result_mat.reshape((1, np.prod(new_element_shape)))

            result_mats.append(result_mat)

        new_space_mat = arg_mat.nary_vstack(result_mats)

        self.composed_id2column = phrase_space.id2column

        self._function_space = Space(new_space_mat,
                                     keys, [],
                                     element_shape=new_element_shape)

        log.print_composition_model_info(logger, self, 1,
                                         "\nTrained composition model:")
        log.print_info(logger, 3, "Trained: %s lexical functions" % len(keys))
        log.print_info(logger, 3,
                       "With total data points:%s" % len(function_word_list))
        log.print_matrix_info(logger, arg_space.cooccurrence_matrix, 3,
                              "Semantic space of arguments:")
        log.print_info(
            logger, 3,
            "Shape of lexical functions learned:%s" % (new_element_shape, ))
        log.print_matrix_info(logger, new_space_mat, 3,
                              "Semantic space of lexical functions:")
        log.print_time_info(logger, time.time(), start, 2)
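
Per function word, the regression learner solves for a matrix A with A . arg ~= phrase over that word's training pairs. A minimal numpy sketch of the no-intercept case, with toy vectors consistent with a 2x2 function matrix:

import numpy as np

arg_mat = np.array([[13.0, 21.0], [3.0, 4.0]])         # one argument vector per row
phrase_mat = np.array([[123.0, 191.0], [25.0, 39.0]])  # observed phrase vectors

# Least squares: arg_mat.dot(A_T) ~= phrase_mat, so A = A_T.T
A_T, _, _, _ = np.linalg.lstsq(arg_mat, phrase_mat, rcond=None)
print(A_T.T)  # [[3. 4.]
              #  [5. 6.]]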