示例#1
0
    def _preprocess_data(self, X, Y=None, idxs=None, train=False):
        """
        Preprocess the data:
        1. Convert sparse feature matrix to dense matrix for pytorch operation.
        2. Make sentence with mention into sequence data for LSTM.
        3. Select subset of the input if idxs exists.

        An empty candidate list yields empty sequence data and leaves the
        word dictionary untouched.

        :param X: The input data of the model, a ``(C, F)`` pair of candidates
            and their feature matrix.
        :param Y: The labels of input data (optional).
        :param idxs: The selected indexes of input data (optional).
        :param train: An indicator for word dictionary to extend new words.
        :return: A list of ``(sequence data, feature row)`` pairs, or a
            ``(pairs, labels)`` tuple when Y is given.
        """
        C, F = X

        # Convert sparse feature matrix to dense matrix
        # TODO: the pytorch implementation is taking dense vector as input,
        # should optimize later
        if issparse(F):
            F = F.todense()

        # Make sequence input for LSTM from candidates
        seq_data = []
        if len(C) > 0:
            # Create word dictionary for LSTM. This is deferred until there is
            # at least one candidate, because the arity is read from C[0].
            if not hasattr(self, "word_dict"):
                self.word_dict = SymbolTable()
                # Add paddings into word dictionary: one opening and one
                # closing marker per argument position.
                for i in range(len(C[0])):
                    self.word_dict.get("~~[[" + str(i))
                    self.word_dict.get(str(i) + "]]~~")

            # The mapping function does not change between mentions, so pick
            # it once instead of once per mention.
            to_index = self.word_dict.get if train else self.word_dict.lookup

            for candidate in C:
                cand_idx = []
                # Only len() and integer indexing of a candidate are relied on.
                for i in range(len(candidate)):
                    mention = candidate[i]
                    # Add mark for each mention in the original sentence
                    args = [(
                        mention.span.get_word_start(),
                        mention.span.get_word_end(),
                        i,
                    )]
                    s = mark_sentence(mention_to_tokens(mention), args)
                    cand_idx.append([to_index(token) for token in s])
                seq_data.append(cand_idx)

        # Generate the preprocessed input
        selected = range(len(seq_data)) if idxs is None else idxs
        data = [(seq_data[i], F[i]) for i in selected]

        if Y is None:
            return data
        if idxs is None:
            return data, Y
        if isinstance(Y, list):
            # A plain list cannot be indexed with a list/array of positions.
            return data, [Y[i] for i in idxs]
        return data, Y[idxs]
示例#2
0
    def _preprocess_data(self, X, Y=None, idxs=None, train=False):
        """
        Preprocess the data:
        1. Make sentence with mention into sequence data for LSTM.
        2. Select subset of the input if idxs exists.

        The feature matrix is not densified; each example carries the slices
        of ``F.indices`` and ``F.data`` delimited by ``F.indptr`` for its row.
        An empty candidate list yields empty sequence data and leaves the
        word dictionary untouched.

        :param X: The input data of the model.
        :type X: pair with candidates and corresponding features; the
            features must expose ``indptr``, ``indices`` and ``data`` (as a
            scipy CSR matrix does)
        :param Y: The labels of input data (optional).
        :type Y: list of floats if num_classes = 2
            otherwise num_classes-length numpy array
        :param idxs: The selected indexes of input data.
        :type idxs: list or numpy.array
        :param train: An indicator for word dictionary to extend new words.
        :type train: bool
        :return: Preprocessed data, together with the (selected) labels when
            Y is given.
        :rtype: list of (sequence data, feature indices, feature values)
            triples, or a (triples, labels) tuple
        """

        C, F = X

        # Make sequence input for LSTM from candidates
        seq_data = []
        if len(C) > 0:
            # Create word dictionary for LSTM. This is deferred until there is
            # at least one candidate, because the arity is read from C[0].
            if not hasattr(self, "word_dict"):
                self.word_dict = SymbolTable()
                # Add paddings into word dictionary: one opening and one
                # closing marker per argument position.
                for i in range(len(C[0])):
                    self.word_dict.get("~~[[" + str(i))
                    self.word_dict.get(str(i) + "]]~~")

            # The mapping function does not change between mentions, so pick
            # it once instead of once per mention.
            to_index = self.word_dict.get if train else self.word_dict.lookup

            for candidate in C:
                cand_idx = []
                # Only len() and integer indexing of a candidate are relied on.
                for i in range(len(candidate)):
                    mention = candidate[i]
                    # Add mark for each mention in the original sentence
                    args = [(
                        mention.span.get_word_start_index(),
                        mention.span.get_word_end_index(),
                        i,
                    )]
                    s = mark_sentence(mention_to_tokens(mention), args)
                    cand_idx.append([to_index(token) for token in s])
                seq_data.append(cand_idx)

        # Generate the preprocessed input
        selected = range(len(C)) if idxs is None else idxs
        data = []
        for i in selected:
            # Stored entries of row i live in [indptr[i], indptr[i + 1]).
            row = slice(F.indptr[i], F.indptr[i + 1])
            data.append((seq_data[i], F.indices[row], F.data[row]))

        if Y is None:
            return data
        if idxs is None:
            return data, Y
        if isinstance(Y, list):
            # A plain list (the documented binary-label type) cannot be
            # indexed with a list/array of positions.
            return data, [Y[i] for i in idxs]
        return data, Y[idxs]
示例#3
0
    def _preprocess_data(self, X, Y=None, idxs=None, train=False):
        """
        Preprocess the data:
        1. Convert sparse feature matrix to dense matrix for pytorch operation.
        2. Make sentence with mention into sequence data for LSTM.
        3. Select subset of the input if idxs exists.

        Labels, when given, are converted to a float32 numpy array. An empty
        candidate list yields empty sequence data and leaves the word
        dictionary untouched.

        :param X: The input data of the model.
        :type X: pair with candidates and corresponding features
        :param Y: The labels of input data (optional).
        :type Y: list or numpy.array
        :param idxs: The selected indexes of input data.
        :type idxs: list or numpy.array
        :param train: An indicator for word dictionary to extend new words.
        :type train: bool
        :return: Preprocessed data, together with the (selected) labels when
            Y is given.
        :rtype: list of [sequence data, features] pairs, or a
            (pairs, labels) tuple
        """

        C, F = X

        # Convert sparse feature matrix to dense matrix
        if issparse(F):
            F = np.array(F.todense(), dtype=np.float32)

        if Y is not None:
            Y = np.array(Y).astype(np.float32)

        # Make sequence input for LSTM from candidates
        seq_data = []
        if len(C) > 0:
            # Create word dictionary for LSTM. This is deferred until there is
            # at least one candidate, because the arity is read from C[0].
            if not hasattr(self, "word_dict"):
                self.word_dict = SymbolTable()
                # Add paddings into word dictionary: one opening and one
                # closing marker per argument position.
                for i in range(len(C[0])):
                    self.word_dict.get("~~[[" + str(i))
                    self.word_dict.get(str(i) + "]]~~")

            # The mapping function does not change between mentions, so pick
            # it once instead of once per mention.
            to_index = self.word_dict.get if train else self.word_dict.lookup

            for candidate in C:
                cand_idx = []
                # Only len() and integer indexing of a candidate are relied on.
                for i in range(len(candidate)):
                    mention = candidate[i]
                    # Add mark for each mention in the original sentence
                    args = [
                        (
                            mention.context.get_word_start_index(),
                            mention.context.get_word_end_index(),
                            i,
                        )
                    ]
                    s = mark_sentence(mention_to_tokens(mention), args)
                    cand_idx.append([to_index(token) for token in s])
                seq_data.append(cand_idx)

        # Generate the preprocessed input
        selected = range(len(seq_data)) if idxs is None else idxs
        data = [[seq_data[i], F[i]] for i in selected]

        if Y is None:
            return data
        if idxs is None:
            return data, Y
        return data, Y[idxs]