Example #1
    def build_dictionary(self):
        logging.debug('=' * 20)
        logging.debug('First, build the training-corpus dictionary, map each word to an index, then map every sentence to a list of indices')

        # Build the training-corpus dictionary.
        # Split every sentence in the corpus into a token list, giving a 2D training
        # document in which each cell is one token,
        # e.g.: [['今年','你','多少岁'],['你', '二十四','小时','在线','吗'],...]

        train_document = map(lambda x: x.split(), self.__seg_sentence__)

        gensim_dict = Dictionary.from_documents(train_document)

        # Update the dictionary by adding a special symbol:
        # 'UNKOWN' stands for unknown words, i.e. the OOV vocabulary
        gensim_dict.add_documents([[u'UNKOWN']])
        logging.debug('Added the special symbol (UNKOWN) to the dictionary; its size is now: %d' % (len(gensim_dict.keys())))
        self.__gensim_dict__ = gensim_dict
        self.__vocabulary_size__ = len(gensim_dict.keys())
        logging.debug('Training-corpus dictionary size: %d' % (self.__vocabulary_size__))
        print 'Training-corpus dictionary size: %d' % self.__vocabulary_size__

        logging.debug(u'Dictionary tokens: %s' % (','.join(gensim_dict.token2id.keys())))
        print u'Dictionary tokens: %s' % (','.join(gensim_dict.token2id.keys()))
        # Embedding matrix: row i holds the word vector of the token whose id is i;
        # the extra row at index vocabulary_size stays all-zero.
        embedding_weights = np.zeros((self.__vocabulary_size__ + 1, self.__word_embedding_length__))
        for key, value in gensim_dict.token2id.items():
            embedding_weights[value, :] = self.get_w2vEmbedding(key)
        self.__embedding_weights__ = embedding_weights
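
A minimal, self-contained sketch of the same pattern (written for Python 3, unlike the Python 2 snippets on this page): build a gensim Dictionary from tokenized sentences, reserve an 'UNKOWN' token for OOV words, and fill one embedding-matrix row per token id. The random_embedding helper is a hypothetical stand-in for self.get_w2vEmbedding.

import numpy as np
from gensim.corpora import Dictionary

embedding_length = 8
sentences = [u'今年 你 多少岁', u'你 二十四 小时 在线 吗']

# 2D training document: one token list per sentence
train_document = [s.split() for s in sentences]
gensim_dict = Dictionary(train_document)   # maps token -> integer id
gensim_dict.add_documents([[u'UNKOWN']])   # reserve an id for OOV words

def random_embedding(token):
    # hypothetical stand-in for a real word2vec lookup
    rng = np.random.RandomState(abs(hash(token)) % (2 ** 31))
    return rng.uniform(-0.25, 0.25, embedding_length)

vocabulary_size = len(gensim_dict)
# one row per token id, plus a spare all-zero row at index vocabulary_size
embedding_weights = np.zeros((vocabulary_size + 1, embedding_length))
for token, token_id in gensim_dict.token2id.items():
    embedding_weights[token_id, :] = random_embedding(token)

print('vocabulary size: %d' % vocabulary_size)
print('embedding matrix shape: %s' % (embedding_weights.shape,))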
Example #2
    def build_dictionary(self):
        logging.debug('=' * 20)
        logging.debug('First, build the training-corpus dictionary, map each word to an index, then map every sentence to a list of indices')

        # Build the training-corpus dictionary.
        # Split every sentence in the corpus into a token list, giving a 2D training
        # document in which each cell is one token,
        # e.g.: [['今年','你','多少岁'],['你', '二十四','小时','在线','吗'],...]

        train_document = map(lambda x: x.split(), self.__seg_sentence__)

        gensim_dict = Dictionary.from_documents(train_document)

        logging.debug('Training-corpus dictionary size: %d' % (len(gensim_dict.keys())))
        print 'Training-corpus dictionary size: %d' % (len(gensim_dict.keys()))

        # Update the dictionary by adding a special symbol:
        # 'UNKOWN' stands for unknown words, i.e. the OOV vocabulary
        gensim_dict.add_documents([[u'UNKOWN']])
        logging.debug('Added the special symbol (UNKOWN) to the dictionary; its size is now: %d' % (len(gensim_dict.keys())))
        print 'Added the special symbol (UNKOWN) to the dictionary; its size is now: %d' % (len(gensim_dict.keys()))

        logging.debug(u'Dictionary tokens: %s' % (','.join(gensim_dict.token2id.keys())))
        print u'Dictionary tokens: %s' % (','.join(gensim_dict.token2id.keys()))
        self.__gensim_dict__ = gensim_dict
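
Neither snippet above actually shows the final step the log message promises, mapping each sentence to a list of indices. Here is a minimal sketch of that lookup, reusing gensim_dict from the sketch after Example #1; sentence_to_indices is a hypothetical helper name:

unknown_id = gensim_dict.token2id[u'UNKOWN']

def sentence_to_indices(sentence):
    # map each token to its dictionary id, falling back to the OOV id
    return [gensim_dict.token2id.get(token, unknown_id)
            for token in sentence.split()]

print(sentence_to_indices(u'你 好 吗'))  # unseen tokens map to unknown_id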
Example #3
    def build_dictionary(self, train_X=None, test_X=None):
        """
            1. Segment the data.
            2. Build the training-corpus dictionary and insert a special token
               'UNKOWN' that stands for unknown words.

        Parameters
        ----------
        train_X : array-like
        test_X : array-like

        Returns
        --------
        array-like:
            padded_sentences
        """

        # region -------------- 1. Merge the training and test sets -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('1. Merge the training and test sets')
            print('1. Merge the training and test sets')
        if self.kwargs.get('vocabulary_including_test_set', True):
            X = np.concatenate((train_X, test_X), axis=0)
        else:
            X = train_X

        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- 1. Merge the training and test sets ---------------

        # region -------------- 2. Segment the data -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('2. Segment the data')
            print('2. Segment the data')
        # -------------- code start -------------
        if self.need_segmented:
            segmented_sentences = map(self.segment_sentence, X)
        else:
            segmented_sentences = X
        # -------------- code end -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- 2. Segment the data ---------------

        # region -------------- 3. Pad sentences to equal length -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('3. Pad sentences to equal length')
            print('3. Pad sentences to equal length')
        # -------------- code start -------------

        # Align sentences of unequal length: truncate those longer than
        # padding_length and pad shorter ones with PADDING
        padded_sentences = np.asarray(
            map(self.sentence_padding, segmented_sentences))

        # endregion -------------- 3. Pad sentences to equal length -------------

        # region -------------- 4. Build the training-corpus dictionary -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('4. Build the training-corpus dictionary')
            print('4. Build the training-corpus dictionary')
        # -------------- code start -------------
        logging.debug('=' * 20)
        logging.debug('First, build the training-corpus dictionary, map each word to an index, then map every sentence to a list of indices')

        # 将训练库所有句子切分成列表,构成 2D的训练文档,每个单元是一个token,
        # 比如: [['今年','你','多少岁'],['你', '二十四','小时','在线','吗'],...]
        # 将分完词句子转成合适的数据格式
        train_document = map(lambda x: x.split(), padded_sentences)
        # 获取训练库字典
        if self.padding_mode != 'none':
            # 为了确保padding的索引是0,所以在最前面加入 PADDING
            train_document.insert(0, [u'PADDING'])
        self.train_data_dict = Dictionary.from_documents(train_document)

        # Update the dictionary by adding a special symbol:
        # 'UNKOWN' stands for unknown words, i.e. the OOV vocabulary
        if self.add_unkown_word:
            self.train_data_dict.add_documents([[u'UNKOWN']])

        # Look up the dictionary indices of PADDING and UNKOWN
        self.padding_token_index = self.train_data_dict.token2id.get(
            u'PADDING', -1)
        self.unknow_token_index = self.train_data_dict.token2id.get(
            u'UNKOWN', -1)

        self.vocabulary_size = len(self.train_data_dict.keys())
        # sort tokens by index, ascending
        self.vocabulary = [
            token
            for token, id in sorted(self.train_data_dict.token2id.items(),
                                    key=lambda x: x[1])
        ]

        # -------------- print start : just print info -------------
        if self.verbose > 1:
            logging.debug('Training-corpus dictionary size: %d' % (len(self.train_data_dict.keys())))
            print('Training-corpus dictionary size: %d' % (len(self.train_data_dict.keys())))
            logging.debug(u'Dictionary tokens: %s' %
                          (','.join(self.train_data_dict.token2id.keys())))
            print(u'Dictionary tokens: %s' % (','.join(self.train_data_dict.token2id.keys())))
        # -------------- print end : just print info -------------

        # -------------- code end -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)

        # endregion -------------- 4. Build the training-corpus dictionary ---------------

        return padded_sentences
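
Two details of this variant are easy to miss: inserting [u'PADDING'] as the very first document is what pins its id to 0 (gensim assigns ids to documents in order, so the lone token of the first document receives id 0), and sentence_padding itself is never shown. A short sketch of both, with pad_sentence as a hypothetical stand-in for self.sentence_padding:

from gensim.corpora import Dictionary

def pad_sentence(sentence, padding_length=5):
    # hypothetical stand-in for self.sentence_padding: truncate long
    # sentences, pad short ones with the PADDING token
    tokens = sentence.split()[:padding_length]
    tokens += [u'PADDING'] * (padding_length - len(tokens))
    return u' '.join(tokens)

padded = [pad_sentence(s) for s in [u'今年 你 多少岁', u'你 二十四 小时 在线 吗 朋友']]
train_document = [s.split() for s in padded]
train_document.insert(0, [u'PADDING'])       # first document, so PADDING -> id 0
train_data_dict = Dictionary(train_document)

print(train_data_dict.token2id[u'PADDING'])  # 0, safe for masking/padding layers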