Example #1
    def batch_segment_sentences(self, sentences):
        '''
            对多个句子批量分词

        :param sentences: array-like
        :return:
        '''
        self.jieba_util = Jieba_Util()
        segmented_sentences = map(self.segment_sentence, sentences)
        return segmented_sentences
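
A minimal usage sketch (not part of the original example): it assumes the surrounding class also defines segment_sentence(), as Example #10 below does.

# Hypothetical driver for batch_segment_sentences(); DataUtil stands in for
# whatever class owns the method above.
util = DataUtil()
segmented = util.batch_segment_sentences([u'我要买手机', u'你好', u'早上好'])
for s in segmented:  # map() returns a list under Python 2, an iterator under Python 3
    print(s)
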
Example #2
 def __init__(self):
     super(DataUtil, self).__init__()
     jutil = Jieba_Util(verbose=0)
     self.remove_sentence_punctuation = lambda x: jutil.seg(x, sep='', remove_url=False)
     self.get_sentence_length = lambda x: len(jutil.seg(x,
                                                 sep=' ',
                                                 full_mode=False,
                                                 remove_stopword=False,
                                                 replace_number=False,
                                                 lowercase=True,
                                                 zhs2zht=True,
                                                 remove_url=True,
                                                 HMM=False,
                                                 ).split())
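
A short assumed-usage sketch of the two helpers defined in this __init__ (DataUtil as above); it relies on Jieba_Util.seg() accepting the keyword arguments shown.

# remove_sentence_punctuation joins the segmented tokens with no separator,
# get_sentence_length counts tokens after segmentation.
du = DataUtil()
print(du.remove_sentence_punctuation(u'你好,在吗?'))  # punctuation dropped by the segmenter
print(du.get_sentence_length(u'我要买手机'))           # number of tokens
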
Example #3
 def __init__(self):
     super(DataUtil, self).__init__()
     jutil = Jieba_Util(verbose=0)
     self.remove_sentence_punctuation = lambda x: jutil.seg(
         x, sep='', remove_url=False)
     self.get_sentence_length = lambda x: len(
         jutil.seg(
             x,
             sep=' ',
             full_mode=False,
             remove_stopword=False,
             replace_number=False,
             lowercase=True,
             zhs2zht=True,
             remove_url=True,
             HMM=False,
         ).split())
Example #4
    def batch_segment_sentences(self,sentences):
        '''
            对多个句子批量分词

        :param sentences: array-like
        :return:
        '''
        self.jieba_util = Jieba_Util()
        segmented_sentences = map(self.segment_sentence,sentences)
        return segmented_sentences
Example #5
    def remove_repet_data(self,data):
        '''
            去除重复的的句子(去除标点符号后一样的句子则算一样)
                1. 初始化jieba分词,并用分词去除标点符号
                2. 去重处理

        :param data:
        :return:
        '''

        jutil = Jieba_Util(verbose=0)
        # 去除标点符号
        remove_sentence_punctuation = lambda x:jutil.seg(x,sep='',remove_url=False)


        labels = []
        sentences = []
        for label,group in data.groupby(by=[u'LABEL']):
            # print(label,len(group),len(group[u'SENTENCE'].unique()))
            # 去除该类别之后的句子和句子数
            # print(group[u'SENTENCE'])
            # print(group[u'SENTENCE'].apply(remove_sentence_punctuation))
            norepet_sentcence_set = set()
            sentences_after_rm_rep = []
            for item in group[u'SENTENCE'].as_matrix():
                seged_sentence = remove_sentence_punctuation(item)
                if seged_sentence not in norepet_sentcence_set:
                    norepet_sentcence_set.add(seged_sentence)
                    sentences_after_rm_rep.append(item)
                    # print(seged_sentence)
                else:
                    pass
                    # print(item)
            num_after_rm_rep = len(sentences_after_rm_rep)
            sentences.extend(sentences_after_rm_rep)
            labels.extend([label]*num_after_rm_rep)

        # print(len(labels))
        # print(len(sentences))
        return pd.DataFrame(data={'LABEL':labels,'SENTENCE':sentences})
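
A toy sketch (an assumption, not from the original code) showing what remove_repet_data() does: per label, sentences that only differ in punctuation collapse to a single row.

# DataUtil is assumed to be the class that owns remove_repet_data() above.
import pandas as pd

toy = pd.DataFrame({u'LABEL': [u'表态#肯定', u'表态#肯定', u'表态#否定'],
                    u'SENTENCE': [u'好的', u'好的!', u'不要']})
deduped = DataUtil().remove_repet_data(toy)
print(deduped)  # u'好的' and u'好的!' are kept as one row once punctuation is removed
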
Example #6
def test2():
    input_file1 = './sample_data/v2.3_train_Sa_891.csv'

    data = pd.read_csv(input_file1,
                       encoding='utf8',
                       sep='\t',
                       index_col=0,
                       header=0)

    data = data[data['LABEL'] != u'其它#其它']
    data = data[data['LABEL'] != u'其它#捣乱']
    print(data.head())
    # 分词
    jieba_util = Jieba_Util()
    segment_sentence = lambda x: jieba_util.iter_each_word(
        sentence=x,
        sep=' ',
        need_segmented=True,
        full_mode=False,
        remove_stopword=False,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
    )
    data['WORDS'] = data['SENTENCE'].apply(segment_sentence).as_matrix()
    sentences = data['WORDS'].as_matrix()
    print '句子数:%d' % sentences.shape
    # print(sentences[-1])
    # quit()
    util = Word2vecUtil(size=50,
                        train_method='cbow'
                        )
    util.train(sentences)
    util.print_model_descibe()

    most_similar_words = util.model.most_similar(u'机')
    most_similar_words = util.model.most_similar(u'喜')
    print ','.join([i for i, j in most_similar_words])
    util.save('vector/v2.3_train_Sa_891_word_50dim.gem')
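
A follow-up sketch (assumed API): Word2vecUtil().load() is used as the counterpart of save() inside FeatureEncoder in Example #9, so reloading the vectors written by test2() would look like this.

w2v_model = Word2vecUtil().load('vector/v2.3_train_Sa_891_word_50dim.gem')
print(w2v_model.vector_size)              # 50, matching size=50 above
print(w2v_model.most_similar(u'机')[:3])  # nearest neighbours of the query character
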
Example #7
    def remove_repet_data(self, data):
        '''
            去除重复的的句子(去除标点符号后一样的句子则算一样)
                1. 初始化jieba分词,并用分词去除标点符号
                2. 去重处理

        :param data:
        :return:
        '''

        jutil = Jieba_Util(verbose=0)
        # 去除标点符号
        remove_sentence_punctuation = lambda x: jutil.seg(
            x, sep='', remove_url=False)

        labels = []
        sentences = []
        for label, group in data.groupby(by=[u'LABEL']):
            # print(label,len(group),len(group[u'SENTENCE'].unique()))
            # 去除该类别之后的句子和句子数
            # print(group[u'SENTENCE'])
            # print(group[u'SENTENCE'].apply(remove_sentence_punctuation))
            norepet_sentcence_set = set()
            sentences_after_rm_rep = []
            for item in group[u'SENTENCE'].as_matrix():
                seged_sentence = remove_sentence_punctuation(item)
                if seged_sentence not in norepet_sentcence_set:
                    norepet_sentcence_set.add(seged_sentence)
                    sentences_after_rm_rep.append(item)
                    # print(seged_sentence)
                else:
                    pass
                    # print(item)
            num_after_rm_rep = len(sentences_after_rm_rep)
            sentences.extend(sentences_after_rm_rep)
            labels.extend([label] * num_after_rm_rep)

        # print(len(labels))
        # print(len(sentences))
        return pd.DataFrame(data={'LABEL': labels, 'SENTENCE': sentences})
Example #8
def test2():
    input_file1 = './sample_data/v2.3_train_Sa_891.csv'

    data = pd.read_csv(input_file1,
                       encoding='utf8',
                       sep='\t',
                       index_col=0,
                       header=0)

    data = data[data['LABEL'] != u'其它#其它']
    data = data[data['LABEL'] != u'其它#捣乱']
    print(data.head())
    # 分词
    jieba_util = Jieba_Util()
    segment_sentence = lambda x: jieba_util.iter_each_word(
        sentence=x,
        sep=' ',
        need_segmented=True,
        full_mode=False,
        remove_stopword=False,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
    )
    data['WORDS'] = data['SENTENCE'].apply(segment_sentence).as_matrix()
    sentences = data['WORDS'].as_matrix()
    print '句子数:%d' % sentences.shape
    # print(sentences[-1])
    # quit()
    util = Word2vecUtil(size=50, train_method='cbow')
    util.train(sentences)
    util.print_model_descibe()

    most_similar_words = util.model.most_similar(u'机')
    most_similar_words = util.model.most_similar(u'喜')
    print ','.join([i for i, j in most_similar_words])
    util.save('vector/v2.3_train_Sa_891_word_50dim.gem')
Example #9
class FeatureEncoder(object):
    """
        Onehot特征编码器,将句子转成 onehot编码
        函数列表为:
            1. segment_sentence:对句子分词
            2. build_dictionary:构建字典
            3. sentence_to_index:将原始字符串句子转为字典索引列表
            4. sentence_padding:将句子补齐
            5. fit_transform:构建编码器并转换数据
            6. transform_sentence:对句子编码
            7. get_sentence_length:对句子长度计算
            8. print_sentence_length_detail: 打印训练库句子详情.
            9. print_model_descibe: 打印模型的详情.
            10. sentence_index_to_bow: 将索引转为onehot数据
            11. to_onehot_array: 生成训练库句子的onehot编码
            12. reset: clear 数据

        注意:
            1. onehot编码 有两种形式:通过设置 to_onehot_array 切换
                - 字典索引形式表示,补齐 (默认是这种形式)
                - onehot 向量
            2. 训练库中所有词(包括未知词字符 UNKOWN)的字典索引都是从1开始分配的,索引0作为填充字符(PADDING)所用。
            3. 训练库字典大小(vocabulary_size)是计入索引0的,即包含训练库中所有词、填充字符(PADDING)和未知词字符(UNKOWN),如果不使用可以关闭。
    """
    __version__ = '1.4'

    def __init__(self,
                 need_segmented=True,
                 verbose=0,
                 full_mode=True,
                 feature_type='seg',
                 remove_stopword=True,
                 replace_number=True,
                 lowercase=True,
                 zhs2zht=True,
                 remove_url=True,
                 sentence_padding_length=7,
                 padding_mode='center',
                 add_unkown_word=True,
                 to_onehot_array=False,
                 word2vec_to_solve_oov=False,
                 **kwargs
                 ):
        """
            Onehot特征编码器,将句子转成 onehot 编码(以字典索引形式表示,补齐)
            1. 初始化参数
            2. build feature encoder

            :param need_segmented: 数据处理选项,是否需要经过分词处理;如果为False,那么输入的数据不需要分词,提供的数据的每个句子的每个词要以空格分割.比如: ['我 要 买 手机','你好','早上 好'];如果为True,提供原始输入句子即可,比如:['我要买手机','你好','早上好'].
            :type need_segmented: bool
            :param verbose: 数值越大,输出越详细
            :type verbose: int
            :param full_mode: jieba分词选项,是否使用 full mode,默认为True
            :type full_mode: bool
            :param feature_type: 模型设置选项,选择不同粒度的特征单位,目前支持 word,seg,word_seg 和 word_seg_concat。
                - word:直接以字为单位,比如 我要买手机--->我 要 买 手 机
                - seg:分词后的词单元为单位,比如 我要买手机--->我 要 买 手机
                - word_seg:分词后的字和词为单位,去重,比如 我要买手机--->我 要 买 手机 手 机
                - word_seg_concat:分词后的字和词为单位,不去重,比如 我要买手机--->我 要 买 手 机 我 要 买 手机
            :type feature_type: str
            :param remove_stopword: jieba分词选项,是否去除 stop word,默认为True
            :type remove_stopword: bool
            :param replace_number: jieba分词选项,是否将数据统一替换成NUM,默认为True
            :type replace_number: bool
            :param lowercase: jieba分词选项,是否将字母统一转为小写,默认为True
            :type lowercase: bool
            :param zhs2zht: jieba分词选项,出現繁体的時候,是否转简体,默认为True
            :type zhs2zht: bool
            :param remove_url: jieba分词选项,是否移除 微博url,http://t.cn/开头的地址,默认为True
            :type remove_url: bool
            :param add_unkown_word: 训练库字典的设置选项,是否在字典中增加一个未知词字符(UNKOWN)
            :type add_unkown_word: bool
            :param sentence_padding_length:  句子的补齐(截断)长度,默认为7
            :type sentence_padding_length: int
            :param padding_mode:  句子的补齐(截断)模式,有四种模式:
                                        1. center:如果小于sentence_padding_length的话往两边补0;如果超出sentence_padding_length的话,直接在后面截断。
                                        2. left:如果小于sentence_padding_length的话往左边补0;如果超出sentence_padding_length的话,直接在后面截断。
                                        3. right:如果小于sentence_padding_length的话往右边补0;如果超出sentence_padding_length的话,直接在后面截断。
                                        4. none:不补齐。
            :type padding_mode: str
            :param to_onehot_array: 输出 onehot array,还是字典索引 array,默认为False,输出字典索引,
            :type to_onehot_array: bool
            :param word2vec_to_solve_oov: 使用word2vec扩展oov词
            :type word2vec_to_solve_oov: bool
            :param kwargs:
                - word2vec_model_file_path:
                - vocabulary_including_test_set: (default,True)
                - update_dictionary: (default,True)
                - 等

        """
        self.full_mode = full_mode
        self.feature_type = feature_type
        self.remove_stopword = remove_stopword
        self.verbose = verbose
        self.need_segmented = need_segmented
        self.replace_number = replace_number
        self.lowercase = lowercase
        self.zhs2zht = zhs2zht
        self.remove_url = remove_url
        self.add_unkown_word = add_unkown_word
        self.sentence_padding_length = sentence_padding_length
        self.padding_mode = padding_mode
        self.to_onehot_array = to_onehot_array
        self.word2vec_to_solve_oov = word2vec_to_solve_oov
        self.kwargs = kwargs

        # 检验参数合法性
        assert self.padding_mode in ['center', 'left', 'right', 'none'], 'padding mode 只能取: center,left,right,none'
        assert self.feature_type in ['word', 'seg', 'word_seg',
                                     'word_seg_concat'], 'feature type 只能取: word,seg,word_seg和word_seg_concat'

        # 初始化jieba分词器
        if need_segmented:
            self.jieba_seg = Jieba_Util(verbose=self.verbose)
        # 训练库提取出来的字典对象
        self.train_data_dict = None
        # 训练库提取出来的字典词汇列表
        self.vocabulary = None
        # 训练库提取出来的字典词汇个数
        self.vocabulary_size = None
        # UNKOWN字符的索引
        self.unknow_token_index = None
        # PADDING字符的索引
        self.padding_token_index = None

        # region NOTE: 这些变量不再维护,因为消耗内存
        # 原始训练数据
        # self.train_data = None
        # 切完词的句子
        # self.segmented_sentences = None
        # 训练库句子的字典索引形式
        # self.train_index = None
        # 训练库句子的补齐的字典索引形式
        # self.train_padding_index = None
        # 训练库句子装成onehot array
        # endregion
        self.train_onehot_array = None
        # word2vec 模型
        self.word2vec_model = None
        if word2vec_to_solve_oov:
            assert 'word2vec_model_file_path' in kwargs, '请提供 属性 word2vec_model_file_path'
            # 加载word2vec模型
            w2v_util = Word2vecUtil()
            self.word2vec_model = w2v_util.load(kwargs.get('word2vec_model_file_path'))

        if verbose > 1:
            logging.debug('build feature encoder...')
            print('build feature encoder...')
            # self.fit_transform()

    def segment_sentence(self, sentence):
        """
        对句子进行分词,使用jieba分词

        :param sentence: 句子
        :type sentence: str
        :return: 分完词句子,以空格连接
        :rtype: str
        """

        if self.feature_type == 'seg':
            segmented_sentence = self.jieba_seg.seg(
                sentence,
                sep=' ',
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )
        elif self.feature_type == 'word':
            # 将句子切分为 以字为单元 以空格分割
            # 1. 先使用jieba进行预处理,将数字替换等
            segmented_sentence = self.jieba_seg.iter_each_word(
                sentence,
                sep=' ',
                need_segmented=True,
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )

            # 2. 按字切分

        elif self.feature_type == 'word_seg':
            # 将句子切分为 以字和词为单元,相同则去重 以空格分割
            # 1. 先使用jieba进行预处理,将数字替换等
            segmented_sentence = self.jieba_seg.seg(
                sentence,
                sep=' ',
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )

            # print(segmented_sentence)
            # 2. 按字切分
            word = self.jieba_seg.iter_each_word(segmented_sentence, sep=' ', need_segmented=False).split()
            # 3. 按词切分
            seg = segmented_sentence.split()
            segmented_sentence = ' '.join(set(seg + word))
        elif self.feature_type == 'word_seg_concat':
            # 先字后词拼接,不去重
            # 1. 先使用jieba进行预处理,将数字替换等
            segmented_sentence = self.jieba_seg.seg(
                sentence,
                sep=' ',
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )

            # print(segmented_sentence)
            # 2. 按字切分
            word = self.jieba_seg.iter_each_word(segmented_sentence, sep=' ', need_segmented=False).split()
            # 3. 按词切分
            seg = segmented_sentence.split()
            segmented_sentence = ' '.join(word + seg)
        else:
            assert False, '不支持其他粒度的切分!'

        return segmented_sentence

    def get_sentence_length(self, sentence):
        '''
            计算句子的长度,注意,这里的长度以词为单位,即分完词后统计。
                1. 对句子分词
                2. 对句子的词计算

        :param sentence: 句子
        :type sentence: str
        :return: 句子长度
        :rtype: int
        '''

        # 1. 分词
        segmented_senence = self.segment_sentence(sentence)
        # 2. 统计
        sentence_length = len(segmented_senence.split())

        return sentence_length

    def print_sentence_length_detail(
            self,
            data=None,
            lengths=[7, 10, 15, 20, 50, 80, 100],
    ):
        """
            打印训练库中句子的长度情况

        :type lengths: list
        :param lengths: 长度界限列表
        :return: 句子长度列表
        :rtype: list
        """
        if self.need_segmented:
            sentence_length = map(self.get_sentence_length, data)
        else:
            sentence_length = map(lambda x: len(x.split()), data)
        for l in lengths:
            le_this_len = sum(np.asarray(sentence_length) <= l) / (1.0 * len(sentence_length))
            print('句子长度小于等于%d的有:%f' % (l, le_this_len))

        print('句子长度情况为:%s' % (str(sentence_length)))
        print('句子最长长度为:%d' % (max(sentence_length)))
        print('句子最短长度为:%d' % (min(sentence_length)))
        print('句子平均长度为:%d' % (np.average(sentence_length)))
        return sentence_length

    def get_unkown_vector(self, ndim=50):
        rand = np.random.RandomState(1337)
        return rand.uniform(-0.25, 0.25, ndim)

    def get_w2vEmbedding(self, word):
        """
            返回词向量

        Returns
        -------
        (array,str)
        """

        try:
            if word == u'PADDING':
                vector = np.zeros(self.word2vec_model.vector_size)
                flag = 'PADDING'
            elif word == u'UNKOWN':
                # 当训练
                vector = self.get_unkown_vector(self.word2vec_model.vector_size)
                flag = 'NO_IN_MODEL_VOCAB'
            else:
                vector = self.word2vec_model[word]
                flag = 'OK'
        except:
            vector = self.get_unkown_vector(self.word2vec_model.vector_size)
            if self.verbose > 1:
                print('OOV: %s' % word)
            flag = 'NO_IN_W2V'
        return np.asarray(vector), flag

    def to_embedding_weight(self, path):
        """
            使用训练好的 word2vec 模型 将字典中每个词转为 word2vec向量,接着生成一个 Embedding层的初始权重形式,可用于初始化 Embedding 层的权重。
                1. 加载word2vec模型
                2.

        :param path: word2vec 模型文件路径
        :type path: str
        :return:
        """

        if self.word2vec_model is None:
            w2v_util = Word2vecUtil()
            self.word2vec_model = w2v_util.load(path)
        size = self.vocabulary_size

        embedding_weights = np.zeros((size, self.word2vec_model.vector_size))
        words_count_no_in_w2v = 0
        words_count_no_in_vacab = 0
        words_count_in = 0
        words_count_paddding = 0
        for key, value in self.train_data_dict.token2id.items():
            vector, flag = self.get_w2vEmbedding(key)
            embedding_weights[value, :] = vector
            if flag == 'NO_IN_W2V':
                words_count_no_in_w2v += 1
            if flag == 'NO_IN_MODEL_VOCAB':
                words_count_no_in_vacab += 1
            if flag == 'OK':
                words_count_in += 1
                # print(key)
            if flag == 'PADDING':
                words_count_paddding += 1
        if self.verbose > 0:
            print('没有出现在w2v模型中的词有:%d个' % (words_count_no_in_w2v))
            print('没有出现在模型vocab中的词有:%d个' % (words_count_no_in_vacab))
            print('出现在w2v模型中的词有:%d个' % (words_count_in))

        # self.embedding_weights = embedding_weights

        return embedding_weights

    def build_dictionary(self, train_X=None, test_X=None):
        """
            1.对数据进行分词
            2.构建训练库字典,插入 一个特殊字符 'UNKOWN'表示未知词

        Parameters
        ----------
        train_X : array-like
        test_X : array-like

        Returns
        --------
        array-like:
            补齐后的分词句子列表 (padded_sentences)
        """

        # region -------------- 1.将训练集和测试集合并 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('1.将训练集和测试集合并')
            print('1.将训练集和测试集合并')
        if self.kwargs.get('vocabulary_including_test_set', True):
            X = np.concatenate((train_X, test_X), axis=0)
        else:
            X = train_X

        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- 1.将训练集和测试集合并 ---------------

        # region -------------- 2.对数据进行分词 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('对数据进行分词')
            print('对数据进行分词')
        # -------------- code start : 开始 -------------
        if self.need_segmented:
            segmented_sentences = map(self.segment_sentence, X)
        else:
            segmented_sentences = X
        # -------------- code start : 结束 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- 2.对数据进行分词 ---------------

        # region -------------- 3. 将句子补齐到等长 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('2. 将句子补齐到等长')
            print('2. 将句子补齐到等长')
        # -------------- code start : 开始 -------------

        # 将不等长的句子都对齐,超出padding_length长度的句子截断,小于的则补 PADDING
        padded_sentences = np.asarray(map(self.sentence_padding, segmented_sentences))

        # endregion -------------- 3. 将句子补齐到等长 -------------

        # region -------------- region start : 4.构建训练库字典 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('4.构建训练库字典')
            print('4.构建训练库字典')
        # -------------- code start : 开始 -------------
        logging.debug('=' * 20)
        logging.debug('首先,构建训练库字典,然后将每个词映射到一个索引,再将所有句子映射成索引的列表')

        # 将训练库所有句子切分成列表,构成 2D的训练文档,每个单元是一个token,
        # 比如: [['今年','你','多少岁'],['你', '二十四','小时','在线','吗'],...]
        # 将分完词句子转成合适的数据格式
        train_document = map(lambda x: x.split(), padded_sentences)
        # 获取训练库字典
        if self.padding_mode != 'none':
            # 为了确保padding的索引是0,所以在最前面加入 PADDING
            train_document.insert(0, [u'PADDING'])
        self.train_data_dict = Dictionary.from_documents(train_document)

        # 更新字典,再字典中添加特殊符号,其中
        # UNKOWN表示未知字符,即OOV词汇
        if self.add_unkown_word:
            self.train_data_dict.add_documents([[u'UNKOWN']])

        # 获取padding和UNKOWN 的字典索引
        self.padding_token_index = self.train_data_dict.token2id.get(u'PADDING', -1)
        self.unknow_token_index = self.train_data_dict.token2id.get(u'UNKOWN', -1)

        self.vocabulary_size = len(self.train_data_dict.keys())
        # 按索引从小到大排序
        self.vocabulary = [token for token, id in sorted(self.train_data_dict.token2id.items(), key=lambda x: x[1])]
        # print(self.vocabulary_size)
        # print((self.train_data_dict.token2id.items()))
        # quit()

        # -------------- print start : just print info -------------
        if self.verbose > 1:
            logging.debug('训练库字典为:%d' % (len(self.train_data_dict.keys())))
            print('训练库字典为:%d' % (len(self.train_data_dict.keys())))
            logging.debug(u'字典有:%s' % (','.join(self.train_data_dict.token2id.keys())))
            print(u'字典有:%s' % (','.join(self.train_data_dict.token2id.keys())))
        # -------------- print end : just print info -------------

        # -------------- code start : 结束 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)

        # endregion -------------- 4.构建训练库字典 ---------------

        return padded_sentences

    def replace_oov_with_similay_word(self, word2vec_model, sentence):
        '''
            对句子中oov词使用训练库中最相近的词替换(word2vec余弦相似性)

        :param sentence:
        :return:
        '''

        # is_oov = np.asarray([item for item in self.feature_encoder.vocabulary])
        # has_oov = any(is_oov)
        sentence = sentence.split()
        oov_word = []
        replace_word = []
        for item in sentence:
            if item not in self.vocabulary:
                oov_word.append(item)
                keywords_sim_score = np.asarray(
                    [self.word_similarity(word2vec_model, item, i) for i in self.vocabulary])
                sorted_index = np.argsort(keywords_sim_score)[-1::-1]
                most_similarity_score = keywords_sim_score[sorted_index[0]]
                most_similarity_word = self.vocabulary[sorted_index[0]]
                if self.verbose > 1:
                    print(u'%s 最相近的词是%s,分数为:%f' % (item, most_similarity_word, most_similarity_score))
                replace_word.append(most_similarity_word)
        sentence += replace_word
        return ' '.join(sentence)

    def word_similarity(self, word2vec_model, word1, word2):
        '''
        计算两个词的相似性
        :param word1:
        :param word2:
        :return:
        '''
        try:
            return word2vec_model.n_similarity(word1, word2)
        except:
            return 0

    def sentence_to_index(self, sentence):
        """
            将 sentence 转换为 index,如果 token为OOV词,则分配为 UNKOWN


        Parameters
        ----------
        sentence: str
            以空格分割

        """

        if self.add_unkown_word:
            unknow_token_index = self.train_data_dict.token2id[u'UNKOWN']
        else:
            unknow_token_index = 0
        # 将训练库中所有句子的每个词映射到索引上,变成索引列表
        index = [self.train_data_dict.token2id.get(item, unknow_token_index) for item in sentence.split()]
        if self.verbose > 1:
            if index.__contains__(unknow_token_index):
                print('unknow_token_index:%d' % unknow_token_index)
                print('出现字典OOV')
                print(sentence)
                print(index)
        # assert not index.__contains__(-1),u'出现OOV词'
        return index

    def sentence_padding(self, sentence):
        '''
            将不等长的句子都对齐,超出padding_length长度的句子截断,小于的则补 PADDING

        :type sentence: str
        :param sentence: 句子,词之间以 空格 分割
        :return: 返回补齐后的句子,以空格分割
        :type: str
        '''

        padding_length = self.sentence_padding_length
        # print(sentence)
        sentence = sentence.split()
        sentence_length = len(sentence)
        # print(sentence_length)
        if sentence_length > padding_length:
            # logging.debug(u'对句子进行截断:%s' % (sentence))

            sentence = sentence[:padding_length]

            # logging.debug(u'对句子进行截断后:%s' % (' '.join(seg[:padding_length])))
            # print(u'对句子进行截断后:%s' % (' '.join(seg[:padding_length])))
        elif sentence_length < padding_length:
            should_padding_length = padding_length - sentence_length
            left_padding = np.asarray(['PADDING'] * (should_padding_length / 2))
            right_padding = np.asarray(['PADDING'] * (should_padding_length - len(left_padding)))
            if self.padding_mode == 'center':
                sentence = np.concatenate((left_padding, sentence, right_padding), axis=0)
            elif self.padding_mode == 'left':
                sentence = np.concatenate((left_padding, right_padding, sentence), axis=0)
            elif self.padding_mode == 'right':
                sentence = np.concatenate((sentence, left_padding, right_padding), axis=0)
            elif self.padding_mode == 'none':
                sentence = sentence
            else:
                raise NotImplementedError

        sentence = ' '.join(sentence)
        return sentence

    def sentence_index_to_onehot(self, index):
        '''
            注意:该方法跟[sentence_index_to_bow]的区别。
            将词的索引转成 onehot 编码,比如:
                索引 1 -->[  0 , 0 , 0 , 0,  1]

        :param index: 一个词的字典索引
        :type index: list
        :return: onehot 编码,shape为 (句子长度,字典长度)
        :rtype: np.array()
        '''

        onehot_array = []

        for item in index:
            temp = np.zeros(self.vocabulary_size, dtype=int)
            if item == 0:
                pass
            else:
                temp[item - 1] = 1

            onehot_array.append(temp)

        # onehot_array = np.concatenate(onehot_array,axis=1)
        onehot_array = np.asarray(onehot_array)
        return onehot_array

    def sentence_index_to_bow(self, index):
        '''
            注意:该方法跟[sentence_index_to_onehot]的区别。
            将句子的字典索引转成 词包向量 编码比如:
                [1,2]-->[ 0 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0,  0]

        :param index: 一个句子的字典索引
        :type index: list
        :return: bow 编码,长度为 字典长度
        :rtype: np.array()
        '''

        onehot_array = np.zeros(self.vocabulary_size, dtype=int)

        onehot_array[index] = 1

        return onehot_array

    def batch_sentence_index_to_onehot_array(self, sentence_indexs):
        '''
            将所有训练库句子转成onehot编码的数组,保存在 self.onehot_array 中

        :return: onehot编码的数组
        '''

        self.onehot_array = np.asarray(map(self.sentence_index_to_onehot, sentence_indexs))
        return self.onehot_array

    def fit_transform(self,
                      train_data=None,
                      test_data=None):
        return self.fit(train_data, test_data).transform(train_data)

    def fit(self,
            train_X=None,
            test_X=None
            ):
        """
            build feature encoder ---- 构建训练库字典

            Notes
            ------
            update_dictionary: 设置再次调用fit()函数时是否更新字典,默认为 True,即每次调用fit()都会重新构建字典;设置为 False 时,只在第一次调用fit()时构建字典
            vocabulary_including_test_set: 设置字典是否包含测试集的词汇,默认包含,即字典包含训练集和测试集的所有词汇。
                - 设置为 False ,则 字典只包含训练集中的词汇

        Parameters
        ----------
        train_X: array-like
            训练句子列表:['','',...,'']
        test_X: array-like
            测试句子列表:['','',...,'']

        Returns
        -------
        object:
            self
        """

        if not self.kwargs.get('update_dictionary', True):
            # 假如不更新字典,则如果原有的字典在,就直接用原有的字典即可
            if self.vocabulary is not None:
                return self

        logging.debug('=' * 20)
        if train_X is None:
            logging.debug('没有输入训练数据!')
            assert False, '没有输入训练数据!'

        if self.kwargs.get('vocabulary_including_test_set', True):
            if test_X is None:
                logging.debug('vocabulary_including_test_set=True,构建字典需要全部数据,请输入测试数据!')
                assert False, 'vocabulary_including_test_set=True,构建字典需要全部数据,请输入测试数据!'

        # region -------------- 1.构建训练库字典 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('1.构建训练库字典')
            print('1.构建训练库字典')
        # -------------- code start : 开始 -------------

        # 构建训练库字典
        self.build_dictionary(train_X, test_X)

        # -------------- code start : 结束 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- 1.构建训练库字典 ---------------

        return self

    def transform_sentence(self, sentence):
        """
            转换一个句子的格式。跟训练数据一样的操作,对输入句子进行 padding index 编码,将sentence转为补齐的字典索引
                1. 分词
                2. 转为字典索引列表,之后补齐,输出为补齐的字典索引列表
                    - 当 参数 to_onehot_array = False (默认值)时,直接返回 补齐的字典索引 ;
                    - 当 参数 to_onehot_array = True 时,进入第3步,进一步转换成 onehot 向量 ;
                3. 每个词的字典索引变成onehot向量
                    - 这一步不一定会执行
                    - to_onehot_array = True 时, 执行

        :param sentence: 输入句子,不用分词,进来后会有分词处理
        :type sentence: str
        :return: 补齐的字典索引
        :rtype: array-like
        """

        assert self.train_data_dict is not None, '请先fit_transform()模型'

        # region -------------- 1. 分词 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('1. 分词')
            print('1. 分词')
        # -------------- code start : 开始 -------------

        # 分词
        if self.need_segmented:
            seg_sentence = self.segment_sentence(sentence)
        else:
            seg_sentence = sentence

        if self.word2vec_to_solve_oov:
            seg_sentence = self.replace_oov_with_similay_word(self.word2vec_model,
                                                              seg_sentence)
        # -------------- code start : 结束 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- region end : 1. 分词 ---------------

        # region -------------- 2. 转为字典索引列表,之后补齐,输出为补齐的字典索引列表 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('2. 转为字典索引列表,之后补齐,输出为补齐的字典索引列表')
            print('2. 转为字典索引列表,之后补齐,输出为补齐的字典索引列表')
        # -------------- code start : 开始 -------------

        paded_sentence = self.sentence_padding(seg_sentence)
        sentence_index = self.sentence_to_index(paded_sentence)

        # -------------- code start : 结束 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- region end : 2. 转为字典索引列表,之后补齐,输出为补齐的字典索引列表 ---------------

        # region -------------- 3. 将每个词的字典索引变成onehot向量 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('3. 将每个词的字典索引变成onehot向量')
            print('3. 将每个词的字典索引变成onehot向量')
        # -------------- code start : 开始 -------------

        if self.to_onehot_array:
            onehot_array = self.sentence_index_to_onehot(sentence_index)
        else:
            onehot_array = sentence_index

        # -------------- code start : 结束 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- region end : 3. 将每个词的字典索引变成onehot向量 ---------------

        return onehot_array

    def transform(self, X):
        '''
            批量转换数据,跟训练数据一样的操作,对输入句子进行 padding index 编码,将sentence转为补齐的字典索引
                1. 直接调用 self.transform_sentence 进行处理

        :param X: 输入句子列表
        :type X: array-like
        :return: 补齐的字典索引
        :rtype: array-like
        '''

        index = map(lambda x: self.transform_sentence(x), X)
        # print train_index[:5]

        return np.asarray(index)

    def reset(self):
        """
        清理对象中的数据
           - self.vocabulary

        """
        self.vocabulary = None

    def print_model_descibe(self):
        '''
            打印模型参数详情

        :return: 参数设置详情
        :rtype: dict 或 {}
        '''
        import pprint
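        # NOTE: self.train_data is no longer maintained (see the commented-out region in
        # __init__), so len(self.train_data) below will fail unless it is set elsewhere.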
        detail = {'train_data_count': len(self.train_data),
                  'need_segmented': self.need_segmented,
                  'feature_type': self.feature_type,
                  'verbose': self.verbose,
                  'full_mode': self.full_mode,
                  'remove_stopword': self.remove_stopword,
                  'replace_number': self.replace_number,
                  'sentence_padding_length': self.sentence_padding_length,
                  'padding_mode': self.padding_mode,
                  'vocabulary_size': self.vocabulary_size,
                  'padding_token_index': self.padding_token_index,
                  'unknow_token_index': self.unknow_token_index,
                  'add_unkown_word': self.add_unkown_word,
                  'mask_zero': True,
                  }
        pprint.pprint(detail)
        logging.debug(detail)
        return detail
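
A usage sketch for the onehot FeatureEncoder above, with toy sentences; the word2vec path in the last line is hypothetical, and the method names follow the class definition.

train_X = [u'我要买手机', u'你好', u'早上好']
test_X = [u'我想买手机']

encoder = FeatureEncoder(need_segmented=True,
                         feature_type='seg',
                         sentence_padding_length=7,
                         padding_mode='center',
                         to_onehot_array=False)
train_index = encoder.fit_transform(train_data=train_X, test_data=test_X)
test_index = encoder.transform(test_X)
print(train_index.shape)        # (3, 7): padded dictionary-index form
print(encoder.vocabulary_size)  # includes PADDING (index 0) and UNKOWN

# Optional, with a trained word2vec model on disk (hypothetical path):
# embedding_weights = encoder.to_embedding_weight('vector/50dim_word2vec.gem')
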
Example #10
class DataUtil(object):
    """
    OOD 数据集 - stable version 数据工具类
        - 数据的部分设置 在文件 setting.py 中
    """
    def __init__(self):
        # 训练数据的根目录
        self.dataset_root_path = DATA_ROOT_PATH
        self.word2vec_model_root_path = WORD2VEC_MODEL_ROOT_PATH
        self.jieba_util = None

    def get_label_index(self, version='v2.0'):
        """
            获取 DA 分类类别的列表,总共有24类

        :return: label_to_index,index_to_label
        """
        if version == 'v1.0':
            # 16分类标签
            index_to_label = [
                u'捣乱#骂人',
                u'导购#开始',
                u'导购#成交',
                u'导购#更换',
                u'导购#详情',
                u'表态#附和',
                u'表态#否定',
                u'表态#犹豫',
                u'表态#肯定',
                u'表态#否定#不满',
                u'表态#随便',
                u'闲聊#身份信息',
                u'闲聊#天气',
                u'闲聊#问候',
                u'闲聊#时间',
                u'闲聊#结束语',
            ]
        elif version == 'v2.0':
            # 24分类标签
            index_to_label = [
                u'其它#骂人', u'导购#不成交', u'导购#不理解', u'导购#开始', u'导购#成交', u'导购#更换',
                u'导购#结束', u'导购#详情', u'表态#不满', u'表态#否定', u'表态#满意', u'表态#犹豫',
                u'表态#疑问', u'表态#肯定', u'表态#附和', u'表态#随便', u'社交义务#不用谢',
                u'社交义务#接受道歉', u'社交义务#致谢', u'社交义务#道歉', u'社交义务#问候', u'闲聊#天气',
                u'闲聊#时间', u'闲聊#身份信息'
            ]
        elif version == 'v2.0++':
            # 24分类标签 ++所用版本,临时用
            index_to_label = [
                u'其它#骂人', u'导购#不成交', u'导购#不理解', u'导购#开始', u'导购#成交', u'导购#更换',
                u'导购#结束', u'导购#详情', u'社交义务#不用谢', u'社交义务#接受道歉', u'社交义务#致谢',
                u'社交义务#道歉', u'社交义务#问候', u'表态#不满', u'表态#否定', u'表态#满意', u'表态#犹豫',
                u'表态#疑问', u'表态#肯定', u'表态#附和', u'表态#随便', u'闲聊#天气', u'闲聊#时间',
                u'闲聊#身份信息'
            ]
        elif version == 'v2.0_small':
            # 17分类标签
            index_to_label = [
                u'其它#骂人', u'导购#开始', u'导购#成交', u'导购#更换', u'导购#结束', u'导购#详情',
                u'表态#否定', u'表态#不满', u'表态#犹豫', u'表态#肯定', u'表态#附和', u'表态#随便',
                u'社交义务#不用谢', u'社交义务#问候', u'闲聊#天气', u'闲聊#时间', u'闲聊#身份信息'
            ]

        # print('类别数为:%d'%len(index_to_label))
        label_to_index = {
            label: idx
            for idx, label in enumerate(index_to_label)
        }
        return label_to_index, index_to_label

    def transform_word2vec_model_name(self, flag):
        """
            根据 flag 转换成完整的 word2vec 模型文件名

        :param flag:
        :return:
        """

        from data_processing_util.word2vec_util.word2vec_util import Word2vecUtil
        return Word2vecUtil().transform_word2vec_model_name(flag)

    def transform_dataset_name(self, flag):
        """
            将数据集标记转为真正的训练集和测试集文件名

        :param flag: 数据集标记,如 v1.0(S),v2.2(S),v2.2(Sa),v2.2(L),v2.3(S),v2.3(Sa),v2.3(L)
        :type flag: str
        :return: train_data_file_path,test_data_file_path
        """

        if flag == 'v1.0(S)':
            # 使用v1.0 S版本(20160526)的数据集
            train_data_file_path = os.path.join(self.dataset_root_path,
                                                '20160526/train_all.csv')
            test_data_file_path = os.path.join(self.dataset_root_path,
                                               '20160526/ood_labeled.csv')
        elif flag == 'v2.2(L)':
            # 使用v2.2 L版本的数据集
            train_data_file_path = os.path.join(self.dataset_root_path,
                                                'v2.2/v2.2_train_L_2302.csv')
            test_data_file_path = os.path.join(self.dataset_root_path,
                                               'v2.2/v2.2_test_L_76.csv')
        elif flag == 'v2.2(S)':
            # 使用v2.2 S版本的数据集
            train_data_file_path = os.path.join(self.dataset_root_path,
                                                'v2.2/v2.2_train_S_1518.csv')
            test_data_file_path = os.path.join(self.dataset_root_path,
                                               'v2.2/v2.2_test_S_131.csv')
        elif flag == 'v2.2(Sa)':
            # 使用v2.2 Sa版本的数据集
            train_data_file_path = os.path.join(self.dataset_root_path,
                                                'v2.2/v2.2_train_Sa_893.csv')
            test_data_file_path = os.path.join(self.dataset_root_path,
                                               'v2.2/v2.2_test_Sa_79.csv')
            # else:
            #     如果匹配不上,则使用v2.2 Sa版本的数据集
            # train_data_file_path = self.dataset_root_path , 'v2.2/v2.2_train_L_2302.csv'
            # test_data_file_path = self.dataset_root_path , 'v2.2/v2.2_test_L_76.csv'
        elif flag == 'v2.3(L)':
            # 使用v2.3 L版本的数据集
            train_data_file_path = os.path.join(self.dataset_root_path,
                                                'v2.3/v2.3_train_L_2300.csv')
            test_data_file_path = os.path.join(self.dataset_root_path,
                                               'v2.3/v2.3_test_L_76.csv')
        elif flag == 'v2.3(S)':
            # 使用v2.3 S版本的数据集
            train_data_file_path = os.path.join(self.dataset_root_path,
                                                'v2.3/v2.3_train_S_1518.csv')
            test_data_file_path = os.path.join(self.dataset_root_path,
                                               'v2.3/v2.3_test_S_131.csv')
        elif flag == 'v2.3(Sa)':
            # 使用v2.3 Sa版本的数据集
            train_data_file_path = os.path.join(self.dataset_root_path,
                                                'v2.3/v2.3_train_Sa_891.csv')
            test_data_file_path = os.path.join(self.dataset_root_path,
                                               'v2.3/v2.3_test_Sa_79.csv')
        else:
            # 如果匹配不上,则使用v2.3 L版本的数据集
            train_data_file_path = os.path.join(self.dataset_root_path,
                                                'v2.3/v2.3_train_L_2300.csv')
            test_data_file_path = os.path.join(self.dataset_root_path,
                                               'v2.3/v2.3_test_L_76.csv')

        return train_data_file_path, test_data_file_path

    def merge_to_17class(self, data):
        '''
            将新版数据集合并成17个类别

        :param data:
        :return:
        '''

        data.loc[data['LABEL'] == u'导购#不理解', 'LABEL'] = u'其它#其它'
        data.loc[data['LABEL'] == u'表态#疑问', 'LABEL'] = u'其它#其它'
        data.loc[data['LABEL'] == u'表态#满意', 'LABEL'] = u'表态#肯定'
        data.loc[data['LABEL'] == u'导购#不成交', 'LABEL'] = u'导购#结束'
        data.loc[data['LABEL'] == u'社交义务#接受道歉', 'LABEL'] = u'导购#结束'
        data.loc[data['LABEL'] == u'社交义务#致谢', 'LABEL'] = u'导购#结束'
        data.loc[data['LABEL'] == u'社交义务#道歉', 'LABEL'] = u'导购#结束'
        # print(','.join(data['LABEL'].unique()))
        # print(len(data['LABEL'].unique()))
        # quit()
        return data

    def load_train_test_data(self, config):
        """
            加载训练数据和测试数据,根据配置选择
            加载的文件中一定要有 LABEL 和 SENTENCE 字段

        :param config:
        :return:
        """

        logging.debug('=' * 20)

        train_data_file_path, test_data_file_path = self.transform_dataset_name(
            config['dataset_type'])

        # -------------- print start : just print info -------------
        if config['verbose'] > 0:
            logging.debug('加载%s版本数据集的训练数据和测试数据\n标注版本:%s' %
                          (config['dataset_type'], config['label_version']))
            print('加载%s版本数据集的训练数据和测试数据\n标注版本:%s' %
                  (config['dataset_type'], config['label_version']))
            logging.debug('train_data_file_path:%s' % train_data_file_path)
            logging.debug('test_data_file_path:%s' % test_data_file_path)
            print('train_data_file_path:%s' % train_data_file_path)
            print('test_data_file_path:%s' % test_data_file_path)
        # -------------- print end : just print info -------------

        train_data = pd.read_csv(train_data_file_path,
                                 sep='\t',
                                 encoding='utf8',
                                 header=0)

        test_data = pd.read_csv(test_data_file_path,
                                sep='\t',
                                encoding='utf8',
                                header=0)

        if config['label_version'] == 'v2.0_small':
            train_data = self.merge_to_17class(train_data)
            test_data = self.merge_to_17class(test_data)

        if config['verbose'] > 0:
            logging.debug('fit data shape is :%s' % (str(train_data.shape)))
            print('fit data shape is :%s' % (str(train_data.shape)))

            logging.debug('test data shape is :%s' % (str(test_data.shape)))
            print('test data shape is :%s' % (str(test_data.shape)))
            logging.debug('-' * 20)
            # 去除类别 其他#其他
            logging.debug('去除类别 其他#其他 ID,其他#捣乱')
            print('去除类别 其它#其它 ID 其他#捣乱')

        filter_row = lambda x: x not in [u'其它#其它', u'其他#其他', u'ID', u'其它#捣乱']

        train_data['IS_FILTER'] = train_data['LABEL'].apply(filter_row)
        test_data['IS_FILTER'] = test_data['LABEL'].apply(filter_row)

        train_data = train_data[train_data['IS_FILTER'] == True]
        test_data = test_data[test_data['IS_FILTER'] == True]

        if config['verbose'] > 0:
            logging.debug('fit data shape is :%s' % (str(train_data.shape)))
            print('fit data shape is :%s' % (str(train_data.shape)))

            logging.debug('test data shape is :%s' % (str(test_data.shape)))
            print('test data shape is :%s' % (str(test_data.shape)))
            logging.debug('-' * 20)

        train_data = train_data[['LABEL', 'SENTENCE']]
        test_data = test_data[['LABEL', 'SENTENCE']]

        label_to_index, index_to_label = self.get_label_index(
            version=config['label_version'])
        if config['verbose'] > 0:
            logging.debug(u'总共类别数:%d,分别为:%s' %
                          (len(index_to_label), ','.join(index_to_label)))
            print(u'总共类别数:%d,分别为:%s' %
                  (len(index_to_label), ','.join(index_to_label)))

        train_data['LABEL_INDEX'] = train_data['LABEL'].map(label_to_index)

        test_data['LABEL_INDEX'] = test_data['LABEL'].map(label_to_index)

        return train_data, test_data

    def batch_segment_sentences(self, sentences):
        '''
            对多个句子批量分词

        :param sentences: array-like
        :return:
        '''
        self.jieba_util = Jieba_Util()
        segmented_sentences = map(self.segment_sentence, sentences)
        return segmented_sentences

    def segment_sentence(self, sentence):
        '''
            将句子进行分词

        :param sentence:
        :return:
        '''
        segmented_sentence = self.jieba_util.seg(
            sentence=sentence,
            sep=' ',
            full_mode=True,
            remove_stopword=False,
            replace_number=True,
            lowercase=True,
            zhs2zht=True,
            remove_url=True,
        )
        return segmented_sentence

    def save_data(self, data, path):
        '''
            保存DataFrame格式的数据

        :param data: 数据
        :param path: 数据文件的路径
        :return: None
        '''
        data.to_csv(
            path,
            sep='\t',
            header=True,
            index=False,
            encoding='utf8',
        )

    def save_result(self, data, predict, is_correct, path):
        '''
            将预测结果进行保存

        :param data: 数据,DataFrame
        :param predict: 预测结果
        :type predict: array-like
        :param is_correct: 是否正确
        :param path: 路径
        :return: None
        '''
        label_to_index, index_to_label = self.get_label_index()
        data['PREDICT'] = [index_to_label[item] for item in predict]
        data['is_correct'] = is_correct
        self.save_data(data, path)

    def load_data(self, path):
        '''
            加载DataFrame格式的数据

        :param path: 数据文件的路径
        :return: 加载的数据
        :rtype: pd.DataFrame
        '''
        data = pd.read_csv(
            path,
            sep='\t',
            header=0,
            encoding='utf8',
            index_col=0,
        )
        return data

    def get_k_fold_data(
        self,
        k=5,
        data=None,
        rand_seed=0,
    ):
        '''
            将数据分为K-fold

        :param k:
        :param data:
        :type data: pd.DataFrame()
        :return:
        '''

        train_X = data['SENTENCE'].as_matrix()
        train_y = data['LABEL_INDEX'].as_matrix()

        cv_x = []
        cv_y = []
        for x, y in data_split_k_fold(k=k,
                                      data=(train_X, train_y),
                                      rand_seed=rand_seed):
            cv_x.append(x)
            cv_y.append(y)
        return cv_x, cv_y
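
A sketch of a typical driver for the DataUtil class above (an assumption, not from the original code): the config keys are the ones read in load_train_test_data(), and the dataset flag must be one handled by transform_dataset_name().

config = {'dataset_type': 'v2.3(S)',
          'label_version': 'v2.0',
          'verbose': 1}

data_util = DataUtil()
train_data, test_data = data_util.load_train_test_data(config)
cv_x, cv_y = data_util.get_k_fold_data(k=5, data=train_data, rand_seed=0)
print(len(cv_x))  # number of folds
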
Example #11
class FeatureEncoder(object):
    '''
        ## 简介
        BOW特征编码器:基于sklearn的CountVectorizer,TfidfVectorizer实现,将句子转成 BOW(计算)或者TFIDF编码。
        ## 目前支持两种粒度的切分: 字(word) 和 分词后的词(seg)
        包含以下主要函数:
            1. segment_sentence:对句子分词
            2. transform_sentence:buildin,对一个句子编码
            3. fit_transform:构建编码器并转换数据
            4. transform: 转换数据
            5. print_sentence_length_detail: todo,打印训练库句子详情.
            6. print_model_descibe: 打印模型的详情.

    '''
    def __init__(
            self,
            # rand_seed=1337,
            verbose=0,
            need_segmented=True,
            full_mode=True,
            remove_stopword=True,
            replace_number=True,
            lowercase=True,
            zhs2zht=True,
            remove_url=True,
            feature_method='bow',
            feature_type='seg',
            max_features=None,
            word2vec_to_solve_oov=False,
            save_middle_result=False,
            **kwargs):
        '''
            1. 初始化参数,并验证参数合法性
            2. build feature encoder

            :param verbose: 数值越大,输出越详细
            :type verbose: int
            :param need_segmented: 数据处理选项,是否需要经过分词处理;如果为False,那么输入的数据不需要分词,提供的数据的每个句子的每个词要以空格分割.比如: ['我 要 买 手机','你好','早上 好'];如果为True,提供原始输入句子即可,比如:['我要买手机','你好','早上好'].
            :type need_segmented: bool
            :param full_mode: jieba分词选项,是否使用 full mode,默认为True
            :type full_mode: bool
            :param remove_stopword: jieba分词选项,是否去除 stop word,默认为True
            :type remove_stopword: bool
            :param replace_number: jieba分词选项,是否将数据统一替换成NUM,默认为True
            :type replace_number: bool
            :param lowercase: jieba分词选项,是否将字母统一转为小写,默认为True
            :type lowercase: bool
            :param zhs2zht: jieba分词选项,出現繁体的時候,是否转简体,默认为True
            :type zhs2zht: bool
            :param remove_url: jieba分词选项,是否移除 微博url,http://t.cn/开头的地址,默认为True
            :type remove_url: bool
            :param feature_method: 模型设置选项,选择 bow或者tfidf 特征计算方法
            :type feature_method: str
            :param feature_type: 模型设置选项,选择不同粒度的特征单位, 目前只支持 word,seg和 word_seg。
                - word:直接以字为单位,比如 我要买手机--->我 要 买 手 机
                - seg:分词后的词单元为单位,比如 我要买手机--->我 要 买 手机
                - word_seg:分词后的字和词为单位,比如 我要买手机--->我 要 买 手机 手 机
            :type feature_type: str
            :param max_features: 模型设置选项,特征选择的最大特征词数
            :type max_features: int
            :param word2vec_to_solve_oov: 使用word2vec扩展oov词
            :type word2vec_to_solve_oov: bool
            :param save_middle_result: 是否保存中间结果,为了节约空间默认关闭!
            :type save_middle_result: bool
            :param kwargs: 支持 word2vec_model_file_path等
            :type kwargs: dict


        '''
        # self.rand_seed = rand_seed
        self.save_middle_result = save_middle_result
        self.verbose = verbose
        self.full_mode = full_mode
        self.remove_stopword = remove_stopword
        self.need_segmented = need_segmented
        self.replace_number = replace_number
        self.lowercase = lowercase
        self.zhs2zht = zhs2zht
        self.remove_url = remove_url
        self.feature_method = feature_method
        self.feature_type = feature_type
        self.max_features = max_features
        self.word2vec_to_solve_oov = word2vec_to_solve_oov
        self.kwargs = kwargs

        # 检验参数合法性
        assert self.feature_method in ['bow', 'tfidf'
                                       ], 'feature method 只能取: bow,tfidf'
        assert self.feature_type in ['word', 'seg', 'word_seg'
                                     ], 'feature type 只能取: word,seg和word_seg'

        if word2vec_to_solve_oov:
            assert 'word2vec_model_file_path' in kwargs, '请提供 属性 word2vec_model_file_path'
            # 加载word2vec模型
            w2v_util = Word2vecUtil()
            self.word2vec_model = w2v_util.load(
                kwargs.get('word2vec_model_file_path'))

        # 初始化jieba分词器
        if need_segmented:
            self.jieba_seg = Jieba_Util(verbose=self.verbose)

        # 特征编码器: bow or tf-idf transformer
        self.feature_encoder = None
        # 训练库提取出来的字典对象
        self.train_data_dict = None
        # 训练库提取出来的字典词汇列表
        self.vocabulary = None
        # 训练库提取出来的字典词汇个数
        self.vocabulary_size = None
        # 训练样例的个数
        self.train_data_count = 0

        # region 为了节约内存空间,实际运行中时,建议设置 save_middle_result = False(关闭中间结果的保存)
        if self.save_middle_result:
            # 原始训练数据
            self.train_data = None
            # 切完词的句子
            self.segmented_sentences = None
            # 训练句子特征
            self.train_features = None
            # endregion

            # word2vec 模型
            # self.word2vec_model = None

            # self.fit_transform()

    def segment_sentence(self, sentence):
        '''
        对句子进行分词,使用jieba分词

        :param sentence: 句子
        :type sentence: str
        :return: 分完词句子,以空格连接
        :rtype: str
        '''

        if self.feature_type == 'seg':
            segmented_sentence = self.jieba_seg.seg(
                sentence,
                sep=' ',
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )
        elif self.feature_type == 'word':
            # 将句子切分为 以字为单元 以空格分割
            # 1. 先使用jieba进行预处理,将数字替换等
            segmented_sentence = self.jieba_seg.iter_each_word(
                sentence,
                sep=' ',
                need_segmented=True,
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )
            # 2. 按字切分

        elif self.feature_type == 'word_seg':
            # 将句子切分为 以字和词为单元,相同则去重 以空格分割
            # 1. 先使用jieba进行预处理,将数字替换等
            segmented_sentence = self.jieba_seg.seg(
                sentence,
                sep=' ',
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )
            # print(segmented_sentence)
            # 2. 按字切分
            word = self.jieba_seg.iter_each_word(segmented_sentence,
                                                 sep=' ',
                                                 need_segmented=False).split()
            # 3. 按词切分
            seg = segmented_sentence.split()
            segmented_sentence = ' '.join(set(seg + word))
        else:
            assert False, '不支持其他粒度的切分!'

        return segmented_sentence

    def reset(self):
        """重置对象

        Returns
        -------

        """
        self.feature_encoder = None

    def fit_transform(self, train_data=None, test_data=None):
        """
            build feature encoder
                1. fit
                2. transform拟合数据

        :param train_data: 训练句子列表:['','',...,'']
        :type train_data: array-like.
        :return: train_data 编码后的向量
        """
        # 训练样例的个数
        self.train_data_count = len(train_data)

        return self.fit(train_data, test_data).transform(train_data)

    def fit(self, train_data=None, test_data=None):
        """
            build feature encoder
                1. 转换数据格式,并分词
                2. 构建vectorizer

        :param train_data: 训练句子列表:['','',...,'']
        :type train_data: array-like.
        :param test_data: 测试句子列表(当前实现中未使用,仅为接口兼容保留)
        :type test_data: array-like.
        :return: self,拟合完成的特征编码器
        """

        if self.verbose > 1:
            logging.debug('build feature encoder...')
            print('build feature encoder...')

        # -------------- region start : 1. 转换数据格式,并分词 -------------
        if self.verbose > 2:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('1. 转换数据格式,并分词')
            print('1. 转换数据格式,并分词')
        # -------------- code start : 开始 -------------

        assert train_data is not None, '没有输入训练数据!'

        train_data = np.asarray(train_data)
        # 为了节约内存空间,实际运行中时,建议设置 save_middle_result = False(关闭中间结果的保存)
        if self.save_middle_result:
            self.train_data = train_data

        if self.need_segmented:
            # 分词
            train_segmented_sentences = map(self.segment_sentence, train_data)
        else:
            # 不需要分词
            train_segmented_sentences = train_data

        # -------------- code start : 结束 -------------
        if self.verbose > 2:
            logging.debug('-' * 20)
            print('-' * 20)
        # -------------- region end : 1. 转换数据格式,并分词 ---------------
        if self.feature_encoder is None:
            # 当 feature_encoder 还没创建过时,则创建
            if self.feature_method == 'tfidf':
                self.feature_encoder = TfidfVectorizer(
                    analyzer="word",
                    token_pattern=u'(?u)\\b\\w+\\b',
                    tokenizer=None,
                    preprocessor=None,
                    lowercase=False,
                    stop_words=None,
                    # vocabulary = tfidf_vocabulary,
                    max_features=self.max_features,
                )

            elif self.feature_method == 'bow':
                self.feature_encoder = CountVectorizer(
                    analyzer="word",
                    token_pattern=u'(?u)\\b\\w+\\b',
                    tokenizer=None,
                    preprocessor=None,
                    lowercase=False,
                    stop_words=None,
                    # vocabulary = tfidf_vocabulary,
                    max_features=self.max_features,
                )
            else:
                raise NotImplementedError

            self.feature_encoder.fit_transform(
                train_segmented_sentences).toarray()

        # 为了节约内存空间,实际运行中时,建议设置 save_middle_result = False(关闭中间结果的保存)
        # if self.save_middle_result:
        #     self.train_features = train_features

        # 字典
        self.vocabulary = self.feature_encoder.get_feature_names()
        # 字典个数
        self.vocabulary_size = len(self.vocabulary)

        return self

    def word_similarity(self, word2vec_model, word1, word2):
        '''
        计算两个词的相似性

        Parameters
        ----------
        word2vec_model : gensim object
            word2vec_model gensim Word2Vec model
        word1 : str
        word2 : str

        Returns
        --------
            similarity score: float
        '''
        try:
            return word2vec_model.n_similarity(word1, word2)
        except:
            return 0

    def replace_oov_with_similay_word(self, word2vec_model, sentence):
        '''
            对句子中oov词使用训练库中最相近的词替换(word2vec余弦相似性)

        :param sentence:
        :return:
        '''

        # is_oov = np.asarray([item for item in self.feature_encoder.vocabulary])
        # has_oov = any(is_oov)
        sentence = sentence.split()
        oov_word = []
        replace_word = []
        for item in sentence:
            if item not in self.vocabulary:
                oov_word.append(item)
                keywords_sim_score = np.asarray([
                    self.word_similarity(word2vec_model, item, i)
                    for i in self.vocabulary
                ])
                sorted_index = np.argsort(keywords_sim_score)[-1::-1]
                most_similarity_score = keywords_sim_score[sorted_index[0]]
                most_similarity_word = self.vocabulary[sorted_index[0]]
                if self.verbose > 1:
                    print(u'%s 最相近的词是%s,分数为:%f' %
                          (item, most_similarity_word, most_similarity_score))
                replace_word.append(most_similarity_word)
        sentence += replace_word
        return ' '.join(sentence)

    def transform_sentence(
        self,
        sentence,
    ):
        '''
            转换一个句子的格式。跟训练数据一样的操作,对输入句子进行 bow或tfidf 编码。
                1. 分词
                2. 编码

        :param sentence: 输入句子,不用分词,进来后会有分词处理
        :type sentence: str
        :return: bow 或 tfidf 编码后的特征向量
        :rtype: array-like
        '''

        # region -------------- 1. 分词 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('1. 分词')
            print('1. 分词')
        # -------------- code start : 开始 -------------

        # 分词
        if self.need_segmented:
            seg_sentence = self.segment_sentence(sentence)
        else:
            seg_sentence = sentence

        if self.word2vec_to_solve_oov:
            seg_sentence = self.replace_oov_with_similay_word(
                self.word2vec_model, seg_sentence)

        # -------------- code start : 结束 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- 1. 分词 ---------------

        # region -------------- 2. 特征转换 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('2. 特征转换: 对分词后的句子进行 bow/tfidf 编码')
            print('2. 特征转换: 对分词后的句子进行 bow/tfidf 编码')
        # -------------- code start : 开始 -------------

        features = self.feature_encoder.transform([seg_sentence]).toarray()[0]

        # -------------- code start : 结束 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- 2. 特征转换 ---------------

        return features

    def transform(
        self,
        data,
    ):
        '''
            批量转换数据,跟 transform_sentence()一样的操作
                1. 直接调用 self.transform_sentence 进行处理

        :param data: 输入句子集合
        :type data: array-like
        :return: bow vector
        :rtype: array-like
        '''

        index = map(self.transform_sentence, data)
        # print(index[:5])

        return np.asarray(index)

    def print_model_descibe(self):
        '''
            打印模型参数详情

        :return: 参数设置详情
        :rtype: dict 或 {}
        '''
        import pprint
        detail = {
            'train_data_count': self.train_data_count,
            'need_segmented': self.need_segmented,
            'word2vec_to_solve_oov': self.word2vec_to_solve_oov,
            'vocabulary_size': self.vocabulary_size,
            'verbose': self.verbose,
            # 'rand_seed': self.rand_seed,
            'full_mode': self.full_mode,
            'remove_stopword': self.remove_stopword,
            'replace_number': self.replace_number,
            'lowercase': self.lowercase,
            'zhs2zht': self.zhs2zht,
            'remove_url': self.remove_url,
            'feature_method': self.feature_method,
            'feature_type': self.feature_type,
            'max_features': self.max_features,
        }
        pprint.pprint(detail)
        logging.debug(detail)
        return detail
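
A minimal usage sketch for the bow/tfidf FeatureEncoder above. The sample sentences and parameter values are illustrative assumptions, not taken from the original project; the calls themselves follow the fit_transform/transform/print_model_descibe methods defined in the class.

# Usage sketch (sample data and parameter values are assumptions for illustration only).
train_data = [u'我要买手机', u'你好', u'早上好']
test_data = [u'我想买一台手机']

encoder = FeatureEncoder(
    verbose=0,
    need_segmented=True,     # raw sentences in, jieba segmentation happens inside
    feature_method='bow',    # or 'tfidf'
    feature_type='seg',
    max_features=2000,
)

train_X = encoder.fit_transform(train_data)   # shape: (n_train, vocabulary_size)
test_X = encoder.transform(test_data)         # encoded with the vocabulary fitted above
print(train_X.shape, test_X.shape)
encoder.print_model_descibe()
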
Exemplo n.º 12
0
 def __init__(self):
     jutil = Jieba_Util(verbose=0)
     self.remove_sentence_punctuation = lambda x:jutil.seg(x,sep='',remove_url=False)
Exemplo n.º 13
0
    def __init__(self,
                 need_segmented=True,
                 verbose=0,
                 full_mode=True,
                 feature_type='seg',
                 remove_stopword=True,
                 replace_number=True,
                 lowercase=True,
                 zhs2zht=True,
                 remove_url=True,
                 sentence_padding_length=7,
                 padding_mode='center',
                 add_unkown_word=True,
                 to_onehot_array=False,
                 word2vec_to_solve_oov=False,
                 **kwargs):
        """
            Onehot特征编码器,将句子转成 onehot 编码(以字典索引形式表示,补齐)
            1. 初始化参数
            2. build feature encoder

            :param need_segmented: 数据处理选项,是否需要经过分词处理;如果为False,那么输入的数据不需要分词,提供的数据的每个句子的每个词要以空格分割.比如: ['我 要 买 手机','你好','早上 好'];如果为True,提供原始输入句子即可,比如:['我要买手机','你好','早上好'].
            :type need_segmented: bool
            :param verbose: 数值越大,输出越详细
            :type verbose: int
            :param full_mode: jieba分词选项,是否使用 full mode,默认为True
            :type full_mode: bool
            :param feature_type: 模型设置选项,选择不同粒度的特征单位, 目前只支持 word,seg和 word_seg。
                - word:直接以字为单位,比如 我要买手机--->我 要 买 手 机
                - seg:分词后的词单元为单位,比如 我要买手机--->我 要 买 手机
                - word_seg:分词后的字和词为单位,去重,比如 我要买手机--->我 要 买 手机 手 机
                - word_seg_concat:分词后的字和词为单位,不去重,比如 我要买手机--->我 要 买 手 机 我 要 买 手机
            :type feature_type: str
            :param remove_stopword: jieba分词选项,是否去除 stop word,默认为True
            :type remove_stopword: bool
            :param replace_number: jieba分词选项,是否将数据统一替换成NUM,默认为True
            :type replace_number: bool
            :param lowercase: jieba分词选项,是否将字母统一转为小写,默认为True
            :type lowercase: bool
            :param zhs2zht: jieba分词选项,出現繁体的時候,是否转简体,默认为True
            :type zhs2zht: bool
            :param remove_url: jieba分词选项,是否移除 微博url,http://t.cn/开头的地址,默认为True
            :type remove_url: bool
            :param add_unkown_word: 训练库字典的设置选项,是否在字典中增加一个未知词字符(UNKOWN)
            :type add_unkown_word: bool
            :param sentence_padding_length:  句子的补齐(截断)长度,默认为7
            :type sentence_padding_length: int
            :param padding_mode:  句子的补齐(截断)模式,有四种模式:
                                        1. center:如果小于sentence_padding_length的话往两边补0;如果超出sentence_padding_length的话,直接在后面截断。
                                        2. left:如果小于sentence_padding_length的话往左边补0;如果超出sentence_padding_length的话,直接在后面截断。
                                        3. right:如果小于sentence_padding_length的话往右边补0;如果超出sentence_padding_length的话,直接在后面截断。
                                        4. none:不补齐。
            :type padding_mode: str
            :param to_onehot_array: 输出 onehot array,还是字典索引 array,默认为False,输出字典索引,
            :type to_onehot_array: bool
            :param word2vec_to_solve_oov: 使用word2vec扩展oov词
            :type word2vec_to_solve_oov: bool
            :param kwargs:
                - word2vec_model_file_path:
                - vocabulary_including_test_set: (default,True)
                - update_dictionary: (default,True)
                - 等

        """
        self.full_mode = full_mode
        self.feature_type = feature_type
        self.remove_stopword = remove_stopword
        self.verbose = verbose
        self.need_segmented = need_segmented
        self.replace_number = replace_number
        self.lowercase = lowercase
        self.zhs2zht = zhs2zht
        self.remove_url = remove_url
        self.add_unkown_word = add_unkown_word
        self.sentence_padding_length = sentence_padding_length
        self.padding_mode = padding_mode
        self.to_onehot_array = to_onehot_array
        self.word2vec_to_solve_oov = word2vec_to_solve_oov
        self.kwargs = kwargs

        # 检验参数合法性
        assert self.padding_mode in [
            'center', 'left', 'right', 'none'
        ], 'padding mode 只能取: center,left,right,none'
        assert self.feature_type in [
            'word', 'seg', 'word_seg', 'word_seg_concat'
        ], 'feature type 只能取: word,seg,word_seg和word_seg_concat'

        # 初始化jieba分词器
        if need_segmented:
            self.jieba_seg = Jieba_Util(verbose=self.verbose)
        # 训练库提取出来的字典对象
        self.train_data_dict = None
        # 训练库提取出来的字典词汇列表
        self.vocabulary = None
        # 训练库提取出来的字典词汇个数
        self.vocabulary_size = None
        # UNKOWN字符的索引
        self.unknow_token_index = None
        # PADDING字符的索引
        self.padding_token_index = None

        # region NOTE: 这些变量不再维护,因为消耗内存
        # 原始训练数据
        # self.train_data = None
        # 切完词的句子
        # self.segmented_sentences = None
        # 训练库句子的字典索引形式
        # self.train_index = None
        # 训练库句子的补齐的字典索引形式
        # self.train_padding_index = None
        # 训练库句子装成onehot array
        # endregion
        self.train_onehot_array = None
        # word2vec 模型
        self.word2vec_model = None
        if word2vec_to_solve_oov:
            assert kwargs.has_key(
                'word2vec_model_file_path'), '请提供 属性 word2vec_model_file_path'
            # 加载word2vec模型
            w2v_util = Word2vecUtil()
            self.word2vec_model = w2v_util.load(
                kwargs.get('word2vec_model_file_path'))

        if verbose > 1:
            logging.debug('build feature encoder...')
            print('build feature encoder...')
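
A hedged construction sketch for the onehot FeatureEncoder constructor above, showing how the word2vec OOV option is wired through **kwargs. The model path is a placeholder, and the snippet is Python 2 style because the constructor relies on dict.has_key.

# Construction sketch; '/path/to/word2vec.model' is a placeholder, not a real path.
encoder = FeatureEncoder(
    need_segmented=True,
    feature_type='word_seg',
    sentence_padding_length=7,
    padding_mode='center',
    word2vec_to_solve_oov=True,
    word2vec_model_file_path='/path/to/word2vec.model',  # consumed via kwargs
)
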
Exemplo n.º 14
0
 def __init__(self):
     # 初始化jieba工具
     self.jieba_util = Jieba_Util()
Exemplo n.º 15
0
    def __init__(
            self,
            # rand_seed=1337,
            verbose=0,
            need_segmented=True,
            full_mode=True,
            remove_stopword=True,
            replace_number=True,
            lowercase=True,
            zhs2zht=True,
            remove_url=True,
            feature_method='bow',
            feature_type='seg',
            max_features=None,
            word2vec_to_solve_oov=False,
            save_middle_result=False,
            **kwargs):
        '''
            1. 初始化参数,并验证参数合法性
            2. build feature encoder

            :param verbose: 数值越大,输出越详细
            :type verbose: int
            :param need_segmented: 数据处理选项,是否需要经过分词处理;如果为False,那么输入的数据不需要分词,提供的数据的每个句子的每个词要以空格分割.比如: ['我 要 买 手机','你好','早上 好'];如果为True,提供原始输入句子即可,比如:['我要买手机','你好','早上好'].
            :type need_segmented: bool
            :param full_mode: jieba分词选项,是否使用 full mode,默认为True
            :type full_mode: bool
            :param remove_stopword: jieba分词选项,是否去除 stop word,默认为True
            :type remove_stopword: bool
            :param replace_number: jieba分词选项,是否将数据统一替换成NUM,默认为True
            :type replace_number: bool
            :param lowercase: jieba分词选项,是否将字母统一转为小写,默认为True
            :type lowercase: bool
            :param zhs2zht: jieba分词选项,出現繁体的時候,是否转简体,默认为True
            :type zhs2zht: bool
            :param remove_url: jieba分词选项,是否移除 微博url,http://t.cn/开头的地址,默认为True
            :type remove_url: bool
            :param feature_method: 模型设置选项,选择 bow或者tfidf 特征计算方法
            :type feature_method: str
            :param feature_type: 模型设置选项,选择不同粒度的特征单位, 目前只支持 word,seg和 word_seg。
                - word:直接以字为单位,比如 我要买手机--->我 要 买 手 机
                - seg:分词后的词单元为单位,比如 我要买手机--->我 要 买 手机
                - word_seg:分词后的字和词为单位,比如 我要买手机--->我 要 买 手机 手 机
            :type feature_type: str
            :param max_features: 模型设置选项,特征选择的最大特征词数
            :type max_features: int
            :param word2vec_to_solve_oov: 使用word2vec扩展oov词
            :type word2vec_to_solve_oov: bool
            :param save_middle_result: 是否保存中间结果,为了节约空间默认关闭!
            :type save_middle_result: bool
            :param kwargs: 支持 word2vec_model_file_path等
            :type kwargs: dict


        '''
        # self.rand_seed = rand_seed
        self.save_middle_result = save_middle_result
        self.verbose = verbose
        self.full_mode = full_mode
        self.remove_stopword = remove_stopword
        self.need_segmented = need_segmented
        self.replace_number = replace_number
        self.lowercase = lowercase
        self.zhs2zht = zhs2zht
        self.remove_url = remove_url
        self.feature_method = feature_method
        self.feature_type = feature_type
        self.max_features = max_features
        self.word2vec_to_solve_oov = word2vec_to_solve_oov
        self.kwargs = kwargs

        # 检验参数合法性
        assert self.feature_method in ['bow', 'tfidf'
                                       ], 'feature method 只能取: bow,tfidf'
        assert self.feature_type in ['word', 'seg', 'word_seg'
                                     ], 'feature type 只能取: word,seg和word_seg'

        if word2vec_to_solve_oov:
            # 加载word2vec模型
            if word2vec_to_solve_oov:
                assert kwargs.has_key('word2vec_model_file_path'
                                      ), '请提供 属性 word2vec_model_file_path'
                # 加载word2vec模型
                w2v_util = Word2vecUtil()
                self.word2vec_model = w2v_util.load(
                    kwargs.get('word2vec_model_file_path'))

        # 初始化jieba分词器
        if need_segmented:
            self.jieba_seg = Jieba_Util(verbose=self.verbose)

        # 特征编码器: bow or tf-idf transformer
        self.feature_encoder = None
        # 训练库提取出来的字典对象
        self.train_data_dict = None
        # 训练库提取出来的字典词汇列表
        self.vocabulary = None
        # 训练库提取出来的字典词汇个数
        self.vocabulary_size = None
        # 训练样例的个数
        self.train_data_count = 0

        # region 为了节约内存空间,实际运行中时,建议设置 save_middle_result = False(关闭中间结果的保存)
        if self.save_middle_result:
            # 原始训练数据
            self.train_data = None
            # 切完词的句子
            self.segmented_sentences = None
            # 训练句子特征
            self.train_features = None
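
The same OOV option also exists on this bow/tfidf constructor; a hedged sketch of enabling it (placeholder path, Python 2 because of dict.has_key):

encoder = FeatureEncoder(
    feature_method='tfidf',
    feature_type='seg',
    word2vec_to_solve_oov=True,
    word2vec_model_file_path='/path/to/word2vec.model',  # placeholder, read from kwargs
)
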
Exemplo n.º 16
0
"""
    Author:  'jdwang'
    Date:    'create date: 2016-07-16'
    Email:   '*****@*****.**'
    Describe: 
"""
from __future__ import print_function

import numpy as np
import pandas as pd
import logging
import timeit

from data_processing_util.jiebanlp.jieba_util import Jieba_Util

jutil = Jieba_Util(verbose=0)
remove_sentence_punctuation = lambda x: jutil.seg(x, sep='', remove_url=False)

# 统计 进入协处理的对话段数

ch2r_dialogue_file_path = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/ood_dataset/dev_vesion/ch2r_test_dataset/start-20150613测试集/data/dialogue_usersentence_ge_1.csv'

ch2r_dialogue = pd.read_csv(
    ch2r_dialogue_file_path,
    sep='\t',
    encoding='utf8',
    header=0,
)

user_sentence = ch2r_dialogue[ch2r_dialogue['Name'] != 'Ch2R']
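
A hedged sketch of the counting this script's comment promises. The dialogue-id column name below is an assumption; the CSV schema beyond the 'Name' column is not shown here.

num_user_sentences = len(user_sentence)
print('用户句子数: %d' % num_user_sentences)

# 'DialogueID' is a hypothetical column name used only for illustration.
if 'DialogueID' in user_sentence.columns:
    print('对话段数: %d' % user_sentence['DialogueID'].nunique())
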
Exemplo n.º 17
0
class DataUtil(object):
    def __init__(self):
        # 训练数据的根目录
        self.dataset_root_path = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/ood_dataset/stable_vesion/'
        self.word2vec_model_root_path = '/home/jdwang/PycharmProjects/corprocessor/word2vec/vector/'
        self.jieba_util = None

    def get_label_index(self,version='v2.0'):
        """
            获取 DA 分类类别的列表,总共有24类

        :return: label_to_index,index_to_label
        """
        if version == 'v1.0':
            # 16分类标签
            index_to_label = [
                u'捣乱#骂人',

                u'导购#开始',u'导购#成交',
                u'导购#更换',u'导购#详情',

                u'表态#附和',u'表态#否定',
                u'表态#犹豫',u'表态#肯定',u'表态#否定#不满',u'表态#随便',

                u'闲聊#身份信息',u'闲聊#天气',
                u'闲聊#问候',u'闲聊#时间',u'闲聊#结束语',
            ]
        elif version=='v2.0':
            # 24分类标签
            index_to_label = [
                u'其它#骂人',

                u'导购#不成交',u'导购#不理解',u'导购#开始',
                u'导购#成交',u'导购#更换',u'导购#结束',u'导购#详情',

                u'表态#不满',u'表态#否定',u'表态#满意',
                u'表态#犹豫',u'表态#疑问',u'表态#肯定',u'表态#附和',u'表态#随便',

                u'社交义务#不用谢',u'社交义务#接受道歉',u'社交义务#致谢',
                u'社交义务#道歉',u'社交义务#问候',

                u'闲聊#天气',u'闲聊#时间',u'闲聊#身份信息'
            ]
        elif version=='v2.0++':
            # 24分类标签 ++所用版本,临时用
            index_to_label = [
                u'其它#骂人',

                u'导购#不成交',u'导购#不理解',u'导购#开始',
                u'导购#成交',u'导购#更换',u'导购#结束',u'导购#详情',

                u'社交义务#不用谢',u'社交义务#接受道歉',u'社交义务#致谢',
                u'社交义务#道歉',u'社交义务#问候',

                u'表态#不满',u'表态#否定',u'表态#满意',
                u'表态#犹豫',u'表态#疑问',u'表态#肯定',u'表态#附和',u'表态#随便',

                u'闲聊#天气',u'闲聊#时间',u'闲聊#身份信息'
            ]
        elif version=='v2.0_small':
            # 17分类标签
            index_to_label = [
                u'其它#骂人',

                u'导购#开始',
                u'导购#成交',u'导购#更换',u'导购#结束',u'导购#详情',

                u'表态#否定',u'表态#不满',
                u'表态#犹豫',u'表态#肯定',u'表态#附和',u'表态#随便',

                u'社交义务#不用谢',
                u'社交义务#问候',

                u'闲聊#天气',u'闲聊#时间',u'闲聊#身份信息'
            ]

        # print('类别数为:%d'%len(index_to_label))
        label_to_index = {label: idx for idx, label in enumerate(index_to_label)}
        return label_to_index,index_to_label

    def transform_word2vec_model_name(self,flag):
        '''
            根据 flag 转换成完整的 word2vec 模型文件名

        :param flag:
        :return:
        '''

        from data_processing_util.word2vec_util.word2vec_util import Word2vecUtil
        return Word2vecUtil().transform_word2vec_model_name(flag)



    def transform_dataset_name(self,flag):
        """
            将数据集标记转为真正的训练集和测试集文件名

        :param flag: 数据集标记 v1.0(S),v2.2(S),v2.2(Sa),v2.2(L),v2.3(S)
        :type flag: str
        :return: train_data_file_path,test_data_file_path
        """

        if flag == 'v1.0(S)':
            # 使用 v1.0 S 版本的数据集
            train_data_file_path = self.dataset_root_path + '20160526/train_all.csv'
            test_data_file_path = self.dataset_root_path + '20160526/ood_labeled.csv'
        elif flag == 'v2.2(L)':
            # 使用v2.2 L版本的数据集
            train_data_file_path = self.dataset_root_path + 'v2.2/v2.2_train_L_2302.csv'
            test_data_file_path = self.dataset_root_path + 'v2.2/v2.2_test_L_76.csv'
        elif flag == 'v2.2(S)':
            # 使用v2.2 S版本的数据集
            train_data_file_path = self.dataset_root_path + 'v2.2/v2.2_train_S_1518.csv'
            test_data_file_path = self.dataset_root_path + 'v2.2/v2.2_test_S_131.csv'
        elif flag == 'v2.2(Sa)':
            # 使用v2.2 Sa版本的数据集
            train_data_file_path = self.dataset_root_path + 'v2.2/v2.2_train_Sa_893.csv'
            test_data_file_path = self.dataset_root_path + 'v2.2/v2.2_test_Sa_79.csv'
        # else:
        #     如果匹配不上,则使用v2.2 Sa版本的数据集
            # train_data_file_path = self.dataset_root_path + 'v2.2/v2.2_train_L_2302.csv'
            # test_data_file_path = self.dataset_root_path + 'v2.2/v2.2_test_L_76.csv'
        elif flag == 'v2.3(L)':
            # 使用v2.3 L版本的数据集
            train_data_file_path = self.dataset_root_path + 'v2.3/v2.3_train_L_2300.csv'
            test_data_file_path = self.dataset_root_path + 'v2.3/v2.3_test_L_76.csv'
        elif flag == 'v2.3(S)':
            # 使用v2.3 S版本的数据集
            train_data_file_path = self.dataset_root_path + 'v2.3/v2.3_train_S_1518.csv'
            test_data_file_path = self.dataset_root_path + 'v2.3/v2.3_test_S_131.csv'
        elif flag == 'v2.3(Sa)':
            # 使用v2.3 Sa版本的数据集
            train_data_file_path = self.dataset_root_path + 'v2.3/v2.3_train_Sa_891.csv'
            test_data_file_path = self.dataset_root_path + 'v2.3/v2.3_test_Sa_79.csv'
        else:
            # 如果匹配不上,则默认使用 v2.3 L 版本的数据集
            train_data_file_path = self.dataset_root_path + 'v2.3/v2.3_train_L_2300.csv'
            test_data_file_path = self.dataset_root_path + 'v2.3/v2.3_test_L_76.csv'

        return train_data_file_path,test_data_file_path

    def merge_to_17class(self,data):
        '''
            将新版数据集合并成17个类别

        :param data:
        :return:
        '''

        data.loc[data['LABEL']==u'导购#不理解','LABEL'] = u'其它#其它'
        data.loc[data['LABEL']==u'表态#疑问','LABEL'] = u'其它#其它'
        data.loc[data['LABEL']==u'表态#满意','LABEL'] = u'表态#肯定'
        data.loc[data['LABEL']==u'导购#不成交','LABEL'] = u'导购#结束'
        data.loc[data['LABEL']==u'社交义务#接受道歉','LABEL'] = u'导购#结束'
        data.loc[data['LABEL']==u'社交义务#致谢','LABEL'] = u'导购#结束'
        data.loc[data['LABEL']==u'社交义务#道歉','LABEL'] = u'导购#结束'
        # print(','.join(data['LABEL'].unique()))
        # print(len(data['LABEL'].unique()))
        # quit()
        return data

    def load_train_test_data(self,config):
        """
            加载训练数据和测试数据,根据配置选择
            加载的文件中一定要有 LABEL 和 SENTENCE 字段

        :param config:
        :return:
        """

        logging.debug('=' * 20)

        train_data_file_path, test_data_file_path = self.transform_dataset_name(config['dataset_type'])

        # -------------- print start : just print info -------------
        if config['verbose'] > 0 :
            logging.debug('加载%s版本数据集的训练数据和测试数据\n标注版本:%s'%(config['dataset_type'],config['label_version']))
            print('加载%s版本数据集的训练数据和测试数据\n标注版本:%s'%(config['dataset_type'],config['label_version']))
            logging.debug('train_data_file_path:%s'%train_data_file_path)
            logging.debug('test_data_file_path:%s'%test_data_file_path)
            print('train_data_file_path:%s'%train_data_file_path)
            print('test_data_file_path:%s'%test_data_file_path)
        # -------------- print end : just print info -------------

        train_data = pd.read_csv(
            train_data_file_path,
            sep='\t',
            encoding='utf8',
            header=0
        )

        test_data = pd.read_csv(
            test_data_file_path,
            sep='\t',
            encoding='utf8',
            header=0
        )


        if config['label_version']=='v2.0_small':
            train_data = self.merge_to_17class(train_data)
            test_data = self.merge_to_17class(test_data)

        if config['verbose'] > 0:
            logging.debug('fit data shape is :%s' % (str(train_data.shape)))
            print('fit data shape is :%s' % (str(train_data.shape)))

            logging.debug('test data shape is :%s' % (str(test_data.shape)))
            print('test data shape is :%s' % (str(test_data.shape)))
            logging.debug('-' * 20)
            # 去除类别 其他#其他
            logging.debug('去除类别: 其它#其它、其他#其他、ID、其它#捣乱')
            print('去除类别: 其它#其它、其他#其他、ID、其它#捣乱')

        filter_row = lambda x: x not in [u'其它#其它', u'其他#其他', u'ID',u'其它#捣乱']

        train_data['IS_FILTER'] = train_data['LABEL'].apply(filter_row)
        test_data['IS_FILTER'] = test_data['LABEL'].apply(filter_row)

        train_data = train_data[train_data['IS_FILTER']]
        test_data = test_data[test_data['IS_FILTER']]

        if config['verbose'] > 0:
            logging.debug('fit data shape is :%s' % (str(train_data.shape)))
            print('fit data shape is :%s' % (str(train_data.shape)))

            logging.debug('test data shape is :%s' % (str(test_data.shape)))
            print('test data shape is :%s' % (str(test_data.shape)))
            logging.debug('-' * 20)

        train_data = train_data[['LABEL', 'SENTENCE']]
        test_data = test_data[['LABEL', 'SENTENCE']]

        label_to_index,index_to_label =self.get_label_index(version=config['label_version'])
        if config['verbose'] > 0:
            logging.debug(u'总共类别数:%d,分别为:%s' % (len(index_to_label), ','.join(index_to_label)))
            print(u'总共类别数:%d,分别为:%s' % (len(index_to_label), ','.join(index_to_label)))

        train_data['LABEL_INDEX'] = train_data['LABEL'].map(label_to_index)

        test_data['LABEL_INDEX'] = test_data['LABEL'].map(label_to_index)

        return train_data,test_data


    def batch_segment_sentences(self,sentences):
        '''
            对多个句子批量分词

        :param sentences: array-like
        :return:
        '''
        self.jieba_util = Jieba_Util()
        segmented_sentences = map(self.segment_sentence,sentences)
        return segmented_sentences


    def segment_sentence(self,sentence):
        '''
            将句子进行分词

        :param sentence:
        :return:
        '''
        segmented_sentence = self.jieba_util.seg(sentence=sentence,
                                                 sep=' ',
                                                 full_mode=True,
                                                 remove_stopword=False,
                                                 replace_number=True,
                                                 lowercase=True,
                                                 zhs2zht=True,
                                                 remove_url=True,
                                                 )
        return segmented_sentence


    def save_data(self,data,path):
        '''
            保存DataFrame格式的数据

        :param data: 数据
        :param path: 数据文件的路径
        :return: None
        '''
        data.to_csv(path,
                    sep='\t',
                    header=True,
                    index=False,
                    encoding='utf8',
                    )

    def save_result(self,data,predict,is_correct,path):
        '''
            将预测结果进行保存

        :param data: 数据,DataFrame
        :param predict: 预测结果
        :type predict: array-like
        :param is_correct: 是否正确
        :param path: 路径
        :return: None
        '''
        label_to_index, index_to_label = self.get_label_index()
        data['PREDICT'] = [index_to_label[item] for item in predict]
        data['is_correct'] = is_correct
        self.save_data(data,path)


    def load_data(self,path):
        '''
            加载DataFrame格式的数据

        :param path: 数据文件的路径
        :return: 加载后的数据
        :rtype: pd.DataFrame
        '''
        data = pd.read_csv(path,
                           sep='\t',
                           header=0,
                           encoding='utf8',
                           index_col=0,
                           )
        return data

    def get_k_fold_data(self,
                        k=5,
                        data=None,
                        rand_seed = 0,
                        ):
        '''
            将数据分为K-fold

        :param k:
        :param data:
        :type data: pd.DataFrame()
        :return:
        '''

        train_X = data['SENTENCE'].as_matrix()
        train_y = data['LABEL_INDEX'].as_matrix()

        cv_x = []
        cv_y = []
        for x, y in data_split_k_fold(k=k, data=(train_X, train_y), rand_seed=rand_seed):
            cv_x.append(x)
            cv_y.append(y)
        return cv_x,cv_y
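
A usage sketch for this DataUtil: the config keys match what load_train_test_data() reads; the concrete values are illustrative only.

data_util = DataUtil()
config = {
    'dataset_type': 'v2.3(Sa)',
    'label_version': 'v2.0',
    'verbose': 1,
}
train_data, test_data = data_util.load_train_test_data(config)
cv_x, cv_y = data_util.get_k_fold_data(k=5, data=train_data, rand_seed=0)
print(len(cv_x), len(cv_y))   # k folds of sentences and label indices
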
Exemplo n.º 18
0
class DataUtil(object):
    '''
        微博立场分析数据处理工具类,包含以下函数:
            1. load_data:加载csv格式的数据
            2. save_data:保存csv格式的数据
            3. print_data_detail: 打印数据详情
            4. processing_na_value:处理空值数据
            5. segment_sentence:分词
            6. split_train_test:切分训练集和测试集
            7.
    '''
    def __init__(self):
        # 初始化jieba工具
        self.jieba_util = Jieba_Util()

    def load_data(self,path,header=True):
        '''
            读取数据
        :param path: 数据文件的路径
        :return:
        '''
        if header:
            data = pd.read_csv(path,
                               sep='\t',
                               header=0,
                               encoding='utf8',
                               )
        else:
            data = pd.read_csv(path,
                               sep='\t',
                               header=None,
                               encoding='utf8',
                               )
        return data

    def load_train_test_data(self,
                             config = None
                             ):
        '''
            加载训练数据和测试数据,以及标签索引

        :param config:  一些配置信息
        :param config: dict
        :return:
        '''

        # -------------- region start : 1. 加载训练集和测试集 -------------
        if config['verbose'] > 2:
            logging.debug('-' * 20)
            print '-' * 20
            logging.debug('1. 加载训练集和测试集')
            print '1. 加载训练集和测试集'
        # -------------- code start : 开始 -------------
        train_data_file_path = (config['train_data_file_path']) % config['train_data_type']
        test_data_file_path = (config['test_data_file_path']) % config['test_data_type']
        logging.debug(train_data_file_path)
        print train_data_file_path
        logging.debug(test_data_file_path)
        print test_data_file_path

        data_util = DataUtil()
        train_data = data_util.load_data(train_data_file_path)
        test_data = data_util.load_data(test_data_file_path)

        # -------------- code start : 结束 -------------
        if config['verbose'] > 2:
            logging.debug('-' * 20)
            print '-' * 20
        # -------------- region end : 1. 加载训练集和测试集 ---------------

        # 生成类别索引
        label_to_index = {u'FAVOR': 0, u'AGAINST': 1, u'NONE': 2}
        index_to_label = [u'FAVOR', u'AGAINST', u'NONE']

        return train_data,test_data,label_to_index,index_to_label

    def save_data(self,data,path):
        '''
            保存数据
        :param path: 数据文件的路径
        :return:
        '''
        data.to_csv(path,
                    sep='\t',
                    header=True,
                    index=False,
                    encoding='utf8',
                    )

    def print_data_detail(self, data, has_stance=True):
        '''
            展示数据的详细信息
        :param data: DataFrame对象
        :param has_stance: 是否有STANCE字段
        :return: 无
        '''

        logging.debug('data的个数为:%d' % (len(data)))
        logging.debug('data的sample数据:')
        logging.debug(data.head())

        logging.debug('data的target和个数分别为:')
        logging.debug(data['TARGET'].value_counts())
        if has_stance:
            logging.debug('统计每个Target下各个类型立场的数量...')
            group = data.groupby(by=['TARGET', 'STANCE'])
            logging.debug(group.count())
        else:
            logging.debug('没有STANCE字段')

        logging.debug('数据各个字段情况...')
        # print data.info()
        for column in data.columns:
            # 统计每个字段是否有数据是空串
            # 先将所有空字符串用nan替换
            data[column] = data[column].replace(r'^\s*$', np.nan, regex=True)
            count_null = sum(data[column].isnull())
            if count_null != 0:
                logging.warn(u'%s字段有空值,个数:%d,建议使用processing_na_value()方法进一步处理!' % (column, count_null))
                null_data_path = './result/null_data.csv'
                logging.warn(u'将缺失值数据输出到文件:%s' % (null_data_path))
                data[data[column].isnull()].to_csv(null_data_path,
                                                   index=None,
                                                   encoding='utf8',
                                                   sep='\t')

    def processing_na_value(self,data,clear_na=True,fill_na = False,fill_char = 'NULL',columns=None):
        '''
            处理数据的空值

        :param data:  DataFrame对象
        :param clear_na: bool,是否去掉空值数据
        :param fill_na: bool,是否填充空值
        :param fill_char: str,填充空值的字符
        :param columns: list,需要处理的字段,默认为None时,对所有字段处理
        :return: DataFrame对象
        '''
        logging.debug('[def processing_na_value()] 对缺失值进行处理....')
        for column in data.columns:
            if columns == None or column in columns:
                data[column] = data[column].replace(r'^\s*$', np.nan, regex=True)
                count_null = sum(data[column].isnull())
                if count_null != 0:
                    logging.warn(u'%s字段有空值,个数:%d' % (column, count_null))
                    if clear_na:
                        logging.warn(u'对数据的%s字段空值进行摘除'%(column))
                        data = data[data[column].notnull()].copy()
                    else:
                        if fill_na:
                            logging.warn(u'对数据的%s字段空值进行填充,填充字符为:%s'%(column,fill_char))
                            data[column] = data[column].fillna(value=fill_char)

        return data

    def segment_sentence(self,sentence):
        segmented_sentence = self.jieba_util.seg(sentence=sentence,
                                                 sep=' ',
                                                 full_mode=True,
                                                 remove_stopword=True,
                                                 replace_number=True,
                                                 lowercase = True,
                                                 zhs2zht=True,
                                                 remove_url=True,
                                                 )
        return segmented_sentence

    def split_train_test(self,data, train_split=0.7):
        '''
            将数据切分成训练集和验证集

        :param data:
        :param train_split: float,取值范围[0,1],设置训练集的比例
        :return: dev_data,test_data
        '''
        logging.debug('对数据随机切分成train和test数据集,比例为:%f' % (train_split))
        num_train = len(data)
        num_dev = int(num_train * train_split)
        num_test = num_train - num_dev
        logging.debug('全部数据、训练数据和测试数据的个数分别为:%d,%d,%d' % (num_train, num_dev, num_test))
        rand_list = np.random.RandomState(0).permutation(num_train)
        # print rand_list
        # print rand_list[:num_dev]
        # print rand_list[num_dev:]
        dev_data = data.iloc[rand_list[:num_dev]].sort_index()
        test_data = data.iloc[rand_list[num_dev:]].sort_index()
        # print dev_data
        # print test_data
        return dev_data, test_data


    def count_word_freq(self,data):
        '''
            统计每个词在各个类别中的次数,每个词有五个统计项:
                1. FAVOR: 在FAVOR类别中出现的次数
                2. AGAINST: 在AGAINST类别中出现的次数
                3. NONE: 在NONE类别中出现的次数
                4. FREQ: 在所有类别中出现的次数,即FAVOR+AGAINST+NONE
                5. SUPPORT: 最高词频项/FREQ

        :param data:
        :return:
        '''
        from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder

        feature_encoder = FeatureEncoder(train_data=data['WORDS'].as_matrix(),
                                         verbose=0,
                                         padding_mode='none',
                                         need_segmented=False,
                                         full_mode=True,
                                         remove_stopword=True,
                                         replace_number=True,
                                         lowercase=True,
                                         remove_url=True,
                                         sentence_padding_length=7,
                                         add_unkown_word=False,
                                         mask_zero=False,
                                         zhs2zht=True,
                                         )

        # print feature_encoder.train_padding_index
        train_X_features = feature_encoder.to_onehot_array()

        np.save('result/train_X_feature',train_X_features)

        print train_X_features.shape
        print train_X_features[:5]
        vocabulary = feature_encoder.vocabulary
        print ','.join(vocabulary)
        print feature_encoder.vocabulary_size
        np.save('result/vocabulary',vocabulary)

        freq = np.sum(train_X_features,axis=0)
        favor_freq = np.sum(train_X_features[data['STANCE'].as_matrix()==u'FAVOR'],axis=0)
        against_freq = np.sum(train_X_features[data['STANCE'].as_matrix()==u'AGAINST'],axis=0)
        none_freq = np.sum(train_X_features[data['STANCE'].as_matrix()==u'NONE'],axis=0)



        support = np.nan_to_num([max(favor,against,none)/(1.0*(favor+against+none)) for favor,against,none in zip(favor_freq,against_freq,none_freq)])
        print freq
        print favor_freq
        print against_freq
        print none_freq
        count_data = pd.DataFrame(data={
            u'WORD':vocabulary,
            u'FAVOR':favor_freq,
            u'AGAINST':against_freq,
            u'NONE':none_freq,
            u'SUPPORT':support,
            u'FREQ':freq,
        })
        count_data = count_data.sort_values(by=[u'SUPPORT',u'FREQ','WORD'],ascending=False)
        count_data = count_data[[u'WORD',u'FAVOR',u'AGAINST',u'NONE',u'FREQ',u'SUPPORT']]
        count_data.to_csv('result/word_count.csv',
                          sep='\t',
                          index=False,
                          header=True,
                          encoding='utf8',
                          )
        print count_data.head()
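
A usage sketch for the weibo stance DataUtil above (Python 2, matching its print statements). The input path and the 'TEXT' column name are assumptions for illustration; only TARGET, STANCE and WORDS appear in the code itself.

data_util = DataUtil()
data = data_util.load_data('./data/weibo_stance_train.csv')       # placeholder path
data_util.print_data_detail(data, has_stance=True)
data = data_util.processing_na_value(data, clear_na=True)
data['WORDS'] = data['TEXT'].apply(data_util.segment_sentence)    # 'TEXT' is a hypothetical column
dev_data, test_data = data_util.split_train_test(data, train_split=0.7)
data_util.count_word_freq(dev_data)
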
Exemplo n.º 19
0
    def __init__(self,
                 need_segmented=True,
                 verbose=0,
                 full_mode=True,
                 feature_type='seg',
                 remove_stopword=True,
                 replace_number=True,
                 lowercase=True,
                 zhs2zht=True,
                 remove_url=True,
                 sentence_padding_length=7,
                 padding_mode='center',
                 add_unkown_word=True,
                 to_onehot_array=False,
                 word2vec_to_solve_oov=False,
                 **kwargs
                 ):
        """
            Onehot特征编码器,将句子转成 onehot 编码(以字典索引形式表示,补齐)
            1. 初始化参数
            2. build feature encoder

            :param need_segmented: 数据处理选项,是否需要经过分词处理;如果为False,那么输入的数据不需要分词,提供的数据的每个句子的每个词要以空格分割.比如: ['我 要 买 手机','你好','早上 好'];如果为True,提供原始输入句子即可,比如:['我要买手机','你好','早上好'].
            :type need_segmented: bool
            :param verbose: 数值越大,输出越详细
            :type verbose: int
            :param full_mode: jieba分词选项,是否使用 full mode,默认为True
            :type full_mode: bool
            :param feature_type: 模型设置选项,选择不同粒度的特征单位, 目前只支持 word,seg和 word_seg。
                - word:直接以字为单位,比如 我要买手机--->我 要 买 手 机
                - seg:分词后的词单元为单位,比如 我要买手机--->我 要 买 手机
                - word_seg:分词后的字和词为单位,去重,比如 我要买手机--->我 要 买 手机 手 机
                - word_seg_concat:分词后的字和词为单位,不去重,比如 我要买手机--->我 要 买 手 机 我 要 买 手机
            :type feature_type: str
            :param remove_stopword: jieba分词选项,是否去除 stop word,默认为True
            :type remove_stopword: bool
            :param replace_number: jieba分词选项,是否将数据统一替换成NUM,默认为True
            :type replace_number: bool
            :param lowercase: jieba分词选项,是否将字母统一转为小写,默认为True
            :type lowercase: bool
            :param zhs2zht: jieba分词选项,出現繁体的時候,是否转简体,默认为True
            :type zhs2zht: bool
            :param remove_url: jieba分词选项,是否移除 微博url,http://t.cn/开头的地址,默认为True
            :type remove_url: bool
            :param add_unkown_word: 训练库字典的设置选项,是否在字典中增加一个未知词字符(UNKOWN)
            :type add_unkown_word: bool
            :param sentence_padding_length:  句子的补齐(截断)长度,默认为7
            :type sentence_padding_length: int
            :param padding_mode:  句子的补齐(截断)模式,有四种模式:
                                        1. center:如果小于sentence_padding_length的话往两边补0;如果超出sentence_padding_length的话,直接在后面截断。
                                        2. left:如果小于sentence_padding_length的话往左边补0;如果超出sentence_padding_length的话,直接在后面截断。
                                        3. right:如果小于sentence_padding_length的话往右边补0;如果超出sentence_padding_length的话,直接在后面截断。
                                        4. none:不补齐。
            :type padding_mode: str
            :param to_onehot_array: 输出 onehot array,还是字典索引 array,默认为False,输出字典索引,
            :type to_onehot_array: bool
            :param word2vec_to_solve_oov: 使用word2vec扩展oov词
            :type word2vec_to_solve_oov: bool
            :param kwargs:
                - word2vec_model_file_path:
                - vocabulary_including_test_set: (default,True)
                - update_dictionary: (default,True)
                - 等

        """
        self.full_mode = full_mode
        self.feature_type = feature_type
        self.remove_stopword = remove_stopword
        self.verbose = verbose
        self.need_segmented = need_segmented
        self.replace_number = replace_number
        self.lowercase = lowercase
        self.zhs2zht = zhs2zht
        self.remove_url = remove_url
        self.add_unkown_word = add_unkown_word
        self.sentence_padding_length = sentence_padding_length
        self.padding_mode = padding_mode
        self.to_onehot_array = to_onehot_array
        self.word2vec_to_solve_oov = word2vec_to_solve_oov
        self.kwargs = kwargs

        # 检验参数合法性
        assert self.padding_mode in ['center', 'left', 'right', 'none'], 'padding mode 只能取: center,left,right,none'
        assert self.feature_type in ['word', 'seg', 'word_seg',
                                     'word_seg_concat'], 'feature type 只能取: word,seg,word_seg和word_seg_concat'

        # 初始化jieba分词器
        if need_segmented:
            self.jieba_seg = Jieba_Util(verbose=self.verbose)
        # 训练库提取出来的字典对象
        self.train_data_dict = None
        # 训练库提取出来的字典词汇列表
        self.vocabulary = None
        # 训练库提取出来的字典词汇个数
        self.vocabulary_size = None
        # UNKOWN字符的索引
        self.unknow_token_index = None
        # PADDING字符的索引
        self.padding_token_index = None

        # region NOTE: 这些变量不再维护,因为消耗内存
        # 原始训练数据
        # self.train_data = None
        # 切完词的句子
        # self.segmented_sentences = None
        # 训练库句子的字典索引形式
        # self.train_index = None
        # 训练库句子的补齐的字典索引形式
        # self.train_padding_index = None
        # 训练库句子装成onehot array
        # endregion
        self.train_onehot_array = None
        # word2vec 模型
        self.word2vec_model = None
        if word2vec_to_solve_oov:
            assert kwargs.has_key('word2vec_model_file_path'), '请提供 属性 word2vec_model_file_path'
            # 加载word2vec模型
            w2v_util = Word2vecUtil()
            self.word2vec_model = w2v_util.load(kwargs.get('word2vec_model_file_path'))

        if verbose > 1:
            logging.debug('build feature encoder...')
            print('build feature encoder...')
Exemplo n.º 20
0
 def __init__(self):
     jutil = Jieba_Util(verbose=0)
     self.remove_sentence_punctuation = lambda x: jutil.seg(
         x, sep='', remove_url=False)
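
A small illustration of the lambda above: segmenting with an empty separator re-joins the tokens, which drops punctuation and whitespace. The exact output depends on Jieba_Util's internal stopword/punctuation handling, which is not shown here.

jutil = Jieba_Util(verbose=0)
remove_sentence_punctuation = lambda x: jutil.seg(x, sep='', remove_url=False)
print(remove_sentence_punctuation(u'你好 , 世界 !'))   # expected: punctuation and spaces removed, e.g. 你好世界
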
Exemplo n.º 21
0
class DataUtil(object):
    '''
        微博立场分析数据处理工具类,包含以下函数:
            1. load_data:加载csv格式的数据
            2. save_data:保存csv格式的数据
            3. print_data_detail: 打印数据详情
            4. processing_na_value:处理空值数据
            5. segment_sentence:分词
            6. split_train_test:切分训练集和测试集
            7.
    '''
    def __init__(self):
        # 初始化jieba工具
        self.jieba_util = Jieba_Util()

    def load_data(self, path, header=True):
        '''
            读取数据
        :param path: 数据文件的路径
        :return:
        '''
        if header:
            data = pd.read_csv(
                path,
                sep='\t',
                header=0,
                encoding='utf8',
            )
        else:
            data = pd.read_csv(
                path,
                sep='\t',
                header=None,
                encoding='utf8',
            )
        return data

    def load_train_test_data(self, config=None):
        '''
            加载训练数据和测试数据,以及标签索引

        :param config:  一些配置信息
        :param config: dict
        :return:
        '''

        # -------------- region start : 1. 加载训练集和测试集 -------------
        if config['verbose'] > 2:
            logging.debug('-' * 20)
            print '-' * 20
            logging.debug('1. 加载训练集和测试集')
            print '1. 加载训练集和测试集'
        # -------------- code start : 开始 -------------
        train_data_file_path = (
            config['train_data_file_path']) % config['train_data_type']
        test_data_file_path = (
            config['test_data_file_path']) % config['test_data_type']
        logging.debug(train_data_file_path)
        print train_data_file_path
        logging.debug(test_data_file_path)
        print test_data_file_path

        data_util = DataUtil()
        train_data = data_util.load_data(train_data_file_path)
        test_data = data_util.load_data(test_data_file_path)

        # -------------- code start : 结束 -------------
        if config['verbose'] > 2:
            logging.debug('-' * 20)
            print '-' * 20
        # -------------- region end : 1. 加载训练集和测试集 ---------------

        # 生成类别索引
        label_to_index = {u'FAVOR': 0, u'AGAINST': 1, u'NONE': 2}
        index_to_label = [u'FAVOR', u'AGAINST', u'NONE']

        return train_data, test_data, label_to_index, index_to_label

    def save_data(self, data, path):
        '''
            保存数据
        :param path: 数据文件的路径
        :return:
        '''
        data.to_csv(
            path,
            sep='\t',
            header=True,
            index=False,
            encoding='utf8',
        )

    def print_data_detail(self, data, has_stance=True):
        '''
            展示数据的详细信息
        :param data: DataFrame对象
        :param has_stance: 是否有STANCE字段
        :return: 无
        '''

        logging.debug('data的个数为:%d' % (len(data)))
        logging.debug('data的sample数据:')
        logging.debug(data.head())

        logging.debug('data的target和个数分别为:')
        logging.debug(data['TARGET'].value_counts())
        if has_stance:
            logging.debug('统计每个Target下各个类型立场的数量...')
            group = data.groupby(by=['TARGET', 'STANCE'])
            logging.debug(group.count())
        else:
            logging.debug('没有STANCE字段')

        logging.debug('数据各个字段情况...')
        # print data.info()
        for column in data.columns:
            # 统计每个字段是否有数据是空串
            # 先将所有空字符串用nan替换
            data[column] = data[column].replace(r'^\s*$', np.nan, regex=True)
            count_null = sum(data[column].isnull())
            if count_null != 0:
                logging.warn(
                    u'%s字段有空值,个数:%d,建议使用processing_na_value()方法进一步处理!' %
                    (column, count_null))
                null_data_path = './result/null_data.csv'
                logging.warn(u'将缺失值数据输出到文件:%s' % (null_data_path))
                data[data[column].isnull()].to_csv(null_data_path,
                                                   index=None,
                                                   encoding='utf8',
                                                   sep='\t')

    def processing_na_value(self,
                            data,
                            clear_na=True,
                            fill_na=False,
                            fill_char='NULL',
                            columns=None):
        '''
            处理数据的空值

        :param data:  DataFrame对象
        :param clear_na: bool,是否去掉空值数据
        :param fill_na: bool,是否填充空值
        :param fill_char: str,填充空值的字符
        :param columns: list,需要处理的字段,默认为None时,对所有字段处理
        :return: DataFrame对象
        '''
        logging.debug('[def processing_na_value()] 对缺失值进行处理....')
        for column in data.columns:
            if columns == None or column in columns:
                data[column] = data[column].replace(r'^\s*$',
                                                    np.nan,
                                                    regex=True)
                count_null = sum(data[column].isnull())
                if count_null != 0:
                    logging.warn(u'%s字段有空值,个数:%d' % (column, count_null))
                    if clear_na:
                        logging.warn(u'对数据的%s字段空值进行摘除' % (column))
                        data = data[data[column].notnull()].copy()
                    else:
                        if fill_na:
                            logging.warn(u'对数据的%s字段空值进行填充,填充字符为:%s' %
                                         (column, fill_char))
                            data[column] = data[column].fillna(value=fill_char)

        return data

    def segment_sentence(self, sentence):
        segmented_sentence = self.jieba_util.seg(
            sentence=sentence,
            sep=' ',
            full_mode=True,
            remove_stopword=True,
            replace_number=True,
            lowercase=True,
            zhs2zht=True,
            remove_url=True,
        )
        return segmented_sentence

    def split_train_test(self, data, train_split=0.7):
        '''
            将数据切分成训练集和验证集

        :param data:
        :param train_split: float,取值范围[0,1],设置训练集的比例
        :return: dev_data,test_data
        '''
        logging.debug('对数据随机切分成train和test数据集,比例为:%f' % (train_split))
        num_train = len(data)
        num_dev = int(num_train * train_split)
        num_test = num_train - num_dev
        logging.debug('全部数据、训练数据和测试数据的个数分别为:%d,%d,%d' %
                      (num_train, num_dev, num_test))
        rand_list = np.random.RandomState(0).permutation(num_train)
        # print rand_list
        # print rand_list[:num_dev]
        # print rand_list[num_dev:]
        dev_data = data.iloc[rand_list[:num_dev]].sort_index()
        test_data = data.iloc[rand_list[num_dev:]].sort_index()
        # print dev_data
        # print test_data
        return dev_data, test_data

    def count_word_freq(self, data):
        '''
            统计每个词在各个类别中的次数,每个词有五个统计项:
                1. FAVOR: 在FAVOR类别中出现的次数
                2. AGAINST: 在AGAINST类别中出现的次数
                3. NONE: 在NONE类别中出现的次数
                4. FREQ: 在所有类别中出现的次数,即FAVOR+AGAINST+NONE
                5. SUPPORT: 最高词频项/FREQ

        :param data:
        :return:
        '''
        from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder

        feature_encoder = FeatureEncoder(
            train_data=data['WORDS'].as_matrix(),
            verbose=0,
            padding_mode='none',
            need_segmented=False,
            full_mode=True,
            remove_stopword=True,
            replace_number=True,
            lowercase=True,
            remove_url=True,
            sentence_padding_length=7,
            add_unkown_word=False,
            mask_zero=False,
            zhs2zht=True,
        )

        # print feature_encoder.train_padding_index
        train_X_features = feature_encoder.to_onehot_array()

        np.save('result/train_X_feature', train_X_features)

        print train_X_features.shape
        print train_X_features[:5]
        vocabulary = feature_encoder.vocabulary
        print ','.join(vocabulary)
        print feature_encoder.vocabulary_size
        np.save('result/vocabulary', vocabulary)

        freq = np.sum(train_X_features, axis=0)
        favor_freq = np.sum(
            train_X_features[data['STANCE'].as_matrix() == u'FAVOR'], axis=0)
        against_freq = np.sum(
            train_X_features[data['STANCE'].as_matrix() == u'AGAINST'], axis=0)
        none_freq = np.sum(
            train_X_features[data['STANCE'].as_matrix() == u'NONE'], axis=0)

        support = np.nan_to_num([
            max(favor, against, none) / (1.0 * (favor + against + none)) for
            favor, against, none in zip(favor_freq, against_freq, none_freq)
        ])
        print freq
        print favor_freq
        print against_freq
        print none_freq
        count_data = pd.DataFrame(
            data={
                u'WORD': vocabulary,
                u'FAVOR': favor_freq,
                u'AGAINST': against_freq,
                u'NONE': none_freq,
                u'SUPPORT': support,
                u'FREQ': freq,
            })
        count_data = count_data.sort_values(by=[u'SUPPORT', u'FREQ', 'WORD'],
                                            ascending=False)
        count_data = count_data[[
            u'WORD', u'FAVOR', u'AGAINST', u'NONE', u'FREQ', u'SUPPORT'
        ]]
        count_data.to_csv(
            'result/word_count.csv',
            sep='\t',
            index=False,
            header=True,
            encoding='utf8',
        )
        print count_data.head()
Exemplo n.º 22
0
class FeatureEncoder(object):
    '''
        Onehot特征编码器,将句子转成onehot编码(以字典索引形式表示,补齐),包含以下函数:
            1. segment_sentence:对句子分词
            2. build_dictionary:构建字典
            3. sentence_to_index:将原始字符串句子转为字典索引列表
            4. sentence_padding:将句子补齐
            5. fit_transform:构建编码器并转换数据
            6. transform_sentence:对句子编码
            7. get_sentence_length:对句子长度计算
            8. print_sentence_length_detail: 打印训练库句子详情.
            9. print_model_descibe: 打印模型的详情.
            10. sentence_index_to_bow: 将索引转为onehot数据
            11. to_onehot_array: 生成训练库句子的onehot编码
            12. reset: clear 数据

        注意:
            1. 训练库中所有词,包括未知词字符(UNKOWN),的字典索引都是从1开始分配的,索引0是作为填充字符所用。
            2. 训练库字典大小 (vocabulary_size)是计入索引0的,计算训练库中所有词和填充字符(PADDING)未知词字符(UNKOWN),如果不使用可以关闭。
    '''
    def __init__(self,
                 need_segmented=True,
                 verbose=0,
                 full_mode=True,
                 feature_type='seg',
                 remove_stopword=True,
                 replace_number=True,
                 lowercase=True,
                 zhs2zht=True,
                 remove_url=True,
                 sentence_padding_length=7,
                 padding_mode='center',
                 add_unkown_word=True,
                 to_onehot_array=False,
                 word2vec_to_solve_oov=False,
                 **kwargs):
        """
            Onehot特征编码器,将句子转成 onehot 编码(以字典索引形式表示,补齐)
            1. 初始化参数
            2. build feature encoder

            :param need_segmented: 数据处理选项,是否需要经过分词处理;如果为False,那么输入的数据不需要分词,提供的数据的每个句子的每个词要以空格分割.比如: ['我 要 买 手机','你好','早上 好'];如果为True,提供原始输入句子即可,比如:['我要买手机','你好','早上好'].
            :type need_segmented: bool
            :param verbose: 数值越大,输出越详细
            :type verbose: int
            :param full_mode: jieba分词选项,是否使用 full mode,默认为True
            :type full_mode: bool
            :param feature_type: 模型设置选项,选择不同粒度的特征单位, 目前只支持 word,seg和 word_seg。
                - word:直接以字为单位,比如 我要买手机--->我 要 买 手 机
                - seg:分词后的词单元为单位,比如 我要买手机--->我 要 买 手机
                - word_seg:分词后的字和词为单位,去重,比如 我要买手机--->我 要 买 手机 手 机
                - word_seg_concat:分词后的字和词为单位,不去重,比如 我要买手机--->我 要 买 手 机 我 要 买 手机
            :type feature_type: str
            :param remove_stopword: jieba分词选项,是否去除 stop word,默认为True
            :type remove_stopword: bool
            :param replace_number: jieba分词选项,是否将数字统一替换成NUM,默认为True
            :type replace_number: bool
            :param lowercase: jieba分词选项,是否将英文字母统一转为小写,默认为True
            :type lowercase: bool
            :param zhs2zht: jieba分词选项,出現繁体的時候,是否转简体,默认为True
            :type zhs2zht: bool
            :param remove_url: jieba分词选项,是否移除 微博url,http://t.cn/开头的地址,默认为True
            :type remove_url: bool
            :param add_unkown_word: 训练库字典的设置选项,是否在字典中增加一个未知词字符(UNKOWN)
            :type add_unkown_word: bool
            :param sentence_padding_length:  句子的补齐(截断)长度,默认为7
            :type sentence_padding_length: int
            :param padding_mode:  句子的补齐(截断)模式,有四种模式:
                                        1. center:如果小于sentence_padding_length的话往两边补0;如果超出sentence_padding_length的话,直接在后面截断。
                                        2. left:如果小于sentence_padding_length的话往左边补0;如果超出sentence_padding_length的话,直接在后面截断。
                                        3. right:如果小于sentence_padding_length的话往右边补0;如果超出sentence_padding_length的话,直接在后面截断。
                                        4. none:不补齐。
            :type padding_mode: str
            :param to_onehot_array: 输出 onehot array,还是字典索引 array,默认为False,输出字典索引,
            :type to_onehot_array: bool
            :param word2vec_to_solve_oov: 使用word2vec扩展oov词
            :type word2vec_to_solve_oov: bool
            :param kwargs:
                - word2vec_model_file_path:
                - vocabulary_including_test_set: (default,True)
                - update_dictionary: (default,True)
                - 等

        """
        self.full_mode = full_mode
        self.feature_type = feature_type
        self.remove_stopword = remove_stopword
        self.verbose = verbose
        self.need_segmented = need_segmented
        self.replace_number = replace_number
        self.lowercase = lowercase
        self.zhs2zht = zhs2zht
        self.remove_url = remove_url
        self.add_unkown_word = add_unkown_word
        self.sentence_padding_length = sentence_padding_length
        self.padding_mode = padding_mode
        self.to_onehot_array = to_onehot_array
        self.word2vec_to_solve_oov = word2vec_to_solve_oov
        self.kwargs = kwargs

        # 检验参数合法性
        assert self.padding_mode in [
            'center', 'left', 'right', 'none'
        ], 'padding mode 只能取: center,left,right,none'
        assert self.feature_type in [
            'word', 'seg', 'word_seg', 'word_seg_concat'
        ], 'feature type 只能取: word,seg,word_seg和word_seg_concat'

        # 初始化jieba分词器
        if need_segmented:
            self.jieba_seg = Jieba_Util(verbose=self.verbose)
        # 训练库提取出来的字典对象
        self.train_data_dict = None
        # 训练库提取出来的字典词汇列表
        self.vocabulary = None
        # 训练库提取出来的字典词汇个数
        self.vocabulary_size = None
        # UNKOWN字符的索引
        self.unknow_token_index = None
        # PADDING字符的索引
        self.padding_token_index = None

        # region NOTE: 这些变量不再维护,因为消耗内存
        # 原始训练数据
        # self.train_data = None
        # 切完词的句子
        # self.segmented_sentences = None
        # 训练库句子的字典索引形式
        # self.train_index = None
        # 训练库句子的补齐的字典索引形式
        # self.train_padding_index = None
        # endregion
        # 训练库句子转成的 onehot array
        self.train_onehot_array = None
        # word2vec 模型
        self.word2vec_model = None
        if word2vec_to_solve_oov:
            assert 'word2vec_model_file_path' in kwargs, \
                '请提供 属性 word2vec_model_file_path'
            # 加载word2vec模型
            w2v_util = Word2vecUtil()
            self.word2vec_model = w2v_util.load(
                kwargs.get('word2vec_model_file_path'))

        if verbose > 1:
            logging.debug('build feature encoder...')
            print('build feature encoder...')
            # self.fit_transform()

    def segment_sentence(self, sentence):
        '''
        对句子进行分词,使用jieba分词

        :param sentence: 句子
        :type sentence: str
        :return: 分完词句子,以空格连接
        :rtype: str
        '''

        if self.feature_type == 'seg':
            segmented_sentence = self.jieba_seg.seg(
                sentence,
                sep=' ',
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )
        elif self.feature_type == 'word':
            # 将句子切分为 以字为单元 以空格分割
            # iter_each_word 内部会先用jieba做预处理(数字替换等),再逐字切分
            segmented_sentence = self.jieba_seg.iter_each_word(
                sentence,
                sep=' ',
                need_segmented=True,
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )
        elif self.feature_type == 'word_seg':
            # 将句子切分为 以字和词为单元,相同则去重 以空格分割
            # 1. 先使用jieba进行预处理,将数字替换等
            segmented_sentence = self.jieba_seg.seg(
                sentence,
                sep=' ',
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )

            # print(segmented_sentence)
            # 2. 按字切分
            word = self.jieba_seg.iter_each_word(segmented_sentence,
                                                 sep=' ',
                                                 need_segmented=False).split()
            # 3. 按词切分
            seg = segmented_sentence.split()
            segmented_sentence = ' '.join(set(seg + word))
        elif self.feature_type == 'word_seg_concat':
            # 先字后词拼接,不去重
            # 1. 先使用jieba进行预处理,将数字替换等
            segmented_sentence = self.jieba_seg.seg(
                sentence,
                sep=' ',
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )

            # print(segmented_sentence)
            # 2. 按字切分
            word = self.jieba_seg.iter_each_word(segmented_sentence,
                                                 sep=' ',
                                                 need_segmented=False).split()
            # 3. 按词切分
            seg = segmented_sentence.split()
            segmented_sentence = ' '.join(word + seg)
        else:
            assert False, '不支持其他粒度的切分!'

        return segmented_sentence
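
    # Illustrative note (not part of the original source): reusing the example from
    # the __init__ docstring, the four feature_type settings split u'我要买手机'
    # roughly as follows (actual output depends on the jieba dictionary and on the
    # full_mode / remove_stopword settings):
    #   feature_type='seg'             -> u'我 要 买 手机'
    #   feature_type='word'            -> u'我 要 买 手 机'
    #   feature_type='word_seg'        -> u'我 要 买 手机 手 机'  (set去重,顺序不保证)
    #   feature_type='word_seg_concat' -> u'我 要 买 手 机 我 要 买 手机'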

    def get_sentence_length(self, sentence):
        '''
            计算句子的长度,注意,这里的长度以词为单位,即分完词后统计。
                1. 对句子分词
                2. 对句子的词计算

        :param sentence: 句子
        :type sentence: str
        :return: 句子长度
        :rtype: int
        '''

        # 1. 分词
        segmented_sentence = self.segment_sentence(sentence)
        # 2. 统计
        sentence_length = len(segmented_sentence.split())

        return sentence_length

    def print_sentence_length_detail(
        self,
        data=None,
        lengths=[7, 10, 15, 20],
    ):
        '''
            打印训练库中句子的长度情况

        :param data: 句子列表;need_segmented=True 时传入原始句子,否则传入已分词(空格分割)的句子
        :type data: array-like
        :param lengths: 长度界限列表
        :type lengths: list
        :return: 句子长度列表
        :rtype: list
        '''
        if self.need_segmented:
            sentence_length = map(self.get_sentence_length, data)
        else:
            sentence_length = map(lambda x: len(x.split()), data)
        for l in lengths:
            le_this_len = sum(np.asarray(sentence_length) <= l) / (
                1.0 * len(sentence_length))
            print('句子长度小于等于%d的比例为:%f' % (l, le_this_len))

        print('句子长度情况为:%s' % (str(sentence_length)))
        print('句子最长长度为:%d' % (max(sentence_length)))
        print('句子最短长度为:%d' % (min(sentence_length)))
        print('句子平均长度为:%d' % (np.average(sentence_length)))
        return sentence_length
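
    # Illustrative note (not part of the original source): a typical call, assuming
    # `encoder` was built with need_segmented=True and `data` is a list of raw
    # sentences:
    #   >>> encoder.print_sentence_length_detail(data=[u'我要买手机', u'你好'], lengths=[1, 3, 5])
    # This prints the fraction of sentences whose segmented length is <= 1, 3 and 5,
    # followed by the full length list and its max / min / average.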

    def get_unkown_vector(self, ndim=50):
        rand = np.random.RandomState(1337)
        return rand.uniform(-0.25, 0.25, ndim)

    def get_w2vEmbedding(self, word):
        """
            返回词向量

        Returns
        -------
        (array,str)
        """

        try:
            if word == u'PADDING':
                vector = np.zeros(self.word2vec_model.vector_size)
                flag = 'PADDING'
            elif word == u'UNKOWN':
                # UNKOWN 占位符在word2vec模型中没有对应向量,返回固定随机种子生成的随机向量
                vector = self.get_unkown_vector(
                    self.word2vec_model.vector_size)
                flag = 'NO_IN_MODEL_VOCAB'
            else:
                vector = self.word2vec_model[word]
                flag = 'OK'
        except KeyError:
            vector = self.get_unkown_vector(self.word2vec_model.vector_size)
            if self.verbose > 1:
                print('OOV: %s' % word)
            flag = 'NO_IN_W2V'
        return np.asarray(vector), flag

    def to_embedding_weight(self, path):
        """
            使用训练好的 word2vec 模型 将字典中每个词转为 word2vec向量,接着生成一个 Embedding层的初始权重形式,可用于初始化 Embedding 层的权重。
                1. 加载word2vec模型(若尚未加载)
                2. 遍历训练库字典中的每个词,查询其word2vec向量,拼成Embedding权重矩阵

        :param path: word2vec 模型文件路径
        :type path: str
        :return:
        """

        if self.word2vec_model is None:
            w2v_util = Word2vecUtil()
            self.word2vec_model = w2v_util.load(path)
        size = self.vocabulary_size

        embedding_weights = np.zeros((size, self.word2vec_model.vector_size))
        words_count_no_in_w2v = 0
        words_count_no_in_vacab = 0
        words_count_in = 0
        words_count_paddding = 0
        for key, value in self.train_data_dict.token2id.items():
            vector, flag = self.get_w2vEmbedding(key)
            embedding_weights[value, :] = vector
            if flag == 'NO_IN_W2V':
                words_count_no_in_w2v += 1
            if flag == 'NO_IN_MODEL_VOCAB':
                words_count_no_in_vacab += 1
            if flag == 'OK':
                words_count_in += 1
                # print(key)
            if flag == 'PADDING':
                words_count_paddding += 1
        if self.verbose > 0:
            print('没有出现在w2v模型中的词有:%d个' % (words_count_no_in_w2v))
            print('没有出现在模型vocab中的词有:%d个' % (words_count_no_in_vacab))
            print('出现在w2v模型中的词有:%d个' % (words_count_in))

        # self.embedding_weights = embedding_weights

        return embedding_weights
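
    # Illustrative note (not part of the original source): the returned matrix has
    # shape (vocabulary_size, vector_size) and is indexed by the same ids produced
    # by sentence_to_index(), so it can seed an embedding lookup table. A
    # hypothetical Keras sketch (Keras and the model path are assumptions, not
    # something this class depends on):
    #   weights = encoder.to_embedding_weight('path/to/word2vec.model')
    #   Embedding(input_dim=encoder.vocabulary_size,
    #             output_dim=weights.shape[1],
    #             weights=[weights],
    #             input_length=encoder.sentence_padding_length)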

    def build_dictionary(self, train_X=None, test_X=None):
        """
            1.对数据进行分词
            2.构建训练库字典,插入 一个特殊字符 'UNKOWN'表示未知词

        Parameters
        ----------
        train_X : array-like
        test_X : array-like

        Returns
        --------
        object:
            self
        """

        # region -------------- 1.将训练集和测试集合并 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('1.将训练集和测试集合并')
            print('1.将训练集和测试集合并')
        if self.kwargs.get('vocabulary_including_test_set', True):
            X = np.concatenate((train_X, test_X), axis=0)
        else:
            X = train_X

        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- 1.将训练集和测试集合并 ---------------

        # region -------------- 2.对数据进行分词 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('对数据进行分词')
            print('对数据进行分词')
        # -------------- code start : 开始 -------------
        if self.need_segmented:
            segmented_sentences = map(self.segment_sentence, X)
        else:
            segmented_sentences = X
        # -------------- code start : 结束 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- 2.对数据进行分词 ---------------

        # region -------------- 3. 将句子补齐到等长 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('3. 将句子补齐到等长')
            print('3. 将句子补齐到等长')
        # -------------- code start : 开始 -------------

        # 将不等长的句子都对齐,超出padding_length长度的句子截断,小于的则补 PADDING
        padded_sentences = np.asarray(
            map(self.sentence_padding, segmented_sentences))

        # endregion -------------- 3. 将句子补齐到等长 -------------

        # region -------------- region start : 4.构建训练库字典 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('4.构建训练库字典')
            print('4.构建训练库字典')
        # -------------- code start : 开始 -------------
        logging.debug('=' * 20)
        logging.debug('首先,构建训练库字典,然后将每个词映射到一个索引,再将所有句子映射成索引的列表')

        # 将训练库所有句子切分成列表,构成 2D的训练文档,每个单元是一个token,
        # 比如: [['今年','你','多少岁'],['你', '二十四','小时','在线','吗'],...]
        # 将分完词句子转成合适的数据格式
        train_document = map(lambda x: x.split(), padded_sentences)
        # 获取训练库字典
        if self.padding_mode != 'none':
            # 为了确保padding的索引是0,所以在最前面加入 PADDING
            train_document.insert(0, [u'PADDING'])
        self.train_data_dict = Dictionary.from_documents(train_document)

        # 更新字典,在字典中添加特殊符号,其中
        # UNKOWN表示未知字符,即OOV词汇
        if self.add_unkown_word:
            self.train_data_dict.add_documents([[u'UNKOWN']])

        # 获取padding和UNKOWN 的字典索引
        self.padding_token_index = self.train_data_dict.token2id.get(
            u'PADDING', -1)
        self.unknow_token_index = self.train_data_dict.token2id.get(
            u'UNKOWN', -1)

        self.vocabulary_size = len(self.train_data_dict.keys())
        # 按索引从小到大排序
        self.vocabulary = [
            token
            for token, id in sorted(self.train_data_dict.token2id.items(),
                                    key=lambda x: x[1])
        ]
        # print(self.vocabulary_size)
        # print((self.train_data_dict.token2id.items()))
        # quit()

        # -------------- print start : just print info -------------
        if self.verbose > 1:
            logging.debug('训练库字典大小为:%d' % (len(self.train_data_dict.keys())))
            print('训练库字典大小为:%d' % (len(self.train_data_dict.keys())))
            logging.debug(u'字典有:%s' %
                          (','.join(self.train_data_dict.token2id.keys())))
            print(u'字典有:%s' % (','.join(self.train_data_dict.token2id.keys())))
        # -------------- print end : just print info -------------

        # -------------- code start : 结束 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)

        # endregion -------------- 4.构建训练库字典 ---------------

        return padded_sentences
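
    # Illustrative note (not part of the original source): because [u'PADDING'] is
    # inserted as the very first "document", gensim assigns it id 0 and every real
    # token gets an id >= 1, which is what makes index 0 safe to use as the padding
    # value. A toy token2id layout (the non-zero ids are made up for illustration):
    #   documents = [[u'PADDING'], [u'你好', u'PADDING'], [u'早上', u'好', u'PADDING']]
    #   token2id  = {u'PADDING': 0, u'你好': 1, u'早上': 2, u'好': 3, u'UNKOWN': 4}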

    def replace_oov_with_similay_word(self, word2vec_model, sentence):
        '''
            对句子中oov词使用训练库中最相近的词替换(word2vec余弦相似性)

        :param sentence:
        :return:
        '''

        # is_oov = np.asarray([item for item in self.feature_encoder.vocabulary])
        # has_oov = any(is_oov)
        sentence = sentence.split()
        oov_word = []
        replace_word = []
        for item in sentence:
            if item not in self.vocabulary:
                oov_word.append(item)
                keywords_sim_score = np.asarray([
                    self.word_similarity(word2vec_model, item, i)
                    for i in self.vocabulary
                ])
                sorted_index = np.argsort(keywords_sim_score)[-1::-1]
                most_similarity_score = keywords_sim_score[sorted_index[0]]
                most_similarity_word = self.vocabulary[sorted_index[0]]
                if self.verbose > 1:
                    print(u'%s 最相近的词是%s,分数为:%f' %
                          (item, most_similarity_word, most_similarity_score))
                replace_word.append(most_similarity_word)
        sentence += replace_word
        return ' '.join(sentence)

    def word_similarity(self, word2vec_model, word1, word2):
        '''
        计算两个词的相似性
        :param word1:
        :param word2:
        :return:
        '''
        try:
            return word2vec_model.n_similarity(word1, word2)
        except:
            return 0

    def sentence_to_index(self, sentence):
        """
            将 sentence 转换为 index,如果 token为OOV词,则分配为 UNKOWN


        Parameters
        ----------
        sentence: str
            以空格分割

        """

        if self.add_unkown_word:
            unknow_token_index = self.train_data_dict.token2id[u'UNKOWN']
        else:
            unknow_token_index = 0
        # 将训练库中所有句子的每个词映射到索引上,变成索引列表
        index = [
            self.train_data_dict.token2id.get(item, unknow_token_index)
            for item in sentence.split()
        ]
        if self.verbose > 0:
            if index.__contains__(unknow_token_index):
                print('出现字典OOV')
                print(sentence)
                print(index)
        # assert not index.__contains__(-1),u'出现OOV词'
        return index
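
    # Illustrative note (not part of the original source): assuming the toy token2id
    # layout sketched after build_dictionary() above, a padded sentence maps to
    # indices like this (OOV tokens fall back to the UNKOWN id):
    #   u'PADDING 你好 苹果' -> [0, 1, 4]   # u'苹果' is not in the dictionary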

    def sentence_padding(self, sentence):
        '''
            将不等长的句子都对齐,超出padding_length长度的句子截断,小于的则补 PADDING

        :type sentence: str
        :param sentence: 句子,词之间以 空格 分割
        :return: 返回补齐后的句子,以空格分割
        :type: str
        '''

        padding_length = self.sentence_padding_length
        # print(sentence)
        sentence = sentence.split()
        sentence_length = len(sentence)
        # print(sentence_length)
        if sentence_length > padding_length:
            # logging.debug(u'对句子进行截断:%s' % (sentence))

            sentence = sentence[:padding_length]

            # logging.debug(u'对句子进行截断后:%s' % (' '.join(seg[:padding_length])))
            # print(u'对句子进行截断后:%s' % (' '.join(seg[:padding_length])))
        elif sentence_length < padding_length:
            should_padding_length = padding_length - sentence_length
            left_padding = np.asarray(['PADDING'] *
                                      (should_padding_length / 2))
            right_padding = np.asarray(
                ['PADDING'] * (should_padding_length - len(left_padding)))
            if self.padding_mode == 'center':
                sentence = np.concatenate(
                    (left_padding, sentence, right_padding), axis=0)
            elif self.padding_mode == 'left':
                sentence = np.concatenate(
                    (left_padding, right_padding, sentence), axis=0)
            elif self.padding_mode == 'right':
                sentence = np.concatenate(
                    (sentence, left_padding, right_padding), axis=0)
            elif self.padding_mode == 'none':
                sentence = sentence
            else:
                raise NotImplementedError('不支持的 padding_mode: %s' % self.padding_mode)

        sentence = ' '.join(sentence)
        return sentence
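
    # Illustrative note (not part of the original source): with
    # sentence_padding_length=5, the three padding modes behave as follows for the
    # 3-token input u'我 要 买' (2 slots to fill, split 1 left / 1 right):
    #   center -> u'PADDING 我 要 买 PADDING'
    #   left   -> u'PADDING PADDING 我 要 买'
    #   right  -> u'我 要 买 PADDING PADDING'
    # An input longer than 5 tokens is simply truncated to its first 5 tokens.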

    def sentence_index_to_onehot(self, index):
        '''
            注意:该方法跟[sentence_index_to_bow]的区别。
            将词的索引转成 onehot 编码,比如:
                索引 1 -->[  0 , 0 , 0 , 0,  1]

        :param index: 一个句子的字典索引列表
        :type index: list
        :return: onehot 编码,shape为 (句子长度,字典长度)
        :rtype: np.array()
        '''

        onehot_array = []

        for item in index:
            temp = np.zeros(self.vocabulary_size, dtype=int)
            if item == 0:
                pass
            else:
                temp[item - 1] = 1

            onehot_array.append(temp)

        # onehot_array = np.concatenate(onehot_array,axis=1)
        onehot_array = np.asarray(onehot_array)
        return onehot_array

    def sentence_index_to_bow(self, index):
        '''
            注意:该方法跟[sentence_index_to_onehot]的区别。
            将句子的字典索引转成 词包(bow)向量编码,比如:
                [1,2]-->[ 0 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0,  0]

        :param index: 一个句子的字典索引
        :type index: list
        :return: bow 编码,长度为 字典长度
        :rtype: np.array()
        '''

        onehot_array = np.zeros(self.vocabulary_size, dtype=int)

        onehot_array[index] = 1

        return onehot_array
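
    # Illustrative note (not part of the original source): the difference between
    # the two encodings, for the padded index list [0, 2, 3] with vocabulary_size=5:
    #   sentence_index_to_onehot -> one row per token; index 0 (PADDING) stays all
    #   zero, index i > 0 sets position i-1:
    #       [[0, 0, 0, 0, 0],
    #        [0, 1, 0, 0, 0],
    #        [0, 0, 1, 0, 0]]
    #   sentence_index_to_bow -> a single vector for the whole sentence; position i
    #   is set for every index i that occurs (including 0):
    #       [1, 0, 1, 1, 0]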

    def batch_sentence_index_to_onehot_array(self, sentence_indexs):
        '''
            将所有训练库句子转成onehot编码的数组,保存在 self.onehot_array 中

        :return: onehot编码的数组
        '''

        self.onehot_array = np.asarray(
            map(self.sentence_index_to_onehot, sentence_indexs))
        return self.onehot_array

    def fit_transform(self, train_data=None, test_data=None):
        return self.fit(train_data, test_data).transform(train_data)

    def fit(self, train_X=None, test_X=None):
        """
            build feature encoder
                1. 构建训练库字典
                2. 分词,并将句子补齐到等长,补齐长度为: self.sentence_padding_length
                3. 将训练句子转成字典索引形式
                4. 将每个词的字典索引变成onehot向量


        Parameters
        ----------
        train_X: array-like
            训练句子列表:['','',...,'']
        test_X: array-like
            测试句子列表:['','',...,'']

        Returns
        -------
        object:
            self
        """

        if not self.kwargs.get('update_dictionary', True):
            # 假如不更新字典,则如果原有的字典在,就直接用原有的字典即可
            if self.vocabulary is not None:
                return self

        logging.debug('=' * 20)
        if train_X is None:
            logging.debug('没有输入训练数据!')
            assert False, '没有输入训练数据!'
        if test_X is None:
            logging.debug('构建字典需要全部数据,请输入测试数据!')
            assert False, '构建字典需要全部数据,请输入测试数据!'

        # region -------------- 1.构建训练库字典 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('1.构建训练库字典')
            print('1.构建训练库字典')
        # -------------- code start : 开始 -------------

        # 构建训练库字典
        self.build_dictionary(train_X, test_X)

        # -------------- code start : 结束 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- 1.构建训练库字典 ---------------

        return self

    def transform_sentence(self, sentence):
        """
            转换一个句子的格式。跟训练数据一样的操作,对输入句子进行 padding index 编码,将sentence转为补齐的字典索引
                1. 分词
                2. 转为字典索引列表,之后补齐,输出为补齐的字典索引列表
                3. 每个词的字典索引变成onehot向量

        :param sentence: 输入句子,不用分词,进来后会有分词处理
        :type sentence: str
        :return: 补齐的字典索引
        :rtype: array-like
        """

        assert self.train_data_dict is not None, '请先fit_transform()模型'

        # region -------------- 1. 分词 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('1. 分词')
            print('1. 分词')
        # -------------- code start : 开始 -------------

        # 分词
        if self.need_segmented:
            seg_sentence = self.segment_sentence(sentence)
        else:
            seg_sentence = sentence

        if self.word2vec_to_solve_oov:
            seg_sentence = self.replace_oov_with_similay_word(
                self.word2vec_model, seg_sentence)
        # -------------- code start : 结束 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- region end : 1. 分词 ---------------

        # region -------------- 2. 转为字典索引列表,之后补齐,输出为补齐的字典索引列表 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('2. 转为字典索引列表,之后补齐,输出为补齐的字典索引列表')
            print('2. 转为字典索引列表,之后补齐,输出为补齐的字典索引列表')
        # -------------- code start : 开始 -------------

        paded_sentence = self.sentence_padding(seg_sentence)
        sentence_index = self.sentence_to_index(paded_sentence)

        # -------------- code start : 结束 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- region end : 2. 转为字典索引列表,之后补齐,输出为补齐的字典索引列表 ---------------

        # region -------------- 3. 将每个词的字典索引变成onehot向量 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('3. 将每个词的字典索引变成onehot向量')
            print('3. 将每个词的字典索引变成onehot向量')
        # -------------- code start : 开始 -------------

        if self.to_onehot_array:
            onehot_array = self.sentence_index_to_onehot(sentence_index)
        else:
            onehot_array = sentence_index

        # -------------- code start : 结束 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- region end : 3. 将每个词的字典索引变成onehot向量 ---------------

        return onehot_array

    def transform(self, X):
        '''
            批量转换数据,跟训练数据一样的操作,对输入句子进行 padding index 编码,将sentence转为补齐的字典索引
                1. 直接调用 self.transform_sentence 进行处理

        :param X: 输入句子列表
        :type X: array-like
        :return: 补齐的字典索引
        :rtype: array-like
        '''

        index = map(lambda x: self.transform_sentence(x), X)
        # print train_index[:5]

        return np.asarray(index)

    def reset(self):
        """
        清理对象中的数据
           - self.vocabulary

        """
        self.vocabulary = None

    def print_model_descibe(self):
        '''
            打印模型参数详情

        :return: 参数设置详情
        :rtype: dict 或 {}
        '''
        import pprint
        detail = {
            'need_segmented': self.need_segmented,
            'feature_type': self.feature_type,
            'verbose': self.verbose,
            'full_mode': self.full_mode,
            'remove_stopword': self.remove_stopword,
            'replace_number': self.replace_number,
            'sentence_padding_length': self.sentence_padding_length,
            'padding_mode': self.padding_mode,
            'vocabulary_size': self.vocabulary_size,
            'padding_token_index': self.padding_token_index,
            'unknow_token_index': self.unknow_token_index,
            'add_unkown_word': self.add_unkown_word,
            'to_onehot_array': self.to_onehot_array,
        }
        pprint.pprint(detail)
        logging.debug(detail)
        return detail
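

# --- Illustrative usage sketch (not part of the original source). It assumes the
# class above plus Jieba_Util are importable; the sentences and parameter values
# are made up for demonstration.
def _demo_feature_encoder():
    train_X = [u'我要买手机', u'你好', u'早上好']
    test_X = [u'我想买个手机']
    encoder = FeatureEncoder(
        need_segmented=True,
        feature_type='seg',
        sentence_padding_length=5,
        padding_mode='center',
        add_unkown_word=True,
        to_onehot_array=False,  # return padded index arrays rather than onehot rows
    )
    # fit() builds the dictionary from train + test sentences; transform() maps each
    # sentence to a padded index array of length sentence_padding_length
    train_index = encoder.fit_transform(train_X, test_X)  # shape: (3, 5)
    test_index = encoder.transform(test_X)  # OOV tokens map to the UNKOWN index
    print(train_index)
    print(test_index)
    print(encoder.vocabulary_size)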
Exemplo n.º 23
0
    def __init__(self):
        # 初始化jieba工具
        self.jieba_util = Jieba_Util()
Exemplo n.º 24
0
    Author:  'jdwang'
    Date:    'create date: 2016-07-16'
    Email:   '*****@*****.**'
    Describe: 
"""
from __future__ import print_function


import numpy as np
import pandas as pd
import logging
import timeit

from data_processing_util.jiebanlp.jieba_util import Jieba_Util

jutil = Jieba_Util(verbose=0)
remove_sentence_punctuation = lambda x: jutil.seg(x, sep='', remove_url=False)

# 统计 进入协处理的对话段数

ch2r_dialogue_file_path = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/ood_dataset/dev_vesion/ch2r_test_dataset/start-20150613测试集/data/dialogue_usersentence_ge_1.csv'

ch2r_dialogue = pd.read_csv(
    ch2r_dialogue_file_path,
    sep='\t',
    encoding='utf8',
    header=0,
)

user_sentence = ch2r_dialogue[ch2r_dialogue['Name'] != 'Ch2R']
Exemplo n.º 25
0
# print ','.join(train_data['LABEL'].unique())
# print test_data['LABEL_INDEX'].as_matrix()
test_y = test_data['LABEL_INDEX'].as_matrix()
train_y = train_data['LABEL_INDEX'].as_matrix()
# ------------------------------------------------------------------------------
# -------------- region end : 加载训练数据和测试数据 -------------
# ------------------------------------------------------------------------------

train_data = train_data[['LABEL','SENTENCE']]
test_data = test_data[['LABEL','SENTENCE']]


logging.debug('=' * 20)
logging.debug('对数据进行分词...')
logging.debug('-' * 20)
jutil = Jieba_Util()
if config['feature_type'] == 'word':
    sentence_to_seg = lambda x: jutil.iter_each_word(
        sentence=x,
        need_segmented=True,
        sep=' ',
        full_mode=config['full_mode'],
        remove_stopword=config['remove_stopword'],
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
    )
else:
    sentence_to_seg = lambda x: jutil.seg(
        sentence=x,