Example #1
    def _load_dataset(self):
        """
        加载数据集,并构建词汇表,word embedding
        :return:
        """
        file_name = self.data_name.split('.')[0]
        # If the .pkl cache does not exist: 1. load the training set 2. build its vocabulary 3. build its word embeddings
        if not os.path.exists(os.path.join(self.data_dir, file_name + '.pkl')):
            pretrained_wordembedding = load_pretrained_wordembedding(self.word_embedding_path)  # load the pretrained word embeddings
            examples = []
            # Build the vocabulary from the training data: token -> id
            stoi = {'UNK': 0, 'PAD': 1}
            itos = {0: 'UNK', 1: 'PAD'}
            # Embedding rows aligned with token ids, e.g. -1.08563066e+00  9.97345448e-01  2.82978505e-01 -1.50629473e+00 ...
            vectors = [pretrained_wordembedding['UNK'],
                       pretrained_wordembedding['PAD']]

            raw_data = pd.read_csv(os.path.join(self.data_dir, self.data_name), header=0, names=['label', 'text'])
            for _, row in tqdm(raw_data.iterrows(), total=len(raw_data)):
                weibo_example = WeiboExample(row['text'], row['label'])
                # Word-level tokens (jieba segmentation)
                weibo_example.tokens = jieba.lcut(weibo_example.text)
                # Character-level tokens (alternative):
                # weibo_example.tokens = list(weibo_example.text)
                for token in weibo_example.tokens:
                    if token in pretrained_wordembedding:  # token covered by the pretrained embeddings
                        if token not in stoi:
                            # Assign the next free id before inserting,
                            # keeping stoi and itos aligned
                            new_id = len(stoi)
                            stoi[token] = new_id
                            itos[new_id] = token
                            vectors.append(pretrained_wordembedding[token])
                        weibo_example.tokens_ids.append(stoi[token])
                    else:  # out-of-vocabulary token: map to UNK
                        weibo_example.tokens_ids.append(stoi['UNK'])
                examples.append(weibo_example)

            word_embedding = WordEmbedding(stoi, itos)
            word_embedding.vectors = np.array(vectors)

            # Cache as .pkl files for fast reloading
            with open(os.path.join(self.data_dir, file_name + '.pkl'), 'wb') as f:
                pickle.dump(examples, f)
            with open(os.path.join(self.data_dir, 'word_embedding.pkl'), 'wb') as f:
                pickle.dump(word_embedding, f)
        else:
            with open(os.path.join(self.data_dir, file_name + '.pkl'), 'rb') as f:
                examples = pickle.load(f)
            with open(os.path.join(self.data_dir, 'word_embedding.pkl'), 'rb') as f:
                word_embedding = pickle.load(f)

        return examples, word_embedding
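
The helper names used above (`WeiboExample`, `WordEmbedding`, `load_pretrained_wordembedding`) are not defined in this snippet. A minimal sketch of plausible implementations, assuming the pretrained file is a word2vec-style text file and that `UNK`/`PAD` rows are synthesized when absent:

import numpy as np

class WeiboExample:
    """One labeled text sample; tokens/tokens_ids are filled in by _load_dataset."""
    def __init__(self, text, label):
        self.text = text
        self.label = label
        self.tokens = []
        self.tokens_ids = []

class WordEmbedding:
    """Vocabulary (stoi/itos) plus the id-aligned embedding matrix (vectors)."""
    def __init__(self, stoi, itos):
        self.stoi = stoi
        self.itos = itos
        self.vectors = None  # later set to an np.ndarray of shape (len(stoi), dim)

def load_pretrained_wordembedding(path, dim=300):
    """Read a word2vec-style text file into a {token: np.ndarray(dim,)} dict."""
    embedding = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if len(parts) != dim + 1:  # skip the optional "<count> <dim>" header
                continue
            embedding[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    # Synthesize the special rows if the file does not provide them
    embedding.setdefault('UNK', np.random.uniform(-0.1, 0.1, dim).astype(np.float32))
    embedding.setdefault('PAD', np.zeros(dim, dtype=np.float32))
    return embedding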
Example #2
    def _load_dataset(self, file_name):
        """
        加载数据集,并构建词汇表,word embedding
        :return:
        """

        # Build the word embeddings on the training set only
        if 'train' in file_name:
            # If the .pkl cache does not exist: 1. load the training set 2. build its vocabulary 3. build its word embeddings
            if not os.path.exists(
                    os.path.join(self.data_dir,
                                 file_name.rsplit('.', 1)[0] + '_char.pkl')):
                examples = []
                # Load the pretrained word embeddings
                pretrained_wordembedding = load_pretrained_wordembedding(
                    self.word_embedding_path)
                stoi = {'UNK': 0, 'PAD': 1}
                itos = {0: 'UNK', 1: 'PAD'}
                # Embedding rows aligned with token ids, e.g. -1.08563066e+00  9.97345448e-01 ...
                vectors = [pretrained_wordembedding['UNK'],
                           pretrained_wordembedding['PAD']]
                with open(os.path.join(self.data_dir, file_name), 'r') as f:
                    for line in tqdm(f):
                        # Split on the first tab only and drop the trailing newline
                        label, text = line.rstrip('\n').split('\t', 1)
                        if len(text) > self.max_sent_len:
                            text = text[:self.max_sent_len]
                        cnews_example = CnewsExample(text, label)
                        # Character-level tokens
                        cnews_example.tokens = list(cnews_example.text)
                        # Word-level tokens (alternative):
                        # cnews_example.tokens = [*jieba.lcut(cnews_example.text)]
                        for token in cnews_example.tokens:
                            if token in pretrained_wordembedding:
                                if token not in stoi:
                                    # Assign the next free id before inserting,
                                    # keeping stoi and itos aligned
                                    new_id = len(stoi)
                                    stoi[token] = new_id
                                    itos[new_id] = token
                                    vectors.append(
                                        pretrained_wordembedding[token])
                                cnews_example.tokens_ids.append(stoi[token])
                            else:
                                cnews_example.tokens_ids.append(stoi['UNK'])
                        examples.append(cnews_example)
                word_embedding = WordEmbedding(stoi, itos)
                word_embedding.vectors = np.array(vectors)

                # Cache as .pkl files for fast reloading
                with open(
                        os.path.join(self.data_dir,
                                     file_name.rsplit('.', 1)[0] +
                                     '_char.pkl'), 'wb') as f:
                    pickle.dump(examples, f)
                with open(os.path.join(self.data_dir, 'word_embedding.pkl'),
                          'wb') as f:
                    pickle.dump(word_embedding, f)
            else:
                with open(
                        os.path.join(self.data_dir,
                                     file_name.rsplit('.', 1)[0] +
                                     '_char.pkl'), 'rb') as f:
                    examples = pickle.load(f)
                with open(os.path.join(self.data_dir, 'word_embedding.pkl'),
                          'rb') as f:
                    word_embedding = pickle.load(f)

            return examples, word_embedding
        else:
            if not os.path.exists(
                    os.path.join(self.data_dir,
                                 file_name.rsplit('.', 1)[0] + '_char.pkl')):
                examples = []
                # Load the word embeddings built on the training set
                with open(os.path.join(self.data_dir, 'word_embedding.pkl'),
                          'rb') as f:
                    word_embedding = pickle.load(f)
                with open(os.path.join(self.data_dir, file_name), 'r') as f:
                    for line in tqdm(f):
                        label, text = line.rstrip('\n').split('\t', 1)
                        if len(text) > self.max_sent_len:
                            text = text[:self.max_sent_len]
                        cnews_example = CnewsExample(text, label)
                        # Character-level tokens
                        cnews_example.tokens = list(cnews_example.text)
                        # Word-level tokens (alternative):
                        # cnews_example.tokens = [*jieba.lcut(cnews_example.text)]
                        for token in cnews_example.tokens:
                            if token in word_embedding.stoi:
                                cnews_example.tokens_ids.append(
                                    word_embedding.stoi[token])
                            else:
                                cnews_example.tokens_ids.append(
                                    word_embedding.stoi['UNK'])
                        examples.append(cnews_example)
                # Cache as a .pkl file for fast reloading
                with open(
                        os.path.join(self.data_dir,
                                     file_name.rsplit('.', 1)[0] +
                                     '_char.pkl'), 'wb') as f:
                    pickle.dump(examples, f)
            else:
                with open(
                        os.path.join(self.data_dir,
                                     file_name.rsplit('.', 1)[0] +
                                     '_char.pkl'), 'rb') as f:
                    examples = pickle.load(f)
            return examples
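
Once the loader returns, the embedding matrix and token ids can be handed to a model. A plausible usage sketch, assuming PyTorch; the `build_embedding_layer` and `pad_batch` helpers are hypothetical, not part of the snippet:

import torch
import torch.nn as nn

def build_embedding_layer(word_embedding, freeze=False):
    """Load the id-aligned matrix into an nn.Embedding layer."""
    weights = torch.tensor(word_embedding.vectors, dtype=torch.float32)
    return nn.Embedding.from_pretrained(weights, freeze=freeze,
                                        padding_idx=word_embedding.stoi['PAD'])

def pad_batch(examples, pad_id, max_len):
    """Right-pad (or truncate) each example's tokens_ids and stack into a LongTensor."""
    rows = [(ex.tokens_ids + [pad_id] * max_len)[:max_len] for ex in examples]
    return torch.tensor(rows, dtype=torch.long)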
Example #3
    def _load_dataset(self):
        """
        加载源数据集,并构建词嵌入

        :return: 数据集,词嵌入
        """

        # Dataset file name without extension
        file_name = self.data_name.split('.')[0]

        # If the dataset .pkl does not exist:
        # 1. load the training set and build its vocabulary
        # 2. build the word embeddings on the training set
        if not self.is_mlp:
            examples_path = os.path.join(self.data_dir,
                                         file_name + '_examples.pkl')
        else:
            examples_path = os.path.join(self.data_dir,
                                         file_name + '_examples_mlp.pkl')

        if not os.path.exists(examples_path):

            # 1. Load the pretrained word-vector file
            pretrained_wordembedding = load_pretrained_wordembedding(
                self.word_embedding_path)

            # 2. Initialize the vocabulary and the vector list
            """
            stoi: dict, token -> index, e.g. 'UNK': 0
            itos: dict, index -> token, e.g. 0: 'UNK'
            vectors: embedding rows aligned with token ids,
                     e.g. -1.08563066e+00  9.97345448e-01  ...
            """
            stoi = {'UNK': 0, 'PAD': 1}
            itos = {0: 'UNK', 1: 'PAD'}
            vectors = [pretrained_wordembedding['UNK'],
                       pretrained_wordembedding['PAD']]

            # 3. Build the dataset examples, extending the vocabulary and
            #    vectors as new tokens are encountered

            examples = []

            # Read the source dataset CSV
            """
            header: row where the data starts
            names: list of column names
            (optionally: encoding='utf-8', dtype=str, or .astype(str))
            """
            raw_data = pd.read_csv(os.path.join(self.data_dir, self.data_name),
                                   header=0,
                                   names=['label', 'text'])

            # Iterate over the rows of raw_data
            for _, row in tqdm(raw_data.iterrows(), total=len(raw_data)):

                # Create one example object per row
                hotspring_example = HotspringExample(row['text'], row['label'])

                # Word-level tokens
                """
                Segmentation:
                jieba.lcut() returns a list
                jieba.cut() returns an iterator
                """
                hotspring_example.tokens = jieba.lcut(str(hotspring_example.text))

                # Character-level tokens (alternative): convert the text
                # directly to a list of characters
                # hotspring_example.tokens = list(hotspring_example.text)

                # Iterate over the tokens of this text
                for token in hotspring_example.tokens:

                    # Token covered by the pretrained embeddings
                    if token in pretrained_wordembedding:

                        # New token: add it to the vocabulary
                        if token not in stoi:
                            # Assign the next free id before inserting,
                            # keeping stoi and itos aligned
                            new_id = len(stoi)
                            stoi[token] = new_id
                            itos[new_id] = token
                            # Copy its pretrained vector into vectors (later
                            # the rows of the word_embedding matrix)
                            vectors.append(pretrained_wordembedding[token])

                        # Record the token's id on the example
                        hotspring_example.tokens_ids.append(stoi[token])

                    # Out-of-vocabulary token: record the UNK id
                    else:
                        hotspring_example.tokens_ids.append(stoi['UNK'])

                # If this is for the MLP, pad up to the fixed input length (260)
                """
                    Padding could also be done in mlp_model instead, which would
                    unify the pre-model data handling across the neural networks.
                """
                if self.is_mlp:
                    tokens_ids_len = len(hotspring_example.tokens_ids)
                    if tokens_ids_len < self.needed_by_mlp_max_seq_len:
                        hotspring_example.tokens_ids.extend(
                            [stoi['PAD']] *
                            (self.needed_by_mlp_max_seq_len - tokens_ids_len))

                # Append hotspring_example (one review) to examples (all reviews)
                examples.append(hotspring_example)

            # 4. Build the word-embedding object from the vocabulary and vectors
            word_embedding = WordEmbedding(stoi, itos)
            word_embedding.vectors = np.array(vectors)

            # 5. Cache the examples and the word embedding as .pkl files
            """
               The _examples.pkl differs between the MLP and the other networks.
               The _word_embedding.pkl should be identical, but on the test set,
               for reasons unknown, evaluating the other networks with the MLP's
               _word_embedding.pkl gives the same AUC, while evaluating the MLP
               with the other networks' _word_embedding.pkl lowers the AUC by
               roughly 0.001. For now they are saved separately.
            """
            if not self.is_mlp:
                with open(
                        os.path.join(self.data_dir,
                                     file_name + '_examples.pkl'), 'wb') as f:
                    pickle.dump(examples, f)
                with open(
                        os.path.join(self.data_dir,
                                     file_name + '_word_embedding.pkl'),
                        'wb') as f:
                    pickle.dump(word_embedding, f)
            else:
                with open(
                        os.path.join(self.data_dir,
                                     file_name + '_examples_mlp.pkl'),
                        'wb') as f:
                    pickle.dump(examples, f)
                with open(
                        os.path.join(self.data_dir,
                                     file_name + '_word_embedding_mlp.pkl'),
                        'wb') as f:
                    pickle.dump(word_embedding, f)
            # Also save the raw vocabulary for convenience
            # (it is already contained in the word_embedding object)
            with open("data/dictionary/word2idx.json", 'w',
                      encoding='utf-8') as f:
                json.dump(stoi, f, ensure_ascii=False)

        # The dataset .pkl exists: load the cached files
        else:
            if not self.is_mlp:
                # Load the cached examples
                with open(
                        os.path.join(self.data_dir,
                                     file_name + '_examples.pkl'), 'rb') as f:
                    examples = pickle.load(f)
                # Load the cached word embedding
                with open(
                        os.path.join(self.data_dir,
                                     file_name + '_word_embedding.pkl'),
                        'rb') as f:
                    word_embedding = pickle.load(f)
            else:
                with open(
                        os.path.join(self.data_dir,
                                     file_name + '_examples_mlp.pkl'),
                        'rb') as f:
                    examples = pickle.load(f)
                # Load the cached word embedding
                with open(
                        os.path.join(self.data_dir,
                                     file_name + '_word_embedding_mlp.pkl'),
                        'rb') as f:
                    word_embedding = pickle.load(f)

        return examples, word_embedding
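
The extra word2idx.json written above can be loaded back as follows (a minimal sketch; the path is the one hard-coded in the snippet, and the reverse map mirrors what Example #4's docstring describes):

import json

with open("data/dictionary/word2idx.json", "r", encoding="utf-8") as f:
    stoi = json.load(f)
# Rebuild the reverse map: token -> id becomes id -> token
itos = {idx: tok for tok, idx in stoi.items()}
assert itos[stoi['UNK']] == 'UNK'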
Example #4
    def _load_dataset(self):

        file_name = self.data_name.split('.')[0]

        if not self.is_mlp:
            examples_path = os.path.join(self.data_dir,
                                         file_name + '_examples.pkl')
        else:
            examples_path = os.path.join(self.data_dir,
                                         file_name + '_examples_mlp.pkl')
        if not os.path.exists(examples_path):

            # The pretrained_wordembedding loaded here is effectively the
            # corpus_word_embedding.pkl built from the raw corpus
            pretrained_wordembedding = load_pretrained_wordembedding(
                self.word_embedding_path)

            # Load the original vocabulary
            """
                Alternatively, load it from the saved JSON dictionary:
                    with open("data/dictionary/word2idx.json", "r", encoding="utf-8") as f:
                        stoi = json.load(f)
            """
            with open(os.path.join(self.model_word_embedding_path), 'rb') as f:
                word_embedding_for_stoi = pickle.load(f)
            stoi = word_embedding_for_stoi.stoi

            # Build itos by inverting stoi
            """
                {"zero": 0, "one": 1} --> {0: 'zero', 1: 'one'}
            """
            itos = {idx: tok for tok, idx in stoi.items()}

            # Build the embedding matrix row-aligned with the stoi ids, falling
            # back to the UNK vector for tokens the pretrained embeddings miss
            vectors = [
                pretrained_wordembedding[tok]
                if tok in pretrained_wordembedding
                else pretrained_wordembedding['UNK']
                for tok, _ in sorted(stoi.items(), key=lambda kv: kv[1])
            ]

            examples = []

            raw_data = pd.read_csv(os.path.join(self.data_dir, self.data_name),
                                   header=0,
                                   names=['label', 'text'])

            for _, row in tqdm(raw_data.iterrows(), total=len(raw_data)):

                hotspring_example = HotspringExample(row['text'], row['label'])

                hotspring_example.tokens = jieba.lcut(
                    str(hotspring_example.text))

                # Character-level tokens (alternative)
                # hotspring_example.tokens = list(hotspring_example.text)

                # Map each token of the sentence to its id
                for token in hotspring_example.tokens:
                    # Known token: translate via the loaded vocabulary;
                    # unknown token: fall back to the UNK id
                    if token in stoi:
                        hotspring_example.tokens_ids.append(stoi[token])
                    else:
                        hotspring_example.tokens_ids.append(stoi['UNK'])

                if self.is_mlp:
                    # Pad with PAD up to the fixed MLP input length, or truncate
                    tokens_ids_len = len(hotspring_example.tokens_ids)
                    if tokens_ids_len < self.needed_by_mlp_max_seq_len:
                        hotspring_example.tokens_ids.extend(
                            [stoi['PAD']] *
                            (self.needed_by_mlp_max_seq_len - tokens_ids_len))
                    else:
                        hotspring_example.tokens_ids = hotspring_example.tokens_ids[
                            :self.needed_by_mlp_max_seq_len]

                examples.append(hotspring_example)

            word_embedding = WordEmbedding(stoi, itos)
            word_embedding.vectors = np.array(vectors)

            if not self.is_mlp:
                with open(
                        os.path.join(self.data_dir,
                                     file_name + '_examples.pkl'), 'wb') as f:
                    pickle.dump(examples, f)
                with open(
                        os.path.join(self.data_dir,
                                     file_name + '_word_embedding.pkl'),
                        'wb') as f:
                    pickle.dump(word_embedding, f)
            else:
                with open(
                        os.path.join(self.data_dir,
                                     file_name + '_examples_mlp.pkl'),
                        'wb') as f:
                    pickle.dump(examples, f)
                with open(
                        os.path.join(self.data_dir,
                                     file_name + '_word_embedding_mlp.pkl'),
                        'wb') as f:
                    pickle.dump(word_embedding, f)

        else:
            if not self.is_mlp:
                with open(
                        os.path.join(self.data_dir,
                                     file_name + '_examples.pkl'), 'rb') as f:
                    examples = pickle.load(f)
                with open(
                        os.path.join(self.data_dir,
                                     file_name + '_word_embedding.pkl'),
                        'rb') as f:
                    word_embedding = pickle.load(f)
            else:
                with open(
                        os.path.join(self.data_dir,
                                     file_name + '_examples_mlp.pkl'),
                        'rb') as f:
                    examples = pickle.load(f)
                with open(
                        os.path.join(self.data_dir,
                                     file_name + '_word_embedding_mlp.pkl'),
                        'rb') as f:
                    word_embedding = pickle.load(f)
        return examples, word_embedding
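
A quick sanity check for the id/vector alignment that Example #4 depends on (hypothetical usage; `loader` stands for whatever object exposes _load_dataset):

examples, word_embedding = loader._load_dataset()
# The embedding matrix must have exactly one row per vocabulary entry
assert word_embedding.vectors.shape[0] == len(word_embedding.stoi)
# Every stored token id must index a valid row of the matrix
for ex in examples:
    assert all(0 <= i < len(word_embedding.stoi) for i in ex.tokens_ids)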