Example #1
import logging
import os
import sys
import time
import unittest

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

_cur_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append("%s/../../" % _cur_dir)
from text_utils.utils.data_io import get_attr_values
from text_utils.utils.label_encoder import LabelEncoder
from text_utils.utils.logger import init_log
from text_utils.models.torch.base_model import ClassificationModel
from text_utils.models.torch.nets.bert import BertForClassification
from text_utils.tokenizers.bert_tokenizer import BertTokenizer

init_log(stream_level=logging.INFO)


class ClassificationDataset(Dataset):
    """
    Defines how to fetch data for a specific dataset.
    """
    def __init__(self, data_list):
        # __init__ typically loads all the data up front
        super(ClassificationDataset, self).__init__()
        # Store the data as a numpy array to avoid the copy-on-read memory
        # blowup that Python lists cause in dataloader worker processes.
        # Related issue: https://github.com/pytorch/pytorch/issues/13246
        self.data_list = np.array(data_list)

    def __getitem__(self, index):
        # Fetch a single sample by index
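        # (The example is truncated here by the source page. A hedged
        # completion sketch follows; the return shape and the toy usage
        # below are assumptions for illustration, not the original code.)
        return self.data_list[index]

    def __len__(self):
        # Map-style datasets must report their size
        return len(self.data_list)


# Usage sketch: split toy samples and batch them with a DataLoader.
samples = [("headline one", "0"), ("headline two", "1"),
           ("headline three", "0"), ("headline four", "1")]
train_data, dev_data = train_test_split(samples, test_size=0.25, random_state=42)
train_loader = DataLoader(
    ClassificationDataset(train_data),
    batch_size=2,
    shuffle=True,
    collate_fn=list,  # keep raw (text, label) pairs; tokenize inside the train loop
)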
Example #2
"""
Demo of an LR multi-label classification model.
Date: 2020/06/04 15:59:43
"""

import logging
import json
import os
import re
import sys
_cur_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append("%s/../../" % _cur_dir)

from text_utils.model.lr_model_multilabel_impl import BaseLRModel
from text_utils.preprocess import ProcessFilePath
from text_utils.feature.feature_generator import FeatureGenerator
from text_utils.utils.logger import init_log
init_log("./log/lr_model_multilabel.log")

import lr_multiple_config as config


class LRModelMultipleDemo(BaseLRModel):
    """LR多标签分类模型基础类
    """
    def __init__(self, mid_data_dir, model_dir, output_dir):
        """
        """
        super(LRModelMultipleDemo, self).__init__(model_dir, output_dir)
        self.mid_data_paths = ProcessFilePath(output_dir=mid_data_dir)
        self.generator_path = os.path.join(model_dir, "generator.pkl")

        self.feature_generator = FeatureGenerator(
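        # (The FeatureGenerator(...) arguments are truncated by the source
        # page and are left unfilled here.)

# Persistence sketch for generator_path above: the "generator.pkl" name
# suggests plain pickling. Whether FeatureGenerator ships its own save/load
# helpers is unknown, so the helpers below are an assumption built only on
# the standard library.
import pickle

def save_generator(generator, path):
    # Serialize the fitted feature generator so inference can reuse it
    with open(path, "wb") as handle:
        pickle.dump(generator, handle)

def load_generator(path):
    # Restore the feature generator that was saved during training
    with open(path, "rb") as handle:
        return pickle.load(handle)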
Example #3
import os
import sys
import unittest

from ernie.modeling_ernie import ErnieModel

_cur_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append("%s/../" % _cur_dir)
from text_utils.tokenizers.ernie_tokenizer import ErnieTokenizer
from text_utils.tokenizers.lr_tokenizer import LRTokenizer
from text_utils.utils.data_io import get_attr_values, gen_batch_data
from text_utils.utils.data_io import write_to_file
from text_utils.models.dygraph.train_infer_utils import batch_infer
from text_utils.utils.logger import init_log
from text_utils.models.machine_learning.cluster import mini_batch_kmeans, data_cluster
from text_utils.utils.vectorizer import init_vectorizer

init_log()

class TestCluster(unittest.TestCase):
    """Tests for the text clustering utilities.
    """

    @classmethod
    def setUpClass(cls):
        test_root = "./"
        TestCluster.test_output_dir = os.path.join(test_root, "output/test_cluster/")
        if not os.path.isdir(TestCluster.test_output_dir):
            # makedirs creates the missing "output/" parent as well
            os.makedirs(TestCluster.test_output_dir)

        test_data_dir = os.path.join(test_root, "dataset/classification_data/toutiao_news")

        example_num = 5

        # Load the data
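        # (The loading and clustering steps are truncated here by the source
        # page. As an illustration of the underlying technique, a sklearn
        # mini-batch k-means run is sketched below; the toy corpus and the
        # parameters are assumptions, not the project's code.)
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.cluster import MiniBatchKMeans

        texts = ["toutiao news headline %d" % i for i in range(100)]
        features = TfidfVectorizer().fit_transform(texts)
        kmeans = MiniBatchKMeans(n_clusters=5, batch_size=32, random_state=42)
        cluster_ids = kmeans.fit_predict(features)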
Example #4
import logging
import os
import sys

import numpy as np
import torch
from torch.utils.data import Dataset

_cur_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append("%s/../../" % _cur_dir)

from process_data import process_origin_poetry

from text_utils.models.torch.base_model import BertSeq2seqModel, model_distributed
from text_utils.models.torch.nets.bert import BertForSeq2seq
from text_utils.tokenizers.bert_tokenizer import BertTokenizer
from text_utils.utils.data_io import get_data
from text_utils.utils.logger import init_log

# One process per GPU; NCCL is the standard backend for GPU collectives
torch.distributed.init_process_group(backend="nccl")
LOCAL_RANK = torch.distributed.get_rank()

# Log at INFO only on rank 0 so multi-GPU runs do not duplicate output
logging_level = logging.INFO if LOCAL_RANK == 0 else logging.WARNING
init_log(stream_level=logging_level)


class PoetDataset(Dataset):
    """
    Defines how to fetch data for a specific dataset.
    """
    def __init__(self, data_dir, tokenizer):
        # __init__ typically loads all the data up front
        super(PoetDataset, self).__init__()
        self.tokenizer = tokenizer
        # Store the data as a numpy array to avoid the copy-on-read memory
        # blowup that Python lists cause in dataloader worker processes.
        # Related issue: https://github.com/pytorch/pytorch/issues/13246
        self.poet_info_list = np.array(self.gen_dataset(data_dir))

    def gen_dataset(self, data_dir):
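        # (The original body is truncated here by the source page. A hedged
        # sketch follows; the tab-separated "title<TAB>poem" file layout is
        # an assumption for illustration, not the project's actual format.)
        poet_info_list = []
        for file_name in os.listdir(data_dir):
            file_path = os.path.join(data_dir, file_name)
            with open(file_path, encoding="utf-8") as fp:
                for line in fp:
                    parts = line.rstrip("\n").split("\t")
                    if len(parts) == 2:
                        # Keep (title, poem_text) pairs for seq2seq training
                        poet_info_list.append(tuple(parts))
        return poet_info_list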