def __init__(self,
             config,
             model_path,
             label_path,
             bert_path='chinese-bert-wwm',
             max_seq_length=32):
    self.config = config
    self.model_path = model_path
    self.label_path = label_path
    self.bert_path = bert_path
    self.model = BERTForMultiLabelSequenceClassification(
        self.config, self.config.num_classes)
    self.model.load_state_dict(
        torch.load(self.model_path, map_location=torch.device('cpu')))
    # FP16 variant of the predictor below: half() cuts memory and latency
    # on GPU, but most ops do not support fp16 on CPU
    self.model.half()
    self.model.eval()
    self.model.to(self.config.device)
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.max_seq_length = max_seq_length
    self.processor = TextProcessor()
    self.labels = self.processor.get_labels(self.label_path)
    self.label2id = {label: id_ for id_, label in enumerate(self.labels)}
    self.id2label = {id_: label for id_, label in enumerate(self.labels)}
class Predict:
    def __init__(self, model_path, bert_path, max_seq_length=32):
        self.processor = TextProcessor()
        self.sess_options = SessionOptions()
        # self.sess_options.intra_op_num_threads = 1
        self.sess_options.intra_op_num_threads = psutil.cpu_count(logical=True)
        # self.sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        self.session = InferenceSession(model_path, self.sess_options)
        self.use_gpu = torch.cuda.is_available()
        self.device = torch.device("cuda:7" if self.use_gpu else "cpu")
        self.tokenizer = BertTokenizer.from_pretrained(bert_path)
        self.max_seq_length = max_seq_length

    def to_numpy(self, tensor):
        return tensor.detach().cpu().numpy(
        ) if tensor.requires_grad else tensor.cpu().numpy()

    def run(self, record):
        text_a, text_b = record[0], record[1]
        example = self.processor._create_single_example(text_a, text_b)
        feature = convert_single_example(example, self.max_seq_length,
                                         self.tokenizer)
        input_ids = torch.tensor(feature.input_ids,
                                 dtype=torch.long).unsqueeze(0)
        input_mask = torch.tensor(feature.input_mask,
                                  dtype=torch.long).unsqueeze(0)
        segment_ids = torch.tensor(feature.segment_ids,
                                   dtype=torch.long).unsqueeze(0)
        # onnxruntime expects numpy arrays, not torch tensors
        ort_inputs = {
            'input_ids': self.to_numpy(input_ids),
            'input_mask': self.to_numpy(input_mask),
            'segment_ids': self.to_numpy(segment_ids)
        }
        ort_outputs = self.session.run(None, ort_inputs)
        print(ort_outputs)
        print(type(ort_outputs))

    def infer(self, data_path):
        pass
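# A minimal usage sketch of the ONNX predictor above; the model and vocab
# paths here are hypothetical placeholders, not the repo's actual paths.
if __name__ == '__main__':
    predictor = Predict('data/match/model.onnx', 'chinese-bert-wwm')
    predictor.run(['今天天气怎么样', '今天天气如何'])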
response["predict"] = predict response["index"] = index_str response["ok"] = True except Exception as e: response["predict"] = 0 response["index"] = index_str response["ok"] = False response_batch["results"].append(response) return response_batch max_seq_len = 32 config = Config('data') tokenizer = BertTokenizer.from_pretrained('./vocab') processor = TextProcessor() def to_numpy(tensor): return tensor.detach().cpu().numpy( ) if tensor.requires_grad else tensor.cpu().numpy() # 需要根据模型类型重写 def infer(bert, bert1, nezha, query_A, query_B): text_a, text_b = query_A, query_B example = processor._create_single_example(text_a, text_b) feature = convert_single_example_dynamic(example, max_seq_len, tokenizer) # feature = convert_single_example(example, max_seq_len, tokenizer)
def train(config, model):
    fgm = FGM(model)
    processor = TextProcessor()
    label_list = processor.get_labels(config.class_path)

    # load the training data
    train_examples = processor.get_train_examples(config.train_path)
    train_features = convert_examples_to_features(train_examples, label_list,
                                                  config.max_seq_length,
                                                  config.tokenizer)
    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in train_features],
                                 dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                               all_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=config.batch_size,
                                  drop_last=True)

    # load the dev data
    dev_examples = processor.get_dev_examples(config.dev_path)
    dev_features = convert_examples_to_features(dev_examples, label_list,
                                                config.max_seq_length,
                                                config.tokenizer)
    all_input_ids_dev = torch.tensor([f.input_ids for f in dev_features],
                                     dtype=torch.long)
    all_input_mask_dev = torch.tensor([f.input_mask for f in dev_features],
                                      dtype=torch.long)
    all_segment_ids_dev = torch.tensor([f.segment_ids for f in dev_features],
                                       dtype=torch.long)
    all_label_ids_dev = torch.tensor([f.label_ids for f in dev_features],
                                     dtype=torch.long)
    dev_data = TensorDataset(all_input_ids_dev, all_input_mask_dev,
                             all_segment_ids_dev, all_label_ids_dev)
    dev_sampler = SequentialSampler(dev_data)
    dev_dataloader = DataLoader(dev_data,
                                sampler=dev_sampler,
                                batch_size=config.batch_size)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = BertAdam(
        optimizer_grouped_parameters,
        lr=config.learning_rate,
        # schedule='warmup_linear',
        warmup=0.05,
        t_total=config.num_epochs * len(train_dataloader))
    # lookahead
    # from optimizer import Lookahead
    # optimizer = Lookahead(optimizer, k=5, alpha=0.5)

    logger.info(f"Training on GPU: {torch.cuda.current_device()}...")
    model.train()
    eval_steps = len(train_dataloader) // 2
    for i in range(config.num_epochs):
        total_batch = 0
        eval_best_loss = float('inf')
        eval_best_auc_score = float('-inf')
        eval_best_acc = float('-inf')
        last_improve = 0
        flag = False
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(config.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids, segment_ids, input_mask, label_ids)
            loss.backward()
            # adversarial training: perturb embeddings, backprop the
            # adversarial loss, then restore the original weights
            fgm.attack()
            loss_adv = model(input_ids, segment_ids, input_mask, label_ids)
            loss_adv.backward()
            fgm.restore()
            optimizer.step()
            model.zero_grad()
            logits = model(input_ids, segment_ids, input_mask)
            logger.info(f"Epoch: {i+1}, step: {step+1}, train_loss: {loss}")
        torch.save(model.state_dict(), config.save_path)
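# train() relies on an FGM helper defined elsewhere in this repo. Below is
# a standard Fast Gradient Method sketch consistent with the attack()/
# restore() call sites above; epsilon and the 'word_embeddings' name
# filter are assumptions, not necessarily the repo's actual values.
import torch


class FGM:
    def __init__(self, model, epsilon=1.0, emb_name='word_embeddings'):
        self.model = model
        self.epsilon = epsilon
        self.emb_name = emb_name
        self.backup = {}

    def attack(self):
        # add an L2-normalized gradient perturbation to the embedding weights
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    param.data.add_(self.epsilon * param.grad / norm)

    def restore(self):
        # put the original embedding weights back after the adversarial pass
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                param.data = self.backup[name]
        self.backup = {}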
class Predict:
    def __init__(self,
                 config,
                 model_path,
                 label_path,
                 bert_path='chinese-bert-wwm',
                 max_seq_length=32):
        self.config = config
        self.model_path = model_path
        self.label_path = label_path
        self.bert_path = bert_path
        self.model = BERTForMultiLabelSequenceClassification(
            self.config, self.config.num_classes)
        self.model.load_state_dict(
            torch.load(self.model_path, map_location=torch.device('cpu')))
        self.model.eval()
        self.model.to(self.config.device)
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.max_seq_length = max_seq_length
        self.processor = TextProcessor()
        self.labels = self.processor.get_labels(self.label_path)
        self.label2id = {label: id_ for id_, label in enumerate(self.labels)}
        self.id2label = {id_: label for id_, label in enumerate(self.labels)}

    def run(self, record):
        '''Predict the fine-grained label for a single sentence pair.'''
        text_a, text_b = record[0], record[1]
        example = self.processor._create_single_example(text_a, text_b)
        feature = convert_single_example(example, self.max_seq_length,
                                         self.tokenizer)
        input_ids = torch.tensor(
            feature.input_ids, dtype=torch.long).unsqueeze(0).to(self.config.device)
        input_mask = torch.tensor(
            feature.input_mask, dtype=torch.long).unsqueeze(0).to(self.config.device)
        segment_ids = torch.tensor(
            feature.segment_ids, dtype=torch.long).unsqueeze(0).to(self.config.device)
        logits = self.model(input_ids, segment_ids, input_mask).detach()
        prob = logits.sigmoid()[:, 1].tolist()  # e.g. [0.123]
        # prob = torch.sigmoid(logits)
        # return prob[0].cpu().tolist()[0]
        return prob[0]

    def collect_badcase(self, data_path):
        badcase = []
        cnt = 0
        with open(data_path, 'r', encoding='utf-8') as reader:
            for record in reader:
                print(f'Record {cnt+1}...')
                cnt += 1
                text_a, text_b, label = record.strip().split('\t')
                pre = self.run([text_a, text_b])
                pre_label = '1' if pre > 0.5 else '0'
                if pre_label != label:
                    badcase.append('\t'.join(
                        [text_a, text_b, label, pre_label, str(pre)]))
        return badcase

    def evaluate(self, data_path):
        '''Evaluate the model on the whole dataset.'''
        labels = []
        pres = []
        cnt = 0
        with open(data_path, 'r', encoding='utf-8') as reader:
            for record in reader:
                print(f'Record {cnt+1}...')
                cnt += 1
                text_a, text_b, label = record.strip().split('\t')
                pre = self.run([text_a, text_b])
                labels.append(int(label))
                pres.append(pre)
        fpr, tpr, th = roc_curve(labels, pres, pos_label=1)
        auc_score = auc(fpr, tpr)
        return auc_score, pres

    def inference(self, data_path, to_path):
        pres = []
        cnt = 0
        with open(data_path, 'r', encoding='utf-8') as reader:
            for record in reader:
                print(f'Record {cnt+1}...')
                cnt += 1
                text_a, text_b = record.strip().split('\t')
                pre = self.run([text_a, text_b])
                pres.append(pre)
        with open(to_path, 'w', encoding='utf-8') as writer:
            for pre in pres:
                writer.write(str(pre) + '\n')
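# A minimal usage sketch of the PyTorch predictor (the checkpoint, label,
# and dev-file paths are hypothetical placeholders): score one sentence
# pair, then compute AUC on a tab-separated dev file.
if __name__ == '__main__':
    config = Config('data')
    predictor = Predict(config, 'data/match/bert.ckpt', 'data/match/class.txt')
    prob = predictor.run(['今天天气怎么样', '今天天气如何'])
    auc_score, pres = predictor.evaluate('data/match/dev.txt')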
def train(config, model):
    fgm = FGM(model)
    processor = TextProcessor()
    label_list = processor.get_labels(config.class_path)

    # load the training data
    train_examples = processor.get_train_examples(config.train_path)
    train_features = convert_examples_to_features(train_examples, label_list,
                                                  config.max_seq_length,
                                                  config.tokenizer)
    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in train_features],
                                 dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                               all_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=config.batch_size,
                                  drop_last=True)

    # load the dev data
    dev_examples = processor.get_dev_examples(config.dev_path)
    dev_features = convert_examples_to_features(dev_examples, label_list,
                                                config.max_seq_length,
                                                config.tokenizer)
    all_input_ids_dev = torch.tensor([f.input_ids for f in dev_features],
                                     dtype=torch.long)
    all_input_mask_dev = torch.tensor([f.input_mask for f in dev_features],
                                      dtype=torch.long)
    all_segment_ids_dev = torch.tensor([f.segment_ids for f in dev_features],
                                       dtype=torch.long)
    all_label_ids_dev = torch.tensor([f.label_ids for f in dev_features],
                                     dtype=torch.long)
    dev_data = TensorDataset(all_input_ids_dev, all_input_mask_dev,
                             all_segment_ids_dev, all_label_ids_dev)
    dev_sampler = SequentialSampler(dev_data)
    dev_dataloader = DataLoader(dev_data,
                                sampler=dev_sampler,
                                batch_size=config.batch_size)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=config.num_epochs * len(train_dataloader))

    logger.info(f"Training on GPU: {torch.cuda.current_device()}...")
    model.train()
    eval_steps = len(train_dataloader) // 2
    for i in range(config.num_epochs):
        total_batch = 0
        eval_best_loss = float('inf')
        eval_best_auc_score = float('-inf')
        eval_best_acc = float('-inf')
        last_improve = 0
        flag = False
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(config.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids, segment_ids, input_mask, label_ids)
            loss.backward()
            # adversarial training
            fgm.attack()
            loss_adv = model(input_ids, segment_ids, input_mask, label_ids)
            loss_adv.backward()
            fgm.restore()
            optimizer.step()
            model.zero_grad()
            logits = model(input_ids, segment_ids, input_mask)
            logger.info(f"Epoch: {i+1}, step: {step+1}, train_loss: {loss}")
            if (total_batch + 1) % eval_steps == 0:
                torch.save(model.state_dict(), config.save_path)
                eval_accuracy, eval_precision, eval_loss, auc_score = evaluate(
                    config, model, dev_dataloader)
                if auc_score > eval_best_auc_score:
                    eval_best_auc_score = auc_score
                    torch.save(model.state_dict(), config.save_path)
                # if eval_accuracy > eval_best_acc:
                #     eval_best_acc = eval_accuracy
                #     torch.save(model.state_dict(), config.save_path)
                print('*' * 80)
                logger.info(
                    f"Epoch: {i+1}, step: {step+1}, train_loss: {loss}, "
                    f"eval_loss: {eval_loss}, eval_accuracy: {eval_accuracy}, "
                    f"eval_precision: {eval_precision}, auc_score: {auc_score}")
                # save the log
                with open('data/match/logs.txt', 'a+',
                          encoding='utf-8') as log_writer:
                    log_writer.write(
                        f"Epoch: {i+1}, step: {step+1}, train_loss: {loss}, "
                        f"eval_loss: {eval_loss}, eval_accuracy: {eval_accuracy}, "
                        f"eval_precision: {eval_precision}, auc_score: {auc_score}\n")
                model.train()
            total_batch += 1
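# train() calls an evaluate() helper that is defined elsewhere. Below is a
# sketch consistent with its call site (four return values) and with the
# Predict class above; the 0.5 threshold, the sklearn metrics, and the
# label_ids[:, 1] indexing (assuming one-hot two-class labels) are
# assumptions, not the repo's actual implementation.
import torch
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score


def evaluate(config, model, dev_dataloader):
    model.eval()
    y_true, y_prob, losses = [], [], []
    with torch.no_grad():
        for batch in dev_dataloader:
            batch = tuple(t.to(config.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids, segment_ids, input_mask, label_ids)
            logits = model(input_ids, segment_ids, input_mask)
            losses.append(loss.item())
            y_prob.extend(logits.sigmoid()[:, 1].cpu().tolist())
            y_true.extend(label_ids[:, 1].long().cpu().tolist())
    y_pred = [int(p > 0.5) for p in y_prob]
    fpr, tpr, _ = roc_curve(y_true, y_prob, pos_label=1)
    return (accuracy_score(y_true, y_pred), precision_score(y_true, y_pred),
            sum(losses) / len(losses), auc(fpr, tpr))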
class Predict:
    def __init__(self, model_path, bert_path):
        self.processor = TextProcessor()
        # self.sess_options = SessionOptions()
        # self.sess_options.intra_op_num_threads = 1
        # self.sess_options.intra_op_num_threads = psutil.cpu_count(logical=True)
        # self.sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        # self.session = InferenceSession(model_path, self.sess_options)
        self.session = InferenceSession(model_path)
        # self.session_1 = InferenceSession(model_path_1)  # ensemble of several models
        # print(self.session.get_inputs()[2].name)
        # print(len(self.session.get_inputs()))
        # self.use_gpu = torch.cuda.is_available()
        # self.device = torch.device("cuda:7" if self.use_gpu else "cpu")
        self.tokenizer = BertTokenizer.from_pretrained(bert_path)

    def to_numpy(self, tensor):
        return tensor.detach().cpu().numpy(
        ) if tensor.requires_grad else tensor.cpu().numpy()

    def run(self, record):
        text_a, text_b = record[0], record[1]
        example = self.processor._create_single_example(text_a, text_b)
        feature = convert_single_example(example, 32, self.tokenizer)
        input_ids = torch.tensor(feature.input_ids,
                                 dtype=torch.long).unsqueeze(0)
        segment_ids = torch.tensor(feature.segment_ids,
                                   dtype=torch.long).unsqueeze(0)
        input_mask = torch.tensor(feature.input_mask,
                                  dtype=torch.long).unsqueeze(0)
        ort_inputs = {
            'input_ids': self.to_numpy(input_ids),
            'segment_ids': self.to_numpy(segment_ids),
            'input_mask': self.to_numpy(input_mask)
        }
        ort_outputs = self.session.run(None, ort_inputs)
        ort_logits = torch.from_numpy(
            ort_outputs[0])  # e.g. tensor([[4.7433, -4.5335]])
        # ort_logits_1 = torch.from_numpy(ort_outputs_1[0])  # 2-d tensor
        prob = ort_logits.sigmoid()[:, 1].tolist()[0]  # e.g. 0.123
        return prob

    def infer(self, data_path, to_path):
        pres = []
        cnt = 0
        with open(data_path, 'r', encoding='utf-8') as reader:
            for record in reader:
                print(f'Record {cnt+1}...')
                cnt += 1
                text_a, text_b = record.strip().split('\t')
                pre = self.run([text_a, text_b])
                pres.append(pre)
        with open(to_path, 'w', encoding='utf-8') as writer:
            for pre in pres:
                writer.write(str(pre) + '\n')
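# The .onnx file loaded above has to be produced by an export step that is
# not shown here. A sketch using torch.onnx.export, with input/output
# names matching the ort_inputs keys used in run(); the dummy shapes,
# dynamic_axes, and opset version are assumptions.
import torch


def export_onnx(model, to_path, max_seq_length=32):
    model.eval()
    dummy = torch.ones(1, max_seq_length, dtype=torch.long)
    torch.onnx.export(
        model,
        (dummy, dummy, dummy),  # positional: input_ids, segment_ids, input_mask
        to_path,
        input_names=['input_ids', 'segment_ids', 'input_mask'],
        output_names=['logits'],
        dynamic_axes={
            'input_ids': {0: 'batch', 1: 'seq'},
            'segment_ids': {0: 'batch', 1: 'seq'},
            'input_mask': {0: 'batch', 1: 'seq'},
            'logits': {0: 'batch'}
        },
        opset_version=11)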