import os
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_pretrained_bert.tokenization import BertTokenizer
from preprocessing.data_processor import MyPro, convert_examples_to_features
import config.args as args
from util.Logginger import init_logger

logger = init_logger(f"{args.task_name}", logging_path=args.log_path)


def init_parameters():
    tokenizer = BertTokenizer(vocab_file=args.VOCAB_FILE)
    tokenizer.save_vocabulary(args.output_dir)  # save the vocabulary file
    processor = MyPro()
    return tokenizer, processor


def create_batch_iter(mode, path):
    """Build a batch iterator."""
    logger.info(f'{mode} path is {path}')
    tokenizer, processor = init_parameters()
    if mode == "train":
        examples = processor.get_train_examples(path)
        num_train_steps = int(
            len(examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
        batch_size = args.train_batch_size
        logger.info(" Num train steps = %d", num_train_steps)
    elif mode == "dev":
        # the source was cut off here; the dev branch below mirrors the
        # sibling loader in this repo (see the QA loader's dev branch)
        examples = processor.get_dev_examples(path)
        batch_size = args.eval_batch_size
    else:
        raise ValueError("Invalid mode %s" % mode)
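# --- Illustrative sketch, not part of the original file ---
# How the features are typically packed into the returned iterator, using the
# imports at the top of this file. Assumption: each feature produced by
# convert_examples_to_features exposes input_ids / input_mask / segment_ids /
# label_id fields (the usual pytorch_pretrained_bert pattern; the truncated
# file does not confirm this).
def build_dataloader_sketch(features, batch_size, is_train):
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    # shuffle during training, keep deterministic order during evaluation
    sampler = RandomSampler(data) if is_train else SequentialSampler(data)
    return DataLoader(data, sampler=sampler, batch_size=batch_size)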
import os
import time
import inspect
import torch
import torch.nn as nn
from torch.autograd import Variable
import config.config as config
from util.gpu_mem_track import MemTracker
from util.plot_util import loss_acc_plot
from util.lr_util import lr_update
from util.Logginger import init_logger

logger = init_logger("torch", logging_path=config.LOG_PATH)

torch.manual_seed(2018)
torch.cuda.manual_seed(2018)
torch.cuda.manual_seed_all(2018)

import warnings
warnings.filterwarnings('ignore')

os.environ["CUDA_VISIBLE_DEVICES"] = "%d" % config.device

frame = inspect.currentframe()
gpu_tracker = MemTracker(frame)

use_cuda = config.use_cuda if torch.cuda.is_available() else False
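# --- Illustrative sketch, not part of the original file ---
# How the gpu_tracker created above is typically used: snapshot GPU memory
# before and after an allocation so the log shows the delta. Assumption: this
# is the widely shared gpu_mem_track utility whose MemTracker exposes a
# track() method; if this repo's copy differs, adjust accordingly.
def tracked_forward_sketch(model, batch):
    gpu_tracker.track()   # log GPU memory before the forward pass
    output = model(batch)
    gpu_tracker.track()   # log again; the difference is what the pass allocated
    return output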
import time
import torch
from pytorch_pretrained_bert.optimization import BertAdam
import config.args as args
from util.plot_util import loss_acc_plot
from util.Logginger import init_logger
from evaluate.loss import loss_fn
from evaluate.acc_f1 import qa_evaluate
from util.model_util import save_model, load_model

logger = init_logger("torch", logging_path=args.log_path)

torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

import warnings
warnings.filterwarnings('ignore')


def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x / warmup
    return 1.0 - x


def fit(model, training_iter, eval_iter, num_epoch,
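# --- Illustrative sketch, not part of the original file ---
# warmup_linear maps training progress x in [0, 1] to a learning-rate
# multiplier: it ramps up linearly over the first `warmup` fraction of
# training, then decays linearly toward zero. A quick check of the shape,
# using only the function defined above (step counts are made up):
def _show_warmup_schedule(num_train_steps=1000, warmup=0.1):
    for step in (0, 50, 100, 500, 1000):
        x = step / num_train_steps
        print(step, round(warmup_linear(x, warmup), 3))
# Prints: 0 -> 0.0, 50 -> 0.5, 100 -> 0.9, 500 -> 0.5, 1000 -> 0.0.
# Note the drop from ~1.0 to 0.9 right at the warmup boundary: the post-warmup
# branch returns 1.0 - x, a known quirk of this scheduler variant.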
import os
import json
import random
import collections
from tqdm import tqdm
import config.args as args
from util.Logginger import init_logger
from pytorch_pretrained_bert.tokenization import BertTokenizer

logger = init_logger("QA", logging_path=args.log_path)


class InputExample(object):
    """Template for a single data example."""

    def __init__(self,
                 qas_id,                 # question id
                 question_text,          # question text
                 doc_tokens,             # context
                 orig_answer_text=None,  # answer text
                 start_position=None,    # for Yes, No & no-answer, start_position = 0
                 end_position=None,      # for Yes, No & no-answer, end_position = 0
                 answer_type=None):      # answer type encoding -- Yes: 0, No: 1, no-answer: 2, long-answer: 3
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position
        self.answer_type = answer_type
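# --- Illustrative sketch, not part of the original file ---
# Constructing examples with the class above. All values are made up for
# illustration; what matters is the answer_type encoding documented in the
# constructor and the convention that non-span answers use position 0.
span_example = InputExample(
    qas_id="q-001",
    question_text="Where was the treaty signed?",
    doc_tokens=["The", "treaty", "was", "signed", "in", "Paris", "."],
    orig_answer_text="Paris",
    start_position=5,
    end_position=5,
    answer_type=3,     # long-answer: the answer is a span of the context
)
yes_example = InputExample(
    qas_id="q-002",
    question_text="Was the treaty signed?",
    doc_tokens=["The", "treaty", "was", "signed", "in", "Paris", "."],
    start_position=0,  # non-span answers (Yes/No/no-answer) use position 0
    end_position=0,
    answer_type=0,     # Yes
)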
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_pretrained_bert.tokenization import BertTokenizer
from preprocessing.data_processor import MyPro, convert_examples_to_features
import config.args as args
from util.Logginger import init_logger

logger = init_logger("bert_ner", logging_path=args.log_path)


def init_params():
    processors = {"bert_ner": MyPro}
    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)
    processor = processors[task_name]()
    tokenizer = BertTokenizer(vocab_file=args.VOCAB_FILE)
    return processor, tokenizer


def create_batch_iter(mode):
    """Build a batch iterator."""
    processor, tokenizer = init_params()
    if mode == "train":
        examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
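# --- Illustrative sketch, not part of the original file ---
# The num_train_steps formula above counts optimizer updates, not batches:
# gradient_accumulation_steps forward passes share one optimizer step. With
# made-up numbers -- 10,000 examples, batch size 32, accumulation 2, 3 epochs:
#   10000 / 32 / 2 * 3 = 468.75, so int(...) gives 468 optimizer updates.
num_train_steps_demo = int(10000 / 32 / 2 * 3)  # 468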
import torch
import config.args as args
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from pytorch_pretrained_bert.tokenization import BertTokenizer
from preprocessing.data_processor import read_qa_examples, convert_examples_to_features
from util.Logginger import init_logger

logger = init_logger("bert_class", logging_path=args.log_path)


def init_params():
    tokenizer = BertTokenizer(vocab_file=args.VOCAB_FILE)
    return tokenizer


def create_batch_iter(mode):
    """Build a batch iterator."""
    tokenizer = init_params()
    if mode == "train":
        examples = read_qa_examples(args.data_dir, "train")
        batch_size = args.train_batch_size
    elif mode == "dev":
        examples = read_qa_examples(args.data_dir, "dev")
        batch_size = args.eval_batch_size
    else:
        raise ValueError("Invalid mode %s" % mode)
    # build features
    features = convert_examples_to_features(examples, tokenizer,
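# --- Illustrative sketch, not part of the original file ---
# This file also imports DistributedSampler. A sketch of how the sampler
# choice usually branches in pytorch_pretrained_bert-style loaders.
# Assumption: local_rank == -1 means single-GPU, non-distributed training;
# this repo's actual branching is not visible in the truncated file.
def choose_sampler_sketch(data, mode, local_rank=-1):
    if mode == "train":
        # in distributed training each worker samples a disjoint shard
        return RandomSampler(data) if local_rank == -1 else DistributedSampler(data)
    return SequentialSampler(data)  # deterministic order for evaluation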
import json
from util.Logginger import init_logger
import config.args as args

logger = init_logger("model_net", logging_path=args.log_path)


class InputExample(object):
    def __init__(self, guid, text_a, text_b=None, label=None):
        """Create an input example.

        Args:
            guid: unique id for the example
            text_a: raw text of the first sentence; plain text classification only needs text_a
            text_b: raw text of the second sentence; used only in sentence-pair tasks, None for classification
            label: the example's label; should be non-None for train and dev sets, None for the test set
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


class InputFeature(object):
    def __init__(self, input_ids, input_mask, segment_ids, label_id, output_mask):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id
        self.output_mask = output_mask
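# --- Illustrative sketch, not part of the original file ---
# How an InputExample is typically converted into an InputFeature with a BERT
# tokenizer: tokenize, add [CLS]/[SEP], map tokens to ids, then pad ids, mask
# and segment ids to max_seq_length. Assumptions: single-sentence input
# (text_b ignored) and output_mask reusing input_mask as a placeholder, since
# this repo defines output_mask semantics elsewhere.
def example_to_feature_sketch(example, tokenizer, max_seq_length, label_map):
    tokens = ["[CLS]"] + tokenizer.tokenize(example.text_a)[:max_seq_length - 2] + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)   # 1 for real tokens, 0 for padding
    segment_ids = [0] * len(input_ids)  # single sentence: every token in segment 0
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids += padding
    return InputFeature(input_ids=input_ids,
                        input_mask=input_mask,
                        segment_ids=segment_ids,
                        label_id=label_map[example.label],
                        output_mask=input_mask)  # placeholder, see note above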