def __init__(self, dataset):
    """Multi-task (NLI / emotion / news) Bert configuration rooted at *dataset*."""
    self.model_name = 'bert'
    # Per-task training sets
    self.OCLI_train_path = dataset + '/data/OCLI_train.csv'            # NLI task
    self.OCEMOTION_train_path = dataset + '/data/OCEMOTION_train.csv'  # emotion task
    self.TNEWS_train_path = dataset + '/data/TNEWS_train.csv'          # news task
    # Per-task test sets (NOTE(review): the NLI test file is spelled OCNLI on disk
    # while the train file is OCLI — presumably the files are named that way; verify)
    self.OCLI_test_path = dataset + '/data/OCNLI_a.csv'
    self.OCEMOTION_test_path = dataset + '/data/OCEMOTION_a.csv'
    self.TNEWS_test_path = dataset + '/data/TNEWS_a.csv'
    # Label inventories for each task
    self.OCLI_class_list = [0, 1, 2]
    self.OCEMOTION_class_list = ['sadness', 'like', 'happiness', 'fear', 'disgust', 'surprise', 'anger']
    self.TNEWS_class_list = [100, 101, 102, 103, 104, 106, 107, 108, 109, 110, 112, 113, 114, 115, 116]
    self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoint file
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000  # stop early if no gain for 1000 batches
    self.num_epochs = 3
    self.batch_size = 32
    self.pad_size = 32               # pad/truncate every sentence to this length
    self.learning_rate = 5e-5
    self.bert_path = './bert_pretrain'  # local pre-trained tokenizer/model
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768
    # Competition submission output files
    self.OCLI_submit_output_path = './submit/ocnli_predict.json'
    self.OCEMOTION_submit_output_path = './submit/ocemotion_predict.json'
    self.TNEWS_submit_output_path = './submit/tnews_predict.json'
    self.filter_sizes = (2, 3, 4)    # CNN kernel sizes
    self.num_filters = 256           # CNN channels per kernel size
    self.dropout = 0.1
    # 95% train / 5% validation (the value is 0.95; an earlier comment said 90/10)
    self.cut_ratio = 0.95
def __init__(self, dataset, task_name=None):
    """Bert config; an optional *task_name* selects per-task data files."""
    self.model_name = 'bert'
    self.task_name = task_name
    if not task_name:
        self.train_path = dataset + '/data/train.txt'  # training set
        self.dev_path = dataset + '/data/dev.txt'      # validation set
        self.test_path = dataset + '/data/test.txt'    # test set
    else:
        self.train_path = dataset + '/data/{}_train.txt'.format(task_name)
        self.dev_path = dataset + '/data/{}_dev.txt'.format(task_name)
        self.test_path = dataset + '/data/{}_test.txt'.format(task_name)
    # Fix: use a context manager so the class file handle is closed
    # (original called open(...).readlines() and leaked it).
    with open(dataset + '/data/class.txt', encoding='utf-8') as f:
        self.class_list = [x.strip() for x in f]  # label names, one per line
    self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoint file
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 10
    self.batch_size = 128
    self.pad_size = 32                       # pad/truncate every sentence to this length
    self.learning_rate = 5e-5
    self.bert_path = './bert_pretrain'
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768
def __init__(self, data_path, doclabel_file, bert_path=None):
    """Bert config for document labelling; *bert_path* overrides the default model dir."""
    self.model_name = 'bert'
    self.doc_path = doclabel_file  # path to the document/label data
    # self.dev_path = dataset + '/data/dev.txt'   # validation set
    # self.test_path = dataset + '/data/test.txt' # test set
    # Fix: close the file and decode explicitly as utf-8 (original leaked the
    # handle and relied on the platform default encoding).
    with open(data_path + '/data/class.txt', encoding='utf-8') as f:
        self.class_list = [x.strip() for x in f]  # label names
    self.save_path = data_path + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoint file
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 1
    self.batch_size = 1
    # self.batch_size = 128
    self.pad_size = 32                       # pad/truncate every sentence to this length
    self.learning_rate = 5e-6
    # self.learning_rate = 5e-5
    if bert_path is None:
        self.bert_path = './bert_pretrain'
    else:
        self.bert_path = bert_path
    print('Use bert path', self.bert_path)
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768
def __init__(self):
    """Static Bert configuration (no dataset argument)."""
    self.model_name = 'Bert'
    self.num_epochs = 10
    self.batch_size = 37
    self.pad_size = 32          # pad/truncate every sentence to this length
    self.learning_rate = 1e-5
    self.bert_pretrain_path = 'bert_pretrain'  # pre-trained Bert location
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_pretrain_path)
    self.hidden_size = 768      # Bert hidden size
    self.class_list = [
        'news_agriculture', 'news_car', 'news_culture', 'news_edu', 'news_house'
    ]
    # Fix: the original concatenation was missing path separators and produced
    # '.../model/saved_dictBert.ckpt' and '.../logBert' instead of real sub-paths.
    self.save_path = 'F:/nlp/shiyan/NLP2020-classification/model' + '/saved_dict/' + self.model_name + '.ckpt'
    self.require_improvement = 1000  # stop early if no gain for 1000 batches
    self.log_path = 'F:/nlp/shiyan/NLP2020-classification' + '/log/' + self.model_name
def __init__(self, dataset):
    """Bert classifier configuration rooted at the *dataset* directory."""
    self.model_name = 'bert'
    # Data splits
    self.train_path = dataset + '/data/train.txt'
    self.test_path = dataset + '/data/test.txt'
    self.dev_path = dataset + '/data/dev.txt'
    # Label names, one per line
    with open(dataset + '/data/class.txt', encoding='UTF-8') as f:
        self.class_list = [x.strip() for x in f]
    # Where the trained model is saved
    self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'
    # Prefer GPU when available
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early after 1000 stale batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 3
    self.batch_size = 128
    self.pad_size = 32                       # pad/truncate every sentence
    self.learning_rate = 1e-5
    self.bert_path = 'bert_pretrain'         # pre-trained Bert location
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)  # Bert tokenizer
    self.hidden_size = 768                   # Bert hidden size
def __init__(self, g_config):
    """Copy shared settings from a global config, then add model-specific ones."""
    # Values inherited from the global configuration
    self.model_name = g_config.model_name
    self.train_path = g_config.train_path
    self.dev_path = g_config.dev_path
    self.test_path = g_config.test_path
    self.class_list = g_config.class_list
    self.vocab_path = g_config.vocab_path
    self.save_path = g_config.save_path
    self.log_path = g_config.log_path
    self.device = g_config.device
    self.require_improvement = g_config.require_improvement  # early-stop patience in batches
    self.num_classes = g_config.num_classes
    self.batch_size = g_config.batch_size
    # Model-specific hyper-parameters
    self.num_epochs = 4
    self.pad_size = 256              # pad/truncate every sentence to this length
    self.learning_rate = 5e-5
    self.bert_path = './bert_pretrain'
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768
    self.filter_sizes = (2, 3, 4)    # CNN kernel sizes
    self.num_filters = 256           # CNN channels per kernel size
    self.dropout = 0.1
    self.rnn_hidden = 768
    self.num_layers = 2
def __init__(self, dataset, args):
    """Bert config for sentiment analysis; *args.feature* names the target aspect."""
    self.model_name = 'bert'
    self.feature_name = args.feature
    self.train_path = dataset + '/train/sentiment_analysis_trainingset.csv'      # training set
    # NOTE(review): 'vilidation' looks like a typo, but it is a runtime path that
    # presumably matches the directory on disk — confirm before renaming.
    self.dev_path = dataset + '/vilidation/sentiment_analysis_validationset.csv' # validation set
    # self.test_path = dataset + '/data/test.txt'  # test set
    # Fix: close the file and decode explicitly as utf-8 (original leaked the handle).
    with open(dataset + '/data/class.txt', encoding='utf-8') as f:
        self.class_list = [x.strip() for x in f]  # label names
    self.dir_path = 'model_result/' + self.feature_name + '/saved_dict/'
    self.train_data_pkl = self.dir_path + 'train.pkl'   # cached training data
    self.dev_data_pkl = self.dir_path + 'dev.pkl'       # cached validation data
    self.save_path = self.dir_path + self.model_name + '.ckpt'  # checkpoint file
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 3
    self.batch_size = 32
    self.pad_size = 60                       # pad/truncate every sentence to this length
    self.learning_rate = 5e-5
    self.bert_path = './bert_pretrain'
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768
    self.filter_sizes = (2, 3, 4)            # CNN kernel sizes
    self.num_filters = 256                   # CNN channels per kernel size
    self.dropout = 0.1
def __init__(self, dataset):
    """Bert config whose label inventory is derived from the training file itself."""
    self.model_name = 'bert'
    self.train_path = dataset + '/data/train.txt'  # training set
    self.log_path = dataset + '/log/' + self.model_name
    data = pd.read_csv(self.train_path, encoding='utf-8',
                       names=['comments', 'label'], sep='\t__label__',
                       header=None, engine='python')
    label = np.array(data['label'])
    class_set = set(label.flatten().tolist())
    # Fix: sort the labels — iterating a set is nondeterministic across runs, so
    # the label->index mapping (and any saved checkpoint) would not be stable.
    # Assumes labels are mutually comparable (e.g. all strings) — TODO confirm.
    self.class_list = sorted(class_set)
    self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoint file
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 32
    self.batch_size = 64
    self.pad_size = 32                       # pad/truncate every sentence to this length
    self.learning_rate = 5e-5
    self.bert_path = './bert_pretrain'
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768
def __init__(self, use_words, dataset):
    """Bert+CNN config; *use_words* switches between word- and character-level vocab."""
    self.model_name = 'bert_CNN'
    self.train_path = dataset + '/data/dataset_classes/train.txt'  # training set
    self.dev_path = dataset + '/data/dataset_classes/dev.txt'      # validation set
    self.test_path = dataset + '/data/dataset_classes/test.txt'    # test set
    # Fix: close the file and decode explicitly as utf-8 (original leaked the handle).
    with open(dataset + '/data/dataset_classes/class.txt', encoding='utf-8') as f:
        self.class_list = [x.strip() for x in f]  # label names
    dic = self.model_name + '+word/' if use_words else '+character/'
    self.save_path = dataset + '/saved_dict/classes/' + dic + self.model_name + '.pth'  # checkpoint file
    self.save_dic = dataset + '/saved_dict/classes/' + dic + self.model_name + '_'      # checkpoint prefix
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    vocabfile = 'vocab_word.pkl' if use_words else 'vocab_char.pkl'
    self.vocab_path = dataset + '/data/dataset_classes/' + vocabfile
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 3
    self.batch_size = 128
    self.pad_size = 32                       # pad/truncate every sentence to this length
    self.learning_rate = 5e-5
    self.bert_path = 'C:/Users/USER/Documents/Capstone_Project/pretrain_model/bert_pretrain'
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768
    self.filter_sizes = (2, 3, 4)            # CNN kernel sizes
    self.num_filters = 256                   # CNN channels per kernel size
    self.dropout = 0.1
def __init__(self, dataset):
    """Configuration for the BruceBert classifier."""
    self.model_name = 'BruceBert'
    self.train_path = dataset + '/data/train.txt'   # training set
    self.test_path = dataset + '/data/test.txt'     # test set
    self.dev_path = dataset + '/data/dev.txt'       # validation set
    self.datasetpkl = dataset + '/data/dataset.pkl' # cached dataset
    # Fix: close the file and decode explicitly as utf-8 (original leaked the handle).
    with open(dataset + '/data/class.txt', encoding='utf-8') as f:
        self.class_list = [x.strip() for x in f]    # label names
    self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoint file
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 3
    self.batch_size = 128
    self.learning_rate = 1e-5
    self.pad_size = 32                       # pad/truncate every sentence to this length
    self.bert_path = 'bert_pretrain'         # pre-trained Bert location
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)  # Bert tokenizer
    self.hidden_size = 768                   # hidden size of base Bert
def __init__(self, dataset):
    """Bert (+CNN/RNN heads) configuration rooted at *dataset*."""
    self.model_name = 'bert'
    self.train_path = dataset + '/data/train.txt'  # training set
    self.dev_path = dataset + '/data/dev.txt'      # validation set
    self.test_path = dataset + '/data/test.txt'    # test set
    # Fix: close the file and decode explicitly as utf-8 (original leaked the handle).
    with open(dataset + '/data/class.txt', encoding='utf-8') as f:
        self.class_list = [x.strip() for x in f]   # label names
    self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoint file
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 3
    self.batch_size = 128
    self.pad_size = 32                       # pad/truncate every sentence to this length
    self.learning_rate = 5e-5
    self.bert_path = './bert_pretrain'
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768
    self.filter_sizes = (2, 3, 4)            # CNN kernel sizes
    self.num_filters = 256                   # CNN channels per kernel size
    self.dropout = 0.1
    self.rnn_hidden = 256
    self.num_layers = 2
def __init__(self, dataset):
    """Bert configuration with gradient accumulation for small batches."""
    self.model_name = 'bert'
    self.train_path = dataset + '/data/train.txt'  # training set
    self.dev_path = dataset + '/data/dev.txt'      # validation set
    self.test_path = dataset + '/data/test.txt'    # test set
    # Fix: use a context manager so the class file handle is closed
    # (original called open(...).readlines() and leaked it).
    with open(dataset + '/data/class.txt', encoding='utf-8') as f:
        self.class_list = [x.strip() for x in f]   # label names
    self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoint file
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 5
    self.batch_size = 2
    self.pad_size = 256                      # pad/truncate every sentence to this length
    self.learning_rate = 5e-5                # 0.00005
    self.bert_path = './bert_pretrain'
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768
    self.acc_grad = 3                        # accumulation steps before backward/update
    self.dropout = 0.15
def __init__(self, dataset):
    """Configuration for the BruceBert classifier."""
    self.model_name = 'BruceBert'
    self.train_path = dataset + '/data/train.txt'  # training set
    self.test_path = dataset + '/data/test.txt'    # test set
    self.dev_path = dataset + '/data/dev.txt'      # validation set
    # Fix: close the file and decode explicitly as utf-8 (original leaked the handle).
    with open(dataset + '/data/class.txt', encoding='utf-8') as f:
        self.class_list = [x.strip() for x in f]   # label names
    self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoint file
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.num_classes = len(self.class_list)  # number of labels
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.num_epochs = 3                      # passes over the dataset
    self.batch_size = 128                    # samples per batch
    self.pad_size = 32                       # pad/truncate every sentence to this length
    self.learning_rate = 1e-5                # 0.00001
    self.bert_path = 'bert_pretrain'         # pre-trained Bert location
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)  # Bert tokenizer
    self.hidden_size = 768                   # Bert hidden size
def __init__(self):
    """Configuration for the Bert+BiLSTM+CRF sequence tagger."""
    self.model_name = 'Bert_Bilstm_crf'
    # self.train_data_path = './datas/train/source.txt'  # training sentences
    # self.train_label_path = './datas/train/target.txt' # training tags
    self.train_data_path = './datas/train/train_source.txt'  # training sentences
    self.train_label_path = './datas/train/train_target.txt' # training tags
    self.dev_data_path = './datas/dev/source.txt'            # validation sentences
    self.dev_label_path = './datas/dev/target.txt'           # validation tags
    self.save_path = './Result/Save_path/' + self.model_name + '.ckpt'  # checkpoint file
    self.bert_path = './bert_pretrain'
    # Fix: build the tokenizer once from self.bert_path (the original created it
    # twice with a hard-coded path, duplicating the load and bypassing bert_path).
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.vocab_class = {
        'B-LAW': 0, 'B-ROLE': 1, 'B-TIME': 2, 'I-LOC': 3, 'I-LAW': 4,
        'B-PER': 5, 'I-PER': 6, 'B-ORG': 7, 'I-ROLE': 8, 'I-CRIME': 9,
        'B-CRIME': 10, 'I-ORG': 11, 'B-LOC': 12, 'I-TIME': 13, 'O': 14
    }  # BIO tag inventory
    self.tagset_size = len(self.vocab_class)
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.num_epochs = 6
    self.batch_size = 2
    self.pad_size = 10                 # pad/truncate every sentence to this length
    self.learning_rate = 1e-5
    self.learning_rate_decay = 5e-6    # learning-rate decay
    self.hidden_size = 100             # LSTM hidden size
    self.embedding_dim = 768           # Bert embedding dimension
    self.num_layers = 1
    self.dropout = 0.5
def __init__(self, dataset):
    """Configuration for the BruceBert+DPCNN classifier."""
    self.model_name = 'BruceBertDPCNN'
    self.train_path = dataset + '/data/train.txt'   # training set
    self.test_path = dataset + '/data/test.txt'     # test set
    self.dev_path = dataset + '/data/dev.txt'       # validation set
    self.datasetpkl = dataset + '/data/dataset.pkl' # cached dataset
    # Fix: close the file and decode explicitly as utf-8 (original leaked the handle).
    with open(dataset + '/data/class.txt', encoding='utf-8') as f:
        self.class_list = [x.strip() for x in f]    # label names
    self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoint file
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 3
    self.batch_size = 64   # 128 elsewhere in the code base; reduced to fit GPU memory
    self.pad_size = 32     # pad/truncate every sentence to this length
    self.learning_rate = 1e-5
    self.bert_path = './bert_pretrain'       # pre-trained Bert location
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768                   # Bert hidden size
    # DPCNN head parameters
    self.rnn_hidden = 256                    # RNN hidden size
    self.num_filters = 250                   # number of convolution kernels
    self.dropout = 0.5
def __init__(self):
    """Sentiment-analysis Bert configuration with a fixed three-way label set."""
    self.model_name = 'bert'
    # Labels: neutral / positive / negative
    self.class_list = ['中性', '积极', '消极']
    self.save_path = './Sentiment/saved_dict/bert.ckpt'  # checkpoint file
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early after 1000 stale batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 3
    self.batch_size = 128
    self.pad_size = 32                       # pad/truncate every sentence
    self.learning_rate = 5e-5
    self.bert_path = './bert_pretrain'
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768
def __init__(self, dataset):
    """Bert config for multi-label classification."""
    self.model_name = 'bert'
    self.train_path = 'train.csv'  # training set
    # Fix: close the file and decode explicitly as utf-8 (original leaked the handle).
    with open(dataset + '/class_multi1.txt', encoding='utf-8') as f:
        self.class_list = [x.strip() for x in f]  # label names
    self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoint file
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 3
    self.batch_size = 64                     # originally 128
    self.pad_size = 32                       # pad/truncate every sentence to this length
    # NOTE(review): 5e-4 is unusually high for Bert fine-tuning (others here use
    # 5e-5) — presumably intentional; confirm before changing.
    self.learning_rate = 5e-4
    self.bert_path = './bert_pretrain'
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768
def __init__(self, dataset):
    """Configuration for the BruceBert+CNN classifier."""
    self.model_name = "BruceBertCNN"
    self.train_path = dataset + '/data/train.txt'   # training set
    self.dev_path = dataset + '/data/dev.txt'       # validation set
    self.test_path = dataset + '/data/test.txt'     # test set
    self.datasetpkl = dataset + '/data/dataset.pkl' # cached dataset
    # Fix: close the file and decode explicitly as utf-8 (original leaked the handle).
    with open(dataset + '/data/class.txt', encoding='utf-8') as f:
        self.class_list = [x.strip() for x in f]    # label names
    self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoint file
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 3
    self.batch_size = 128
    self.pad_size = 32                       # pad/truncate every sentence to this length
    self.learning_rate = 1e-5
    self.bert_path = './bert_pretrain'       # pre-trained Bert location
    # (dropped redundant parentheses around the argument)
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768                   # Bert hidden size
    self.filter_sizes = (2, 3, 4)            # CNN kernel sizes
    self.num_filters = 256                   # CNN channels per kernel size
    self.dropout = 0.5
def __init__(self, dataset):
    """Configuration for the SkyerBert+RNN classifier."""
    self.model_name = "SkyerBertRNN"
    self.train_path = dataset + "/data/train.txt"   # training set
    self.dev_path = dataset + "/data/dev.txt"       # validation set
    self.test_path = dataset + "/data/test.txt"     # test set
    self.datasetpkl = dataset + "/data/dataset.pkl" # cached dataset
    # Fix: close the file and decode explicitly as utf-8 (original leaked the handle).
    with open(dataset + "/data/class.txt", encoding='utf-8') as f:
        self.class_list = [x.strip() for x in f]    # label names
    self.save_path = dataset + "/saved_dict/" + self.model_name + ".ckpt"  # checkpoint file
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 3
    self.batch_size = 128
    self.pad_size = 32                       # pad/truncate every sentence to this length
    self.learning_rate = 1e-5
    self.bert_path = './bert_pretrain'       # pre-trained Bert location
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768                   # Bert hidden size
    self.rnn_hidden = 256                    # RNN hidden size
    self.num_layers = 2                      # RNN layers
    self.dropout = 0.5
def __init__(self, args):
    """Bert configuration driven by parsed command-line *args*."""
    dataset = args.data_path + '/' + args.dataset
    self.model_name = 'bert'
    self.train_path = dataset + '/data/train.txt'   # training set
    self.test_path = dataset + '/data/test.txt'     # test set
    self.dev_path = dataset + '/data/dev.txt'       # validation set
    self.datasetpkl = dataset + '/data/dataset.pkl' # pickled dataset for fast load
    # Fix: close the file and decode explicitly as utf-8 (original leaked the handle).
    with open(dataset + '/data/class.txt', encoding='utf-8') as f:
        self.class_list = [x.strip() for x in f]    # label names
    self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoints
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = args.epochs
    self.batch_size = args.batch_size
    self.pad_size = 32                       # pad/truncate every sentence to this length
    self.learning_rate = args.learning_rate
    self.bert_path = args.model_path + '/bert_pretrain'  # pre-trained Bert location
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)  # Bert tokenizer
    self.hidden_size = 768                   # Bert hidden size
    self.fine_tune = True                    # fine-tune Bert weights
    self.weight_decay = args.weight_decay
    self.warmup = args.warmup
def __init__(self, dataset):
    """Binary-classification Bert configuration rooted at *dataset*."""
    self.model_name = 'bert'
    # Data splits
    self.train_path = dataset + '/data/train.txt'
    self.dev_path = dataset + '/data/dev.txt'
    self.test_path = dataset + '/data/test.txt'
    self.class_list = [0, 1]                 # binary labels
    self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoint file
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early after 1000 stale batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 3
    self.batch_size = 128
    self.pad_size = 510                      # pad/truncate every sentence
    self.learning_rate = 5e-5
    self.bert_path = './bert_pretrain'
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768
def __init__(self, dataset):
    """Configuration for the Bert+RCNN classifier."""
    self.model_name = 'bert_rcnn'
    self.train_path = dataset + '/data/train.txt'  # training set
    self.test_path = dataset + '/data/test.txt'    # test set
    self.dev_path = dataset + '/data/dev.txt'      # validation set
    # Fix: decode explicitly as utf-8 instead of the platform default
    # (class names may be non-ASCII).
    with open(dataset + '/data/class.txt', 'r', encoding='utf-8') as f:
        self.class_name = [line.strip() for line in f.readlines()]  # label names
    self.num_classes = len(self.class_name)  # number of labels
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.model_save = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoint file
    self.datasetpkl = dataset + '/data/datasetpkl.pkl'  # cached dataset
    self.bert_path = 'bert_pretrain'         # pre-trained Bert location
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)  # Bert tokenizer
    self.learning_rate = 1e-5
    self.bert_hidden = 768                   # Bert hidden size
    self.batch_size = 128
    self.epochs = 3
    self.pad_size = 32                       # pad/truncate every sentence to this length
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.dropout = 0.5
    self.num_layers = 2                      # RNN layers
    self.rnn_hidden = 128                    # RNN hidden size
def __init__(self, dataset):
    """ERNIE configuration with a fixed 112-way label space."""
    self.model_name = 'ERNIE'
    # Data splits
    self.train_path = dataset + '/data/train.txt'
    self.dev_path = dataset + '/data/dev.txt'
    self.test_path = dataset + '/data/test.txt'
    # self.class_list = [x.strip() for x in open(
    #     dataset + '/data/class.txt').readlines()]  # label names
    self.save_path = dataset + '/saved_dict/' + self.model_name + '0.4gauss.ckpt'  # checkpoint file
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000  # stop early after 1000 stale batches
    self.num_classes = 112           # hard-coded label count (class file read is disabled above)
    self.num_epochs = 20
    self.batch_size = 16
    self.pad_size = 256              # pad/truncate every sentence
    self.learning_rate = 2e-5
    self.bert_path = './ERNIE_pretrain'
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768
def __init__(self, dataset):
    """Bert configuration using CSV data splits."""
    self.model_name = 'bert'
    self.train_path = dataset + '/data/train.csv'  # training set
    self.dev_path = dataset + '/data/valid.csv'    # validation set
    self.test_path = dataset + '/data/test.csv'    # test set
    # Fix: close the file and decode explicitly as utf-8 (original leaked the handle).
    with open(dataset + '/data/class.txt', encoding='utf-8') as f:
        self.class_list = [x.strip() for x in f]   # label names
    self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoint file
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 3
    self.batch_size = 128
    self.pad_size = 32                       # pad/truncate every sentence to this length
    self.learning_rate = 5e-5
    self.bert_path = '/home/wangqian/bert_model/bert-base-chinese/'
    # self.bert_path = "/Users/Vander/Code/pytorch_col/bert-base-chinese/"
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768
def __init__(self, dataset):
    """ERNIE configuration; creates the checkpoint directory if missing."""
    self.model_name = 'ERNIE'
    self.train_path = dataset + '/data/train.txt'  # training set
    self.dev_path = dataset + '/data/dev.txt'      # validation set
    self.test_path = dataset + '/data/test.txt'    # test set
    # Fix: close the file and decode explicitly as utf-8 (original leaked the handle).
    with open(dataset + '/data/class.txt', encoding='utf-8') as f:
        self.class_list = [x.strip() for x in f]   # label names
    self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoint file
    os.makedirs(os.path.dirname(self.save_path), exist_ok=True)  # ensure save dir exists
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 3
    self.batch_size = 128
    self.pad_size = 32                       # pad/truncate every sentence to this length
    self.learning_rate = 5e-5
    self.bert_path = './ERNIE_pretrain'
    # Fix: dropped the leftover debug print(self.tokenizer).
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768
def __init__(self, dataset):
    """Bert+DPCNN configuration for cross-validation fold 1."""
    self.model_name = 'bert-DPCNN'
    # Data splits (cross-validation fold 1)
    self.train_path = dataset + '/data/cv_1/cv1_train.txt'
    self.dev_path = dataset + '/data/cv_1/cv1_dev.txt'
    self.test_path = dataset + '/data/cv_1/cv_valid.txt'
    # self.class_list = [x.strip() for x in open(
    #     dataset + '/data/class.txt').readlines()]  # label names
    self.save_path = dataset + '/saved_dict/' + self.model_name + '512-0.bin'  # checkpoint file
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 100000  # early-stop patience in batches
    self.num_classes = 14              # hard-coded label count (class file read is disabled above)
    self.num_epochs = 3
    self.batch_size = 16
    self.pad_size = 512                # pad/truncate every sentence
    self.learning_rate = 2e-5
    self.bert_path = './bert-small/'   # small Bert variant
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 512
    self.num_filters = 512             # CNN channels
def __init__(self, dataset):
    """RoBERTa-large configuration (named 'Bert'; hidden size 1024)."""
    self.model_name = 'Bert'
    self.train_path = dataset + '/data/train.txt'   # training set
    self.test_path = dataset + '/data/test.txt'     # test set
    self.dev_path = dataset + '/data/dev.txt'       # validation set
    self.datasetpkl = dataset + '/data/dataset.pkl' # cached dataset
    # Fix: close the file and decode explicitly as utf-8 (original leaked the handle).
    with open(dataset + '/data/class.txt', encoding='utf-8') as f:
        self.class_list = [x.strip() for x in f]    # label names
    # Fix: the original 'checkpoints' + model_name was missing the separator and
    # produced 'checkpointsBert.pth' instead of a file inside the directory.
    self.save_path = 'checkpoints/' + self.model_name + '.pth'
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 3
    self.batch_size = 128
    self.learning_rate = 1e-5
    self.pad_size = 32                       # pad/truncate every sentence to this length
    self.bert_path = 'bert_pretrain/roberta'
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 1024                  # RoBERTa-large hidden size
def __init__(self, dataset):
    """Bert configuration with a per-model log directory."""
    self.model_name = 'bert'
    self.train_path = dataset + '/data/train.txt'  # training set
    self.dev_path = dataset + '/data/dev.txt'      # validation set
    self.test_path = dataset + '/data/test.txt'    # test set
    self.log_path = dataset + '/log/' + self.model_name
    # Fix: close the file and decode explicitly as utf-8 (original leaked the handle).
    with open(dataset + '/data/class.txt', encoding='utf-8') as f:
        self.class_list = [x.strip() for x in f]   # label names
    self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoint file
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 3
    self.batch_size = 128
    self.pad_size = 32                       # pad/truncate every sentence to this length
    self.learning_rate = 5e-5
    self.bert_path = './bert_pretrain'
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768
def __init__(self, dataset):
    """RoBERTa-wwm-ext configuration."""
    self.model_name = 'RoBERTa'
    self.train_path = dataset + '/data/train.txt'  # training set
    self.dev_path = dataset + '/data/dev.txt'      # validation set
    self.test_path = dataset + '/data/test.txt'    # test set
    # Fix: close the file and decode explicitly as utf-8 (original leaked the handle).
    with open(dataset + '/data/class.txt', encoding='utf-8') as f:
        self.class_list = [x.strip() for x in f]   # label names
    self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoint file
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 5
    self.batch_size = 12
    self.pad_size = 120                      # pad/truncate every sentence to this length
    self.learning_rate = 5e-5
    self.bert_path = './roberta_wwm_ext_pytorch'
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768
    self.trunc_medium = -1                   # truncation strategy flag
def __init__(self, dataset):
    """Bert classifier configuration rooted at *dataset*."""
    self.model_name = 'Bert'
    self.train_path = dataset + '/data/train.txt'   # training set
    self.test_path = dataset + '/data/test.txt'     # test set
    self.dev_path = dataset + '/data/dev.txt'       # validation set
    self.datasetpkl = dataset + '/data/dataset.pkl' # cached dataset
    # Fix: close the file and decode explicitly as utf-8 (original leaked the handle).
    with open(dataset + '/data/class.txt', encoding='utf-8') as f:
        self.class_list = [x.strip() for x in f]    # label names
    # Fix: the original '/saved_dict' + model_name was missing the separator and
    # produced '.../saved_dictBert.ckpt' instead of a file inside the directory.
    self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.require_improvement = 1000          # stop early if no gain for 1000 batches
    self.num_classes = len(self.class_list)  # number of labels
    self.num_epochs = 3
    self.batch_size = 128                    # samples per batch
    self.pad_size = 32                       # pad/truncate every sentence to this length
    self.learning_rate = 1e-5
    self.bert_path = 'bert_pretrain'         # pre-trained Bert location
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)  # character-level tokenizer for Chinese
    self.hidden_size = 768                   # Bert hidden size