def main():
    vocab = data.Vocabulary()
    data.build_vocab(vocab, config.vector_file)  # build vocabulary
    # classifier = models.Attentionclassifier(vocab_size=vocab.n_words,
    #                                         emb_dim=config.DIM,
    #                                         hidden_size=config.HIDDEN_SIZE,
    #                                         num_layer=config.NUM_LAYER,
    #                                         dropout=config.drop_out,
    #                                         bidirectional=config.bidirectional,
    #                                         label_size=config.label_class,
    #                                         use_pretrain=True,
    #                                         embed_matrix=vocab.vector,
    #                                         embed_freeze=False).to(config.device)
    classifier = models.FinetuneModel1(vocab_size=vocab.n_words,
                                       emb_dim=config.DIM,
                                       hidden_size=config.HIDDEN_SIZE,
                                       num_layer=config.NUM_LAYER,
                                       dropout=config.drop_out,
                                       bidirectional=config.bidirectional,
                                       label_size=config.label_class,
                                       hidden_size1=128,
                                       use_pretrain=True,
                                       embed_matrix=vocab.vector,
                                       embed_freeze=False).to(config.device)
    model_dict = classifier.state_dict()
    pretrained_model = torch.load(config.model_path)
    # keep only the checkpoint weights whose keys exist in the current model
    pretrained_dict = dict()
    for k, v in pretrained_model.items():
        if k == 'state_dict':
            for kk, vv in v.items():
                if kk in model_dict:
                    pretrained_dict[kk] = vv
    # update the existing model_dict with the pretrained weights
    model_dict.update(pretrained_dict)
    # load the merged state dict into the classifier
    classifier.load_state_dict(model_dict)
    classifier.eval()  # disable dropout for inference
    test_data = data.Sentiment(config.predict_file, vocab)
    test_dataloader = DataLoader(test_data,
                                 batch_size=config.TRAIN_BATCH_SIZE,
                                 shuffle=False,  # keep input order so predictions line up with the file
                                 collate_fn=data.collate_fn)
    predict(classifier, test_dataloader, config.silent)
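# `predict` is defined elsewhere in this module; the sketch below shows the
# shape such a routine could take. It assumes each batch is an
# (inputs, lengths, labels) triple from `data.collate_fn` and that the
# classifier returns log-probabilities (consistent with the NLLLoss used in
# training). Both are assumptions, not the repository's actual code, hence
# the `_sketch` suffix.
def predict_sketch(classifier, dataloader, silent=False):
    classifier.eval()
    predictions = []
    with torch.no_grad():
        for inputs, lengths, labels in dataloader:  # assumed batch layout
            log_probs = classifier(inputs.to(config.device))
            preds = log_probs.argmax(dim=-1)  # most likely class per example
            predictions.extend(preds.cpu().tolist())
            if not silent:
                print(preds)
    return predictions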
def train():
    args = parse_args()
    if args.random_seed == 0:
        args.random_seed = None
        print("random seed is None")
    if args.enable_ce:
        random.seed(args.random_seed)
        np.random.seed(args.random_seed)

    logger = logging.getLogger("lm")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)  # attach the handler so records are actually emitted
    logger.info('Running with args : {}'.format(args))
    logger.info('Running paddle : {}'.format(paddle.version.commit))

    hidden_size = args.hidden_size
    batch_size = args.batch_size
    data_path = args.data_path
    logger.info("begin to load vocab")
    vocab = data.Vocabulary(args.vocab_path, validate_file=True)
    vocab_size = vocab.size
    logger.info("finished load vocab")

    logger.info('build the model...')
    # build model
    train_prog = fluid.Program()
    train_startup_prog = fluid.Program()
    if args.enable_ce:
        train_prog.random_seed = args.random_seed
        train_startup_prog.random_seed = args.random_seed

    # build infer model
    infer_prog = fluid.Program()
    infer_startup_prog = fluid.Program()
    with fluid.program_guard(infer_prog, infer_startup_prog):
        with fluid.unique_name.guard():
            # Infer process
            infer_model = lm_model.LanguageModel(
                args, vocab_size, test_mode=True)
            infer_model.build()
    infer_progs = infer_prog, infer_startup_prog, infer_model

    with fluid.program_guard(train_prog, train_startup_prog):
        with fluid.unique_name.guard():
            # Training process
            train_model = lm_model.LanguageModel(
                args, vocab_size, test_mode=False)
            train_model.build()
            fluid.clip.set_gradient_clip(
                clip=fluid.clip.GradientClipByGlobalNorm(
                    clip_norm=args.max_grad_norm))
            # build optimizer
            if args.optim == 'adagrad':
                optimizer = fluid.optimizer.Adagrad(
                    learning_rate=args.learning_rate,
                    epsilon=0.0,
                    initial_accumulator_value=1.0)
            elif args.optim == 'sgd':
                optimizer = fluid.optimizer.SGD(
                    learning_rate=args.learning_rate)
            elif args.optim == 'adam':
                optimizer = fluid.optimizer.Adam(
                    learning_rate=args.learning_rate)
            elif args.optim == 'rprop':
                optimizer = fluid.optimizer.RMSPropOptimizer(
                    learning_rate=args.learning_rate)
            else:
                logger.error('Unsupported optimizer: {}'.format(args.optim))
                exit(-1)
            optimizer.minimize(train_model.loss * args.num_steps)

    # initialize parameters
    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
    exe = Executor(place)
    train_progs = train_prog, train_startup_prog, train_model

    if args.local:
        logger.info("local start_up:")
        train_loop(args, logger, vocab, train_progs, infer_progs, optimizer)
    else:
        if args.update_method == "nccl2":
            trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
            if args.test_nccl:
                worker_endpoints_env = os.getenv("PADDLE_WORK_ENDPOINTS")
                worker_endpoints = worker_endpoints_env.split(',')
                trainers_num = len(worker_endpoints)
                current_endpoint = worker_endpoints[trainer_id]
            else:
                port = os.getenv("PADDLE_PORT")
                worker_ips = os.getenv("PADDLE_TRAINERS")
                worker_endpoints = []
                for ip in worker_ips.split(","):
                    worker_endpoints.append(':'.join([ip, port]))
                worker_endpoints_env = ','.join(worker_endpoints)
                trainers_num = len(worker_endpoints)
                current_endpoint = os.getenv("POD_IP") + ":" + port
            if trainer_id == 0:
                logger.info("train_id == 0, sleep 60s")
                time.sleep(60)
            logger.info("trainers_num:{}".format(trainers_num))
            logger.info("worker_endpoints:{}".format(worker_endpoints))
            logger.info("current_endpoint:{}".format(current_endpoint))

            config = fluid.DistributeTranspilerConfig()
            config.mode = "nccl2"
            t = fluid.DistributeTranspiler(config=config)
            t.transpile(trainer_id,
                        trainers=worker_endpoints_env,
                        current_endpoint=current_endpoint,
                        program=train_prog,
                        startup_program=train_startup_prog)
            train_progs = train_prog, train_startup_prog, train_model
            train_loop(args, logger, vocab, train_progs, infer_progs,
                       optimizer, trainers_num, trainer_id, worker_endpoints)
        else:
            port = os.getenv("PADDLE_PORT", "6174")
            pserver_ips = os.getenv("PADDLE_PSERVERS")
            eplist = []
            for ip in pserver_ips.split(","):
                eplist.append(':'.join([ip, port]))
            pserver_endpoints = ",".join(eplist)
            trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
            current_endpoint = os.getenv("POD_IP") + ":" + port
            trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
            training_role = os.getenv("TRAINING_ROLE", "TRAINER")  # role of this process in the cluster
            logger.info("pserver_endpoints:{}".format(pserver_endpoints))
            logger.info("current_endpoint:{}".format(current_endpoint))
            logger.info("trainer_id:{}".format(trainer_id))
            logger.info("pserver_ips:{}".format(pserver_ips))
            logger.info("port:{}".format(port))

            t = fluid.DistributeTranspiler()
            t.transpile(trainer_id,
                        pservers=pserver_endpoints,
                        trainers=trainers,
                        program=train_prog,
                        startup_program=train_startup_prog)
            if training_role == "PSERVER":
                logger.info("distributed: pserver started")
                current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
                    "PADDLE_PORT")
                if not current_endpoint:
                    logger.critical("need env SERVER_ENDPOINT")
                    exit(1)
                pserver_prog = t.get_pserver_program(current_endpoint)
                pserver_startup = t.get_startup_program(current_endpoint,
                                                        pserver_prog)
                exe.run(pserver_startup)
                exe.run(pserver_prog)
            elif training_role == "TRAINER":
                logger.info("distributed: trainer started")
                trainer_prog = t.get_trainer_program()
                train_loop(args, logger, vocab, train_progs, infer_progs,
                           optimizer)
            else:
                logger.critical(
                    "environment var TRAINING_ROLE should be TRAINER or PSERVER")
                exit(1)
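# The distributed branches above are driven entirely by environment variables.
# A sketch of launching a two-machine nccl2 job follows; the IP addresses,
# port, script name, and flag spelling are example values/assumptions, not a
# verified command line:
#
#   export PADDLE_TRAINER_ID=0                        # 0 on the first machine, 1 on the second
#   export PADDLE_PORT=6174
#   export PADDLE_TRAINERS=192.168.0.1,192.168.0.2    # comma-separated worker IPs
#   export POD_IP=192.168.0.1                         # this machine's own IP
#   python train.py --update_method nccl2 ...
#
# The pserver path instead reads TRAINING_ROLE (TRAINER or PSERVER),
# PADDLE_PSERVERS, and PADDLE_TRAINERS_NUM.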
def finetune():
    vocab = data.Vocabulary()
    data.build_vocab(vocab, config.vector_file)  # build vocabulary
    train_data = data.Sentiment(config.finetune_train_file, vocab)
    train_dataloader = DataLoader(train_data,
                                  batch_size=config.TRAIN_BATCH_SIZE,
                                  shuffle=True,
                                  collate_fn=data.collate_fn)
    valid_data = data.Sentiment(config.finetune_valid_file, vocab)
    valid_dataloader = DataLoader(valid_data,
                                  batch_size=config.TRAIN_BATCH_SIZE,
                                  shuffle=True,
                                  collate_fn=data.collate_fn)
    test_data = data.Sentiment(config.finetune_test_file, vocab)
    test_dataloader = DataLoader(test_data,
                                 batch_size=config.TRAIN_BATCH_SIZE,
                                 shuffle=True,
                                 collate_fn=data.collate_fn)
    classifier = models.FinetuneModel1(vocab_size=vocab.n_words,
                                       emb_dim=config.DIM,
                                       hidden_size=config.HIDDEN_SIZE,
                                       num_layer=config.NUM_LAYER,
                                       dropout=config.drop_out,
                                       bidirectional=config.bidirectional,
                                       label_size=config.label_class,
                                       hidden_size1=128,
                                       use_pretrain=True,
                                       embed_matrix=vocab.vector,
                                       embed_freeze=False).to(config.device)
    model_dict = classifier.state_dict()
    pretrained_model = torch.load(config.model_path)
    # drop checkpoint keys that do not exist in the current model
    pretrained_dict = dict()
    for k, v in pretrained_model.items():
        if k == 'state_dict':
            for kk, vv in v.items():
                if kk in model_dict:
                    pretrained_dict[kk] = vv
    # update the existing model_dict with the pretrained weights
    model_dict.update(pretrained_dict)
    # load the merged state dict into the classifier
    classifier.load_state_dict(model_dict)
    # freeze all parameters so they are not updated
    for param in classifier.parameters():
        param.requires_grad = False
    # make only the final layer's parameters trainable
    for param in classifier.final.parameters():
        param.requires_grad = True
    # new_model = models.FinetuneModel(classifier, hidden_size1=128, class_size=2)
    # print(new_model)
    criterion = nn.NLLLoss()
    # optimizer = torch.optim.Adam(classifier.parameters())
    # optimizer = torch.optim.RMSprop(classifier.parameters(), lr=0.001, alpha=0.9, momentum=0.2)
    optimizer = torch.optim.Adadelta(
        filter(lambda p: p.requires_grad, classifier.parameters()),
        lr=0.01, rho=0.9, eps=1e-06, weight_decay=0)
    # optimizer = torch.optim.RMSprop(classifier.parameters())
    best_f1 = 0
    for epoch in range(config.finetune_epochs):
        # lr update
        # adjust_learning_rate(optimizer, epoch)
        # check whether this optimizer adapts its learning rate on its own
        for param_group in optimizer.param_groups:
            print("here lr :{}".format(param_group['lr']))
        logging.info("epoch {0:04d}".format(epoch))
        main.train(train_dataloader, classifier, criterion, optimizer, epoch,
                   config.finetune_batch_size, config.silent)
        test_f1, val_loss = main.test(valid_dataloader, classifier, criterion,
                                      epoch, config.finetune_batch_size,
                                      config.silent)
        is_best = test_f1 > best_f1  # True or False
        best_f1 = max(test_f1, best_f1)
        logging.info("best f1 is {}".format(best_f1))
        main.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': classifier.state_dict(),
                'acc': test_f1,
                'best_acc': best_f1,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            checkpoint='../output/',
            save_file='finetune_model_best.pth.tar')
    predict.predict(classifier, test_dataloader, config.silent)
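# `main.save_checkpoint` is imported from the training module and not shown in
# this file. A minimal sketch of the usual pattern is below, assuming it writes
# the latest state into `checkpoint` and copies it to `save_file` when
# `is_best` is True; the `checkpoint.pth.tar` filename is an assumption, hence
# the `_sketch` suffix.
import os
import shutil

def save_checkpoint_sketch(state, is_best, checkpoint='../output/',
                           save_file='model_best.pth.tar'):
    os.makedirs(checkpoint, exist_ok=True)
    latest_path = os.path.join(checkpoint, 'checkpoint.pth.tar')
    torch.save(state, latest_path)  # always persist the latest epoch
    if is_best:
        # keep a separate copy of the best model seen so far
        shutil.copyfile(latest_path, os.path.join(checkpoint, save_file))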
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader

import config
import data

if __name__ == "__main__":
    print("starting...")
    # prepare data
    csv_dataset = pd.read_csv(config.file_name, header=None)  # csv_file format: dataframe
    print("data loaded")
    vocab = data.Vocabulary()
    data.build_vocab(vocab)  # build vocabulary
    print("build vocab success")
    train_data = data.sentimentDataset(vocab, csv_dataset,
                                       train_size=config.TRAIN_RATIO,
                                       test_size=config.TEST_RATIO,
                                       train=True)
    test_data = data.sentimentDataset(vocab, csv_dataset, train=False)
    train_dataloader = DataLoader(train_data,
                                  batch_size=config.TRAIN_BATCH_SIZE,
                                  shuffle=True,
                                  collate_fn=data.collate_fn)
    test_dataloader = DataLoader(test_data,
                                 # the original call is cut off here; these
                                 # arguments mirror train_dataloader and are
                                 # an assumption
                                 batch_size=config.TRAIN_BATCH_SIZE,
                                 shuffle=False,
                                 collate_fn=data.collate_fn)
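# `data.collate_fn` is passed to every DataLoader above but defined in the
# `data` module. For variable-length text a typical implementation pads each
# batch to its longest sequence; the sketch below assumes each dataset item is
# a (token_id_list, label) pair and that index 0 is the padding id — both
# assumptions, hence the `_sketch` suffix.
def collate_fn_sketch(batch):
    batch.sort(key=lambda item: len(item[0]), reverse=True)  # longest sequence first
    seqs, labels = zip(*batch)
    lengths = [len(s) for s in seqs]
    padded = torch.zeros(len(seqs), max(lengths), dtype=torch.long)  # 0 = assumed pad id
    for i, s in enumerate(seqs):
        padded[i, :lengths[i]] = torch.tensor(s, dtype=torch.long)
    return padded, torch.tensor(lengths), torch.tensor(labels)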
def main():
    best_f1 = 0
    print(config.device)
    vocab = data.Vocabulary()
    data.build_vocab(vocab, config.vector_file)  # build vocabulary
    train_data = data.Sentiment(config.train_file, vocab)
    train_dataloader = DataLoader(train_data,
                                  batch_size=config.TRAIN_BATCH_SIZE,
                                  shuffle=True,
                                  collate_fn=data.collate_fn)
    test_data = data.Sentiment(config.test_file, vocab)
    test_dataloader = DataLoader(test_data,
                                 batch_size=config.TRAIN_BATCH_SIZE,
                                 shuffle=True,
                                 collate_fn=data.collate_fn)
    # classifier = models.RNNClassifier(nembedding=config.DIM,
    #                                   hidden_size=config.HIDDEN_SIZE,
    #                                   num_layer=config.NUM_LAYER,
    #                                   dropout=config.drop_out,
    #                                   vocab_size=vocab.n_words,
    #                                   use_pretrain=True,
    #                                   embed_matrix=vocab.vector,
    #                                   embed_freeze=False,
    #                                   label_size=config.label_class).to(config.device)
    classifier = models.Attentionclassifier(vocab_size=vocab.n_words,
                                            emb_dim=config.DIM,
                                            hidden_size=config.HIDDEN_SIZE,
                                            num_layer=config.NUM_LAYER,
                                            dropout=config.drop_out,
                                            bidirectional=config.bidirectional,
                                            label_size=config.label_class,
                                            use_pretrain=True,
                                            embed_matrix=vocab.vector,
                                            embed_freeze=False).to(config.device)
    criterion = nn.NLLLoss()
    # optimizer = torch.optim.Adam(classifier.parameters())
    optimizer = torch.optim.RMSprop(classifier.parameters(),
                                    lr=config.LR, alpha=0.9, momentum=0.2)
    # optimizer = torch.optim.RMSprop(classifier.parameters())
    # optimizer, scheduler = adam_optimizers(classifier.parameters())
    # optimizer = torch.optim.Adadelta(classifier.parameters(), lr=config.LR, rho=0.9, eps=1e-06, weight_decay=0)
    for epoch in range(config.epochs):
        # lr update
        adjust_learning_rate(optimizer, epoch)
        # check whether this optimizer adapts its learning rate on its own
        # for param_group in optimizer.param_groups:
        #     print("here lr :{}".format(param_group['lr']))
        logging.info("epoch {0:04d}".format(epoch))
        train(train_dataloader, classifier, criterion, optimizer, epoch,
              config.TRAIN_BATCH_SIZE, config.silent)
        test_f1, val_loss = test(test_dataloader, classifier, criterion, epoch,
                                 config.TRAIN_BATCH_SIZE, config.silent)
        # scheduler.step(val_loss)
        is_best = test_f1 > best_f1  # True or False
        best_f1 = max(test_f1, best_f1)
        logging.info("best f1 is {}".format(best_f1))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': classifier.state_dict(),
                'acc': test_f1,
                'best_acc': best_f1,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            checkpoint='../output/')
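# `adjust_learning_rate` is called once per epoch above but defined elsewhere.
# A common shape for it is a step decay of the base rate config.LR; the decay
# factor and interval below are illustrative assumptions, hence the `_sketch`
# suffix.
def adjust_learning_rate_sketch(optimizer, epoch, decay=0.5, every=10):
    lr = config.LR * (decay ** (epoch // every))  # e.g. halve the LR every 10 epochs
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr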