def train(args):
    """Train a code-search model (token/AST/CFG inputs) for args.model on args.dataset.

    Side effects: appends a file handler to the module-level `logger`, writes
    checkpoints under ./output/<model>/<dataset>/models/, and prints config and
    parameter counts to stdout.
    """
    # Log to a per-model/per-dataset file in addition to existing handlers.
    fh = logging.FileHandler(f"./output/{args.model}/{args.dataset}/logs.txt")
    logger.addHandler(fh)
    timestamp = datetime.now().strftime('%Y%m%d%H%M')  # NOTE(review): computed but unused in this variant

    # Seed all RNGs for reproducibility (CPU, NumPy, CUDA).
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")

    def save_model(model, epoch):
        # Save only the state dict; the models/ directory is assumed to exist.
        torch.save(model.state_dict(), f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5')

    def load_model(model, epoch, to_device):
        # Fail fast with a readable message if the requested checkpoint is missing.
        assert os.path.exists(f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5'), f'Weights at epoch {epoch} not found'
        model.load_state_dict(torch.load(f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5', map_location=to_device))

    # Config object is looked up by naming convention: configs.config_<model>().
    config = getattr(configs, 'config_'+args.model)()
    print(config)

    # Load data.
    # NOTE(review): eval() on a config value executes arbitrary code; safe only
    # because config is project-controlled, not user input.
    data_path = args.data_path+args.dataset+'/'
    train_set = eval(config['dataset_name'])(config, data_path,
                                             config['train_token'], config['tok_len'],
                                             config['train_ast'], config['vocab_ast'],
                                             config['train_cfg'], config['n_node'],
                                             config['train_desc'], config['desc_len'])
    # batcher(device) is the collate_fn — presumably it moves tensors to `device`
    # (the loop below does not call .to(device) itself) — TODO confirm.
    data_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=config['batch_size'],
                                              collate_fn=batcher(device), shuffle=True,
                                              drop_last=False, num_workers=0)

    # Define the models.
    logger.info('Constructing Model..')
    model = getattr(models, args.model)(config)  # initialize the model
    if args.reload_from>0:
        load_model(model, args.reload_from, device)
    logger.info('done')
    model.to(device)

    # No weight decay for biases and LayerNorm weights (standard transformer recipe).
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=config['learning_rate'],
                                  eps=config['adam_epsilon'])
    # Cosine decay with warmup; total steps derived from loader length.
    scheduler = get_cosine_schedule_with_warmup(
        optimizer, num_warmup_steps=config['warmup_steps'],
        num_training_steps=len(data_loader)*config['nb_epoch'])  # do not forget to modify the number when dataset is changed

    # Report model size in millions of parameters.
    print('---model parameters---')
    num_params = 0
    for param in model.parameters():
        num_params += param.numel()
    print(num_params / 1e6)

    n_iters = len(data_loader)
    itr_global = args.reload_from+1  # NOTE(review): reload_from is treated as an epoch above but as an iteration here — confirm intent
    for epoch in range(int(args.reload_from)+1, config['nb_epoch']+1):
        itr_start_time = time.time()
        losses=[]
        for batch in data_loader:
            model.train()
            batch_gpu = [tensor for tensor in batch]  # shallow copy; no device transfer here (see collate_fn note)
            loss = model(*batch_gpu)
            loss.backward()
            # Clip gradients before the optimizer step to stabilize training.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            losses.append(loss.item())
            if itr_global % args.log_every == 0:
                elapsed = time.time() - itr_start_time
                logger.info('epo:[%d/%d] itr:[%d/%d] step_time:%ds Loss=%.5f'%
                            (epoch, config['nb_epoch'], itr_global%n_iters, n_iters, elapsed, np.mean(losses)))
                losses=[]
                itr_start_time = time.time()
            itr_global = itr_global + 1
        # Save late in training only: every 5th epoch from epoch 90 onward.
        if epoch >= 90:
            if epoch % 5 == 0:
                save_model(model, epoch)
def train(args):
    """Train an API-sequence generation model (description -> API sequence).

    Builds train/valid APIDataset loaders, trains with AdamW + cosine warmup
    schedule, periodically validates (model.valid) and evaluates BLEU via
    `evaluate`, and checkpoints under ./output/<model>/<expname>/<timestamp>/.

    Fixes vs. previous revision:
      * load_model used an undefined name `checkpoint` (NameError on reload);
        it now loads from `ckpt_path` with map_location so GPU checkpoints can
        be restored on CPU.
      * load_model was called with two arguments although it requires three;
        the run timestamp is now passed explicitly.
      * args.json and the per-iteration eval file are now closed properly.
    """
    timestamp = datetime.now().strftime('%Y%m%d%H%M')

    # LOG #
    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.DEBUG, format="%(message)s")  # ,format="%(asctime)s: %(name)s: %(levelname)s: %(message)s"
    tb_writer = None
    if args.visual:
        # make output directory if it doesn't already exist
        os.makedirs(f'./output/{args.model}/{args.expname}/{timestamp}/models', exist_ok=True)
        os.makedirs(f'./output/{args.model}/{args.expname}/{timestamp}/temp_results', exist_ok=True)
        fh = logging.FileHandler(f"./output/{args.model}/{args.expname}/{timestamp}/logs.txt")
        logger.addHandler(fh)  # add the handlers to the logger
        tb_writer = SummaryWriter(f"./output/{args.model}/{args.expname}/{timestamp}/logs/")
        # save arguments (close the file instead of leaking the handle)
        with open(f'./output/{args.model}/{args.expname}/{timestamp}/args.json', 'w') as f_args:
            json.dump(vars(args), f_args)

    # Device #
    if args.gpu_id < 0:
        device = torch.device("cuda")
    else:
        device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() and args.gpu_id > -1 else "cpu")
    print(device)
    n_gpu = torch.cuda.device_count() if args.gpu_id < 0 else 1
    print(f"num of gpus:{n_gpu}")

    # Set the random seed manually for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    def save_model(model, epoch, timestamp):
        """Save model parameters to checkpoint."""
        os.makedirs(f'./output/{args.model}/{args.expname}/{timestamp}/models', exist_ok=True)
        ckpt_path = f'./output/{args.model}/{args.expname}/{timestamp}/models/model_epo{epoch}.pkl'
        print(f'Saving model parameters to {ckpt_path}')
        torch.save(model.state_dict(), ckpt_path)

    def load_model(model, epoch, timestamp):
        """Load parameters from checkpoint."""
        ckpt_path = f'./output/{args.model}/{args.expname}/{timestamp}/models/model_epo{epoch}.pkl'
        print(f'Loading model parameters from {ckpt_path}')
        # BUGFIX: was torch.load(checkpoint) — `checkpoint` was never defined.
        # map_location lets a GPU-saved checkpoint load on CPU-only machines.
        model.load_state_dict(torch.load(ckpt_path, map_location=device))

    config = getattr(configs, 'config_' + args.model)()

    ###############################################################################
    # Load dataset
    ###############################################################################
    train_set = APIDataset(args.data_path + 'train.desc.h5', args.data_path + 'train.apiseq.h5', config['max_sent_len'])
    valid_set = APIDataset(args.data_path + 'test.desc.h5', args.data_path + 'test.apiseq.h5', config['max_sent_len'])
    train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=config['batch_size'],
                                               shuffle=True, num_workers=1)
    valid_loader = torch.utils.data.DataLoader(dataset=valid_set, batch_size=config['batch_size'],
                                               shuffle=True, num_workers=1)
    print("Loaded dataset!")

    ###############################################################################
    # Define the models
    ###############################################################################
    model = getattr(models, args.model)(config)
    if args.reload_from >= 0:
        # BUGFIX: load_model requires the run timestamp; it was previously
        # called with only two arguments (TypeError on any reload).
        load_model(model, args.reload_from, timestamp)
    model = model.to(device)

    ###############################################################################
    # Prepare the Optimizer
    ###############################################################################
    # No weight decay for biases and LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=config['lr'], eps=config['adam_epsilon'])
    scheduler = get_cosine_schedule_with_warmup(
        optimizer, num_warmup_steps=config['warmup_steps'],
        num_training_steps=len(train_loader) * config['epochs'])  # do not forget to modify the number when dataset is changed

    ###############################################################################
    # Training
    ###############################################################################
    logger.info("Training...")
    itr_global = 1
    start_epoch = 1 if args.reload_from == -1 else args.reload_from + 1
    for epoch in range(start_epoch, config['epochs'] + 1):
        itr_start_time = time.time()
        for batch in train_loader:  # loop through all batches in training dataset
            model.train()
            batch_gpu = [tensor.to(device) for tensor in batch]
            loss = model(*batch_gpu)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config['clip'])
            optimizer.step()
            scheduler.step()
            model.zero_grad()

            if itr_global % args.log_every == 0:
                elapsed = time.time() - itr_start_time
                log = '%s-%s|@gpu%d epo:[%d/%d] iter:%d step_time:%ds loss:%f'\
                    % (args.model, args.expname, args.gpu_id, epoch, config['epochs'], itr_global, elapsed, loss)
                if args.visual:
                    tb_writer.add_scalar('loss', loss, itr_global)
                logger.info(log)
                itr_start_time = time.time()

            if itr_global % args.valid_every == 0:
                # Validation: accumulate each named loss returned by model.valid
                # over the whole validation set, then report the means.
                model.eval()
                loss_records = {}
                for batch in valid_loader:
                    batch_gpu = [tensor.to(device) for tensor in batch]
                    with torch.no_grad():
                        valid_loss = model.valid(*batch_gpu)
                    for loss_name, loss_value in valid_loss.items():
                        v = loss_records.get(loss_name, [])
                        v.append(loss_value)
                        loss_records[loss_name] = v
                log = 'Validation '
                for loss_name, loss_values in loss_records.items():
                    log = log + loss_name + ':%.4f ' % (np.mean(loss_values))
                    if args.visual:
                        tb_writer.add_scalar(loss_name, np.mean(loss_values), itr_global)
                logger.info(log)

            itr_global += 1

            if itr_global % args.eval_every == 0:
                # Evaluate the model on the development set (BLEU via sampling)
                # and save a checkpoint tagged with the current iteration.
                model.eval()
                save_model(model, itr_global, timestamp)
                # NOTE: rebinds valid_loader to batch_size=1 for decoding; later
                # validation passes will also run with batch size 1 (pre-existing behavior).
                valid_loader = torch.utils.data.DataLoader(dataset=valid_set, batch_size=1,
                                                           shuffle=False, num_workers=1)
                vocab_api = load_dict(args.data_path + 'vocab.apiseq.json')
                vocab_desc = load_dict(args.data_path + 'vocab.desc.json')
                metrics = Metrics()
                os.makedirs(f'./output/{args.model}/{args.expname}/{timestamp}/temp_results', exist_ok=True)
                f_eval = open(f"./output/{args.model}/{args.expname}/{timestamp}/temp_results/iter{itr_global}.txt", "w")
                try:
                    repeat = 1
                    decode_mode = 'sample'
                    recall_bleu, prec_bleu = evaluate(model, metrics, valid_loader, vocab_desc,
                                                      vocab_api, repeat, decode_mode, f_eval)
                finally:
                    f_eval.close()  # BUGFIX: result file was never closed
                if args.visual:
                    tb_writer.add_scalar('recall_bleu', recall_bleu, itr_global)
                    tb_writer.add_scalar('prec_bleu', prec_bleu, itr_global)

        # end of epoch ----------------------------
        model.adjust_lr()
def train(args):
    """Train an AST-based code-search model, with optional apex fp16 training.

    Side effects: attaches a file handler to the module-level `logger`, prints
    config and parameter count, and saves checkpoints late in training under
    ./output/<model>/<dataset>/models/.
    """
    fh = logging.FileHandler(f"./output/{args.model}/{args.dataset}/logs.txt")
    logger.addHandler(fh)  # add the handlers to the logger
    timestamp = datetime.now().strftime('%Y%m%d%H%M')  # NOTE(review): computed but unused in this variant

    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    device = torch.device(
        f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")

    def save_model(model, epoch):
        # State-dict-only checkpoint, keyed by epoch number.
        torch.save(
            model.state_dict(),
            f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5')

    def load_model(model, epoch, to_device):
        assert os.path.exists(
            f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5'
        ), f'Weights at epoch {epoch} not found'
        model.load_state_dict(
            torch.load(
                f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5',
                map_location=to_device))

    config = getattr(configs, 'config_' + args.model)()
    print(config)

    ###############################################################################
    # Load data
    ###############################################################################
    # NOTE(review): eval() of a config-provided class name — project-controlled,
    # but would be unsafe with untrusted config.
    data_path = args.data_path + args.dataset + '/'
    train_set = eval(config['dataset_name'])(data_path, config['train_ast'],
                                             config['vocab_ast'],
                                             config['train_desc'],
                                             config['desc_len'])
    '''
    valid_set = eval(config['dataset_name'])(data_path,
                                  config['valid_tokens'], config['tokens_len'],
                                  config['valid_desc'], config['desc_len'])
    '''
    # batcher(device) collates batches; presumably it also moves tensors to
    # `device` since the loop does not — TODO confirm.
    data_loader = torch.utils.data.DataLoader(dataset=train_set,
                                              batch_size=config['batch_size'],
                                              collate_fn=batcher(device),
                                              shuffle=True,
                                              drop_last=False,
                                              num_workers=0)

    ###############################################################################
    # Define the models
    ###############################################################################
    logger.info('Constructing Model..')
    model = getattr(models, args.model)(config)  #initialize the model
    if args.reload_from > 0:
        load_model(model, args.reload_from, device)
    logger.info('done')
    model.to(device)

    # No weight decay for biases / LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    # origin: AdamW
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters,
                                  lr=config['learning_rate'],
                                  eps=config['adam_epsilon'])
    # no scheduler in paper's original code
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config['warmup_steps'],
        num_training_steps=len(data_loader) * config['nb_epoch']
    )  # do not forget to modify the number when dataset is changed

    # Optional mixed-precision training via NVIDIA apex.
    if config['fp16']:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=config['fp16_opt_level'])

    # Report model size in millions of parameters.
    print('---model parameters---')
    num_params = 0
    for param in model.parameters():
        num_params += param.numel()
    print(num_params / 1e6)

    n_iters = len(data_loader)
    itr_global = args.reload_from + 1
    for epoch in range(int(args.reload_from) + 1, config['nb_epoch'] + 1):
        itr_start_time = time.time()
        losses = []
        for batch in data_loader:
            model.train()
            batch_gpu = [tensor for tensor in batch]  # shallow copy; device move assumed done by collate_fn
            loss = model(*batch_gpu)
            #tree_batch, tree_node_num_batch, good_desc_batch, good_desc_len_batch, bad_desc_batch, bad_desc_len_batch = [tensor for tensor in batch]
            #loss = model(tree_batch, tree_node_num_batch.to(device),
            #good_desc_batch.to(device), good_desc_len_batch.to(device), bad_desc_batch.to(device), bad_desc_len_batch.to(device))
            if config['fp16']:
                # fp16 path: scale the loss and clip the master (fp32) params.
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               5.0)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            losses.append(loss.item())
            if itr_global % args.log_every == 0:
                elapsed = time.time() - itr_start_time
                logger.info('epo:[%d/%d] itr:[%d/%d] step_time:%ds Loss=%.5f' %
                            (epoch, config['nb_epoch'], itr_global % n_iters,
                             n_iters, elapsed, np.mean(losses)))
                losses = []
                itr_start_time = time.time()
            itr_global = itr_global + 1
        '''
        # validate every epoch
        logger.info("validating..")
        # mark: pool_size need to change
        acc1, mrr, map1, ndcg = validate(valid_set, model, 3500, 1, config['sim_measure'])
        logger.info(f'ACC={acc1}, MRR={mrr}, MAP={map1}, nDCG={ndcg}')
        if tb_writer is not None:
            tb_writer.add_scalar('acc', acc1, itr_global)
            tb_writer.add_scalar('mrr', mrr, itr_global)
            tb_writer.add_scalar('map', map1, itr_global)
            tb_writer.add_scalar('ndcg', ndcg, itr_global)
        '''
        # Save late in training only: every 5th epoch from epoch 90 onward.
        if epoch >= 90:
            if epoch % 5 == 0:
                save_model(model, epoch)
def train(args, ast2id, code2id, nl2id, id2nl):
    """Train a tree-transformer style model over TreeDataSet.

    Args:
        args: parsed CLI arguments (paths, batch size, logging cadence, ...).
        ast2id / nl2id: vocab mappings used to build the dataset and model.
        code2id / id2nl: accepted but unused in this variant — kept for a
            uniform call signature (presumably; confirm against callers).

    Side effects: file logging, optional TensorBoard writer, NSML hooks,
    periodic validation and whole-model checkpointing.
    """
    nl_vocab_size = len(nl2id)  # NOTE(review): computed but unused here
    use_relative = True  # feeds TreeDataSet(use_code=...); semantics defined by TreeDataSet
    fh = logging.FileHandler(f"./output/{args.model}/{args.dataset}/logs.txt")
    logger.addHandler(fh)  # add the handlers to the logger
    timestamp = datetime.now().strftime('%Y%m%d%H%M')
    tb_writer = SummaryWriter(
        f"./output/{args.model}/{args.dataset}/logs/{timestamp}"
    ) if args.visual else None

    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    device = torch.device(
        f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
    config = getattr(configs, 'config_' + args.model)()
    if args.automl:
        # Let CLI/automl arguments override config entries.
        config.update(vars(args))
    print(config)

    ###############################################################################
    # Load data
    ###############################################################################
    data_path = DATASET_PATH + "/train/" if IS_ON_NSML else args.data_path + args.dataset + '/'
    '''
    train_set = eval(config['dataset_name'])(data_path,
                                  config['train_name'], config['name_len'],
                                  config['train_api'], config['api_len'],
                                  config['train_tokens'], config['tokens_len'],
                                  config['train_desc'], config['desc_len'])
    valid_set = eval(config['dataset_name'])(data_path,
                                  config['valid_name'], config['name_len'],
                                  config['valid_api'], config['api_len'],
                                  config['valid_tokens'], config['tokens_len'],
                                  config['valid_desc'], config['desc_len'])
    data_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=config['batch_size'],
                                       shuffle=True, drop_last=True, num_workers=1)
    '''
    train_data_set = TreeDataSet(file_name=args.data_dir + '/train.json',
                                 ast_path=args.data_dir + '/tree/train/',
                                 ast2id=ast2id,
                                 nl2id=nl2id,
                                 max_ast_size=args.code_max_len,
                                 max_simple_name_size=args.max_simple_name_len,
                                 k=args.k,
                                 max_comment_size=args.comment_max_len,
                                 use_code=use_relative,
                                 desc=config['train_desc'],
                                 desclen=config['desc_len'])
    data_loader = DataLoaderX(dataset=train_data_set,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=2)

    ###############################################################################
    # Define Model
    ###############################################################################
    logger.info('Constructing Model..')
    model = getattr(models, args.model)(config, ast2id)  #initialize the model

    def save_model(model, ckpt_path):
        # NOTE(review): saves the whole pickled model (not just the state dict);
        # the reload branch below relies on this via torch.load(ckpt).
        # torch.save(model.state_dict(), ckpt_path)
        torch.save(model, ckpt_path)

    def load_model(model, ckpt_path, to_device):
        assert os.path.exists(ckpt_path), f'Weights not found'
        model.load_state_dict(torch.load(ckpt_path, map_location=to_device))

    if args.reload_from > 0:
        ckpt = f'./output/{args.model}/{args.dataset}/models/step{args.reload_from}.h5'
        # load_model(model, ckpt, device)
        # Whole-model load replaces the freshly constructed model entirely.
        model = torch.load(ckpt)
    else:
        model.to(device)
    if IS_ON_NSML:
        bind_nsml(model)
    # model.to(device)

    ###############################################################################
    # Prepare the Optimizer
    ###############################################################################
    # No weight decay for biases / LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters,
                                  lr=config['learning_rate'],
                                  eps=config['adam_epsilon'])
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config['warmup_steps'],
        num_training_steps=len(data_loader) * config['nb_epoch']
    )  # do not forget to modify the number when dataset is changed

    # Optional mixed-precision training via NVIDIA apex.
    if config['fp16']:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=config['fp16_opt_level'])

    ###############################################################################
    # Training Process
    ###############################################################################
    n_iters = len(data_loader)
    itr_global = args.reload_from + 1  # reload_from counts optimizer steps in this variant
    code_reprs, desc_reprs = [], []
    for epoch in range(
            int(args.reload_from / n_iters) + 1, config['nb_epoch'] + 1):
        itr_start_time = time.time()
        losses = []
        n_processed = 0  # NOTE(review): never incremented (increment is commented out below)
        for batch in data_loader:
            model.train()
            # All inputs are cast to long ints on the target device.
            batch_gpu = [tensor.to(device).long() for tensor in batch]
            loss = model(*batch_gpu)
            # print(loss)
            # code_repr=normalize(code_repr.data.cpu().numpy().astype(np.float32))
            # desc_repr = normalize(desc_repr.data.cpu().numpy().astype(np.float32))
            # code_reprs.append(code_repr)
            # desc_reprs.append(desc_repr)
            #n_processed += batch[0].size(0)
            # Gradients are cleared BEFORE backward here (instead of after step);
            # net effect per iteration is the same as the usual ordering.
            model.zero_grad()
            if config['fp16']:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               5.0)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            optimizer.step()
            scheduler.step()
            # model.zero_grad()
            losses.append(loss.item())
            if itr_global % args.log_every == 0:
                elapsed = time.time() - itr_start_time
                logger.info('epo:[%d/%d] itr:[%d/%d] step_time:%ds Loss=%.5f' %
                            (epoch, config['nb_epoch'], itr_global % n_iters,
                             n_iters, elapsed, np.mean(losses)))
                if tb_writer is not None:
                    tb_writer.add_scalar('loss', np.mean(losses), itr_global)
                if IS_ON_NSML:
                    summary = {
                        "summary": True,
                        "scope": locals(),
                        "step": itr_global
                    }
                    summary.update({'loss': np.mean(losses)})
                    nsml.report(**summary)
                losses = []
                itr_start_time = time.time()
            itr_global = itr_global + 1
            if itr_global % args.valid_every == 0:
                logger.info("validating..")
                # with torch.no_grad():
                valid_result = validate(model, config['pool_size'],
                                        config['top_k'],
                                        config['sim_measure'])
                logger.info(valid_result)
                if tb_writer is not None:
                    for key, value in valid_result.items():
                        tb_writer.add_scalar(key, value, itr_global)
                if IS_ON_NSML:
                    summary = {
                        "summary": True,
                        "scope": locals(),
                        "step": itr_global
                    }
                    summary.update(valid_result)
                    nsml.report(**summary)
                code_reprs, desc_reprs = [], []
            if itr_global % args.save_every == 0:
                # NOTE(review): the '3step' prefix looks like a manual run tag —
                # it does not match the 'step{...}' pattern used for reloading above; confirm.
                ckpt_path = f'./output/{args.model}/{args.dataset}/models/3step{itr_global}.h5'
                save_model(model, ckpt_path)
                if IS_ON_NSML:
                    nsml.save(checkpoint=f'model_step{itr_global}')
def train(args):
    """Train a multi-modal (name/API/tokens) code-search model with NSML support.

    Side effects: file logging, optional TensorBoard writer, NSML reporting and
    saving, periodic validation via `validate`, and iteration-keyed checkpoints
    under ./output/<model>/<dataset>/models/.
    """
    fh = logging.FileHandler(f"./output/{args.model}/{args.dataset}/logs.txt")
    logger.addHandler(fh)  # add the handlers to the logger
    timestamp = datetime.now().strftime('%Y%m%d%H%M')
    tb_writer = SummaryWriter(
        f"./output/{args.model}/{args.dataset}/logs/{timestamp}"
    ) if args.visual else None

    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    device = torch.device(
        f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")

    def save_model(model, epoch):
        # State-dict checkpoint keyed by the global iteration (passed as `epoch`).
        torch.save(
            model.state_dict(),
            f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5')

    def load_model(model, epoch, to_device):
        assert os.path.exists(
            f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5'
        ), f'Weights at epoch {epoch} not found'
        model.load_state_dict(
            torch.load(
                f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5',
                map_location=to_device))

    config = getattr(configs, 'config_' + args.model)()
    if args.automl:
        # Let CLI/automl arguments override config entries.
        config.update(vars(args))
    print(config)

    ###############################################################################
    # Load data
    ###############################################################################
    # NOTE(review): eval() of a config-provided class name — project-controlled,
    # but would be unsafe with untrusted config.
    data_path = DATASET_PATH + "/train/" if IS_ON_NSML else args.data_path + args.dataset + '/'
    train_set = eval(config['dataset_name'])(
        data_path, config['train_name'], config['name_len'],
        config['train_api'], config['api_len'], config['train_tokens'],
        config['tokens_len'], config['train_desc'], config['desc_len'])
    valid_set = eval(config['dataset_name'])(
        data_path, config['valid_name'], config['name_len'],
        config['valid_api'], config['api_len'], config['valid_tokens'],
        config['tokens_len'], config['valid_desc'], config['desc_len'])
    data_loader = torch.utils.data.DataLoader(dataset=train_set,
                                              batch_size=config['batch_size'],
                                              shuffle=True,
                                              drop_last=True,
                                              num_workers=1)

    ###############################################################################
    # Define the models
    ###############################################################################
    logger.info('Constructing Model..')
    model = getattr(models, args.model)(config)  #initialize the model
    if args.reload_from > 0:
        load_model(model, args.reload_from, device)
    logger.info('done')
    if IS_ON_NSML:
        bind_nsml(model)
    model.to(device)

    # No weight decay for biases / LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters,
                                  lr=config['learning_rate'],
                                  eps=config['adam_epsilon'])
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config['warmup_steps'],
        num_training_steps=len(data_loader) * config['nb_epoch']
    )  # do not forget to modify the number when dataset is changed

    # Optional mixed-precision training via NVIDIA apex.
    if config['fp16']:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=config['fp16_opt_level'])

    n_iters = len(data_loader)
    itr_global = args.reload_from + 1  # reload_from counts optimizer steps here
    for epoch in range(
            int(args.reload_from / n_iters) + 1, config['nb_epoch'] + 1):
        itr_start_time = time.time()
        losses = []
        for batch in data_loader:
            model.train()
            batch_gpu = [tensor.to(device) for tensor in batch]
            loss = model(*batch_gpu)
            if config['fp16']:
                # fp16 path: scale the loss and clip the master (fp32) params.
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               5.0)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            losses.append(loss.item())
            if itr_global % args.log_every == 0:
                elapsed = time.time() - itr_start_time
                logger.info('epo:[%d/%d] itr:[%d/%d] step_time:%ds Loss=%.5f' %
                            (epoch, config['nb_epoch'], itr_global % n_iters,
                             n_iters, elapsed, np.mean(losses)))
                if tb_writer is not None:
                    tb_writer.add_scalar('loss', np.mean(losses), itr_global)
                if IS_ON_NSML:
                    summary = {
                        "summary": True,
                        "scope": locals(),
                        "step": itr_global
                    }
                    summary.update({'loss': np.mean(losses)})
                    nsml.report(**summary)
                losses = []
                itr_start_time = time.time()
            itr_global = itr_global + 1
            if itr_global % args.valid_every == 0:
                # Retrieval metrics on a pool of 10000 with top-1 evaluation.
                logger.info("validating..")
                acc1, mrr, map1, ndcg = validate(valid_set, model, 10000, 1,
                                                 config['sim_measure'])
                logger.info(f'ACC={acc1}, MRR={mrr}, MAP={map1}, nDCG={ndcg}')
                if tb_writer is not None:
                    tb_writer.add_scalar('acc', acc1, itr_global)
                    tb_writer.add_scalar('mrr', mrr, itr_global)
                    tb_writer.add_scalar('map', map1, itr_global)
                    tb_writer.add_scalar('ndcg', ndcg, itr_global)
                if IS_ON_NSML:
                    summary = {
                        "summary": True,
                        "scope": locals(),
                        "step": itr_global
                    }
                    summary.update({
                        'acc': acc1,
                        'mrr': mrr,
                        'map': map1,
                        'ndcg': ndcg
                    })
                    nsml.report(**summary)
            if itr_global % args.save_every == 0:
                save_model(model, itr_global)
                if IS_ON_NSML:
                    nsml.save(itr_global)
def train(args):
    """Evaluation-only debug variant of train (DeepCS-style model).

    NOTE(review): this function is in a temporary debugging state — it loads a
    hard-coded checkpoint, runs validate() once, and calls exit(). Everything
    after exit() is unreachable, and `data_loader` (whose construction is
    commented out) is referenced in that dead code; restoring training requires
    re-enabling the train_set/data_loader lines and removing the exit().
    """
    # NOTE(review): hard-coded user-specific output paths (/data/yanghe/...)
    # instead of the ./output/... convention used by the sibling variants.
    fh = logging.FileHandler(f"/data/yanghe/pytorch/{args.model}/{args.dataset}/logs.txt")
    logger.addHandler(fh)  # add the handlers to the logger
    timestamp = datetime.now().strftime('%Y%m%d%H%M')
    tb_writer = SummaryWriter(
        f"/data/yanghe/pytorch/{args.model}/{args.dataset}/logs/{timestamp}") if args.visual else None

    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
    config = getattr(configs, 'config_' + args.model)()
    if args.automl:
        # Let CLI/automl arguments override config entries.
        config.update(vars(args))
    print(config)

    ###############################################################################
    # Load data
    ###############################################################################
    data_path = DATASET_PATH + "/train/" if IS_ON_NSML else args.data_path + args.dataset + '/'
    # train_set = eval(config['dataset_name'])(data_path, config['train_name'], config['name_len'],
    #                                          config['train_api'], config['api_len'],
    #                                          config['train_tokens'], config['tokens_len'],
    #                                          config['train_desc'], config['desc_len'])
    # Validation file names are forced to the test split here.
    config['valid_name'] = 'test.name.h5'
    config['valid_api'] = 'test.apiseq.h5'
    config['valid_tokens'] = 'test.tokens.h5'
    config['valid_desc'] = 'test.desc.h5'
    valid_set = eval(config['dataset_name'])(data_path, config['valid_name'], config['name_len'],
                                             config['valid_api'], config['api_len'],
                                             config['valid_tokens'], config['tokens_len'],
                                             config['valid_desc'], config['desc_len'])
    # data_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=config['batch_size'],
    #                                           shuffle=False, drop_last=True, num_workers=1)

    ###############################################################################
    # Define Model
    ###############################################################################
    logger.info('Constructing Model..')
    model = getattr(models, args.model)(config)  # initialize the model

    def save_model(model, ckpt_path):
        # State-dict checkpoint at an explicit path.
        torch.save(model.state_dict(), ckpt_path)

    def load_model(model, ckpt_path, to_device):
        assert os.path.exists(ckpt_path), f'Weights not found'
        model.load_state_dict(torch.load(ckpt_path, map_location=to_device))

    if args.reload_from > 0:
        ckpt = f'/data/yanghe/pytorch/{args.model}/{args.dataset}/models/step{args.reload_from}.h5'
        load_model(model, ckpt, device)
    if IS_ON_NSML:
        bind_nsml(model)
    model.to(device)

    # --- DEBUG BLOCK: unconditionally load a fixed checkpoint, validate, quit ---
    # NOTE(review): this overrides any reload_from checkpoint loaded above.
    ckpt = '/data/yanghe/.ncc/deepcs/raw/step4000000.h5'
    load_model(model, ckpt, device)
    valid_result = validate(valid_set, model, 10000, 1, config['sim_measure'])  # result discarded; printed inside validate presumably — confirm
    exit()

    # ------------- everything below is UNREACHABLE (dead code after exit()) -----
    # NOTE(review): `data_loader` below is undefined because its construction is
    # commented out above; this would raise NameError if exit() were removed.
    ###############################################################################
    # Prepare the Optimizer
    ###############################################################################
    # No weight decay for biases / LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=config['learning_rate'],
                                  eps=config['adam_epsilon'])
    # do not forget to modify the number when dataset is changed
    scheduler = get_cosine_schedule_with_warmup(
        optimizer, num_warmup_steps=config['warmup_steps'],
        num_training_steps=len(data_loader) * config['nb_epoch'])

    # Optional mixed-precision training via NVIDIA apex.
    if config['fp16']:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=config['fp16_opt_level'])

    ###############################################################################
    # Training Process
    ###############################################################################
    n_iters = len(data_loader)
    itr_global = args.reload_from + 1
    for epoch in range(int(args.reload_from / n_iters) + 1, config['nb_epoch'] + 1):
        itr_start_time = time.time()
        losses = []
        for batch in data_loader:
            model.train()
            batch_gpu = [tensor.to(device) for tensor in batch]
            loss = model(*batch_gpu)
            if config['fp16']:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 5.0)
            else:
                print(loss.item())  # NOTE(review): debug print on every non-fp16 step
                loss.backward()
                # print([p.grad.sum().item() for p in model.parameters() if p.grad is not None])
                torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
                # print([p.grad.sum() for p in model.parameters() if p.grad is not None])
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            losses.append(loss.item())
            if itr_global % args.log_every == 0:
                elapsed = time.time() - itr_start_time
                logger.info('epo:[%d/%d] itr:[%d/%d] step_time:%ds Loss=%.5f' %
                            (epoch, config['nb_epoch'], itr_global % n_iters, n_iters, elapsed,
                             np.mean(losses),))
                if tb_writer is not None:
                    tb_writer.add_scalar('loss', np.mean(losses), itr_global)
                if IS_ON_NSML:
                    summary = {"summary": True, "scope": locals(), "step": itr_global}
                    summary.update({'loss': np.mean(losses)})
                    nsml.report(**summary)
                losses = []
                itr_start_time = time.time()
            itr_global = itr_global + 1
            if itr_global % args.valid_every == 0:
                logger.info("validating..")
                valid_result = validate(valid_set, model, 10000, 1, config['sim_measure'])
                logger.info(valid_result)
                if tb_writer is not None:
                    for key, value in valid_result.items():
                        tb_writer.add_scalar(key, value, itr_global)
                if IS_ON_NSML:
                    summary = {"summary": True, "scope": locals(), "step": itr_global}
                    summary.update(valid_result)
                    nsml.report(**summary)
            if itr_global % args.save_every == 0:
                ckpt_path = f'/data/yanghe/pytorch/{args.model}/{args.dataset}/models/step{itr_global}.h5'
                save_model(model, ckpt_path)
                if IS_ON_NSML:
                    nsml.save(checkpoint=f'model_step{itr_global}')