def main(args):
    """Train / test / evaluate the encoder-decoder review network on the Yelp data.

    Phases are selected by boolean flags on ``args`` (``args.train``,
    ``args.test``, ``args.eval``); data, model and checkpoint locations come
    from the remaining ``args`` attributes.
    """
    # NOTE(review): `ts` is unused — kept for parity with sibling scripts; confirm before removing.
    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())
    seed = 1111
    set_seed(seed)

    #### get data
    data_obj = _DATA()
    train_data, valid_data, vocab_obj = data_obj.f_load_data_yelp(args)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device", device)

    if args.train:
        # Timestamped checkpoint name: <month>_<day>_<hour>_<minute>
        now_time = datetime.datetime.now()
        time_name = str(now_time.month)+"_"+str(now_time.day)+"_"+str(now_time.hour)+"_"+str(now_time.minute)
        model_file = os.path.join(args.model_path, args.data_name+"_"+args.model_name)

        if not os.path.isdir(model_file):
            print("create a directory ", model_file)
            # os.makedirs (was os.mkdir): also creates missing parent directories,
            # so a fresh args.model_path does not crash the run.
            os.makedirs(model_file)

        args.model_file = model_file+"/model_best_"+time_name+".pt"
        print("model_file", model_file)

    print("vocab_size", vocab_obj.vocab_size)
    print("user num", vocab_obj.user_size)

    ### get model
    network = _NETWORK(vocab_obj, args, device=device)

    ### count trainable parameters
    total_param_num = 0
    for name, param in network.named_parameters():
        if param.requires_grad:
            param_num = param.numel()
            total_param_num += param_num
            print(name, "\t", param_num)
    print("total parameters num", total_param_num)

    if args.train:
        logger_obj = _LOGGER()
        logger_obj.f_add_writer(args)

        # Two optimizers: one for the encoder side (embedding, user/item
        # encoder, output projection), one for the generator.
        en_parameters = list(network.m_embedding.parameters()) + list(network.m_user_item_encoder.parameters()) + list(network.m_output2vocab.parameters())
        en_optimizer = _OPTIM(en_parameters, args)

        de_parameters = network.m_generator.parameters()
        de_optimizer = _OPTIM(de_parameters, args)

        trainer = _TRAINER(vocab_obj, args, device)
        trainer.f_train(train_data, valid_data, network, en_optimizer, de_optimizer, logger_obj)

        logger_obj.f_close_writer()

    if args.test:
        print("="*10, "test", "="*10)
        infer_obj = _INFER(vocab_obj, args, device)
        infer_obj.f_init_infer(network, args.model_file, reload_model=True)
        infer_obj.f_inference(train_data, valid_data)

    if args.eval:
        print("="*10, "eval", "="*10)
        eval_obj = _EVAL(vocab_obj, args, device)
        eval_obj.f_init_eval(network, args.model_file, reload_model=True)
        eval_obj.f_eval(train_data, valid_data)
# NOTE(review): orphaned fragment — this duplicates the trainer/test/eval tail of a
# sibling main() variant whose beginning is not visible in this file, and it
# references `pretrain_encoder` / `pretrain_optimizer`, which are defined nowhere
# in view. Presumably left over from a file concatenation/merge; verify whether
# this is dead code before relying on it (the trailing `if __name__ == "__main__":`
# guard is also cut off mid-statement).
de_optimizer = _OPTIM(de_parameters, args) # print("=="*20) # print("generator parameter") # for name, p in network.m_generator.named_parameters(): # print(name) # print("=="*20) # exit() trainer = _TRAINER(vocab_obj, args, device) trainer.f_train(train_data, valid_data, pretrain_encoder, pretrain_optimizer, network, en_optimizer, de_optimizer, logger_obj) logger_obj.f_close_writer() if args.test: print("="*10, "test", "="*10) infer_obj = _INFER(vocab_obj, args, device) infer_obj.f_init_infer(network, args.model_file, reload_model=True) infer_obj.f_inference(train_data, valid_data) if args.eval: print("="*10, "eval", "="*10) eval_obj = _EVAL(vocab_obj, args, device) eval_obj.f_init_eval(network, args.model_file, reload_model=True) eval_obj.f_eval(train_data, valid_data) if __name__ == "__main__":
def main(args):
    """Train / test / evaluate the generator network, optionally under DDP.

    With ``args.parallel`` set, initializes a NCCL process group and wraps the
    generator in ``DistributedDataParallel``; a pretrained encoder checkpoint
    (``args.E_model_file``) is loaded and passed to the trainer alongside the
    generator being optimized.
    """
    # NOTE(review): `ts` is unused — kept for parity with sibling scripts; confirm before removing.
    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())
    seed = 1111
    set_seed(seed)

    #### get data
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device", device)

    # BUGFIX: `local_rank` was bound only inside the `args.parallel` branch but
    # is passed to f_train_M unconditionally below, raising NameError in
    # single-process runs. Default it to rank 0 first.
    local_rank = 0
    if args.parallel:
        local_rank = args.local_rank
        torch.distributed.init_process_group(backend="nccl")
        device = torch.device('cuda:{}'.format(local_rank))

    data_obj = _DATA()
    train_data, valid_data, vocab_obj = data_obj.f_load_data_yelp(args)

    if args.train:
        # Timestamped checkpoint name: <month>_<day>_<hour>_<minute>
        now_time = datetime.datetime.now()
        time_name = str(now_time.month) + "_" + str(now_time.day) + "_" + str(
            now_time.hour) + "_" + str(now_time.minute)
        model_file = os.path.join(args.model_path, args.data_name + "_" + args.model_name)

        if not os.path.isdir(model_file):
            print("create a directory ", model_file)
            # os.makedirs (was os.mkdir): also creates missing parent directories.
            os.makedirs(model_file)

        args.model_file = model_file + "/model_best_" + time_name + ".pt"
        print("model_file", model_file)

    print("vocab_size", vocab_obj.vocab_size)
    print("user num", vocab_obj.user_size)

    ### get model
    network = _GEN_NETWORK(vocab_obj, args)

    ### count trainable parameters
    total_param_num = 0
    for name, param in network.named_parameters():
        if param.requires_grad:
            param_num = param.numel()
            total_param_num += param_num
            print(name, "\t", param_num)
    print("total parameters num", total_param_num)

    if args.train:
        logger_obj = _LOGGER()
        logger_obj.f_add_writer(args)

        # Pretrained encoder, restored from checkpoint. Only `network`'s
        # parameters go into the optimizer below — presumably the encoder
        # stays frozen; TODO confirm inside _TRAINER.f_train_M.
        E_network = _ENC_NETWORK(vocab_obj, args)
        E_network = E_network.to(device)

        model_path = args.model_path
        E_model_file = args.E_model_file
        E_model_abs_file = os.path.join(model_path, E_model_file)
        print("E_model_abs_file", E_model_abs_file)

        check_point = torch.load(E_model_abs_file)
        E_network.load_state_dict(check_point['model'])

        network = network.to(device)
        if args.parallel:
            network = torch.nn.parallel.DistributedDataParallel(
                network, device_ids=[local_rank], output_device=local_rank,
                find_unused_parameters=True)

        de_parameters = network.parameters()
        de_optimizer = _OPTIM(de_parameters, args)

        trainer = _TRAINER(vocab_obj, args, device)
        trainer.f_train_M(train_data, valid_data, E_network, network, de_optimizer, logger_obj, local_rank)

        logger_obj.f_close_writer()

    if args.test:
        print("=" * 10, "test", "=" * 10)
        infer_obj = _INFER(vocab_obj, args, device)
        infer_obj.f_init_infer(network, args.model_file, reload_model=True)
        infer_obj.f_inference(train_data, valid_data)

    if args.eval:
        print("=" * 10, "eval", "=" * 10)
        eval_obj = _EVAL(vocab_obj, args, device)
        eval_obj.f_init_eval(network, args.model_file, reload_model=True)
        eval_obj.f_eval(train_data, valid_data)
def main(args):
    """Train / test / evaluate the GPT-2-based generator on the Yelp data.

    Restores a ``GPT2ForLatentConnector`` decoder (plus tokenizer) from
    ``args.checkpoint_dir`` at step ``args.global_step_eval``, registers the
    PAD/BOS/EOS special tokens, attaches the decoder to ``_GEN_NETWORK`` and
    trains with AdamW + linear warmup.
    """
    # NOTE(review): `ts` is unused — kept for parity with sibling scripts; confirm before removing.
    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())
    seed = 1111
    set_seed(seed)

    #### get data
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device", device)

    args.decoder_model_type = args.decoder_model_type.lower()

    global_step = args.global_step_eval
    print("checkpoint dir", args.checkpoint_dir)

    output_decoder_dir = os.path.join(args.checkpoint_dir, "checkpoint-decoder-{}".format(global_step))
    output_full_dir = os.path.join(args.checkpoint_dir, "checkpoint-full-{}".format(global_step))
    print("output_decoder_dir: ", output_decoder_dir)
    print("output_full_dir: ", output_full_dir)

    MODEL_CLASSES = {'gpt2': (GPT2Config, GPT2ForLatentConnector, GPT2Tokenizer)}
    decoder_config_class, decoder_model_class, decoder_tokenizer_class = MODEL_CLASSES[args.decoder_model_type]

    model_decoder = decoder_model_class.from_pretrained(output_decoder_dir, latent_size=args.latent_size)
    tokenizer_decoder = decoder_tokenizer_class.from_pretrained(
        args.decoder_tokenizer_name if args.decoder_tokenizer_name else args.decoder_model_name_or_path,
        do_lower_case=args.do_lower_case)
    print("decoder_tokenizer_name ", args.decoder_tokenizer_name)
    print("decoder_model_name_or_path ", args.decoder_model_name_or_path)
    model_decoder.to(device)

    # Clamp block size to what the tokenizer supports (<=0 means "use the max").
    if args.block_size <= 0:
        args.block_size = tokenizer_decoder.max_len_single_sentence
    print("max_len_single_sentence: ", tokenizer_decoder.max_len_single_sentence)
    args.block_size = min(args.block_size, tokenizer_decoder.max_len_single_sentence)
    print("block size: ", args.block_size)

    # NOTE(review): this checkpoint is loaded but never used below — confirm
    # whether it is needed (e.g. for optimizer state) or can be removed.
    checkpoint = torch.load(os.path.join(output_full_dir, "training.bin"))

    # Register PAD/BOS/EOS and grow the decoder's embedding matrix to match.
    special_tokens_dict = {'pad_token': '<PAD>', 'bos_token': '<BOS>', 'eos_token': '<EOS>'}
    num_added_toks = tokenizer_decoder.add_special_tokens(special_tokens_dict)
    print('We have added', num_added_toks, 'tokens to GPT2')
    model_decoder.resize_token_embeddings(len(tokenizer_decoder))
    assert tokenizer_decoder.pad_token == '<PAD>'

    data_obj = _DATA()
    train_data, valid_data, vocab_obj = data_obj.f_load_data_yelp_GPT(tokenizer_decoder, args)

    if args.train:
        # Timestamped checkpoint name: <month>_<day>_<hour>_<minute>
        now_time = datetime.datetime.now()
        time_name = str(now_time.month)+"_"+str(now_time.day)+"_"+str(now_time.hour)+"_"+str(now_time.minute)
        model_file = os.path.join(args.model_path, args.data_name+"_"+args.model_name)

        if not os.path.isdir(model_file):
            print("create a directory ", model_file)
            # os.makedirs (was os.mkdir): also creates missing parent directories.
            os.makedirs(model_file)

        args.model_file = model_file+"/model_best_"+time_name+".pt"
        print("model_file", model_file)

    network = _GEN_NETWORK(vocab_obj, args)

    ### count trainable parameters
    total_param_num = 0
    for name, param in network.named_parameters():
        if param.requires_grad:
            param_num = param.numel()
            total_param_num += param_num
            print(name, "\t", param_num)
    print("total parameters num", total_param_num)

    if args.train:
        logger_obj = _LOGGER()
        logger_obj.f_add_writer(args)

        # Encoder network; no checkpoint is restored for it in this variant
        # (the E_model_file loading was commented out upstream).
        E_network = _ENC_NETWORK(vocab_obj, args)
        E_network = E_network.to(device)

        network.init_tokenizer_decoder(tokenizer_decoder, model_decoder)
        network = network.to(device)

        # AdamW with weight decay disabled on biases and LayerNorm weights,
        # the standard transformer fine-tuning recipe.
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in network.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in network.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        t_total = len(train_data) // args.gradient_accumulation_steps * args.num_train_epochs
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

        local_rank = 0  # single-process here; kept to satisfy f_train_M's signature
        trainer = _TRAINER(vocab_obj, args, device)
        trainer.f_train_M(train_data, valid_data, E_network, network, optimizer, scheduler, logger_obj, local_rank, args)

        logger_obj.f_close_writer()

    if args.test:
        print("="*10, "test", "="*10)
        infer_obj = _INFER(vocab_obj, args, device)
        infer_obj.f_init_infer(network, args.model_file, reload_model=True)
        infer_obj.f_inference(train_data, valid_data)

    if args.eval:
        print("="*10, "eval", "="*10)
        eval_obj = _EVAL(vocab_obj, args, device)
        eval_obj.f_init_eval(network, args.model_file, reload_model=True)
        eval_obj.f_eval(train_data, valid_data)