def build_model(self):
    """Construct the distribution strategy, YOLO model, loss and optimizer.

    Populates ``self.strategy``, ``self.model``, ``self.anchors``,
    ``self.stride``, ``self.num_classes``, ``self.loss_fn`` and
    ``self.optimizer`` from ``self.params``.
    """
    # Mirror across all visible GPUs, or pin everything to a single device.
    self.strategy = (tf.distribute.MirroredStrategy(devices=None)
                     if self.params['multi_gpus']
                     else tf.distribute.OneDeviceStrategy(device="/gpu:0"))
    with self.strategy.scope():
        self.model = Yolo(yaml_dir=self.params['yaml_dir'])
        # The final module is the detection head; cache its configuration.
        detect_head = self.model.module_list[-1]
        self.anchors = detect_head.anchors
        self.stride = detect_head.stride
        self.num_classes = detect_head.num_classes
        self.loss_fn = YoloLoss(
            detect_head.anchors,
            ignore_iou_threshold=0.3,
            num_classes=self.num_classes,
            label_smoothing=self.params['label_smoothing'],
            img_size=self.params['img_size'])
        self.optimizer = Optimizer('adam')()
def get_lineup(df, sport, site):
    """Build an optimized DFS lineup for the most recent event in *df*.

    :param df: DataFrame of player rows; must contain ``event_id`` plus the
        sport-specific model variables selected below.
    :param sport: ``'nba'`` or ``'pga'`` — selects features and model file.
    :param site: DFS site identifier; selects which pickled model to load.
    :return: the lineup produced by the project ``Optimizer``.
    :raises ValueError: if *sport* is not a supported value.
    """
    s3 = boto3.resource('s3')
    # FIX: reuse this bucket handle below instead of constructing a second
    # identical s3.Bucket("my-dfs-data") (the original left this unused).
    bucket = s3.Bucket('my-dfs-data')
    if sport == 'nba':
        constants = ['name', 'pos', 'event_id']
        model_variables = ['pp', 'ppg', 'salary', 'lovecount', 'hatecount']
    elif sport == 'pga':
        constants = ['name', 'event_id']
        model_variables = [
            'pp', 'ppg', 'salary', 'vegas_odds_0', 'vegas_value_0'
        ]
    else:
        # FIX: previously fell through with model_variables undefined (NameError).
        raise ValueError("unsupported sport: {}".format(sport))
    for col in model_variables:
        df[col] = pd.to_numeric(df[col])
    df_tmp = df[constants + model_variables].dropna()
    # Score only the latest event present in the frame.
    preds = df_tmp[df_tmp.event_id == df.event_id.max()]
    # SECURITY: pickle.loads executes arbitrary code from the pickle stream —
    # safe only while write access to this bucket stays locked down.
    obj = pickle.loads(
        bucket.Object("{}/modeling/model_{}.pkl".format(
            sport, site)).get()['Body'].read())
    trace, scaler, player_ids = obj[0], obj[1], obj[2]
    preds = preds.merge(player_ids, how='left', on='name')
    tmp_preds = scaler.transform(preds[model_variables])
    preds['posterior'] = None
    for i, name in enumerate(preds.name):
        idx = preds.loc[preds.name == name, 'player_idx'].values[0]
        if not np.isnan(idx):
            # Double list wrapping stores the whole array as a single cell.
            preds.loc[preds.name == name, 'posterior'] = [[
                get_post_preds(i, trace, tmp_preds, idx)
            ]]
        else:
            # Player missing from the model's index — reported, then dropped.
            print(name)
            preds.loc[preds.name == name, 'posterior'] = np.nan
    preds = preds.dropna()
    # Point projection = mean of the posterior draws.
    preds['preds'] = preds['posterior'].apply(lambda x: x[0].mean())
    # Use optimizer for lineups.
    opt = Optimizer(preds, sport, site)
    opt.solve()
    opt.get_lineup()
    return opt.lineup
def train(data):
    """Train a SentClassifier/SeqLabel model over ``data.HP_iteration`` epochs.

    Saves the data config to ``<model_dir>.dset``, logs running losses every
    500 instances, evaluates on dev after each epoch, and checkpoints the
    model whenever the dev score improves.

    :param data: project Data object carrying hyperparameters (``HP_*``),
        datasets (``train_Ids``), and output paths (``model_dir``).
    """
    print("Training model...")
    data.show_data_summary()
    # Persist the data/alphabet configuration next to the model checkpoints.
    save_data_name = data.model_dir +".dset"
    data.save(save_data_name)
    if data.sentence_classification:
        model = SentClassifier(data)
    else:
        model = SeqLabel(data)
    # Project Optimizer wraps separate optimizers/LRs for base and GCN params.
    optimizer = Optimizer('sgd', 'adam', model, 'gcn', lr=data.HP_lr, lr_gcn=data.HP_lr_gcn, momentum=data.HP_momentum, lr_decay=data.HP_lr_decay)
    best_dev = -10
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch: %s/%s" %(idx,data.HP_iteration))
        instance_count = 0
        sample_id = 0
        # Running (since last report) and per-epoch loss accumulators.
        sample_loss = 0
        sample_loss_flat = 0
        sample_loss_graph = 0
        total_loss = 0
        right_token = 0
        whole_token = 0
        random.shuffle(data.train_Ids)
        ## set model in train model
        model.train()
        model.zero_grad()
        batch_size = data.HP_batch_size
        batch_id = 0
        train_num = len(data.train_Ids)
        # +1 so the final partial batch is included.
        total_batch = train_num//batch_size+1
        for batch_id in range(total_batch):
            start = batch_id*batch_size
            end = (batch_id+1)*batch_size
            if end >train_num:
                end = train_num
            instance = data.train_Ids[start:end]
            if not instance:
                continue
            batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask, ans_matrix, wgt_matrix = batchify_with_label(data, instance, data.HP_gpu, True, data.sentence_classification)
            instance_count += 1
            # loss = combined objective; loss_flat/loss_graph are its components.
            loss_flat, loss_graph, loss, tag_seq = model.calculate_loss(idx, batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask, ans_matrix, wgt_matrix)
            right, whole = predict_check(tag_seq, batch_label, mask)
            right_token += right
            whole_token += whole
            sample_loss += loss.item()
            sample_loss_flat += loss_flat.item()
            sample_loss_graph += loss_graph.item()
            total_loss += loss.item()
            # Progress report every 500 instances.
            if end%500 == 0:
                temp_time = time.time()
                temp_cost = temp_time - temp_start
                temp_start = temp_time
                print("     Instance: %s; Time: %.2fs; loss_flat: %.4f; loss_graph: %.4f; loss: %.4f; acc: %.4f"%(end, temp_cost, sample_loss_flat, sample_loss_graph, sample_loss, (right_token+0.)/whole_token))
                # Abort on divergence (exploding or NaN loss).
                if sample_loss > 1e8 or str(sample_loss) == "nan":
                    print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....")
                    exit(1)
                sys.stdout.flush()
                sample_loss = 0
                sample_loss_flat = 0
                sample_loss_graph = 0
            loss.backward()
            if data.HP_clip is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), data.HP_clip)
            optimizer.step()
            model.zero_grad()
            # Let the optimizer wrapper adjust its schedule per batch.
            optimizer.update(idx+1, batch_id+1, total_batch)
        temp_time = time.time()
        temp_cost = temp_time - temp_start
        print("     Instance: %s; Time: %.2fs; loss_flat: %.4f; loss_graph: %.4f; loss: %.4f; acc: %.4f"%(end, temp_cost, sample_loss_flat, sample_loss_graph, sample_loss, (right_token+0.)/whole_token))
        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s"%(idx, epoch_cost, train_num/epoch_cost, total_loss))
        print("totalloss:", total_loss)
        if total_loss > 1e8 or str(total_loss) == "nan":
            print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....")
            exit(1)
        # continue
        speed, p, r, f, _,_ = evaluate(data, model, "dev", idx)
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish
        if data.seg:
            current_score = f
            print("Test: time: %.2fs, speed: %.2fst/s; [p: %.4f, r: %.4f, f: %.4f]"%(dev_cost, speed, p, r, f))
        # NOTE(review): when data.seg is False, current_score is never
        # assigned, so the comparison below raises NameError — confirm
        # whether non-seg runs are expected here.
        if current_score > best_dev:
            if data.seg:
                print("Exceed previous best f score:", best_dev)
            else:
                print("Exceed previous best acc score:", best_dev)
            # Checkpoint the new best model, tagged with the epoch index.
            model_name = data.model_dir +'.'+ str(idx) + ".model"
            print("Save current best model in file:", model_name)
            torch.save(model.state_dict(), model_name)
            best_dev = current_score
        # ## decode test
        speed, p, r, f, _,_ = evaluate(data, model, "test", idx)
        test_finish = time.time()
        test_cost = test_finish - dev_finish
        if data.seg:
            print("Test: time: %.2fs, speed: %.2fst/s; [p: %.4f, r: %.4f, f: %.4f]"%(test_cost, speed, p, r, f))
        else:
            # NOTE(review): `acc` is not defined anywhere in this function —
            # this branch raises NameError. evaluate() likely should return
            # an accuracy for the non-seg case; verify against its signature.
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f"%(test_cost, speed, acc))
        gc.collect()
class Trainer(object):
    """ Trainer class that uses the dataset and model to train

    Builds a YOLO model under a tf.distribute strategy, runs the
    distributed training loop with TensorBoard logging and periodic
    checkpointing, and exports a SavedModel at the end.

    # Usage
        data_loader = tf.data.Dataset()
        trainer = Trainer(params)
        trainer.train(data_loader)
    """
    def __init__(self, params):
        """ Constructor
        :param params: dict, with dir and training parameters
        """
        self.params = params
        # Start each run with a clean TensorBoard log directory.
        if os.path.exists(self.params['log_dir']):
            shutil.rmtree(self.params['log_dir'])
        self.log_writer = tf.summary.create_file_writer(self.params['log_dir'])
        # Global step counter shared across epochs; int64 as tf.summary expects.
        self.global_step = tf.Variable(0, trainable=False, dtype=tf.int64)
        self.build_model()

    def build_model(self):
        """ Build the model, define the training strategy and model, loss, optimizer
        :return:
        """
        if self.params['multi_gpus']:
            self.strategy = tf.distribute.MirroredStrategy(devices=None)
        else:
            self.strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
        with self.strategy.scope():
            self.model = Yolo(yaml_dir=self.params['yaml_dir'])
            # The last module is the detection head; cache its configuration.
            self.anchors = self.model.module_list[-1].anchors
            self.stride = self.model.module_list[-1].stride
            self.num_classes = self.model.module_list[-1].num_classes
            self.loss_fn = YoloLoss(
                self.model.module_list[-1].anchors,
                ignore_iou_threshold=0.3,
                num_classes=self.num_classes,
                label_smoothing=self.params['label_smoothing'],
                img_size=self.params['img_size'])
            self.optimizer = Optimizer('adam')()

    def train(self, train_dataset, valid_dataset=None, transfer='scratch'):
        """ train function
        :param train_dataset: train dataset built by tf.data
        :param valid_dataset: valid dataset build by td.data, optional
            NOTE(review): valid_dataset is accepted but never used in this
            loop — validate() is never called. Confirm whether validation
            was meant to run each epoch.
        :param transfer: pretrain ('darknet' | 'resume' | 'scratch')
        :return:
        """
        # Assumes the dataset wrapper exposes a `.len` attribute — TODO confirm.
        steps_per_epoch = train_dataset.len / self.params['batch_size']
        self.total_steps = int(self.params['n_epochs'] * steps_per_epoch)
        self.params[
            'warmup_steps'] = self.params['warmup_epochs'] * steps_per_epoch

        with self.strategy.scope():
            self.lr_scheduler = LrScheduler(self.total_steps, self.params)
            # => tf.keras.Model
            self.model = self.model(self.params['img_size'])
            ckpt = tf.train.Checkpoint(model=self.model,
                                       optimizer=self.optimizer)
            ckpt_manager = tf.train.CheckpointManager(
                ckpt, self.params['checkpoint_dir'], max_to_keep=5)

            if transfer == 'darknet':
                print("Load weights from ")
                # NOTE(review): this branch looks like an unfinished stub —
                # load_weights() has no path and set_weights() is called
                # without the required weights argument, so it will raise.
                model_pretrain = Yolo(self.params['yaml_dir'])()
                model_pretrain.load_weights()
                self.model.get_layer().set_weights()
            elif transfer == 'resume':
                print("Load weights from latest checkpoint")
                ckpt.restore(ckpt_manager.latest_checkpoint)
            elif transfer == 'scratch':
                print("Train from scratch")
            print(self.model.summary())

        # Shard the dataset across replicas for distributed training.
        train_dataset = self.strategy.experimental_distribute_dataset(
            train_dataset)
        for epoch in range(1, self.params['n_epochs'] + 1):
            for step, (image, target) in enumerate(train_dataset):
                loss = self.dist_train_step(image, target)
                print('=> Epoch {}, Step {}, Loss {:.5f}'.format(
                    epoch, self.global_step.numpy(), loss.numpy()))
                with self.log_writer.as_default():
                    tf.summary.scalar('loss', loss, step=self.global_step)
                    tf.summary.scalar('lr',
                                      self.optimizer.lr,
                                      step=self.global_step)
                    self.log_writer.flush()
            # Checkpoint every third epoch.
            if epoch % 3 == 0:
                ckpt_save_path = ckpt_manager.save()
                print('Saving checkpoint for epoch {} at {}'.format(
                    epoch, ckpt_save_path))
        self.export_model()

    # @tf.function
    def train_step(self, image, target):
        """Single replica step: forward, loss, backprop, LR schedule update."""
        with tf.GradientTape() as tape:
            logit = self.model(image, training=True)
            iou_loss, conf_loss, prob_loss = self.loss_fn(target, logit)
            total_loss = iou_loss + conf_loss + prob_loss
        gradients = tape.gradient(total_loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradients, self.model.trainable_variables))
        # Advance the LR schedule and push the new rate into the optimizer.
        lr = self.lr_scheduler.step()
        self.optimizer.lr.assign(lr)
        self.global_step.assign_add(1)
        return total_loss

    @tf.function
    def dist_train_step(self, image, target):
        """Run train_step on every replica and average the per-replica losses."""
        with self.strategy.scope():
            loss = self.strategy.run(self.train_step, args=(image, target))
            total_loss_mean = self.strategy.reduce(tf.distribute.ReduceOp.MEAN,
                                                   loss,
                                                   axis=None)
            return total_loss_mean

    def validate(self, valid_dataset):
        """Mean validation loss over the whole dataset (currently unused)."""
        valid_loss = []
        for step, (image, target) in enumerate(valid_dataset):
            step_valid_loss = self.valid_step(image, target)
            valid_loss.append(step_valid_loss)
        return np.mean(valid_loss)

    def valid_step(self, image, label):
        """Forward pass without training; returns the summed loss components."""
        logit = self.model(image, training=False)
        iou_loss, conf_loss, prob_loss = self.loss_fn(label, logit)
        return iou_loss + conf_loss + prob_loss

    def export_model(self):
        """Export the trained model as a TensorFlow SavedModel (.pb)."""
        tf.saved_model.save(self.model, self.params['saved_model_dir'])
        print("pb model saved in {}".format(self.params['saved_model_dir']))
# Top-level training setup for a SuperPoint/pose-regression pipeline.
# NOTE(review): `poses_path`, `config`, and `device` are not defined in this
# chunk — presumably declared earlier in the file; confirm before running.
images_path = "/home/thuan/Desktop/visual_slam/Data_for_superglue/TUM_images_SuperGlue/sift/"
load_data = CRDataset_train(poses_path, images_path, device)
# load_data_test = CRDataset_test(poses_path, images_path, config, device)
model = md.MainModel(config['main_model']).train().to(device)
# SuperPoint feature extractor is frozen (eval mode) during training.
superpoint = SuperPoint(config.get('superpoint', {})).eval().to(device)
criterion = PoseNetCriterion().to(device)
#optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
optimizer_configs = {
    'method': 'adam',
    'base_lr': 1e-4,
    'weight_decay': 5e-4,
    'lr_decay': 1,
    # LR step milestones at quarters of the 400-epoch run.
    'lr_stepvalues': [k / 4 * 400 for k in range(1, 5)]
}
optimizer = Optimizer(model.parameters(), **optimizer_configs)
train_loader = DataLoader(load_data, batch_size=6, num_workers=0, shuffle=False)
# model.eval()
# model(load_data_test[0]["features"])
number_batch = len(train_loader)
his_losses = []
for epoch in range(400):
    # `optimizer.learner` is assumed to be the underlying torch optimizer.
    optimizer.learner.zero_grad()
    pbar = enumerate(train_loader)
    pbar = tqdm(pbar, total=number_batch)
    # NOTE(review): the epoch loop body appears truncated here — the batch
    # loop over `pbar` is missing from this view.
    count = 0
def main():
    """Train MovementNet on the audio→skeleton dataset.

    Parses CLI args, downloads the data, then runs a train/validation loop
    with best-model checkpointing and optional early stopping
    (enabled when ``args.early_stop_iter > 0``).
    """
    parser = parse()
    args = parser.parse_args()

    # Device: pin CUDA ordering/visibility before any tensors are created.
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    if torch.cuda.is_available():
        os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID'
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids
        gpu_ids = [i for i in range(len(args.gpu_ids.split(',')))]

    # Data
    download_data = Download()
    download_data.train_data()
    train_dataset = audio_skeleton_dataset(download_data.train_dst, 'train')
    val_dataset = audio_skeleton_dataset(download_data.train_dst, 'val')
    train_loader = DataLoader(train_dataset, batch_size=args.batch, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=args.batch, shuffle=False)

    # Model
    movement_net = MovementNet(
        args.d_input, args.d_output_body, args.d_output_rh, args.d_model,
        args.n_block, args.n_unet, args.n_attn, args.n_head, args.max_len,
        args.dropout, args.pre_lnorm, args.attn_type).to(device)
    if torch.cuda.is_available() and len(args.gpu_ids.split(',')) > 1:
        movement_net = nn.DataParallel(movement_net, device_ids=gpu_ids)
    optimizer = Optimizer(
        torch.optim.Adam(movement_net.parameters(),
                         betas=(0.9, 0.98),
                         eps=1e-09), 1.0, args.d_model, args.warmup_steps)

    #------------------------ START TRAINING ---------------------------------#
    print('Training... \n')
    # BUGFIX: counter and min_val_loss were previously initialized only when
    # early_stop_iter > 0, yet both are read unconditionally inside the epoch
    # loop — raising NameError whenever early stopping was disabled.
    counter = 0
    min_val_loss = float('inf')

    Epoch_train_loss = []
    Epoch_val_loss = []
    for e in range(args.epoch):
        print("epoch %d" % (e + 1))

        # Training stage
        movement_net.train()
        pose_loss = []
        for X_train, y_train, seq_len in train_loader:
            X_train, lengths = sort_sequences(X_train, seq_len)
            y_train, _ = sort_sequences(y_train, seq_len)
            # Zero targets mark padding; mask them out of the loss.
            mask = y_train != 0
            mask = mask.type('torch.FloatTensor').to(device)

            full_output = movement_net.forward(X_train, lengths)
            loss = L1_loss(full_output, y_train, mask[:, :, :1])

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(movement_net.parameters(), 1.)
            optimizer.step()
            pose_loss.append(loss.data.cpu().numpy())
        Epoch_train_loss.append(np.mean(pose_loss))
        print('train loss: ' + str(np.mean(pose_loss)))

        # Validation stage
        movement_net.eval()
        pose_loss = []
        with torch.no_grad():
            for X_val, y_val, seq_len in val_loader:
                X_val, lengths = sort_sequences(X_val, seq_len)
                y_val, _ = sort_sequences(y_val, seq_len)
                mask = y_val != 0
                mask = mask.type('torch.FloatTensor').to(device)

                full_output = movement_net.forward(X_val, lengths)
                loss = L1_loss(full_output, y_val, mask[:, :, :1])
                pose_loss.append(loss.data.cpu().numpy())
        Epoch_val_loss.append(np.mean(pose_loss))
        print('val loss: ' + str(np.mean(pose_loss)) + '\n')

        # Early stopping / best-model checkpointing.
        # BUGFIX: only trigger early stopping when it is actually enabled.
        if args.early_stop_iter > 0 and counter == args.early_stop_iter:
            print("------------------early stopping------------------\n")
            break
        else:
            if min_val_loss > np.mean(pose_loss):
                # New best validation loss: reset patience and checkpoint.
                min_val_loss = np.mean(pose_loss)
                counter = 0
                if not os.path.exists('checkpoint'):
                    os.makedirs('checkpoint')
                # Unwrap DataParallel before saving so the state dict keys
                # stay portable to single-GPU loading.
                if torch.cuda.is_available() and len(
                        args.gpu_ids.split(',')) > 1:
                    state_dict = movement_net.module.state_dict()
                else:
                    state_dict = movement_net.state_dict()
                torch.save(
                    {
                        'epoch': e + 1,
                        'model_state_dict': {
                            'movement_net': state_dict
                        },
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': min_val_loss
                    }, args.checkpoint)
            else:
                counter += 1