import os
import pickle
from datetime import datetime

import numpy as np
import torch
import tqdm
from torch.utils.tensorboard import SummaryWriter

# Project-local helpers assumed to be importable from the surrounding package:
# MinkLocParams, get_datetime, model_factory, create_weights_folder, make_loss,
# tensors_to_numbers, print_stats, evaluate, print_eval_stats, export_eval_stats


def do_train(dataloaders, params: MinkLocParams, debug=False, visualize=False):
    # Create the model
    s = get_datetime()
    model = model_factory(params)
    model_name = 'model_' + params.model_params.model + '_' + s
    print('Model name: {}'.format(model_name))
    weights_path = create_weights_folder()
    model_pathname = os.path.join(weights_path, model_name)
    if hasattr(model, 'print_info'):
        model.print_info()
    else:
        n_params = sum(param.nelement() for param in model.parameters())
        print('Number of model parameters: {}'.format(n_params))

    # Move the model to the proper device before configuring the optimizer
    if torch.cuda.is_available():
        device = "cuda"
        model.to(device)
    else:
        device = "cpu"
    print('Model device: {}'.format(device))

    loss_fn = make_loss(params)

    # Training elements
    if params.weight_decay is None or params.weight_decay == 0:
        optimizer = torch.optim.Adam(model.parameters(), lr=params.lr)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=params.lr,
                                     weight_decay=params.weight_decay)

    if params.scheduler is None:
        scheduler = None
    elif params.scheduler == 'CosineAnnealingLR':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=params.epochs + 1, eta_min=params.min_lr)
    elif params.scheduler == 'MultiStepLR':
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, params.scheduler_milestones, gamma=0.1)
    else:
        raise NotImplementedError('Unsupported LR scheduler: {}'.format(params.scheduler))

    ###########################################################################
    # Initialize TensorBoard writer
    ###########################################################################
    now = datetime.now()
    logdir = os.path.join("../tf_logs", now.strftime("%Y%m%d-%H%M%S"))
    writer = SummaryWriter(logdir)

    is_validation_set = 'val' in dataloaders
    if is_validation_set:
        phases = ['train', 'val']
    else:
        phases = ['train']

    # Training statistics
    stats = {'train': [], 'val': [], 'eval': []}

    for epoch in tqdm.tqdm(range(1, params.epochs + 1)):
        for phase in phases:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_stats = []  # running stats for the current epoch
            count_batches = 0

            for batch, positives_mask, negatives_mask in dataloaders[phase]:
                # batch is a dict of tensors produced by the collate function;
                # positives_mask and negatives_mask are boolean masks marking
                # positive/negative pairs among the elements of the batch
                count_batches += 1
                batch_stats = {}

                if debug and count_batches > 2:
                    break

                # Move everything to the device except 'coords' which must stay on CPU
                batch = {e: batch[e].to(device) if e != 'coords' else batch[e] for e in batch}

                n_positives = torch.sum(positives_mask).item()
                n_negatives = torch.sum(negatives_mask).item()
                if n_positives == 0 or n_negatives == 0:
                    # Skip a batch without positives or negatives
                    print('WARNING: Skipping batch without positive or negative examples')
                    continue

                optimizer.zero_grad()
                if visualize:
                    # visualize_batch(batch)
                    pass

                with torch.set_grad_enabled(phase == 'train'):
                    # Compute embeddings of all elements
                    embeddings = model(batch)
                    loss, temp_stats, _ = loss_fn(embeddings, positives_mask, negatives_mask)
                    temp_stats = tensors_to_numbers(temp_stats)
                    batch_stats.update(temp_stats)
                    batch_stats['loss'] = loss.item()

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                running_stats.append(batch_stats)
                torch.cuda.empty_cache()  # Prevent excessive GPU memory consumption by SparseTensors

            # ******* PHASE END *******
            # Compute mean stats for the epoch
            epoch_stats = {}
            for key in running_stats[0].keys():
                temp = [e[key] for e in running_stats]
                epoch_stats[key] = np.mean(temp)

            stats[phase].append(epoch_stats)
            print_stats(epoch_stats, phase)

        # ******* EPOCH END *******

        if scheduler is not None:
            scheduler.step()

        loss_metrics = {'train': stats['train'][-1]['loss']}
        if 'val' in phases:
            loss_metrics['val'] = stats['val'][-1]['loss']
        writer.add_scalars('Loss', loss_metrics, epoch)

        if 'num_triplets' in stats['train'][-1]:
            nz_metrics = {'train': stats['train'][-1]['num_non_zero_triplets']}
            if 'val' in phases:
                nz_metrics['val'] = stats['val'][-1]['num_non_zero_triplets']
            writer.add_scalars('Non-zero triplets', nz_metrics, epoch)
        elif 'num_pairs' in stats['train'][-1]:
            nz_metrics = {'train_pos': stats['train'][-1]['pos_pairs_above_threshold'],
                          'train_neg': stats['train'][-1]['neg_pairs_above_threshold']}
            if 'val' in phases:
                nz_metrics['val_pos'] = stats['val'][-1]['pos_pairs_above_threshold']
                nz_metrics['val_neg'] = stats['val'][-1]['neg_pairs_above_threshold']
            writer.add_scalars('Non-zero pairs', nz_metrics, epoch)

        if params.batch_expansion_th is not None:
            # Dynamic batch expansion
            epoch_train_stats = stats['train'][-1]
            if 'num_non_zero_triplets' not in epoch_train_stats:
                print('WARNING: Batch size expansion is enabled, but the loss function is not supported')
            else:
                # Ratio of non-zero triplets
                rnz = epoch_train_stats['num_non_zero_triplets'] / epoch_train_stats['num_triplets']
                if rnz < params.batch_expansion_th:
                    dataloaders['train'].batch_sampler.expand_batch()

    print('')

    # Save final model weights
    final_model_path = model_pathname + '_final.pth'
    torch.save(model.state_dict(), final_model_path)

    stats = {'train_stats': stats, 'params': params}

    # Evaluate the final model
    model.eval()
    final_eval_stats = evaluate(model, device, params)
    print('Final model:')
    print_eval_stats(final_eval_stats)
    stats['eval'] = {'final': final_eval_stats}
    print('')

    # Pickle training stats and parameters
    pickle_path = model_pathname + '_stats.pickle'
    with open(pickle_path, "wb") as f:
        pickle.dump(stats, f)

    # Append key experimental metrics to experiment summary file
    model_params_name = os.path.split(params.model_params.model_params_path)[1]
    config_name = os.path.split(params.params_path)[1]
    _, model_name = os.path.split(model_pathname)
    prefix = "{}, {}, {}".format(model_params_name, config_name, model_name)
    export_eval_stats("experiment_results.txt", prefix, final_eval_stats)
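# ---------------------------------------------------------------------------
# Note on dynamic batch expansion: do_train() calls
# dataloaders['train'].batch_sampler.expand_batch() whenever the ratio of
# non-zero triplets drops below params.batch_expansion_th. The sampler class
# itself is not shown in this file; the sketch below is a minimal, hypothetical
# illustration of the contract it must satisfy (the name ExpandableBatchSampler
# and the expansion_rate / max_batch_size parameters are assumptions, not the
# actual implementation).
# ---------------------------------------------------------------------------
import random


class ExpandableBatchSampler:
    """Yields lists of dataset indices; the batch size can grow during training."""

    def __init__(self, dataset_size, batch_size, max_batch_size, expansion_rate=1.4):
        self.dataset_size = dataset_size
        self.batch_size = batch_size
        self.max_batch_size = max_batch_size
        self.expansion_rate = expansion_rate

    def expand_batch(self):
        # Grow the batch so that in-batch mining can find hard triplets again
        if self.batch_size >= self.max_batch_size:
            return
        self.batch_size = min(self.max_batch_size,
                              int(self.expansion_rate * self.batch_size))
        print('Batch size increased to: {}'.format(self.batch_size))

    def __iter__(self):
        indices = list(range(self.dataset_size))
        random.shuffle(indices)
        for i in range(0, len(indices), self.batch_size):
            yield indices[i:i + self.batch_size]

    def __len__(self):
        return (self.dataset_size + self.batch_size - 1) // self.batch_size


# A sampler like this would be plugged in via
# DataLoader(dataset, batch_sampler=ExpandableBatchSampler(...), collate_fn=...).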
import os
from argparse import Namespace
from copy import deepcopy

import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils import parameters_to_vector
from torch.optim import lr_scheduler

# Project-local helpers assumed to be importable from the surrounding package:
# PairDataset, SourceDataset, Model, ChempropPredictor, LossFunction,
# load_model, save_model, train, validate, predict, evaluate,
# generate_self_train_translations


def main(args: Namespace):
    if args.unconditional:
        # shouldn't care about inputs in this case
        assert args.morgan_similarity_threshold == 0

    i2s = None
    if args.checkpoint_dir is not None:
        assert args.checkpoint_path is None
        # Pick up the last .pt checkpoint found under checkpoint_dir
        # (joining with the walk root so nested checkpoints resolve correctly)
        for root, _, files in os.walk(args.checkpoint_dir):
            for fname in files:
                if fname.endswith('.pt'):
                    args.checkpoint_path = os.path.join(root, fname)
    if args.checkpoint_path is not None:
        print('loading model from checkpoint')
        model, i2s = load_model(args)

    full_train_dataset = PairDataset(path=args.train_path,
                                     i2s=i2s,
                                     batch_size=args.batch_size,
                                     extra_vocab_path=args.extra_precursors_path,
                                     max_data=args.train_max_data)
    pair_datasets = full_train_dataset.split([0.9, 0.1], seed=0)
    train_dataset, val_dataset = pair_datasets[0], pair_datasets[1]
    predict_dataset = SourceDataset(path=args.val_path,
                                    i2s=train_dataset.i2s,
                                    s2i=train_dataset.s2i,
                                    pad_index=train_dataset.pad_index,
                                    start_index=train_dataset.start_index,
                                    end_index=train_dataset.end_index,
                                    batch_size=args.batch_size)

    if args.checkpoint_path is None:
        print('building model from scratch')
        model = Model(args=args,
                      vocab_size=len(train_dataset.i2s),
                      pad_index=train_dataset.pad_index,
                      start_index=train_dataset.start_index,
                      end_index=train_dataset.end_index)
        # Zero biases, Xavier-initialize weight matrices
        for param in model.parameters():
            if param.dim() == 1:
                nn.init.constant_(param, 0)
            else:
                nn.init.xavier_normal_(param)

    print(model)
    print('num params: {:,}'.format(
        sum(p.numel() for p in model.parameters() if p.requires_grad)))
    model = model.cuda()

    chemprop_predictor = ChempropPredictor(args)
    criterion = LossFunction(train_dataset.pad_index, args.kl_weight)
    optimizer = optim.Adam(model.parameters(), lr=args.init_lr)
    scheduler = lr_scheduler.ExponentialLR(optimizer, 0.9)

    for epoch in range(args.epochs):
        print('epoch {}'.format(epoch))
        train_dataset.reshuffle(seed=epoch)
        train(model=model,
              train_dataset=train_dataset,
              criterion=criterion,
              optimizer=optimizer,
              max_grad_norm=args.max_grad_norm)
        val_loss = validate(model=model, val_dataset=val_dataset, criterion=criterion)
        os.makedirs(os.path.join(args.save_dir, 'epoch' + str(epoch)), exist_ok=True)
        train_dataset.save(os.path.join(args.save_dir, 'epoch' + str(epoch), 'train_pairs.csv'))
        save_model(model=model,
                   i2s=train_dataset.i2s,
                   path=os.path.join(args.save_dir, 'epoch' + str(epoch),
                                     'val_loss_{}.pt'.format(val_loss)))
        predict(model=model,
                predict_dataset=predict_dataset,
                save_dir=os.path.join(args.save_dir, 'epoch' + str(epoch)),
                args=args,
                chemprop_predictor=chemprop_predictor if not args.no_predictor_at_val else None,
                sample=not args.greedy_prediction,
                num_predictions=args.val_num_predictions,
                print_filter_frac=args.print_filter_frac)
        if epoch % args.evaluate_every == 0:
            evaluate(pred_smiles_dir=os.path.join(args.save_dir, 'epoch' + str(epoch)),
                     train_path=args.train_path,
                     val_path=args.val_path,
                     checkpoint_dir=args.chemprop_dir,
                     computed_prop=args.computed_prop,
                     prop_min=args.prop_min,
                     sim_thresholds=[0.2, 0.4, 0.6, 0.8, 0.9, 1.0],
                     chemprop_predictor=chemprop_predictor,
                     prop_max=args.prop_max,
                     unconditional=args.unconditional)
        scheduler.step()

    if args.self_train_epochs > 0:
        # Store parameters of the current model for a loss that constrains the
        # self-trained model not to stray too far from them
        original_parameter_vector = parameters_to_vector(model.parameters()).data
        parameter_crit = nn.MSELoss()
        args.epoch_length = len(train_dataset.src) // 2
        # Get properties of target molecules in the train set
        train_dataset.tgt_props = np.array(chemprop_predictor(train_dataset.tgt_smiles))
        augmented_train_dataset = deepcopy(train_dataset)
        epochs_to_dataset_creation = 0
        for epoch in range(args.epochs, args.epochs + args.self_train_epochs):
            print('self train epoch {}'.format(epoch))
            if epochs_to_dataset_creation == 0:
                # Build a fresh self-training dataset every args.epochs_per_dataset epochs
                train_dataset.reshuffle(seed=epoch)
                if args.self_train_max_data is not None:
                    self_train_dataset = deepcopy(train_dataset)
                    self_train_dataset.src = self_train_dataset.src[:args.self_train_max_data]
                    self_train_dataset.tgt = self_train_dataset.tgt[:args.self_train_max_data]
                    self_train_dataset.src_smiles = self_train_dataset.src_smiles[:args.self_train_max_data]
                    self_train_dataset.tgt_smiles = self_train_dataset.tgt_smiles[:args.self_train_max_data]
                    if hasattr(self_train_dataset, 'src_props'):
                        self_train_dataset.src_props = self_train_dataset.src_props[:args.self_train_max_data]
                    if hasattr(self_train_dataset, 'tgt_props'):
                        self_train_dataset.tgt_props = self_train_dataset.tgt_props[:args.self_train_max_data]
                else:
                    self_train_dataset = deepcopy(train_dataset)
                if args.extra_precursors_path is not None:
                    self_train_dataset.add_dummy_pairs(args.extra_precursors_path)

                translations, props = generate_self_train_translations(
                    train_dataset=self_train_dataset,
                    model=model,
                    chemprop_predictor=chemprop_predictor,
                    args=args,
                    k=args.k)

                if not args.keep_translations:
                    # Drop old translations and restart
                    augmented_train_dataset = deepcopy(self_train_dataset)

                if args.unconditional:
                    new_train_dataset = deepcopy(self_train_dataset)
                    new_train_dataset.tgt_smiles = translations
                    new_train_dataset.tgt = np.array(
                        [list(self_train_dataset.smiles2indices(smiles))
                         for smiles in new_train_dataset.tgt_smiles])
                    new_train_dataset.src_smiles = translations  # any dummy is fine
                    new_train_dataset.src = np.array(
                        [list(self_train_dataset.smiles2indices(smiles))
                         for smiles in new_train_dataset.src_smiles])
                else:
                    # Pair each source with its k generated translations
                    new_train_dataset = deepcopy(self_train_dataset)
                    new_train_dataset.src = np.concatenate(
                        [self_train_dataset.src for _ in range(args.k)])
                    new_train_dataset.src_smiles = []
                    for _ in range(args.k):
                        new_train_dataset.src_smiles += self_train_dataset.src_smiles
                    new_train_dataset.tgt = []
                    for i in range(args.k):
                        new_train_dataset.tgt += [translations[j][i]
                                                  for j in range(len(translations))]
                    new_train_dataset.tgt_smiles = [
                        self_train_dataset.indices2smiles(indices)
                        for indices in new_train_dataset.tgt]
                    new_train_dataset.tgt = np.array(new_train_dataset.tgt)

                if args.replace_old_dataset:
                    augmented_train_dataset = new_train_dataset
                else:
                    augmented_train_dataset.add(new_train_dataset)
                if not args.unconditional:
                    augmented_train_dataset.filter_dummy_pairs(need_props=False)  # filters src == tgt pairs
                epochs_to_dataset_creation = args.epochs_per_dataset

            augmented_train_dataset.reshuffle(seed=epoch, need_props=False)
            epochs_to_dataset_creation -= 1
            train(model=model,
                  train_dataset=augmented_train_dataset,
                  criterion=criterion,
                  optimizer=optimizer,
                  max_grad_norm=args.max_grad_norm,
                  original_parameter_vector=original_parameter_vector,
                  parameter_crit=parameter_crit,
                  parameter_crit_weight=args.l2_diff_weight)
            val_loss = validate(model=model, val_dataset=val_dataset, criterion=criterion)
            os.makedirs(os.path.join(args.save_dir, 'epoch' + str(epoch)), exist_ok=True)
            augmented_train_dataset.save(
                os.path.join(args.save_dir, 'epoch' + str(epoch), 'train_pairs.csv'))
            save_model(model=model,
                       i2s=train_dataset.i2s,
                       path=os.path.join(args.save_dir, 'epoch' + str(epoch),
                                         'val_loss_{}.pt'.format(val_loss)))
            predict(model=model,
                    predict_dataset=predict_dataset,
                    save_dir=os.path.join(args.save_dir, 'epoch' + str(epoch)),
                    args=args,
                    chemprop_predictor=chemprop_predictor if not args.no_predictor_at_val else None,
                    sample=not args.greedy_prediction,
                    num_predictions=args.val_num_predictions,
                    print_filter_frac=args.print_filter_frac)
            evaluate(pred_smiles_dir=os.path.join(args.save_dir, 'epoch' + str(epoch)),
                     train_path=args.train_path,
                     val_path=args.val_path,
                     checkpoint_dir=args.chemprop_dir,
                     computed_prop=args.computed_prop,
                     prop_min=args.prop_min,
                     sim_thresholds=[0.2, 0.4, 0.6, 0.8, 0.9, 1.0],
                     chemprop_predictor=chemprop_predictor,
                     prop_max=args.prop_max,
                     unconditional=args.unconditional)
            scheduler.step()

    # For convenient evaluation
    os.makedirs(os.path.join(args.save_dir, 'final_eval'), exist_ok=True)
    test_dataset = SourceDataset(path=args.test_path,
                                 i2s=train_dataset.i2s,
                                 s2i=train_dataset.s2i,
                                 pad_index=train_dataset.pad_index,
                                 start_index=train_dataset.start_index,
                                 end_index=train_dataset.end_index,
                                 batch_size=args.batch_size)
    predict(model=model,
            predict_dataset=test_dataset,
            save_dir=os.path.join(args.save_dir, 'final_eval'),
            args=args,
            chemprop_predictor=chemprop_predictor if not args.no_predictor_at_val else None,
            sample=not args.greedy_prediction,
            num_predictions=args.val_num_predictions,
            print_filter_frac=args.print_filter_frac)
    if args.final_eval_chemprop_dir is not None:
        args.computed_prop = None
        args.chemprop_dir = args.final_eval_chemprop_dir
        chemprop_predictor = ChempropPredictor(args)
    if args.final_eval_computed_prop is not None:
        args.chemprop_dir = None
        args.computed_prop = args.final_eval_computed_prop
        chemprop_predictor = ChempropPredictor(args)
    evaluate(pred_smiles_dir=os.path.join(args.save_dir, 'final_eval'),
             train_path=args.train_path,
             val_path=args.test_path,
             checkpoint_dir=args.chemprop_dir,
             computed_prop=args.computed_prop,
             prop_min=args.prop_min,
             sim_thresholds=[0.2, 0.4, 0.6, 0.8, 0.9, 1.0],
             chemprop_predictor=chemprop_predictor,
             prop_max=args.prop_max,
             unconditional=args.unconditional)
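# ---------------------------------------------------------------------------
# Note on the parameter-anchoring term in the self-training phase: main()
# snapshots the pre-self-training weights (original_parameter_vector) and
# passes them to train() together with an nn.MSELoss and a weight
# (args.l2_diff_weight). The train() implementation is not shown in this file;
# the sketch below is a hypothetical illustration, under that assumption, of
# how such a penalty is typically folded into the loss of one training step
# (the function name training_step_loss_with_anchor is an assumption).
# ---------------------------------------------------------------------------
import torch
import torch.nn as nn
from torch.nn.utils import parameters_to_vector


def training_step_loss_with_anchor(model: nn.Module,
                                   batch_loss: torch.Tensor,
                                   original_parameter_vector: torch.Tensor,
                                   parameter_crit=nn.MSELoss(),
                                   parameter_crit_weight: float = 1.0) -> torch.Tensor:
    """Add an L2 penalty keeping the model close to its pre-self-training weights."""
    current_parameter_vector = parameters_to_vector(model.parameters())
    # MSE between the flattened current weights and the frozen snapshot;
    # gradients flow only through current_parameter_vector
    anchor_loss = parameter_crit(current_parameter_vector, original_parameter_vector)
    return batch_loss + parameter_crit_weight * anchor_loss


# Keeping the self-trained model near its supervised starting point guards
# against drift when training on its own (noisier) generated pairs.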