def test_save_load(self, data):
    """Check that a dataset survives a ``save``/``load`` round trip.

    Builds a ``NumpyTupleDataset`` from ``data``, saves it to a temporary
    ``.npz`` file, loads it back and asserts that both datasets hold the
    same number of arrays with equal contents.

    Args:
        data: Iterable of arrays that is unpacked into ``NumpyTupleDataset``.
    """
    # Local import: the visible part of the file does not show ``shutil``
    # being imported at module level.
    import shutil

    tmp_dir = tempfile.mkdtemp()
    try:
        tmp_cache_path = os.path.join(tmp_dir, 'tmp.npz')
        dataset = NumpyTupleDataset(*data)
        NumpyTupleDataset.save(tmp_cache_path, dataset)
        assert os.path.exists(tmp_cache_path)
        load_dataset = NumpyTupleDataset.load(tmp_cache_path)
    finally:
        # mkdtemp() leaves cleanup to the caller; remove the directory and
        # the cache file in it even when a step above raises.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    assert len(dataset._datasets) == len(load_dataset._datasets)
    for a, d in six.moves.zip(dataset._datasets, load_dataset._datasets):
        numpy.testing.assert_array_equal(a, d)
def main():
    """Launcher.

    Preprocesses QM9 with the ``nfp`` preprocessor, round-trips the dataset
    through an ``.npz`` cache under ``data/``, splits it 70/30 with a fixed
    seed, prints both split sizes and builds an NFP + MLP predictor.
    """
    preprocessor = preprocess_method_dict["nfp"]()
    # The label was the mangled literal "h**o"; the QM9 property is "homo".
    dataset = datasets.get_qm9(preprocessor, labels="homo")

    cache_dir = "data/"
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    # Build the cache path once so save and load cannot drift apart.
    cache_path = os.path.join(cache_dir, "data.npz")
    NumpyTupleDataset.save(cache_path, dataset)
    dataset = NumpyTupleDataset.load(cache_path)

    train_data_ratio = 0.7
    train_data_size = int(len(dataset) * train_data_ratio)
    train, validation = split_dataset_random(dataset, train_data_size, 777)
    print('train dataset size:', len(train))
    print('validation dataset size:', len(validation))

    n_unit = 16
    conv_layers = 4
    model = GraphConvPredictor(NFP(n_unit, n_unit, conv_layers),
                               MLP(n_unit, 1))
def main():
    """Train a graph-convolution regressor on QM9 from the command line.

    Parses the command-line options, loads the preprocessed dataset from
    ``input/<method>_<label>/data.npz`` (preprocessing and caching it when
    the file is missing), optionally standardizes the labels, builds the
    network selected by ``--method`` and runs a chainer ``Trainer``.

    Raises:
        ValueError: If ``method`` is not one of the supported networks.
    """
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    # 'homo' was the mangled literal 'h**o', which made ``--label homo``
    # impossible to select.
    label_names = [
        'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve',
        'U0', 'U', 'H', 'G', 'Cv'
    ]
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='Regression with QM9.')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp')
    parser.add_argument('--label', '-l', type=str, choices=label_names,
                        default='',
                        help='target label for regression, '
                             'empty string means to predict all '
                             'property at once')
    parser.add_argument('--scale', type=str, choices=scale_list,
                        default='standardize', help='Label scaling method')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_all'.format(method))
        class_num = len(D.get_qm9_label_names())

    # Dataset preparation
    # Test for the cache *file*: an existing directory without data.npz
    # must fall through to preprocessing instead of a failed load.
    cache_path = os.path.join(cache_dir, 'data.npz')
    dataset = None
    if os.path.exists(cache_path):
        print('load from cache {}'.format(cache_path))
        dataset = NumpyTupleDataset.load(cache_path)
    if dataset is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)
        # The directory may already exist (e.g. left by an interrupted
        # run); an unconditional makedirs would raise in that case.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(cache_path, dataset)

    if args.scale == 'standardize':
        # Standard Scaler for labels
        ss = StandardScaler()
        labels = ss.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*dataset.get_datasets()[:-1], labels)

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    # Network
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    if method == 'nfp':
        print('Train NFP model...')
        model = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        model = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        model = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit,
                   n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        model = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels, hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer, n_atom=n_atom),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        model = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = I.SerialIterator(train, args.batchsize)
    val_iter = I.SerialIterator(val, args.batchsize,
                                repeat=False, shuffle=False)

    def scaled_abs_error(x0, x1):
        """Return the mean absolute error of the first label column.

        With ``--scale standardize`` both inputs are mapped back through
        the fitted scaler before the difference is taken.
        """
        # Unwrap Variables and move both operands to host memory once.
        x0 = cuda.to_cpu(x0.data if isinstance(x0, Variable) else x0)
        x1 = cuda.to_cpu(x1.data if isinstance(x1, Variable) else x1)
        if args.scale == 'standardize':
            x0 = ss.inverse_transform(x0)
            x1 = ss.inverse_transform(x1)
        diff = x0 - x1
        return numpy.mean(numpy.absolute(diff), axis=0)[0]

    classifier = L.Classifier(model, lossfun=F.mean_squared_error,
                              accfun=scaled_abs_error)
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        classifier.to_gpu()

    optimizer = O.Adam()
    optimizer.setup(classifier)

    updater = training.StandardUpdater(train_iter, optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        E.Evaluator(val_iter, classifier, device=args.gpu,
                    converter=concat_mols))
    # Take a single snapshot, at the final epoch.
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'validation/main/loss',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()
if __name__ == "__main__":
    # Command-line options: dataset location and atomic-number list file.
    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--path", type=str, default=None,
                        help="path to your dataset")
    parser.add_argument("-a", "--atomic_num_path", type=str,
                        default=atomic_num_list_path,
                        help="path to your atomic num list file")
    args = parser.parse_args()

    dataset = NumpyTupleDataset.load(args.path)
    atomic_num_list = load_id_to_atomic_num(args.atomic_num_path)
    periodic_table = load_periodic_table()

    # Count how often each element symbol occurs across the dataset.
    result_dict = {}
    for atom_ids, _ in tqdm(dataset):
        for atom_id in atom_ids.astype(int):
            atomic_num = atomic_num_list[atom_id]
            # Entries that map to a non-positive atomic number are skipped.
            if not atomic_num > 0:
                continue
            symbol = periodic_table.loc[atomic_num]["symbol"]
            result_dict[symbol] = result_dict.get(symbol, 0) + 1

    # One left-aligned "symbol count" row per element.
    for symbol_name, count in result_dict.items():
        print("{:<5}{:<5}".format(symbol_name, count))
def main(input_args=None):
    """Predict and evaluate reaction-condition labels on a test dataset.

    Selects the dataset configuration from ``args.data_name``, loads (or
    parses and caches) the test data, restores or builds the classifier,
    prints a sample of predictions against the ground truth, runs a chainer
    ``Evaluator`` and writes ``eval_result.json``, ``test_ids.json``,
    ``pred.pkl`` and ``gt.pkl`` into ``args.in_dir``.

    Args:
        input_args: Forwarded to ``parse_arguments``.

    Raises:
        ValueError: If ``args.data_name`` is not a known dataset.
    """
    # Parse the arguments.
    args = parse_arguments(input_args)
    device = args.gpu
    method = args.method

    if args.data_name == 'suzuki':
        datafile = 'data/suzuki_type_test_v2.csv'
        class_num = 119
        class_dict = {'M': 28, 'L': 23, 'B': 35, 'S': 10, 'A': 17}
        dataset_filename = 'test_data.npz'
        labels = ['Yield', 'M', 'L', 'B', 'S', 'A', 'id']
    elif args.data_name == 'CN':
        datafile = 'data/CN_coupling_test.csv'
        class_num = 206
        class_dict = {'M': 44, 'L': 47, 'B': 13, 'S': 22, 'A': 74}
        dataset_filename = 'test_CN_data.npz'
        labels = ['Yield', 'M', 'L', 'B', 'S', 'A', 'id']
    elif args.data_name == 'Negishi':
        datafile = 'data/Negishi_test.csv'
        class_num = 106
        class_dict = {'M': 32, 'L': 20, 'T': 8, 'S': 10, 'A': 30}
        dataset_filename = 'test_Negishi_data.npz'
        labels = ['Yield', 'M', 'L', 'T', 'S', 'A', 'id']
    elif args.data_name == 'PKR':
        datafile = 'data/PKR_test.csv'
        class_num = 83
        class_dict = {
            'M': 18, 'L': 6, 'T': 7, 'S': 15, 'A': 11, 'G': 1, 'O': 13,
            'P': 4, 'other': 1
        }
        dataset_filename = 'test_PKR_data.npz'
        labels = [
            'Yield', 'M', 'L', 'T', 'S', 'A', 'G', 'O', 'P', 'other', 'id'
        ]
    else:
        raise ValueError('Unexpected dataset name')

    cache_dir = os.path.join('input', '{}_all'.format(method))

    # Dataset preparation.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)
    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached dataset from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        # Only announce preprocessing when it actually happens.
        print('Preprocessing dataset...')
        if method == 'mpnn':
            # 'mpnn' reuses the 'ggnn' preprocessor.
            preprocessor = preprocess_method_dict['ggnn']()
        else:
            preprocessor = preprocess_method_dict[method]()
        parser = CSVFileParser(
            preprocessor, postprocess_label=postprocess_label,
            labels=labels,
            smiles_col=['Reactant1', 'Reactant2', 'Product'],
            label_dicts=class_dict)
        dataset = parser.parse(datafile)['dataset']

        # Cache the loaded dataset.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Re-pack the dataset: the last array supplies yield (column 0) and id
    # (column 1); the second-to-last array supplies the labels.
    labels = dataset.get_datasets()[-2]
    ids = dataset.get_datasets()[-1][:, 1].reshape(-1, 1)
    yields = dataset.get_datasets()[-1][:, 0].reshape(-1, 1).astype(
        'float32')  # [:,0] added
    dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-2] + (
        yields,
        labels,
    )))

    test = dataset

    print('Predicting...')
    # Set up the classifier: restore a pickled one when available,
    # otherwise build a fresh one.
    model_path = os.path.join(args.in_dir, args.model_filename)
    if os.path.exists(model_path):
        classifier = Classifier.load_pickle(model_path, device=device)
    else:
        predictor = set_up_predictor(args.method, args.unit_num,
                                     args.conv_layers, class_num)
        classifier = Classifier(predictor,
                                lossfun=F.sigmoid_cross_entropy,
                                metrics_fun=F.binary_accuracy,
                                device=device)

    # NOTE(review): the collapsed source does not show whether the next
    # statements sat inside the ``else`` above; they are kept at function
    # level here. Confirm against the original file.
    if args.load_modelname:
        serializers.load_npz(args.load_modelname, classifier)
    scaled_predictor = ScaledGraphConvPredictor(
        graph_conv=classifier.predictor.graph_conv,
        mlp=classifier.predictor.mlp)
    classifier.predictor = scaled_predictor

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    # Predict the output labels.
    # Prediction function rewrite!!!
    y_pred = classifier.predict(test, converter=extract_inputs)
    y_pred_max = numpy.argmax(y_pred, axis=1)
    y_pred_max = y_pred_max.reshape(-1, 1)
    # y_pred_idx = y_pred.argsort(axis=1)  # ascending

    # Extract the ground-truth labels.
    t = concat_mols(test, device=-1)[-1]  # device 11/14 memory issue
    original_t = cuda.to_cpu(t)
    t_idx = original_t.squeeze(1)
    t_idx = t_idx.argsort(axis=1)
    # gt_indx = numpy.where(original_t == 1)

    # Construct dataframe.
    df_dict = {}
    for l in labels[:1]:
        df_dict.update({
            'y_pred_{}'.format(l): y_pred_max[:, -1].tolist(),  # [:,-1]
            't_{}'.format(l): t_idx[:, -1].tolist(),
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10
    for target_label in range(y_pred_max.shape[1]):
        label_name = labels[:1][0][target_label]
        print('label_name = {}, y_pred = {}, t = {}'.format(
            label_name, y_pred_max[:n_eval, target_label],
            t_idx[:n_eval, -1]))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, classifier,
                            converter=concat_mols, device=device)()
    print('Evaluation result: ', eval_result)

    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(eval_result, f)

    res_dic = {i: str(ids[i]) for i in range(len(y_pred))}

    # Write every artefact through a context manager so the file handles
    # are flushed and closed deterministically.
    with open(os.path.join(args.in_dir, "test_ids.json"), "w") as f:
        json.dump(res_dic, f)
    with open(os.path.join(args.in_dir, "pred.pkl"), "wb") as f:
        pickle.dump(y_pred, f)
    with open(os.path.join(args.in_dir, "gt.pkl"), "wb") as f:
        pickle.dump(original_t, f)
def main():
    """Predict QM9 labels with a trained regressor and evaluate it.

    Loads (or preprocesses and caches) the dataset, optionally loads the
    pickled scaler, re-creates the train/test split, wraps the regressor's
    predictor in a ``ScaledGraphConvPredictor``, prints a sample of
    predictions against the ground truth and writes the evaluator result
    to ``eval_result.json`` in ``args.in_dir``.
    """
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the amount
    # of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        if num_data >= 0:
            # Select the first `num_data` samples so the cached file
            # really holds what its `data_<num_data>.npz` name promises.
            target_index = numpy.arange(num_data)
            dataset = D.get_qm9(preprocessor, labels=labels,
                                target_index=target_index)
        else:
            # Load the entire dataset.
            dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset. makedirs (not mkdir) is
        # needed because the parent 'input' directory may not exist yet.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Load the standard scaler parameters, if necessary.
    if args.scale == 'standardize':
        scaler_path = os.path.join(args.in_dir, 'scaler.pkl')
        print('Loading scaler parameters from {}.'.format(scaler_path))
        # NOTE: unpickling can execute arbitrary code; only point `in_dir`
        # at files produced by a trusted training run.
        with open(scaler_path, mode='rb') as f:
            scaler = pickle.load(f)
    else:
        print('No standard scaling was selected.')
        scaler = None

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=args.gpu)

    # Replace the default predictor with one that scales the output labels.
    scaled_predictor = ScaledGraphConvPredictor(regressor.predictor)
    scaled_predictor.scaler = scaler
    regressor.predictor = scaled_predictor

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(test, converter=extract_inputs)

    # Extract the ground-truth labels.
    t = concat_mols(test, device=-1)[-1]
    n_eval = 10

    # Construct dataframe.
    # NOTE(review): if `args.label` is a plain string, this loop iterates
    # over its characters - confirm what `parse_arguments` returns.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred[:, i],
            't_{}'.format(l): t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    for target_label in range(y_pred.shape[1]):
        diff = y_pred[:n_eval, target_label] - t[:n_eval, target_label]
        print('target_label = {}, y_pred = {}, t = {}, diff = {}'.format(
            target_label, y_pred[:n_eval, target_label],
            t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, regressor,
                            converter=concat_mols, device=args.gpu)()

    # Prevents the loss function from becoming a cupy.core.core.ndarray object
    # when using the GPU. This hack will be removed as soon as the cause of
    # the issue is found and properly fixed.
    # `ndarray.item()` replaces `numpy.asscalar`, which newer NumPy
    # releases no longer provide.
    loss = numpy.asarray(cuda.to_cpu(eval_result['main/loss'])).item()
    eval_result['main/loss'] = loss
    print('Evaluation result: ', eval_result)

    # Save the evaluation results.
    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(eval_result, f)
def main():
    """Train a QM9 regressor and pickle the trained model.

    Resolves the label set and cache location from the parsed arguments,
    loads the cached dataset or preprocesses and caches it, optionally fits
    a ``StandardScaler`` on the labels, splits the data into train and
    validation parts, trains with ``run_train`` and saves the regressor to
    ``args.out``.
    """
    args = parse_arguments()
    method = args.method
    num_data = args.num_data

    # 'all' means every QM9 label; anything else is a specific selection.
    if args.label == 'all':
        labels = None
        cache_dir = os.path.join('input', '{}_all'.format(method))
        class_num = len(D.get_qm9_label_names())
    else:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1

    # The cache file name encodes how many samples were requested.
    dataset_filename = ('data_{}.npz'.format(num_data)
                        if num_data >= 0 else 'data.npz')
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    # Try the cache first.
    if os.path.exists(dataset_cache_path):
        print('Loading cached dataset from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    else:
        dataset = None

    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()

        qm9_kwargs = {'labels': labels}
        if num_data >= 0:
            # Restrict parsing to the first `num_data` samples.
            qm9_kwargs['target_index'] = numpy.arange(num_data)
        dataset = D.get_qm9(preprocessor, **qm9_kwargs)

        # Store the freshly parsed dataset for later runs.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Fit a label scaler when standardization was requested.
    if args.scale == 'standardize':
        print('Fit StandardScaler to the labels.')
        scaler = StandardScaler()
        scaler.fit(dataset.get_datasets()[-1])
    else:
        print('No standard scaling was selected.')
        scaler = None

    # Random train/validation split with a fixed seed.
    n_train = int(len(dataset) * args.train_data_ratio)
    train, valid = split_dataset_random(dataset, n_train, args.seed)

    # Predictor; the scaler (possibly None) is handed to the factory.
    predictor = set_up_predictor(method, args.unit_num, args.conv_layers,
                                 class_num, scaler)

    # Regressor with MSE loss, reporting MAE and RMSE.
    device = chainer.get_device(args.device)
    regressor = Regressor(
        predictor,
        lossfun=F.mean_squared_error,
        metrics_fun={'mae': F.mean_absolute_error, 'rmse': rmse},
        device=device)

    print('Training...')
    run_train(regressor, train, valid=valid,
              batch_size=args.batchsize, epoch=args.epoch, out=args.out,
              extensions_list=None, device=device, converter=concat_mols,
              resume_path=None)

    # Persist the trained regressor.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    regressor.save_pickle(model_path, protocol=args.protocol)
def train():
    """Train a GraphNVP model on QM9 or ZINC250k.

    Parses the command-line arguments, selects the dataset-specific
    transform and channel sizes, splits the data into train and test parts,
    builds the model and runs a chainer ``Trainer``. The hyperparameters
    are saved before training and the model weights afterwards, both under
    ``args.save_dir``.

    Raises:
        ValueError: If ``args.data_name`` is neither ``'qm9'`` nor
            ``'zinc250k'``.
    """
    parser = argparser.get_parser()
    args = parser.parse_args()

    # Any negative GPU id means "run on CPU" and is normalized to -1.
    device = args.gpu if args.gpu >= 0 else -1
    debug = args.debug

    print('input args:\n', json.dumps(vars(args), indent=4,
                                      separators=(',', ':')))  # pretty print args

    if args.data_name == 'qm9':
        from data import transform_qm9
        transform_fn = transform_qm9.transform_fn
        atomic_num_list = [6, 7, 8, 9, 0]
        mlp_channels = [256, 256]
        gnn_channels = {'gcn': [8, 64], 'hidden': [128, 64]}
        valid_idx = transform_qm9.get_val_ids()
    elif args.data_name == 'zinc250k':
        transform_fn = transform_fn_zinc250k
        atomic_num_list = zinc250_atomic_num_list
        mlp_channels = [1024, 512]
        gnn_channels = {'gcn': [16, 128], 'hidden': [256, 64]}
        valid_idx = transform_zinc250k.get_val_ids()
    else:
        # Fail here with a clear message instead of an unbound-local error
        # further down.
        raise ValueError(
            'Unexpected dataset name: {}'.format(args.data_name))

    dataset = NumpyTupleDataset.load(
        os.path.join(args.data_dir, args.data_file))
    dataset = TransformDataset(dataset, transform_fn)

    if len(valid_idx) > 0:
        # Set membership keeps this filter linear in the dataset size;
        # testing against the raw id sequence is quadratic.
        valid_idx_set = set(valid_idx)
        train_idx = [t for t in range(len(dataset))
                     if t not in valid_idx_set]
        n_train = len(train_idx)
        # Order = train ids followed by validation ids, split at n_train.
        train_idx.extend(valid_idx)
        train, test = chainer.datasets.split_dataset(dataset, n_train,
                                                     train_idx)
    else:
        train, test = chainer.datasets.split_dataset_random(
            dataset, int(len(dataset) * 0.8), seed=args.seed)

    train_iter = chainer.iterators.SerialIterator(train, args.batch_size)

    num_masks = {
        'node': args.num_node_masks,
        'channel': args.num_channel_masks
    }
    mask_size = {
        'node': args.node_mask_size,
        'channel': args.channel_mask_size
    }
    num_coupling = {
        'node': args.num_node_coupling,
        'channel': args.num_channel_coupling
    }
    model_params = Hyperparameters(
        args.num_atoms, args.num_rels, len(atomic_num_list),
        num_masks=num_masks, mask_size=mask_size,
        num_coupling=num_coupling,
        batch_norm=args.apply_batch_norm,
        additive_transformations=args.additive_transformations,
        learn_dist=args.learn_dist,
        mlp_channels=mlp_channels, gnn_channels=gnn_channels)
    model = GraphNvpModel(model_params)

    if device >= 0:
        chainer.cuda.get_device(device).use()
        model.to_gpu(device)

    print('==========================================')
    if device >= 0:
        print('Using GPUs')
    print('Num Minibatch-size: {}'.format(args.batch_size))
    print('Num epoch: {}'.format(args.max_epochs))
    print('==========================================')

    os.makedirs(args.save_dir, exist_ok=True)
    model.save_hyperparams(
        os.path.join(args.save_dir, 'graphnvp-params.json'))

    opt = chainer.optimizers.Adam()
    opt.setup(model)
    updater = MolNvpUpdater(train_iter, opt, device=device, loss_func=None)
    trainer = training.Trainer(updater, (args.max_epochs, 'epoch'),
                               out=args.save_dir)

    # trainer.extend(extensions.dump_graph('log_likelihood'))

    def print_validity(t):
        """Generate molecules and save the valid ones as PNG images."""
        adj, x = generate_mols(model, batch_size=100, gpu=device)
        valid_mols = check_validity(adj, x, atomic_num_list,
                                    device)['valid_mols']
        mol_dir = os.path.join(args.save_dir,
                               'generated_{}'.format(t.updater.epoch))
        # mol_dir = os.path.join(args.save_dir, 'generated_{}'.format(t.updater.iteration))
        os.makedirs(mol_dir, exist_ok=True)
        for ind, mol in enumerate(valid_mols):
            save_mol_png(mol, os.path.join(mol_dir, '{}.png'.format(ind)))

    if debug:
        # trainer.extend(print_validity, trigger=(1, 'epoch'))
        trainer.extend(print_validity, trigger=(100, 'iteration'))

    # -1 means "snapshot only once, at the final epoch".
    save_epochs = args.save_epochs
    if save_epochs == -1:
        save_epochs = args.max_epochs

    trainer.extend(extensions.snapshot(), trigger=(save_epochs, 'epoch'))
    # trainer.extend(extensions.PlotReport(['log_likelihood'], 'epoch', file_name='qm9.png'),
    #                trigger=(100, 'iteration'))
    trainer.extend(
        extensions.PrintReport(
            ['epoch', 'log_likelihood', 'nll_x', 'nll_adj',
             'elapsed_time']))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.ProgressBar())

    # Resume the trainer state from a snapshot when requested.
    if args.load_params == 1:
        chainer.serializers.load_npz(args.load_snapshot, trainer)

    trainer.run()
    chainer.serializers.save_npz(
        os.path.join(args.save_dir, 'graph-nvp-final.npz'), model)
def main():
    """Evaluate a trained MoleculeNet model on its test split.

    Loads the cached test split (or obtains it through
    ``download_entire_dataset``), restores the pickled classifier or
    regressor matching the dataset's task type, runs an ``Evaluator`` and
    saves the result as JSON in the model directory. Classification models
    are additionally scored with a ``ROCAUCEvaluator``.

    Raises:
        ValueError: If the task type is neither classification nor
            regression.
    """
    args = parse_arguments()

    # Frequently used settings.
    dataset_name, method, num_data = args.dataset, args.method, args.num_data

    labels = args.label if args.label else None
    if labels is None:
        cache_name = '{}_{}_all'.format(dataset_name, method)
    else:
        cache_name = '{}_{}_{}'.format(dataset_name, method, labels)
    cache_dir = os.path.join('input', cache_name)

    # Prefer the cached test split; otherwise fetch the dataset and keep
    # only its third part.
    path = os.path.join(cache_dir, dataset_part_filename('test', num_data))
    if not os.path.exists(path):
        _, _, test = download_entire_dataset(dataset_name, num_data, labels,
                                             method, cache_dir)
    else:
        print('Loading cached dataset from {}.'.format(path))
        test = NumpyTupleDataset.load(path)

    # Model-related data is stored in this directory.
    model_dir = os.path.join(args.in_dir, os.path.basename(cache_dir))
    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }
    task_type = molnet_default_config[dataset_name]['task_type']
    model_path = os.path.join(model_dir, model_filename[task_type])
    print("model_path=" + model_path)
    print('Loading model weights from {}...'.format(model_path))

    if task_type not in ('classification', 'regression'):
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))
    if task_type == 'classification':
        model = Classifier.load_pickle(model_path, device=args.gpu)
    else:
        model = Regressor.load_pickle(model_path, device=args.gpu)

    # Re-load the best-validation score snapshot
    # serializers.load_npz(os.path.join(
    #     model_dir, "best_val_" + model_filename[task_type]), model)

    # Evaluate on the test split.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    evaluator = Evaluator(test_iterator, model, converter=concat_mols,
                          device=args.gpu)
    eval_result = evaluator()
    print('Evaluation result: ', eval_result)

    # Task-specific post-processing.
    if task_type == 'regression':
        # Convert every metric in place to a native Python float.
        for metric_name in list(eval_result):
            eval_result[metric_name] = float(eval_result[metric_name])
    elif task_type == "classification":
        # The classifier carries no ROC-AUC metric of its own, so a
        # separate ROC-AUC evaluator is run here.
        rocauc_evaluator = ROCAUCEvaluator(
            test_iterator, model, converter=concat_mols, device=args.gpu,
            eval_func=model.predictor, name='test', ignore_labels=-1)
        rocauc_result = rocauc_evaluator()
        print('ROCAUC Evaluation result: ', rocauc_result)
        save_json(os.path.join(model_dir, 'rocauc_result.json'),
                  rocauc_result)
    else:
        print('[WARNING] unknown task_type {}.'.format(task_type))

    # Write the evaluation metrics next to the model.
    save_json(os.path.join(model_dir, 'eval_result.json'), eval_result)
def main():
    """Evaluate a trained MoleculeNet model and save the metrics as JSON.

    Loads the cached test split (or obtains it through
    ``download_entire_dataset``), restores the pickled classifier or
    regressor matching the dataset's task type, runs an ``Evaluator`` and
    writes the result to ``eval_result.json`` in the model directory.

    Raises:
        ValueError: If the task type is neither classification nor
            regression.
    """
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data

    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))

    # Load the cached dataset.
    filename = dataset_part_filename('test', num_data)
    path = os.path.join(cache_dir, filename)
    if os.path.exists(path):
        print('Loading cached dataset from {}.'.format(path))
        test = NumpyTupleDataset.load(path)
    else:
        _, _, test = download_entire_dataset(dataset_name, num_data, labels,
                                             method, cache_dir)

    # # Load the standard scaler parameters, if necessary.
    # if args.scale == 'standardize':
    #     scaler_path = os.path.join(args.in_dir, 'scaler.pkl')
    #     print('Loading scaler parameters from {}.'.format(scaler_path))
    #     with open(scaler_path, mode='rb') as f:
    #         scaler = pickle.load(f)
    # else:
    #     print('No standard scaling was selected.')
    #     scaler = None

    # Model-related data is stored in this directory.
    model_dir = os.path.join(args.in_dir, os.path.basename(cache_dir))

    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }
    task_type = molnet_default_config[dataset_name]['task_type']
    model_path = os.path.join(model_dir, model_filename[task_type])
    print('Loading model weights from {}...'.format(model_path))
    if task_type == 'classification':
        model = Classifier.load_pickle(model_path, device=args.gpu)
    elif task_type == 'regression':
        model = Regressor.load_pickle(model_path, device=args.gpu)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # # Replace the default predictor with one that scales the output labels.
    # scaled_predictor = ScaledGraphConvPredictor(model.predictor)
    # scaled_predictor.scaler = scaler
    # model.predictor = scaled_predictor

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, model, converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)

    # The evaluator may report array-library scalars, which ``json`` cannot
    # serialize; convert every metric to a native float first.
    serializable_result = {
        key: float(value) for key, value in eval_result.items()
    }

    # Save the evaluation results.
    with open(os.path.join(model_dir, 'eval_result.json'), 'w') as f:
        json.dump(serializable_result, f)
def main():
    """Train a reaction-condition classifier and pickle the result.

    Picks the dataset configuration from ``args.data_name``, loads (or
    parses and caches) the training data, optionally re-packs the dataset
    sorted by yield, splits it into train and validation parts, trains a
    ``Classifier`` with a chainer ``Trainer`` and saves the model to
    ``args.out``, along with the scaler object when one was created.

    Raises:
        ValueError: If ``args.data_name`` is not a known dataset.
    """
    args = parse_arguments()
    method = args.method

    # Per-dataset settings:
    # (csv file, class count, label dictionary sizes, cache file, columns).
    common_labels = ['Yield', 'M', 'L', 'B', 'S', 'A', 'id']
    dataset_configs = {
        'suzuki': (
            'data/suzuki_type_train_v2.csv', 119,
            {'M': 28, 'L': 23, 'B': 35, 'S': 10, 'A': 17},
            'data.npz', list(common_labels)),
        'CN': (
            'data/CN_coupling_train.csv', 206,
            {'M': 44, 'L': 47, 'B': 13, 'S': 22, 'A': 74},
            'CN_data.npz', list(common_labels)),
        'Negishi': (
            'data/Negishi_train.csv', 106,
            {'M': 32, 'L': 20, 'T': 8, 'S': 10, 'A': 30},
            'Negishi_data.npz',
            ['Yield', 'M', 'L', 'T', 'S', 'A', 'id']),
        'PKR': (
            'data/PKR_train.csv', 83,
            {'M': 18, 'L': 6, 'T': 7, 'S': 15, 'A': 11, 'G': 1, 'O': 13,
             'P': 4, 'other': 1},
            'PKR_data.npz',
            ['Yield', 'M', 'L', 'T', 'S', 'A', 'G', 'O', 'P', 'other',
             'id']),
    }
    if args.data_name not in dataset_configs:
        raise ValueError('Unexpected dataset name')
    (datafile, class_num, class_dict, dataset_filename,
     labels) = dataset_configs[args.data_name]

    cache_dir = os.path.join('input', '{}_all'.format(method))

    # Labels are post-processed into float32 arrays by the CSV parser.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    # Try the cache first.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)
    if os.path.exists(dataset_cache_path):
        print('Loading cached dataset from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    else:
        dataset = None

    if dataset is None:
        print('Preprocessing dataset...')
        # 'mpnn' reuses the 'ggnn' preprocessor.
        preprocessor_key = 'ggnn' if args.method == 'mpnn' else args.method
        preprocessor = preprocess_method_dict[preprocessor_key]()
        parser = CSVFileParser(
            preprocessor, postprocess_label=postprocess_label,
            labels=labels,
            smiles_col=['Reactant1', 'Reactant2', 'Product'],
            label_dicts=class_dict)

        # Parse the whole CSV file.
        dataset = parser.parse(datafile)['dataset']

        # Store the parsed dataset for later runs.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Re-pack the dataset when standardization was requested.
    if args.scale != 'standardize':
        scaler = None
    else:
        # NOTE(review): the scaler is created but never fitted in this
        # function, yet it is pickled at the end - confirm this is intended.
        scaler = StandardScaler()

        labels = dataset.get_datasets()[-2]
        # Column 0 of the last array holds the yield.
        yields = dataset.get_datasets()[-1][:, 0].reshape(-1, 1).astype(
            'float32')

        # Ascending order by yield. The slice below keeps every sample;
        # it is the hook for restricting training to a yield range.
        order = numpy.argsort(yields[:, 0])
        start_len, end_len = 0, len(yields)
        order = order[start_len:end_len]

        # Reorder the first six arrays, then append yields and labels.
        range_dataset = tuple(
            dataset.get_datasets()[i][order] for i in range(6))
        yields = yields[order]
        labels = labels[order]
        dataset = NumpyTupleDataset(*(range_dataset + (yields, labels)))

    # Random train/validation split with a fixed seed.
    n_train = int(len(dataset) * args.train_data_ratio)
    train, valid = split_dataset_random(dataset, n_train, args.seed)

    # Predictor network.
    predictor = set_up_predictor(args.method, args.unit_num,
                                 args.conv_layers, class_num)

    # Iterators: training repeats, validation is a single ordered pass.
    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(valid, args.batchsize,
                                          repeat=False, shuffle=False)

    # Classifier with sigmoid cross-entropy loss and binary accuracy.
    device = args.gpu
    classifier = Classifier(predictor,
                            lossfun=F.sigmoid_cross_entropy,
                            metrics_fun=F.binary_accuracy,
                            device=args.gpu)

    # Optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(classifier)

    # Updater.
    updater = training.StandardUpdater(train_iter, optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)

    # Trainer and its extensions, registered in the original order.
    print('Training...')
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    validation_evaluator = E.Evaluator(valid_iter, classifier, device=device,
                                       converter=concat_mols)
    trainer.extend(validation_evaluator)
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    # Save the classifier every epoch.
    per_epoch_snapshot = E.snapshot_object(
        classifier, filename='model_epoch-{.updater.epoch}')
    trainer.extend(per_epoch_snapshot)
    trainer.extend(E.LogReport())
    report_columns = [
        'epoch', 'main/loss', 'main/accuracy', 'validation/main/loss',
        'validation/main/accuracy', 'elapsed_time'
    ]
    trainer.extend(E.PrintReport(report_columns))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Persist the trained classifier.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    classifier.save_pickle(model_path, protocol=args.protocol)

    # Persist the scaler object, when one exists.
    if scaler is not None:
        scaler_path = os.path.join(args.out, 'scaler.pkl')
        with open(scaler_path, mode='wb') as f:
            pickle.dump(scaler, f, protocol=args.protocol)
default='true', help='if neighborhood of a molecule to be visualized') parser.add_argument('--save_fig', type=strtobool, default='true') args = parser.parse_args() chainer.config.train = False snapshot_path = os.path.join(args.model_dir, args.snapshot_path) hyperparams_path = os.path.join(args.model_dir, args.hyperparams_path) print("loading hyperparamaters from {}".format(hyperparams_path)) model_params = Hyperparameters(path=hyperparams_path) model = load_model(snapshot_path, model_params, debug=True) if args.gpu >= 0: model.to_gpu(args.gpu) true_data = NumpyTupleDataset.load( os.path.join(args.data_dir, args.molecule_file)) if args.data_name == 'qm9': atomic_num_list = [6, 7, 8, 9, 0] true_data = TransformDataset(true_data, transform_qm9.transform_fn) valid_idx = transform_qm9.get_val_ids() elif args.data_name == 'zinc250k': atomic_num_list = zinc250_atomic_num_list true_data = TransformDataset(true_data, transform_fn_zinc250k) valid_idx = transform_zinc250k.get_val_ids() train_idx = [t for t in range(len(true_data)) if t not in valid_idx] n_train = len(train_idx) train_idx.extend(valid_idx) train_data, _ = chainer.datasets.split_dataset(true_data, n_train, train_idx)
def main():
    """Train a GraphNVP model by maximum likelihood, or sample molecules.

    Configuration is read from the module-level ``args`` namespace. The
    function loads and splits the dataset, builds the model, optimizer and
    learning-rate scheduler, and then dispatches on ``args.mode``
    (``'train'`` or ``'gen'``).

    Raises:
        ValueError: if ``args.data_name`` is neither ``'qm9'`` nor
            ``'zinc250k'``.
    """
    args.cuda = torch.cuda.is_available()
    args.device = 'cuda' if args.cuda else 'cpu'
    print(args)

    # Dataset-specific transform, atom vocabulary and layer widths.
    if args.data_name == 'qm9':
        from data import transform_qm9
        transform_fn = transform_qm9.transform_fn
        args.atomic_num_list = [6, 7, 8, 9, 0]
        mlp_channels = [256, 256]
        gnn_channels = {'gcn': [8, 64], 'hidden': [64, 128]}
        valid_idx = transform_qm9.get_val_ids()
    elif args.data_name == 'zinc250k':
        from data import transform_zinc250k
        from data.transform_zinc250k import transform_fn_zinc250k
        transform_fn = transform_fn_zinc250k
        args.atomic_num_list = [6, 7, 8, 9, 15, 16, 17, 35, 53, 0]
        mlp_channels = [1024, 512]
        gnn_channels = {'gcn': [16, 128], 'hidden': [64, 256]}
        valid_idx = transform_zinc250k.get_val_ids()
    else:
        # Fail here instead of with a NameError on `valid_idx` further down.
        raise ValueError(
            "Unsupported data_name: {!r} (expected 'qm9' or "
            "'zinc250k')".format(args.data_name))

    dataset = NumpyTupleDataset.load(joinpath(args.data_dir, args.data_file))
    dataset = TransformDataset(dataset, transform_fn)

    if len(valid_idx) > 0:
        # Set membership keeps the split linear in the dataset size.
        valid_lookup = set(valid_idx)
        train_idx = [t for t in range(len(dataset)) if t not in valid_lookup]
        n_train = len(train_idx)
        train_idx.extend(valid_idx)
        train, test = chainer.datasets.split_dataset(dataset, n_train,
                                                     train_idx)
    else:
        train, test = chainer.datasets.split_dataset_random(
            dataset, int(len(dataset) * 0.8), seed=args.seed)

    num_masks = {
        'node': args.num_node_masks,
        'channel': args.num_channel_masks
    }
    mask_size = {
        'node': args.node_mask_size,
        'channel': args.channel_mask_size
    }
    num_coupling = {
        'node': args.num_node_coupling,
        'channel': args.num_channel_coupling
    }
    NVPmodel_params = Hyperparameters(
        args.num_atoms, args.num_rels, len(args.atomic_num_list),
        num_masks=num_masks, mask_size=mask_size, num_coupling=num_coupling,
        batch_norm=args.apply_batch_norm,
        additive_transformations=args.additive_transformations,
        mlp_channels=mlp_channels, gnn_channels=gnn_channels)
    model = GraphNvpModel(NVPmodel_params).to(args.device)

    # Non-repeating iterator; train() copies it once per epoch.
    dataloader = chainer.iterators.SerialIterator(train, args.batch_size,
                                                  repeat=False)

    _, all_train_smiles = read_molecules(args.data_dir)
    converter = convert.concat_examples

    start_epoch = args.resume_epoch
    if args.resume:
        model = GraphNvpModel(hyperparams=NVPmodel_params).to(args.device)
        model_path = joinpath(args.model_save_dir,
                              'epoch{}-mle.ckpt'.format(args.resume_epoch))
        model.load_state_dict(
            torch.load(model_path, map_location=lambda storage, loc: storage))
        print("Resuming from epoch{}".format(args.resume_epoch))

    # One entry is appended per gen() call.
    all_unique_rate = []
    all_valid_rate = []
    all_novelty_rate = []

    print('start fitting.')
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()),
        args.mle_lr, betas=(args.beta1, args.beta2))
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=args.lr_decay_factor,
        patience=args.lr_decay_patience, min_lr=args.lr_decay_min)
    # NOTE(review): stepping before any backward pass looks like a workaround
    # for PyTorch's "scheduler.step() before optimizer.step()" warning --
    # confirm it is still needed.
    optimizer.step()

    def generate_one(model, mute=False, cnt=None):
        """Sample one latent vector and decode it into a molecule.

        Args:
            model: flow model providing ``reverse`` and ``x_size``.
            mute: when true, do not print the per-molecule summary line.
            cnt: optional counter shown in the summary line.

        Returns:
            Tuple ``(smiles, A, X, pure_valid, num_atoms)``. ``smiles`` is
            ``''`` and ``num_atoms`` is ``-1`` when the decoded molecule
            fails the validity checks.
        """
        generate_start_t = time()
        total_resample = 0

        z = sample_z(model, batch_size=1)
        A, X = model.reverse(z, model.x_size)
        X = F.softmax(X, dim=2)
        mols = [
            construct_mol(x_elem, adj_elem, args.atomic_num_list)
            for x_elem, adj_elem in zip(X, A)
        ]

        pure_valid = 0
        smiles = ''
        num_atoms = -1
        for mol in mols:
            assert mol is not None, 'mol is None...'
            final_valid = env.check_chemical_validity(mol)
            valency_valid = env.check_valency(mol)
            if final_valid is False or valency_valid is False:
                print(
                    'Warning: use valency check during generation but the final molecule is invalid!!!'
                )
                continue
            num_atoms = mol.GetNumAtoms()
            num_bonds = mol.GetNumBonds()
            smiles = Chem.MolToSmiles(mol)
            if total_resample == 0:
                pure_valid = 1.0
            if not mute:
                cnt = str(cnt) if cnt is not None else ''
                print(
                    'smiles%s: %s | #atoms: %d | #bonds: %d | #resample: %.5f | time: %.5f |'
                    % (cnt, smiles, num_atoms, num_bonds, total_resample,
                       time() - generate_start_t))
        return smiles, A, X, pure_valid, num_atoms

    def train(model):
        """Run the training loop, checkpointing and sampling every epoch."""
        # `args.epochs` is treated as the total epoch count. The previous
        # upper bound also subtracted `start_epoch`, which shortened (or
        # emptied) the range when resuming.
        for epoch in range(1 + start_epoch, args.epochs + 1):
            # gen() leaves the model in eval mode, so switch back each epoch.
            model.train()
            num_samples = len(dataloader.dataset)
            num_batches = math.ceil(num_samples / args.batch_size)
            pbar = tqdm(total=num_batches)
            for i_batch, batch_data in enumerate(copy.copy(dataloader)):
                in_arrays = converter(batch_data)
                X = torch.tensor(in_arrays[0],
                                 dtype=torch.float32).to(args.device)
                A = torch.tensor(in_arrays[1],
                                 dtype=torch.float32).to(args.device)

                # Dequantization
                X_prime = X + 0.9 * torch.rand(X.shape, device=args.device)
                A_prime = A + 0.9 * torch.rand(A.shape, device=args.device)

                z, sum_log_det_jacs = model(A_prime, X_prime)
                g_loss = model.log_prob(z, sum_log_det_jacs)

                optimizer.zero_grad()
                g_loss.backward()
                optimizer.step()
                scheduler.step(g_loss)
                pbar.update()
                if i_batch % args.show_loss_step == 0:
                    tqdm.write("Epoch %d, batch %d, Loss mle: %.5f" %
                               (epoch, i_batch, g_loss.item()))
            pbar.close()

            print("Saving GraphNVP model trained with maximum liklihood")
            model_path = joinpath(args.model_save_dir,
                                  'epoch{}-mle.ckpt'.format(epoch))
            torch.save(model.state_dict(), model_path)
            print('Saved model checkpoints into {}...'.format(
                args.model_save_dir))
            gen(model, epoch)

    def gen(model, epoch=-1):
        """Sample up to ``args.num_gen`` molecules and report metrics.

        Prints uniqueness, validity and novelty rates, appends them to the
        enclosing rate lists, and, when ``args.save`` is set, writes the
        SMILES, molecule images and metric arrays to disk.
        """
        model.eval()
        all_smiles = []
        pure_valids = []
        appear_in_train = 0.
        start_t = time()
        cnt_mol = 0
        cnt_gen = 0
        out_path = joinpath(args.gen_path, 'mle_mols{}.txt'.format(epoch))

        print("Generating %d mols for evaluation" % (args.num_gen))
        while cnt_mol < args.num_gen:
            smiles, A, X, no_resample, num_atoms = generate_one(
                model, mute=False, cnt=cnt_gen)
            cnt_gen += 1
            if cnt_gen > args.max_resample:
                break
            if num_atoms < 0 or num_atoms < args.min_atoms:
                print('#atoms of generated molecule less than %d, discarded!' %
                      args.min_atoms)
                continue
            cnt_mol += 1
            if cnt_mol % 100 == 0:
                print('cur cnt mol: %d' % cnt_mol)
            all_smiles.append(smiles)
            pure_valids.append(no_resample)
            print('Accepting: {}'.format(smiles))
            if all_train_smiles is not None and smiles in all_train_smiles:
                appear_in_train += 1.0

        if cnt_mol > args.num_gen:
            print("Generating {} times rather than 100 times!".format(cnt_mol))
            args.num_gen = cnt_mol

        unique_smiles = list(set(all_smiles))
        unique_rate = len(unique_smiles) / args.num_gen
        pure_valid_rate = sum(pure_valids) / args.num_gen
        novelty = 1. - (appear_in_train / args.num_gen)

        print(
            'Time for generating (%d/%d) molecules(#atoms>=%d) with %d resamplings: %.5f'
            % (cnt_gen - args.max_resample, args.num_gen, args.min_atoms,
               args.max_resample, time() - start_t))
        print('| unique rate: %.5f | valid rate: %.5f | novelty: %.5f |' %
              (unique_rate, pure_valid_rate, novelty))

        mol_img_dir = joinpath(args.img_dir, 'mol_img{}'.format(epoch))
        os.makedirs(mol_img_dir, exist_ok=True)
        os.makedirs(args.gen_path, exist_ok=True)

        if out_path is not None and args.save:
            with open(out_path, 'w+') as out_file:
                cnt = 0
                for i, smi in enumerate(all_smiles):
                    # Invalid disconnection
                    if '.' in smi:
                        continue
                    out_file.write(smi + '\n')
                    save_mol_png(Chem.MolFromSmiles(smi),
                                 joinpath(mol_img_dir, '{}.png'.format(i)))
                    cnt += 1
            print('writing %d smiles into %s done!' % (cnt, out_path))

        all_unique_rate.append(unique_rate)
        all_valid_rate.append(pure_valid_rate)
        all_novelty_rate.append(novelty)
        if args.save:
            print('saving metric of validity, novelty and uniqueness into %s' %
                  (args.gen_path))
            np.save(joinpath(args.gen_path, 'valid{}'.format(epoch)),
                    np.array(all_valid_rate))
            np.save(joinpath(args.gen_path, 'novelty{}'.format(epoch)),
                    np.array(all_novelty_rate))
            np.save(joinpath(args.gen_path, 'unique{}'.format(epoch)),
                    np.array(all_unique_rate))

    if args.mode == 'train':
        train(model)
    elif args.mode == 'gen':
        gen(model)
    else:
        print("Specify mode as 'train' or 'gen'")
def main():
    """Train a MoleculeNet model and pickle the trained model.

    The train and validation parts are loaded from the cache directory when
    both files exist; otherwise ``download_entire_dataset`` is called. The
    model type (``Regressor`` or ``Classifier``), loss and metrics come from
    ``molnet_default_config`` for the selected dataset.

    Raises:
        ValueError: if the configured task type is neither ``'regression'``
            nor ``'classification'``.
        TypeError: if a configured metric is neither a plain function nor a
            ``BatchEvaluator`` subclass.
    """
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data
    n_unit = args.unit_num
    conv_layers = args.conv_layers

    task_type = molnet_default_config[dataset_name]['task_type']
    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }

    print('Using dataset: {}...'.format(dataset_name))

    # The cache directory name and the output width depend on the label.
    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))
        class_num = len(molnet_default_config[args.dataset]['tasks'])

    # Load the train and validation parts of the dataset.
    filenames = [
        dataset_part_filename(p, num_data) for p in ['train', 'valid']
    ]
    paths = [os.path.join(cache_dir, f) for f in filenames]
    if all(os.path.exists(path) for path in paths):
        dataset_parts = []
        for path in paths:
            print('Loading cached dataset from {}.'.format(path))
            dataset_parts.append(NumpyTupleDataset.load(path))
    else:
        dataset_parts = download_entire_dataset(dataset_name, num_data,
                                                labels, method, cache_dir)
    train, valid = dataset_parts[0], dataset_parts[1]

    # TODO: Use standard scaler for regression task

    # Set up the predictor.
    predictor = set_up_predictor(method, n_unit, conv_layers, class_num)

    # Set up the iterators.
    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(valid, args.batchsize, repeat=False,
                                          shuffle=False)

    # Load metrics for the current dataset. Only plain functions are handed
    # to the model; evaluator classes are attached to the trainer below.
    metrics = molnet_default_config[dataset_name]['metrics']
    metrics_fun = {
        k: v for k, v in metrics.items() if isinstance(v, types.FunctionType)
    }
    loss_fun = molnet_default_config[dataset_name]['loss']

    if task_type == 'regression':
        model = Regressor(predictor, lossfun=loss_fun,
                          metrics_fun=metrics_fun, device=args.gpu)
    elif task_type == 'classification':
        model = Classifier(predictor, lossfun=loss_fun,
                           metrics_fun=metrics_fun, device=args.gpu)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(model)

    # Save model-related output to this directory.
    model_dir = os.path.join(args.out, os.path.basename(cache_dir))
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu,
                                       converter=concat_mols)

    # Set up the trainer.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=model_dir)
    trainer.extend(
        E.Evaluator(valid_iter, model, device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())

    # Report various metrics.
    print_report_targets = ['epoch', 'main/loss', 'validation/main/loss']
    for metric_name, metric_fun in metrics.items():
        if isinstance(metric_fun, types.FunctionType):
            print_report_targets.append('main/' + metric_name)
            print_report_targets.append('validation/main/' + metric_name)
        elif (isinstance(metric_fun, type)
              and issubclass(metric_fun, BatchEvaluator)):
            # issubclass() raises on non-class arguments, hence the
            # isinstance guard: unsupported values reach the error below.
            trainer.extend(
                metric_fun(valid_iter, model, device=args.gpu,
                           eval_func=predictor, converter=concat_mols,
                           name='val', raise_value_error=False))
            print_report_targets.append('val/main/' + metric_name)
        else:
            # Report the offending metric's type, not the filtered dict's.
            raise TypeError('{} is not a supported metrics function.'.format(
                type(metric_fun)))
    print_report_targets.append('elapsed_time')

    trainer.extend(E.PrintReport(print_report_targets))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the model's parameters.
    model_path = os.path.join(model_dir, model_filename[task_type])
    print('Saving the trained model to {}...'.format(model_path))
    model.save_pickle(model_path, protocol=args.protocol)
def main():
    """Train a QM9 regressor and pickle it under ``args.out``.

    The dataset is read from an ``.npz`` cache when one exists, otherwise it
    is preprocessed and cached. Labels are optionally standardized before
    the random train/validation split.
    """
    args = parse_arguments()
    method = args.method

    # Resolve the label selection, cache location and output width.
    if args.label == 'all':
        labels = None
        cache_name = '{}_all'.format(method)
        class_num = len(D.get_qm9_label_names())
    else:
        labels = args.label
        cache_name = '{}_{}'.format(method, labels)
        class_num = len(labels) if isinstance(labels, list) else 1
    cache_dir = os.path.join('input', cache_name)

    # A non-negative `num_data` selects a subset, cached under its own name.
    num_data = args.num_data
    use_subset = num_data >= 0
    dataset_filename = ('data_{}.npz'.format(num_data)
                        if use_subset else 'data.npz')
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    # Try the cache first; fall back to preprocessing if nothing was loaded.
    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached dataset from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        qm9_kwargs = {'labels': labels}
        if use_subset:
            # Select the first `num_data` samples from the dataset.
            qm9_kwargs['target_index'] = numpy.arange(num_data)
        dataset = D.get_qm9(preprocessor, **qm9_kwargs)

        # Cache the loaded dataset.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Scale the label values, if necessary. The last array holds the labels.
    scaler = None
    if args.scale == 'standardize':
        print('Applying standard scaling to the labels.')
        scaler = StandardScaler()
        arrays = dataset.get_datasets()
        scaled_t = scaler.fit_transform(arrays[-1])
        dataset = NumpyTupleDataset(*(arrays[:-1] + (scaled_t,)))
    else:
        print('No standard scaling was selected.')

    # Split the dataset into training and validation.
    n_train = int(len(dataset) * args.train_data_ratio)
    train, valid = split_dataset_random(dataset, n_train, args.seed)

    predictor = set_up_predictor(method, args.unit_num, args.conv_layers,
                                 class_num, scaler)

    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(valid, args.batchsize, repeat=False,
                                          shuffle=False)

    # Regressor with MAE / RMSE metrics that receive the scaler.
    device = args.gpu
    metrics_fun = {
        'mae': MeanAbsError(scaler=scaler),
        'rmse': RootMeanSqrError(scaler=scaler),
    }
    regressor = Regressor(predictor, lossfun=F.mean_squared_error,
                          metrics_fun=metrics_fun, device=device)

    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    updater = training.StandardUpdater(train_iter, optimizer, device=device,
                                       converter=concat_mols)

    # Trainer with evaluation, a final snapshot, logging and progress.
    stop_trigger = (args.epoch, 'epoch')
    trainer = training.Trainer(updater, stop_trigger, out=args.out)
    trainer.extend(E.Evaluator(valid_iter, regressor, device=device,
                               converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    report_keys = [
        'epoch', 'main/loss', 'main/mae', 'main/rmse',
        'validation/main/loss', 'validation/main/mae',
        'validation/main/rmse', 'elapsed_time',
    ]
    trainer.extend(E.LogReport())
    trainer.extend(E.PrintReport(report_keys))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the regressor's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    regressor.save_pickle(model_path, protocol=args.protocol)
def main():
    """Train and/or evaluate a model on user-supplied CSV or ``.npz`` data.

    When ``args.train`` is set, a model is trained and pickled. The
    validation file ``args.val`` is always loaded; predictions on it are
    written to ``result.csv`` under ``args.out`` and an evaluation summary
    is printed.

    Raises:
        ValueError: if no target label was specified.
    """
    args = parse_arguments()
    args.out = os.path.join(args.out, args.method)
    save_args(args, args.out)

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label_float(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    def postprocess_label_int(label_list):
        return numpy.asarray(label_list, dtype=numpy.int64)

    def load_or_parse(path, description):
        """Load ``path`` if it is an ``.npz`` file, else parse and cache it."""
        fn, ext = os.path.splitext(path)
        if ext == ".npz":
            print('Loading {} dataset...'.format(description))
            return NumpyTupleDataset.load(path)
        print('Preprocessing {} dataset...'.format(description))
        preprocessor = preprocess_method_dict[args.method]()
        # Classification labels are integers, regression labels are floats.
        postprocess = (postprocess_label_int if args.classification
                       else postprocess_label_float)
        parser = CSVFileParser(preprocessor, postprocess_label=postprocess,
                               labels=labels, smiles_col='SMILES')
        parsed = parser.parse(path)['dataset']
        NumpyTupleDataset.save(
            os.path.join(args.out, os.path.split(fn)[1]), parsed)
        return parsed

    # Bind `scaler` up front so that building a regressor without a training
    # file does not hit an unbound name.
    scaler = None
    if args.train:
        train = load_or_parse(args.train, 'training')
        # Scale the label values, if necessary.
        if args.scale == 'standardize':
            scaler = StandardScaler()
            scaler.fit(train.get_datasets()[-1])

    test = load_or_parse(args.val, 'test')

    # Set up the model.
    device = chainer.get_device(args.device)
    converter = converter_method_dict[args.method]
    metrics_fun = {'mae': F.mean_absolute_error, 'rmse': rmse}
    if args.classification:
        if args.load_model:
            model = Classifier.load_pickle(args.load_model, device=device)
            print("model file loaded: ", args.load_model)
        else:
            predictor = set_up_predictor(args.method, args.unit_num,
                                         args.conv_layers, class_num)
            model = Classifier(predictor, lossfun=F.sigmoid_cross_entropy,
                               metrics_fun=F.binary_accuracy, device=device)
    else:
        if args.load_model:
            model = Regressor.load_pickle(args.load_model, device=device)
            print("model file loaded: ", args.load_model)
        else:
            predictor = set_up_predictor(
                args.method + args.method_suffix, args.unit_num,
                args.conv_layers, class_num, label_scaler=scaler)
            model = Regressor(predictor, lossfun=F.mean_squared_error,
                              metrics_fun=metrics_fun, device=device)

    if args.train:
        if args.balanced_iter:
            train = BalancedSerialIterator(train, args.batchsize,
                                           train.features[:, -1],
                                           ignore_labels=-1)
            train.show_label_stats()

        print('Training...')
        log_keys = ['main/mae', 'main/rmse', 'validation/main/mae',
                    'validation/main/rmse', 'validation/main/roc_auc']
        extensions_list = [
            extensions.PlotReport(log_keys, 'iteration',
                                  trigger=(100, 'iteration'),
                                  file_name='loss.png')
        ]
        if args.eval_roc and args.classification:
            # Take the predictor from the model: the local `predictor` is
            # unbound when the model was restored via --load_model.
            extensions_list.append(ROCAUCEvaluator(
                test, model, eval_func=model.predictor,
                device=device, converter=converter, name='validation',
                pos_labels=1, ignore_labels=-1, raise_value_error=False))

        save_json(os.path.join(args.out, 'args.json'), vars(args))
        run_train(model, train, valid=test,
                  batch_size=args.batchsize, epoch=args.epoch,
                  out=args.out, extensions_list=extensions_list,
                  device=device, converter=converter)

        # Save the model's parameters.
        model_path = os.path.join(args.out, args.model_filename)
        print('Saving the trained model to {}...'.format(model_path))
        if hasattr(model.predictor.graph_conv, 'reset_state'):
            model.predictor.graph_conv.reset_state()
        model.save_pickle(model_path, protocol=args.protocol)

    # Prediction on the validation/test data.
    it = SerialIterator(test, args.batchsize, repeat=False, shuffle=False)
    result = []
    for batch in it:
        in_arrays = convert._call_converter(converter, batch, device)
        with chainer.using_config('train', False), \
                chainer.no_backprop_mode():
            if isinstance(in_arrays, tuple):
                model(*in_arrays)
            elif isinstance(in_arrays, dict):
                model(**in_arrays)
            else:
                model(in_arrays)
        # to_cpu() accepts both NumPy and CuPy arrays; `.get()` exists only
        # on CuPy arrays and therefore failed on CPU runs.
        result.extend(chainer.backends.cuda.to_cpu(model.y.array))

    numpy.savetxt(os.path.join(args.out, "result.csv"), numpy.array(result))

    eval_result = Evaluator(it, model, converter=converter, device=device)()
    print('Evaluation result: ', eval_result)
def main():
    """Predict QM9 properties with a pickled regressor and evaluate it.

    Rebuilds the seeded train/validation split, loads the regressor (and the
    label scaler when ``--scale standardize`` is selected) from
    ``--in-dir``, prints sample predictions next to the ground truth, and
    runs an ``Evaluator`` on the validation part.
    """
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    label_names = [
        'A', 'B', 'C', 'mu', 'alpha', 'h**o', 'lumo', 'gap', 'r2', 'zpve',
        'U0', 'U', 'H', 'G', 'Cv'
    ]
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='Regression with QM9.')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp')
    parser.add_argument('--label', '-l', type=str, choices=label_names,
                        default='',
                        help='target label for regression, '
                             'empty string means to predict all '
                             'property at once')
    parser.add_argument('--scale', type=str, choices=scale_list,
                        default='standardize', help='Label scaling method')
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--in-dir', '-i', type=str, default='result')
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    parser.add_argument('--model-filename', type=str, default='regressor.pkl')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='Number of data to be parsed from parser.'
                             '-1 indicates to parse all data.')
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))
    # One name per output column. A single --label is a plain string, and
    # iterating it directly would walk its characters.
    label_list = [labels] if isinstance(labels, str) else list(labels)

    # Dataset preparation
    dataset = None
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)
    if os.path.exists(dataset_cache_path):
        print('load from cache {}'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        if num_data >= 0:
            # Parse only the first `num_data` samples so the contents of
            # `data_<num_data>.npz` match its name.
            import numpy
            dataset = D.get_qm9(preprocessor, labels=labels,
                                target_index=numpy.arange(num_data))
        else:
            dataset = D.get_qm9(preprocessor, labels=labels)
        # `cache_dir` is nested under 'input', so create parents as well.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    if args.scale == 'standardize':
        # Standard Scaler for labels
        # NOTE: pickle executes code on load; only point --in-dir at
        # trusted training output.
        with open(os.path.join(args.in_dir, 'ss.pkl'), mode='rb') as f:
            ss = pickle.load(f)
    else:
        ss = None

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    regressor = Regressor.load_pickle(
        os.path.join(args.in_dir, args.model_filename),
        device=args.gpu)  # type: Regressor

    # We need to feed only input features `x` to `predict`/`predict_proba`.
    # This converter extracts only inputs (x1, x2, ...) from the features
    # which consist of input `x` and label `t` (x1, x2, ..., t).
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        if ss is None:
            return x
        # Model's output is scaled by StandardScaler,
        # so we need to rescale back.
        if isinstance(x, Variable):
            x = x.data
        return ss.inverse_transform(cuda.to_cpu(x))

    print('Predicting...')
    y_pred = regressor.predict(val, converter=extract_inputs,
                               postprocess_fn=postprocess_fn)

    print('y_pred.shape = {}, y_pred[:5, 0] = {}'.format(
        y_pred.shape, y_pred[:5, 0]))

    t = concat_mols(val, device=-1)[-1]
    n_eval = 10

    # Construct dataframe
    df_dict = {}
    for i, label_name in enumerate(label_list):
        df_dict.update({
            'y_pred_{}'.format(label_name): y_pred[:, i],
            't_{}'.format(label_name): t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show random 5 example's prediction/ground truth table
    print(df.sample(5))

    for target_label in range(y_pred.shape[1]):
        diff = y_pred[:n_eval, target_label] - t[:n_eval, target_label]
        print('target_label = {}, y_pred = {}, t = {}, diff = {}'.format(
            target_label, y_pred[:n_eval, target_label],
            t[:n_eval, target_label], diff))

    # --- evaluate ---
    # To calc loss/accuracy, we can use `Evaluator`, `ROCAUCEvaluator`
    print('Evaluating...')
    val_iterator = SerialIterator(val, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(val_iterator, regressor, converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)
chainer.config.train = False # -- load model -- # model_hyperparams_path = os.path.join(gen_params.model_root_path, gen_params.model_hyperparams) model_path = os.path.join(gen_params.model_root_path, gen_params.model_file) model_params = Hyperparameter(model_hyperparams_path).subparams("model") model = load_model_from(model_path, model_params) if device >= 0: model.to_gpu(device) # -- load dataset -- # atomic_num_list = get_atomic_num_id(gen_params.atom_id_to_atomic_num) validation_idxs = get_validation_idxs(gen_params.train_validation_split) dataset = NumpyTupleDataset.load( os.path.join(gen_params.data_root_path, gen_params.dataset)) train_idxs = [i for i in range(len(dataset)) if i not in validation_idxs] n_train = len(train_idxs) train_idxs.extend(validation_idxs) train_set = chainer.datasets.SubDataset(dataset, 0, n_train, order=train_idxs) train_smiles = adj_to_smiles(train_set, atomic_num_list) # -- random generation -- # valid_ratio = [] unique_ratio = [] novel_ratio = [] if save_fig: gen_dir = os.path.join(gen_params.output_root_path, "generated")
def main():
    """Train a graph-convolution model on a MoleculeNet dataset.

    Parses the command line, loads the train/valid/test parts from the cache
    (or preprocesses and caches them), builds the predictor for the chosen
    method, trains it with a Chainer ``Trainer`` and pickles the model to
    ``<out>/<model-filename>``.

    Raises:
        ValueError: if the method is not one of the supported names.
        NotImplementedError: if the dataset's task type is neither
            ``'regression'`` nor ``'classification'``.
    """
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    dataset_names = list(molnet_default_config.keys())

    parser = argparse.ArgumentParser(description='molnet example')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp')
    parser.add_argument('--label', '-l', type=str, default='',
                        help='target label for regression, empty string means '
                             'to predict all property at once')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--dataset', '-d', type=str, choices=dataset_names,
                        default='bbbp')
    parser.add_argument('--protocol', type=int, default=2)
    parser.add_argument('--model-filename', type=str, default='regressor.pkl')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='Number of data to be parsed from parser.'
                             '-1 indicates to parse all data.')
    args = parser.parse_args()

    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    print('Use {} dataset'.format(dataset_name))

    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))
        class_num = len(molnet_default_config[args.dataset]['tasks'])

    # Dataset preparation
    def get_dataset_paths(cache_dir, num_data):
        """Return the cache paths of the train, valid and test parts."""
        filepaths = []
        for filetype in ['train', 'valid', 'test']:
            filename = filetype + '_data'
            if num_data >= 0:
                filename += '_' + str(num_data)
            filename += '.npz'
            filepaths.append(os.path.join(cache_dir, filename))
        return filepaths

    filepaths = get_dataset_paths(cache_dir, num_data)
    if all(os.path.exists(fpath) for fpath in filepaths):
        datasets = []
        for fpath in filepaths:
            print('load from cache {}'.format(fpath))
            datasets.append(NumpyTupleDataset.load(fpath))
    else:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        # Parse only the first `num_data` samples when it is non-negative.
        target_index = numpy.arange(num_data) if num_data >= 0 else None
        molnet_result = D.molnet.get_molnet_dataset(
            dataset_name, preprocessor, labels=labels,
            target_index=target_index)
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        datasets = molnet_result['dataset']
        for fpath, part in zip(filepaths, datasets):
            NumpyTupleDataset.save(fpath, part)

    # The test part is cached but not used for training.
    train, val, _ = datasets

    # Network
    if method == 'nfp':
        print('Train NFP model...')
        predictor = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        predictor = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        # SchNet is built with `out_dim=class_num` and no MLP head.
        predictor = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit,
                   n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        predictor = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels, hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer, n_atom=n_atom),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        predictor = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = iterators.SerialIterator(train, args.batchsize)
    val_iter = iterators.SerialIterator(val, args.batchsize, repeat=False,
                                        shuffle=False)

    metrics_fun = molnet_default_config[dataset_name]['metrics']
    loss_fun = molnet_default_config[dataset_name]['loss']
    task_type = molnet_default_config[dataset_name]['task_type']
    if task_type == 'regression':
        model = Regressor(predictor, lossfun=loss_fun,
                          metrics_fun=metrics_fun, device=args.gpu)
        # TODO(nakago): Use standard scaler for regression task
    elif task_type == 'classification':
        model = Classifier(predictor, lossfun=loss_fun,
                           metrics_fun=metrics_fun, device=args.gpu)
    else:
        raise NotImplementedError(
            'Not implemented task_type = {}'.format(task_type))

    optimizer = optimizers.Adam()
    optimizer.setup(model)

    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        E.Evaluator(val_iter, model, device=args.gpu, converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())

    print_report_targets = ['epoch', 'main/loss', 'validation/main/loss']
    # isinstance (not an exact type match) so that dict subclasses such as
    # OrderedDict also get their metric columns reported.
    if isinstance(metrics_fun, dict):
        for m_k in metrics_fun:
            print_report_targets.append('main/' + m_k)
            print_report_targets.append('validation/main/' + m_k)
    if task_type == 'classification':
        # ROC-AUC is evaluated on the validation data only; evaluating it on
        # the training data takes time and is skipped for now.
        trainer.extend(
            ROCAUCEvaluator(val_iter, model, device=args.gpu,
                            eval_func=predictor, converter=concat_mols,
                            name='val', raise_value_error=False))
        print_report_targets.append('val/main/roc_auc')
    print_report_targets.append('elapsed_time')

    trainer.extend(E.PrintReport(print_report_targets))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # --- save model ---
    protocol = args.protocol
    model.save_pickle(os.path.join(args.out, args.model_filename),
                      protocol=protocol)
def main():
    """Evaluate a trained MoleculeNet model on the test split.

    The test part of the dataset is loaded from the cache directory when it
    is present there, otherwise it is obtained through
    ``download_entire_dataset``. The pickled model is restored from the
    model directory below ``args.in_dir``; when a ``best_val_*`` snapshot
    exists next to it, its parameters replace the pickled ones. The
    evaluation results are written as JSON both to ``args.in_dir`` and to
    the model directory.

    Raises:
        ValueError: If the task type configured for the dataset is neither
            'classification' nor 'regression'.
    """
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data

    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))

    # Load the cached dataset.
    filename = dataset_part_filename('test', num_data)
    path = os.path.join(cache_dir, filename)
    if os.path.exists(path):
        print('Loading cached dataset from {}.'.format(path))
        test = NumpyTupleDataset.load(path)
    else:
        _, _, test = download_entire_dataset(dataset_name, num_data, labels,
                                             method, cache_dir)

    # TODO: label scaling is not wired in yet. The intended flow is to load
    # 'scaler.pkl' from args.in_dir when args.scale == 'standardize' and to
    # wrap the predictor so that its outputs are scaled back.

    # Model-related data is stored this directory.
    model_dir = os.path.join(args.in_dir, os.path.basename(cache_dir))
    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }
    task_type = molnet_default_config[dataset_name]['task_type']

    # Validate the task type before it is used as a dictionary key, so an
    # unsupported value produces the descriptive error instead of a KeyError.
    if task_type not in model_filename:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    model_path = os.path.join(model_dir, model_filename[task_type])
    print("model_path=" + model_path)
    print('Loading model weights from {}...'.format(model_path))
    if task_type == 'classification':
        model = Classifier.load_pickle(model_path, device=args.gpu)
    else:
        model = Regressor.load_pickle(model_path, device=args.gpu)

    # TODO: consider go/no-go of re-loading the best-validation snapshot.
    # The snapshot is optional: only use it when training actually wrote it,
    # otherwise keep the parameters restored from the pickle above.
    best_val_path = os.path.join(model_dir,
                                 "best_val_" + model_filename[task_type])
    if os.path.exists(best_val_path):
        serializers.load_npz(best_val_path, model)
    else:
        print('Best-validation snapshot {} not found; '
              'using the pickled model as is.'.format(best_val_path))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, model, converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)

    # Convert to native values so that both task types hand plain floats to
    # the JSON writer.
    eval_result = {key: float(value) for key, value in eval_result.items()}

    # TODO: consider go/no-go of the additional statistics below.
    if task_type == 'regression':
        save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)
    else:
        # For Classifier, the model is not equipped with a ROC-AUC
        # evaluation function; use a separate ROC-AUC evaluator here.
        rocauc_result = ROCAUCEvaluator(test_iterator, model,
                                        converter=concat_mols,
                                        device=args.gpu,
                                        eval_func=model.predictor,
                                        name='test', ignore_labels=-1)()
        print('ROCAUC Evaluation result: ', rocauc_result)
        save_json(os.path.join(args.in_dir, 'eval_result.json'),
                  rocauc_result)

    # Save the evaluation results.
    save_json(os.path.join(model_dir, 'eval_result.json'), eval_result)
def main():
    """Predict QM9 labels with a trained regressor and report the errors.

    The dataset is loaded from the cache (or preprocessed and cached), the
    pickled regressor is restored from ``args.in_dir``, and the test split
    is reproduced with ``args.train_data_ratio`` and ``args.seed``.
    Predictions and ground truth are printed, the evaluator result is saved
    to ``eval_result.json`` and the per-label mean absolute error to
    ``eval_result_mae.json``, both inside ``args.in_dir``.
    """
    # Parse the arguments.
    args = parse_arguments()
    device = args.gpu

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        label = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, label))
        labels = [label]
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the amount
    # of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        if num_data >= 0:
            # Restrict preprocessing to the first `num_data` samples so the
            # contents of 'data_<num_data>.npz' match its name.
            target_index = numpy.arange(num_data)
            dataset = D.get_qm9(preprocessor, labels=labels,
                                target_index=target_index)
        else:
            dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset. `makedirs` is used because
        # the parent 'input' directory may not exist yet.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=device)
    scaler = regressor.predictor.scaler

    if scaler is not None:
        original_t = dataset.get_datasets()[-1]
        if device >= 0:
            # The scaler lives on the GPU here, so move the labels there for
            # the transform and bring the result back to the host.
            scaled_t = cuda.to_cpu(scaler.transform(
                cuda.to_gpu(original_t)))
        else:
            scaled_t = scaler.transform(original_t)

        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] +
                                      (scaled_t,)))

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        # Map the predictions back to the original label scale, if any.
        if scaler is not None:
            return scaler.inverse_transform(x)
        return x

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(
        test, converter=extract_inputs,
        postprocess_fn=postprocess_fn)

    # Extract the ground-truth labels. Without a scaler the labels were never
    # transformed, so they are already in the original scale.
    t = concat_mols(test, device=device)[-1]
    if scaler is not None:
        t = scaler.inverse_transform(t)
    original_t = cuda.to_cpu(t)

    # Construct dataframe.
    df_dict = {}
    for i, label_name in enumerate(labels):
        df_dict['y_pred_{}'.format(label_name)] = y_pred[:, i]
        df_dict['t_{}'.format(label_name)] = original_t[:, i]
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10
    for target_label in range(y_pred.shape[1]):
        label_name = labels[target_label]
        diff = y_pred[:n_eval, target_label] - original_t[:n_eval,
                                                          target_label]
        print('label_name = {}, y_pred = {}, t = {}, diff = {}'
              .format(label_name, y_pred[:n_eval, target_label],
                      original_t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, regressor, converter=concat_mols,
                            device=device)()
    print('Evaluation result: ', eval_result)
    # Save the evaluation results.
    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)

    # Calculate mean abs error for each label
    mae = numpy.mean(numpy.abs(y_pred - original_t), axis=0)
    eval_result = {}
    for i, label_name in enumerate(labels):
        eval_result[label_name] = mae[i]
    save_json(os.path.join(args.in_dir, 'eval_result_mae.json'), eval_result)
def main():
    """Train a MoleculeNet model and pickle it below the output directory.

    The train/validation parts are read from the cache directory when both
    files exist, otherwise they come from ``download_entire_dataset``. For
    regression tasks with ``--scale standardize`` a label scaler is fitted
    and handed to the predictor. The trained model is pickled into
    ``<args.out>/<cache directory name>/``.

    Raises:
        ValueError: If the task type configured for the dataset is neither
            'regression' nor 'classification'.
    """
    args = parse_arguments()

    # Frequently used settings.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data
    n_unit = args.unit_num
    conv_layers = args.conv_layers

    dataset_config = molnet_default_config[dataset_name]
    task_type = dataset_config['task_type']
    model_filename = {'classification': 'classifier.pkl',
                      'regression': 'regressor.pkl'}

    print('Using dataset: {}...'.format(dataset_name))

    # Derive the cache location and the number of output units from the
    # label selection; an empty selection means "all tasks".
    labels = args.label or None
    if labels is None:
        cache_name = '{}_{}_all'.format(dataset_name, method)
        class_num = len(dataset_config['tasks'])
    else:
        cache_name = '{}_{}_{}'.format(dataset_name, method, labels)
        class_num = len(labels) if isinstance(labels, list) else 1
    cache_dir = os.path.join('input', cache_name)

    # Load the train and validation parts of the dataset.
    part_paths = [
        os.path.join(cache_dir, dataset_part_filename(part, num_data))
        for part in ('train', 'valid')
    ]
    if all(os.path.exists(part_path) for part_path in part_paths):
        dataset_parts = []
        for part_path in part_paths:
            print('Loading cached dataset from {}.'.format(part_path))
            dataset_parts.append(NumpyTupleDataset.load(part_path))
    else:
        dataset_parts = download_entire_dataset(dataset_name, num_data,
                                                labels, method, cache_dir)
    train = dataset_parts[0]
    valid = dataset_parts[1]

    # Scale the label values, if necessary.
    scaler = None
    if args.scale != 'standardize':
        print('No label scaling was selected.')
    elif task_type == 'regression':
        print('Applying standard scaling to the labels.')
        scaler = fit_scaler(dataset_parts)
    else:
        print('Label scaling is not available for classification tasks.')

    # Predictor and data iterators.
    predictor = set_up_predictor(method, n_unit, conv_layers, class_num,
                                 label_scaler=scaler)
    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(valid, args.batchsize,
                                          repeat=False, shuffle=False)

    # Only plain functions among the configured metrics are passed to the
    # model as metric functions.
    metrics_fun = {
        name: func
        for name, func in dataset_config['metrics'].items()
        if isinstance(func, types.FunctionType)
    }
    loss_fun = dataset_config['loss']

    device = chainer.get_device(args.device)

    # Pick the wrapper model matching the task type.
    model_classes = {'regression': Regressor, 'classification': Classifier}
    if task_type not in model_classes:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))
    model = model_classes[task_type](predictor, lossfun=loss_fun,
                                     metrics_fun=metrics_fun, device=device)

    # Optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(model)

    # Model-related output goes to this directory.
    model_dir = os.path.join(args.out, os.path.basename(cache_dir))
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Updater and trainer.
    updater = training.StandardUpdater(train_iter, optimizer, device=device,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=model_dir)

    validation_evaluator = E.Evaluator(valid_iter, model, device=device,
                                       converter=concat_mols)
    trainer.extend(validation_evaluator)
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())

    # TODO: consider go/no-go of (i) extra ROC-AUC reporting on the train and
    # validation parts for classification and (ii) a "best_val_" snapshot of
    # the model, triggered on the minimum validation RMSE/MAE (regression) or
    # the maximum validation ROC-AUC (classification).

    trainer.extend(AutoPrintReport())
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the model's parameters.
    model_path = os.path.join(model_dir, model_filename[task_type])
    print('Saving the trained model to {}...'.format(model_path))
    model.save_pickle(model_path, protocol=args.protocol)
def main():
    """Train a graph-convolution regressor on QM9 from the command line.

    Parses the command-line options, loads the preprocessed dataset from the
    cache (or preprocesses and caches it), optionally standardizes the
    labels, trains the selected network and finally pickles the regressor
    (and the fitted scaler, when one was used) into ``args.out``.

    Raises:
        ValueError: If ``--method`` names an unsupported network.
    """
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    label_names = [
        'A', 'B', 'C', 'mu', 'alpha', 'h**o', 'lumo', 'gap', 'r2', 'zpve',
        'U0', 'U', 'H', 'G', 'Cv'
    ]
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='Regression with QM9.')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp')
    parser.add_argument('--label', '-l', type=str, choices=label_names,
                        default='',
                        help='target label for regression, '
                        'empty string means to predict all '
                        'property at once')
    parser.add_argument('--scale', type=str, choices=scale_list,
                        default='standardize', help='Label scaling method')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    parser.add_argument('--protocol', type=int, default=2)
    parser.add_argument('--model-filename', type=str,
                        default='regressor.pkl')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='Number of data to be parsed from parser.'
                        '-1 indicates to parse all data.')
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_all'.format(method))
        class_num = len(D.get_qm9_label_names())

    # Dataset preparation
    dataset = None

    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    dataset_cache_path = os.path.join(cache_dir, dataset_filename)
    if os.path.exists(dataset_cache_path):
        print('load from cache {}'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()

        if num_data >= 0:
            # Only preprocess the first `num_data` samples (e.g. for debug).
            target_index = numpy.arange(num_data)
            dataset = D.get_qm9(preprocessor, labels=labels,
                                target_index=target_index)
        else:
            dataset = D.get_qm9(preprocessor, labels=labels)

        # The cache directory may already exist when a cache file for a
        # different `num_data` was written earlier, so only create it when
        # it is missing.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    if args.scale == 'standardize':
        # Standard Scaler for labels. The label column is replaced only in
        # this branch; without scaling the dataset must be left untouched
        # (`labels` holds the command-line label name, not label values).
        ss = StandardScaler()
        scaled_labels = ss.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] +
                                      (scaled_labels,)))
    else:
        ss = None

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    # Network
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    if method == 'nfp':
        print('Train NFP model...')
        model = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        model = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        # SchNet emits `class_num` outputs itself, hence no MLP head.
        model = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit,
                   n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        model = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels, hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer, n_atom=n_atom),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        model = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = I.SerialIterator(train, args.batchsize)
    val_iter = I.SerialIterator(val, args.batchsize,
                                repeat=False, shuffle=False)

    regressor = Regressor(
        model, lossfun=F.mean_squared_error,
        metrics_fun={'abs_error': ScaledAbsError(scale=args.scale, ss=ss)},
        device=args.gpu)

    optimizer = O.Adam()
    optimizer.setup(regressor)

    updater = training.StandardUpdater(
        train_iter, optimizer, device=args.gpu, converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(E.Evaluator(val_iter, regressor, device=args.gpu,
                               converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(E.PrintReport([
        'epoch', 'main/loss', 'main/abs_error', 'validation/main/loss',
        'validation/main/abs_error', 'elapsed_time']))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # --- save regressor & standardscaler ---
    protocol = args.protocol
    regressor.save_pickle(os.path.join(args.out, args.model_filename),
                          protocol=protocol)
    if args.scale == 'standardize':
        with open(os.path.join(args.out, 'ss.pkl'), mode='wb') as f:
            pickle.dump(ss, f, protocol=protocol)
def main():
    """Predict QM9 labels with a trained regressor and report the errors.

    The dataset is loaded from the cache (or preprocessed and cached), the
    pickled regressor is restored from ``args.in_dir``, and the test split
    is reproduced with ``args.train_data_ratio`` and ``args.seed``.
    Predictions and ground truth are printed, the evaluator result is saved
    to ``eval_result.json`` and the per-label mean absolute error to
    ``eval_result_mae.json``, both inside ``args.in_dir``.
    """
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        label = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, label))
        labels = [label]
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the amount
    # of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        if num_data >= 0:
            # Restrict preprocessing to the first `num_data` samples so the
            # contents of 'data_<num_data>.npz' match its name.
            target_index = numpy.arange(num_data)
            dataset = D.get_qm9(preprocessor, labels=labels,
                                target_index=target_index)
        else:
            dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset. `makedirs` is used because
        # the parent 'input' directory may not exist yet.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=args.gpu)
    scaler = regressor.predictor.scaler

    if scaler is not None:
        scaled_t = scaler.transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] +
                                      (scaled_t,)))

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        # Map the predictions back to the original label scale, if any.
        if scaler is not None:
            return scaler.inverse_transform(x)
        return x

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(test, converter=extract_inputs,
                               postprocess_fn=postprocess_fn)

    # Extract the ground-truth labels. Without a scaler the labels were never
    # transformed, so they are already in the original scale.
    t = concat_mols(test, device=-1)[-1]
    if scaler is not None:
        original_t = scaler.inverse_transform(t)
    else:
        original_t = t

    # Construct dataframe.
    df_dict = {}
    for i, label_name in enumerate(labels):
        df_dict['y_pred_{}'.format(label_name)] = y_pred[:, i]
        df_dict['t_{}'.format(label_name)] = original_t[:, i]
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10
    for target_label in range(y_pred.shape[1]):
        label_name = labels[target_label]
        diff = y_pred[:n_eval, target_label] - original_t[:n_eval,
                                                          target_label]
        print('label_name = {}, y_pred = {}, t = {}, diff = {}'.format(
            label_name, y_pred[:n_eval, target_label],
            original_t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, regressor, converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)
    # Save the evaluation results.
    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)

    # Calculate mean abs error for each label
    mae = numpy.mean(numpy.abs(y_pred - original_t), axis=0)
    eval_result = {}
    for i, label_name in enumerate(labels):
        eval_result[label_name] = mae[i]
    save_json(os.path.join(args.in_dir, 'eval_result_mae.json'), eval_result)
def train(hyperparams: Hyperparameter):
    """Train an ``AttentionNvpModel`` as described by ``hyperparams``.

    Reads the "dataset", "configuration", "train", "model" and "output"
    sub-parameter groups, sets up logging and the output directory, builds
    the train/validation iterators, runs the Chainer trainer and finally
    saves the model to ``output_params.final_model_name`` inside the output
    root directory.

    Args:
        hyperparams: Hyperparameter container providing ``subparams`` and
            ``save``.

    Raises:
        ValueError: If ``train_params.device`` is neither an ``int`` nor a
            ``dict``.
    """
    # -- hyperparams -- #
    dataset_params = hyperparams.subparams("dataset")
    config_params = hyperparams.subparams("configuration")
    train_params = hyperparams.subparams("train")
    model_params = hyperparams.subparams("model")
    output_params = hyperparams.subparams("output")

    os.makedirs(output_params.root_dir, exist_ok=True)
    if hasattr(output_params, "logname"):
        log.basicConfig(filename=os.path.join(output_params.root_dir,
                                              output_params.logname),
                        filemode="w",
                        level=get_log_level(output_params.log_level))
    else:
        log.basicConfig(level=get_log_level(output_params.log_level))
    hyperparams.save(os.path.join(output_params.root_dir, "hyperparams.json"))

    atomic_num_list = get_atomic_num_id(
        os.path.join(config_params.root_dir,
                     config_params.atom_id_to_atomic_num))

    # An int selects a single device; a dict selects data-parallel training
    # with its "main" entry as the main device.
    data_parallel = False
    if isinstance(train_params.device, int):
        main_device = train_params.device
        device = main_device
    elif isinstance(train_params.device, dict):
        main_device = train_params.device["main"]
        device = train_params.device
        data_parallel = True
    else:
        raise ValueError("Invalid device.")
    log.info("Main Device: {}".format(main_device))

    log.info("dataset hyperparameters:\n{}\n".format(dataset_params))
    log.info("configuration hyperparameters:\n{}\n".format(config_params))
    log.info("train hyperparameters:\n{}\n".format(train_params))
    log.info("model hyperparameters:\n{}\n".format(model_params))
    log.info("output hyperparameters:\n{}\n".format(output_params))

    # -- build dataset -- #
    if config_params.has("train_validation_split"):
        validation_idxs = get_validation_idxs(
            os.path.join(config_params.root_dir,
                         config_params.train_validation_split))
    else:
        validation_idxs = None

    dataset = NumpyTupleDataset.load(
        os.path.join(dataset_params.root_dir, dataset_params.name))
    if validation_idxs:
        # Build the lookup set once; testing membership against the list
        # inside the comprehension would be quadratic in the dataset size.
        held_out = set(validation_idxs)
        train_idxs = [i for i in range(len(dataset)) if i not in held_out]
        trainset_size = len(train_idxs)
        # The validation indices follow the training ones, so splitting at
        # `trainset_size` separates the two groups.
        train_idxs.extend(validation_idxs)
        trainset, valset = chainer.datasets.split_dataset(
            dataset, trainset_size, train_idxs)
    else:
        trainset, valset = chainer.datasets.split_dataset_random(
            dataset, int(len(dataset) * 0.8), seed=777)

    train_iter = chainer.iterators.SerialIterator(
        trainset, train_params.batch_size, shuffle=True)
    val_iter = chainer.iterators.SerialIterator(
        valset, train_params.batch_size, repeat=False, shuffle=False)

    # -- model -- #
    model = AttentionNvpModel(model_params)
    if isinstance(device, dict):
        log.info("Using multi-GPU {}".format(device))
        model.to_gpu(main_device)
    elif device >= 0:
        log.info("Using GPU {}".format(device))
        chainer.cuda.get_device(main_device).use()
        model.to_gpu(device)
    else:
        log.info("Using CPU")

    # -- training details -- #
    num_epoch = train_params.num_epoch
    opt_gen = get_optimizer(train_params.optimizer)
    if train_params.has("optimizer_params"):
        optimizer = opt_gen(**train_params.optimizer_params)
    else:
        optimizer = opt_gen()

    optimizer.setup(model)
    if data_parallel:
        updater = DataParallelNVPUpdater(
            train_iter, optimizer, devices=device,
            two_step=train_params.two_step,
            h_nll_weight=train_params.h_nll_weight)
    else:
        updater = NVPUpdater(
            train_iter, optimizer, device=device,
            two_step=train_params.two_step,
            h_nll_weight=train_params.h_nll_weight)
    trainer = training.Trainer(updater, (num_epoch, "epoch"),
                               out=output_params.root_dir)
    if train_params.has("save_epoch"):
        save_epoch = train_params.save_epoch
    else:
        # Without an explicit interval, snapshot once at the final epoch.
        save_epoch = num_epoch

    # -- evaluation function -- #
    def print_validity(trainer):
        """Generate molecules, check their validity and optionally save them.

        The generated valid molecules are written as PNG files only when the
        configured log level is DEBUG or lower.
        """
        with chainer.using_device(
                chainer.backends.cuda.get_device_from_id(main_device)), \
                chainer.using_config("train", False):
            save_mol = (get_log_level(output_params.log_level) <= log.DEBUG)
            x, adj = generate_mols(model, batch_size=100,
                                   device=main_device)  # x: atom id
            valid_mols = check_validity(
                x, adj, atomic_num_list=atomic_num_list, device=main_device)
            if save_mol:
                mol_dir = os.path.join(
                    output_params.root_dir, output_params.saved_mol_dir,
                    "generated_{}".format(trainer.updater.epoch))
                os.makedirs(mol_dir, exist_ok=True)
                for i, mol in enumerate(valid_mols["valid_mols"]):
                    save_mol_png(mol,
                                 os.path.join(mol_dir, "{}.png".format(i)))

    # -- trainer extension -- #
    trainer.extend(extensions.snapshot(), trigger=(save_epoch, "epoch"))
    trainer.extend(extensions.LogReport(filename=output_params.trainlogname))
    trainer.extend(print_validity, trigger=(1, "epoch"))
    trainer.extend(extensions.PrintReport([
        "epoch", "neg_log_likelihood", "nll_x", "nll_adj", "z_var",
        "ln_det_x", "ln_det_adj", "elapsed_time"]))
    trainer.extend(extensions.ProgressBar())

    # -- start train -- #
    if hasattr(train_params, "load_snapshot"):
        log.info("Load snapshot from {}".format(train_params.load_snapshot))
        chainer.serializers.load_npz(train_params.load_snapshot, trainer)
    trainer.run()

    chainer.serializers.save_npz(
        os.path.join(output_params.root_dir, output_params.final_model_name),
        model)
def main():
    """Train a MoleculeNet model, snapshotting the best validation score.

    The train/validation parts are read from the cache directory when both
    files exist, otherwise they come from ``download_entire_dataset``.
    Besides the regular training report, the model is snapshotted as
    ``best_val_<model file>`` whenever the monitored validation metric
    improves (RMSE or MAE for regression, ROC-AUC for classification). The
    final model is pickled into ``<args.out>/<cache directory name>/``.

    Raises:
        ValueError: If the dataset's task type is unsupported, or if a
            regression dataset defines neither an 'RMSE' nor an 'MAE'
            metric to monitor.
        TypeError: If a configured metric is neither a plain function nor a
            ``BatchEvaluator`` subclass.
    """
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data
    n_unit = args.unit_num
    conv_layers = args.conv_layers

    task_type = molnet_default_config[dataset_name]['task_type']
    model_filename = {'classification': 'classifier.pkl',
                      'regression': 'regressor.pkl'}

    print('Using dataset: {}...'.format(dataset_name))

    # Set up some useful variables that will be used later on.
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}_{}'.format(dataset_name,
                                                            method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_{}_all'.format(dataset_name,
                                                             method))
        class_num = len(molnet_default_config[args.dataset]['tasks'])

    # Load the train and validation parts of the dataset.
    filenames = [dataset_part_filename(p, num_data)
                 for p in ['train', 'valid']]

    paths = [os.path.join(cache_dir, f) for f in filenames]
    if all(os.path.exists(path) for path in paths):
        dataset_parts = []
        for path in paths:
            print('Loading cached dataset from {}.'.format(path))
            dataset_parts.append(NumpyTupleDataset.load(path))
    else:
        dataset_parts = download_entire_dataset(dataset_name, num_data,
                                                labels, method, cache_dir)
    train, valid = dataset_parts[0], dataset_parts[1]

    # TODO: label scaling for regression tasks (args.scale == 'standardize')
    # is not applied here yet.

    # Set up the predictor.
    predictor = set_up_predictor(method, n_unit, conv_layers, class_num)

    # Set up the iterators.
    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(valid, args.batchsize, repeat=False,
                                          shuffle=False)

    # Load metrics for the current dataset.
    metrics = molnet_default_config[dataset_name]['metrics']
    metrics_fun = {k: v for k, v in metrics.items()
                   if isinstance(v, types.FunctionType)}
    loss_fun = molnet_default_config[dataset_name]['loss']

    if task_type == 'regression':
        model = Regressor(predictor, lossfun=loss_fun,
                          metrics_fun=metrics_fun, device=args.gpu)
        # TODO: Use standard scaler for regression task
    elif task_type == 'classification':
        model = Classifier(predictor, lossfun=loss_fun,
                           metrics_fun=metrics_fun, device=args.gpu)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(model)

    # Save model-related output to this directory.
    model_dir = os.path.join(args.out, os.path.basename(cache_dir))
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu,
                                       converter=concat_mols)

    # Set up the trainer.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=model_dir)
    trainer.extend(E.Evaluator(valid_iter, model, device=args.gpu,
                               converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())

    # Report various metrics.
    print_report_targets = ['epoch', 'main/loss', 'validation/main/loss']
    for metric_name, metric_fun in metrics.items():
        if isinstance(metric_fun, types.FunctionType):
            print_report_targets.append('main/' + metric_name)
            print_report_targets.append('validation/main/' + metric_name)
        elif (isinstance(metric_fun, type)
              and issubclass(metric_fun, BatchEvaluator)):
            # `issubclass` itself raises on non-class values, so the class
            # check comes first and everything else reaches the error below.
            trainer.extend(metric_fun(valid_iter, model, device=args.gpu,
                                      eval_func=predictor,
                                      converter=concat_mols, name='val',
                                      raise_value_error=False))
            print_report_targets.append('val/main/' + metric_name)
        else:
            # Report the offending metric's type, not the metrics dict's.
            raise TypeError('{} is not a supported metrics function.'
                            .format(type(metric_fun)))
    print_report_targets.append('elapsed_time')

    # TODO: consider go/no-go of the following block
    # (i) more reporting for val/evaluation
    # (ii) best validation score snapshot
    best_val_filename = "best_val_" + model_filename[task_type]
    if task_type == 'regression':
        # Look the metric up among all configured metric names; the loop
        # variable above only holds whichever metric happened to come last.
        if 'RMSE' in metrics:
            best_trigger = training.triggers.MinValueTrigger(
                'validation/main/RMSE')
        elif 'MAE' in metrics:
            best_trigger = training.triggers.MinValueTrigger(
                'validation/main/MAE')
        else:
            # An explicit exception instead of `assert`, which is stripped
            # when Python runs with -O.
            raise ValueError("No validation metric defined?")
        trainer.extend(E.snapshot_object(model, best_val_filename),
                       trigger=best_trigger)
    else:
        # Only 'classification' can reach this point: any other task type
        # already raised ValueError when the model was constructed.
        train_eval_iter = iterators.SerialIterator(train, args.batchsize,
                                                   repeat=False,
                                                   shuffle=False)
        trainer.extend(ROCAUCEvaluator(
            train_eval_iter, predictor, eval_func=predictor,
            device=args.gpu, converter=concat_mols, name='train',
            pos_labels=1, ignore_labels=-1, raise_value_error=False))
        # extension name='validation' is already used by `Evaluator`,
        # instead extension name `val` is used.
        trainer.extend(ROCAUCEvaluator(
            valid_iter, predictor, eval_func=predictor,
            device=args.gpu, converter=concat_mols, name='val',
            pos_labels=1, ignore_labels=-1))
        # Avoid printing the same report column twice.
        for target in ('train/main/roc_auc', 'val/main/roc_auc'):
            if target not in print_report_targets:
                print_report_targets.append(target)

        trainer.extend(E.snapshot_object(model, best_val_filename),
                       trigger=training.triggers.MaxValueTrigger(
                           'val/main/roc_auc'))

    trainer.extend(E.PrintReport(print_report_targets))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the model's parameters.
    model_path = os.path.join(model_dir, model_filename[task_type])
    print('Saving the trained model to {}...'.format(model_path))
    model.save_pickle(model_path, protocol=args.protocol)
def train(config):
    """Train an ``AtomEmbedModel`` as described by ``config``.

    Reads the hyper-parameters through ``get_and_log``, loads the dataset,
    splits it into a training part and a held-out part, builds the model,
    optimizer, updater and trainer, runs the training loop and finally
    writes the model parameters to ``<out_dir>/final_embed_model.npz``.

    Args:
        config: Configuration object understood by ``get_and_log``.
            The keys ``dataset``, ``atom_id_to_atomic_num``, ``batch_size``,
            ``num_epoch``, ``embed_word_size``, ``molecule_size``,
            ``num_atom_type`` and ``layers`` are requested with
            ``required=True``; every other key falls back to the default
            passed to ``get_and_log`` below.
    """
    # -- read hyperparameters --
    # The call order is kept as-is because ``get_and_log`` presumably logs
    # each value as it is read (name-based assumption; confirm in helper).
    log.info("Hyper-parameters:")
    device = get_and_log(config, "device", -1)
    out_dir = get_and_log(config, "out_dir", "./output")
    config_dir = get_and_log(config, "config_dir", "./config")
    dataset_dir = get_and_log(config, "dataset_dir", "./dataset")
    validation_idxs_filepath = get_and_log(config, "train_validation_split")
    dataset_name = get_and_log(config, "dataset", required=True)
    # Not used by this function; still requested so the required-key check
    # (and any logging done by ``get_and_log``) continues to happen.
    get_and_log(config, "atom_id_to_atomic_num", required=True)
    batch_size = get_and_log(config, "batch_size", required=True)
    num_epoch = get_and_log(config, "num_epoch", required=True)
    word_size = get_and_log(config, "embed_word_size", required=True)
    # Same as above: requested only for its side effects.
    get_and_log(config, "molecule_size", required=True)
    num_atom_type = get_and_log(config, "num_atom_type", required=True)
    save_epoch = get_and_log(config, "save_epoch", -1)
    kekulized = get_and_log(config, "kekulize", False)
    layers = get_and_log(config, "layers", required=True)
    scale_adj = get_and_log(config, "scale_adj", True)
    log_name = get_and_log(config, "log_name", "log")
    optimizer_type = get_and_log(config, "optimizer", "adam")
    optimizer_params = get_and_log(config, "optimizer_params")
    snapshot = get_and_log(config, "snapshot")
    num_edge_type = 4 if kekulized else 5

    os.makedirs(out_dir, exist_ok=True)

    if validation_idxs_filepath is not None:
        validation_idxs = get_validation_idxs(
            os.path.join(config_dir, validation_idxs_filepath))
    else:
        validation_idxs = None

    # -- build dataset --
    dataset = NumpyTupleDataset.load(os.path.join(dataset_dir, dataset_name))
    if validation_idxs:
        # Build the lookup set once: testing membership against the raw
        # index sequence for every dataset index may cost O(N * M).
        held_out = set(validation_idxs)
        train_idxs = [i for i in range(len(dataset)) if i not in held_out]
        trainset_size = len(train_idxs)
        # Order = training indices first, then the held-out indices, so
        # that splitting at ``trainset_size`` separates the two groups.
        train_idxs.extend(validation_idxs)
        trainset, testset = chainer.datasets.split_dataset(
            dataset, trainset_size, train_idxs)
    else:
        # No explicit split given: fixed-seed random 80/20 split.
        trainset, testset = chainer.datasets.split_dataset_random(
            dataset, int(len(dataset) * 0.8), seed=777)

    train_iter = chainer.iterators.SerialIterator(
        trainset, batch_size, shuffle=True)
    test_iter = chainer.iterators.SerialIterator(
        testset, batch_size, repeat=False, shuffle=False)

    # -- model --
    model = AtomEmbedModel(word_size, num_atom_type, num_edge_type,
                           layers, scale_adj)
    model.save_hyperparameters(
        os.path.join(out_dir, "atom_embed_model_hyper.json"))

    # -- training details --
    if device >= 0:
        log.info("Using GPU")
        chainer.cuda.get_device(device).use()
        model.to_gpu(device)

    opt_func = get_optimizer(optimizer_type)
    if optimizer_params is not None:
        # NOTE(review): the parameters are passed as a single positional
        # argument; confirm that this is what ``get_optimizer`` results
        # expect (as opposed to ``**optimizer_params``).
        optimizer = opt_func(optimizer_params)
    else:
        optimizer = opt_func()

    optimizer.setup(model)
    updater = AtomEmbedUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (num_epoch, "epoch"), out=out_dir)

    # A non-positive ``save_epoch`` means "not configured": snapshot once at
    # the end of training. Zero is included because it is not a usable
    # interval period.
    save_epoch = save_epoch if save_epoch > 0 else num_epoch

    # -- trainer extension --
    # ``extensions.snapshot`` is a factory and has to be called; registering
    # the bare function would register the factory itself instead of a
    # snapshot extension, and no trainer snapshot would ever be written.
    trainer.extend(extensions.snapshot(), trigger=(save_epoch, "epoch"))
    trainer.extend(extensions.LogReport(filename=log_name))
    trainer.extend(AtomEmbedEvaluator(test_iter, model,
                                      reporter=trainer.reporter,
                                      device=device))
    trainer.extend(extensions.PrintReport(["epoch", "ce_loss", "accuracy",
                                           "validation/ce_loss",
                                           "validation/accuracy",
                                           "elapsed_time"]))
    trainer.extend(extensions.PlotReport(["ce_loss", "validation/ce_loss"],
                                         x_key="epoch",
                                         filename="cross_entrypy_loss.png"))
    trainer.extend(extensions.PlotReport(["accuracy", "validation/accuracy"],
                                         x_key="epoch",
                                         filename="accuracy.png"))

    if snapshot is not None:
        # Resume: restore the whole trainer state from the given snapshot.
        chainer.serializers.load_npz(snapshot, trainer)

    trainer.run()
    chainer.serializers.save_npz(
        os.path.join(out_dir, "final_embed_model.npz"), model)