def test_concat_mols_2d_gpu(data_2d, data_2d_expect):
    result = concat_mols(data_2d, device=0)
    assert chainer.cuda.get_device_from_array(result[0]).id == 0
    assert chainer.cuda.get_device_from_array(result[1]).id == 0
    assert numpy.array_equal(chainer.cuda.to_cpu(result[0]), data_2d_expect[0])
    assert numpy.array_equal(chainer.cuda.to_cpu(result[1]), data_2d_expect[1])
def eval_func(atoms_1, adj_1, atoms_2, adj_2):
    sample = [
        (atoms_1, adj_1),
        (atoms_2, adj_2),
    ]
    sample = concat_mols(sample)
    atoms_1, adj_1 = sample[0]
    atoms_2, adj_2 = sample[1]
    print(atoms_1, adj_1)
    print('shape 1:', atoms_1.shape, adj_1.shape)
    print('shape 2:', atoms_2.shape, adj_2.shape)
    pred, _ = model.predictor.predict(atoms_1, adj_1, atoms_2, adj_2)
    return pred
def predict(self, *args, batchsize=32, device=-1):
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        self.to_gpu()  # Copy the model to the GPU
    # TODO: Not tested yet; check behavior.
    data = args[0]
    y_list = []
    for i in range(0, len(data), batchsize):
        atoms, adjs = concat_mols(data[i:i + batchsize], device=device)[:2]
        y = self._predict(atoms, adjs)
        y_list.append(cuda.to_cpu(y.data))
    y_array = numpy.concatenate(y_list, axis=0)
    return y_array
def main():
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        label = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, label))
        labels = [label]
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the
    # amount of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=args.gpu)
    scaler = regressor.predictor.scaler

    if scaler is not None:
        scaled_t = scaler.transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1]
                                      + (scaled_t, )))

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        if scaler is not None:
            scaled_x = scaler.inverse_transform(x)
            return scaled_x
        else:
            return x

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(test, converter=extract_inputs,
                               postprocess_fn=postprocess_fn)

    # Extract the ground-truth labels.
    t = concat_mols(test, device=-1)[-1]
    # Only undo the scaling when a scaler was actually used.
    original_t = scaler.inverse_transform(t) if scaler is not None else t

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred[:, i],
            't_{}'.format(l): original_t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10
    for target_label in range(y_pred.shape[1]):
        label_name = labels[target_label]
        diff = (y_pred[:n_eval, target_label]
                - original_t[:n_eval, target_label])
        print('label_name = {}, y_pred = {}, t = {}, diff = {}'.format(
            label_name, y_pred[:n_eval, target_label],
            original_t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, regressor, converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)

    # Save the evaluation results.
    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)

    # Calculate the mean absolute error for each label.
    mae = numpy.mean(numpy.abs(y_pred - original_t), axis=0)
    eval_result = {}
    for i, l in enumerate(labels):
        eval_result.update({l: mae[i]})
    save_json(os.path.join(args.in_dir, 'eval_result_mae.json'), eval_result)
def main():
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the
    # amount of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Load the standard scaler parameters, if necessary.
    if args.scale == 'standardize':
        scaler_path = os.path.join(args.in_dir, 'scaler.pkl')
        print('Loading scaler parameters from {}.'.format(scaler_path))
        with open(scaler_path, mode='rb') as f:
            scaler = pickle.load(f)
    else:
        print('No standard scaling was selected.')
        scaler = None

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=args.gpu)

    # Replace the default predictor with one that scales the output labels.
    scaled_predictor = ScaledGraphConvPredictor(regressor.predictor)
    scaled_predictor.scaler = scaler
    regressor.predictor = scaled_predictor

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(test, converter=extract_inputs)

    # Extract the ground-truth labels.
    t = concat_mols(test, device=-1)[-1]
    n_eval = 10

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred[:, i],
            't_{}'.format(l): t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    for target_label in range(y_pred.shape[1]):
        diff = y_pred[:n_eval, target_label] - t[:n_eval, target_label]
        print('target_label = {}, y_pred = {}, t = {}, diff = {}'.format(
            target_label, y_pred[:n_eval, target_label],
            t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, regressor, converter=concat_mols,
                            device=args.gpu)()

    # Prevents the loss function from becoming a cupy.core.core.ndarray
    # object when using the GPU. This hack will be removed as soon as the
    # cause of the issue is found and properly fixed.
    loss = numpy.asscalar(cuda.to_cpu(eval_result['main/loss']))
    eval_result['main/loss'] = loss
    print('Evaluation result: ', eval_result)

    # Save the evaluation results.
    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(eval_result, f)
        gwm=args.gwm)
    chainer.serializers.load_npz(args.g_action, g_action)
    # chainer.cuda.get_device_from_id(0).use()
    # g_stop.to_gpu()

    valid_raw = uspto_dataset.read_data(args.test_path)
    valid_dataset = uspto_dataset.USPTO_dataset(valid_raw)
    valid_iter = SerialIterator(valid_dataset, 20, repeat=False, shuffle=False)

    one_part_acc = []
    for batch in valid_iter:
        # Get one batch of test data.
        f_atoms, f_bonds, super_node_x, \
            atom_label, mask_reagents, mask_reactants_reagents, \
            pair_label, mask_pair_select, \
            action, step_num, \
            stop_idx, \
            sample_index = concat_mols(batch, device=-1)
        atom_label -= 1
        mask_reagents -= 2
        mask_reactants_reagents -= 2
        action -= 1
        with chainer.using_config('train', False):
            inference(g_stop, g_atom, g_pair, g_action,
                      f_atoms, f_bonds, super_node_x,
                      atom_label, mask_reagents, mask_reactants_reagents,
                      pair_label, mask_pair_select,
                      action, step_num, stop_idx, sample_index,
                      valid_raw, args.out)
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(
        description='Regression with own dataset.')
    parser.add_argument('--datafile', type=str, default='dataset.csv')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp')
    parser.add_argument('--label', '-l', nargs='+',
                        default=['value1', 'value2'],
                        help='target label for regression')
    parser.add_argument('--scale', type=str, choices=scale_list,
                        default='standardize', help='Label scaling method')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    parser.add_argument('--protocol', type=int, default=2)
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        sys.exit("Error: No target label is specified.")

    # Dataset preparation.
    # Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[method]()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    dataset = parser.parse(args.datafile)["dataset"]

    if args.scale == 'standardize':
        # Standard scaler for labels
        scaler = StandardScaler()
        labels = scaler.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1]
                                      + (labels, )))
    else:
        # Do not use a scaler
        scaler = None

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    # Network
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    if method == 'nfp':
        print('Train NFP model...')
        model = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        model = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        model = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit,
                   n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        model = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels, hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer, n_atom=n_atom),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        model = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = iterators.SerialIterator(train, args.batchsize)
    val_iter = iterators.SerialIterator(val, args.batchsize, repeat=False,
                                        shuffle=False)

    regressor = Regressor(
        model, lossfun=F.mean_squared_error,
        metrics_fun={'abs_error': ScaledAbsError(scaler=scaler)},
        device=args.gpu)

    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(
        E.Evaluator(val_iter, regressor, device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    # Note that original-scale absolute errors are reported in
    # (validation/)main/abs_error.
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/abs_error', 'validation/main/loss',
            'validation/main/abs_error', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # --- save regressor's parameters ---
    protocol = args.protocol
    model_path = os.path.join(args.out, 'model.npz')
    print('saving trained model to {}'.format(model_path))
    serializers.save_npz(model_path, regressor)
    if scaler is not None:
        with open(os.path.join(args.out, 'scaler.pkl'), mode='wb') as f:
            pickle.dump(scaler, f, protocol=protocol)

    # Example of prediction using the trained model
    smiles = 'c1ccccc1'
    mol = Chem.MolFromSmiles(smiles)
    preprocessor = preprocess_method_dict[method]()
    standardized_smiles, mol = preprocessor.prepare_smiles_and_mol(mol)
    input_features = preprocessor.get_input_features(mol)
    atoms, adjs = concat_mols([input_features], device=args.gpu)
    prediction = model(atoms, adjs).data[0]
    if scaler is not None:
        prediction = scaler.inverse_transform(prediction)
    print('Prediction for {}:'.format(smiles))
    for i, label in enumerate(args.label):
        print('{}: {}'.format(label, prediction[i]))
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    label_names = [
        'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve',
        'U0', 'U', 'H', 'G', 'Cv'
    ]
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='Regression with QM9.')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp')
    parser.add_argument('--label', '-l', type=str, choices=label_names,
                        default='',
                        help='target label for regression; '
                             'an empty string means to predict all '
                             'properties at once')
    parser.add_argument('--scale', type=str, choices=scale_list,
                        default='standardize', help='Label scaling method')
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--in-dir', '-i', type=str, default='result')
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    parser.add_argument('--model-filename', type=str, default='regressor.pkl')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='Number of data to be parsed from the parser. '
                             '-1 indicates to parse all data.')
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        # class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))
        # class_num = len(labels)

    # Dataset preparation
    dataset = None
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    dataset_cache_path = os.path.join(cache_dir, dataset_filename)
    if os.path.exists(dataset_cache_path):
        print('load from cache {}'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    if args.scale == 'standardize':
        # Standard scaler for labels
        with open(os.path.join(args.in_dir, 'ss.pkl'), mode='rb') as f:
            ss = pickle.load(f)
    else:
        ss = None

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    regressor = Regressor.load_pickle(
        os.path.join(args.in_dir, args.model_filename),
        device=args.gpu)  # type: Regressor

    # We need to feed only input features `x` to `predict`/`predict_proba`.
    # This converter extracts only inputs (x1, x2, ...) from the features
    # which consist of input `x` and label `t` (x1, x2, ..., t).
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        if ss is not None:
            # The model's output is scaled by StandardScaler,
            # so we need to rescale it back.
            if isinstance(x, Variable):
                x = x.data
            scaled_x = ss.inverse_transform(cuda.to_cpu(x))
            return scaled_x
        else:
            return x

    print('Predicting...')
    y_pred = regressor.predict(val, converter=extract_inputs,
                               postprocess_fn=postprocess_fn)
    print('y_pred.shape = {}, y_pred[:5, 0] = {}'.format(
        y_pred.shape, y_pred[:5, 0]))

    t = concat_mols(val, device=-1)[-1]
    n_eval = 10

    # Construct dataframe
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred[:, i],
            't_{}'.format(l): t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table for 5 random examples
    print(df.sample(5))

    for target_label in range(y_pred.shape[1]):
        diff = y_pred[:n_eval, target_label] - t[:n_eval, target_label]
        print('target_label = {}, y_pred = {}, t = {}, diff = {}'.format(
            target_label, y_pred[:n_eval, target_label],
            t[:n_eval, target_label], diff))

    # --- evaluate ---
    # To calculate loss/accuracy, we can use `Evaluator` or `ROCAUCEvaluator`.
    print('Evaluating...')
    val_iterator = SerialIterator(val, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(val_iterator, regressor, converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(
        description='Regression with own dataset.')
    parser.add_argument('datafile', type=str)
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp')
    parser.add_argument('--label', '-l', nargs='+',
                        help='target label for regression')
    parser.add_argument('--scale', type=str, choices=scale_list,
                        default='standardize', help='Label scaling method')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        sys.exit("Error: No target label is specified.")

    # Dataset preparation.
    # Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[method]()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    dataset = parser.parse(args.datafile)["dataset"]

    if args.scale == 'standardize':
        # Standard scaler for labels
        ss = StandardScaler()
        labels = ss.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1]
                                      + (labels,)))

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    # Network
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    if method == 'nfp':
        print('Train NFP model...')
        model = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        model = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        model = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit,
                   n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        model = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels, hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer, n_atom=n_atom),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        model = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = I.SerialIterator(train, args.batchsize)
    val_iter = I.SerialIterator(val, args.batchsize, repeat=False,
                                shuffle=False)

    def scaled_abs_error(x0, x1):
        if isinstance(x0, Variable):
            x0 = cuda.to_cpu(x0.data)
        if isinstance(x1, Variable):
            x1 = cuda.to_cpu(x1.data)
        if args.scale == 'standardize':
            scaled_x0 = ss.inverse_transform(cuda.to_cpu(x0))
            scaled_x1 = ss.inverse_transform(cuda.to_cpu(x1))
            diff = scaled_x0 - scaled_x1
        elif args.scale == 'none':
            diff = cuda.to_cpu(x0) - cuda.to_cpu(x1)
        return numpy.mean(numpy.absolute(diff), axis=0)[0]

    classifier = L.Classifier(model, lossfun=F.mean_squared_error,
                              accfun=scaled_abs_error)

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        classifier.to_gpu()

    optimizer = O.Adam()
    optimizer.setup(classifier)

    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(E.Evaluator(val_iter, classifier, device=args.gpu,
                               converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    # Note that scaled errors are reported as (validation/)main/accuracy.
    trainer.extend(E.PrintReport(['epoch', 'main/loss', 'main/accuracy',
                                  'validation/main/loss',
                                  'validation/main/accuracy',
                                  'elapsed_time']))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Example of prediction using the trained model
    smiles = 'c1ccccc1'
    mol = Chem.MolFromSmiles(smiles)
    preprocessor = preprocess_method_dict[method]()
    standardized_smiles, mol = preprocessor.prepare_smiles_and_mol(mol)
    input_features = preprocessor.get_input_features(mol)
    atoms, adjs = concat_mols([input_features], device=args.gpu)
    prediction = model(atoms, adjs).data[0]
    print('Prediction for {}:'.format(smiles))
    for i, label in enumerate(args.label):
        print('{}: {}'.format(label, prediction[i]))
def main():
    # Parse the arguments.
    args = parse_arguments()
    device = args.gpu

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        label = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, label))
        labels = [label]
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the
    # amount of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=device)
    scaler = regressor.predictor.scaler

    if scaler is not None:
        original_t = dataset.get_datasets()[-1]
        if args.gpu >= 0:
            scaled_t = cuda.to_cpu(scaler.transform(
                cuda.to_gpu(original_t)))
        else:
            scaled_t = scaler.transform(original_t)

        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1]
                                      + (scaled_t,)))

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        if scaler is not None:
            scaled_x = scaler.inverse_transform(x)
            return scaled_x
        else:
            return x

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(
        test, converter=extract_inputs, postprocess_fn=postprocess_fn)

    # Extract the ground-truth labels.
    t = concat_mols(test, device=device)[-1]
    # Only undo the scaling when a scaler was actually used.
    if scaler is not None:
        original_t = cuda.to_cpu(scaler.inverse_transform(t))
    else:
        original_t = cuda.to_cpu(t)

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({'y_pred_{}'.format(l): y_pred[:, i],
                        't_{}'.format(l): original_t[:, i], })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10
    for target_label in range(y_pred.shape[1]):
        label_name = labels[target_label]
        diff = (y_pred[:n_eval, target_label]
                - original_t[:n_eval, target_label])
        print('label_name = {}, y_pred = {}, t = {}, diff = {}'
              .format(label_name, y_pred[:n_eval, target_label],
                      original_t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, regressor, converter=concat_mols,
                            device=device)()
    print('Evaluation result: ', eval_result)

    # Save the evaluation results.
    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)

    # Calculate the mean absolute error for each label.
    mae = numpy.mean(numpy.abs(y_pred - original_t), axis=0)
    eval_result = {}
    for i, l in enumerate(labels):
        eval_result.update({l: mae[i]})
    save_json(os.path.join(args.in_dir, 'eval_result_mae.json'), eval_result)
def extract_inputs(batch, device=None):
    return concat_mols(batch, device=device)[:-1]
def test_concat_mols_2d_cpu(data_2d, data_2d_expect):
    result = concat_mols(data_2d, device=-1)
    assert numpy.array_equal(result[0], data_2d_expect[0])
    assert numpy.array_equal(result[1], data_2d_expect[1])
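# A minimal sketch (the arrays below are made up and are not the pytest
# fixtures used above) of the padding behaviour these tests rely on:
# concat_mols zero-pads variable-length per-molecule arrays so they can be
# stacked into one batch array.
import numpy
from chainer_chemistry.dataset.converters import concat_mols

atoms_a = numpy.array([1, 2, 3], dtype=numpy.int32)  # 3-atom molecule
atoms_b = numpy.array([4, 5], dtype=numpy.int32)     # 2-atom molecule
batch = concat_mols([(atoms_a,), (atoms_b,)], device=-1)
print(batch[0])  # expected: [[1 2 3], [4 5 0]] -- shorter entry zero-padded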
        h_s = self.embed_super(super_node)
        for step in range(self.n_update_layers):
            h_new = self.update_layers[step](h=h, adj=adj)
            h_new, h_s = self.gwm(h, h_new, h_s, step)
            h = h_new
        return h

    def reset_state(self):
        if hasattr(self.update_layers[0], 'reset_state'):
            [update_layer.reset_state()
             for update_layer in self.update_layers]
        self.gwm.reset_state()


if __name__ == '__main__':
    import uspto_pre
    from chainer.iterators import SerialIterator
    from chainer_chemistry.dataset.converters import concat_mols

    train_raw = uspto_pre.read_data('../train.txt.proc')
    train_dataset = uspto_pre.USPTO_pre(train_raw[:100], 'softmax')
    train_iter = SerialIterator(train_dataset, 3)
    model = ggnn_gwm()
    for b in train_iter:
        atom_feature, adjs, supernode_feature, label, ind = concat_mols(
            b, padding=-1)
        print(model(atom_feature, adjs, supernode_feature))
def converter(batch, device):
    return concat_mols(batch, device)[:-1]
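# Hypothetical usage sketch (the arrays below are made up, not from the
# original code): concat_mols returns (atoms, adjs, labels) for a batch of
# (atoms, adj, label) examples, and the converter drops the trailing label
# array so that only the model inputs are passed on.
import numpy

atoms = numpy.zeros((3,), dtype=numpy.int32)
adj = numpy.zeros((3, 3), dtype=numpy.float32)
label = numpy.zeros((1,), dtype=numpy.float32)
batch = [(atoms, adj, label), (atoms, adj, label)]
inputs = converter(batch, device=-1)  # -> (batched_atoms, batched_adjs)
assert len(inputs) == 2               # the label array has been dropped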
def train(gpu, method, epoch, batchsize, n_unit, conv_layers, dataset, smiles,
          M, n_split, split_idx, order):
    n = len(dataset)
    assert len(order) == n
    left_idx = (n // n_split) * split_idx
    is_right_most_split = (n_split == split_idx + 1)
    if is_right_most_split:
        test_order = order[left_idx:]
        train_order = order[:left_idx]
    else:
        right_idx = (n // n_split) * (split_idx + 1)
        test_order = order[left_idx:right_idx]
        train_order = np.concatenate([order[:left_idx], order[right_idx:]])

    new_order = np.concatenate([train_order, test_order])
    n_train = len(train_order)

    # Standard scaler for labels
    ss = StandardScaler()
    labels = dataset.get_datasets()[-1]
    train_label = labels[new_order[:n_train]]
    ss = ss.fit(train_label)  # fit only by train
    labels = ss.transform(dataset.get_datasets()[-1])
    dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] + (labels, )))

    dataset_train = SubDataset(dataset, 0, n_train, new_order)
    dataset_test = SubDataset(dataset, n_train, n, new_order)

    # Network
    model = predictor.build_predictor(method, n_unit, conv_layers, 1,
                                      dropout_ratio=0.25, n_layers=1)

    train_iter = I.SerialIterator(dataset_train, batchsize)
    val_iter = I.SerialIterator(dataset_test, batchsize, repeat=False,
                                shuffle=False)

    def scaled_abs_error(x0, x1):
        if isinstance(x0, Variable):
            x0 = cuda.to_cpu(x0.data)
        if isinstance(x1, Variable):
            x1 = cuda.to_cpu(x1.data)
        scaled_x0 = ss.inverse_transform(cuda.to_cpu(x0))
        scaled_x1 = ss.inverse_transform(cuda.to_cpu(x1))
        diff = scaled_x0 - scaled_x1
        return np.mean(np.absolute(diff), axis=0)[0]

    regressor = Regressor(model, lossfun=F.mean_squared_error,
                          metrics_fun={'abs_error': scaled_abs_error},
                          device=gpu)

    optimizer = O.Adam(alpha=0.0005)
    optimizer.setup(regressor)

    updater = training.StandardUpdater(train_iter, optimizer, device=gpu,
                                       converter=concat_mols)

    dir_path = get_dir_path(batchsize, n_unit, conv_layers, M, method)
    dir_path = os.path.join(dir_path, str(split_idx) + "-" + str(n_split))
    os.makedirs(dir_path, exist_ok=True)
    print('creating ', dir_path)
    np.save(os.path.join(dir_path, "test_idx"), np.array(test_order))

    trainer = training.Trainer(updater, (epoch, 'epoch'), out=dir_path)
    trainer.extend(
        E.Evaluator(val_iter, regressor, device=gpu, converter=concat_mols))
    trainer.extend(E.LogReport())
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/abs_error', 'validation/main/loss',
            'validation/main/abs_error', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # --- Plot regression evaluation result ---
    dataset_test = SubDataset(dataset, n_train, n, new_order)
    batch_all = concat_mols(dataset_test, device=gpu)
    serializers.save_npz(os.path.join(dir_path, "model.npz"), model)
    result = model(batch_all[0], batch_all[1])
    result = ss.inverse_transform(cuda.to_cpu(result.data))
    answer = ss.inverse_transform(cuda.to_cpu(batch_all[2]))
    plot_result(result, answer,
                save_filepath=os.path.join(dir_path, "result.png"))
    # --- Plot regression evaluation result end ---

    np.save(os.path.join(dir_path, "output.npy"), result)
    np.save(os.path.join(dir_path, "answer.npy"), answer)
    smiles_part = np.array(smiles)[test_order]
    np.save(os.path.join(dir_path, "smiles.npy"), smiles_part)

    # Calculate saliency and save it.
    save_result(dataset, model, dir_path, M)
def main(input_args=None):
    # Parse the arguments.
    args = parse_arguments(input_args)
    device = args.gpu
    method = args.method

    if args.data_name == 'suzuki':
        datafile = 'data/suzuki_type_test_v2.csv'
        class_num = 119
        class_dict = {'M': 28, 'L': 23, 'B': 35, 'S': 10, 'A': 17}
        dataset_filename = 'test_data.npz'
        labels = ['Yield', 'M', 'L', 'B', 'S', 'A', 'id']
    elif args.data_name == 'CN':
        datafile = 'data/CN_coupling_test.csv'
        class_num = 206
        class_dict = {'M': 44, 'L': 47, 'B': 13, 'S': 22, 'A': 74}
        dataset_filename = 'test_CN_data.npz'
        labels = ['Yield', 'M', 'L', 'B', 'S', 'A', 'id']
    elif args.data_name == 'Negishi':
        datafile = 'data/Negishi_test.csv'
        class_num = 106
        class_dict = {'M': 32, 'L': 20, 'T': 8, 'S': 10, 'A': 30}
        dataset_filename = 'test_Negishi_data.npz'
        labels = ['Yield', 'M', 'L', 'T', 'S', 'A', 'id']
    elif args.data_name == 'PKR':
        datafile = 'data/PKR_test.csv'
        class_num = 83
        class_dict = {
            'M': 18, 'L': 6, 'T': 7, 'S': 15, 'A': 11, 'G': 1, 'O': 13,
            'P': 4, 'other': 1
        }
        dataset_filename = 'test_PKR_data.npz'
        labels = [
            'Yield', 'M', 'L', 'T', 'S', 'A', 'G', 'O', 'P', 'other', 'id'
        ]
    else:
        raise ValueError('Unexpected dataset name')

    cache_dir = os.path.join('input', '{}_all'.format(method))

    # Dataset preparation.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached dataset from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        if args.method == 'mpnn':
            preprocessor = preprocess_method_dict['ggnn']()
        else:
            preprocessor = preprocess_method_dict[args.method]()
        parser = CSVFileParser(
            preprocessor,
            postprocess_label=postprocess_label,
            labels=labels,
            smiles_col=['Reactant1', 'Reactant2', 'Product'],
            label_dicts=class_dict)
        dataset = parser.parse(datafile)['dataset']

        # Cache the loaded dataset.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    labels = dataset.get_datasets()[-2]
    ids = dataset.get_datasets()[-1][:, 1].reshape(-1, 1)
    yields = dataset.get_datasets()[-1][:, 0].reshape(-1, 1).astype(
        'float32')  # [:, 0] added
    dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-2]
                                  + (yields, labels,)))

    # Load the standard scaler parameters, if necessary.
    scaler = None
    test = dataset

    print('Predicting...')
    # Set up the classifier.
    model_path = os.path.join(args.in_dir, args.model_filename)
    if os.path.exists(model_path):
        classifier = Classifier.load_pickle(model_path, device=args.gpu)
    else:
        predictor = set_up_predictor(args.method, args.unit_num,
                                     args.conv_layers, class_num)
        classifier = Classifier(predictor,
                                lossfun=F.sigmoid_cross_entropy,
                                metrics_fun=F.binary_accuracy,
                                device=args.gpu)
    if args.load_modelname:
        serializers.load_npz(args.load_modelname, classifier)
    scaled_predictor = ScaledGraphConvPredictor(
        graph_conv=classifier.predictor.graph_conv,
        mlp=classifier.predictor.mlp)
    classifier.predictor = scaled_predictor

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    # Predict the output labels.
    # TODO: rewrite the prediction function.
    y_pred = classifier.predict(test, converter=extract_inputs)
    y_pred_max = numpy.argmax(y_pred, axis=1)
    y_pred_max = y_pred_max.reshape(-1, 1)
    # y_pred_idx = y_pred.argsort(axis=1)  # ascending

    # Extract the ground-truth labels.
    t = concat_mols(test, device=-1)[-1]  # device=-1: 11/14 memory issue
    original_t = cuda.to_cpu(t)
    t_idx = original_t.squeeze(1)
    t_idx = t_idx.argsort(axis=1)
    # gt_indx = numpy.where(original_t == 1)

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels[:1]):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred_max[:, -1].tolist(),  # [:, -1]
            't_{}'.format(l): t_idx[:, -1].tolist(),
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10
    for target_label in range(y_pred_max.shape[1]):
        label_name = labels[:1][0][target_label]
        print('label_name = {}, y_pred = {}, t = {}'.format(
            label_name, y_pred_max[:n_eval, target_label],
            t_idx[:n_eval, -1]))

    # Perform the evaluation.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, classifier, converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)

    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(eval_result, f)

    res_dic = {}
    for i in range(len(y_pred)):
        res_dic[i] = str(ids[i])
    json.dump(res_dic, open(os.path.join(args.in_dir, "test_ids.json"), "w"))

    pickle.dump(y_pred, open(os.path.join(args.in_dir, "pred.pkl"), "wb"))
    pickle.dump(original_t, open(os.path.join(args.in_dir, "gt.pkl"), "wb"))
def inference():
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--batch_size', type=int, default=16)
    # parser.add_argument('--epoch', type=int, default=10)
    # parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--gpu', action='store_true')
    parser.add_argument('--out', default='result_debug/')
    # parser.add_argument('--frequency', type=int, default=-1)
    # parser.add_argument('--decay_iter', type=int, default=40000)
    parser.add_argument('--gnn_dim', type=int, default=300)
    parser.add_argument('--n_layers', type=int, default=3)
    parser.add_argument('--nn_dim', type=int, default=100)
    # parser.add_argument('--train_path', default='../train.txt.proc')
    parser.add_argument('--valid_path', default='../test.txt.proc')
    parser.add_argument('--communicator', type=str, default='pure_nccl')
    parser.add_argument('--type', default='debug')
    parser.add_argument('--rich', type=strtobool, default='false')
    parser.add_argument('--snapshot')
    args = parser.parse_args()

    assert args.type in ['debug', 'all']

    if args.gpu:
        comm = chainermn.create_communicator(args.communicator)
        device = comm.intra_rank
    else:
        comm = chainermn.create_communicator('naive')
        device = -1
    if comm.rank == 0:
        print(glob.glob(args.out + 'snapshot_*'))

    model = pair_matrix_model(gnn_dim=args.gnn_dim, n_layers=args.n_layers,
                              nn_dim=args.nn_dim)
    chainer.serializers.load_npz(args.out + args.snapshot, model)
    if device > 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    valid_raw = uspto_pre.read_data(args.valid_path)
    if comm.rank == 0:
        if args.type == 'debug':
            valid_dataset = uspto_pre.USPTO_pre(valid_raw[:40], args.rich)
        elif args.type == 'all':
            valid_dataset = uspto_pre.USPTO_pre(valid_raw, args.rich)
    else:
        valid_dataset = None, None
    valid_dataset = chainermn.scatter_dataset(valid_dataset, comm,
                                              shuffle=False)
    valid_iter = SerialIterator(valid_dataset, args.batch_size, repeat=False,
                                shuffle=False)

    for batch in valid_iter:
        in_arrays = concat_mols(batch=batch, device=device, padding=-1)
        assert isinstance(in_arrays, tuple)
        with chainer.using_config('train', False):
            h = model.ggnn_gwm(in_arrays[0], in_arrays[1], in_arrays[2])
        ind = in_arrays[4]
        adj = in_arrays[1][:, :4, :, :]
        batch_size = h.shape[0]
        graph_size = h.shape[1]
        hidden_size = h.shape[2]
        h = h.reshape(batch_size, 1, -1, hidden_size) + h.reshape(
            batch_size, -1, 1, hidden_size)
        h = h.reshape(batch_size, -1, hidden_size)
        h1 = model.nn1(h)
        h1 = h1.reshape(batch_size, graph_size, graph_size, 5)
        for x in range(batch_size):
            index = ind[x]
            action_pred = []
            for i in range(graph_size):
                for j in range(i + 1, graph_size):
                    if adj[x, 0, i, j] == -1:
                        break
                    else:
                        bt_pred = h1[x, i, j].data.argmax()
                        if bt_pred == 0:
                            if adj[x, :, i, j].sum() != 0:
                                action_pred.append(
                                    str(i) + '-' + str(j) + '-' + str(bt_pred))
                        else:
                            if adj[x, bt_pred - 1, i, j] == 0:
                                action_pred.append(
                                    str(i) + '-' + str(j) + '-' + str(bt_pred))
            with open(args.out + 'inf_' + args.snapshot + '.txt',
                      'a') as file:
                file.write(str(index))
                for a_p in action_pred:
                    file.write('\t' + a_p)
                file.write('\n')