def test_regression_censor_weights(dev, data_dir="test_chembl23", rm_output=True): rstr = random_str(12) output_dir = f"./{data_dir}/models-{rstr}/" cmd = (f"python train.py --x ./{data_dir}/chembl_23mini_x.npy" + f" --y_regr ./{data_dir}/chembl_23mini_y.npy" + f" --y_censor ./{data_dir}/chembl_23mini_y_censored.npy" + f" --weights_regr ./{data_dir}/chembl_23mini_regr_weights.csv" + f" --folding ./{data_dir}/chembl_23mini_folds.npy" + f" --batch_ratio 0.1" + f" --output_dir {output_dir}" + f" --hidden_sizes 20" + f" --epochs 2" + f" --lr 1e-3" + f" --lr_steps 3" + f" --dev {dev}" + f" --verbose 1") download_chembl23(data_dir) res = subprocess.run(cmd.split()) assert res.returncode == 0 conf_file = glob.glob(f"{output_dir}/*.json")[0] model_file = glob.glob(f"{output_dir}/*.pt")[0] results = sc.load_results(conf_file) assert "conf" in results assert "validation" in results assert results["validation"]["regression"].shape[0] > 0 if rm_output: shutil.rmtree(output_dir)
def test_regression(dev, data_dir="test_chembl23", rm_output=True): rstr = random_str(12) output_dir = f"./{data_dir}/models-{rstr}/" cmd = (f"python train.py --x ./{data_dir}/chembl_23mini_x.npy" + f" --y_regr ./{data_dir}/chembl_23mini_y.npy" + f" --folding ./{data_dir}/chembl_23mini_folds.npy" + f" --batch_ratio 0.1" + f" --output_dir {output_dir}" + f" --hidden_sizes 20" + f" --epochs 2" + f" --lr 1e-3" + f" --lr_steps 1" + f" --dev {dev}" + f" --verbose 1") download_chembl23(data_dir) res = subprocess.run(cmd.split()) assert res.returncode == 0 assert os.path.isdir(os.path.join(output_dir, "boards")) conf_file = glob.glob(f"{output_dir}/*.json")[0] model_file = glob.glob(f"{output_dir}/*.pt")[0] results = sc.load_results(conf_file) assert "conf" in results assert "validation" in results assert results["validation"]["regression"].shape[0] > 0 cmd_pred = (f"python predict.py --x ./{data_dir}/chembl_23mini_x.npy" + f" --outprefix {output_dir}/yhat" + f" --conf {conf_file}" + f" --model {model_file}" + f" --dev {dev}") res_pred = subprocess.run(cmd_pred.split()) assert res_pred.returncode == 0 yhat = np.load(f"{output_dir}/yhat-regr.npy") assert results["conf"].regr_output_size == yhat.shape[1] if rm_output: shutil.rmtree(output_dir)
vprint(args) if args.run_name is not None: name = args.run_name else: if args.hidden_sizes is not None: name = f"sc_{args.prefix}_h{'.'.join([str(h) for h in args.hidden_sizes])}_ldo{args.last_dropout:.1f}_wd{args.weight_decay}" name += f"_lr{args.lr}_lrsteps{'.'.join([str(s) for s in args.lr_steps])}_ep{args.epochs}" name += f"_fva{args.fold_va}_fte{args.fold_te}" else: name = f"sc_{args.prefix}_h_nohidden_ldo{args.last_dropout:.1f}_wd{args.weight_decay}" name += f"_lr{args.lr}_lrsteps{'.'.join([str(s) for s in args.lr_steps])}_ep{args.epochs}" name += f"_fva{args.fold_va}_fte{args.fold_te}" vprint(f"Run name is '{name}'.") conf = sc.load_results(args.conf, two_heads=True)["conf"] if conf.last_hidden_sizes is None: setattr(conf, "last_hidden_sizes", []) dev = args.dev net = sc.SparseFFN(conf).to(dev) state_dict = torch.load(args.model, map_location=torch.device(dev)) if conf.model_type == "federated": state_dict_new = OrderedDict() state_dict_new["net.0.net_freq.weight"] = state_dict["0.0.net_freq.weight"] state_dict_new["net.0.net_freq.bias"] = state_dict["0.0.net_freq.bias"] state_dict_new["net.2.net.2.weight"] = state_dict["1.net.2.weight"] state_dict_new["net.2.net.2.bias"] = state_dict["1.net.2.bias"] state_dict = state_dict_new net.load_state_dict(state_dict)
def predict(): parser = argparse.ArgumentParser(description="Using trained model to make predictions.") parser.add_argument("--x", help="Descriptor file (matrix market, .npy or .npz)", type=str, required=True) parser.add_argument("--y_class", "--y", "--y_classification", help="Sparse pattern file for classification, optional. If provided returns predictions for given locations only (matrix market, .npy or .npz)", type=str, default=None) parser.add_argument("--y_regr", "--y_regression", help="Sparse pattern file for regression, optional. If provided returns predictions for given locations only (matrix market, .npy or .npz)", type=str, default=None) parser.add_argument("--folding", help="Folds for rows of y, optional. Needed if only one fold should be predicted.", type=str, required=False) parser.add_argument("--predict_fold", help="One or more folds, integer(s). Needed if --folding is provided.", nargs="+", type=int, required=False) parser.add_argument("--outprefix", help="Prefix for output files, '-class.npy', '-regr.npy' will be appended.", type=str, required=True) parser.add_argument("--conf", help="Model conf file (.json or .npy)", type=str, required=True) parser.add_argument("--model", help="Pytorch model file (.pt)", type=str, required=True) parser.add_argument("--batch_size", help="Batch size (default 4000)", type=int, default=4000) parser.add_argument("--last_hidden", help="If set to 1 returns last hidden layer instead of Yhat", type=int, default=0) parser.add_argument("--dropout", help="If set to 1 enables dropout for evaluation", type=int, default=0) parser.add_argument("--inverse_normalization", help="If set to 1 enables inverse normalization given means and variances from config file", type=int, default=0) parser.add_argument("--weights_class", "--task_weights", "--weights_classification", help="CSV file with columns task_id, training_weight, aggregation_weight, task_type (for classification tasks)", type=str, default=None) parser.add_argument("--dev", help="Device to use (default cuda:0)", type=str, default="cuda:0") parser.add_argument("--num_workers", help="Number of workers for DataLoader", type=int, default=4) args = parser.parse_args() print(args) results_loaded = sc.load_results(args.conf, two_heads=True) conf = results_loaded["conf"] if args.inverse_normalization == 1: stats = results_loaded["stats"] x = sc.load_sparse(args.x) x = sc.fold_transform_inputs(x, folding_size=conf.fold_inputs, transform=conf.input_transform) print(f"Input dimension: {x.shape[1]}") print(f"#samples: {x.shape[0]}") ## error checks for --y_class, --y_regr, --folding and --predict_fold if args.last_hidden: assert args.y_class is None, "Cannot use '--last_hidden 1' with sparse predictions ('--y_class' or '--y_regr' is specified)." if args.y_class is None and args.y_regr is None: assert args.predict_fold is None, "To use '--predict_fold' please specify '--y_class' and/or '--y_regr'." assert args.folding is None, "To use '--folding' please specify '--y_class' and/or '--y_regr'." else: if args.predict_fold is None: assert args.folding is None, "If --folding is given please also specify --predict_fold." if args.folding is None: assert args.predict_fold is None, "If --predict_fold is given please also specify --folding." res = types.SimpleNamespace(task_id=None, training_weight=None, aggregation_weight=None, task_type=None, censored_weight=torch.FloatTensor(), cat_id=None) if args.weights_class is not None: tasks_class = pd.read_csv(args.weights_class) if "catalog_id" in tasks_class: res.cat_id = tasks_class.catalog_id.values tasks_cat_id_list = None select_cat_ids = None if res.cat_id is not None: tasks_cat_id_list = [[x,i] for i,x in enumerate(res.cat_id) if str(x) != 'nan'] tasks_cat_ids = [i for i,x in enumerate(res.cat_id) if str(x) != 'nan'] select_cat_ids = np.array(tasks_cat_ids) cat_id_size = len(tasks_cat_id_list) else: cat_id_size = 0 dev = args.dev net = sc.SparseFFN(conf).to(dev) state_dict = torch.load(args.model, map_location=torch.device(dev)) if conf.model_type == "federated": state_dict_new = OrderedDict() state_dict_new["net.0.net_freq.weight"] = state_dict["0.0.net_freq.weight"] state_dict_new["net.0.net_freq.bias"] = state_dict["0.0.net_freq.bias"] state_dict_new["net.2.net.2.weight"] = state_dict["1.net.2.weight"] state_dict_new["net.2.net.2.bias"] = state_dict["1.net.2.bias"] state_dict = state_dict_new net.load_state_dict(state_dict) print(f"Model weights: '{args.model}'") print(f"Model config: '{args.conf}'.") y_class = sc.load_check_sparse(args.y_class, (x.shape[0], conf.class_output_size)) y_regr = sc.load_check_sparse(args.y_regr, (x.shape[0], conf.regr_output_size)) if args.folding is not None: folding = np.load(args.folding) if args.folding else None assert folding.shape[0] == x.shape[0], f"Folding has {folding.shape[0]} rows and X has {x.shape[0]}. Must be equal." keep = np.isin(folding, args.predict_fold) y_class = sc.keep_row_data(y_class, keep) y_regr = sc.keep_row_data(y_regr, keep) dataset_te = sc.ClassRegrSparseDataset(x=x, y_class=y_class, y_regr=y_regr) loader_te = DataLoader(dataset_te, batch_size=args.batch_size, num_workers = args.num_workers, pin_memory=True, collate_fn=dataset_te.collate) if args.last_hidden: ## saving only hidden layer out = sc.predict_hidden(net, loader_te, dev=dev, dropout=args.dropout, progress=True) filename = f"{args.outprefix}-hidden.npy" np.save(filename, out.numpy()) print(f"Saved (numpy) matrix of hiddens to '{filename}'.") else: if args.y_class is None and args.y_regr is None: class_out, regr_out = sc.predict_dense(net, loader_te, dev=dev, dropout=args.dropout, progress=True, y_cat_columns=select_cat_ids) else: class_out, regr_out = sc.predict_sparse(net, loader_te, dev=dev, dropout=args.dropout, progress=True, y_cat_columns=select_cat_ids) if args.inverse_normalization == 1: regr_out = sc.inverse_normalization(regr_out, mean=np.array(stats["mean"]), variance=np.array(stats["var"]), array=True) if net.class_output_size > 0: np.save(f"{args.outprefix}-class.npy", class_out) print(f"Saved prediction matrix (numpy) for classification to '{args.outprefix}-class.npy'.") if net.regr_output_size > 0: np.save(f"{args.outprefix}-regr.npy", regr_out) print(f"Saved prediction matrix (numpy) for regression to '{args.outprefix}-regr.npy'.")
type=int, default=0) parser.add_argument("--dropout", help="If set to 1 enables dropout for evaluation", type=int, default=0) parser.add_argument("--dev", help="Device to use (default cuda:0)", type=str, default="cuda:0") args = parser.parse_args() print(args) conf = sc.load_results(args.conf)["conf"] ecfp = sc.load_sparse(args.x) if ecfp is None: parser.print_help() print("--x: Descriptor file must have suffix .mtx or .npy") sys.exit(1) if conf.fold_inputs is not None: ecfp = sc.fold_inputs(ecfp, folding_size=conf.fold_inputs) print(f"Folding inputs to {ecfp.shape[1]} dimensions.") ## error checks for --y, --folding and --predict_fold if args.last_hidden: assert args.y is None, "Cannot use '--last_hidden 1' with sparse predictions ('--y' is specified)." if args.y is None: assert args.predict_fold is None, "To use '--predict_fold' please specify '--y'."
def test_classification(dev, data_dir="test_chembl23", rm_output=True): rstr = random_str(12) output_dir = f"./{data_dir}/models-{rstr}/" cmd = (f"python train.py --x ./{data_dir}/chembl_23mini_x.npy" + f" --y_class ./{data_dir}/chembl_23mini_y.npy" + f" --folding ./{data_dir}/chembl_23mini_folds.npy" + f" --batch_ratio 0.1" + f" --output_dir {output_dir}" + f" --hidden_sizes 20" + f" --epochs 2" + f" --lr 1e-3" + f" --lr_steps 1" + f" --dev {dev}" + f" --verbose 1") download_chembl23(data_dir) res = subprocess.run(cmd.split()) assert res.returncode == 0 conf_file = glob.glob(f"{output_dir}/*.json")[0] model_file = glob.glob(f"{output_dir}/*.pt")[0] results = sc.load_results(conf_file) assert os.path.isdir(os.path.join(output_dir, "boards")) assert "conf" in results assert "validation" in results assert results["validation"]["classification"].shape[0] > 0 cmd_pred = (f"python predict.py --x ./{data_dir}/chembl_23mini_x.npy" + f" --outprefix {output_dir}/yhat" + f" --conf {conf_file}" + f" --model {model_file}" + f" --dev {dev}") res_pred = subprocess.run(cmd_pred.split()) assert res_pred.returncode == 0 yhat = np.load(f"{output_dir}/yhat-class.npy") assert results["conf"].class_output_size == yhat.shape[1] assert (yhat >= 0).all() assert (yhat <= 1).all() ## checking --last_hidden 1 cmd_hidden = (f"python predict.py --x ./{data_dir}/chembl_23mini_x.npy" + f" --outprefix {output_dir}/yhat" + f" --conf {conf_file}" + f" --model {model_file}" + f" --last_hidden 1" + f" --dev {dev}") res_hidden = subprocess.run(cmd_hidden.split()) assert res_hidden.returncode == 0 hidden = np.load(f"{output_dir}/yhat-hidden.npy") assert results["conf"].hidden_sizes[-1] == hidden.shape[1] ## sparse prediction cmd_sparse = (f"python predict.py --x ./{data_dir}/chembl_23mini_x.npy" + f" --y_class ./{data_dir}/chembl_23mini_y.npy" + f" --outprefix {output_dir}/yhat" + f" --conf {conf_file}" + f" --model {model_file}" + f" --dev {dev}") res_sparse = subprocess.run(cmd_sparse.split()) assert res_sparse.returncode == 0 ysparse = sc.load_sparse(f"{output_dir}/yhat-class.npy") ytrue = sc.load_sparse(f"./{data_dir}/chembl_23mini_y.npy") assert ytrue.shape == ysparse.shape assert type(ysparse) == scipy.sparse.csr.csr_matrix assert (ysparse.data >= 0).all() assert (ysparse.data <= 1).all() ytrue_nz = ytrue.nonzero() ysparse_nz = ysparse.nonzero() assert (ytrue_nz[0] == ysparse_nz[0]).all(), "incorrect sparsity pattern" assert (ytrue_nz[1] == ysparse_nz[1]).all(), "incorrect sparsity pattern" ## fold filtering cmd_folding = (f"python predict.py --x ./{data_dir}/chembl_23mini_x.npy" + f" --y_class ./{data_dir}/chembl_23mini_y.npy" + f" --folding ./{data_dir}/chembl_23mini_folds.npy" + f" --predict_fold 1 2" f" --outprefix {output_dir}/yhat" + f" --conf {conf_file}" + f" --model {model_file}" + f" --dev {dev}") res_folding = subprocess.run(cmd_folding.split()) assert res_folding.returncode == 0 yfolding = sc.load_sparse(f"{output_dir}/yhat-class.npy") ytrue = sc.load_sparse(f"./{data_dir}/chembl_23mini_y.npy") assert ytrue.shape == yfolding.shape assert type(yfolding) == scipy.sparse.csr.csr_matrix assert (yfolding.data >= 0).all() assert (yfolding.data <= 1).all() assert yfolding.nnz < ytrue.nnz if rm_output: shutil.rmtree(output_dir)