import json
import os

import numpy as np

# `read_csv`, `write_csv`, `bash_command`, and `prop_split` are assumed to be
# helper functions defined elsewhere in this repo (csv/shell utilities and the
# proportional splitter used below).


def parse_csv(pred_path, true_path, target):
    """
    Get the lists of predicted and real values from csv files. Running
    `predict.sh` on the results of a ChemProp calculation produces a csv
    file with the predictions of each ChemProp fold and a JSON file that
    summarizes the predictions of each fold.

    Args:
        pred_path (str): path to the csv with predicted values
        true_path (str): path to the csv with real values
        target (str): name of the property you're predicting
    Returns:
        pred (list[np.array]): the predictions of this model, given as a
            list of length 1 that contains an array of length
            `num_species` (number of species). Given in this way to be
            consistent with `parse_json` below.
        real (list[np.array]): same as `pred` but with the real values.
    """

    pred_dic = read_csv(pred_path)
    pred = np.array(pred_dic[target])

    real_dic = read_csv(true_path)
    real = np.array(real_dic[target])

    return [pred], [real]
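
# A minimal usage sketch for `parse_csv`. The file names and the target name
# "bind" are hypothetical; substitute the outputs of your own ChemProp run.
def _example_parse_csv():
    pred, real = parse_csv(pred_path="model_dir/test_pred_0.csv",
                           true_path="data_dir/test_full.csv",
                           target="bind")
    # each is a list of length 1 holding an array of length `num_species`
    print(len(pred[0]), len(real[0]))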
def predict(cp_folder, test_path, cp_model_path, device, check_paths):
    """
    Get and save the prediction results from a ChemProp model.

    Args:
        cp_folder (str): path to the chemprop folder on your computer
        test_path (str): path to the file with the test SMILES and their
            properties
        cp_model_path (str): path to the folder with the model of interest
        device (Union[str, int]): device to evaluate the model on
        check_paths (list[str]): paths to the different model checkpoints
    Returns:
        real (dict): dictionary of the form {prop: real}, where `real`
            holds the real values of the property `prop`.
        preds (list[dict]): same as `real` but for predicted values. One
            dictionary for each model checkpoint.
    """

    script = os.path.join(cp_folder, "predict.py")
    preds_path = os.path.join(cp_model_path, "test_pred.csv")

    # load the arguments from that model to get the features path
    args_path = f"{cp_model_path}/fold_0/args.json"
    if not os.path.isfile(args_path):
        args_path = args_path.replace("fold_0/", "")
    with open(args_path, "r") as f:
        args = json.load(f)
    features_path = args["separate_test_features_path"]

    # predictions from different models
    preds = []

    for i, check_path in enumerate(check_paths):

        # make the chemprop command
        this_path = preds_path.replace(".csv", f"_{i}.csv")
        cmd = (f"source activate chemprop && python {script} "
               f"--test_path {test_path} --preds_path {this_path} "
               f"--checkpoint_paths {check_path} ")

        if device == "cpu":
            cmd += " --no_cuda"
        else:
            cmd += f" --gpu {device} "

        if features_path is not None:
            feat_str = " ".join(features_path)
            cmd += f" --features_path {feat_str}"

        p = bash_command(cmd)
        p.wait()

        pred = read_csv(this_path)
        preds.append(pred)

    real = read_csv(test_path)

    return real, preds
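
# Usage sketch for `predict` (hypothetical paths; assumes a chemprop checkout
# and a conda environment named "chemprop", so that `source activate chemprop`
# succeeds inside `bash_command`):
def _example_predict():
    real, preds = predict(cp_folder="/path/to/chemprop",
                          test_path="data_dir/test_full.csv",
                          cp_model_path="model_dir",
                          device=0,
                          check_paths=["model_dir/fold_0/model_0/model.pt"])
    # `real` is one dict of true values; `preds` has one dict per checkpoint
    print(len(preds))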
def get_splits(sample_dic, csv_folder):
    """
    Figure out which split (train, val or test) each SMILES in `sample_dic`
    belongs to.

    Args:
        sample_dic (dict): sample of `summary_dic` that is used in this
            combined dataset. `summary_dic` contains information about all
            smiles strings we have, except for their conformers.
        csv_folder (str): path to the folder that contains the csv files
            with the test/val/train smiles.
    Returns:
        sample_dic (dict): `sample_dic`, but with each sub-dictionary
            updated to contain the split assignment of the SMILES.
    """

    for name in ["train", "val", "test"]:
        path = os.path.join(csv_folder, f"{name}_full.csv")
        csv_dic = read_csv(path)
        for i, smiles in enumerate(csv_dic["smiles"]):
            # add any properties present in the csv
            props = {key: csv_dic[key][i] for key in csv_dic.keys()
                     if key != "smiles"}
            sample_dic[smiles].update({"split": name, **props})

    # get rid of anything that doesn't have a split label
    keys = list(sample_dic.keys())
    for key in keys:
        if "split" not in sample_dic[key]:
            sample_dic.pop(key)

    return sample_dic
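
# Sketch of the data flow through `get_splits` (the SMILES, "pickle_path" key,
# and folder name are hypothetical). Before the call each sub-dictionary lacks
# a "split" key; afterwards every surviving entry has one, plus any properties
# found in the csvs:
def _example_get_splits():
    sample_dic = {"CCO": {"pickle_path": "pickles/CCO.pickle"},
                  "CCC": {"pickle_path": "pickles/CCC.pickle"}}
    sample_dic = get_splits(sample_dic=sample_dic,
                            csv_folder="data_dir")
    # e.g. {"CCO": {"pickle_path": ..., "split": "train", "bind": 0}, ...}
    print(sample_dic)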
def load_data(train_path, val_path, test_path):
    """
    Load data from csvs into a dictionary for the different splits.

    Args:
        train_path (str): path to the csv with training data
        val_path (str): path to the csv with validation data
        test_path (str): path to the csv with test data
    Returns:
        data (dict): dictionary of the form {split: sub_dic} for each
            split, where `sub_dic` contains SMILES strings and values
            for each property.
    """

    data = {}
    paths = [train_path, val_path, test_path]
    names = ["train", "val", "test"]

    for name, path in zip(names, paths):
        data[name] = read_csv(path)

    return data
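
# Usage sketch for `load_data` (hypothetical csv locations):
def _example_load_data():
    data = load_data(train_path="data_dir/train_full.csv",
                     val_path="data_dir/val_full.csv",
                     test_path="data_dir/test_full.csv")
    # e.g. data["train"]["smiles"] is the list of training SMILES
    print(list(data.keys()))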
def make_hyp_csvs(base_config_path, max_specs, seed):
    """
    Make csv files for the subset of the SMILES that will be used for
    hyperparameter optimization.

    Args:
        base_config_path (str): where your basic job config file is,
            with parameters that may or may not be changed depending
            on the given run
        max_specs (int): maximum number of species to use in
            hyperparameter optimization
        seed (int): random seed to use for the split
    Returns:
        None
    """

    # load the base config
    with open(base_config_path, "r") as f:
        base_dic = json.load(f)

    # load the SMILES strings from the train and validation
    # paths, then sample them
    train_path = base_dic["data_path"]
    val_path = base_dic.get("separate_val_path")
    paths = [train_path, val_path]

    # initialize the dictionary by reading the train data
    prop_dic = read_csv(paths[0])

    # if the validation data is separate, add the data lists
    # together
    if val_path is not None:
        new_dic = read_csv(val_path)
        for key, val in new_dic.items():
            prop_dic[key] += val

    # generate a proportional sample by first getting the
    # properties to be predicted, then making a `sample_dic`,
    # and finally calling `prop_split`
    props = list(filter(lambda x: x != "smiles", prop_dic.keys()))
    dataset_type = base_dic.get("dataset_type", "regression")
    num_smiles = len(prop_dic["smiles"])

    sample_dic = {prop_dic["smiles"][idx]:
                  {prop: prop_dic[prop][idx] for prop in props}
                  for idx in range(num_smiles)}

    keep_smiles = prop_split(max_specs=max_specs,
                             dataset_type=dataset_type,
                             props=props,
                             sample_dic=sample_dic,
                             seed=seed)

    # save to csv
    new_dic = {"smiles": keep_smiles}
    for prop in props:
        new_dic.update({prop: [sample_dic[key][prop]
                               for key in keep_smiles]})

    smiles_folder = "/".join(train_path.split("/")[:-1])
    hyp_path = os.path.join(smiles_folder, "hyperopt_full.csv")
    write_csv(hyp_path, new_dic)
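
# Usage sketch for `make_hyp_csvs`. `base_config.json` is a hypothetical
# config file that needs at least "data_path", and optionally
# "separate_val_path" and "dataset_type", e.g.:
#
#   {"data_path": "data_dir/train_full.csv",
#    "separate_val_path": "data_dir/val_full.csv",
#    "dataset_type": "classification"}
#
def _example_make_hyp_csvs():
    # writes `hyperopt_full.csv` next to the training csv
    make_hyp_csvs(base_config_path="base_config.json",
                  max_specs=5000,
                  seed=0)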