def save_splits(dataset,
                dataset_folder,
                thread,
                sample_dic):
    """
    Save the train/val/test splits of the dataset.
    Args:
        dataset (nff.data.dataset): NFF dataset
        dataset_folder (str): base folder for the datasets
        thread (int): thread for chunk of dataset
        sample_dic (dict): Sample of `summary_dic` that is used
            in this combined dataset. `summary_dic` contains information
            about all smiles strings we have, except for their conformers.
    Returns:
        None
    """

    split_names = ["train", "val", "test"]
    split_idx = {name: [] for name in split_names}

    for i, smiles in enumerate(dataset.props['smiles']):
        split_name = sample_dic[smiles]["split"]
        split_idx[split_name].append(i)

    fprint("Saving...")

    data_folder = get_data_folder(dataset_folder, thread)

    for name in split_names:
        dset = split_dataset(dataset, split_idx[name])
        dset_path = os.path.join(data_folder, name + ".pth.tar")
        dset.save(dset_path)
def save_best(dic_path, metric, model_path):
    """
    Save the best parameters from the optimization.
    Args:
        dic_path (str): path to the JSON file with the scores
        metric (str): metric by which to evaluate model performance
        model_path (str): directory of model and dataset
    Returns:
        None
    """

    # load the scores
    with open(dic_path, "r") as f:
        score_list = json.load(f)

    # get the best parameters
    objective = METRIC_DIC[convert_metric(metric)]
    pref = 1 if (objective == "minimize") else (-1)
    hyper_scores = [pref * score_dic[metric] for score_dic in score_list]
    best_params = score_list[np.argmin(hyper_scores)]

    # print the best parameters
    save_path = os.path.join(model_path, "best_params.json")
    best_str = "\n ".join(
        [f"{key}: {val}" for key, val in best_params.items()])
    fprint(f"Best parameters are {best_str}")
    fprint(f"Saving to {save_path}")

    # save them
    with open(save_path, "w") as f:
        json.dump(best_params, f, indent=4, sort_keys=True)
def model_from_metric(model, model_folder, metric):
    """
    Get the model with the best validation score according
    to a specified metric.
    Args:
        model (nff.nn.models): original NFF model loaded
        model_folder (str): path to the folder that the model is being
            trained in
        metric (str): name of metric to use
    Returns:
        model (nff.nn.models): NFF model updated with the state dict of
            the model with the best metric
    """

    # the metric asked for should be in chemprop notation (e.g. auc, prc-auc),
    # but when training a CP3D model we use different names
    # (e.g. roc_auc, prc_auc), so we need to transform into that name

    if metric in CHEMPROP_TRANSFORM:
        use_metric = CHEMPROP_TRANSFORM[metric]
    else:
        use_metric = metric

    # find the best epoch by reading the csv with the metrics
    best_score, best_epoch = parse_score(model_folder, use_metric)

    check_path = os.path.join(model_folder, "checkpoints",
                              f"checkpoint-{best_epoch}.pth.tar")

    state_dict = torch.load(check_path, map_location="cpu")["model"]
    fprint(f"Loading model state dict from {check_path}")
    model.load_state_dict(state_dict)
    model.eval()

    return model
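# Illustrative usage sketch (not part of the original scripts): loads the
# "best_model" checkpoint with `load_model` (used further below in this
# listing) and then swaps in the state dict from the best PRC-AUC epoch.
# The folder path is a placeholder.
def _example_model_from_metric():
    model_folder = "/path/to/model_folder"  # placeholder path
    model = load_model(model_folder)
    return model_from_metric(model=model,
                             model_folder=model_folder,
                             metric="prc-auc")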
def update_info(job_path, vals, param_names, prop_name):
    """
    Update the config information and save it.
    Args:
        job_path (str): path to the job config file
        vals (list): new values to use
        param_names (list[str]): names of the parameters being updated
        prop_name (str): Name of the property you're predicting
    Returns:
        None
    """

    with open(job_path, "r") as f:
        info = json.load(f)

    real_names = []
    real_vals = []

    for param_name, val in zip(param_names, vals):
        if param_name.startswith("log_"):
            # if anything starts with "log_" (e.g. "log_schnet_dropout"),
            # exponentiate its value to get the actual number
            real_names.append(param_name.replace("log_", ""))
            real_vals.append(np.exp(val))
        else:
            real_names.append(param_name)
            real_vals.append(val)

    # update values
    for param_type, val in zip(real_names, real_vals):
        if 'dropout' in param_type:
            update_dropout(info=info,
                           dropout=val,
                           dropout_type=param_type,
                           prop_name=prop_name)
        elif param_type == "num_heads":
            update_heads(info=info, heads=val)
        elif param_type == "attention_type":
            info["model_params"]["boltzmann_dict"]["type"] = val
        else:
            if param_type not in info["model_params"]:
                msg = (f"Warning: assuming that {param_type} "
                       "is just a key in `model_params`, but "
                       "it is not currently in `model_params` in "
                       "the config file. If it should be in a "
                       "different location then you will need "
                       "to write a custom function for updating "
                       "it.")
                fprint(msg)
            update_general(info, key=param_type, val=val)

    # save
    with open(job_path, "w") as f:
        json.dump(info, f, indent=4, sort_keys=True)
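# Illustrative usage sketch (not part of the original scripts): the config
# path and property name are placeholders. A parameter passed as
# "log_schnet_dropout" is exponentiated before being written back into the
# config, as in the loop above.
def _example_update_info():
    update_info(job_path="job_info.json",             # placeholder config path
                vals=[np.log(0.2), 2],                # dropout of 0.2, 2 heads
                param_names=["log_schnet_dropout", "num_heads"],
                prop_name="bind")                     # placeholder property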
def make_nff_dataset(spec_dics,
                     nbrlist_cutoff,
                     parallel_feat_threads,
                     strict_conformers,
                     csv_folder,
                     extra_features,
                     add_directed_idx,
                     average_nbrs=False):
    """
    Make an NFF dataset.
    Args:
        spec_dics (list[dict]): a dictionary with data for each species
        nbrlist_cutoff (float): Cutoff for two atoms to be considered
            neighbors.
        parallel_feat_threads (int): how many parallel threads
            to use when making the features.
        strict_conformers (bool): Whether to exclude any species whose
            conformers don't all have the same SMILES.
        csv_folder (str): path to folder that contains the csv files
            with the test/val/train smiles.
        extra_features (list[dict]): list of extra features dictionaries
        add_directed_idx (bool): whether to calculate and add the kj
            and ji indices. These indices tell you which edges connect
            to other edges.
        average_nbrs (bool): whether to build the dataset with
            `make_avg_dataset` (averaged neighbor list) instead of
            `make_big_dataset`.
    Returns:
        big_dataset (nff.data.dataset): NFF dataset
    """

    fprint("Making dataset with %d species" % (len(spec_dics)))

    if average_nbrs:
        big_dataset = make_avg_dataset(
            spec_dics=spec_dics,
            nbrlist_cutoff=nbrlist_cutoff,
            parallel_feat_threads=parallel_feat_threads,
            strict_conformers=strict_conformers)
    else:
        big_dataset = make_big_dataset(
            spec_dics=spec_dics,
            nbrlist_cutoff=nbrlist_cutoff,
            parallel_feat_threads=parallel_feat_threads)

    # clean up
    fprint("Cleaning up dataset...")
    big_dataset = clean_up_dset(dset=big_dataset,
                                nbrlist_cutoff=nbrlist_cutoff,
                                strict_conformers=strict_conformers,
                                csv_folder=csv_folder,
                                add_directed_idx=add_directed_idx,
                                num_procs=parallel_feat_threads)

    # add any other requested features
    big_dataset = add_features(dset=big_dataset,
                               extra_features=extra_features,
                               parallel_feat_threads=parallel_feat_threads)

    return big_dataset
def add_kj_ji_parallel(dataset, num_procs):
    """
    Add the kj and ji indices to a dataset in parallel.
    Args:
        dataset (nff.data.dataset): NFF dataset
        num_procs (int): number of parallel processes
    Returns:
        None
    """
    fprint((f"Adding kj and ji indices with {num_procs} "
            "parallel processes"))

    datasets = split_dataset(dataset=dataset, num=num_procs)
    datasets = kj_ji_parallel(datasets)

    new_props = rejoin_props(datasets)
    dataset.props = new_props
def add_bond_idx_parallel(dataset, num_procs):
    """
    Add bond indices to a dataset in parallel.
    Args:
        dataset (nff.data.dataset): NFF dataset
        num_procs (int): number of parallel processes
    Returns:
        None
    """
    fprint((f"Adding bond indices with {num_procs} "
            "parallel processes"))

    datasets = split_dataset(dataset=dataset, num=num_procs)
    datasets = bond_idx_parallel(datasets)

    new_props = rejoin_props(datasets)
    dataset.props = new_props
def main(from_model_path, to_model_path, num_confs, conf_file, **kwargs):
    """
    Load the dataset, reduce the number of conformers, and save it.
    Args:
        from_model_path (str): The path to the folder in which
            the old dataset is saved.
        to_model_path (str): The path to the folder in which
            the new dataset will be saved.
        num_confs (int): Desired number of conformers per species
        conf_file (str): Path to the JSON file that tells you which
            conformer indices to use for each species.
    Returns:
        None
    """

    # load `conf_file` if given
    if conf_file is not None:
        with open(conf_file, "r") as f:
            idx_dic = json.load(f)
    else:
        idx_dic = None

    # If the folder has sub-folders 0, 1, ..., etc.,
    # then load each dataset in each sub-folder. Otherwise
    # the dataset must be in the main folder.
    folders = sorted([i for i in os.listdir(from_model_path)
                      if i.isdigit()], key=lambda x: int(x))
    if folders == []:
        folders = [""]

    # Go through each dataset, update it, and save it
    for folder in tqdm(folders):
        fprint(folder)
        for name in ["train.pth.tar", "test.pth.tar", "val.pth.tar"]:
            load_path = os.path.join(from_model_path, folder, name)
            if not os.path.isfile(load_path):
                continue

            dataset = Dataset.from_file(load_path)
            dataset = trim_confs(dataset=dataset,
                                 num_confs=num_confs,
                                 idx_dic=idx_dic)

            save_folder = os.path.join(to_model_path, folder)
            if not os.path.isdir(save_folder):
                os.makedirs(save_folder)
            save_path = os.path.join(save_folder, name)
            dataset.save(save_path)
def summarize_rd(new_sets, first_set):
    """
    Summarize how many RDKit mols were successfully made.
    Args:
        new_sets (list): chunks of new datasets updated with RDKit mols
        first_set (nff.data.dataset): initial NFF dataset
    Returns:
        None
    """
    tried = len(first_set)
    succ = sum([len(d) for d in new_sets])
    pct = succ / tried * 100
    fprint("Converted %d of %d molecules (%.2f%%)." % (succ, tried, pct))
def get_metrics(actual_dic, pred_dics, metrics, cp_model_path):
    """
    Get all requested metric scores for a set of predictions and save
    them to a JSON file.
    Args:
        actual_dic (dict): dictionary of the form {prop: real}, where `real`
            are the real values of the property `prop`.
        pred_dics (list[dict]): list of dictionaries, each the same as `real`
            but with values predicted by each different model.
        metrics (list[str]): metrics to apply
        cp_model_path (str): path to the folder with the model of interest
    Returns:
        None
    """

    overall_dic = {}

    for i, pred_dic in enumerate(pred_dics):
        metric_dic = {}
        for prop in pred_dic.keys():
            if prop == "smiles":
                continue
            actual = actual_dic[prop]
            pred = pred_dic[prop]

            metric_dic[prop] = {}

            for metric in metrics:
                score = apply_metric(metric, pred, actual)
                metric_dic[prop][metric] = score

        overall_dic[str(i)] = metric_dic

    props = [prop for prop in pred_dic.keys() if prop != 'smiles']
    overall_dic['average'] = {prop: {} for prop in props}
    sub_dics = [val for key, val in overall_dic.items()
                if key != 'average']

    for prop in props:
        for key in sub_dics[0][prop].keys():
            vals = [sub_dic[prop][key] for sub_dic in sub_dics]
            mean = np.mean(vals).item()
            std = np.std(vals).item()
            overall_dic['average'][prop][key] = {"mean": mean,
                                                 "std": std}

    save_path = os.path.join(cp_model_path, "test_metrics.json")
    with open(save_path, "w") as f:
        json.dump(overall_dic, f, indent=4, sort_keys=True)

    fprint(f"Saved metric scores to {save_path}")
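# Illustrative sketch (not from the source): the property name, SMILES and
# values below are placeholders showing the expected shapes of `actual_dic`
# and `pred_dics`; each prediction dictionary in the list comes from one model.
def _example_get_metrics(cp_model_path):
    actual_dic = {"smiles": ["CCO", "CCN"], "bind": [1, 0]}
    pred_dics = [{"smiles": ["CCO", "CCN"], "bind": [0.9, 0.2]},   # model 0
                 {"smiles": ["CCO", "CCN"], "bind": [0.8, 0.1]}]   # model 1
    get_metrics(actual_dic=actual_dic,
                pred_dics=pred_dics,
                metrics=["auc", "prc-auc"],
                cp_model_path=cp_model_path)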
def featurize_parallel(dataset,
                       num_procs,
                       bond_feats=BOND_FEAT_TYPES,
                       atom_feats=ATOM_FEAT_TYPES):
    """
    Add RDKit mols, atom features and bond features to a dataset in
    parallel.
    Args:
        dataset (nff.data.dataset): NFF dataset
        num_procs (int): number of parallel processes
        bond_feats (list[str]): names of bond features
        atom_feats (list[str]): names of atom features
    Returns:
        None
    """

    # offsets can be sparse tensors which in torch version <= 1.3
    # can't yet be pickled. So we have to remove them to not cause
    # errors in the pickling during parallelization
    add_offsets = False
    if "offsets" in dataset.props:
        offsets = copy.deepcopy(dataset.props["offsets"])
        add_offsets = True
        dataset.props.pop("offsets")

    msg = f"Featurizing dataset with {num_procs} parallel processes."
    if num_procs == 1:
        msg = msg.replace("processes", "process")
    fprint(msg)

    # split the dataset so processes can act in parallel on the chunks
    datasets = split_dataset(dataset=dataset, num=num_procs)

    # add RDKit mols if they're not already in the dataset
    has_rdmols = all(['rd_mols' in dset.props for dset in datasets])
    if not has_rdmols:
        fprint("Converting xyz to RDKit mols...")
        datasets = rd_parallel(datasets)
        summarize_rd(new_sets=datasets, first_set=dataset)

    fprint("Featurizing bonds...")
    datasets = bonds_parallel(datasets, feat_types=bond_feats)

    fprint("Featurizing atoms...")
    datasets = atoms_parallel(datasets, feat_types=atom_feats)

    # rejoin the dataset
    new_props = rejoin_props(datasets)
    dataset.props = new_props

    # rename the bond list as `bonded_nbr_list`
    new_props["bonded_nbr_list"] = copy.deepcopy(new_props["bond_list"])
    new_props.pop("bond_list")

    if add_offsets:
        dataset.props["offsets"] = offsets
def summarize(save_paths, feat_folder):
    """
    Summarize where the files were saved and what their contents are.
    Args:
        save_paths (list[str]): list of the paths to all the saved
            feature files
        feat_folder (str): path to the folder that contains all the
            feature files.
    Returns:
        None
    """
    base_dir = "/".join(save_paths[0].split("/")[:-1])
    save_names = [get_name(path) for path in save_paths]
    num_files = len(save_paths)
    string = "\n".join(save_names)
    summary = (f"Saved {num_files} files with features \n"
               f"Used model in {feat_folder} \n\n"
               f"Save folder: \n{base_dir}\n\n"
               f"Save names: \n{string}")
    fprint(summary)
def add_features(dset, extra_features, parallel_feat_threads):
    """
    Add any requested features to the dataset.
    Args:
        dset (nff.data.dataset): NFF dataset
        extra_features (list[dict]): list of extra features,
            where each item is a dictionary of the form
            {"name": name, "params": {params needed}}.
        parallel_feat_threads (int): how many parallel threads
            to use when making the features.
    Returns:
        dset (nff.data.dataset): updated NFF dataset
    """
    for dic in tqdm(extra_features):
        name = dic["name"]
        params = dic["params"]

        if name.lower() == "e3fp":
            length = params["length"]
            fprint(f"Adding E3FP fingerprints of size {length}...")
            dset.add_e3fp(length, num_procs=parallel_feat_threads)
        if name.lower() == "whim":
            fprint("Adding whim fingerprints...")
            dset.featurize_rdkit('whim')
        if name.lower() == "morgan":
            length = params["length"]
            fprint(f"Adding Morgan fingerprints of size {length}...")
            dset.add_morgan(length)

    return dset
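# Illustrative usage sketch (not from the source): `extra_features` follows the
# {"name": ..., "params": ...} form described in the docstring above; the
# fingerprint lengths and thread count are placeholders, and `dset` stands in
# for an already-built nff.data.dataset.
def _example_add_features(dset):
    extra_features = [{"name": "e3fp", "params": {"length": 256}},
                      {"name": "morgan", "params": {"length": 1024}},
                      {"name": "whim", "params": {}}]
    return add_features(dset=dset,
                        extra_features=extra_features,
                        parallel_feat_threads=5)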
def add_e3fp_parallel(dataset, fp_length, num_procs):
    """
    Add E3FP fingerprints to a dataset in parallel.
    Args:
        dataset (nff.data.dataset): NFF dataset
        fp_length (int): fingerprint length
        num_procs (int): number of parallel processes
    Returns:
        None
    """

    msg = f"Adding E3FP fingerprints with {num_procs} parallel processes."
    if num_procs == 1:
        msg = msg.replace("processes", "process")
    fprint(msg)

    # split the dataset, run E3FP in parallel, and rejoin it
    datasets = split_dataset(dataset=dataset, num=num_procs)
    datasets = e3fp_parallel(datasets, fp_length=fp_length)
    new_props = rejoin_props(datasets)
    dataset.props = new_props
def objective(hyperparams):

    # clean up model folder from previous iteration
    clean_up(model_path=model_path)

    # Convert hyperparams from float to int when necessary
    for key, typ in param_type_dic.items():
        if typ == "int":
            hyperparams[key] = int(hyperparams[key])

    # print hyperparameters being used
    val_str = " " + "\n ".join(
        [f"{key}: {val}" for key, val in hyperparams.items()])
    fprint(f"Hyperparameters used this round:\n{val_str}")

    # update config file, run, get the score, and save
    vals = [hyperparams[key] for key in param_names]
    update_info(job_path=job_path,
                vals=vals,
                param_names=param_names,
                prop_name=prop_name)

    # train the model and get the score
    best_score = run(job_path=job_path,
                     model_path=model_path,
                     metric=metric)

    # get the hyperparameter score, given that the aim is
    # to minimize whatever comes out
    metric_obj = METRIC_DIC[convert_metric(metric)]
    hyper_score = -best_score if (metric_obj == "maximize") else best_score

    # save the score
    save_score(dic_path=dic_path,
               hyperparams=hyperparams,
               metric=metric,
               best_score=best_score)

    return hyper_score
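# Hedged sketch (assumption, not shown in the source): `objective` closes over
# model_path, job_path, dic_path, param_names, param_type_dic, prop_name and
# metric from its enclosing scope and returns a score to *minimize*. A driver
# such as hyperopt's `fmin` could consume it roughly as below; the search
# space and number of evaluations are placeholders.
def _example_run_hyperopt():
    from hyperopt import fmin, hp, tpe
    space = {"log_schnet_dropout": hp.uniform("log_schnet_dropout",
                                              np.log(0.01), np.log(0.5)),
             "num_heads": hp.quniform("num_heads", 1, 4, 1)}
    best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=20)
    return best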
def summarize(csv_folder, dataset_type):
    """
    Summarize where the splits have been saved and what their contents are.
    Args:
        csv_folder (str): path to the folder in which we will save our
            csv files with the SMILES, properties and training splits.
        dataset_type (str): type of problem, e.g. "classification" or
            "regression".
    Returns:
        None
    """
    msgs = []
    for name in ['train', 'val', 'test', 'all']:
        if name == 'all':
            path = os.path.join(csv_folder, f"{name}.csv")
        else:
            path = os.path.join(csv_folder, f"{name}_full.csv")
        with open(path, "r") as f:
            lines = f.readlines()[1:]
        num_specs = len(lines)
        this_msg = f"{num_specs} species"
        if dataset_type == "classification":
            num_pos = len([line for line in lines
                           if int(line.split(",")[-1]) == 1])
            this_msg += f", {num_pos} positives"
        msgs.append(this_msg)

    msg = (f"Splits saved in {csv_folder}\n"
           f"Train files: train_smiles.csv and train_full.csv ({msgs[0]})\n"
           f"Validation files: val_smiles.csv and val_full.csv ({msgs[1]}) \n"
           f"Test files: test_smiles.csv and test_full.csv ({msgs[2]})\n"
           f"Combined file: all.csv ({msgs[3]})")

    fprint(msg)
def cp_hyperopt(cp_folder, hyp_folder, rerun):
    """
    Run hyperparameter optimization with ChemProp.
    Args:
        cp_folder (str): path to the chemprop folder on your computer
        hyp_folder (str): where you want to store your hyperparameter
            optimization models
        rerun (bool): whether to rerun hyperparameter optimization if
            `hyp_folder` already exists and has the completion file
            `best_params.json`.
    Returns:
        best_params (dict): best parameters from hyperparameter
            optimization
    """

    # path to `best_params.json` file
    param_file = os.path.join(hyp_folder, "best_params.json")
    params_exist = os.path.isfile(param_file)

    # If it exists and you don't want to re-run, then load it
    if params_exist and (not rerun):
        fprint(f"Loading hyperparameter results from {param_file}\n")
        with open(param_file, "r") as f:
            best_params = json.load(f)
        return best_params

    # otherwise run the script and read in the results
    hyp_script = os.path.join(cp_folder, "hyperparameter_optimization.py")
    config_path = os.path.join(hyp_folder, "config.json")

    with open(config_path, "r") as f:
        config = json.load(f)

    data_path = config["data_path"]
    dataset_type = config["dataset_type"]
    cmd = get_cp_cmd(hyp_script,
                     config_path,
                     data_path,
                     dataset_type)
    cmd += f" --config_save_path {param_file}"

    fprint(f"Running hyperparameter optimization in folder {hyp_folder}\n")
    fprint(cmd)
    p = bash_command(f"source activate chemprop && {cmd}")
    p.wait()

    with open(param_file, "r") as f:
        best_params = json.load(f)

    return best_params
def clean_up_dset(dset,
                  nbrlist_cutoff,
                  strict_conformers,
                  csv_folder,
                  add_directed_idx,
                  num_procs):
    """
    Do various things to clean up the dataset after you've made it.
    Args:
        dset (nff.data.dataset): NFF dataset
        nbrlist_cutoff (float): Cutoff for two atoms to be considered
            neighbors.
        strict_conformers (bool): Whether to exclude any species whose
            conformers don't all have the same SMILES.
        csv_folder (str): path to folder that contains the csv files
            with the test/val/train smiles.
        add_directed_idx (bool): whether to calculate and add the kj
            and ji indices. These indices tell you which edges connect
            to other edges.
        num_procs (int): how many parallel threads to use when making the
            kj and ji indices.
    Returns:
        dset (nff.data.dataset): cleaned up dataset
    """

    old_num = len(dset)

    # smiles we're getting rid of
    remove_smiles = []
    total = 3 + int(add_directed_idx)

    with tqdm(total=total) as pbar:

        # if requested, get rid of any species whose conformers have
        # different SMILES strings
        if strict_conformers:
            dset, removed = filter_same_smiles(dset)
            remove_smiles += removed

        # iterate the tqdm progress bar
        pbar.update(1)

        # Get rid of any conformers whose bond lists aren't subsets of the
        # neighbor list
        dset, removed = filter_bonds_in_nbr(nbrlist_cutoff, dset)
        remove_smiles += removed
        pbar.update(1)

        # Add the indices of the neighbor list that correspond to
        # bonded atoms. Only use one process to avoid running
        # out of memory
        dset.generate_bond_idx(num_procs=1)
        pbar.update(1)

        # Make sure the dataset is directed
        dset.make_all_directed()

        # add the kj and ji idx if requested
        if add_directed_idx:
            # only use one process to avoid running out of memory
            dset.generate_kj_ji(num_procs=1)
            pbar.update(1)

    # Re-save the train/val/test splits accounting for the fact that some
    # species are no longer there
    resave_splits(csv_folder=csv_folder,
                  remove_smiles=remove_smiles)
    new_num = old_num - len(remove_smiles)
    changed_num = old_num != new_num

    # Print a warning if the total number of species has changed
    if changed_num:
        msg = ("WARNING: the original SMILES splits have been re-saved with "
               f"{new_num} species, reduced from the original {old_num}, "
               f"because only {new_num} species made it into the final "
               "dataset. This could be because of conformers with bond "
               "lengths greater than the cutoff distance of %.2f"
               ) % nbrlist_cutoff
        if strict_conformers:
            msg += (", or because the conformers of certain species didn't "
                    "all have the same SMILES string")
        msg += "."

        fprint(msg)

    return dset
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--from_model_path', type=str,
                        help="Path to model from which original data comes")
    parser.add_argument('--to_model_path', type=str,
                        help="Path to model to which new data is saved")
    parser.add_argument('--num_confs', type=int,
                        help="Number of conformers per species",
                        default=1)
    parser.add_argument('--conf_file', type=str,
                        help=("Path to json that says which conformer "
                              "to use for each species. This is optional. "
                              "If you don't specify the conformers, the "
                              "script will default to taking the `num_confs` "
                              "lowest conformers, ordered by statistical "
                              "weight."),
                        default=None)

    args = parser.parse_args()

    try:
        main(**args.__dict__)
    except Exception as e:
        fprint(e)
        pdb.post_mortem()
def main(dset_folder,
         device,
         model_folder,
         batch_size,
         prop,
         sub_batch_size,
         feat_save_folder,
         metric=None,
         val_only=False,
         train_only=False,
         test_only=False,
         track=True,
         max_confs=None,
         **kwargs):
    """
    Get fingerprints and predictions from the model.
    Args:
        dset_folder (str): folder with the data in it
        device (Union[str, int]): device on which you run the model
        model_folder (str): path to the folder that the model is being
            trained in
        batch_size (int): how many data points per batch
        prop (str): property to predict
        sub_batch_size (int): how many conformers to put in memory at a time
        feat_save_folder (str): folder in which we're saving the features
        metric (str): name of metric to use. If not given, this defaults
            to taking the model with the best validation loss.
        train_only (bool): only load the training set
        val_only (bool): only load the validation set
        test_only (bool): only load the test set
        track (bool): Whether to track progress with tqdm
        max_confs (int): Maximum number of conformers to use when
            evaluating the model
    """

    # get the model initially by taking the one saved as "best_model"
    model = load_model(model_folder)

    # update its state_dict with the checkpoint from the epoch with
    # the best metric score
    if metric is None:
        fprint(("WARNING: You have not specified a metric with which "
                "to choose the best model. Defaulting to whichever was "
                "chosen as the best model during training "))
    else:
        fprint(f"Loading model with best validation {metric}")
        model = model_from_metric(model=model,
                                  model_folder=model_folder,
                                  metric=metric)
    model.eval()

    paths, dset_names = get_dset_paths(dset_folder,
                                       train_only=train_only,
                                       val_only=val_only,
                                       test_only=test_only)

    # go through each dataset, create a loader, evaluate the model,
    # and save the predictions
    iter_func = get_iter_func(track, num_track=len(dset_names))

    for i in iter_func(range(len(dset_names))):
        results = {}
        targets = {}
        j = 0
        for path in tqdm(paths[i]):
            dataset = Dataset.from_file(path)
            if max_confs is not None:
                dataset = trim_confs(dataset=dataset,
                                     num_confs=max_confs,
                                     idx_dic=None,
                                     enum_func=iter_func)
            loader = DataLoader(dataset,
                                batch_size=batch_size,
                                collate_fn=collate_dicts)

            new_results, new_targets = evaluate(model,
                                                loader,
                                                device=device,
                                                sub_batch_size=sub_batch_size,
                                                track=track)
            is_first = (j == 0)
            results = add_dics(base=results,
                               new=new_results,
                               is_first=is_first)
            targets = add_dics(base=targets,
                               new=new_targets,
                               is_first=is_first)
            j += 1

        name = dset_names[i]
        save_name = f"pred_{metric}_{name}.pickle"
        if feat_save_folder is None:
            feat_save_folder = dset_folder
        if not os.path.isdir(feat_save_folder):
            os.makedirs(feat_save_folder)

        pickle_path = os.path.join(feat_save_folder, save_name)

        save(results=results,
             targets=targets,
             feat_save_folder=pickle_path,
             prop=prop)
def get_gpu_splits(weight_path, rank, world_size, params, max_confs):
    """
    Check if there are already datasets in each parallel folder. If so,
    load and return those datasets instead of loading the whole thing
    in memory on every gpu and splitting afterwards.
    Args:
        weight_path (str): training folder
        rank (int): global rank of the current process
        world_size (int): total number of gpus altogether
        params (dict): training/network parameters
        max_confs (int): maximum number of conformers per species
    Returns:
        datasets (list): train, val, and test datasets if the datasets have
            already been split by GPU. None otherwise.
    """

    # get the parallel folders: weight_path / {0, 1, 2, ..., n_gpus}
    par_folders = [os.path.join(weight_path, folder) for folder
                   in os.listdir(weight_path) if folder.isdigit()]

    # see if the data has already been split by gpu
    train_splits = ["train.pth.tar", "val.pth.tar", "test.pth.tar"]
    dset_name = "dataset.pth.tar"
    has_train_splits = all([name in os.listdir(folder)
                            for name in train_splits
                            for folder in par_folders])
    has_dset = all([dset_name in os.listdir(folder)
                    for folder in par_folders])
    has_splits = (has_train_splits or has_dset) and len(
        par_folders) >= world_size

    # if not, return None
    if not has_splits:
        return

    dat_path = os.path.join(weight_path, str(rank), "dataset.pth.tar")
    split_paths = [os.path.join(weight_path, str(rank), name + ".pth.tar")
                   for name in ["train", "val", "test"]]

    # if the train/val/test splits are already saved, then load them
    if all([os.path.isfile(path) for path in split_paths]):
        if max_confs is not None and (rank == 0):
            conf_str = "conformer" if max_confs == 1 else "conformers"
            fprint(("Reducing each species to have a maximum of "
                    f"{max_confs} {conf_str}..."))
        datasets = [load_dset(path, max_confs, rank) for path in split_paths]
        return datasets

    # otherwise get the dataset, split it, and save it
    dataset = load_dset(dat_path, max_confs, rank)

    # split this sub-dataset into train/val/test
    train, val, test = split_train_validation_test(
        dataset,
        val_size=params['split'][0],
        test_size=params['split'][1]
    )
    datasets = [train, val, test]

    # save the splits to the training folder
    names = ['train', 'val', 'test']
    for d_set, name in zip(datasets, names):
        data_path = os.path.join(weight_path, str(rank),
                                 "{}.pth.tar".format(name))
        d_set.save(data_path)

    return datasets
def make_nff_dataset(spec_dics,
                     nbrlist_cutoff,
                     parallel_feat_threads,
                     strict_conformers,
                     csv_folder,
                     extra_features,
                     add_directed_idx):
    """
    Make an NFF dataset.
    Args:
        spec_dics (list[dict]): a dictionary with data for each species
        nbrlist_cutoff (float): Cutoff for two atoms to be considered
            neighbors.
        parallel_feat_threads (int): how many parallel threads
            to use when making the features.
        strict_conformers (bool): Whether to exclude any species whose
            conformers don't all have the same SMILES.
        csv_folder (str): path to folder that contains the csv files
            with the test/val/train smiles.
        extra_features (list[dict]): list of extra features dictionaries
        add_directed_idx (bool): whether to calculate and add the kj
            and ji indices. These indices tell you which edges connect
            to other edges.
    Returns:
        big_dataset (nff.data.dataset): NFF dataset
    """

    fprint("Making dataset with %d species" % (len(spec_dics)))

    props_list = []
    nbr_list = []
    rd_mols_list = []

    for j, spec_dic in tqdm_enum(spec_dics):

        # Exclude keys related to individual conformers. These
        # include conformer features, in case you've already put
        # those in your pickle files. If not we'll generate them
        # below
        small_spec_dic = {key: val for key, val in spec_dic.items()
                          if key not in CONF_KEYS}

        # Treat each species' data like a regular dataset
        # and use it to generate neighbor lists
        dataset = Dataset(small_spec_dic, units='kcal/mol')

        # number of atoms in the molecule
        mol_size = len(dataset.props["nxyz"][0])
        dataset.generate_neighbor_list(cutoff=nbrlist_cutoff,
                                       undirected=False)

        # now combine the neighbor lists so that this set
        # of nxyz's can be treated like one big molecule
        nbrs = dataset.props['nbr_list']
        new_nbrs = []

        # shift by i * mol_size for each conformer
        for i in range(len(nbrs)):
            new_nbrs.append(nbrs[i] + i * mol_size)

        # add to list of conglomerated neighbor lists
        nbr_list.append(torch.cat(new_nbrs))
        dataset.props.pop('nbr_list')

        # concatenate the nxyz's
        nxyz = np.concatenate([np.array(item) for item in spec_dic["nxyz"]]
                              ).reshape(-1, 4).tolist()

        # add properties as necessary
        new_dic = {
            "mol_size": mol_size,
            "nxyz": nxyz,
            "weights": (torch.Tensor(spec_dic["weights"]).reshape(-1, 1)
                        / sum(spec_dic["weights"])),
            "degeneracy": torch.Tensor(spec_dic["degeneracy"]).reshape(-1, 1),
            "energy": torch.Tensor(spec_dic["energy"]).reshape(-1, 1),
            "num_atoms": [len(nxyz)]
        }

        new_dic.update({key: val[:1] for key, val in dataset.props.items()
                        if key not in new_dic.keys()})

        props_list.append(new_dic)
        rd_mols_list.append(spec_dic["rd_mols"])

    # Add props that are in some datasets but not others
    props_list = add_missing(props_list)

    # convert the list of dictionaries into a dictionary of lists / tensors
    props_dic = concatenate_dict(*props_list)

    # make a combined dataset where the species look like they're
    # one big molecule
    big_dataset = Dataset(props_dic, units='kcal/mol')

    # give it the proper neighbor list and rdkit mols
    big_dataset.props['nbr_list'] = nbr_list
    big_dataset.props["rd_mols"] = rd_mols_list

    # generate atom and bond features
    big_dataset.featurize(num_procs=parallel_feat_threads)

    # clean up
    fprint("Cleaning up dataset...")
    big_dataset = clean_up_dset(dset=big_dataset,
                                nbrlist_cutoff=nbrlist_cutoff,
                                strict_conformers=strict_conformers,
                                csv_folder=csv_folder,
                                add_directed_idx=add_directed_idx,
                                num_procs=parallel_feat_threads)

    # add any other requested features
    big_dataset = add_features(dset=big_dataset,
                               extra_features=extra_features,
                               parallel_feat_threads=parallel_feat_threads)

    return big_dataset
def report_delta(bare_dic):
    """
    For a binary task, report analysis on the difference between similarity
    among hits and similarity between hits and misses.
    Args:
        bare_dic (dict): bare dictionary of similarities
    Returns:
        None
    """
    for key, dic in bare_dic.items():

        fprint(f"Results for {key}")
        fprint("+/- indicates standard deviation of the mean")

        # attention and random differences in similarity
        delta_att = dic['intra_pos']['att'] - dic['inter']['att']
        delta_rand = dic['intra_pos']['random'] - dic['inter']['random']

        # compute mean for attention
        delta_att_mean = np.mean(delta_att)
        # std deviation on the mean
        delta_att_std = np.std(delta_att) / (len(delta_att)) ** 0.5

        # same for random
        delta_rand_mean = np.mean(delta_rand)
        delta_rand_std = np.std(delta_rand) / (len(delta_rand)) ** 0.5

        # delta delta is the difference in deltas between random and
        # attention, a measure of how much attention is learning
        delta_delta_mean = delta_att_mean - delta_rand_mean
        delta_delta_std = ((np.var(delta_att) + np.var(delta_rand)) ** 0.5
                           / (len(delta_att)) ** 0.5)

        fprint("Delta att: %.4f +/- %.4f" % (delta_att_mean, delta_att_std))
        fprint("Delta rand: %.4f +/- %.4f" % (delta_rand_mean,
                                              delta_rand_std))
        fprint("Delta delta: %.4f +/- %.4f" % (delta_delta_mean,
                                               delta_delta_std))
        fprint("\n")
def conf_sims_from_files(model_path,
                         max_samples,
                         classifier,
                         seed,
                         external_fp_fn=None,
                         summary_path=None,
                         rd_path=None,
                         fp_kwargs=None):
    """
    Get similarity among species according to predictions of different
    models, given a folder with all of the prediction pickles.
    Args:
        model_path (str): path to the folder where the prediction pickles
            are saved.
        max_samples (int): maximum number of pairs to compare
        classifier (bool): whether your model is a classifier
        seed (int): random seed
        external_fp_fn (str, optional): name of the fingerprinting function
            you want to use. If none is provided then the model's generated
            fingerprint will be used.
        summary_path (str, optional): path of the file with the summary
            dictionary of species properties, their pickle paths, etc.
        rd_path (str, optional): path to the folder that has all your
            pickles with RDKit mols.
        fp_kwargs (dict, optional): any keyword arguments you need when
            calling an external fingerprinter.
    Returns:
        analysis (dict): dictionary of the form
            {prediction_name: similarity_dic} for the name of each
            prediction file.
        bare_data (dict): same idea as `analysis` but with the full set of
            similarities between each molecule.
    """
    fprint("Loading pickle files...")
    pred_files = get_pred_files(model_path)
    pred = load_preds(pred_files)

    bare_data = {}

    fprint("Calculating fingerprint similarities...")
    for key in tqdm(pred):
        dic = pred[key]
        annotate_confs(dic)
        fp_dics = attention_sim(dic=dic,
                                max_samples=max_samples,
                                classifier=classifier,
                                seed=seed,
                                external_fp_fn=external_fp_fn,
                                summary_path=summary_path,
                                rd_path=rd_path,
                                fp_kwargs=fp_kwargs)
        bare_data[key] = fp_dics

    # analyze the bare data
    analysis = {}
    analyze_data(bare_data, analysis)

    if classifier:
        report_delta(bare_data)

    return analysis, bare_data
def main(max_confs,
         summary_path,
         dataset_folder,
         pickle_folder,
         num_threads,
         thread,
         nbrlist_cutoff,
         csv_folder,
         parallel_feat_threads,
         strict_conformers,
         extra_features,
         add_directed_idx,
         average_nbrs,
         **kwargs):
    """
    Sample species, load their pickles, create an NFF dataset, and
    save train/val/test splits.
    Args:
        max_confs (int): Maximum number of conformers per species
        summary_path (str): Path to file with summary dictionary
        dataset_folder (str): base folder for the datasets
        pickle_folder (str): path to folder that contains all the pickle
            files. Each sub-dictionary in `sample_dic` will have the key
            `pickle_path`. Joining `pickle_folder` with `pickle_path` gives
            the full path to the file.
        num_threads (int): Total number of sections into which we're
            splitting and saving the dataset.
        thread (int): Index that tells us which section of the total dataset
            we're creating and saving
        nbrlist_cutoff (float): Cutoff for two atoms to be considered
            neighbors.
        csv_folder (str): path to folder that contains the csv files
            with the test/val/train smiles.
        parallel_feat_threads (int): how many parallel threads
            to use when making the features.
        strict_conformers (bool): Whether to exclude any species whose
            conformers don't all have the same SMILES.
        extra_features (list[dict]): list of extra features, where each
            item is a dictionary of the form
            {"name": name, "params": {params needed}}.
        add_directed_idx (bool): whether to calculate and add the kj
            and ji indices. These indices tell you which edges connect
            to other edges.
        average_nbrs (bool): whether to build the dataset with
            `make_avg_dataset` (averaged neighbor list) instead of
            keeping each conformer's neighbor list separately.
    Returns:
        None
    """

    with open(summary_path, "r") as f:
        summary_dic = json.load(f)

    fprint("Loading splits...")

    sample_dic = get_sample(summary_dic=summary_dic,
                            thread=thread,
                            num_threads=num_threads,
                            csv_folder=csv_folder)

    fprint("Loading data from pickle files...")
    overall_dic = load_data_from_pickle(sample_dic, pickle_folder)

    fprint("Converting data...")
    spec_dics = convert_data(overall_dic, max_confs)

    fprint("Combining to make NFF dataset...")
    dataset = make_nff_dataset(spec_dics=spec_dics,
                               nbrlist_cutoff=nbrlist_cutoff,
                               parallel_feat_threads=parallel_feat_threads,
                               strict_conformers=strict_conformers,
                               csv_folder=csv_folder,
                               extra_features=extra_features,
                               add_directed_idx=add_directed_idx,
                               average_nbrs=average_nbrs)

    fprint("Creating test/train/val splits...")
    save_splits(dataset=dataset,
                dataset_folder=dataset_folder,
                thread=thread,
                sample_dic=sample_dic)

    fprint((f"Complete! Saved section {thread} of the dataset in "
            f"{os.path.join(dataset_folder, str(thread))}.\n\n"))
def main(base_config_path,
         hyp_config_path,
         use_hyperopt,
         rerun_hyperopt,
         cp_folder,
         feature_folder,
         model_folder_cp,
         metrics,
         feat_options,
         mpnn_options,
         **kwargs):
    """
    Run transfer learning using fingerprints from 3D models evaluated by
    performance on a variety of metrics. Different models are trained with
    the fingerprints and with or without an MPNN.
    Args:
        base_config_path (str): where your basic job config file is, with
            parameters that may or may not be changed depending on the
            given run
        hyp_config_path (str): where your basic hyperopt job config file is,
            with parameters that may or may not be changed depending on the
            given run
        use_hyperopt (bool): do a hyperparameter optimization before
            training the model
        rerun_hyperopt (bool): whether to rerun hyperparameter optimization
            if `hyp_folder` already exists and has the completion file
            `best_params.json`.
        cp_folder (str): path to the chemprop folder on your computer
        feature_folder (str): directory with files for the features of
            the species
        model_folder_cp (str): directory in which you'll be saving your
            model folders
        metrics (list[str]): metrics you want to use
        feat_options (list[bool]): options you want to use for features.
            For example, [True, False] means you want to train one model
            with features and one without, while [True] just means you want
            to train one with features.
        mpnn_options (list[bool]): same idea as `feat_options`, but for
            whether or not to use an MPNN
    Returns:
        None
    """

    cwd = os.path.abspath(".")
    script = os.path.join(cwd, "cp_tl.py")

    for feat in feat_options:
        for mpnn in mpnn_options:

            # can't run anything without either features or an MPNN
            if (not feat) and (not mpnn):
                continue

            for metric in metrics:

                paths = []
                for split in ['train', 'val', 'test']:
                    paths.append(os.path.join(feature_folder,
                                              f"{split}_{metric}.npz"))
                train_feat_path, val_feat_path, test_feat_path = paths

                train_folder = get_train_folder(
                    model_folder_cp=model_folder_cp,
                    feature_folder=feature_folder,
                    metric=metric,
                    feat=feat,
                    mpnn=mpnn)

                msg = get_msg(feat, mpnn, train_folder)
                fprint(msg)

                cmd = (f"python {script} "
                       f"--base_config_path {base_config_path} "
                       f"--hyp_config_path {hyp_config_path} "
                       f"--metric {metric} "
                       f"--train_feat_path {train_feat_path} "
                       f"--val_feat_path {val_feat_path} "
                       f"--test_feat_path {test_feat_path} "
                       f"--train_folder {train_folder} "
                       f"--cp_folder {cp_folder} ")

                if use_hyperopt:
                    cmd += "--use_hyperopt "
                if rerun_hyperopt:
                    cmd += "--rerun_hyperopt "
                if not mpnn:
                    cmd += "--features_only "
                if not feat:
                    cmd += "--no_features "

                p = bash_command(cmd)
                p.wait()