Example #1
def save_splits(dataset, dataset_folder, thread, sample_dic):
    """
    Save the train/val/test splits of the dataset
    Args:
        dataset (nff.data.dataset): NFF dataset
        dataset_folder (str): base folder for the datasets
        thread (int): index of the dataset chunk being created and saved
        sample_dic (dict): Sample of `summary_dic` that is used
            in this combined dataset. `summary_dic` contains
            information about all smiles strings we have, except
            for their conformers.
    Returns:
        None
    """

    split_names = ["train", "val", "test"]
    split_idx = {name: [] for name in split_names}

    for i, smiles in enumerate(dataset.props['smiles']):
        split_name = sample_dic[smiles]["split"]
        split_idx[split_name].append(i)

    fprint("Saving...")

    data_folder = get_data_folder(dataset_folder, thread)

    for name in split_names:
        dset = split_dataset(dataset, split_idx[name])
        dset_path = os.path.join(data_folder, name + ".pth.tar")
        dset.save(dset_path)
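
# Hedged, standalone sketch of the split-bucketing step above (not from the
# original source): `sample_dic` maps each SMILES string to a sub-dictionary
# whose "split" key is "train", "val" or "test". The values are placeholders.
smiles_list = ["CCO", "CCC", "c1ccccc1"]
sample_dic = {"CCO": {"split": "train"},
              "CCC": {"split": "val"},
              "c1ccccc1": {"split": "test"}}

split_idx = {name: [] for name in ["train", "val", "test"]}
for i, smiles in enumerate(smiles_list):
    split_idx[sample_dic[smiles]["split"]].append(i)

print(split_idx)  # {'train': [0], 'val': [1], 'test': [2]}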
Example #2
def save_best(dic_path, metric, model_path):
    """
    Save the best parameters from the optimization.
    Args:
      dic_path (str): path to the JSON file with the scores
      metric (str): metric by which to evaluate model performance
      model_path (str): directory of model and dataset
    Returns:
      None
    """

    # load the scores
    with open(dic_path, "r") as f:
        score_list = json.load(f)

    # get the best parameters: flip the sign for "maximize" metrics so that
    # np.argmin always selects the best-scoring round
    objective = METRIC_DIC[convert_metric(metric)]
    pref = 1 if (objective == "minimize") else (-1)
    hyper_scores = [pref * score_dic[metric] for score_dic in score_list]
    best_params = score_list[np.argmin(hyper_scores)]

    # print the best parameters
    save_path = os.path.join(model_path, "best_params.json")
    best_str = "\n  ".join(
        [f"{key}: {val}" for key, val in best_params.items()])
    fprint(f"Best parameters are {best_str}")
    fprint(f"Saving to {save_path}")

    # save them
    with open(save_path, "w") as f:
        json.dump(best_params, f, indent=4, sort_keys=True)
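
# Hedged, standalone sketch of the selection logic above, assuming a
# "maximize" objective for the chosen metric. The toy `score_list` mirrors
# the JSON layout `save_best` expects: one dict of hyperparameters plus the
# metric score per optimization round. Names and values are made up.
import numpy as np

score_list = [{"lr": 1e-3, "auc": 0.81},
              {"lr": 1e-4, "auc": 0.86},
              {"lr": 1e-5, "auc": 0.79}]

pref = -1  # "maximize" -> negate so argmin picks the highest score
hyper_scores = [pref * dic["auc"] for dic in score_list]
best_params = score_list[int(np.argmin(hyper_scores))]
print(best_params)  # {'lr': 0.0001, 'auc': 0.86}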
Example #3
def model_from_metric(model, model_folder, metric):
    """
    Get the model with the best validation score according
    to a specified metric.
    Args:
      model (nff.nn.models): original NFF model loaded
      model_folder (str): path to the folder that the model is being trained in
      metric (str): name of metric to use
    Returns:
      model (nff.nn.models): NFF model updated with the state dict of
        the model with the best metric
    """

    # the requested metric is in ChemProp notation (e.g. auc, prc-auc),
    # but CP3D training uses different names (e.g. roc_auc, prc_auc),
    # so we convert to the CP3D name if necessary

    if metric in CHEMPROP_TRANSFORM:
        use_metric = CHEMPROP_TRANSFORM[metric]

    else:
        use_metric = metric

    # find the best epoch by reading the csv with the metrics
    best_score, best_epoch = parse_score(model_folder, use_metric)
    check_path = os.path.join(model_folder, "checkpoints",
                              f"checkpoint-{best_epoch}.pth.tar")

    state_dict = torch.load(check_path, map_location="cpu")["model"]
    fprint(f"Loading model state dict from {check_path}")
    model.load_state_dict(state_dict)
    model.eval()

    return model
Example #4
def update_info(job_path, vals, param_names, prop_name):
    """
    Update the config information and save it.
    Args:
      job_path (str): path to the folder with the job config file
      vals (list): new values to use
      param_names (list[str]): names of the parameters being updated
      prop_name (str): Name of property you're predicting
    Returns:
      None
    """

    with open(job_path, "r") as f:
        info = json.load(f)

    real_names = []
    real_vals = []

    for param_name, val in zip(param_names, vals):
        if param_name.startswith("log_"):
            # if anything starts with "log_" (e.g. "log_schnet_dropout"),
            # exponentiate its value to get the actual number
            real_names.append(param_name.replace("log_", ""))
            real_vals.append(np.exp(val))
        else:
            real_names.append(param_name)
            real_vals.append(val)

    # update values
    for param_type, val in zip(real_names, real_vals):
        if 'dropout' in param_type:
            update_dropout(info=info,
                           dropout=val,
                           dropout_type=param_type,
                           prop_name=prop_name)

        elif param_type == "num_heads":
            update_heads(info=info, heads=val)

        elif param_type == "attention_type":
            info["model_params"]["boltzmann_dict"]["type"] = val

        else:
            if param_type not in info["model_params"]:
                msg = (f"Warning: assuming that {param_type} "
                       "is just a key in `model_params`, but "
                       "it is not currently in `model_params` in "
                       "the config file. If it should be in a "
                       "different location then you will need "
                       "to write a custom function for updating "
                       "it.")

                fprint(msg)

            update_general(info, key=param_type, val=val)

    # save
    with open(job_path, "w") as f:
        json.dump(info, f, indent=4, sort_keys=True)
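
# Hedged, standalone illustration of the "log_" convention above: parameters
# sampled in log space are exponentiated before being written to the config.
# The names and values here are hypothetical.
import numpy as np

param_names = ["log_schnet_dropout", "num_heads"]
vals = [np.log(0.2), 4]

real = {}
for name, val in zip(param_names, vals):
    if name.startswith("log_"):
        real[name.replace("log_", "")] = float(np.exp(val))
    else:
        real[name] = val

print(real)  # {'schnet_dropout': 0.2..., 'num_heads': 4}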
Example #5
def make_nff_dataset(spec_dics,
                     nbrlist_cutoff,
                     parallel_feat_threads,
                     strict_conformers,
                     csv_folder,
                     extra_features,
                     add_directed_idx,
                     average_nbrs=False):
    """
    Make an NFF dataset
    Args:
        spec_dics (list[dict]): a list of dictionaries, one with the data
            for each species
        nbrlist_cutoff (float): Cutoff for two atoms to be considered
            neighbors.
        parallel_feat_threads (int): how many parallel threads
            to use when making the features.
        strict_conformers (bool): Whether to exclude any species whose
            conformers don't all have the same SMILES.
        csv_folder (str): path to folder that contains the csv files
            with the test/val/train smiles.
        extra_features (list[dict]): list of extra features dictionaries
        add_directed_idx (bool): whether to calculate and add the kj
            and ji indices. These indices tell you which edges connect
            to other edges.
        average_nbrs (bool): whether to build the dataset with
            `make_avg_dataset` (averaged neighbor lists) instead of
            `make_big_dataset`.
    Returns:
        big_dataset (nff.data.dataset): NFF dataset

    """

    fprint("Making dataset with %d species" % (len(spec_dics)))

    if average_nbrs:
        big_dataset = make_avg_dataset(
            spec_dics=spec_dics,
            nbrlist_cutoff=nbrlist_cutoff,
            parallel_feat_threads=parallel_feat_threads,
            strict_conformers=strict_conformers)
    else:
        big_dataset = make_big_dataset(
            spec_dics=spec_dics,
            nbrlist_cutoff=nbrlist_cutoff,
            parallel_feat_threads=parallel_feat_threads)

    # clean up
    fprint("Cleaning up dataset...")
    big_dataset = clean_up_dset(dset=big_dataset,
                                nbrlist_cutoff=nbrlist_cutoff,
                                strict_conformers=strict_conformers,
                                csv_folder=csv_folder,
                                add_directed_idx=add_directed_idx,
                                num_procs=parallel_feat_threads)

    # add any other requested features
    big_dataset = add_features(dset=big_dataset,
                               extra_features=extra_features,
                               parallel_feat_threads=parallel_feat_threads)

    return big_dataset
Example #6
def add_kj_ji_parallel(dataset, num_procs):
    """
    Add the kj and ji edge indices to a dataset in parallel.
    Args:
         dataset (nff.data.dataset): NFF dataset
         num_procs (int): number of parallel processes
    Returns:
        None
    """

    fprint((f"Adding kj and ji indices with {num_procs} "
            "parallel processes"))

    datasets = split_dataset(dataset=dataset, num=num_procs)
    datasets = kj_ji_parallel(datasets)
    new_props = rejoin_props(datasets)
    dataset.props = new_props
Example #7
def add_bond_idx_parallel(dataset, num_procs):
    """
    Add bond indices to a dataset in parallel.
    Args:
         dataset (nff.data.dataset): NFF dataset
         num_procs (int): number of parallel processes
    Returns:
        None
    """

    fprint((f"Adding bond indices with {num_procs} "
            "parallel processes"))

    datasets = split_dataset(dataset=dataset, num=num_procs)
    datasets = bond_idx_parallel(datasets)
    new_props = rejoin_props(datasets)
    dataset.props = new_props
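
# `split_dataset`, `kj_ji_parallel`/`bond_idx_parallel` and `rejoin_props` are
# helpers from this repository. Below is a minimal, self-contained sketch of
# the same split -> process-in-parallel -> rejoin pattern using only the
# standard library; it is an illustration, not the repository's implementation.
from multiprocessing import Pool

def process_chunk(chunk):
    # stand-in for per-chunk work such as computing bond or kj/ji indices
    return [x * 2 for x in chunk]

def split_list(items, num):
    size = (len(items) + num - 1) // num
    return [items[i:i + size] for i in range(0, len(items), size)]

if __name__ == "__main__":
    data = list(range(10))
    chunks = split_list(data, num=2)
    with Pool(2) as pool:
        new_chunks = pool.map(process_chunk, chunks)
    rejoined = [x for chunk in new_chunks for x in chunk]
    print(rejoined)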
Example #8
def main(from_model_path, to_model_path, num_confs, conf_file, **kwargs):
    """
    Load the dataset, reduce the number of conformers, and save it.
    Args:
        from_model_path (str): The path to the folder in which
            the old dataset is saved.
        to_model_path (str): The path to the folder in which
            the new dataset will be saved.
        num_confs (int): Desired number of conformers per species
        conf_file (str): Path to the JSON file that tells you which
            conformer indices to use for each species.
    Returns:
        None
    """

    # load `conf_file` if given

    if conf_file is not None:
        with open(conf_file, "r") as f:
            idx_dic = json.load(f)
    else:
        idx_dic = None

    # If the folder has sub_folders 0, 1, ..., etc.,
    # then load each dataset in each sub-folder. Otherwise
    # the dataset must be in the main folder.

    folders = sorted([i for i in os.listdir(from_model_path) if i.isdigit()],
                     key=lambda x: int(x))

    if folders == []:
        folders = [""]

    # Go through each dataset, update it, and save it

    for folder in tqdm(folders):

        fprint(folder)
        for name in ["train.pth.tar", "test.pth.tar", "val.pth.tar"]:
            load_path = os.path.join(from_model_path, folder, name)
            if not os.path.isfile(load_path):
                continue
            dataset = Dataset.from_file(load_path)
            dataset = trim_confs(dataset=dataset,
                                 num_confs=num_confs,
                                 idx_dic=idx_dic)

            save_folder = os.path.join(to_model_path, folder)
            if not os.path.isdir(save_folder):
                os.makedirs(save_folder)
            save_path = os.path.join(save_folder, name)
            dataset.save(save_path)
Example #9
def summarize_rd(new_sets, first_set):
    """
    Summarize how many RDKit mols were successfully made.
    Args:
        first_set (nff.data.dataset): initial NFF dataset
        new_sets (list): chunks of new datasets updated
            with RDKit mols.
    Returns:
        None
    """
    tried = len(first_set)
    succ = sum([len(d) for d in new_sets])
    pct = succ / tried * 100
    fprint("Converted %d of %d molecules (%.2f%%)." %
           (succ, tried, pct))
Example #10
def get_metrics(actual_dic, pred_dics, metrics, cp_model_path):
    """
    Get all requested metric scores for a set of predictions and save
    to a JSON file.
    Args:
      actual_dic (dict): dictionary of the form {prop: real}, where `real` are the
        real values of the property `prop`.
      pred_dics (list[dict]): list of dictionaries, each with the same form as
        `actual_dic` but containing the values predicted by a different model.
      metrics (list[str]): metrics to apply
      cp_model_path (str): path to the folder with the model of interest
    Returns:
      None
    """

    overall_dic = {}
    for i, pred_dic in enumerate(pred_dics):
        metric_dic = {}
        for prop in pred_dic.keys():
            if prop == "smiles":
                continue
            actual = actual_dic[prop]
            pred = pred_dic[prop]

            metric_dic[prop] = {}

            for metric in metrics:
                score = apply_metric(metric, pred, actual)
                metric_dic[prop][metric] = score

        overall_dic[str(i)] = metric_dic

    props = [prop for prop in pred_dic.keys() if prop != 'smiles']
    overall_dic['average'] = {prop: {} for prop in props}
    sub_dics = [val for key, val in overall_dic.items() if key != 'average']

    for prop in props:
        for key in sub_dics[0][prop].keys():
            vals = [sub_dic[prop][key] for sub_dic in sub_dics]
            mean = np.mean(vals).item()
            std = np.std(vals).item()
            overall_dic['average'][prop][key] = {"mean": mean, "std": std}

    save_path = os.path.join(cp_model_path, "test_metrics.json")
    with open(save_path, "w") as f:
        json.dump(overall_dic, f, indent=4, sort_keys=True)

    fprint(f"Saved metric scores to {save_path}")
Example #11
def featurize_parallel(dataset,
                       num_procs,
                       bond_feats=BOND_FEAT_TYPES,
                       atom_feats=ATOM_FEAT_TYPES):
    """
    Add RDKit mols, atom features and bond features to a dataset in 
    parallel.
    Args:
         dataset (nff.data.dataset): NFF dataset
         num_procs (int): number of parallel processes
         bond_feats (list[str]): names of bond features
         atom_feats (list[str]): names of atom features
    Returns:
        None
    """

    # offsets can be sparse tensors, which in torch versions <= 1.3
    # can't be pickled, so we remove them temporarily to avoid
    # errors during parallelization

    add_offsets = False
    if "offsets" in dataset.props:
        offsets = copy.deepcopy(dataset.props["offsets"])
        add_offsets = True
        dataset.props.pop("offsets")

    msg = f"Featurizing dataset with {num_procs} parallel processes."
    if num_procs == 1:
        msg = msg.replace("processes", "process")
    fprint(msg)

    # split the dataset so processes can act in parallel on the chunks
    datasets = split_dataset(dataset=dataset, num=num_procs)

    # add RDKit mols if they're not already in the dataset
    has_rdmols = all(['rd_mols' in dset.props for dset in datasets])
    if not has_rdmols:
        fprint("Converting xyz to RDKit mols...")
        datasets = rd_parallel(datasets)
        summarize_rd(new_sets=datasets, first_set=dataset)

    fprint("Featurizing bonds...")
    datasets = bonds_parallel(datasets, feat_types=bond_feats)

    fprint("Featurizing atoms...")
    datasets = atoms_parallel(datasets, feat_types=atom_feats)

    # rejoin the dataset
    new_props = rejoin_props(datasets)
    dataset.props = new_props

    # rename the bond list as `bonded_nbr_list`
    new_props["bonded_nbr_list"] = copy.deepcopy(new_props["bond_list"])
    new_props.pop("bond_list")

    if add_offsets:
        dataset.props["offsets"] = offsets
Example #12
def summarize(save_paths, feat_folder):
    """
    Summarize where the files were saved and what their contents are.
    Args:
      save_paths (list[str]): list of the paths to all the saved features files
      feat_folder (str): path to the folder that contains all the feature files.
    Returns:
      None
    """

    base_dir = "/".join(save_paths[0].split("/")[:-1])
    save_names = [get_name(path) for path in save_paths]
    num_files = len(save_paths)
    string = "\n".join(save_names)
    summary = (f"Saved {num_files} files with features \n"
               f"Used model in {feat_folder} \n\n"
               f"Save folder: \n{base_dir}\n\n"
               f"Save names: \n{string}")
    fprint(summary)
Example #13
def add_features(dset, extra_features, parallel_feat_threads):
    """
    Add any requested features to the dataset
    Args:
        dset (nff.data.dataset): NFF dataset
        extra_features (list[dict]): list of extra features,
            where each item is a dictionary of the form
            {"name": name, "params": {params needed}}.
        parallel_feat_threads (int): how many parallel threads
            to use when making the features.
    Returns:
        dset (nff.data.dataset): updated NFF dataset
    """

    for dic in tqdm(extra_features):

        name = dic["name"]
        params = dic["params"]

        if name.lower() == "e3fp":
            length = params["length"]
            fprint(f"Adding E3FP fingerprints of size {length}...")
            dset.add_e3fp(length, num_procs=parallel_feat_threads)
        if name.lower() == "whim":
            fprint("Adding whim fingerprints...")
            dset.featurize_rdkit('whim')
        if name.lower() == "morgan":
            length = params["length"]
            fprint(f"Adding Morgan fingerprints of size {length}...")
            dset.add_morgan(length)

    return dset
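
# Hedged example of the `extra_features` layout this function expects; the
# parameter values below are placeholders, not taken from the original source.
extra_features = [
    {"name": "morgan", "params": {"length": 2048}},
    {"name": "e3fp", "params": {"length": 1024}},
    {"name": "whim", "params": {}},
]
# dset = add_features(dset=dset,
#                     extra_features=extra_features,
#                     parallel_feat_threads=4)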
Example #14
def add_e3fp_parallel(dataset,
                      fp_length,
                      num_procs):
    """
    Add E3FP fingerprints to a dataset in parallel.
    Args:
         dataset (nff.data.dataset): NFF dataset
         fp_length (int): fingerprint length
         num_procs (int): number of parallel processes
    Returns:
        None
    """
    msg = f"Adding E3FP fingerprints with {num_procs} parallel processes."
    if num_procs == 1:
        msg = msg.replace("processes", "process")
    fprint(msg)

    # split the dataset, run E3FP in parallel, and rejoin it
    datasets = split_dataset(dataset=dataset, num=num_procs)
    datasets = e3fp_parallel(datasets, fp_length=fp_length)
    new_props = rejoin_props(datasets)
    dataset.props = new_props
Example #15
    def objective(hyperparams):

        # clean up the model folder from the previous iteration
        clean_up(model_path=model_path)

        # Convert hyperparams from float to int when necessary
        for key, typ in param_type_dic.items():
            if typ == "int":
                hyperparams[key] = int(hyperparams[key])

        # print hyperparameters being used
        val_str = "  " + "\n  ".join(
            [f"{key}: {val}" for key, val in hyperparams.items()])
        fprint(f"Hyperpameters used this round:\n{val_str}")

        # update config file, run, get the score, and save
        vals = [hyperparams[key] for key in param_names]
        update_info(job_path=job_path,
                    vals=vals,
                    param_names=param_names,
                    prop_name=prop_name)

        # train the model and get the score
        best_score = run(job_path=job_path,
                         model_path=model_path,
                         metric=metric)

        # get the hyperparameter score, given that the aim is
        # to minimize whatever comes out
        metric_obj = METRIC_DIC[convert_metric(metric)]
        hyper_score = -best_score if (metric_obj == "maximize") else best_score

        # save the score
        save_score(dic_path=dic_path,
                   hyperparams=hyperparams,
                   metric=metric,
                   best_score=best_score)

        return hyper_score
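
# Hedged, separate sketch of how an objective like the one above is typically
# consumed. The surrounding script presumably hands it to a minimizer; this
# self-contained toy uses hyperopt's `fmin` with a dict search space, which is
# an assumption about the setup rather than something shown in the source.
from hyperopt import fmin, hp, tpe

def toy_objective(hyperparams):
    # stand-in for training a model and returning a score to minimize
    return (hyperparams["log_dropout"] + 2) ** 2

space = {"log_dropout": hp.uniform("log_dropout", -5, 0)}
best = fmin(fn=toy_objective, space=space, algo=tpe.suggest, max_evals=10)
print(best)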
Example #16
def summarize(csv_folder, dataset_type):
    """
    Summarize where the splits have been saved and what their contents are.
    Args:
      csv_folder (str): path to the folder in which we will save our
        csv files with the SMILES, properties and training splits.
      dataset_type (str): type of problem, e.g. "classification" or 
        "regression".
    Returns:
      None
    """
    msgs = []
    for name in ['train', 'val', 'test', 'all']:
        if name == 'all':
            path = os.path.join(csv_folder, f"{name}.csv")
        else:
            path = os.path.join(csv_folder, f"{name}_full.csv")
        with open(path, "r") as f:
            lines = f.readlines()[1:]

        num_specs = len(lines)
        this_msg = f"{num_specs} species"
        if dataset_type == "classification":
            num_pos = len(
                [line for line in lines if int(line.split(",")[-1]) == 1])
            this_msg += f", {num_pos} positives"

        msgs.append(this_msg)

    msg = (f"Splits saved in {csv_folder}\n"
           f"Train files: train_smiles.csv and train_full.csv ({msgs[0]})\n"
           f"Validation files: val_smiles.csv and val_full.csv ({msgs[1]}) \n"
           f"Test files: test_smiles.csv and test_full.csv ({msgs[2]})\n"
           f"Combined file: all.csv ({msgs[3]})")

    fprint(msg)
Example #17
def cp_hyperopt(cp_folder, hyp_folder, rerun):
    """
    Run hyperparameter optimization with ChemProp.
    Args:
      cp_folder (str): path to the chemprop folder on your computer
      hyp_folder (str): where you want to store your hyperparameter
        optimization models
      rerun (bool): whether to rerun hyperparameter optimization if
        `hyp_folder` already exists and has the completion file
        `best_params.json`.
    Returns:
      best_params (dict): best parameters from hyperparameter 
        optimization
    """

    # path to `best_params.json` file
    param_file = os.path.join(hyp_folder, "best_params.json")
    params_exist = os.path.isfile(param_file)

    # If it exists and you don't want to re-run, then load it
    if params_exist and (not rerun):

        fprint(f"Loading hyperparameter results from {param_file}\n")

        with open(param_file, "r") as f:
            best_params = json.load(f)
        return best_params

    # otherwise run the script and read in the results

    hyp_script = os.path.join(cp_folder, "hyperparameter_optimization.py")
    config_path = os.path.join(hyp_folder, "config.json")

    with open(config_path, "r") as f:
        config = json.load(f)

    data_path = config["data_path"]
    dataset_type = config["dataset_type"]
    cmd = get_cp_cmd(hyp_script, config_path, data_path, dataset_type)
    cmd += f" --config_save_path {param_file}"

    fprint(f"Running hyperparameter optimization in folder {hyp_folder}\n")

    fprint(cmd)
    p = bash_command(f"source activate chemprop && {cmd}")
    p.wait()

    with open(param_file, "r") as f:
        best_params = json.load(f)

    return best_params
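
# `bash_command` and `get_cp_cmd` are repository helpers. Below is a hedged
# guess at what a thin `bash_command`-style wrapper could look like, using
# only the standard library; this is an assumption for illustration, not the
# repository's actual implementation.
import subprocess

def bash_command_sketch(cmd):
    # launch the command through bash and return the process handle,
    # so callers can do `p.wait()` as above
    return subprocess.Popen(cmd, shell=True, executable="/bin/bash")

# p = bash_command_sketch("echo hello")
# p.wait()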
Example #18
def clean_up_dset(dset, nbrlist_cutoff, strict_conformers, csv_folder,
                  add_directed_idx, num_procs):
    """
    Clean up the dataset after you've made it: filter out problem species,
    add bond indices, make the neighbor lists directed, and re-save the
    SMILES splits.
    Args:
        dset (nff.data.dataset): NFF dataset
        nbrlist_cutoff (float): Cutoff for two atoms to be considered
            neighbors.
        strict_conformers (bool): Whether to exclude any species whose
            conformers don't all have the same SMILES.
        csv_folder (str): path to folder that contains the csv files
            with the test/val/train smiles.
        add_directed_idx (bool): whether to calculate and add the kj
            and ji indices. These indices tell you which edges connect
            to other edges.
        num_procs (int): how many parallel threads to use when making the 
            kj and ji indices.
    Returns:
        dset (nff.data.dataset): cleaned up dataset

    """

    old_num = len(dset)

    # smiles we're getting rid of
    remove_smiles = []
    total = 3 + int(add_directed_idx)

    with tqdm(total=total) as pbar:

        # if requested, get rid of any species whose conformers have different
        # SMILES strings
        if strict_conformers:
            dset, removed = filter_same_smiles(dset)
            remove_smiles += removed

        # iterate the tqdm progress bar
        pbar.update(1)

        # Get rid of any conformers whose bond lists aren't subsets of the
        # neighbor list
        dset, removed = filter_bonds_in_nbr(nbrlist_cutoff, dset)
        remove_smiles += removed
        pbar.update(1)

        # Add the indices of the neighbor list that correspond to
        # bonded atoms. Only use one process to avoid running
        # out of memory

        dset.generate_bond_idx(num_procs=1)
        pbar.update(1)

        # Make sure the dataset is directed
        dset.make_all_directed()

        # add the kj and ji idx if requested
        if add_directed_idx:
            # only use one process to avoid running out of memory
            dset.generate_kj_ji(num_procs=1)
            pbar.update(1)

    # Re-save the train/val/test splits accounting for the fact that some
    # species are no longer there

    resave_splits(csv_folder=csv_folder, remove_smiles=remove_smiles)
    new_num = old_num - len(remove_smiles)

    changed_num = old_num != new_num

    # Print a warning if the total number of species has changed
    if changed_num:
        msg = ("WARNING: the original SMILES splits have been re-saved with "
               f"{new_num} species, reduced from the original {old_num}, "
               f"because only {new_num} species made it into the final "
               "dataset. This could be because of conformers with bond "
               "lengths greater than the cutoff distance of %.2f"
               ) % nbrlist_cutoff

        if strict_conformers:
            msg += (", or because the conformers of certain species didn't "
                    "all have the same SMILES string")
        msg += "."

        fprint(msg)

    return dset
Example #19
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--from_model_path',
                        type=str,
                        help="Path to model from which original data comes")
    parser.add_argument('--to_model_path',
                        type=str,
                        help="Path to model to which new data is saved")
    parser.add_argument('--num_confs',
                        type=int,
                        help="Number of conformers per species",
                        default=1)
    parser.add_argument('--conf_file',
                        type=str,
                        help=("Path to json that says which conformer "
                              "to use for each species. This is optional. "
                              "If you don't specify the conformers, the "
                              "script will default to taking the `num_confs` "
                              "lowest conformers, ordered by statistical "
                              "weight."),
                        default=None)

    args = parser.parse_args()

    try:
        main(**args.__dict__)
    except Exception as e:
        fprint(e)
        pdb.post_mortem()
Example #20
def main(dset_folder,
         device,
         model_folder,
         batch_size,
         prop,
         sub_batch_size,
         feat_save_folder,
         metric=None,
         val_only=False,
         train_only=False,
         test_only=False,
         track=True,
         max_confs=None,
         **kwargs):
    """
    Get fingerprints and predictions from the model.
    Args:
      dset_folder (str): folder with the data in it
      device (Union[str, int]): device on which you run the model
      model_folder (str): path to the folder that the model is being trained in
      batch_size (int): how many data points per batch
      prop (str): property to predict
      sub_batch_size (int): how many conformers to put in memory at a time
      feat_save_folder (str): folder in which we're saving the features
      metric (str): name of metric to use. If not given, this defaults to
        taking the model with the best validation loss.
      train_only (bool): only load the training set
      val_only (bool): only load the validation set
      test_only (bool): only load the test set
      track (bool): Whether to track progress with tqdm
      max_confs (int): Maximum number of conformers to use when evaluating the
          model
    """

    # get the model initially by taking the one saved as "best_model"
    model = load_model(model_folder)
    # update its state_dict with the checkpoint from the epoch with
    # the best metric score

    if metric is None:
        fprint(("WARNING: You have not specified a metric with which "
                "to choose the best model. Defaulting to whichever model "
                "was chosen as the best during training."))
    else:
        fprint(f"Loading model with best validation {metric}")
        model = model_from_metric(model=model,
                                  model_folder=model_folder,
                                  metric=metric)
    model.eval()

    paths, dset_names = get_dset_paths(dset_folder,
                                       train_only=train_only,
                                       val_only=val_only,
                                       test_only=test_only)

    # go through each dataset, create a loader, evaluate the model,
    # and save the predictions

    iter_func = get_iter_func(track, num_track=len(dset_names))

    for i in iter_func(range(len(dset_names))):
        results = {}
        targets = {}
        j = 0
        for path in tqdm(paths[i]):
            dataset = Dataset.from_file(path)
            if max_confs is not None:
                dataset = trim_confs(dataset=dataset,
                                     num_confs=max_confs,
                                     idx_dic=None,
                                     enum_func=iter_func)

            loader = DataLoader(dataset,
                                batch_size=batch_size,
                                collate_fn=collate_dicts)

            new_results, new_targets = evaluate(model,
                                                loader,
                                                device=device,
                                                sub_batch_size=sub_batch_size,
                                                track=track)

            is_first = (j == 0)
            results = add_dics(base=results,
                               new=new_results,
                               is_first=is_first)

            targets = add_dics(base=targets,
                               new=new_targets,
                               is_first=is_first)
            j += 1

        name = dset_names[i]
        save_name = f"pred_{metric}_{name}.pickle"
        if feat_save_folder is None:
            feat_save_folder = dset_folder
        if not os.path.isdir(feat_save_folder):
            os.makedirs(feat_save_folder)

        pickle_path = os.path.join(feat_save_folder, save_name)

        save(results=results,
             targets=targets,
             feat_save_folder=pickle_path,
             prop=prop)
Example #21
def get_gpu_splits(weight_path,
                   rank,
                   world_size,
                   params,
                   max_confs):
    """ 
    Check if there are already datasets in each parallel folder.
    If so, load and return those datasets instead of loading the whole
    thing in memory on every gpu and splitting afterwards.

    Args:
        weight_path (str): training folder
        rank (int): global rank of the current process
        world_size (int): total number of GPUs altogether
        params (dict): training/network parameters
        max_confs (int): maximum number of conformers per
            species.

    Returns:
        datasets (list): train, val, and test datasets if
            they have already been split by GPU.
            None otherwise.

    """

    # get the parallel folders: weight_path / {0, 1, 2, ..., n_gpus}

    par_folders = [os.path.join(weight_path, folder) for
                   folder in os.listdir(weight_path) if
                   folder.isdigit()]

    # see if the data has already been split by gpu

    train_splits = ["train.pth.tar", "val.pth.tar", "test.pth.tar"]
    dset_name = "dataset.pth.tar"
    has_train_splits = all([name in os.listdir(folder) for name in train_splits
                            for folder in par_folders])
    has_dset = all([dset_name in os.listdir(folder) for folder in par_folders])
    has_splits = (has_train_splits or has_dset) and len(
        par_folders) >= world_size

    # if not, return None

    if not has_splits:
        return

    dat_path = os.path.join(weight_path, str(rank), "dataset.pth.tar")
    split_paths = [os.path.join(weight_path, str(rank), name + ".pth.tar")
                   for name in ["train", "val", "test"]]

    # if the train/val/test splits are already saved, then load them

    if all([os.path.isfile(path) for path in split_paths]):
        if max_confs is not None and (rank == 0):
            conf_str = "conformer" if max_confs == 1 else "conformers"
            fprint(("Reducing each species to have a maximum of "
                    f"{max_confs} {conf_str}..."))
        datasets = [load_dset(path, max_confs, rank) for path in split_paths]
        return datasets

    # otherwise get the dataset, split it, and save it

    dataset = load_dset(dat_path, max_confs, rank)

    # split this sub-dataset into train/val/test

    train, val, test = split_train_validation_test(
        dataset,
        val_size=params['split'][0],
        test_size=params['split'][1]
    )

    datasets = [train, val, test]

    # save the splits to the training folder

    names = ['train', 'val', 'test']
    for d_set, name in zip(datasets, names):
        data_path = os.path.join(weight_path, str(rank),
                                 "{}.pth.tar".format(name))
        d_set.save(data_path)

    return datasets
Example #22
def make_nff_dataset(spec_dics, nbrlist_cutoff, parallel_feat_threads,
                     strict_conformers, csv_folder, extra_features,
                     add_directed_idx):
    """
    Make an NFF dataset
    Args:
        spec_dics (list[dict]): a list of dictionaries, one with the data
            for each species
        nbrlist_cutoff (float): Cutoff for two atoms to be considered
            neighbors.
        parallel_feat_threads (int): how many parallel threads
            to use when making the features.
        strict_conformers (bool): Whether to exclude any species whose
            conformers don't all have the same SMILES.
        csv_folder (str): path to folder that contains the csv files
            with the test/val/train smiles.
        extra_features (list[dict]): list of extra features dictionaries
        add_directed_idx (bool): whether to calculate and add the kj
            and ji indices. These indices tell you which edges connect
            to other edges.
    Returns:
        big_dataset (nff.data.dataset): NFF dataset

    """

    fprint("Making dataset with %d species" % (len(spec_dics)))

    props_list = []
    nbr_list = []
    rd_mols_list = []

    for j, spec_dic in tqdm_enum(spec_dics):

        # Exclude keys related to individual conformers. These
        # include conformer features, in case you've already put
        # those in your pickle files. If not we'll generate them
        # below

        small_spec_dic = {
            key: val
            for key, val in spec_dic.items() if key not in CONF_KEYS
        }

        # Treat each species' data like a regular dataset
        # and use it to generate neighbor lists

        dataset = Dataset(small_spec_dic, units='kcal/mol')

        # number of atoms in the molecule
        mol_size = len(dataset.props["nxyz"][0])
        dataset.generate_neighbor_list(cutoff=nbrlist_cutoff, undirected=False)

        # now combine the neighbor lists so that this set
        # of nxyz's can be treated like one big molecule

        nbrs = dataset.props['nbr_list']
        new_nbrs = []

        # shift by i * mol_size for each conformer
        for i in range(len(nbrs)):
            new_nbrs.append(nbrs[i] + i * mol_size)

        # add to list of conglomerated neighbor lists
        nbr_list.append(torch.cat(new_nbrs))
        dataset.props.pop('nbr_list')

        # concatenate the nxyz's
        nxyz = np.concatenate([np.array(item) for item in spec_dic["nxyz"]
                               ]).reshape(-1, 4).tolist()

        # add properties as necessary
        new_dic = {
            "mol_size": mol_size,
            "nxyz": nxyz,
            "weights": (torch.Tensor(spec_dic["weights"]).reshape(-1, 1)
                        / sum(spec_dic["weights"])),
            "degeneracy": torch.Tensor(spec_dic["degeneracy"]).reshape(-1, 1),
            "energy": torch.Tensor(spec_dic["energy"]).reshape(-1, 1),
            "num_atoms": [len(nxyz)]
        }

        new_dic.update({
            key: val[:1]
            for key, val in dataset.props.items() if key not in new_dic.keys()
        })

        props_list.append(new_dic)
        rd_mols_list.append(spec_dic["rd_mols"])

    # Add props that are in some datasets but not others
    props_list = add_missing(props_list)
    # convert the list of dictionaries into a dictionary of lists / tensors
    props_dic = concatenate_dict(*props_list)
    # make a combined dataset where the species look like they're
    # one big molecule
    big_dataset = Dataset(props_dic, units='kcal/mol')
    # give it the proper neighbor list and rdkit mols
    big_dataset.props['nbr_list'] = nbr_list
    big_dataset.props["rd_mols"] = rd_mols_list

    # generate atom and bond features
    big_dataset.featurize(num_procs=parallel_feat_threads)

    # clean up
    fprint("Cleaning up dataset...")
    big_dataset = clean_up_dset(dset=big_dataset,
                                nbrlist_cutoff=nbrlist_cutoff,
                                strict_conformers=strict_conformers,
                                csv_folder=csv_folder,
                                add_directed_idx=add_directed_idx,
                                num_procs=parallel_feat_threads)

    # add any other requested features
    big_dataset = add_features(dset=big_dataset,
                               extra_features=extra_features,
                               parallel_feat_threads=parallel_feat_threads)

    return big_dataset
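
# Hedged, standalone illustration of the index shift above: each conformer's
# neighbor list is offset by `i * mol_size` so that all conformers can be
# stacked and treated as one large molecule. The values are made up.
import torch

mol_size = 3
nbrs = [torch.tensor([[0, 1], [1, 2]]),   # conformer 0
        torch.tensor([[0, 2], [1, 2]])]   # conformer 1

new_nbrs = [nbr + i * mol_size for i, nbr in enumerate(nbrs)]
combined = torch.cat(new_nbrs)
print(combined.tolist())  # [[0, 1], [1, 2], [3, 5], [4, 5]]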
Example #23
def report_delta(bare_dic):
    """
    For a binary task, report analysis on the difference between
        similarity among hits and similarity between hits and misses.
    Args:
        bare_dic (dict): bare dictionary of similarities
    Returns:
        None
    """
    for key, dic in bare_dic.items():
        fprint(f"Results for {key}")
        fprint("+/- indicates standard deviation of the mean")

        # attention and random differences in similarity
        delta_att = dic['intra_pos']['att'] - dic['inter']['att']
        delta_rand = dic['intra_pos']['random'] - dic['inter']['random']

        # compute mean for attention
        delta_att_mean = np.mean(delta_att)
        # std deviation on the mean
        delta_att_std = np.std(delta_att) / (len(delta_att))**0.5

        # same for random
        delta_rand_mean = np.mean(delta_rand)
        delta_rand_std = np.std(delta_rand) / (len(delta_rand))**0.5

        # delta delta is the difference in deltas between random and attention,
        # a measure of how much attention is learning

        delta_delta_mean = delta_att_mean - delta_rand_mean
        delta_delta_std = ((np.var(delta_att) + np.var(delta_rand))**0.5 /
                           (len(delta_att))**0.5)

        fprint("Delta att: %.4f +/- %.4f" % (delta_att_mean, delta_att_std))
        fprint("Delta rand: %.4f +/- %.4f" % (delta_rand_mean, delta_rand_std))
        fprint("Delta delta: %.4f +/- %.4f" %
               (delta_delta_mean, delta_delta_std))
        fprint("\n")
Example #24
def conf_sims_from_files(model_path,
                         max_samples,
                         classifier,
                         seed,
                         external_fp_fn=None,
                         summary_path=None,
                         rd_path=None,
                         fp_kwargs=None):
    """
    Get similarity among species according to predictions of different
    models, given a folder with all of the prediction pickles.
    Args:
        model_path (str): path to the folder where the prediction pickles
            are saved.
        max_samples (int): maximum number of pairs to compare
        classifier (bool): whether your model is a classifier
        seed (int): random seed
        external_fp_fn (str, optional): name of the fingerprinting
            function you want to use. If none is provided then the model's
            generated fingerprint will be used.
        summary_path (str, optional): path of the file with the summary
            dictionary of species properties, their pickle
            paths, etc.
        rd_path (str, optional): path to the folder that has all your
            pickles with RDKit mols.
        fp_kwargs (dict, optional): any keyword arguments you need
            when calling an external fingerprinter.
    Returns:
        analysis (dict): dictionary of the form {prediction_name:
            similarity_dic} for the name of each prediction file.
        bare_data (dict): same idea as `analysis` but with the full
            set of similarities between each molecule.
    """

    fprint("Loading pickle files...")
    pred_files = get_pred_files(model_path)
    pred = load_preds(pred_files)

    bare_data = {}

    fprint("Calculating fingerprint similarities...")

    for key in tqdm(pred):
        dic = pred[key]
        annotate_confs(dic)
        fp_dics = attention_sim(dic=dic,
                                max_samples=max_samples,
                                classifier=classifier,
                                seed=seed,
                                external_fp_fn=external_fp_fn,
                                summary_path=summary_path,
                                rd_path=rd_path,
                                fp_kwargs=fp_kwargs)
        bare_data[key] = fp_dics

    # analyze the bare data once all predictions have been processed
    analysis = {}
    analyze_data(bare_data, analysis)

    if classifier:
        report_delta(bare_data)

    return analysis, bare_data
Example #25
def main(max_confs, summary_path, dataset_folder, pickle_folder, num_threads,
         thread, nbrlist_cutoff, csv_folder, parallel_feat_threads,
         strict_conformers, extra_features, add_directed_idx, average_nbrs,
         **kwargs):
    """
    Sample species, load their pickles, create an NFF dataset, and
    save train/val/test splits.
    Args:
        max_confs (int): Maximum number of conformers per species
        summary_path (str): Path to file with summary dictionary
        dataset_folder (str): base folder for the datasets
        pickle_folder (str): path to folder that contains all
            the pickle files. Each sub-dictionary in `sample_dic`
            will have the key `pickle_path`. Joining `pickle_folder`
            with `pickle_path` gives the full path to the file.
        num_threads (int): Total number of sections into which
            we're splitting and saving the dataset.
        thread (int): Index that tells us which section of the
            total dataset that we're creating and saving
        nbrlist_cutoff (float): Cutoff for two atoms to be considered
            neighbors.
        csv_folder (str): path to folder that contains the csv files
            with the test/val/train smiles.
        parallel_feat_threads (int): how many parallel threads
            to use when making the features.
        strict_conformers (bool): Whether to exclude any species whose
            conformers don't all have the same SMILES.
        extra_features (list[dict]): list of extra features,
            where each item is a dictionary of the form
            {"name": name, "params": {params needed}}.
        add_directed_idx (bool): whether to calculate and add the kj
            and ji indices. These indices tell you which edges connect
            to other edges.
        average_nbrs (bool): whether to build the dataset with
            `make_avg_dataset` (averaged neighbor lists) instead of
            `make_big_dataset`.
    Returns:
        None

    """

    with open(summary_path, "r") as f:
        summary_dic = json.load(f)

    fprint("Loading splits...")

    sample_dic = get_sample(summary_dic=summary_dic,
                            thread=thread,
                            num_threads=num_threads,
                            csv_folder=csv_folder)

    fprint("Loading data from pickle files...")
    overall_dic = load_data_from_pickle(sample_dic, pickle_folder)

    fprint("Converting data...")
    spec_dics = convert_data(overall_dic, max_confs)

    fprint("Combining to make NFF dataset...")
    dataset = make_nff_dataset(spec_dics=spec_dics,
                               nbrlist_cutoff=nbrlist_cutoff,
                               parallel_feat_threads=parallel_feat_threads,
                               strict_conformers=strict_conformers,
                               csv_folder=csv_folder,
                               extra_features=extra_features,
                               add_directed_idx=add_directed_idx,
                               average_nbrs=average_nbrs)

    fprint("Creating test/train/val splits...")
    save_splits(dataset=dataset,
                dataset_folder=dataset_folder,
                thread=thread,
                sample_dic=sample_dic)

    fprint((f"Complete! Saved section {thread} of the dataset in "
            f"{os.path.join(dataset_folder, str(thread))}.\n\n"))
Example #26
def main(base_config_path,
         hyp_config_path,
         use_hyperopt,
         rerun_hyperopt,
         cp_folder,
         feature_folder,
         model_folder_cp,
         metrics,
         feat_options,
         mpnn_options,
         **kwargs):
    """
    Run transfer learning using fingerprints from 3D models evaluated by performance
    on a variety of metrics. Different models are trained with the fingerprints and
    with or without an MPNN.
    Args:
      base_config_path (str): where your basic job config file
        is, with parameters that may or may not be changed depending
        on the given run
      hyp_config_path (str): where your basic hyperopt job config file
        is, with parameters that may or may not be changed depending
        on the given run
      use_hyperopt (bool): do a hyperparameter optimization before training
        the model
      rerun_hyperopt (bool): whether to rerun hyperparameter optimization if
        `hyp_folder` already exists and has the completion file
        `best_params.json`.
      cp_folder (str): path to the chemprop folder on your computer
      feature_folder (str): directory with files for the features of the species
      model_folder_cp (str): directory in which you'll be saving your model
        folders
      metrics (list[str]): metrics you want to use
      feat_options (list[bool]): options you want to use for features. For example,
        [True, False] means you want to train one model with features and one without,
        while [True] just means you want to train one with features.
      mpnn_options (list[bool]): same idea as `feat_options`, but for whether or not to
        use an MPNN
    Returns:
      None
    """

    cwd = os.path.abspath(".")
    script = os.path.join(cwd, "cp_tl.py")

    for feat in feat_options:
        for mpnn in mpnn_options:
            # can't run anything without either features or an MPNN
            if (not feat) and (not mpnn):
                continue
            for metric in metrics:

                paths = []
                for split in ['train', 'val', 'test']:
                    paths.append(os.path.join(feature_folder,
                                              f"{split}_{metric}.npz"))

                train_feat_path, val_feat_path, test_feat_path = paths

                train_folder = get_train_folder(
                    model_folder_cp=model_folder_cp,
                    feature_folder=feature_folder,
                    metric=metric,
                    feat=feat,
                    mpnn=mpnn)

                msg = get_msg(feat, mpnn, train_folder)
                fprint(msg)

                cmd = (f"python {script} "
                       f"--base_config_path {base_config_path} "
                       f"--hyp_config_path {hyp_config_path} "
                       f"--metric {metric} "
                       f"--train_feat_path {train_feat_path} "
                       f"--val_feat_path {val_feat_path} "
                       f"--test_feat_path {test_feat_path} "
                       f"--train_folder {train_folder} "
                       f"--cp_folder {cp_folder} ")

                if use_hyperopt:
                    cmd += "--use_hyperopt "
                if rerun_hyperopt:
                    cmd += "--rerun_hyperopt "
                if not mpnn:
                    cmd += "--features_only "
                if not feat:
                    cmd += "--no_features "

                p = bash_command(cmd)
                p.wait()