def main():
    """Build the ChEMBL train / dev-valid / test SMILES files.

    The work is delegated to helpers and happens in three stages:

    1) read the raw SMILES from the ChEMBL dump; the reader is expected to drop
       SMILES shorter than 5 or longer than 200 chars and those with forbidden symbols
    2) canonicalize and neutralize every molecule in parallel (only SMILES shorter
       than 100 chars are meant to survive), excluding molecules that are too
       similar to the holdout set
    3) deduplicate, shuffle with a fixed seed, write the three split files and
       compare their hashes against the precomputed ones.

    Raises:
        SystemExit: if any of the written files does not match its expected hash.
    """
    setup_default_logger()

    args = get_argparser().parse_args()

    # Fixed seed: the shuffle further down must be reproducible.
    np.random.seed(1337)
    neutralization_rxns = initialise_neutralisation_reactions()
    smiles_dict = AllowedSmilesCharDictionary()

    print("Preprocessing ChEMBL molecules...")

    out_dir = args.destination
    chembl_file = os.path.join(out_dir, CHEMBL_FILE_NAME)

    # The holdout file ships with the package; the SMILES is the first
    # space-separated token on each line.
    holdout_text = pkgutil.get_data("guacamol.data", "holdout_set_gcm_v1.smiles").decode("utf-8")
    holdout_mols = [line.split(" ")[0] for line in holdout_text.splitlines()]
    holdout_set = set(canonicalize_list(holdout_mols, False))
    holdout_fps = get_fingerprints_from_smileslist(holdout_set)

    # Fetch the ChEMBL dump unless it is already on disk, then read it.
    download_if_not_present(chembl_file, uri=CHEMBL_URL)
    raw_smiles = get_raw_smiles(
        chembl_file,
        smiles_char_dict=smiles_dict,
        open_fn=gzip.open,
        extract_fn=extract_chembl,
    )

    file_prefix = "chembl24_canon"

    print(
        f"and standardizing {len(raw_smiles)} molecules using {args.n_jobs} cores, "
        f"and excluding molecules based on ECFP4 similarity of > {TANIMOTO_CUTOFF} to the holdout set."
    )

    # One delayed job per raw SMILES string, executed by the joblib pool.
    process_one = delayed(filter_and_canonicalize)
    jobs = (
        process_one(smi, holdout_set, holdout_fps, neutralization_rxns, TANIMOTO_CUTOFF, False)
        for smi in raw_smiles
    )
    output = Parallel(n_jobs=args.n_jobs, verbose=2)(jobs)

    # Keep the first entry of every non-empty result, deduplicate, and sort
    # before shuffling so the seeded shuffle always starts from the same order.
    unique_mols = {result[0] for result in output if result}
    all_good_mols = sorted(unique_mols)
    np.random.shuffle(all_good_mols)
    print(f"Ended up with {len(all_good_mols)} molecules. Preparing splits...")

    # 5% dev-valid, 15% test, remainder train.
    n_total = len(all_good_mols)
    n_valid = int(0.05 * n_total)
    n_test = int(0.15 * n_total)
    heldout_end = n_valid + n_test

    splits = (
        ("dev-valid", all_good_mols[:n_valid]),
        ("test", all_good_mols[n_valid:heldout_end]),
        ("train", all_good_mols[heldout_end:]),
    )

    split_paths = {}
    for split_name, molecules in splits:
        target = os.path.join(out_dir, f"{file_prefix}_{split_name}.smiles")
        write_smiles(molecules, target)
        split_paths[split_name] = target

    # Compare every file against its precomputed hash so that everyone works
    # with the same splits. All three comparisons run before deciding.
    expected_hashes = (
        ("train", TRAIN_HASH),
        ("dev-valid", VALID_HASH),
        ("test", TEST_HASH),
    )
    hash_results = [compare_hash(split_paths[name], digest) for name, digest in expected_hashes]

    if not all(hash_results):
        raise SystemExit("Invalid hashes for the dataset files")

    print("Dataset generation successful. You are ready to go.")
Exemplo n.º 2
0
def main():
    """Build train / dev-valid / test SMILES splits from ChEMBL or a custom file.

    Two input modes, selected by the command-line arguments:

    * ``--chembl``: download the ChEMBL dump if needed, exclude molecules that
      are too similar to the packaged holdout set, and verify the resulting
      files against precomputed hashes.
    * otherwise: read the SMILES file given with ``-i`` / ``--input``; no
      holdout filtering is applied and no hashes are checked.

    Preprocessing steps (carried out by the helpers called here):

    1) filter SMILES shorter than 5 and longer than 200 chars and those with forbidden symbols
    2) canonicalize, neutralize, only permit smiles shorter than 100 chars
    3) shuffle, write files, check if they are consistently hashed (ChEMBL mode only).

    Raises:
        IOError: if neither ``--chembl`` nor an input file was supplied.
        SystemExit: in ChEMBL mode, if a written file does not match its expected hash.
    """
    argparser = get_argparser()
    args = argparser.parse_args()

    # Set constants. The fixed seed keeps the shuffle below reproducible.
    np.random.seed(1337)
    neutralization_rxns = initialise_neutralisation_reactions()
    smiles_dict = AllowedSmilesCharDictionary()

    tanimoto_cutoff = args.tanimoto_cutoff

    # Either use chembl, or supplied SMILES file.

    print('Preprocessing molecules...')

    if args.chembl:

        print('Using Chembl')

        chembl_file = os.path.join(args.destination, CHEMBL_FILE_NAME)

        data = pkgutil.get_data('guacamol.data', 'holdout_set_gcm_v1.smiles').decode('utf-8').splitlines()

        # The SMILES is the first space-separated token on each holdout line.
        holdout_mols = [i.split(' ')[0] for i in data]
        holdout_set = set(canonicalize_list(holdout_mols, False))
        holdout_fps = get_fingerprints_from_smileslist(holdout_set)

        # Download the ChEMBL dump if needed.
        download_if_not_present(chembl_file,
                                uri=CHEMBL_URL)
        raw_smiles = get_raw_smiles(chembl_file, smiles_char_dict=smiles_dict, open_fn=gzip.open,
                                    extract_fn=extract_chembl)

        file_prefix = 'chembl24_canon'

        print(f'Excluding molecules based on ECFP4 similarity of > {tanimoto_cutoff} to the holdout set')

    else:
        if args.input is None:
            raise IOError(
                'You need to specify an input smiles file with -i {file} or --input {file}. \n'
                'Alternatively, provide the --chembl flag to download and process molecules from ChEMBL24 (recommended)')

        raw_smiles = get_raw_smiles(args.input, smiles_char_dict=smiles_dict, open_fn=open,
                                    extract_fn=extract_smilesfile)
        # A custom input has no holdout set, so the user-supplied cutoff is
        # deliberately overridden here.
        tanimoto_cutoff = 100  # effectively no cutoff
        holdout_set = set()
        holdout_fps = []
        file_prefix = args.output_prefix

    print()
    print(f'Standardizing {len(raw_smiles)} molecules using {args.n_jobs} cores...')

    # Process all the SMILES in parallel
    runner = Parallel(n_jobs=args.n_jobs, verbose=2)

    joblist = (delayed(filter_and_canonicalize)(smiles_str,
                                                holdout_set,
                                                holdout_fps,
                                                neutralization_rxns,
                                                tanimoto_cutoff,
                                                False)
               for smiles_str in raw_smiles)

    output = runner(joblist)

    # Put all nonzero molecules in a list, remove duplicates, sort and shuffle.
    # Sorting first gives the seeded shuffle a deterministic starting order.
    all_good_mols = sorted({item[0] for item in output if item})
    np.random.shuffle(all_good_mols)
    print(f'Ended up with {len(all_good_mols)} molecules. Preparing splits...')

    # Split into train-dev-test: 5% dev-valid, 15% test, remainder train.
    VALID_SIZE = int(0.05 * len(all_good_mols))
    TEST_SIZE = int(0.15 * len(all_good_mols))

    dev_set = all_good_mols[0:VALID_SIZE]
    dev_path = os.path.join(args.destination, f'{file_prefix}_dev-valid.smiles')
    write_smiles(dev_set, dev_path)

    test_set = all_good_mols[VALID_SIZE:VALID_SIZE + TEST_SIZE]
    test_path = os.path.join(args.destination, f'{file_prefix}_test.smiles')
    write_smiles(test_set, test_path)

    train_set = all_good_mols[VALID_SIZE + TEST_SIZE:]
    train_path = os.path.join(args.destination, f'{file_prefix}_train.smiles')
    write_smiles(train_set, train_path)

    # For chembl, check whether the md5-hashes of the generated smiles files
    # match the precomputed hashes; this ensures everyone works with the same splits.
    if args.chembl:
        # NOTE(review): compare_hash is assumed to signal a mismatch through a
        # falsy return value rather than by raising. Its results must therefore
        # be inspected; discarding them would report success unconditionally.
        valid_hashes = [
            compare_hash(train_path, TRAIN_HASH),
            compare_hash(dev_path, VALID_HASH),
            compare_hash(test_path, TEST_HASH),
        ]

        if not all(valid_hashes):
            raise SystemExit('Invalid hashes for the dataset files')

        print('The train/test/dev-file md5 hashes match the expected hashes.')

    print('You are ready to go.')