Exemplos de download_if_not_present em Python

Linguagem de programação: Python

Espaço para nome / nome do pacote: guacamol.utils.data

Método / Função: download_if_not_present

Exemplos em hotexamples.com: 2

download_if_not_present em Python - 2 exemplos encontrados. Esses são os exemplos do mundo real mais bem avaliados de guacamol.utils.data.download_if_not_present em Python extraídos de projetos de código aberto. Você pode avaliar os exemplos para nos ajudar a melhorar a qualidade deles.

Exemplo n.º 1

0

Exibir arquivo

Arquivo: get_data.py Projeto: VasaKiDD/end-to-end-drug-discovery

def main():
    """Build the train/dev/test SMILES files from the ChEMBL download.

    Preprocessing steps (as described by the original author; most of the
    work happens inside helpers that are not defined in this function):

    1) filter SMILES shorter than 5 and longer than 200 chars and those
       with forbidden symbols
    2) canonicalize, neutralize, only permit smiles shorter than 100 chars
    3) shuffle, write files, check if they are consistently hashed.

    Command-line arguments are read through ``get_argparser()``; this
    function uses ``args.destination`` (output directory) and
    ``args.n_jobs`` (worker count passed to ``Parallel``).

    Raises:
        SystemExit: if any of the three written files fails ``compare_hash``.

    NOTE(review): the original text refers to "Chembl-23" while the output
    prefix is ``chembl24_canon`` -- confirm which release CHEMBL_URL serves.
    """
    setup_default_logger()

    args = get_argparser().parse_args()

    # Seed NumPy's global RNG before anything else so that the
    # np.random.shuffle call further down yields the same order on every run.
    np.random.seed(1337)
    neutralization_rxns = initialise_neutralisation_reactions()
    smiles_dict = AllowedSmilesCharDictionary()

    print("Preprocessing ChEMBL molecules...")

    chembl_file = os.path.join(args.destination, CHEMBL_FILE_NAME)

    # The holdout set is read from package data; only the first
    # space-separated field of each line is kept.
    data = (
        pkgutil.get_data("guacamol.data", "holdout_set_gcm_v1.smiles").decode("utf-8").splitlines()
    )
    holdout_mols = [line.split(" ")[0] for line in data]
    holdout_set = set(canonicalize_list(holdout_mols, False))
    holdout_fps = get_fingerprints_from_smileslist(holdout_set)

    # Download the ChEMBL archive if it is not already at chembl_file.
    download_if_not_present(chembl_file, uri=CHEMBL_URL)
    raw_smiles = get_raw_smiles(
        chembl_file, smiles_char_dict=smiles_dict, open_fn=gzip.open, extract_fn=extract_chembl
    )

    file_prefix = "chembl24_canon"

    print(
        f"and standardizing {len(raw_smiles)} molecules using {args.n_jobs} cores, "
        f"and excluding molecules based on ECFP4 similarity of > {TANIMOTO_CUTOFF} to the holdout set."
    )

    # Process all the SMILES in parallel.
    runner = Parallel(n_jobs=args.n_jobs, verbose=2)
    joblist = (
        delayed(filter_and_canonicalize)(
            smiles_str, holdout_set, holdout_fps, neutralization_rxns, TANIMOTO_CUTOFF, False
        )
        for smiles_str in raw_smiles
    )
    output = runner(joblist)

    # Keep the first element of every non-empty result, drop duplicates,
    # and sort so the seeded shuffle always starts from the same order.
    all_good_mols = sorted({item[0] for item in output if item})
    np.random.shuffle(all_good_mols)
    print(f"Ended up with {len(all_good_mols)} molecules. Preparing splits...")

    # Split into dev (first 5%), test (next 15%) and train (the rest).
    valid_size = int(0.05 * len(all_good_mols))
    test_size = int(0.15 * len(all_good_mols))

    dev_set = all_good_mols[0:valid_size]
    dev_path = os.path.join(args.destination, f"{file_prefix}_dev-valid.smiles")
    write_smiles(dev_set, dev_path)

    test_set = all_good_mols[valid_size : valid_size + test_size]
    test_path = os.path.join(args.destination, f"{file_prefix}_test.smiles")
    write_smiles(test_set, test_path)

    train_set = all_good_mols[valid_size + test_size :]
    train_path = os.path.join(args.destination, f"{file_prefix}_train.smiles")
    write_smiles(train_set, train_path)

    # Compare the written files against the precomputed hashes so that
    # everyone works with the same splits. A list (not a generator) is used
    # on purpose: all three files are checked even if the first one fails.
    valid_hashes = [
        compare_hash(train_path, TRAIN_HASH),
        compare_hash(dev_path, VALID_HASH),
        compare_hash(test_path, TEST_HASH),
    ]

    if not all(valid_hashes):
        raise SystemExit("Invalid hashes for the dataset files")

    print("Dataset generation successful. You are ready to go.")

Exemplo n.º 2

0

Exibir arquivo

Arquivo: get_data.py Projeto: stephenra/guacamol

def main():
    """Build train/dev/test SMILES files from ChEMBL or a user-supplied file.

    Preprocessing steps (as described by the original author; most of the
    work happens inside helpers that are not defined in this function):

    1) filter SMILES shorter than 5 and longer than 200 chars and those
       with forbidden symbols
    2) canonicalize, neutralize, only permit smiles shorter than 100 chars
    3) shuffle, write files, check if they are consistently hashed.

    Command-line arguments are read through ``get_argparser()``; this
    function uses ``args.chembl``, ``args.input``, ``args.destination``,
    ``args.output_prefix``, ``args.tanimoto_cutoff`` and ``args.n_jobs``.

    Raises:
        IOError: if ``--chembl`` is not given and no input file is supplied.

    NOTE(review): the original text refers to "Chembl-23" while the output
    prefix is ``chembl24_canon`` -- confirm which release CHEMBL_URL serves.
    """
    args = get_argparser().parse_args()

    # Seed NumPy's global RNG before anything else so that the
    # np.random.shuffle call further down yields the same order on every run.
    np.random.seed(1337)
    neutralization_rxns = initialise_neutralisation_reactions()
    smiles_dict = AllowedSmilesCharDictionary()

    tanimoto_cutoff = args.tanimoto_cutoff

    # Either use chembl, or supplied SMILES file.
    print('Preprocessing molecules...')

    if args.chembl:
        print('Using Chembl')

        chembl_file = os.path.join(args.destination, CHEMBL_FILE_NAME)

        # The holdout set is read from package data; only the first
        # space-separated field of each line is kept.
        data = pkgutil.get_data('guacamol.data', 'holdout_set_gcm_v1.smiles').decode('utf-8').splitlines()
        holdout_mols = [line.split(' ')[0] for line in data]
        holdout_set = set(canonicalize_list(holdout_mols, False))
        holdout_fps = get_fingerprints_from_smileslist(holdout_set)

        # Download the ChEMBL archive if it is not already at chembl_file.
        download_if_not_present(chembl_file, uri=CHEMBL_URL)
        raw_smiles = get_raw_smiles(chembl_file, smiles_char_dict=smiles_dict, open_fn=gzip.open,
                                    extract_fn=extract_chembl)

        file_prefix = 'chembl24_canon'

        print(f'Excluding molecules based on ECFP4 similarity of > {tanimoto_cutoff} to the holdout set')
    else:
        if args.input is None:
            raise IOError(
                'You need to specify an input smiles file with -i {file} or --input {file}. \n'
                'Alternatively, provide the --chembl flag to download and process molecules from ChEMBL24 (recommended)')

        raw_smiles = get_raw_smiles(args.input, smiles_char_dict=smiles_dict, open_fn=open,
                                    extract_fn=extract_smilesfile)
        tanimoto_cutoff = 100  # effectively no cutoff
        # No holdout filtering for user-supplied files: empty set and fingerprints.
        holdout_set = set()
        holdout_fps = []
        file_prefix = args.output_prefix

    print()
    print(f'Standardizing {len(raw_smiles)} molecules using {args.n_jobs} cores...')

    # Process all the SMILES in parallel.
    runner = Parallel(n_jobs=args.n_jobs, verbose=2)
    joblist = (delayed(filter_and_canonicalize)(smiles_str,
                                                holdout_set,
                                                holdout_fps,
                                                neutralization_rxns,
                                                tanimoto_cutoff,
                                                False)
               for smiles_str in raw_smiles)
    output = runner(joblist)

    # Keep the first element of every non-empty result, drop duplicates,
    # and sort so the seeded shuffle always starts from the same order.
    all_good_mols = sorted({item[0] for item in output if item})
    np.random.shuffle(all_good_mols)
    print(f'Ended up with {len(all_good_mols)} molecules. Preparing splits...')

    # Split into dev (first 5%), test (next 15%) and train (the rest).
    valid_size = int(0.05 * len(all_good_mols))
    test_size = int(0.15 * len(all_good_mols))

    dev_set = all_good_mols[0:valid_size]
    dev_path = os.path.join(args.destination, f'{file_prefix}_dev-valid.smiles')
    write_smiles(dev_set, dev_path)

    test_set = all_good_mols[valid_size:valid_size + test_size]
    test_path = os.path.join(args.destination, f'{file_prefix}_test.smiles')
    write_smiles(test_set, test_path)

    train_set = all_good_mols[valid_size + test_size:]
    train_path = os.path.join(args.destination, f'{file_prefix}_train.smiles')
    write_smiles(train_set, train_path)

    # For chembl, compare the written files against the precomputed hashes
    # so that everyone works with the same splits.
    # NOTE(review): the return values of compare_hash are discarded here, so
    # the success message below is only accurate if compare_hash raises on a
    # mismatch -- confirm against its definition.
    if args.chembl:
        compare_hash(train_path, TRAIN_HASH)
        compare_hash(dev_path, VALID_HASH)
        compare_hash(test_path, TEST_HASH)
        print('The train/test/dev-file md5 hashes match the expected hashes.')

    print('You are ready to go.')