Example #1
# get unique clusters
clusters = tc['cluster'].unique()

# set up output data frame
mism = pd.DataFrame()

# iterate over clusters
for cluster in tqdm(clusters):
    # get the query structures
    pred_smiles = tc[(tc['method'] == 'PRISM 4') & \
                     (tc['cluster'] == cluster)].pred_smiles.unique()
    true_smiles = tc[(tc['method'] == 'PRISM 4') & \
                     (tc['cluster'] != cluster)].true_smiles.unique()

    # get Tanimoto coefficients
    true_mols = clean_mols(true_smiles)
    true_fps = get_ecfp6_fingerprints(true_mols)
    pred_mols = clean_mols(pred_smiles)
    pred_fps = get_ecfp6_fingerprints(pred_mols)
    tcs = get_tanimoto(true_fps, pred_fps)
    true_col = [[y for x in pred_smiles] for y in true_smiles]
    pred_col = [[x for x in pred_smiles] for y in true_smiles]
    # create data frame
    res = pd.DataFrame({
        'cluster': cluster,
        'true_smiles': flatten(true_col),
        'pred_smiles': flatten(pred_col),
        'Tc': tcs
    })

    # append to master sheet
    mism = mism.append(res)
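
# The helpers used above (clean_mols, get_ecfp6_fingerprints, get_tanimoto,
# flatten) come from the project's own utility module and are not shown in this
# snippet. Below is only a minimal, self-contained sketch of plausible
# RDKit-based equivalents; the *_sketch names and defaults (e.g. 1024-bit
# fingerprints) are assumptions, not the original implementations.
from itertools import chain

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

def clean_mols_sketch(smiles_list):
    # parse each SMILES string; failed parses come back as None
    return [Chem.MolFromSmiles(sm) for sm in smiles_list]

def get_ecfp6_fingerprints_sketch(mols, n_bits=1024):
    # ECFP6 corresponds to a Morgan fingerprint of radius 3
    return [AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=n_bits)
            for mol in mols if mol is not None]

def get_tanimoto_sketch(true_fps, pred_fps):
    # pairwise Tanimoto coefficients, with the true fingerprint as the outer
    # loop to match the true_col/pred_col construction above
    return list(chain.from_iterable(
        DataStructs.BulkTanimotoSimilarity(fp, pred_fps) for fp in true_fps))

def flatten_sketch(nested):
    # collapse a list of lists into a flat list
    return list(chain.from_iterable(nested))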
Example #2
for io_tuple in io:
    input_file = io_tuple[0]
    output_file = io_tuple[1]
    print("processing input file: " + os.path.basename(input_file))

    # read input
    dat = pd.read_csv(input_file)

    # create results container
    res = pd.DataFrame()

    # process antiSMASH SMILES
    print(".. processing antiSMASH smiles ...")
    antismash_smiles = dat['smiles_as'].values
    antismash_mols = clean_mols(antismash_smiles)
    antismash_fps = get_ecfp6_fingerprints(antismash_mols)
    for i, query_fp in enumerate(tqdm(antismash_fps)):
        tcs = get_tanimoto([query_fp], antismash_fps)
        rows = pd.DataFrame({
            'query_idx': i + 1,
            'target_idx': list(range(1,
                                     len(tcs) + 1)),
            'method': 'antiSMASH 5',
            'Tc': tcs
        })
        res = res.append(rows)

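    # Note: pandas.DataFrame.append (used above) is deprecated and was removed
    # in pandas 2.0, and appending row blocks inside a loop copies the frame on
    # every iteration. A sketch of the same all-vs-all comparison that collects
    # the per-query blocks in a list and concatenates once, with the same
    # columns and values as the antiSMASH block above:
    chunks = []
    for i, query_fp in enumerate(tqdm(antismash_fps)):
        tcs = get_tanimoto([query_fp], antismash_fps)
        chunks.append(pd.DataFrame({
            'query_idx': i + 1,
            'target_idx': list(range(1, len(tcs) + 1)),
            'method': 'antiSMASH 5',
            'Tc': tcs
        }))
    res = pd.concat(chunks, ignore_index=True)
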
    # process PRISM SMILES
    print(".. processing PRISM smiles ...")
    prism_smiles = dat['smiles_pr'].values
Example #3
parser.add_argument('--sampled_files',
                    type=str,
                    nargs='*',
                    help='file(s) containing sampled SMILES')
parser.set_defaults(stop_if_exists=False)
args = parser.parse_args()
print(args)

# make output directories
if not os.path.isdir(args.output_dir):
    os.makedirs(args.output_dir)

# read the training set SMILES and convert to molecules
org_smiles = read_smiles(args.original_file)
org_mols = [
    mol for mol in clean_mols(
        org_smiles, selfies=args.selfies, deepsmiles=args.deepsmiles) if mol
]
org_canonical = [Chem.MolToSmiles(mol) for mol in org_mols]


# define helper function to get % of rotatable bonds
def pct_rotatable_bonds(mol):
    n_bonds = mol.GetNumBonds()
    if n_bonds > 0:
        rot_bonds = Lipinski.NumRotatableBonds(mol) / n_bonds
    else:
        rot_bonds = 0
    return rot_bonds


# define helper function to get % of stereocenters
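# (the definition itself is cut off in this snippet; below is only a sketch of
# what such a helper might look like with RDKit, not the original code)
def pct_stereocenters_sketch(mol):
    # fraction of atoms flagged as stereocenters, counting unassigned ones
    n_atoms = mol.GetNumAtoms()
    if n_atoms > 0:
        return len(Chem.FindMolChiralCenters(
            mol, includeUnassigned=True)) / n_atoms
    return 0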

Example #4
# default the output directory to the input file's directory, if none was given
if not args.output_dir:
    args.output_dir = os.path.dirname(args.smiles_file)

# optionally stop if output file already exists
filename = os.path.basename(args.smiles_file)
split = os.path.splitext(filename)
output_file = os.path.join(args.output_dir, split[0] + "-outcomes.csv.gz")
if os.path.isfile(output_file) and args.stop_if_exists:
    print("output file " + output_file + " exists: stopping early")
    sys.exit()

# create results container
res = pd.DataFrame()

# read SMILES and convert to molecules
smiles = read_smiles(args.smiles_file)
mols = [mol for mol in clean_mols(smiles, selfies=args.selfies,
                                  deepsmiles=args.deepsmiles) if mol]
canonical = [Chem.MolToSmiles(mol, isomericSmiles=False) for mol in mols]

# also read the reference file
ref_smiles = read_smiles(args.reference_file)
ref_mols = [mol for mol in clean_mols(ref_smiles) if mol]
ref_canonical = [Chem.MolToSmiles(mol, isomericSmiles=False)
                 for mol in ref_mols]

## drop known molecules
canonical = [sm for sm in canonical if sm not in ref_canonical]
# re-parse molecules
mols = [mol for mol in clean_mols(canonical) if mol]

# calculate descriptors
## heteroatom distribution
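## (the descriptor code is cut off at this point in the snippet; a sketch of
## one way to tally a heteroatom distribution with RDKit, as an illustration
## rather than the script's actual descriptor set)
from collections import Counter
heteroatom_counts = Counter()
for mol in mols:
    heteroatom_counts.update(atom.GetSymbol() for atom in mol.GetAtoms()
                             if atom.GetSymbol() not in ('C', 'H'))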
Example #5
# parse arguments
input_file = sys.argv[1]
output_file = sys.argv[2]

# read SMILES
basename = os.path.basename(input_file)
smiles = read_smiles(input_file)

# remove duplicated SMILES
smiles = np.unique(smiles)
# record original count
initial_count = len(smiles)
print("parsing " + str(initial_count) + " unique SMILES")

# convert to molecules
mols = clean_mols(smiles, stereochem=False)
# remove molecules that could not be parsed
mols = [mol for mol in mols if mol]
print("parsed " + str(len(mols)) + " unique, valid canonical SMILES")

# remove salts/solvents
mols = [remove_salts_solvents(mol, hac=3) for mol in tqdm(mols)]
# remove molecules that were dropped during salt/solvent removal
mols = [mol for mol in mols if mol]
# remove charges
mols = [NeutraliseCharges(mol) for mol in tqdm(mols)]
print("parsed " + str(len(mols)) + \
      " molecules with >3 heavy atoms and 1 fragment")

# remove molecules with invalid atoms
## what unique atoms are present in any molecule?
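## (the snippet is cut off here; a sketch of the announced check: list the
## element symbols present in any molecule, then keep only molecules whose
## atoms fall within an allowed set; the allowed set below is an assumption,
## not taken from the original script)
elements = set()
for mol in mols:
    elements.update(atom.GetSymbol() for atom in mol.GetAtoms())
print("elements present: " + ", ".join(sorted(elements)))
allowed = {'Br', 'C', 'Cl', 'F', 'H', 'I', 'N', 'O', 'P', 'S'}
mols = [mol for mol in mols
        if all(atom.GetSymbol() in allowed for atom in mol.GetAtoms())]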
Example #6
paths = pd.read_table(git_dir + "/data/platinum/raw_paths.txt")
mols = pd.read_table(git_dir + "/data/platinum/all_mols.txt")
plat = pd.merge(mols, paths, how='left', on='id')

# get unique inputs
clusters = plat.cluster.unique()

# for each cluster: 
res = pd.DataFrame()
random.seed(0)
for cluster in tqdm(clusters): 
    print(cluster)
    # get real SMILES
    true_smiles = plat['smiles'][plat['cluster'] == cluster]
    # create molecules
    true_mols = clean_mols(true_smiles)
    # randomly sample one
    true_mol = sample(true_mols, 1)[0]
    # get functional groups
    fgs = identify_functional_groups(true_mol)
    row = pd.DataFrame({'cluster': cluster, 
                        'type': 'True',
                        'functional_group': [str(fg) for fg in fgs]})
    res = res.append(row)
    
    # test PRISM predictions
    prism_dir = git_dir + "/data/predictions/prism"
    prism_file = prism_dir + "/" + cluster + ".json"
    if os.path.isfile(prism_file):
        # read all SMILES from JSON
        with open(prism_file) as f:
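            # (the snippet is cut off here; a sketch of how it might continue:
            # parse the JSON and tabulate functional groups for the predicted
            # structures, mirroring the 'True' block above; json is assumed to
            # be imported at the top of the script, and extract_prism_smiles is
            # a hypothetical helper standing in for however the predicted
            # SMILES are pulled out of the PRISM 4 output)
            prism = json.load(f)
        pred_smiles = extract_prism_smiles(prism)  # hypothetical helper
        pred_mols = [mol for mol in clean_mols(pred_smiles) if mol]
        for pred_mol in pred_mols:
            fgs = identify_functional_groups(pred_mol)
            row = pd.DataFrame({'cluster': cluster,
                                'type': 'PRISM 4',
                                'functional_group': [str(fg) for fg in fgs]})
            res = res.append(row)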