Пример #1
0
def collect():
    """Extract features for every usable record in ``all_mol`` and save them.

    For each index of the module-level ``all_mol`` the record is read with
    ``read_mol``. From that record the function derives the canonical
    SMILES, the exact molecular weight, a 4096-bit radius-2 Morgan
    fingerprint, the CDK fingerprint and descriptors, the values of the
    record's ``'RI'`` mapping and the spectrum vector returned by ``ms2vec``.

    A record is skipped (nothing is stored for it) when reading it fails,
    when its SMILES cannot be parsed, when its exact molecular weight is
    above 2000, when its formula contains an element outside
    C, H, O, N, S, P, Si, F, Cl, Br, I, or when any feature computation
    raises. All outputs therefore stay row-aligned with each other.

    Files written under ``DeepEI/data/``:
        retention.npy, descriptor.npy, molwt.npy   -- dense arrays
        peakvec.npz, morgan.npz, fingerprints.npz  -- sparse CSR matrices
        all_smiles.json                            -- list of SMILES strings

    Returns:
        None.
    """
    # Built once instead of once per element per molecule.
    allowed_elements = frozenset(
        ('C', 'H', 'O', 'N', 'S', 'P', 'Si', 'F', 'Cl', 'Br', 'I'))
    max_molwt = 2000

    all_smiles = []
    peak_vecs = []
    ri_values = []
    morgan_fps = []
    cdk_fps = []
    cdk_descs = []
    mol_weights = []

    for i in tqdm(range(len(all_mol))):
        # ``except Exception`` rather than a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop this long loop.
        try:
            m = read_mol(i)
        except Exception:
            continue

        try:
            mol = Chem.MolFromSmiles(m['smiles'])
            if mol is None:
                # Unparsable SMILES: nothing can be computed for it.
                continue
            molwt = CalcExactMolWt(mol)
            if molwt > max_molwt:
                continue
            smiles = Chem.MolToSmiles(mol)

            # Reject molecules containing uncommon elements.
            elements = parser_formula(MolToFormula(MolFromSmiles(smiles)))
            if any(e not in allowed_elements for e in elements):
                continue

            morgan_fp = np.array(
                AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=4096))
            cdk_fp = get_cdk_fingerprints(smiles)
            cdk_des = np.array(get_cdk_descriptors(smiles))
            ri = list(m['RI'].values())
            peak_vec = ms2vec(m['peakindex'], m['peakintensity'])
        except Exception:
            # Best effort: any failure while featurising drops the record.
            continue

        # Append only after every feature succeeded so rows stay aligned.
        all_smiles.append(smiles)
        peak_vecs.append(peak_vec)
        ri_values.append(ri)
        morgan_fps.append(morgan_fp)
        cdk_fps.append(cdk_fp)
        cdk_descs.append(cdk_des)
        mol_weights.append(molwt)

    # Dense outputs.
    np.save('DeepEI/data/retention.npy', np.array(ri_values))
    np.save('DeepEI/data/descriptor.npy', np.array(cdk_descs))
    np.save('DeepEI/data/molwt.npy', np.array(mol_weights))

    # Sparse outputs: converted to CSR before being written.
    peak_matrix = csr_matrix(np.array(peak_vecs))
    morgan_matrix = csr_matrix(np.array(morgan_fps))
    cdk_matrix = csr_matrix(np.array(cdk_fps))
    save_npz('DeepEI/data/peakvec.npz', peak_matrix)
    save_npz('DeepEI/data/morgan.npz', morgan_matrix)
    save_npz('DeepEI/data/fingerprints.npz', cdk_matrix)

    with open('DeepEI/data/all_smiles.json', 'w') as t:
        json.dump(all_smiles, t)
Пример #2
0
        nist_fps)[:, fpkeep].todense()  # fingerprints of nist compounds
    nist_spec = load_npz('DeepEI/data/peakvec.npz').todense()

    pred_spec = np.load(
        'DeepEI/data/neims_spec_massbank.npy')  # spectra predicted by NEIMS

    output = pd.DataFrame(columns=['smiles', 'mass', 'score', 'rank'])
    for i in tqdm(range(len(smiles))):
        smi = smiles[i]
        std_smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi))
        mass = molwt[i]
        speci = spec[i]
        pred_fp = pred_fps[i]
        pred_sp = pred_spec[i]
        try:
            true_fp = np.array(get_cdk_fingerprints(
                std_smi))  # true fingerprint of the "unknown"
        except:
            continue
        true_fp = true_fp[fpkeep]
        true_score_fp = jaccard_score(pred_fp,
                                      true_fp)  # fp score of the true compound
        true_score_sp = weitht_dot_product(
            speci, pred_sp)  # sp score of the true compound
        true_score = true_score_fp * 0.7 + true_score_sp * 0.3

        candidate = np.where(
            np.abs(nist_masses - mass) < 5)[0]  # candidate of nist
        cand_smi = nist_smiles[candidate]
        rep_ind = np.where(
            cand_smi == std_smi)[0]  # if the compound in nist, remove it.
        candidate = np.delete(candidate, rep_ind)
Пример #3
0
    pred_smiles.append(Chem.MolToSmiles(m))
    pred_masses.append(CalcExactMolWt(m))

# Vectorise every entry of ``spectra`` with ms2vec; keep the dense array
# (spec_vecs) and its sparse CSR form (spec_vecs1).
spec_vecs = np.array(
    [ms2vec(spec['mz'], spec['intensity']) for spec in tqdm(spectra)])
spec_vecs1 = csr_matrix(spec_vecs)

# msbk_spec ends up as a CSR matrix built from its dense array form.
msbk_spec = csr_matrix(np.array(msbk_spec))

# Persist the sparse spectra first, then the SMILES lists, then the masses.
save_npz('DeepEI/data/neims_spec_msbk.npz', spec_vecs1)
save_npz('DeepEI/data/msbk_spec.npz', msbk_spec)

with open('DeepEI/data/msbk_smiles.json', 'w') as smiles_out:
    json.dump(msbk_smiles, smiles_out)

with open('DeepEI/data/neims_msbk_smiles.json', 'w') as smiles_out:
    json.dump(pred_smiles, smiles_out)

np.save('DeepEI/data/msbk_masses.npy', msbk_masses)
np.save('DeepEI/data/neims_msbk_masses.npy', pred_masses)

# CDK fingerprint for every SMILES in pred_smiles, one row per SMILES.
neims_msbk_cdkfps = []
for smi in tqdm(pred_smiles):
    # ``except Exception`` rather than a bare ``except`` so that Ctrl-C
    # (KeyboardInterrupt) and SystemExit still stop this loop.
    try:
        fp = get_cdk_fingerprints(smi)
    except Exception:
        # Best effort: keep the row (so indices still match pred_smiles) but
        # fill it with zeros. Presumably 8034 is the CDK fingerprint length
        # returned by get_cdk_fingerprints -- TODO confirm.
        fp = np.zeros(8034)
    neims_msbk_cdkfps.append(fp)

# A single dense conversion is enough before building the CSR matrix.
neims_msbk_cdkfps = csr_matrix(np.array(neims_msbk_cdkfps))
# The path has no extension; scipy's save_npz appends ".npz" to string paths.
save_npz('DeepEI/data/neims_msbk_cdkfps', neims_msbk_cdkfps)
Пример #4
0
    pred_fps = predict_fingerprint(
        test_spec, fpkeep)  # predict fingerprint of the "unknown"
    cdk_fp = load_npz('DeepEI/data/fingerprints.npz')
    cdk_fp = csr_matrix(cdk_fp)[:, fpkeep].todense()

    output = pd.DataFrame(columns=['smiles', 'mass', 'score', 'rank'])
    for i in tqdm(range(len(test))):
        smi = test_smiles[i]
        std_smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi))
        mass = test_mass[i]
        speci = test_spec[i]
        pred_fp = pred_fps[i]
        pred_sp = pred_spec[i]
        try:
            true_fp = np.array(
                get_cdk_fingerprints(smi))  # true fingerprint of the "unknown"
        except:
            continue
        true_fp = true_fp[fpkeep]
        true_score_fp = jaccard_score(pred_fp,
                                      true_fp)  # fp score of the true compound
        true_score_sp = weitht_dot_product(
            speci, pred_sp)  # sp score of the true compound
        true_score = 0.7 * true_score_fp + 0.3 * true_score_sp

        candidate = np.where(np.abs(masses - mass) < 5)[0]  # candidate of nist
        cand_smi = smiles[candidate]
        rep_ind = np.where(
            cand_smi == std_smi)[0]  # if the compound in nist, remove it.
        candidate = np.delete(candidate, rep_ind)