def collect():
    """Compute and save the feature arrays for every usable molecule.

    Walks over the indices of the file-level ``all_mol``, loads each record
    with ``read_mol`` and, for every record that passes the filters below,
    collects its canonical SMILES, exact molecular weight, Morgan
    fingerprint, CDK fingerprint, CDK descriptors, retention-index values
    and spectrum vector.

    A record is skipped silently (best effort) when:

    * ``read_mol`` fails for its index,
    * RDKit cannot parse its SMILES,
    * its exact molecular weight is greater than 2000,
    * its formula contains an element other than C, H, O, N, S, P, Si, F,
      Cl, Br or I, or
    * any of the feature computations raises.

    Files written (all below ``DeepEI/data/``):

    * ``retention.npy``    - retention-index values (``m['RI'].values()``)
    * ``descriptor.npy``   - CDK descriptors
    * ``molwt.npy``        - exact molecular weights
    * ``peakvec.npz``      - spectrum vectors (sparse)
    * ``morgan.npz``       - Morgan fingerprints, radius 2, 4096 bits (sparse)
    * ``fingerprints.npz`` - CDK fingerprints (sparse)
    * ``all_smiles.json``  - canonical SMILES, in the same row order

    Returns:
        None. The results are only written to disk.
    """
    # Built once instead of on every iteration of the loop.
    allowed_elements = frozenset(
        ['C', 'H', 'O', 'N', 'S', 'P', 'Si', 'F', 'Cl', 'Br', 'I'])

    all_smiles = []
    peak_data = []
    ri_data = []
    morgan_fps = []
    cdk_fps = []
    cdk_descriptors = []
    mol_weights = []

    for i in tqdm(range(len(all_mol))):
        # ``except Exception`` rather than a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit can still stop this long loop.
        try:
            m = read_mol(i)
        except Exception:
            continue

        try:
            mol = Chem.MolFromSmiles(m['smiles'])
            if mol is None:
                # RDKit signals an unparsable SMILES by returning None.
                continue
            molwt = CalcExactMolWt(mol)
            if molwt > 2000:
                continue
            smiles = Chem.MolToSmiles(mol)

            # Element filter. ``MolFromSmiles`` / ``MolToFormula`` are the
            # unqualified names imported elsewhere in this file and are
            # called exactly as before.
            # NOTE(review): they may not be RDKit's functions - confirm
            # before replacing this with the already parsed ``mol``.
            elements = parser_formula(MolToFormula(MolFromSmiles(smiles)))
            if any(e not in allowed_elements for e in elements):
                continue

            morgan_fp = np.array(
                AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=4096))
            cdk_fp = get_cdk_fingerprints(smiles)
            cdk_des = np.array(get_cdk_descriptors(smiles))
            ri = list(m['RI'].values())
            peak_vec = ms2vec(m['peakindex'], m['peakintensity'])
        except Exception:
            # Best effort: a record whose features cannot be computed is
            # dropped so that all saved arrays stay row-aligned.
            continue

        # Append only after every feature succeeded, keeping the lists in
        # lockstep.
        all_smiles.append(smiles)
        peak_data.append(peak_vec)
        ri_data.append(ri)
        morgan_fps.append(morgan_fp)
        cdk_fps.append(cdk_fp)
        cdk_descriptors.append(cdk_des)
        mol_weights.append(molwt)

    # Dense arrays.
    np.save('DeepEI/data/retention.npy', np.array(ri_data))
    np.save('DeepEI/data/descriptor.npy', np.array(cdk_descriptors))
    np.save('DeepEI/data/molwt.npy', np.array(mol_weights))

    # Spectrum vectors and fingerprints are stored as sparse matrices.
    save_npz('DeepEI/data/peakvec.npz', csr_matrix(np.array(peak_data)))
    save_npz('DeepEI/data/morgan.npz', csr_matrix(np.array(morgan_fps)))
    save_npz('DeepEI/data/fingerprints.npz', csr_matrix(np.array(cdk_fps)))

    with open('DeepEI/data/all_smiles.json', 'w') as t:
        json.dump(all_smiles, t)
nist_fps)[:, fpkeep].todense() # fingerprints of nist compounds nist_spec = load_npz('DeepEI/data/peakvec.npz').todense() pred_spec = np.load( 'DeepEI/data/neims_spec_massbank.npy') # spectra predicted by NEIMS output = pd.DataFrame(columns=['smiles', 'mass', 'score', 'rank']) for i in tqdm(range(len(smiles))): smi = smiles[i] std_smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi)) mass = molwt[i] speci = spec[i] pred_fp = pred_fps[i] pred_sp = pred_spec[i] try: true_fp = np.array(get_cdk_fingerprints( std_smi)) # true fingerprint of the "unknown" except: continue true_fp = true_fp[fpkeep] true_score_fp = jaccard_score(pred_fp, true_fp) # fp score of the true compound true_score_sp = weitht_dot_product( speci, pred_sp) # sp score of the true compound true_score = true_score_fp * 0.7 + true_score_sp * 0.3 candidate = np.where( np.abs(nist_masses - mass) < 5)[0] # candidate of nist cand_smi = nist_smiles[candidate] rep_ind = np.where( cand_smi == std_smi)[0] # if the compound in nist, remove it. candidate = np.delete(candidate, rep_ind)
pred_smiles.append(Chem.MolToSmiles(m)) pred_masses.append(CalcExactMolWt(m)) spec_vecs = [] for spec in tqdm(spectra): spec_vecs.append(ms2vec(spec['mz'], spec['intensity'])) spec_vecs = np.array(spec_vecs) spec_vecs1 = csr_matrix(spec_vecs) msbk_spec = np.array(msbk_spec) msbk_spec = csr_matrix(msbk_spec) save_npz('DeepEI/data/neims_spec_msbk.npz', spec_vecs1) save_npz('DeepEI/data/msbk_spec.npz', msbk_spec) with open('DeepEI/data/msbk_smiles.json', 'w') as t: json.dump(msbk_smiles, t) with open('DeepEI/data/neims_msbk_smiles.json', 'w') as t: json.dump(pred_smiles, t) np.save('DeepEI/data/msbk_masses.npy', msbk_masses) np.save('DeepEI/data/neims_msbk_masses.npy', pred_masses) neims_msbk_cdkfps = [] for smi in tqdm(pred_smiles): try: fp = get_cdk_fingerprints(smi) except: fp = np.zeros(8034) neims_msbk_cdkfps.append(fp) neims_msbk_cdkfps = np.array(neims_msbk_cdkfps) neims_msbk_cdkfps = csr_matrix(np.array(neims_msbk_cdkfps)) save_npz('DeepEI/data/neims_msbk_cdkfps', neims_msbk_cdkfps)
pred_fps = predict_fingerprint( test_spec, fpkeep) # predict fingerprint of the "unknown" cdk_fp = load_npz('DeepEI/data/fingerprints.npz') cdk_fp = csr_matrix(cdk_fp)[:, fpkeep].todense() output = pd.DataFrame(columns=['smiles', 'mass', 'score', 'rank']) for i in tqdm(range(len(test))): smi = test_smiles[i] std_smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi)) mass = test_mass[i] speci = test_spec[i] pred_fp = pred_fps[i] pred_sp = pred_spec[i] try: true_fp = np.array( get_cdk_fingerprints(smi)) # true fingerprint of the "unknown" except: continue true_fp = true_fp[fpkeep] true_score_fp = jaccard_score(pred_fp, true_fp) # fp score of the true compound true_score_sp = weitht_dot_product( speci, pred_sp) # sp score of the true compound true_score = 0.7 * true_score_fp + 0.3 * true_score_sp candidate = np.where(np.abs(masses - mass) < 5)[0] # candidate of nist cand_smi = smiles[candidate] rep_ind = np.where( cand_smi == std_smi)[0] # if the compound in nist, remove it. candidate = np.delete(candidate, rep_ind)