def collect():
    """Build the training arrays from the molecule library and save them.

    For every index of ``all_mol`` the record is loaded with ``read_mol``.
    A record is skipped when it cannot be read, when its SMILES cannot be
    parsed, when its exact molecular weight (``CalcExactMolWt``) is above
    2000, when it contains an element outside the allowed set, or when any
    of the feature computations fails.

    For each kept record the function collects the canonical SMILES, the
    peak vector (``ms2vec``), the retention-index values, a 4096-bit Morgan
    fingerprint of radius 2, the CDK fingerprint, the CDK descriptors and
    the exact molecular weight, then writes them under ``DeepEI/data/``.
    """
    # Built once instead of on every loop iteration.
    allowed_elements = {
        'C', 'H', 'O', 'N', 'S', 'P', 'Si', 'F', 'Cl', 'Br', 'I'
    }

    all_smiles = []
    Peak_data = []
    RI_data = []
    Morgan_fp = []
    CDK_fp = []
    CDK_des = []
    MolWt = []

    for i in tqdm(range(len(all_mol))):
        # `except Exception` (not a bare `except`) keeps the best-effort
        # skipping but lets KeyboardInterrupt/SystemExit stop the run.
        try:
            m = read_mol(i)
        except Exception:
            continue

        try:
            mol = Chem.MolFromSmiles(m['smiles'])
            if mol is None:
                # Unparsable SMILES: skip explicitly rather than relying
                # on the next call failing.
                continue
            molwt = CalcExactMolWt(mol)
            if molwt > 2000:
                continue
            smiles = Chem.MolToSmiles(mol)

            # Reject molecules containing uncommon elements.
            elements = parser_formula(MolToFormula(MolFromSmiles(smiles)))
            for e in elements:
                if e not in allowed_elements:
                    raise ValueError('contain uncommon element')

            morgan_fp = np.array(
                AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=4096))
            cdk_fp = get_cdk_fingerprints(smiles)
            cdk_des = np.array(get_cdk_descriptors(smiles))
            ri = list(m['RI'].values())
            peak_vec = ms2vec(m['peakindex'], m['peakintensity'])
        except Exception:
            continue

        # Append only after every feature succeeded so that all lists stay
        # aligned row by row.
        all_smiles.append(smiles)
        Peak_data.append(peak_vec)
        RI_data.append(ri)
        Morgan_fp.append(morgan_fp)
        CDK_fp.append(cdk_fp)
        CDK_des.append(cdk_des)
        MolWt.append(molwt)

    # Dense outputs.
    np.save('DeepEI/data/retention.npy', np.array(RI_data))
    np.save('DeepEI/data/descriptor.npy', np.array(CDK_des))
    np.save('DeepEI/data/molwt.npy', np.array(MolWt))

    # Peak vectors and fingerprints are stored as sparse matrices.
    Peak_data = csr_matrix(np.array(Peak_data))
    Morgan_fp = csr_matrix(np.array(Morgan_fp))
    CDK_fp = csr_matrix(np.array(CDK_fp))
    save_npz('DeepEI/data/peakvec.npz', Peak_data)
    save_npz('DeepEI/data/morgan.npz', Morgan_fp)
    save_npz('DeepEI/data/fingerprints.npz', CDK_fp)

    with open('DeepEI/data/all_smiles.json', 'w') as t:
        json.dump(all_smiles, t)
s = [weitht_dot_product(x, X[i, :]) for i in range(X.shape[0])] return s # predict ms of the isolate compounds # run once, then save pred_spec = np.zeros((len(test), 2000)) for a, i in enumerate(tqdm(test)): smi = smiles[i] writeSDF(smi, 'Temp/mol.sdf') cwd = 'E:\\project\\deep-molecular-massspec' cmd = 'python make_spectra_prediction.py --input_file=E:/project/DeepEI/Temp/mol.sdf --output_file=E:/project/DeepEI/Temp/mol_anno.sdf --weights_dir=model/massspec_weights' subprocess.call(cmd, cwd=cwd) try: speci = parser_NEIMS('Temp/mol_anno.sdf') pred_vec = ms2vec(speci['mz'], speci['intensity']) os.unlink('Temp/mol_anno.sdf') except: pred_vec = np.zeros( 2000 ) # # if error, use a zero vec as placeholder. but it won't count when comparsion. pred_spec[a, :] = pred_vec os.unlink('Temp/mol.sdf') np.save('Discussion/NIST_test/neims_spec_nist.npy', pred_spec) if __name__ == '__main__': from scipy.sparse import load_npz masses = np.load('DeepEI/data/molwt.npy') spec = load_npz('DeepEI/data/peakvec.npz').todense()
split = json.load(js) keep = np.array(split['keep']) isolate = np.array(split['isolate']) # NEIMS spectra of NIST nist_smiles = np.array(json.load(open('DeepEI/data/all_smiles.json'))) writeSDF(nist_smiles, 'Temp/mol.sdf') cwd = 'E:\\project\\deep-molecular-massspec' cmd = 'python make_spectra_prediction.py --input_file=E:/project/DeepEI/Temp/mol.sdf --output_file=E:/project/DeepEI/Temp/mol_anno.sdf --weights_dir=model/massspec_weights' subprocess.call(cmd, cwd=cwd) spectra = parser_NEIMS('Temp/mol_anno.sdf') spec_vecs = [] for spec in tqdm(spectra): spec_vecs.append(ms2vec(spec['mz'], spec['intensity'])) spec_vecs = np.array(spec_vecs) spec_vecs1 = csr_matrix(spec_vecs) save_npz('DeepEI/data/neims_spec_nist.npz', spec_vecs1) # NEIMS spectra of MassBank exist_smiles = nist_smiles[keep] data = msp.read('E:/data/GCMS DB_AllPublic-KovatsRI-VS2.msp') msbk_smiles = [] msbk_spec = [] msbk_masses = [] for i, (param, ms) in enumerate(tqdm(data)): smi = param['smiles'] try: smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi)) except:
from DeepEI.predict import predict_fingerprint from DeepEI.utils import ms2vec, vec2ms, get_cdk_fingerprints data = msp.read('E:/data/GCMS DB_AllPublic-KovatsRI-VS2.msp') smiles = [] spec = [] molwt = [] for i, (param, ms) in enumerate(tqdm(data)): smi = param['smiles'] try: mass = CalcExactMolWt(Chem.MolFromSmiles(smi)) except: continue molwt.append(mass) smiles.append(smi) spec.append(ms2vec(ms[:, 0], ms[:, 1])) mlp = pd.read_csv('Fingerprint/results/mlp_result.txt', sep='\t', header=None) mlp.columns = ['id', 'accuracy', 'precision', 'recall', 'f1'] fpkeep = mlp['id'][np.where(mlp['f1'] > 0.5)[0]] spec = np.array(spec) pred_fps = predict_fingerprint( spec, fpkeep) # predict fingerprint of the "unknown" nist_smiles = np.array(json.load(open('DeepEI/data/all_smiles.json'))) nist_masses = np.load('DeepEI/data/molwt.npy') nist_fps = load_npz('DeepEI/data/fingerprints.npz') nist_fps = csr_matrix(
# Load the stored library: molecular weights and the dense peak-vector matrix.
nist_masses = np.load('DeepEI/data/molwt.npy')
nist_spec = load_npz('DeepEI/data/peakvec.npz').todense()

# Read the query records and keep those whose SMILES yields a molecular weight.
data = msp.read('E:/data/GCMS DB_AllPublic-KovatsRI-VS2.msp')
smiles = []
spec = []
molwt = []
for i, (param, ms) in enumerate(tqdm(data)):
    smi = param['smiles']
    # `except Exception` instead of a bare `except`, so that Ctrl-C still
    # interrupts the loop while unparsable records are skipped as before.
    try:
        mass = CalcExactMolWt(Chem.MolFromSmiles(smi))
    except Exception:
        continue
    molwt.append(mass)
    smiles.append(smi)
    # Column 0 and column 1 of `ms` are passed to ms2vec in that order.
    spec.append(ms2vec(ms[:, 0], ms[:, 1]))

pred_spec = []
output = pd.DataFrame(columns=['smiles', 'mass', 'score', 'rank'])
for i in tqdm(range(len(smiles))):
    smi = smiles[i]  # smiles
    # Canonicalise the SMILES; fall back to an empty string on failure.
    try:
        std_smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi))
    except Exception:
        std_smi = ''
    specr = spec[i]  # true spectrum
    mass = molwt[i]  # true mol weight
    # Skip this compound when the temporary SDF cannot be written.
    try:
        writeSDF(smi, 'Temp/mol.sdf')
    except Exception:
        continue
# NOTE(review): this is the interior of a function whose header is outside the
# visible chunk; `all_smiles`, `Peak_data` and `f` are presumably defined
# before this point — confirm against the full file.
RI_data = []
Morgan_fp = []
CDK_fp = []
CDK_des = []
MolWt = []
mols = Chem.SDMolSupplier(f)
for m in tqdm(mols):
    # Entries the supplier yields as None are skipped.
    if m is None:
        continue
    smi = Chem.MolToSmiles(m)
    morgan_fp = np.array(
        AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=4096))
    cdk_fp = get_cdk_fingerprints(smi)
    cdk_des = np.array(get_cdk_descriptors(smi))

    props = m.GetPropsAsDict()
    molwt = props['EXACT MASS']
    # One peak per line, "<m/z> <intensity>" separated by a single space.
    pairs = [p.split(' ') for p in props['MASS SPECTRAL PEAKS'].split('\n')]
    peakindex = np.array([round(float(pair[0])) for pair in pairs])
    peakintensity = np.array([float(pair[1]) for pair in pairs])
    # Fix: the original called ms2vec(m['peakindex'], m['peakintensity']),
    # indexing the RDKit molecule `m` as if it were a dict and ignoring the
    # arrays parsed just above. Use the parsed arrays.
    peak_vec = ms2vec(peakindex, peakintensity)

    all_smiles.append(smi)
    Peak_data.append(peak_vec)
    Morgan_fp.append(morgan_fp)
    CDK_fp.append(cdk_fp)
    CDK_des.append(cdk_des)
    MolWt.append(molwt)