# NOTE(review): whitespace-collapsed script fragment. It begins inside a loop
# body (the `continue` below needs an enclosing loop whose header is not
# visible here) and ends inside the `for i in tqdm(range(len(smiles)))` loop.
# Visible steps, in order:
#   1. Compute `mass` with CalcExactMolWt(Chem.MolFromSmiles(smi)); on ANY
#      exception the entry is skipped via `continue`.
#   2. Append mass, SMILES and ms2vec(ms[:, 0], ms[:, 1]) to molwt/smiles/spec.
#   3. Read mlp_result.txt (tab-separated, no header) and keep the `id` values
#      of rows whose `f1` is > 0.5 (`fpkeep`).
#   4. Call predict_fingerprint(spec, fpkeep) on the collected spectra.
#   5. Load the NIST SMILES, masses, fingerprints (columns restricted to
#      `fpkeep`), peak vectors, and the NEIMS-predicted spectra from DeepEI/data.
#   6. Create an empty `output` DataFrame (smiles, mass, score, rank) and start
#      iterating over `smiles`, round-tripping each through
#      Chem.MolFromSmiles / Chem.MolToSmiles.
# NOTE(review): the bare `except:` also swallows KeyboardInterrupt/SystemExit;
# consider `except Exception:` once the expected failure type is confirmed.
# NOTE(review): `json.load(open(...))` never explicitly closes the file handle;
# a `with open(...)` block would.
try: mass = CalcExactMolWt(Chem.MolFromSmiles(smi)) except: continue molwt.append(mass) smiles.append(smi) spec.append(ms2vec(ms[:, 0], ms[:, 1])) mlp = pd.read_csv('Fingerprint/results/mlp_result.txt', sep='\t', header=None) mlp.columns = ['id', 'accuracy', 'precision', 'recall', 'f1'] fpkeep = mlp['id'][np.where(mlp['f1'] > 0.5)[0]] spec = np.array(spec) pred_fps = predict_fingerprint( spec, fpkeep) # predict fingerprint of the "unknown" nist_smiles = np.array(json.load(open('DeepEI/data/all_smiles.json'))) nist_masses = np.load('DeepEI/data/molwt.npy') nist_fps = load_npz('DeepEI/data/fingerprints.npz') nist_fps = csr_matrix( nist_fps)[:, fpkeep].todense() # fingerprints of nist compounds nist_spec = load_npz('DeepEI/data/peakvec.npz').todense() pred_spec = np.load( 'DeepEI/data/neims_spec_massbank.npy') # spectra predicted by NEIMS output = pd.DataFrame(columns=['smiles', 'mass', 'score', 'rank']) for i in tqdm(range(len(smiles))): smi = smiles[i] std_smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi))
# NOTE(review): whitespace-collapsed script fragment. It begins mid-expression
# (`keep, :]` closes an indexing expression whose start is not visible here)
# and ends inside the `for i, smi in enumerate(tqdm(msbk_smiles))` loop.
# Visible steps, in order:
#   1. Load the neims_msbk_* arrays (SMILES, masses, spectra, CDK fingerprints)
#      and the msbk_* arrays (SMILES, masses, spectra) from DeepEI/data; the
#      sparse .npz matrices are converted with .todense().
#   2. Read mlp_result.txt (tab-separated, no header) and keep the `id` values
#      of rows whose `f1` is > 0.5 (`fpkeep`).
#   3. Call predict_fingerprint(msbk_spec, fpkeep).
#   4. Build the combined db_* arrays by appending the neims_msbk_* data to
#      nist_smiles / nist_masses / neims_nist_spec / nist_fingerprint. Those
#      four names are defined outside this fragment; the fingerprint matrix is
#      restricted to the `fpkeep` columns after concatenation.
#   5. Under the `__main__` guard, create an empty `output` DataFrame
#      (smiles, mass, score, rank, inNIST) and iterate over msbk_smiles, reading
#      the matching row of msbk_spec and entry of msbk_masses.
# NOTE(review): `json.load(open(...))` never explicitly closes the file handle;
# a `with open(...)` block would.
keep, :] neims_msbk_smiles = np.array( json.load(open('DeepEI/data/neims_msbk_smiles.json'))) neims_msbk_masses = np.load('DeepEI/data/neims_msbk_masses.npy') neims_msbk_spec = load_npz('DeepEI/data/neims_spec_msbk.npz').todense() neims_msbk_cdkfps = load_npz('DeepEI/data/neims_msbk_cdkfps.npz').todense() msbk_smiles = np.array(json.load(open('DeepEI/data/msbk_smiles.json'))) msbk_masses = np.load('DeepEI/data/msbk_masses.npy') msbk_spec = load_npz('DeepEI/data/msbk_spec.npz').todense() mlp = pd.read_csv('Fingerprint/results/mlp_result.txt', sep='\t', header=None) mlp.columns = ['id', 'accuracy', 'precision', 'recall', 'f1'] fpkeep = mlp['id'][np.where(mlp['f1'] > 0.5)[0]] pred_fps = predict_fingerprint(msbk_spec, fpkeep) db_smiles = np.array(list(nist_smiles) + list(neims_msbk_smiles)) db_masses = np.append(nist_masses, neims_msbk_masses) db_spec = np.append(neims_nist_spec, neims_msbk_spec, axis=0) db_fingerprints = np.append(nist_fingerprint, neims_msbk_cdkfps, axis=0)[:, fpkeep] if __name__ == '__main__': output = pd.DataFrame( columns=['smiles', 'mass', 'score', 'rank', 'inNIST']) for i, smi in enumerate(tqdm(msbk_smiles)): specr = msbk_spec[i] # true spectrum mass = msbk_masses[i] # true mol weight
# NOTE(review): whitespace-collapsed script fragment. It ends inside the
# `if len(w_true)==0:` branch of the ranking loop; `i`, test_ri, test_masses,
# test_smiles, all_smiles and all_masses are defined outside this fragment.
# Visible steps, in order:
#   1. Select the test subset with `i`: column 0 of test_ri, rows of test_spec
#      (rebinding `test_spec` to a subset of itself), entries of test_masses,
#      and `test = i`.
#   2. Read mlp_result.txt (tab-separated, no header) and keep the `id` values
#      of rows whose `f1` is > 0.5 (`fpkeep`); load fingerprints.npz restricted
#      to those columns (`cdk_fp`).
#   3. Call predict_fingerprint(test_spec, fpkeep).
#   4. Create an empty `output` DataFrame and loop over range(len(test)); per
#      item, look up the first index where all_smiles == smi, then keep as
#      candidates the entries whose mass differs from `mass` by less than 5.
#      rank_mass is set to 99999 when the true index is not among them.
# NOTE(review): `len(test)` requires `i` to be a sized object (presumably an
# index array - confirm); the loop then rebinds `i` as its integer counter.
# NOTE(review): inside the loop `mass = test_mass[i]` indexes `test_mass`,
# which was assigned above from test_masses[i] - confirm the intended names.
# NOTE(review): `np.where(all_smiles == smi)[0][0]` raises IndexError when
# `smi` is not present in all_smiles.
test_rindex = test_ri[i,0] test_spec = test_spec[i,:] test_mass = test_masses[i] test = i # only keep fingerprint with f1 > 0.5 mlp = pd.read_csv('Fingerprint/results/mlp_result.txt', sep='\t', header=None) mlp.columns = ['id', 'accuracy', 'precision', 'recall', 'f1'] fpkeep = mlp['id'][np.where(mlp['f1'] > 0.5)[0]] cdk_fp = load_npz('DeepEI/data/fingerprints.npz') cdk_fp = csr_matrix(cdk_fp)[:, fpkeep].todense() # predict fingerprints via ms pred_fp = predict_fingerprint(test_spec, fpkeep) # rank output = pd.DataFrame(columns=['smiles', 'mass', 'true RI', 'predict RI', 'mass filter', 'RI filter', 'Without filter']) for i in tqdm(range(len(test))): smi = test_smiles[i] mass = test_mass[i] ri = test_rindex[i] pred_fpi = pred_fp[i,:] trueindex = np.where(all_smiles == smi)[0][0] # mass filter candidate = np.where(np.abs(all_masses - mass) < 5)[0] w_true = np.where(candidate==trueindex)[0] if len(w_true)==0: rank_mass = 99999