示例#1
0
def collect():
    """Featurise every usable library record and save the arrays to disk.

    For each index of ``all_mol`` the record is loaded with ``read_mol``.
    A record is skipped when:

    * loading it raises,
    * its SMILES cannot be parsed,
    * its exact mass is above 2000,
    * its formula contains an element outside the allowed set, or
    * any featurisation step raises.

    For every remaining record the canonical SMILES, spectrum vector,
    retention-index values, Morgan fingerprint, CDK fingerprint, CDK
    descriptors and exact mass are collected.

    Files written under ``DeepEI/data/``: ``retention.npy``,
    ``descriptor.npy``, ``molwt.npy``, ``peakvec.npz``, ``morgan.npz``,
    ``fingerprints.npz`` and ``all_smiles.json``.
    """
    # Built once instead of once per record; a set gives O(1) membership.
    allowed_elements = frozenset(
        ['C', 'H', 'O', 'N', 'S', 'P', 'Si', 'F', 'Cl', 'Br', 'I'])

    all_smiles = []
    Peak_data = []
    RI_data = []
    Morgan_fp = []
    CDK_fp = []
    CDK_des = []
    MolWt = []
    for i in tqdm(range(len(all_mol))):
        # ``except Exception`` (not a bare ``except``) so that Ctrl-C and
        # SystemExit still stop this long-running loop.
        try:
            m = read_mol(i)
        except Exception:
            continue

        try:
            mol = Chem.MolFromSmiles(m['smiles'])
            if mol is None:
                # Unparsable SMILES: nothing to featurise.
                continue
            molwt = CalcExactMolWt(mol)
            if molwt > 2000:
                continue
            smiles = Chem.MolToSmiles(mol)
            # Element check. ``MolFromSmiles``/``MolToFormula`` here are the
            # unqualified helpers already in scope, kept exactly as before.
            elements = parser_formula(MolToFormula(MolFromSmiles(smiles)))
            if any(e not in allowed_elements for e in elements):
                continue
            morgan_fp = np.array(
                AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=4096))
            cdk_fp = get_cdk_fingerprints(smiles)
            cdk_des = np.array(get_cdk_descriptors(smiles))
            ri = list(m['RI'].values())
            peak_vec = ms2vec(m['peakindex'], m['peakintensity'])
        except Exception:
            # Best effort: a record that cannot be featurised is dropped.
            continue

        # Append only after every feature was computed, so all lists stay
        # aligned row by row.
        all_smiles.append(smiles)
        Peak_data.append(peak_vec)
        RI_data.append(ri)
        Morgan_fp.append(morgan_fp)
        CDK_fp.append(cdk_fp)
        CDK_des.append(cdk_des)
        MolWt.append(molwt)

    # Dense arrays.
    np.save('DeepEI/data/retention.npy', np.array(RI_data))
    np.save('DeepEI/data/descriptor.npy', np.array(CDK_des))
    np.save('DeepEI/data/molwt.npy', np.array(MolWt))

    # Spectrum vectors and fingerprints are stored as sparse matrices.
    Peak_data = csr_matrix(np.array(Peak_data))
    Morgan_fp = csr_matrix(np.array(Morgan_fp))
    CDK_fp = csr_matrix(np.array(CDK_fp))
    save_npz('DeepEI/data/peakvec.npz', Peak_data)
    save_npz('DeepEI/data/morgan.npz', Morgan_fp)
    save_npz('DeepEI/data/fingerprints.npz', CDK_fp)

    with open('DeepEI/data/all_smiles.json', 'w') as t:
        json.dump(all_smiles, t)
示例#2
0
        s = [weitht_dot_product(x, X[i, :]) for i in range(X.shape[0])]
    return s


# Predict the mass spectra of the isolated compounds with the external
# spectrum-prediction script.
# Run once; the result is saved to disk at the end of this section.

# The working directory and command line do not depend on the compound,
# so they are built once instead of on every iteration.
cwd = 'E:\\project\\deep-molecular-massspec'
cmd = 'python make_spectra_prediction.py --input_file=E:/project/DeepEI/Temp/mol.sdf --output_file=E:/project/DeepEI/Temp/mol_anno.sdf --weights_dir=model/massspec_weights'

pred_spec = np.zeros((len(test), 2000))
for a, i in enumerate(tqdm(test)):
    smi = smiles[i]
    writeSDF(smi, 'Temp/mol.sdf')
    # Drop any leftover output first, so a failed prediction can never be
    # mistaken for a result belonging to this compound.
    if os.path.exists('Temp/mol_anno.sdf'):
        os.unlink('Temp/mol_anno.sdf')
    subprocess.call(cmd, cwd=cwd)
    try:
        speci = parser_NEIMS('Temp/mol_anno.sdf')
        pred_vec = ms2vec(speci['mz'], speci['intensity'])
    except Exception:
        # On any failure use an all-zero vector as a placeholder (author's
        # note: such rows are not counted in the later comparison).
        pred_vec = np.zeros(2000)
    finally:
        # Remove the output on failure as well as on success.
        if os.path.exists('Temp/mol_anno.sdf'):
            os.unlink('Temp/mol_anno.sdf')
    pred_spec[a, :] = pred_vec
    os.unlink('Temp/mol.sdf')
np.save('Discussion/NIST_test/neims_spec_nist.npy', pred_spec)

# Script entry point: load the cached arrays used by the code that follows.
if __name__ == '__main__':

    from scipy.sparse import load_npz

    # Array stored in molwt.npy.
    masses = np.load('DeepEI/data/molwt.npy')
    # Sparse matrix stored in peakvec.npz, expanded to a dense matrix.
    spec = load_npz('DeepEI/data/peakvec.npz').todense()
示例#3
0
文件: neims.py 项目: wxlsummer/DeepEI
    split = json.load(js)
# Index arrays taken from the 'keep' and 'isolate' entries of the split.
keep = np.array(split['keep'])
isolate = np.array(split['isolate'])

# NEIMS spectra of NIST
# Read the SMILES list through a context manager so the file handle is
# closed deterministically instead of being leaked.
with open('DeepEI/data/all_smiles.json') as smiles_file:
    nist_smiles = np.array(json.load(smiles_file))

writeSDF(nist_smiles, 'Temp/mol.sdf')
cwd = 'E:\\project\\deep-molecular-massspec'
cmd = 'python make_spectra_prediction.py --input_file=E:/project/DeepEI/Temp/mol.sdf --output_file=E:/project/DeepEI/Temp/mol_anno.sdf --weights_dir=model/massspec_weights'
# check_call raises CalledProcessError on a non-zero exit status, so a
# failed prediction run stops here instead of silently parsing a stale or
# missing output file.
subprocess.check_call(cmd, cwd=cwd)
spectra = parser_NEIMS('Temp/mol_anno.sdf')

# Convert every predicted spectrum to a vector, then store the stack as a
# sparse matrix.
spec_vecs = []
for spec in tqdm(spectra):
    spec_vecs.append(ms2vec(spec['mz'], spec['intensity']))
spec_vecs = np.array(spec_vecs)
spec_vecs1 = csr_matrix(spec_vecs)
save_npz('DeepEI/data/neims_spec_nist.npz', spec_vecs1)

# NEIMS spectra of MassBank
# NIST SMILES selected by the `keep` index array.
exist_smiles = nist_smiles[keep]
# Read the MSP library; the loop below unpacks each entry as (param, ms).
data = msp.read('E:/data/GCMS DB_AllPublic-KovatsRI-VS2.msp')
# Accumulators for the loop that follows.
msbk_smiles = []
msbk_spec = []
msbk_masses = []
for i, (param, ms) in enumerate(tqdm(data)):
    smi = param['smiles']
    try:
        smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi))
    except:
示例#4
0
    from DeepEI.predict import predict_fingerprint
    from DeepEI.utils import ms2vec, vec2ms, get_cdk_fingerprints

    # Read the MSP library and keep, for every entry whose exact mass can
    # be computed, its SMILES, exact mass and spectrum vector. The three
    # lists stay aligned index by index.
    data = msp.read('E:/data/GCMS DB_AllPublic-KovatsRI-VS2.msp')
    smiles = []
    spec = []
    molwt = []
    for i, (param, ms) in enumerate(tqdm(data)):
        smi = param['smiles']
        try:
            mass = CalcExactMolWt(Chem.MolFromSmiles(smi))
        except Exception:
            # Skip entries whose SMILES cannot be turned into a mass.
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C
            # still interrupts the loop.
            continue
        molwt.append(mass)
        smiles.append(smi)
        # Column 0 and column 1 of `ms` are passed to ms2vec as the peak
        # positions and intensities.
        spec.append(ms2vec(ms[:, 0], ms[:, 1]))

    # Per-fingerprint results table: tab-separated, no header row.
    mlp = pd.read_csv('Fingerprint/results/mlp_result.txt',
                      sep='\t',
                      header=None)
    mlp.columns = ['id', 'accuracy', 'precision', 'recall', 'f1']
    # Keep only the ids of rows whose f1 value is above 0.5.
    fpkeep = mlp['id'][np.where(mlp['f1'] > 0.5)[0]]

    spec = np.array(spec)
    pred_fps = predict_fingerprint(
        spec, fpkeep)  # predict fingerprint of the "unknown"

    # Load the reference data. The JSON file is opened through a context
    # manager so its handle is closed instead of being leaked.
    with open('DeepEI/data/all_smiles.json') as smiles_file:
        nist_smiles = np.array(json.load(smiles_file))
    nist_masses = np.load('DeepEI/data/molwt.npy')
    nist_fps = load_npz('DeepEI/data/fingerprints.npz')
    nist_fps = csr_matrix(
示例#5
0
 # Reference data: stored masses and the dense form of the stored
 # spectrum matrix.
 nist_masses = np.load('DeepEI/data/molwt.npy')
 nist_spec = load_npz('DeepEI/data/peakvec.npz').todense()
 
 # Read the MSP library and keep, for every entry whose exact mass can be
 # computed, its SMILES, exact mass and spectrum vector. The three lists
 # stay aligned index by index.
 data = msp.read('E:/data/GCMS DB_AllPublic-KovatsRI-VS2.msp')
 smiles = []
 spec = []
 molwt = []
 for i, (param, ms) in enumerate(tqdm(data)):
     smi = param['smiles']
     try:
         mass = CalcExactMolWt(Chem.MolFromSmiles(smi))
     except Exception:
         # Skip entries whose SMILES cannot be turned into a mass.
         # ``Exception`` rather than a bare ``except`` so that Ctrl-C
         # still interrupts the loop.
         continue
     molwt.append(mass)
     smiles.append(smi)
     spec.append(ms2vec(ms[:,0], ms[:,1]))
 
 pred_spec = []
 output = pd.DataFrame(columns=['smiles', 'mass', 'score', 'rank'])
 for i in tqdm(range(len(smiles))):
     smi = smiles[i] # smiles
     try:
         std_smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi))
     except:
         std_smi = ''
     specr = spec[i] # true spectrum
     mass = molwt[i] # true mol weight
     try:
         writeSDF(smi, 'Temp/mol.sdf')
     except:
         continue
示例#6
0
# Accumulators for the loop below.
# NOTE(review): `all_smiles` and `Peak_data` are appended to below but are
# not initialised in this section; presumably they are created just above.
RI_data = []
Morgan_fp = []
CDK_fp = []
CDK_des = []
MolWt = []

# Featurise every molecule of the SD file `f`.
mols = Chem.SDMolSupplier(f)
for m in tqdm(mols):
    if m is None:
        # The supplier yielded no molecule for this record.
        continue
    smi = Chem.MolToSmiles(m)
    morgan_fp = np.array(
        AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=4096))
    cdk_fp = get_cdk_fingerprints(smi)
    cdk_des = np.array(get_cdk_descriptors(smi))

    props = m.GetPropsAsDict()
    molwt = props['EXACT MASS']
    # The peak list is parsed as one "<position> <intensity>" pair per
    # line. Blank lines are ignored so a trailing newline does not break
    # the parse.
    peaks = [p for p in props['MASS SPECTRAL PEAKS'].splitlines()
             if p.strip()]
    pairs = [p.split() for p in peaks]
    peakindex = np.array([round(float(pair[0])) for pair in pairs])
    peakintensity = np.array([float(pair[1]) for pair in pairs])
    # Use the arrays parsed above. The previous code indexed the molecule
    # object itself (m['peakindex']), which is not a mapping.
    peak_vec = ms2vec(peakindex, peakintensity)

    all_smiles.append(smi)
    Peak_data.append(peak_vec)
    Morgan_fp.append(morgan_fp)
    CDK_fp.append(cdk_fp)
    CDK_des.append(cdk_des)
    MolWt.append(molwt)