def _parse_metlin_file(self, f_path, kegg_mass, inchi):
    """Parse a METLIN XML file into one Spectrum per MS2 scan.

    Args:
        f_path: path of the XML file to read.
        kegg_mass: mapping from KEGG id to the compound mass.
        inchi: mapping from KEGG id to the compound InChI string.

    Returns:
        A list of Spectrum objects, one for each ``Spectra/Spectrum``
        element whose ``MSLevel`` attribute is "2" (MS1 scans are ignored).
        The list is empty, and an "Ignore ..." message is printed, when the
        file carries no KEGG id or the id is missing from ``kegg_mass`` or
        ``inchi``.

    Raises:
        KeyError: if ``ModificationMass`` or ``CollisionEnergy`` is missing.
        ValueError: if a numeric attribute cannot be converted.
        Exception: if no ``ExperimentInformations`` element supplied a
            modification mass.
    """
    root = ET.parse(f_path).getroot()

    # Identifiers are stored as <Comment Id="..." Value="..."/> children.
    # Defaults are set up front so a file lacking one of them no longer
    # fails with an unbound local; "NULL" is the placeholder this parser
    # already uses for identifiers it cannot provide.
    kegg_id = None
    metlin_id = "NULL"
    cas = "NULL"
    for comment in root.findall("./ExperimentInformations/Comment"):
        comment_id = comment.get('Id')
        if comment_id == 'kegg':
            kegg_id = comment.get('Value')
        elif comment_id == 'Metlin-ID':
            metlin_id = comment.get('Value')
        elif comment_id == 'cas':
            cas = comment.get('Value')

    # There should be only one ExperimentInformations element; if there are
    # several, the last one wins.
    mass_diff = None
    for info in root.findall("./ExperimentInformations"):
        mass_diff = float(info.attrib['ModificationMass'])

    ms2_scans = [
        scan for scan in root.findall("./Spectra/Spectrum")
        if scan.get("MSLevel") == "2"
    ]

    if kegg_id not in kegg_mass:
        print("Ignore %s:kegg_mass doesn't have %s" % (f_path, kegg_id))
        return []
    if kegg_id not in inchi:
        print("Ignore %s:kegg_inchi doesn't have %s" % (f_path, kegg_id))
        return []
    if mass_diff is None:
        raise Exception("ERROR: ModificationMass not set for %s!" % f_path)

    # Convert once so that mass and precursor are derived from the same
    # float even when the lookup table stores masses as strings.
    mass = float(kegg_mass[kegg_id])
    compound_inchi = inchi[kegg_id]

    spectra_list = []
    for scan in ms2_scans:
        spectrum = Spectrum()
        spectrum.f_name = f_path
        spectrum.mass = mass
        spectrum.precursor = mass + mass_diff
        spectrum.mode = "POSITIVE"
        spectrum.inchi = compound_inchi
        spectrum.cas = cas
        spectrum.pubchem_sid = "NULL"
        spectrum.pubchem_cid = "NULL"
        spectrum.kegg_id = kegg_id
        spectrum.metlin_id = metlin_id
        spectrum.ce = int(scan.attrib['CollisionEnergy'])

        # Peaks with an intensity of 1 or less are dropped; the remaining
        # intensities are divided by 100.
        kept_peaks = []
        for peak in scan.findall("Peak"):
            peak_mass = float(peak.get("Mass"))
            intensity = float(peak.get("Intensity"))
            if intensity > 1:
                kept_peaks.append((peak_mass, intensity / 100))
        spectrum.peaks = kept_peaks
        spectra_list.append(spectrum)
    return spectra_list
def _parse_ms_file(self, f_path):
    """Parse a plain-text MS/MS file into a Spectrum.

    The text must contain a ``parentmass`` entry.  Peak lines
    ("<mass> <intensity>") are read from every section that starts with a
    line containing "collision" and ends at the next blank line.

    Args:
        f_path: path of the file; its base name is expected to look like
            ``...pos<metlin id>.<extension>``.

    Returns:
        A Spectrum with f_name, metlin_id, precursor, mass and peaks set.

    Raises:
        Exception: if no ``parentmass`` value is found.
        ValueError / IndexError: if a peak line is malformed.
    """
    import os  # local import so the block does not depend on file-level imports

    with open(f_path) as handle:
        data = handle.read()

    spectrum = Spectrum(f_path)
    spectrum.f_name = f_path

    # The metlin id sits between "pos" and the following "." in the file
    # name.  Only the base name is searched so that a "pos" or "." in a
    # directory component (e.g. "./data/pos12.ms") cannot corrupt the id.
    base_name = os.path.basename(f_path)
    marker = base_name.find("pos")
    id_start = marker + 3 if marker != -1 else 0
    id_end = base_name.find(".", id_start)
    if id_end == -1:
        id_end = len(base_name)
    spectrum.metlin_id = base_name[id_start:id_end]

    found = re.findall(r"parentmass[: ]+([0-9\.]+)", data)
    if not found:
        raise Exception("ERROR: precursor not set for %s!" % f_path)
    precursor = float(found[0])
    spectrum.precursor = precursor
    # NOTE(review): 1.00794 is presumably the hydrogen mass removed from a
    # positive-mode precursor -- confirm against the data source.
    spectrum.mass = precursor - 1.00794

    peaks = []
    in_peak_section = False
    for line in data.split('\n'):
        if "collision" in line:
            in_peak_section = True
            continue
        # Whitespace-only lines (e.g. a lone "\r") count as blank so they
        # close the section instead of failing the numeric conversion.
        if not line.strip():
            in_peak_section = False
            continue
        if in_peak_section:
            fields = line.split()
            peaks.append((float(fields[0]), float(fields[1])))
    spectrum.peaks = peaks
    return spectrum
def _parse_ms_file(self, f_path):
    """Parse a plain-text MS/MS file into a Spectrum.

    The text must contain a ``parentmass`` entry.  Peak lines
    ("<mass> <intensity>") are read from every section that starts with a
    line containing "collision" and ends at the next blank line.

    Args:
        f_path: path of the file; its base name is expected to look like
            ``...pos<metlin id>.<extension>``.

    Returns:
        A Spectrum with f_name, metlin_id, precursor, mass and peaks set.

    Raises:
        Exception: if no ``parentmass`` value is found.
        ValueError / IndexError: if a peak line is malformed.
    """
    import os  # local import so the block does not depend on file-level imports

    with open(f_path) as handle:
        data = handle.read()

    spectrum = Spectrum(f_path)
    spectrum.f_name = f_path

    # The metlin id sits between "pos" and the following "." in the file
    # name.  Only the base name is searched so that a "pos" or "." in a
    # directory component (e.g. "./data/pos12.ms") cannot corrupt the id.
    base_name = os.path.basename(f_path)
    marker = base_name.find("pos")
    id_start = marker + 3 if marker != -1 else 0
    id_end = base_name.find(".", id_start)
    if id_end == -1:
        id_end = len(base_name)
    spectrum.metlin_id = base_name[id_start:id_end]

    found = re.findall(r"parentmass[: ]+([0-9\.]+)", data)
    if not found:
        raise Exception("ERROR: precursor not set for %s!" % f_path)
    precursor = float(found[0])
    spectrum.precursor = precursor
    # NOTE(review): 1.00794 is presumably the hydrogen mass removed from a
    # positive-mode precursor -- confirm against the data source.
    spectrum.mass = precursor - 1.00794

    peaks = []
    in_peak_section = False
    for line in data.split('\n'):
        if "collision" in line:
            in_peak_section = True
            continue
        # Whitespace-only lines (e.g. a lone "\r") count as blank so they
        # close the section instead of failing the numeric conversion.
        if not line.strip():
            in_peak_section = False
            continue
        if in_peak_section:
            fields = line.split()
            peaks.append((float(fields[0]), float(fields[1])))
    spectrum.peaks = peaks
    return spectrum
def _parse_massbank_file(self, f_path):
    """Parse a MassBank-style text record into a Spectrum.

    Reads the precursor m/z (falling back to the base peak), the ion mode,
    the search tolerance in ppm, the peak list following ``PK$PEAK`` and,
    when present as plain digits, the collision energy.  The neutral mass
    is derived from the precursor: minus 1.00794 in POSITIVE mode, plus
    1.00794 otherwise.

    Args:
        f_path: path of the record file.

    Returns:
        A Spectrum with f_name, precursor, mode, mass, ppm and peaks set,
        plus ce when a numeric collision energy is found.

    Raises:
        Exception: if the precursor, ion mode or ppm entry is missing, or
            the peak list contains "N/A".
        ValueError / IndexError: if a numeric field or peak line is
            malformed.
    """
    print("Parse file: %s" % f_path)
    with open(f_path) as handle:
        data = handle.read()

    spectrum = Spectrum(f_path)
    spectrum.f_name = f_path

    # Precursor m/z, with the base peak as a fallback.
    hits = re.findall(r"MS\$FOCUSED_ION: PRECURSOR_M/Z[: ]+([0-9\.]+)", data)
    if not hits:
        hits = re.findall(r"MS\$FOCUSED_ION: BASE_PEAK[: ]+([0-9\.]+)", data)
        if not hits:
            raise Exception("ERROR: precursor not set for %s!" % f_path)
        print("WARNING: using base peak as precursor for %s!" % f_path)
    spectrum.precursor = float(hits[0])

    # Ion mode, with the looser "MODE" tag as a fallback.
    modes = re.findall(r"ION_MODE ([A-Z]+)", data)
    if not modes:
        modes = re.findall(r"MODE ([A-Z]+)", data)
        if not modes:
            raise Exception("ERROR: mode not set for %s!" % f_path)
        print("WARNING: ion mode is set by MODE for %s!" % f_path)
    spectrum.mode = modes[0]

    # NOTE(review): 1.00794 is presumably the hydrogen mass gained or lost
    # on ionisation -- confirm against the data source.
    if spectrum.mode == 'POSITIVE':
        spectrum.mass = spectrum.precursor - 1.00794
    else:
        spectrum.mass = spectrum.precursor + 1.00794

    ppm_hits = re.findall(r"SE\$SEARCH_PPM[: ]+([0-9\.]+)", data)
    if not ppm_hits:
        raise Exception("ERROR: PPM not set for %s!" % f_path)
    # The pattern admits decimals, which int() alone rejects ("5.0").
    # Whole values stay ints as before; fractional ones are kept as floats.
    ppm = float(ppm_hits[0])
    if ppm.is_integer():
        ppm = int(ppm)
    spectrum.ppm = ppm

    # Every non-blank line after the PK$PEAK header is a peak, up to the
    # "//" line that terminates a MassBank record.
    peaks = []
    in_peak_list = False
    for line in data.split("\n"):
        if not line.strip():
            continue
        if "PK$PEAK" in line:
            in_peak_list = True
            continue
        if in_peak_list:
            if line.strip().startswith("//"):
                break
            if "N/A" in line:
                raise Exception("ERROR: no peaks in %s" % f_path)
            fields = line.split()
            peaks.append((float(fields[0]), float(fields[1])))
    spectrum.peaks = peaks

    # Collision energy is optional and only stored when it is a plain
    # integer once an "eV" suffix has been removed.
    energies = re.findall(r"COLLISION_ENERGY (\w+)", data)
    if energies:
        energy = energies[0].replace("eV", "")
        if energy.isdigit():
            spectrum.ce = int(energy)
    return spectrum
def _parse_massbank_file(self, f_path):
    """Parse a MassBank-style text record into a fully annotated Spectrum.

    Required entries are the exact mass (``CH$EXACT_MASS``), the precursor
    m/z (falling back to the base peak), the ion mode, the peak list after
    ``PK$PEAK`` and the ``IUPAC:`` InChI line.  CAS, METLIN, PubChem and
    KEGG links and the collision energy are optional.

    Args:
        f_path: path of the record file.

    Returns:
        A Spectrum with f_name, mass, precursor, mode, peaks, inchi and
        metlin_id set ('NULL' when the record has no METLIN link), plus
        cas, pubchem_sid, pubchem_cid, kegg_id and ce when present.

    Raises:
        Exception: if the mass, precursor, ion mode or InChI entry is
            missing, or the peak list contains "N/A".
        ValueError / IndexError: if a numeric field or peak line is
            malformed.
    """
    print("Parse file: %s" % f_path)
    with open(f_path) as handle:
        data = handle.read()

    spectrum = Spectrum(f_path)
    spectrum.f_name = f_path

    mass_hits = re.findall(r"CH\$EXACT_MASS[: ]+([0-9\.]+)", data)
    if not mass_hits:
        raise Exception("ERROR: mass filed error in file %s " % f_path)
    spectrum.mass = float(mass_hits[0])

    # Precursor m/z, with the base peak as a fallback.
    hits = re.findall(r"MS\$FOCUSED_ION: PRECURSOR_M/Z[: ]+([0-9\.]+)", data)
    if not hits:
        hits = re.findall(r"MS\$FOCUSED_ION: BASE_PEAK[: ]+([0-9\.]+)", data)
        if not hits:
            raise Exception("ERROR: precursor not set for %s!" % f_path)
        print("WARNING: using base peak as precursor for %s!" % f_path)
    spectrum.precursor = float(hits[0])

    # Ion mode, with the looser "MODE" tag as a fallback.
    modes = re.findall(r"ION_MODE ([A-Z]+)", data)
    if not modes:
        modes = re.findall(r"MODE ([A-Z]+)", data)
        if not modes:
            raise Exception("ERROR: mode not set for %s!" % f_path)
        print("WARNING: ion mode is set by MODE for %s!" % f_path)
    spectrum.mode = modes[0]

    # Every non-blank line after the PK$PEAK header is a peak, up to the
    # "//" line that terminates a MassBank record.
    peaks = []
    in_peak_list = False
    for line in data.split("\n"):
        if not line.strip():
            continue
        if "PK$PEAK" in line:
            in_peak_list = True
            continue
        if in_peak_list:
            if line.strip().startswith("//"):
                break
            if "N/A" in line:
                raise Exception("ERROR: no peaks in %s" % f_path)
            fields = line.split()
            peaks.append((float(fields[0]), float(fields[1])))
    spectrum.peaks = peaks

    # InChI: an 'unknown' value is reported but still stored.  Surrounding
    # whitespace (such as a trailing "\r") is removed from the captured text.
    iupac = re.findall(r"IUPAC: (.+)", data)
    if not iupac:
        raise Exception("Error: no inchi for %s!" % f_path)
    inchi = iupac[0].strip()
    if 'unknown' in inchi:
        print("%s has no inchi!" % f_path)
    if "InChI=" not in inchi:
        # Some records omit the standard prefix.
        inchi = "InChI=" + inchi
    spectrum.inchi = inchi

    # Optional fields below.
    cas_hits = re.findall(r"CH\$LINK: CAS[: ]+([0-9\-]+)", data)
    if cas_hits:
        spectrum.cas = cas_hits[0]

    metlin_hits = re.findall(r"CH\$LINK: METLIN[: ]+([0-9]+)", data)
    spectrum.metlin_id = metlin_hits[0] if metlin_hits else 'NULL'

    # A bare "PUBCHEM" link is accepted as the SID when no explicit
    # "PUBCHEM SID" entry exists.
    sid_hits = re.findall(r"PUBCHEM SID[: ]+(\w+)", data)
    if not sid_hits:
        sid_hits = re.findall(r"PUBCHEM[: ]+([0-9]+)", data)
    if sid_hits:
        spectrum.pubchem_sid = sid_hits[0]

    cid_hits = re.findall(r"PUBCHEM CID[: ]+(\w+)", data)
    if cid_hits:
        spectrum.pubchem_cid = cid_hits[0]

    kegg_hits = re.findall(r"LINK: KEGG (\w+)", data)
    if kegg_hits:
        spectrum.kegg_id = kegg_hits[0]

    # Collision energy is only stored when it is a plain integer once an
    # "eV" suffix has been removed.
    energies = re.findall(r"COLLISION_ENERGY (\w+)", data)
    if energies:
        energy = energies[0].replace("eV", "")
        if energy.isdigit():
            spectrum.ce = int(energy)
    return spectrum
def _parse_metlin_file(self, f_path, kegg_mass, inchi):
    """Parse a METLIN XML file into one Spectrum per MS2 scan.

    Args:
        f_path: path of the XML file to read.
        kegg_mass: mapping from KEGG id to the compound mass.
        inchi: mapping from KEGG id to the compound InChI string.

    Returns:
        A list of Spectrum objects, one for each ``Spectra/Spectrum``
        element whose ``MSLevel`` attribute is "2" (MS1 scans are ignored).
        The list is empty, and an "Ignore ..." message is printed, when the
        file carries no KEGG id or the id is missing from ``kegg_mass`` or
        ``inchi``.

    Raises:
        KeyError: if ``ModificationMass`` or ``CollisionEnergy`` is missing.
        ValueError: if a numeric attribute cannot be converted.
        Exception: if no ``ExperimentInformations`` element supplied a
            modification mass.
    """
    root = ET.parse(f_path).getroot()

    # Identifiers are stored as <Comment Id="..." Value="..."/> children.
    # Defaults are set up front so a file lacking one of them no longer
    # fails with an unbound local; "NULL" is the placeholder this parser
    # already uses for identifiers it cannot provide.
    kegg_id = None
    metlin_id = "NULL"
    cas = "NULL"
    for comment in root.findall("./ExperimentInformations/Comment"):
        comment_id = comment.get('Id')
        if comment_id == 'kegg':
            kegg_id = comment.get('Value')
        elif comment_id == 'Metlin-ID':
            metlin_id = comment.get('Value')
        elif comment_id == 'cas':
            cas = comment.get('Value')

    # There should be only one ExperimentInformations element; if there are
    # several, the last one wins.
    mass_diff = None
    for info in root.findall("./ExperimentInformations"):
        mass_diff = float(info.attrib['ModificationMass'])

    ms2_scans = [
        scan for scan in root.findall("./Spectra/Spectrum")
        if scan.get("MSLevel") == "2"
    ]

    if kegg_id not in kegg_mass:
        print("Ignore %s:kegg_mass doesn't have %s" % (f_path, kegg_id))
        return []
    if kegg_id not in inchi:
        print("Ignore %s:kegg_inchi doesn't have %s" % (f_path, kegg_id))
        return []
    if mass_diff is None:
        raise Exception("ERROR: ModificationMass not set for %s!" % f_path)

    # Convert once so that mass and precursor are derived from the same
    # float even when the lookup table stores masses as strings.
    mass = float(kegg_mass[kegg_id])
    compound_inchi = inchi[kegg_id]

    spectra_list = []
    for scan in ms2_scans:
        spectrum = Spectrum()
        spectrum.f_name = f_path
        spectrum.mass = mass
        spectrum.precursor = mass + mass_diff
        spectrum.mode = "POSITIVE"
        spectrum.inchi = compound_inchi
        spectrum.cas = cas
        spectrum.pubchem_sid = "NULL"
        spectrum.pubchem_cid = "NULL"
        spectrum.kegg_id = kegg_id
        spectrum.metlin_id = metlin_id
        spectrum.ce = int(scan.attrib['CollisionEnergy'])

        # Peaks with an intensity of 1 or less are dropped; the remaining
        # intensities are divided by 100.
        kept_peaks = []
        for peak in scan.findall("Peak"):
            peak_mass = float(peak.get("Mass"))
            intensity = float(peak.get("Intensity"))
            if intensity > 1:
                kept_peaks.append((peak_mass, intensity / 100))
        spectrum.peaks = kept_peaks
        spectra_list.append(spectrum)
    return spectra_list
def _parse_massbank_file(self, f_path):
    """Parse a MassBank-style text record into a Spectrum.

    Reads the precursor m/z (falling back to the base peak), the ion mode,
    the search tolerance in ppm, the peak list following ``PK$PEAK`` and,
    when present as plain digits, the collision energy.  The neutral mass
    is derived from the precursor: minus 1.00794 in POSITIVE mode, plus
    1.00794 otherwise.

    Args:
        f_path: path of the record file.

    Returns:
        A Spectrum with f_name, precursor, mode, mass, ppm and peaks set,
        plus ce when a numeric collision energy is found.

    Raises:
        Exception: if the precursor, ion mode or ppm entry is missing, or
            the peak list contains "N/A".
        ValueError / IndexError: if a numeric field or peak line is
            malformed.
    """
    print("Parse file: %s" % f_path)
    with open(f_path) as handle:
        data = handle.read()

    spectrum = Spectrum(f_path)
    spectrum.f_name = f_path

    # Precursor m/z, with the base peak as a fallback.
    hits = re.findall(r"MS\$FOCUSED_ION: PRECURSOR_M/Z[: ]+([0-9\.]+)", data)
    if not hits:
        hits = re.findall(r"MS\$FOCUSED_ION: BASE_PEAK[: ]+([0-9\.]+)", data)
        if not hits:
            raise Exception("ERROR: precursor not set for %s!" % f_path)
        print("WARNING: using base peak as precursor for %s!" % f_path)
    spectrum.precursor = float(hits[0])

    # Ion mode, with the looser "MODE" tag as a fallback.
    modes = re.findall(r"ION_MODE ([A-Z]+)", data)
    if not modes:
        modes = re.findall(r"MODE ([A-Z]+)", data)
        if not modes:
            raise Exception("ERROR: mode not set for %s!" % f_path)
        print("WARNING: ion mode is set by MODE for %s!" % f_path)
    spectrum.mode = modes[0]

    # NOTE(review): 1.00794 is presumably the hydrogen mass gained or lost
    # on ionisation -- confirm against the data source.
    if spectrum.mode == 'POSITIVE':
        spectrum.mass = spectrum.precursor - 1.00794
    else:
        spectrum.mass = spectrum.precursor + 1.00794

    ppm_hits = re.findall(r"SE\$SEARCH_PPM[: ]+([0-9\.]+)", data)
    if not ppm_hits:
        raise Exception("ERROR: PPM not set for %s!" % f_path)
    # The pattern admits decimals, which int() alone rejects ("5.0").
    # Whole values stay ints as before; fractional ones are kept as floats.
    ppm = float(ppm_hits[0])
    if ppm.is_integer():
        ppm = int(ppm)
    spectrum.ppm = ppm

    # Every non-blank line after the PK$PEAK header is a peak, up to the
    # "//" line that terminates a MassBank record.
    peaks = []
    in_peak_list = False
    for line in data.split("\n"):
        if not line.strip():
            continue
        if "PK$PEAK" in line:
            in_peak_list = True
            continue
        if in_peak_list:
            if line.strip().startswith("//"):
                break
            if "N/A" in line:
                raise Exception("ERROR: no peaks in %s" % f_path)
            fields = line.split()
            peaks.append((float(fields[0]), float(fields[1])))
    spectrum.peaks = peaks

    # Collision energy is optional and only stored when it is a plain
    # integer once an "eV" suffix has been removed.
    energies = re.findall(r"COLLISION_ENERGY (\w+)", data)
    if energies:
        energy = energies[0].replace("eV", "")
        if energy.isdigit():
            spectrum.ce = int(energy)
    return spectrum