def _validate_moldb_df(df):
    """Collect validation problems for every row of a molecule database frame.

    Each row gets two checks: every column must hold a value that is neither
    falsy nor whitespace-only, and the ``formula`` column must not contain
    ``"."`` and must be accepted by ``parseSumFormula``.

    Args:
        df: DataFrame with a ``formula`` column. Cell values must support
            ``isspace()`` (presumably all strings -- confirm with the caller).

    Returns:
        list[dict]: one ``{'line', 'row', 'error'}`` entry per problem, ordered
        by ``line``. ``line`` is the row index plus 2 (presumably to account
        for a header line and 1-based numbering -- confirm).
    """
    problems = []

    def report(line_no, record, message):
        # All problems share the same shape; only the message differs.
        problems.append({
            'line': line_no,
            'row': record.values.tolist(),
            'error': message
        })

    for index, record in df.iterrows():
        line_no = index + 2

        for column in df.columns:
            value = record[column]
            if not value or value.isspace():
                report(line_no, record, 'Empty value')

        try:
            if '.' in record.formula:
                raise InvalidFormulaError('"." symbol not supported')
            parseSumFormula(record.formula)
        except Exception as exc:
            # Any failure while checking the formula becomes a row error.
            report(line_no, record, repr(exc))

    return sorted(problems, key=lambda item: item['line'])
def centroids(self, formula):
    """
    Compute centroid peaks for ``formula``, zero-padded to ``self.n_peaks``.

    Args
    -----
    formula : str

    Returns
    -----
    tuple
        ``(mzs, ints)``: two arrays of length ``self.n_peaks``. Slots beyond
        the trimmed peaks stay zero. ``(None, None)`` is returned, after a
        warning is logged, when any step raises.
    """
    try:
        # Called only for validation: raises when the formula is not parsable.
        pyisocalc.parseSumFormula(formula)

        pattern = isotopePattern(str(formula))
        pattern.addCharge(int(self.charge))

        # Resolving power is taken at the first mass of the pattern.
        resolving_power = pattern.masses[0] / (self.sigma * SIGMA_TO_FWHM)
        centr = pattern.centroids(InstrumentModel('tof', resolving_power))

        peak_mzs, peak_ints = self._trim(
            np.array(centr.masses),
            100. * np.array(centr.intensities),
            self.n_peaks,
        )

        count = len(peak_mzs)
        mzs = np.zeros(self.n_peaks)
        ints = np.zeros(self.n_peaks)
        mzs[:count] = np.array(peak_mzs)
        ints[:count] = peak_ints
        return mzs, ints
    except Exception as e:
        logger.warning('%s - %s', formula, e)
        return None, None
def ion_centroids(self, sf, adduct):
    """
    Compute centroid peaks for the ion ``sf + adduct``.

    Args
    ----
    sf : str
    adduct: str

    Returns
    -------
    tuple
        ``(mzs, ints)`` as returned by ``self._trim`` with ``ISOTOPIC_PEAK_N``,
        or ``(None, None)`` (after a warning is logged) when any step raises.
    """
    try:
        ion = sf + adduct
        # Called only for validation: raises when sf and adduct are incompatible.
        pyisocalc.parseSumFormula(ion)

        pattern = isotopePattern(str(ion))
        pattern.addCharge(int(self.charge))

        # Resolving power is taken at the first mass of the pattern.
        resolving_power = pattern.masses[0] / (self.sigma * SIGMA_TO_FWHM)
        centr = pattern.centroids(InstrumentModel('tof', resolving_power))

        return self._trim(
            np.array(centr.masses),
            100. * np.array(centr.intensities),
            ISOTOPIC_PEAK_N,
        )
    except Exception as e:
        logger.warning('%s %s - %s', sf, adduct, e)
        return None, None
def ion_centroids(self, sf, adduct):
    """
    Compute centroid peaks for the ion formed by ``sf`` and ``adduct``.

    Args
    ----
    sf : str
    adduct: str

    Returns
    -------
    tuple
        ``(mzs, ints)`` after trimming with ``ISOTOPIC_PEAK_N``; ``(None, None)``
        when anything raises (the failure is logged as a warning).
    """
    try:
        ion_formula = sf + adduct
        # Validation only: checks that sf and adduct are compatible.
        pyisocalc.parseSumFormula(ion_formula)

        iso_pattern = isotopePattern(str(ion_formula))
        iso_pattern.addCharge(int(self.charge))

        fwhm = self.sigma * SIGMA_TO_FWHM
        model = InstrumentModel('tof', iso_pattern.masses[0] / fwhm)
        centr = iso_pattern.centroids(model)

        raw_mzs = np.array(centr.masses)
        raw_ints = 100. * np.array(centr.intensities)
        trimmed_mzs, trimmed_ints = self._trim(raw_mzs, raw_ints, ISOTOPIC_PEAK_N)
        return trimmed_mzs, trimmed_ints
    except Exception as e:
        logger.warning('%s %s - %s', sf, adduct, e)
        return None, None
def parsable(sf):
    """Report whether ``parseSumFormula`` accepts ``sf``.

    Returns ``True`` when parsing succeeds. Any exception is logged as a
    warning and turned into ``False``.
    """
    try:
        parseSumFormula(sf)
    except Exception as exc:
        logger.warning(exc)
        return False
    else:
        return True
def get_mass(self):
    """Return the centroid value at the most intense peak for ``self.sum_formula``.

    The pattern is built with ``pyisocalc.perfect_pattern`` at ``charge=0``.
    The result is the entry of the first centroid array at the index where the
    second centroid array is largest (presumably masses and intensities --
    confirm against ``get_spectrum``). Intermediate values are logged at INFO.
    """
    logging.info(self.sum_formula)
    logging.info(pyisocalc.parseSumFormula(self.sum_formula))

    parsed = pyisocalc.parseSumFormula(self.sum_formula)
    spec = pyisocalc.perfect_pattern(parsed, charge=0)
    logging.info(spec)

    first_array = spec.get_spectrum(source='centroids')[0]
    second_array = spec.get_spectrum(source='centroids')[1]
    mass = first_array[np.argmax(second_array)]
    logging.info(mass)
    return mass
def get_mass(self):
    """Look up the centroid value of the strongest peak of ``self.sum_formula``.

    Uses ``pyisocalc.perfect_pattern`` with ``charge=0`` and picks, from the
    first centroid array, the element at the argmax of the second centroid
    array (presumably mass at maximum intensity -- confirm). The formula, its
    parsed form, the pattern and the result are each logged at INFO level.
    """
    logging.info(self.sum_formula)
    logging.info(pyisocalc.parseSumFormula(self.sum_formula))

    spec = pyisocalc.perfect_pattern(
        pyisocalc.parseSumFormula(self.sum_formula),
        charge=0,
    )
    logging.info(spec)

    values = spec.get_spectrum(source='centroids')[0]
    strongest = np.argmax(spec.get_spectrum(source='centroids')[1])
    mass = values[strongest]
    logging.info(mass)
    return mass
def is_valid(sf):
    """Report whether ``sf`` is a usable sum formula.

    A formula containing ``"."`` is rejected without being parsed. Otherwise
    the verdict is whether ``parseSumFormula`` accepts it. Every rejection is
    logged as a warning.
    """
    if '.' not in sf:
        try:
            parseSumFormula(sf)
        except Exception as exc:
            LOG.warning(exc)
            return False
        return True

    LOG.warning('"." in formula {}, skipping'.format(sf))
    return False
def get_delta_atoms(self):
    """Return the net element change described by ``self.delta_formula``.

    ``self.delta_formula`` is stripped and, when it does not already start
    with a sign, prefixed with ``"+"`` (both changes are stored back on the
    instance). It is then split on ``+``/``-``; each signed chunk is parsed
    with ``pyisocalc.parseSumFormula`` and the signed amounts are summed per
    element.

    Returns:
        str: concatenation of ``<sign><element><abs(amount)>`` for every
        element whose net amount is non-zero, in first-seen order. Empty
        string when the formula is empty or everything cancels out.
    """
    self.delta_formula = self.delta_formula.strip()
    if not self.delta_formula:
        # Nothing to parse; an empty formula describes no change.
        return ""

    # The original test required the string to start with "+" AND "-" at the
    # same time, which is never true, so unsigned formulae were dropped.
    if not self.delta_formula.startswith(("+", "-")):
        self.delta_formula = "+" + self.delta_formula

    # With the capturing group the split yields ['', sign, chunk, sign, chunk, ...].
    pieces = re.split(u'([+-])', self.delta_formula)

    net_amounts = {}
    for sign, chunk in zip(pieces[1::2], pieces[2::2]):
        factor = -1 if sign == "-" else 1
        # Per-chunk dict: a repeated element inside one chunk keeps its last amount.
        chunk_amounts = {
            segment.element().name(): factor * segment.amount()
            for segment in pyisocalc.parseSumFormula(chunk).get_segments()
        }
        for element, amount in chunk_amounts.items():
            net_amounts[element] = net_amounts.get(element, 0) + amount

    return "".join(
        "{}{}{}".format("+" if amount > 0 else "-", element, abs(amount))
        for element, amount in net_amounts.items()
        if amount != 0
    )
def get_isotope_pattern(self, formula_adduct_string, charge):
    """Build a Gaussian-broadened isotope pattern with centroids attached.

    The perfect pattern for the parsed formula is computed first. Sigma is
    taken from ``self.sigma_at_mz`` at the first value of the first centroid
    array, and the sampling density from ``self.points_per_mz(sigma)``. The
    broadened spectrum is centroided with ``gradient`` and those centroids are
    added to it.

    Returns:
        The spectrum object produced by ``pyisocalc.apply_gaussian``, after
        ``add_centroids`` has been called on it.
    """
    parsed = pyisocalc.parseSumFormula(formula_adduct_string)
    perfect_pattern = pyisocalc.perfect_pattern(parsed, charge=charge)

    first_centroid = perfect_pattern.get_spectrum(source='centroids')[0][0]
    sigma = self.sigma_at_mz(first_centroid)
    pts_per_mz = self.points_per_mz(sigma)

    spec = pyisocalc.apply_gaussian(perfect_pattern, sigma, pts_per_mz)
    profile = spec.get_spectrum()
    # gradient returns three values; the third is not needed here.
    centroided_mzs, centroided_ints, _ = gradient(*profile)
    spec.add_centroids(centroided_mzs, centroided_ints)
    return spec
def parse_sf_string(self, sf_string):
    """Parse a sum-formula string into an ``{element name: amount}`` dict.

    The string can be of form A1B2C3-D4E5+F. Only a leading ``+`` or ``-`` is
    handled here (it is dropped); the rest is passed to ``parseSumFormula``
    unchanged. If an element name occurs in several segments the last amount
    is the one kept.

    Raises:
        IndexError: if ``sf_string`` is empty.
    """
    from pyMSpec.pyisocalc.pyisocalc import parseSumFormula

    if sf_string[0] in ('+', '-'):
        body = sf_string[1:]
    else:
        body = sf_string

    parsed = parseSumFormula(body)
    return {
        segment.element().name(): segment.amount()
        for segment in parsed.get_segments()
    }
def get_isotope_pattern(self, formula_adduct_string, charge):
    """Return the broadened isotope pattern of a formula+adduct string.

    Steps: compute the perfect pattern for the parsed formula, derive sigma
    (``self.sigma_at_mz``) from the first value of its first centroid array,
    derive the sampling density (``self.points_per_mz``), apply a Gaussian,
    centroid the result with ``gradient`` and add those centroids to the
    spectrum that is returned.
    """
    perfect_pattern = pyisocalc.perfect_pattern(
        pyisocalc.parseSumFormula(formula_adduct_string),
        charge=charge,
    )

    centroid_arrays = perfect_pattern.get_spectrum(source='centroids')
    sigma = self.sigma_at_mz(centroid_arrays[0][0])
    pts_per_mz = self.points_per_mz(sigma)

    spec = pyisocalc.apply_gaussian(perfect_pattern, sigma, pts_per_mz)
    # Only the first two values returned by gradient are used.
    mzs, ints, _ = gradient(*spec.get_spectrum())
    spec.add_centroids(mzs, ints)
    return spec
def generate_molecular_weights(self, sf_string, charge=0):
    """Return the first centroid array of the isotope distribution of a formula.

    The formula is parsed with ``self.parse_sf_string`` and rebuilt with the
    element names in sorted order, leaving out elements whose amount is zero.
    The rebuilt formula is handed to ``complete_isodist`` with fixed settings.

    Args:
        sf_string: sum-formula string accepted by ``parse_sf_string``.
        charge: forwarded to ``complete_isodist``.

    Returns:
        Element 0 of ``get_spectrum(source='centroids')`` (presumably the
        centroid m/z values -- confirm).

    Raises:
        ValueError: if any element has a negative amount.
    """
    atoms = self.parse_sf_string(sf_string)

    parts = []
    for element in sorted(atoms):
        amount = atoms[element]
        if amount > 0:
            parts.append('{}{}'.format(element, amount))
        elif amount < 0:
            raise ValueError('negative elements for {} with {}'.format(element, amount))
    canonical_sf = ''.join(parts)

    ms_output = complete_isodist(
        parseSumFormula(canonical_sf),
        charge=charge,
        output='',
        plot=False,
        sigma=0.01,
        resolution=2000000,
        cutoff=0.0001,
        do_centroid=False,
    )
    return ms_output.get_spectrum(source='centroids')[0]
def normalise_sf(sf_string):
    """Return the normalised text form of a sum formula, or ``""`` on failure.

    Args:
        sf_string: sum formula to parse with ``parseSumFormula``.

    Returns:
        ``sf.__unicode__()`` of the parsed formula. ``""`` when parsing raises:
        ``ParseError``/``InvalidFormulaError`` are logged at DEBUG, any other
        ``Exception`` at WARNING.
    """
    from pyMSpec.pyisocalc.pyisocalc import InvalidFormulaError, ParseError
    from pyMSpec.pyisocalc.pyisocalc import parseSumFormula
    import logging

    try:
        sf = parseSumFormula(sf_string)
    except (ParseError, InvalidFormulaError) as e:
        logging.debug(e)
        return ""
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit; those now propagate.
        logging.warning("failed to parse: {}".format(sf_string))
        return ""
    return sf.__unicode__()
def get_mz(self, adduct):
    """
    Calculate the precursor mass for this molecule with a given adduct

    The ion formula comes from ``self.make_ion_formula(adduct)``. The result
    is the entry of the first centroid array at the index where the second
    centroid array is largest.

    :param adduct: object of class Adduct
    :return: float, or -1. when the calculation fails for any reason
    """
    try:
        formula = self.make_ion_formula(adduct)
        spec = pyisocalc.perfect_pattern(pyisocalc.parseSumFormula(formula), charge=adduct.charge)
        # Fetch the centroid spectrum once instead of once per array.
        centroids = spec.get_spectrum(source='centroids')
        return centroids[0][np.argmax(centroids[1])]
    except Exception:
        # The original call passed self.name as the format string with adduct
        # as a stray argument, which breaks log formatting.
        logging.debug('%s %s', self.name, adduct)
        return -1.
def get_mz(self, adduct):
    """
    Calculate the precursor mass for this molecule with a given adduct

    Builds the ion formula with ``self.make_ion_formula(adduct)``, computes
    its perfect pattern at ``adduct.charge`` and returns the entry of the
    first centroid array at the argmax of the second centroid array.

    :param adduct: object of class Adduct
    :return: float, or -1. when any step raises
    """
    try:
        formula = self.make_ion_formula(adduct)
        spec = pyisocalc.perfect_pattern(
            pyisocalc.parseSumFormula(formula), charge=adduct.charge)
        # One spectrum lookup serves both the values and the argmax.
        centroids = spec.get_spectrum(source='centroids')
        return centroids[0][np.argmax(centroids[1])]
    except Exception:
        # Explicit format string: previously self.name was used as the format
        # and adduct as its argument, producing a logging formatting error.
        logging.debug('%s %s', self.name, adduct)
        return -1.
# calculate_isotope_patterns: for every entry of `sum_formulae`, asks
# `instrument.get_isotope_pattern(sum_formula + adduct, charge=charge)` for a
# pattern and stores `isotope_ms.get_spectrum(source='centroids')` in the nested
# dict `mz_list[sum_formula][adduct]`, which is returned.
# When `verbose` is true, progress (n / len(sum_formulae)), the formula and the
# adduct are printed for each entry.
# Entries that raise ParseError, InvalidFormulaError, MemoryError or the outer
# KeyError are reported with print and skipped via `continue`.
# NOTE(review): uses Python 2 print statements.
# NOTE(review): `instrument=[]` is a mutable default, and a list has no
#   get_isotope_pattern method -- callers presumably always pass an instrument.
# NOTE(review): isocalc_sig, isocalc_resolution and isocalc_do_centroid are only
#   referenced from commented-out code in this function.
# NOTE(review): the first except clause names `pyMSpec....ParseError`, but only
#   `pyisocalc` is imported inside the function; `pyMSpec` must be available at
#   module level -- confirm.
# NOTE(review): the line structure of this block is collapsed, so the exact
#   nesting of the inner except branches should be checked against the original
#   file before any refactor.
def calculate_isotope_patterns(sum_formulae, adduct='', instrument=[], isocalc_sig=0.01, isocalc_resolution=200000., isocalc_do_centroid=True, charge=1,verbose=True): from pyMSpec.pyisocalc import pyisocalc ### Generate a mz list of peak centroids for each sum formula with the given adduct mz_list = {} for n, sum_formula in enumerate(sum_formulae): try: if verbose: print n/float(len(sum_formulae)), sum_formula, adduct try: isotope_ms = instrument.get_isotope_pattern(sum_formula+adduct, charge=charge) sf = pyisocalc.parseSumFormula("{}{}".format(sum_formula,adduct)) except pyMSpec.pyisocalc.canopy.sum_formula.ParseError as e: print "error->", str(e), sum_formula, adduct continue except KeyError as e: if str(e).startswith("KeyError: "): print str(e) continue except ValueError as e: if str(e).startswith("Element not recognised"): print str(e) continue else: print sum_formula, adduct raise except pyisocalc.InvalidFormulaError as e: print str(e) continue #try: #isotope_ms = pyisocalc.complete_isodist(sf,sigma=isocalc_sig,charge=charge, pts_per_mz=isocalc_resolution) #isotope_ms = pyisocalc.isodist(sf, plot=False, sigma=isocalc_sig, charges=charge, # resolution=isocalc_resolution, do_centroid=isocalc_do_centroid) except MemoryError as e: #todo: print -> logging.debug print "Memory error: {}{}".format(sf, str(e)) continue except KeyError as e: print "KeyError: {}".format(str(e)) continue if not sum_formula in mz_list: mz_list[sum_formula] = {} mz_list[sum_formula][adduct] = isotope_ms.get_spectrum(source='centroids') return mz_list
def readRelationships(self):
    """
    Load the ontology terms of ``self.filename`` into a directed graph.

    currently links nodes through 'is_a', 'has_role' without any
    distinction between the two

    Populates ``self._g`` (edges point from parent/role to the term),
    ``self._nodes_by_id`` and ``self._ids_by_name``. Terms without an 'id'
    are skipped. From the synonyms, 'RELATED FORMULA' fills
    ``term['sum_formula']`` (and ``term['_sf']`` when the formula parses),
    'RELATED InChI ' fills ``term['InChI']`` and 'RELATED InChIKey' fills
    ``term['InChIKey']``.

    Raises:
        KeyError: if a term with an 'id' has no 'name'.
    """
    logging.info('Loading CheBI ontology into a graph...')
    self._g = nx.DiGraph()
    self._nodes_by_id = {}
    self._ids_by_name = {}

    for term in oboTerms(self.filename):
        if 'id' not in term:
            continue
        term_id = term['id']

        for parent in term.get('is_a', []):
            self._g.add_edge(parent, term_id)

        for relation in term.get('relationship', []):
            if relation.startswith('has_role'):
                # Relationship text is '<type> <target>'; keep the target.
                role = relation.split(' ', 1)[1]
                self._g.add_edge(role, term_id)

        for synonym in term.get('synonym', []):
            if 'RELATED FORMULA' in synonym:
                # The value is the first double-quoted part of the synonym.
                s = synonym.split('"')[1]
                term['sum_formula'] = s
                try:
                    term['_sf'] = pyisocalc.parseSumFormula(s)
                except Exception as e:
                    # Best effort: an unparsable formula leaves '_sf' unset,
                    # but the failure is no longer silent.
                    logging.debug('Could not parse sum formula %r of %s: %s', s, term_id, e)
            elif 'RELATED InChI ' in synonym:
                term['InChI'] = synonym.split('"')[1]
            elif 'RELATED InChIKey' in synonym:
                term['InChIKey'] = synonym.split('"')[1]

        self._nodes_by_id[term_id] = term
        self._ids_by_name[term['name']] = term_id

    logging.info('Found {} terms.'.format(len(self._nodes_by_id)))
def readRelationships(self):
    """
    Load the ontology terms of ``self.filename`` into a directed graph.

    currently links nodes through 'is_a', 'has_role' without any
    distinction between the two

    Populates ``self._g`` (edges point from parent/role to the term),
    ``self._nodes_by_id`` and ``self._ids_by_name``. Terms without an 'id'
    are skipped. The first 'RELATED FORMULA' synonym of a term fills
    ``term['sum_formula']`` (and ``term['_sf']`` when the formula parses);
    later synonyms of that term are not examined.

    Raises:
        KeyError: if a term with an 'id' has no 'name'.
    """
    logging.info('Loading CheBI ontology into a graph...')
    self._g = nx.DiGraph()
    self._nodes_by_id = {}
    self._ids_by_name = {}

    for term in oboTerms(self.filename):
        if 'id' not in term:
            continue
        term_id = term['id']

        for parent in term.get('is_a', []):
            self._g.add_edge(parent, term_id)

        for relation in term.get('relationship', []):
            if relation.startswith('has_role'):
                # Relationship text is '<type> <target>'; keep the target.
                role = relation.split(' ', 1)[1]
                self._g.add_edge(role, term_id)

        for synonym in term.get('synonym', []):
            if 'RELATED FORMULA' in synonym:
                # The value is the first double-quoted part of the synonym.
                s = synonym.split('"')[1]
                term['sum_formula'] = s
                try:
                    term['_sf'] = pyisocalc.parseSumFormula(s)
                except Exception as e:
                    # Best effort: an unparsable formula leaves '_sf' unset,
                    # but the failure is no longer silent.
                    logging.debug('Could not parse sum formula %r of %s: %s', s, term_id, e)
                # Only the first formula synonym is used.
                break

        self._nodes_by_id[term_id] = term
        self._ids_by_name[term['name']] = term_id

    logging.info('Found {} terms.'.format(len(self._nodes_by_id)))
def get_delta_atoms(self):
    """Return the net element change described by ``self.delta_formula``.

    ``self.delta_formula`` is stripped and, when it does not already start
    with a sign, prefixed with ``"+"`` (both changes are stored back on the
    instance). It is then split on ``+``/``-``; each signed chunk is parsed
    with ``pyisocalc.parseSumFormula`` and the signed amounts are summed per
    element. Intermediate values are logged at DEBUG level.

    Returns:
        str: concatenation of ``<sign><element><abs(amount)>`` for every
        element whose net amount is non-zero, in first-seen order. Empty
        string when the formula is empty or everything cancels out.
    """
    self.delta_formula = self.delta_formula.strip()
    if not self.delta_formula:
        # Nothing to parse; an empty formula describes no change.
        return ""

    # The original test required the string to start with "+" AND "-" at the
    # same time, which is never true, so unsigned formulae were dropped.
    if not self.delta_formula.startswith(("+", "-")):
        self.delta_formula = "+" + self.delta_formula

    # With the capturing group the split yields ['', sign, chunk, sign, chunk, ...].
    formula_split = re.split(u'([+-])', self.delta_formula)
    logging.debug(formula_split)

    net_amounts = {}
    for sign, chunk in zip(formula_split[1::2], formula_split[2::2]):
        factor = -1 if sign == "-" else 1
        # Per-chunk dict: a repeated element inside one chunk keeps its last amount.
        chunk_amounts = {
            segment.element().name(): factor * segment.amount()
            for segment in pyisocalc.parseSumFormula(chunk).get_segments()
        }
        for element, amount in chunk_amounts.items():
            logging.debug(net_amounts)
            net_amounts[element] = net_amounts.get(element, 0) + amount
    logging.debug(net_amounts)

    el_string = "".join(
        "{}{}{}".format("+" if amount > 0 else "-", element, abs(amount))
        for element, amount in net_amounts.items()
        if amount != 0
    )
    logging.debug(el_string)
    return el_string
def normalized(sf):
    """Return ``str()`` of ``sf`` as parsed by ``pyisocalc.parseSumFormula``."""
    parsed = pyisocalc.parseSumFormula(sf)
    return str(parsed)
def _sf_elements(sf):
    """List the element name of every segment of the parsed sum formula ``sf``.

    Names appear in segment order, one entry per segment.
    """
    names = []
    for segment in parseSumFormula(sf).get_segments():
        names.append(segment.element().name())
    return names
def _sf_elements(sf):
    """Return the element names of the segments ``parseSumFormula(sf)`` yields.

    The list follows the segment order and has one entry per segment.
    """
    segments = parseSumFormula(sf).get_segments()
    return list(map(lambda segment: segment.element().name(), segments))
def _isodist(self, sf_adduct):
    """Run ``complete_isodist`` on the parsed ``sf_adduct`` string.

    Sigma, charge and points-per-m/z come from the instance; centroiding is
    requested with ``weighted_bins=5``. The ``complete_isodist`` result is
    returned unchanged.
    """
    parsed = parseSumFormula(sf_adduct)
    options = {
        'sigma': self.sigma,
        'charge': self.charge,
        'pts_per_mz': self.pts_per_mz,
        'centroid_kwargs': {'weighted_bins': 5},
    }
    return complete_isodist(parsed, **options)
def normalize_sf(self, sf):
    """Return ``str()`` of ``sf`` as parsed by ``pyisocalc.parseSumFormula``."""
    parsed = pyisocalc.parseSumFormula(sf)
    return str(parsed)
# Script fragment (Python 2 print statements). In order it:
#   - parses the command line with `parser` (defined outside this fragment),
#     builds an Instrument from the args and maps signedAdduct over args.adducts;
#   - asserts args.dynrange > 1 and that every adduct passes isValidAdduct;
#   - sets detection_limit = 1.0 / args.dynrange and resolves the output path
#     against the current working directory;
#   - when args.db is given, reads it line by line and collects
#     str(pyisocalc.parseSumFormula(line.strip())) into the set `db`;
#   - loads W, H, shape, mz_axis (columns 0 and 1) and the two noise arrays
#     from the file args.input via np.load.
# NOTE(review): the asserts are used for argument validation and disappear
#   under `python -O`.
# NOTE(review): the file opened with open(args.db) is never closed explicitly.
# NOTE(review): `layers` is created empty here; its use, and the rest of the
#   script, are outside this fragment.
args = parser.parse_args() print args instr = Instrument(args) adducts = map(signedAdduct, args.adducts) assert(args.dynrange > 1) for adduct in adducts: assert(isValidAdduct(adduct)) detection_limit = 1.0 / args.dynrange output_filename = os.path.join(os.getcwd(), os.path.expanduser(args.output)) db = set() if args.db: for line in open(args.db): sf_str = str(pyisocalc.parseSumFormula(line.strip())) db.add(sf_str) print "target database size:", len(db) layers = {} noise = {} with open(args.input) as f: with np.load(f) as data: W = data['W'] H = data['H'] shape = data['shape'] mz_axis = data['mz_axis'][:, 0] nmf_ppms = data['mz_axis'][:, 1] noise['prob'] = data['noise_prob'] noise['sqrt_avg'] = data['noise_sqrt_avg']
def test_compare_generate_complex_ion_formula_with_pymspec(formula, comma_separated_adducts):
    """``safe_generate_ion_formula`` must match pyisocalc for several adducts.

    The reference value is pyisocalc's string form of the formula with all
    adducts appended in order.
    """
    adduct_list = comma_separated_adducts.split(',')

    actual = safe_generate_ion_formula(formula, *adduct_list)
    reference = str(pyisocalc.parseSumFormula(formula + ''.join(adduct_list)))

    assert actual == reference
def test_compare_generate_simple_ion_formula_with_pymspec(formula, adduct):
    """``generate_ion_formula`` must match pyisocalc for a single adduct.

    The reference value is pyisocalc's string form of ``formula + adduct``.
    """
    actual = generate_ion_formula(formula, adduct)
    reference = str(pyisocalc.parseSumFormula(formula + adduct))

    assert actual == reference
def get_principal_peak(self, formula_adduct_string, charge):
    """Return the centroid value at the strongest peak of the perfect pattern.

    The pattern is computed by ``pyisocalc.perfect_pattern`` for the parsed
    formula at the given charge. The result is the entry of the first centroid
    array at the argmax of the second centroid array (presumably m/z at
    maximum intensity -- confirm against ``get_spectrum``).
    """
    parsed = pyisocalc.parseSumFormula(formula_adduct_string)
    pattern = pyisocalc.perfect_pattern(parsed, charge=charge)
    centroid_arrays = pattern.get_spectrum(source='centroids')

    strongest = np.argmax(centroid_arrays[1])
    return centroid_arrays[0][strongest]