def get_search():
    '''
    Compile the configured search form into a regular expression.

    The pattern is read from `defaults.DEFAULTS['search_form']`. The
    match ignores case unless `defaults.DEFAULTS['search_case_sensitive']`
    is truthy.
    '''

    case_sensitive = defaults.DEFAULTS['search_case_sensitive']
    pattern = defaults.DEFAULTS['search_form']

    if case_sensitive:
        return re.compile(pattern)
    return re.compile(pattern, re.I)
def __init__(self):
    '''Create the gene downloader and compile the filter expression.'''
    super(UniProtServerIterator, self).__init__()

    self.downloader = uniprot.GeneDownloader()

    # a filter flagged for escaping is matched literally, so its regex
    # metacharacters are escaped before compilation
    pattern = re.escape(self.filter) if self.escape_filter else self.filter
    self.regex = re.compile(pattern)
def fromengine(cls, engine, start=False, end=False):
    '''
    Initialize the class from the start and end subs of an engine.

    Args:
        engine: object exposing `defaults.start` and `defaults.end`
        start (bool): compile the start sub to a regex when truthy
        end (bool): compile the end sub to a regex when truthy

    Returns:
        `cls(startsub, endsub)`, where each sub is either the raw
        value from `engine.defaults` or its compiled regex.
    '''

    subs = []
    for name, compiled in (('start', start), ('end', end)):
        sub = getattr(engine.defaults, name)
        subs.append(re.compile(sub) if compiled else sub)

    return cls(*subs)
def __init__(self, source):
    '''Bind the search engine and compile the modification parser.'''
    super(ParsePDCsv, self).__init__(source)
    logger.Logging.info("Initializing ParsePDSqlite3....")

    self.engine = get_engine(self.data)

    # the `_mod` template is filled with the residues and both termini
    nterm = self.engine.get('nterm')
    cterm = self.engine.get('cterm')
    residues = ''.join(chemical_defs.AMINOACIDS)

    pattern = self._mod.format(residues, nterm, cterm)
    self.mod_parser = re.compile(pattern)
class ParseCsv(base.MatchedPeptideBase):
    '''Processes data from matched scans to dictionary from a file object'''

    # REGEXP
    # ------
    header = re.compile(HEADER)

    def __init__(self, row):
        '''Bind the CSV utilities and the modification parser to `row`.'''
        super(ParseCsv, self).__init__(row)

        self.csv = csv_.CSVUtils(row)
        # modification cells are post-processed by `getmodification`
        self.csv.process['modifications'] = self.getmodification

        self.modparser = modifications.ModificationParser(row)

    @logger.call('matched', 'debug')
    def __call__(self):
        '''
        Finds search name from Protein Prospector output, finds dataframe
        and dumps to dictionary of lists, where the keys are columns
        and lists values.
        '''

        self.setfileheader()

        # `setfileheader` consumed the first line; skip the rest of the
        # header block. The countdown is bounded so a configured header
        # of 0 (i.e. -1 lines left) skips nothing instead of looping
        # forever, which `while header:` did.
        remaining = self.row.engines['matched'].defaults.header - 1
        while remaining > 0:
            self.fileobj.readline()
            remaining -= 1

        self.csv.set_reader(self.fileobj)
        self.csv()

        self.setids()

    #     SETTERS

    def setfileheader(self):
        '''Grabs the search and project names from the header line.'''

        # NOTE(review): a first line that does not match `HEADER` leaves
        # `match` as None and raises AttributeError on `.group` below.
        match = self.header.match(self.fileobj.readline())
        attrs = self.row.data['attrs']
        attrs['project'] = match.group(1).strip()
        attrs['search'] = match.group(2).strip()

    #     GETTERS

    def getmodification(self, unparsed):
        '''
        Returns the parsed modification from the unparsed string.

        Args:
            unparsed (str, list): raw modification; only the first item
                of a list is used

        Returns:
            The result of `Modification.todict()` after parsing.
        '''

        # the modification belongs to the most recently added peptide
        matched = self.row.data['matched']
        peptide = matched['peptide'][-1]
        start = matched['start'][-1]

        # parse the modification
        if isinstance(unparsed, list):
            unparsed = unparsed[0]
        modification = Modification(unparsed, peptide, start)
        self.modparser(modification)

        return modification.todict()
def __init__(self, fileobj, group, engine):
    '''
    Store the parse targets and resolve the engine-specific helpers.

    Args:
        fileobj: file object to parse
        group: holder the parsed scans are written to
        engine: search engine exposing `tostr()` and `defaults.regexp`
    '''
    super(ParseText, self).__init__()

    self.fileobj = fileobj
    self.group = group
    self.source = self.app.discovererthread

    self.scan_finder = scan_parser.ScanFinder.fromengine(engine)

    # the parser method is looked up by name from the engine identifier
    parser_name = PARSERS[engine.tostr()]
    self._parser = getattr(self, parser_name)

    scan_pattern = engine.defaults.regexp
    self.re_scan = re.compile(scan_pattern)
class ColumnsDict(dict):
    '''
    Custom __getitem__ that removes suffixes.

    A string key ending in a space followed by a single `[A-Z0-9]`
    character is looked up without those two trailing characters. A
    list or tuple key is reduced to its last non-blank string first.
    Any other key is looked up unchanged.
    '''

    # REGEXES
    # -------
    suffix = re.compile(r'^.* (?:[A-Z0-9])$')
    whitespace = re.compile(r'\s+')

    #     MAGIC

    def __getitem__(self, key, dict_getitem=dict.__getitem__):
        '''CD[k] -> v'''

        return dict_getitem(self, self._keychecker(key))

    #    HELPERS

    def _keychecker(self, item):
        '''Returns the suffix-less key'''

        if isinstance(item, six.string_types):
            return self._stringchecker(item)
        elif isinstance(item, (list, tuple)):
            return self._sequencechecker(item)
        # any other key type is looked up as-is; previously this fell
        # through and returned None, so every such lookup used `None`
        return item

    def _stringchecker(self, item):
        '''Returns the suffix-less key from a string column'''

        if self.suffix.match(item):
            # drop the separating space and the one-character suffix
            return item[:-2]
        return item

    def _sequencechecker(self, item):
        '''Returns the suffix-less key from a column sequence'''

        # comparative has null string columns in row 1, while quantitative
        # has null or isotope columns in row 0
        string = [i for i in item if self.whitespace.sub("", i)][-1]
        return self._stringchecker(string)
def __init__(self, row, group, fileobj):
    '''
    Store the parse targets and resolve the MS1 engine helpers.

    Args:
        row: data row exposing `engines['ms1']`
        group: holder the parsed scans are written to
        fileobj: file object to parse
    '''
    super(ParseFullms, self).__init__()

    self.row = row
    self.group = group
    self.fileobj = fileobj

    ms1_engine = row.engines['ms1']

    self.extractor = scans.ChromatogramExtractor(row)
    self.scan_finder = scan_parser.ScanFinder.fromengine(ms1_engine, end=True)

    # the parser method is looked up by name from the engine identifier
    parser_name = PARSERS[ms1_engine.tostr()]
    self._parser = getattr(self, parser_name)

    scan_pattern = ms1_engine.defaults.regexp
    self.re_scan = re.compile(scan_pattern)
# LICENSE=Licensed to: --- # MP= # NM= # COM=10ftmol BSA # IATOL= # IA2TOL= # IASTOL= # IBTOL= # IB2TOL= # IBSTOL= # IYTOL= # IY2TOL= # REGEXP # ------ CONSTANT_RESIDUES = re.compile(r'C_term|N_term|[A-Z]') FIXEDMOD = re.compile(r'^FixedMod\d*$') FIXMOD_RESIDUES = re.compile(r'^FixedModResidues\d*$') # CONSTANTS # --------- TERMINI = {'C_term': 'cterm', 'N_term': 'nterm'} # HELPERS # ------- @logger.init('matched', 'DEBUG') class Hits(base.BaseObject): '''Convenience class to facilitate adding Mascot search hits to a scan''' def __init__(self, modifications, queries):
class MSFParser(SQLiteUtils):
    '''
    Provides convenient methods to parse the MSF file format, which is a
    simple, readable SQLITE3 format.
    '''

    _fraction = None
    # splits on either Windows or POSIX path separators
    _path_sep = re.compile(r'\\|/')

    def __init__(self, parent):
        '''
        Open the SQLite database behind `parent.fileobj` and load its mods.

        Args:
            parent: object exposing `data`, `source`, `fragments` and a
                `fileobj` whose `name` is the database path
        '''
        super(MSFParser, self).__init__()

        self.data = parent.data
        self.engine = get_engine(self.data)
        self.source = parent.source
        self.fragments = parent.fragments

        self.database = sqlite3.connect(parent.fileobj.name)
        self.cursor = self.database.cursor()

        # {PeptideID: SpectrumID}, filled by `init_spectrum`
        self.peptide_to_spectrum = {}
        # {SpectrumID: {PeptideID: hit, 'num': ..., 'z': ..., ...}}
        self.peptides = mapping.OrderedRecursiveDict()
        # {ProteinID: (id, name)}, filled by `set_proteins`
        self.proteins = {}

        # all mods are defined internally within the file
        mods = ProteomeDiscovererMods(self.cursor, self.engine)
        # need to convert to the namedtuple format
        params.tupleize(mods)
        self.mod_ids = mods.ids
        self.engine['mods'].update(mods)

    def run(self):
        '''On start'''

        self.init_spectrum()
        self.fetch_enzyme()
        self.set_file()
        self.add_spectrum()
        self.add_scores()
        self.add_mods()
        self.add_ppms()
        self.set_proteins()
        self.add_proteins()

    # ------------------
    #       MAIN
    # ------------------

    def init_spectrum(self):
        '''Sets the peptides with a SpectrumID as a key reference'''

        columns = "PeptideID, SpectrumID, Sequence, SearchEngineRank"
        self.cursor.execute("SELECT {} FROM Peptides;".format(columns))

        # Spectrum ID is the unique spectral identifier
        # Peptide ID is the unique peptide identifier
        # Sequence is the peptide sequence, ex: "KATNE"
        for peptide_id, spectrum_id, sequence, rank in self.cursor:
            self.peptide_to_spectrum[peptide_id] = spectrum_id
            entry = self.peptides[spectrum_id][peptide_id]
            entry['peptide'] = sequence
            entry['rank'] = rank

    def fetch_enzyme(self):
        '''Extracts the proteolytic enzyme for the peptide search'''

        columns = "ParameterName, ParameterValue"
        table = "ProcessingNodeParameters"
        items = {k: v for k, v in self.fetch("fetchall", columns, table)}

        try:
            self.data['enzyme'] = items['Enzyme']
        except KeyError:
            # no enzyme defined
            pass

    def set_file(self):
        '''Sets the current file name(s) used, taken from the RAW file'''

        files = self.fetch("fetchall", "FileName", "FileInfos")
        # joins consecutive file names together, FileInfos should be 1
        # use regex path splitter, since Windows path mappings don't work
        # on Linux or OS X.
        files = (self._path_sep.split(i[0])[-1] for i in files)
        self._fraction = ' - '.join(files)

        for entry in self.peptides.values():
            entry['fraction'] = self._fraction

    def add_spectrum(self):
        '''Adds the spectral data, which includes the m/zs and charge states'''

        columns = "SpectrumID, ScanNumbers, Charge, Mass"
        self.cursor.execute("SELECT {} FROM SpectrumHeaders;".format(columns))

        # Spectrum ID is the unique spectral identifier
        # num is the scan number associated with the spectral ID
        # charge is the charge state of the peptide
        # mass is the singly-charged mass of the peptide
        for spectrum_id, num, charge, mass in self.cursor:
            self.peptides[spectrum_id]['num'] = num
            self.peptides[spectrum_id]['z'] = charge

            # the stored mass carries one proton, remove it before
            # computing the m/z for the actual charge
            mz = masstools.mz(mass - params.PROTON_MASS, charge, 0)
            self.peptides[spectrum_id]['m/z'] = mz

    def add_scores(self):
        '''
        Adds the score and calculates the EV from the score
        : score == -10*log(ev, 10)
        '''

        columns = "PeptideID, ScoreValue"
        self.cursor.execute("SELECT {} FROM PeptideScores;".format(columns))

        # Peptide ID is the unique peptide identifier, which
        # is for each peptide hit from the spectral ID
        # Score is the - 10 * log(p-value, 10)
        for peptide_id, score in self.cursor:
            expect = 10**(-score / 10)
            spectrum_id = self.peptide_to_spectrum[peptide_id]
            entry = self.peptides[spectrum_id][peptide_id]
            entry['score'] = score
            # need to calculate the p-value, or expectation value
            entry['ev'] = expect

    def add_mods(self):
        '''Finds all the target mods from the IDs and adds them to a holder'''

        self._mod_templates()
        self._internal_mods()
        self._terminal_mods()

    def add_ppms(self):
        '''Calculates the PPMs from the mods and peptides for each entry'''

        keys = ['peptide', 'm/z', 'z']
        for peptide_id, spectrum_id in self.peptide_to_spectrum.items():
            entry = self.peptides[spectrum_id]
            hit = entry[peptide_id]
            mod = hit['mods']

            # NOTE(review): `init_spectrum` stores 'peptide' on the
            # per-peptide hit, while this lookup reads it from the
            # per-spectrum entry -- confirm `entry.get('peptide')`
            # resolves to the sequence as intended.
            peptide, exper, charge = map(entry.get, keys)
            hit['formula'] = formula = self.calculate_formula(peptide, mod)
            hit['ppm'] = self.calculate_ppm(formula, mod, exper, charge)

    def set_proteins(self):
        '''Creates a {ProteinID: (UniProt ID: Protein Name)} holder'''

        columns = 'ProteinID, Description'
        table = "ProteinAnnotations"
        self.cursor.execute("SELECT {0} FROM {1};".format(columns, table))

        for protein_id, description in self.cursor:
            # description == '>sp|P62894|CYC_BOVIN Cytochrome ...'
            # bound the split so a '|' inside the protein name stays in
            # the name instead of breaking the two-item unpacking
            id_, name = description.split('|', 2)[1:]
            self.proteins[protein_id] = (id_, name)

    def add_proteins(self):
        '''Adds the protein names and IDs to each entry'''

        columns = "PeptideID, ProteinID"
        self.cursor.execute("SELECT {} FROM PeptidesProteins;".format(columns))

        for peptide_id, protein_id in self.cursor:
            spectrum_id = self.peptide_to_spectrum[peptide_id]
            entry = self.peptides[spectrum_id][peptide_id]

            id_, name = self.proteins[protein_id]
            entry['id'] = id_
            entry['name'] = name

    # ------------------
    #       UTILS
    # ------------------

    #      MODS

    def _internal_mods(self):
        '''Adds all the internal modifications to the mod holders'''

        columns = "PeptideID, AminoAcidModificationID, Position"
        table = "PeptidesAminoAcidModifications"
        self.cursor.execute("SELECT {} FROM {};".format(columns, table))

        for peptide_id, mod_id, position in self.cursor:
            spectrum_id = self.peptide_to_spectrum.get(peptide_id)
            modname = self.mod_ids[mod_id]
            # for some weird reason, the mods can have peptide IDs which don't
            # exist otherwise, causing errors. Not decoys, nothing.
            if spectrum_id is not None:
                mods = self.peptides[spectrum_id][peptide_id]['mods']
                mods['certain'].setdefault(modname, [])
                mods['certain'][modname].append(position)

    def _terminal_mods(self):
        '''Adds all the N-/C-terminal modifications to the mod holders'''

        columns = "PeptideID, TerminalModificationID"
        table = "PeptidesTerminalModifications"
        self.cursor.execute("SELECT {} FROM {};".format(columns, table))

        for peptide_id, mod_id in self.cursor:
            spectrum_id = self.peptide_to_spectrum.get(peptide_id)
            # for some weird reason, the mods can have peptide IDs which don't
            # exist otherwise, causing errors. Not decoys, nothing.
            if spectrum_id is not None:
                mods = self.peptides[spectrum_id][peptide_id]['mods']
                modname = self.mod_ids[mod_id]
                mods['certain'].setdefault(modname, [])

                # the position is the terminus named in the mod residues
                if self.engine['nterm'] in self.engine['mods'][modname][1]:
                    mods['certain'][modname].append(self.engine['nterm'])
                else:
                    mods['certain'][modname].append(self.engine['cterm'])

    def _mod_templates(self):
        '''Sets the mod template holders for each peptide ID'''

        template = params.TEMPLATES['mods']
        for peptide_id, spectrum_id in self.peptide_to_spectrum.items():
            hit = self.peptides[spectrum_id][peptide_id]
            # each hit gets its own copy so holders are never shared
            hit.setdefault('mods', copy.deepcopy(template))
# CONSTANTS # --------- REPLACE = '[REPLACE]' SKYLINE = r'\[CROSSLINKER(\+-?[0-9]+\.[0-9])?\]' CROSSLINKER_POSITIVE = '[CROSSLINKER+{0}]' CROSSLINKER_NEGATIVE = '[CROSSLINKER{0}]' CROSSLINKER = '[CROSSLINKER]' SKYLINE_POSITIVE = '[+{0}]' SKYLINE_NEGATIVE = '[-{0}]' # REGEXES # ------- PARENTHESES = re.compile(r'\(|\)') LETTERS = re.compile('([A-Z]{2})') MASS_MODIFICATION = re.compile(r'(\[(?:\+|-)([0-9]*\.?[0-9]?)\])') # HELPERS # ------- def mappedresidues(deadends): '''Maps the CSV residue keys as values to each individual residue''' mapped = defaultdict(list) for key in deadends: for residue in key.split(','): mapped[residue].append(key)
class PeptideDatabase(PeptideDBSettings, CutSites, AddMods, HDF5Utils):
    '''
    Builds a custom peptide database inside an HDF5-style group: the
    sequences are cut, modified and stored once in 'standard' mode and,
    when decoy searching is enabled, once more in 'decoy' mode.
    '''

    # database modes to build, keyed by whether decoys are searched
    db_modes = {True: {'standard', 'decoy'}, False: {'standard'}}

    peptides = None
    searchables = None
    mods_length = None
    modifications_dtype = None
    mod_ids = None
    _mode = None

    id_regex = re.compile(uniprot.SERVER['id']['regex'], re.IGNORECASE)
    entry_regex = re.compile(uniprot.SERVER['entry']['regex'], re.IGNORECASE)

    def __init__(self, grp, xler, source):
        '''
        Configure the database from the crosslinker and build it.

        Args:
            grp: group the database is written to
            xler: crosslinker mapping ('react_sites', 'fragments', ...)
            source: object exposing the protein holders
        '''
        super(PeptideDatabase, self).__init__()

        self.grp = grp
        self.xler = xler
        self.source = source

        self.mods = params.CUSTOM_MODS
        self.react = set(self.xler['react_sites'])
        # default to true, newly set value
        self.uncleaved = self.xler.get("uncleaved", True)

        self.fragment_masses = self.get_fragment_masses()
        self.basemods = self.organize_basemods()
        self._set_mod_ids()

        self.sequences = self._get_sequences()
        self.decoy = params.MASS_FINGERPRINT['search_decoys']
        self._peptide_holders()

        self.run()

    def run(self):
        '''On start'''

        self._mode = 'standard'
        self.add_ids()
        self.make_searchables()

        if self.decoy:
            self._mode = 'decoy'
            self.make_searchables()

        # the intermediate holders are removed before linearizing
        for temporary in {'base_peptides', 'peptides'}:
            del self.grp[temporary]

        self.linearize()

    # ------------------
    #      PUBLIC
    # ------------------

    def add_ids(self):
        '''Store the protein ids and names as attributes of the group'''

        encoded_ids = [key.encode('utf-8') for key in self.sequences]
        self.grp.attrs.create('protein_ids', data=encoded_ids)

        encoded_names = [
            protein.name.encode('utf-8')
            for protein in self.sequences.values()
        ]
        self.grp.attrs.create('protein_names', data=encoded_names)

    def make_searchables(self):
        '''Cut the sequences and add the mods for the current mode'''

        self.cut_sequences()
        self.add_mods()

    # ------------------
    #  PRIVATE -- INIT
    # ------------------

    def _get_sequences(self):
        '''Collect the sequences for each id in the limited database'''

        found = {}
        for protein_id in self.source.limited_database:
            # custom proteins take precedence over the gene names
            if protein_id in self.source.custom_proteins:
                found[protein_id] = self.source.custom_proteins[protein_id]
            elif protein_id in self.source.gene_name:
                found[protein_id] = self.source.gene_name[protein_id]

        return found

    def _peptide_holders(self):
        '''Adds peptide holders for later simulated cutting'''

        for name in {'base_peptides', 'decoy', 'standard'}:
            self.grp.create_group(name)

        holder = self.grp.create_group('peptides')
        modes = self.db_modes[self.decoy]
        for mode in modes:
            # one subgroup per mode and missed-cleavage count
            for missed in range(self.max_missed + 1):
                holder.create_group('{}/{}'.format(mode, missed))

        # temporary, fast data holder for in memory searchables
        self.searchables = {mode: defaultdict(list) for mode in modes}

    def _set_mod_ids(self):
        '''
        Number every modification name and store both the numbers and
        the names as attributes of the group.
        '''

        names = [mod[0] for mods in self.basemods.values() for mod in mods]
        names += self.xler['fragments']['name']

        # grabs the number of places required to store data
        self.mods_length = (len(names) // 10) + 1
        # +2 for n and c-term
        width = self.mods_length * self.max_length + 2
        self.modifications_dtype = 'S{}'.format(width)

        self.mod_ids = {name: index for index, name in enumerate(names)}

        # store as attrs to unpack later
        self.grp.attrs.create('modification_ids', data=range(len(names)))
        encoded = [name.encode('utf-8') for name in names]
        self.grp.attrs.create('modification', data=encoded)
# MP= # NM= # COM=10ftmol BSA # IATOL= # IA2TOL= # IASTOL= # IBTOL= # IB2TOL= # IBSTOL= # IYTOL= # IY2TOL= # REGEXP # ------ CONSTANT_RESIDUES = re.compile(r'C_term|N_term|[A-Z]') FIXEDMOD = re.compile(r'^FixedMod\d*$') FIXMOD_RESIDUES = re.compile(r'^FixedModResidues\d*$') # CONSTANTS # --------- TERMINI = { 'C_term': 'cterm', 'N_term': 'nterm' } # HELPERS # -------
''' # load modules from xldlib import exception from xldlib.definitions import re, ZIP from xldlib.objects import matched from xldlib.qt.objects import base from xldlib.utils import logger from . import hierarchical # REGEXES # ------- NUMBER = re.compile(r'-?[0-9]*\.?[0-9]+') # HELPERS # ------- @logger.init('matched', 'DEBUG') class CheckTermini(base.BaseObject): '''Helper class which identifies false modification termini''' def __init__(self, engine): super(CheckTermini, self).__init__() self.nterm = engine.defaults.nterm self.engine_modifications = engine.defaults.modifications
from xldlib.definitions import re # load objects/functions from collections import defaultdict # CONSTANTS # --------- HEADER_CHARACTERS = 3 # REGEXP # ------ FIRST_CAP = re.compile('(.)([A-Z][a-z]+)') ALL_CAP = re.compile('([a-z0-9])([A-Z])') # REGISTER # -------- # Item register to avoid malicious, external script execution REGISTER = defaultdict(set) NAME_REGISTER = {} # DATA #---- BUILTINS = {
(comparable to BioPython). :copyright: (c) 2015 The Regents of the University of California. :license: GNU GPL, see licenses/GNU GPLv3.txt for more details. ''' # load modules from xldlib import exception from xldlib.definitions import re from xldlib.utils import logger from xldlib.utils.io_ import high_level, ziptools # REGEXP # ------ ISOFORM = re.compile(r'^(.*)\.\d$') HYPHEN = re.compile(r'-') ASTERIX = re.compile(r'\*') # OBJECTS # ------- class FastaParserMixin(object): ''' Mixin to provide methods to parse FASTA records using specific identifiers. ''' # NON-PUBLIC
Processing Mascot modnames depending on the UniMod specification. :copyright: (c) 2015 The Regents of the University of California. :license: GNU GPL, see licenses/GNU GPLv3.txt for more details. ''' # load modules/submodules from xldlib import resources from xldlib.definitions import re from xldlib.qt.objects import base # REGEXES # ------- PARSERS = [re.compile(i.regexp) for i in resources.SCAN_TITLES] # MATCHER # ------- class TitleFormatter(base.BaseObject): ''' Identify the scan title format based on regex matches, otherwise raise an `AssertionError`. After identifying the title format, use the title formatter to extract the scan number from scan filters. ''' def __init__(self): super(TitleFormatter, self).__init__()
class ChemicalParserMixin(object):
    '''Mixin which parses chemical formulas into a mapping of counts'''

    # REGEX
    # -----
    chemical = re.compile(r'^{}$'.format(CHEMICAL))
    monomer = re.compile(r'^{}$'.format(MONOMER))
    atom = re.compile(r'[A-Z][a-z]?')

    #     PUBLIC

    def update_chemical(self, formula, count):
        '''
        Add the atomic counts of `formula`, scaled by `count`.

        Args:
            formula (string, mapping): chemical or glycan formula
            count (int): scalar for elemental counts

        Any other `formula` type is silently ignored.
        '''

        if isinstance(formula, Mapping):
            self._update_mapping(formula, count)
            return

        if isinstance(formula, six.string_types):
            self._update_str(formula, count)

    #   NON-PUBLIC

    def _update_mapping(self, formula, count):
        '''
        Add the counts of a {symbol: {isotope: number}} `formula`,
        scaled by `count`. See `update_chemical` for full arg specs.
        '''

        for symbol, isotopes in formula.items():
            holder = self[symbol]
            for isotope, number in isotopes.items():
                holder[isotope] += number * count

    def _update_str(self, formula, count):
        '''
        Add the counts of a whitespace-separated str `formula`,
        scaled by `count`. See `update_chemical` for full arg specs.
        '''

        for token in formula.split():
            for symbol, isotope, number in self._parse(token):
                self[symbol][isotope] += number * count

    def _parse(self, item):
        '''
        Return an iterable of (symbol, isotope, count) for `item`.

        Args:
            item (str): str in format of "13C6", "13C(6)", "Hex", "C6"
        '''

        element = self.chemical.match(item)
        if not element:
            # not an element, so treat the item as a glycan monomer
            return self._parse_monomer(self.monomer.match(item))

        return [self._parse_element(element)]

    def _parse_element(self, match):
        '''Extract (symbol, isotope, count) from an element `match`'''

        isotope, symbol, free, parentheses = match.groups()
        assert self._symbolchecker(symbol)

        # -1 stands in for an unspecified isotope
        isotope = int(isotope or -1)
        return symbol, isotope, _element_count(free, parentheses)

    def _parse_monomer(self, match):
        '''Yield (symbol, isotope, count) for a glycan monomer `match`'''

        monomer, free, parentheses = match.groups()
        multiplier = _element_count(free, parentheses)

        for element in MONOMERS[monomer].split():
            parsed = self.chemical.match(element)
            symbol, isotope, number = self._parse_element(parsed)
            yield symbol, isotope, number * multiplier

    #     HELPERS

    def _symbolchecker(self, symbol):
        '''Match `symbol` against the atomic symbol pattern'''

        return self.atom.match(symbol)
def getproteinmods(self, modification, start):
    '''
    Return the protein-level positions for `modification`.

    The modification is concatenated first when the
    'concatenate_hybrid_modifications' default is set. The certain
    positions are listed first; the uncertain positions are appended
    as a single item when the modification has any.
    '''

    concatenate = defaults.DEFAULTS['concatenate_hybrid_modifications']
    if concatenate:
        modification = modification.concatenate()

    positions = []
    positions.extend(self.getcertain(modification, start))

    if modification['uncertain']:
        uncertain = self.getuncertain(modification, start)
        positions.append(uncertain)

    return positions


# REGEXP
# ------
LETTERS = re.compile('([A-Z]{2})')
PARENTHESES = re.compile(r'(\))')

# STRINGS
# -------
TERMINUS = '{0}-{1}'
INTERNAL = '{0}({1}){2}'


# PEPTIDE-EMBEDDED MODIFICATIONS
# ------------------------------


@logger.init('spreadsheet', 'DEBUG')
class ModificationsInPeptide(base.BaseObject):
    '''Adds the given user mods to the target peptide sequence'''
from xldlib.utils import decorators, logger, xictools __all__ = [ 'Amplitudes', 'Dataframe', 'HierarchicalDataframe', 'QuantitativeDataframe' ] # CONSTANTS # --------- LOWER_SIGMA = u'\u03C3' UPPER_SIGMA = u'\u03A3' # REGEXES # ------- NONQUANTIFIED = re.compile(u'<|>|-|{}'.format(xictools.INFINITY), re.UNICODE) # DATA # ---- CONCATENATED = { 'report', 'best_peptide', 'best_peptide_file', } POLYPEPTIDE = { reports.LINKTYPES['interlink'], reports.LINKTYPES['multilink'], }
Processing Mascot modnames depending on the UniMod specification. :copyright: (c) 2015 The Regents of the University of California. :license: GNU GPL, see licenses/GNU GPLv3.txt for more details. ''' # load modules/submodules from xldlib import resources from xldlib.definitions import re from xldlib.qt.objects import base # REGEXES # ------- PARSERS = [re.compile(i.regexp) for i in resources.SCAN_TITLES] # MATCHER # ------- class TitleFormatter(base.BaseObject): ''' Identify the scan title format based on regex matches, otherwise raise an `AssertionError`. After identifying the title format, use the title formatter to extract the scan number from scan filters. ''' def __init__(self):
from xldlib.definitions import re, ZIP from xldlib.qt.objects import base from xldlib.resources.parameters import defaults from xldlib.utils import decorators, logger from xldlib.xlpy.tools import peak_picking # load objects/functions from collections import namedtuple # OBJECTS # ------- BinaryData = namedtuple("BinaryData", "data precision compression") # REGEXP # ------ MZML_SCAN = re.compile(r'scan=([0-9]+)') # HELPERS # ------- @logger.init('scans', level='DEBUG') class Start(base.BaseObject): '''Utilities for processing data from XML start elements''' def __init__(self, group): super(Start, self).__init__() self.group = group self.source = self.app.discovererthread def spectrum(self, attrs):
class ScientificSpinBox(DoubleSpinBox):
    '''
    Double spin box which reads and displays floating point numbers
    in scientific notation.
    '''

    # VALIDATION
    # ----------
    float_regex = re.compile(SCIENTIFIC)
    precision = 1

    def __init__(self, parent=None, **kwds):
        super(ScientificSpinBox, self).__init__(parent, **kwds)

        self.validator = QtGui.QDoubleValidator()
        notation = QtGui.QDoubleValidator.ScientificNotation
        self.validator.setNotation(notation)

        # produces a template such as '{:0.1e}' for the set precision
        self._string = '{{:0.{num}e}}'.format(num=self.precision)
        self.formatter = self._string.format

    #  PUBLIC FUNCTIONS

    def validate(self, text, position):
        '''Delegate the validation to the scientific validator'''

        return self.validator.validate(text, position)

    def fixup(self, text):
        '''Normalize `text` by parsing and re-formatting it'''

        value = self.tofloat(text)
        return self.tostr(value)

    def valueFromText(self, text):
        '''Return the float parsed from `text`'''

        return self.tofloat(text)

    def textFromValue(self, value):
        '''Return the scientific notation text for `value`'''

        return self.tostr(value)

    def stepBy(self, steps):
        '''Move the value by `steps` single steps and display it'''

        stepped = self.value() + steps * self.singleStep()
        text = self.formatter(stepped)
        self.lineEdit().setText(text)

    #     HELPERS

    def tofloat(self, text, default=0.):
        '''Return the float parsed from `text`, or `default` if no match'''

        found = self.float_regex.match(text)
        if found is None:
            return default
        return self.frommatch(found)

    def tostr(self, value):
        '''Format `value` with the scientific notation template'''

        return self.formatter(value)

    @staticmethod
    def frommatch(match):
        '''Convert a (significand, sign, exponent) match to a float'''

        sig, sign, exp = match.groups()
        # a missing sign or exponent defaults to '+' and '0'
        sign = '+' if sign is None else sign
        exp = '0' if exp is None else exp

        power = int(sign + exp)
        return float(sig) * (10**power)
'ID_REGEX', 'MNEMONIC_REGEX' ] # CONSTANTS # --------- ID = (r'[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]' r'([A-Z][A-Z0-9]{2}[0-9]){1,2}') MNEMONIC = r'[a-zA-Z0-9]{1,5}_[a-zA-Z0-9]{1,5}' # REGEXP # ------ ID_REGEX = re.compile(ID, re.IGNORECASE) MNEMONIC_REGEX = re.compile(MNEMONIC, re.IGNORECASE) GIT_SERVER = { 'domain': { 'repos': 'repos', 'tags': ['tags'], 'releases': ['releases'], 'assets': ['releases', 'assets'], 'owner': 'Alexhuszagh', 'repo': 'xlDiscoverer' }, 'protocol': 'https', 'scheme': '://', 'host': 'api.github.com',
from xldlib.resources.parameters import defaults from xldlib.utils import decorators, logger from xldlib.xlpy.tools import peak_picking # load objects/functions from collections import namedtuple # OBJECTS # ------- BinaryData = namedtuple("BinaryData", "data precision compression") # REGEXP # ------ MZML_SCAN = re.compile(r'scan=([0-9]+)') # HELPERS # ------- @logger.init('scans', level='DEBUG') class Start(base.BaseObject): '''Utilities for processing data from XML start elements''' def __init__(self, group): super(Start, self).__init__() self.group = group self.source = self.app.discovererthread
class ProteomeDiscovererMods(mapping.MethodlessCopyDict, SQLiteUtils):
    '''
    Processes the SQLITE3 mod objects to produce a mod dictionary
    : 3 SQLITE3 tables -> {modname: formula: res_csv}
    '''

    # {attribute name: (columns, table)} fetched during initialization
    _attrs = {
        'identifiers': ("AminoAcidModificationID, AminoAcidID",
                        "AminoAcidModificationsAminoAcids"),
        'mods': ("AminoAcidModificationID, ModificationName, "
                 "Abbreviation, Substitution, LeavingGroup",
                 "AminoAcidModifications"),
        'aminoacids': ("AminoAcidID, AminoAcidName, OneLetterCode",
                       "AminoAcids"),
    }
    # strips the parentheses from the stored formula strings
    formula = re.compile(r'\(|\)')

    def __init__(self, cursor, engine):
        '''
        Fetch the three mod tables and build the mod dictionary.

        Args:
            cursor: SQLite cursor of the open file
            engine: mapping exposing the 'nterm' and 'cterm' names
        '''
        super(ProteomeDiscovererMods, self).__init__()

        self.cursor = cursor
        self.engine = engine
        self._terms = {self.engine['nterm'], self.engine['cterm']}

        for attr, (column, table) in self._attrs.items():
            setattr(self, attr, self.fetch("fetchall", column, table))

        # key both tables by their leading ID column
        self.aminoacids = {i[0]: i[1:] for i in self.aminoacids}
        self.mods = {i[0]: i[1:] for i in self.mods}
        # {mod ID: modname}
        self.ids = {}

        self.add_standard()
        self.add_terminal()
        self.process_residues()

    # ------------------
    #       MAIN
    # ------------------

    def add_standard(self):
        '''Adds the standard mods which do not have a "terminal" ID'''

        for mod_id, aminoacid_id in self.identifiers:
            modname, abbrev, string_formula, leaving_group = self.mods[mod_id]
            if 'Mascot' in modname:
                continue

            self.ids[mod_id] = modname
            formula = self._get_formula(string_formula, modname, leaving_group)
            if formula is None:
                continue

            # the same holder is shared by the name and the abbreviation
            holder = [formula, set()]
            self.setdefault(modname, holder)
            self.setdefault(abbrev, holder)

            residue = self._get_residue(aminoacid_id)
            self[modname][1].add(residue)

    def add_terminal(self):
        '''Adds the mods which are terminal only'''

        standard_ids = set(i[0] for i in self.identifiers)
        terminal_ids = set(self.mods).difference(standard_ids)

        for mod_id in terminal_ids:
            modname, abbrev = self.mods[mod_id][0:2]
            self.ids[mod_id] = modname

            # `process_residues` overwrites index 1 of every holder in
            # place, so store a copy rather than the module-level
            # TERMINAL_MODS entry itself; otherwise the shared table is
            # altered for every later instance.
            holder = list(TERMINAL_MODS[modname])
            self.setdefault(modname, holder)
            self.setdefault(abbrev, holder)

    def process_residues(self):
        '''Processes the residues from a set to the CSV format'''

        # memoize to avoid double processing the same holder from the
        # modname and the abbrev
        memo = set()
        for values in self.values():
            id_ = id(values)
            if id_ not in memo:
                residues = ','.join(sorted(values[1]))
                values[1] = residues
                memo.add(id_)

    # ------------------
    #       UTILS
    # ------------------

    def _get_formula(self, string_formula, modname, leaving_group):
        '''
        Produces the net mod formula, with the addition or overall
        formula defined by string_formula and the loss by the
        leaving_group.

        If no formula is defined, or building the chemical formula
        raises an AssertionError or AttributeError, the formula is
        looked up by `modname` in MONOMERS instead. A message is
        printed to stderr and None is returned if that also fails.
        '''

        # an explicit check replaces `assert string_formula`, which is
        # removed under `python -O` and let an empty formula reach
        # `re.sub`, raising an uncaught TypeError
        if string_formula:
            try:
                stripped = self.formula.sub('', string_formula)
                molecule = chemical.Molecule(stripped)
                molecule.update_formula(leaving_group, count=-1)
                return molecule.tostr()
            except (AssertionError, AttributeError):
                # fall through to the MONOMERS lookup below
                pass

        formula = MONOMERS.get(modname)
        if formula is None:
            print(exception.CODES['024'].format(modname), file=sys.stderr)

        return formula

    def _get_residue(self, aminoacid_id):
        '''Returns the residue name based on whether N-/C-term or internal'''

        aminoacid_name, one_letter = self.aminoacids[aminoacid_id]
        # termini are stored by name, internal residues by one-letter code
        if aminoacid_name in self._terms:
            residue = aminoacid_name
        else:
            residue = one_letter
        return residue
'HierarchicalDataframe', 'QuantitativeDataframe' ] # CONSTANTS # --------- LOWER_SIGMA = u'\u03C3' UPPER_SIGMA = u'\u03A3' # REGEXES # ------- NONQUANTIFIED = re.compile(u'<|>|-|{}'.format(xictools.INFINITY), re.UNICODE) # DATA # ---- CONCATENATED = { 'report', 'best_peptide', 'best_peptide_file', } POLYPEPTIDE = { reports.LINKTYPES['interlink'], reports.LINKTYPES['multilink'], }
def __init__(self, enzyme=None):
    '''
    Resolve `enzyme` through `_enzymechecker` and compile its cut regex.

    Args:
        enzyme: enzyme specification passed to `_enzymechecker`
    '''
    super(ProteolyticEnzyme, self).__init__()

    self.enzyme = self._enzymechecker(enzyme)

    # compiled once here, from the pattern stored on the enzyme
    pattern = self.enzyme.cut_regex
    self.cut_regex = re.compile(pattern)
Solution: Same as above, only with 'D,E' rather than 'K' ''' # load modules from xldlib import exception from xldlib.definitions import re, ZIP from xldlib.objects import matched from xldlib.qt.objects import base from xldlib.utils import logger from . import hierarchical # REGEXES # ------- NUMBER = re.compile(r'-?[0-9]*\.?[0-9]+') # HELPERS # ------- @logger.init('matched', 'DEBUG') class CheckTermini(base.BaseObject): '''Helper class which identifies false modification termini''' def __init__(self, engine): super(CheckTermini, self).__init__() self.nterm = engine.defaults.nterm self.engine_modifications = engine.defaults.modifications source = self.app.discovererthread
'bzip2', 'gz', 'hdf5', 'mime', 'pkzip', 'raw', 'seek_start', 'sqlite', 'tar', 'xml', ] # REGEX # ----- XML_DECLARATION = re.compile(r'<\?xml version="\d\.\d" encoding=".+"\?>\r?\n') XML_FORMAT = re.compile(r'^\s*<\w+ xmlns=') MIME_DELARATION = re.compile('MIME-Version: .+') # HELPERS # ------- @contextlib.contextmanager def seek_start(fileobj): ''' Context manager which seeks the fileobj start, yields the fileobj, and then re-seeks the object start to allow sequential reads for file-format determination to leave the fileobj start position unchanged.
from xldlib.definitions import re, ZIP from xldlib.qt.objects import base from xldlib.resources.parameters import defaults from xldlib.utils import decorators, logger from xldlib.xlpy.tools import peak_picking # OBJECTS # ------- BinaryData = namedtuple("BinaryData", "data precision compression byteorder") # REGEXP # ------ MZXML_RT = re.compile(r'^PT((\d*\.?\d*)M)?((\d*\.?\d*)S)?$') # HELPERS # ------- @logger.init('scans', level='DEBUG') class Start(base.BaseObject): '''Utilities for processing data from XML start elements''' def __init__(self, group): super(Start, self).__init__() self.source = self.app.discovererthread
# load modules/submodules from xldlib.definitions import re __all__ = ['GIT_SERVER', 'ID_REGEX', 'MNEMONIC_REGEX'] # CONSTANTS # --------- ID = (r'[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]' r'([A-Z][A-Z0-9]{2}[0-9]){1,2}') MNEMONIC = r'[a-zA-Z0-9]{1,5}_[a-zA-Z0-9]{1,5}' # REGEXP # ------ ID_REGEX = re.compile(ID, re.IGNORECASE) MNEMONIC_REGEX = re.compile(MNEMONIC, re.IGNORECASE) GIT_SERVER = { 'domain': { 'repos': 'repos', 'tags': ['tags'], 'releases': ['releases'], 'assets': ['releases', 'assets'], 'owner': 'Alexhuszagh', 'repo': 'xlDiscoverer' }, 'protocol': 'https', 'scheme': '://', 'host': 'api.github.com', 'path': '/',