Exemplo n.º 1
0
def execute(filelocation, args, outdir, filters=None,
            executable='msConvert.exe'):
    """Execute the msConvert tool on Windows operating systems.

    The stderr stream of the started process is echoed to ``sys.stdout``
    while the tool is running. The function returns after the stream has
    been closed by the child process and the process has terminated.

    :param filelocation: input file path
    :param args: str() or list(), msConvert arguments for details see the
        msConvert help below.
    :param outdir: path of the output directory
    :param filters: str() or list(), specify additional parameters and filters,
        for details see the msConvert help below.
    :param executable: must specify the complete file path of the msConvert.exe
        if its location is not in the ``PATH`` environment variable.
    """
    procArgs = [executable, filelocation]
    procArgs.extend(aux.toList(args))
    if filters is not None:
        for arg in aux.toList(filters):
            procArgs.extend(['--filter', arg])
    procArgs.extend(['-o', outdir])

    # Open stderr in text mode so that read() returns ``str`` on Python 2 and
    # Python 3 alike. Without it Python 3 returns ``bytes``: the comparison
    # with '' never matches and sys.stdout.write() rejects the value.
    # NOTE: text mode translates '\r' and '\r\n' line endings to '\n'.
    proc = subprocess.Popen(procArgs, stderr=subprocess.PIPE,
                            universal_newlines=True)

    # Do not wait until the tool has finished, display its output immediately.
    try:
        # read(1) returns '' only at end of stream, which ends the iteration.
        for char in iter(lambda: proc.stderr.read(1), ''):
            sys.stdout.write(char)
            sys.stdout.flush()
    finally:
        proc.stderr.close()
    # Block until the process has exited instead of polling in a busy loop.
    proc.wait()
Exemplo n.º 2
0
    def getArrays(self,
                  attr=None,
                  sort=False,
                  reverse=False,
                  selector=None,
                  defaultValue=None,
                  report='lfq'):
        """Return attributes of the selected items as a mapping of arrays.

        The items returned by ``self.getItems(sort, reverse, selector)`` are
        passed to ``_getArrays()``. Afterwards the 'intensities' entry is split
        into one ``numpy.float64`` array per entry of ``self._matrixTemplate``
        and the 'intensities' entry itself is removed.

        :param attr: attribute name or list of attribute names, 'id' and
            'intensities' are always requested in addition
        :param sort: passed on to ``self.getItems()``
        :param reverse: passed on to ``self.getItems()``
        :param selector: passed on to ``self.getItems()``, if None a function
            returning the ``isValid`` attribute of an item is used
        :param defaultValue: passed on to ``_getArrays()``
        :param report: not used by this method

        :returns: the mapping returned by ``_getArrays()``, extended by one
            array per entry of ``self._matrixTemplate``
        """
        if selector is None:
            selector = lambda fgi: fgi.isValid
        requested = [] if attr is None else attr
        requested = set(['id', 'intensities'] + aux.toList(requested))

        selectedItems = self.getItems(sort, reverse, selector)
        result = _getArrays(selectedItems, requested, defaultValue)

        # One empty column per template entry
        for column in self._matrixTemplate:
            result[column] = list()
        # Distribute every intensity row onto the columns
        for row in result['intensities']:
            for column, value in zip(self._matrixTemplate, row):
                result[column].append(value)
        # Convert the collected columns to float arrays
        for column in self._matrixTemplate:
            result[column] = numpy.array(
                result[column], dtype=numpy.float64
            )
        del result['intensities']

        return result
Exemplo n.º 3
0
def execute(filelocation, outpath, executable, args=None, switchArgs=None):
    """Executes the spectra-cluster-cli tool by calling ``java -jar``.

    The stderr stream of the started process is echoed to ``sys.stdout``
    while the tool is running. The function returns after the stream has
    been closed by the child process and the process has terminated.

    :param filelocation: either a single mgf file path or a list of file paths.
    :param outpath: path of the output file, file must not exist
    :param executable: must specify the complete file path of the
        spectra-cluster-cli.jar file, supported version is 1.0.2 BETA.
    :param args: list of arguments containing a value, for details see the
        spectra-cluster-cli help. Arguments should be added as tuples or a list.
        For example: [('precursor_tolerance', '0.5'), ('rounds', '3')]
    :param switchArgs: list of arguments not containing a value, for details see
        the spectra-cluster-cli help. Arguments should be added as strings.
        For example: ['fast_mode', 'keep_binary_files']
    """
    procArgs = ['java', '-jar', executable]
    procArgs.extend(['-output_path', outpath])
    if args is not None:
        for arg in args:
            procArgs.extend(['-' + arg[0], arg[1]])
    if switchArgs is not None:
        procArgs.extend(['-' + arg for arg in switchArgs])

    # Input files are appended after all options.
    procArgs.extend(aux.toList(filelocation))

    # Open stderr in text mode so that read() returns ``str`` on Python 2 and
    # Python 3 alike. Without it Python 3 returns ``bytes``: the comparison
    # with '' never matches and sys.stdout.write() rejects the value.
    # NOTE: text mode translates '\r' and '\r\n' line endings to '\n'.
    proc = subprocess.Popen(procArgs, stderr=subprocess.PIPE,
                            universal_newlines=True)

    # Do not wait until the tool has finished, display its output immediately.
    try:
        # read(1) returns '' only at end of stream, which ends the iteration.
        for char in iter(lambda: proc.stderr.read(1), ''):
            sys.stdout.write(char)
            sys.stdout.flush()
    finally:
        proc.stderr.close()
    # Block until the process has exited instead of polling in a busy loop.
    proc.wait()
Exemplo n.º 4
0
def expectedLabelPosition(peptide, labelStateInfo, sequence=None,
                          modPositions=None):
    """Describe the label modifications expected for one label state.

    Label symbols are collected per sequence position from
    ``labelStateInfo['aminoAcidLabels']``; position 0 is used for the 'nTerm'
    entry, all other positions are those returned by
    ``aux.findAllSubstrings(sequence, labelPosition)``. Afterwards, label
    symbols listed in ``labelStateInfo['excludingModifications']`` are removed
    from positions that carry the respective excluding modification.

    :param peptide: Peptide sequence used to calculate the expected label
        state modifications
    :param labelStateInfo: An entry of :attr:`LabelDescriptor.labels` that
        describes a label state
    :param sequence: unmodified amino acid sequence of :var:`peptide`, if None
        it is generated by :func:`maspy.peptidemethods.removeModifications()`
    :param modPositions: dictionary describing the modification state of
        "peptide", if None it is generated by
        :func:`maspy.peptidemethods.returnModPositions()`

    :returns: {sequence position: sorted list of expected label modifications
                  on that position, ...
               }
    """
    if modPositions is None:
        modPositions = maspy.peptidemethods.returnModPositions(
            peptide, indexStart=0
        )
    if sequence is None:
        sequence = maspy.peptidemethods.removeModifications(peptide)

    # Collect the label symbols expected at each sequence position
    expectedMods = dict()
    for target, symbols in viewitems(labelStateInfo['aminoAcidLabels']):
        symbols = aux.toList(symbols)
        if symbols == ['']:
            # An empty symbol means this target carries no label
            continue
        if target == 'nTerm':
            expectedMods.setdefault(0, list()).extend(symbols)
            continue
        for position in aux.findAllSubstrings(sequence, target):
            expectedMods.setdefault(position, list()).extend(symbols)

    # Drop label symbols that are excluded by a present modification
    exclusions = labelStateInfo['excludingModifications']
    if exclusions is not None:
        for blockingMod, blockedSymbol in viewitems(exclusions):
            if blockingMod not in modPositions:
                continue
            for position in modPositions[blockingMod]:
                symbolsHere = expectedMods.get(position)
                if symbolsHere is None or blockedSymbol not in symbolsHere:
                    continue
                if len(symbolsHere) == 1:
                    # Nothing would remain, remove the position entirely
                    del expectedMods[position]
                else:
                    symbolsHere.remove(blockedSymbol)

    return {
        position: sorted(symbols)
        for position, symbols in viewitems(expectedMods)
    }
Exemplo n.º 5
0
    def _addProteinIdsToGroupMapping(self, proteinIds, groupId):
        """Register a groupId for every given protein in the internal
        ``_proteinToGroupIds`` mapping.

        :param proteinIds: a proteinId or a list of proteinIds, a proteinId
            must be a string.
        :param groupId: str, the groupId that is added to the entry of each
            protein
        """
        idList = AUX.toList(proteinIds)
        for entryId in idList:
            self._proteinToGroupIds[entryId].add(groupId)
Exemplo n.º 6
0
    def _addProteinIdsToGroupMapping(self, proteinIds, groupId):
        """Register a groupId for every given protein in the internal
        ``_proteinToGroupIds`` mapping.

        :param proteinIds: a proteinId or a list of proteinIds, a proteinId
            must be a string.
        :param groupId: str, the groupId that is added to the entry of each
            protein
        """
        idList = AUX.toList(proteinIds)
        for entryId in idList:
            self._proteinToGroupIds[entryId].add(groupId)
Exemplo n.º 7
0
    def addSubsumableToGroups(self, proteinIds, groupIds):
        """Register proteins as subsumable members of one or multiple protein
        groups and record the group membership in the protein to group
        mapping.

        :param proteinIds: a proteinId or a list of proteinIds, a proteinId
            must be a string.
        :param groupIds: a groupId or a list of groupIds, a groupId
            must be a string.
        """
        targetGroupIds = AUX.toList(groupIds)
        for targetId in targetGroupIds:
            proteinGroup = self.groups[targetId]
            proteinGroup.addSubsumableProteins(proteinIds)
            self._addProteinIdsToGroupMapping(proteinIds, targetId)
Exemplo n.º 8
0
    def addSubsumableToGroups(self, proteinIds, groupIds):
        """Register proteins as subsumable members of one or multiple protein
        groups and record the group membership in the protein to group
        mapping.

        :param proteinIds: a proteinId or a list of proteinIds, a proteinId
            must be a string.
        :param groupIds: a groupId or a list of groupIds, a groupId
            must be a string.
        """
        targetGroupIds = AUX.toList(groupIds)
        for targetId in targetGroupIds:
            proteinGroup = self.groups[targetId]
            proteinGroup.addSubsumableProteins(proteinIds)
            self._addProteinIdsToGroupMapping(proteinIds, targetId)
Exemplo n.º 9
0
    def _addProteins(self, proteinIds, containerNames):
        """Insert one or multiple proteinIds into each of the named
        containers.

        Every container is looked up as an attribute of this object and
        extended by calling its ``update()`` method.

        :param proteinIds: a proteinId or a list of proteinIds, a proteinId
            must be a string.
        :param containerNames: list, entries must be one or multiple of
            'leading', 'subset', 'subsumableProteins' or 'proteins'
        """
        idList = AUX.toList(proteinIds)
        for name in containerNames:
            getattr(self, name).update(idList)
Exemplo n.º 10
0
    def _addProteins(self, proteinIds, containerNames):
        """Insert one or multiple proteinIds into each of the named
        containers.

        Every container is looked up as an attribute of this object and
        extended by calling its ``update()`` method.

        :param proteinIds: a proteinId or a list of proteinIds, a proteinId
            must be a string.
        :param containerNames: list, entries must be one or multiple of
            'leading', 'subset', 'subsumableProteins' or 'proteins'
        """
        idList = AUX.toList(proteinIds)
        for name in containerNames:
            getattr(self, name).update(idList)
Exemplo n.º 11
0
def modAminoacidsFromLabelInfo(labelDescriptor):
    """Collect all amino acids and termini that can bear a label according
    to "labelDescriptor".

    :param labelDescriptor: :class:`LabelDescriptor` describes the label setup
        of an experiment

    :returns: set of all non-empty keys found in the 'aminoAcidLabels' entry
        of every label state in ``labelDescriptor.labels``
    """
    labelTargets = set()
    for stateEntry in viewvalues(labelDescriptor.labels):
        for position in viewkeys(stateEntry['aminoAcidLabels']):
            # Empty strings are placeholders and are not reported
            labelTargets.update(
                target for target in aux.toList(position) if target != ''
            )
    return labelTargets
Exemplo n.º 12
0
def modSymbolsFromLabelInfo(labelDescriptor):
    """Collect all modification symbols that are used in the
    labelDescriptor.

    :param labelDescriptor: :class:`LabelDescriptor` describes the label setup
        of an experiment

    :returns: set of all non-empty values found in the 'aminoAcidLabels' entry
        of every label state in ``labelDescriptor.labels``
    """
    labelSymbols = set()
    for stateEntry in viewvalues(labelDescriptor.labels):
        for symbolEntry in viewvalues(stateEntry['aminoAcidLabels']):
            # Empty strings are placeholders and are not reported
            labelSymbols.update(
                symbol for symbol in aux.toList(symbolEntry) if symbol != ''
            )
    return labelSymbols
Exemplo n.º 13
0
def modAminoacidsFromLabelInfo(labelDescriptor):
    """Collect all amino acids and termini that can bear a label according
    to "labelDescriptor".

    :param labelDescriptor: :class:`LabelDescriptor` describes the label setup
        of an experiment

    :returns: set of all non-empty keys found in the 'aminoAcidLabels' entry
        of every label state in ``labelDescriptor.labels``
    """
    labelTargets = set()
    for stateEntry in viewvalues(labelDescriptor.labels):
        for position in viewkeys(stateEntry['aminoAcidLabels']):
            # Empty strings are placeholders and are not reported
            labelTargets.update(
                target for target in aux.toList(position) if target != ''
            )
    return labelTargets
Exemplo n.º 14
0
def modSymbolsFromLabelInfo(labelDescriptor):
    """Collect all modification symbols that are used in the
    labelDescriptor.

    :param labelDescriptor: :class:`LabelDescriptor` describes the label setup
        of an experiment

    :returns: set of all non-empty values found in the 'aminoAcidLabels' entry
        of every label state in ``labelDescriptor.labels``
    """
    labelSymbols = set()
    for stateEntry in viewvalues(labelDescriptor.labels):
        for symbolEntry in viewvalues(stateEntry['aminoAcidLabels']):
            # Empty strings are placeholders and are not reported
            labelSymbols.update(
                symbol for symbol in aux.toList(symbolEntry) if symbol != ''
            )
    return labelSymbols
Exemplo n.º 15
0
    def getArrays(self, attr=None, sort=False, reverse=False,
                  selector=None, defaultValue=None, report='lfq'):
        """Return attributes of the selected items as a mapping of arrays.

        The items returned by ``self.getItems(sort, reverse, selector)`` are
        passed to ``_getArrays()``. Afterwards the 'intensities' entry is split
        into one ``numpy.float64`` array per entry of ``self._matrixTemplate``
        and the 'intensities' entry itself is removed.

        :param attr: attribute name or list of attribute names, 'id' and
            'intensities' are always requested in addition
        :param sort: passed on to ``self.getItems()``
        :param reverse: passed on to ``self.getItems()``
        :param selector: passed on to ``self.getItems()``, if None a function
            returning the ``isValid`` attribute of an item is used
        :param defaultValue: passed on to ``_getArrays()``
        :param report: not used by this method

        :returns: the mapping returned by ``_getArrays()``, extended by one
            array per entry of ``self._matrixTemplate``
        """
        if selector is None:
            selector = lambda fgi: fgi.isValid
        requested = [] if attr is None else attr
        requested = set(['id', 'intensities'] + aux.toList(requested))

        selectedItems = self.getItems(sort, reverse, selector)
        result = _getArrays(selectedItems, requested, defaultValue)

        # One empty column per template entry
        for column in self._matrixTemplate:
            result[column] = list()
        # Distribute every intensity row onto the columns
        for row in result['intensities']:
            for column, value in zip(self._matrixTemplate, row):
                result[column].append(value)
        # Convert the collected columns to float arrays
        for column in self._matrixTemplate:
            result[column] = numpy.array(
                result[column], dtype=numpy.float64
            )
        del result['intensities']

        return result
Exemplo n.º 16
0
 def test_toList(self):
     # Each pair holds the argument passed to MODULE.toList and the value
     # the result has to compare equal to: a tuple compares equal to the
     # same tuple, a string or an integer is wrapped in a one element list.
     cases = [
         ((1, 2, 3, 'A'), (1, 2, 3, 'A')),
         ('A', ['A']),
         (123, [123]),
     ]
     for argument, expected in cases:
         self.assertEqual(MODULE.toList(argument), expected)
0
def mappingBasedGrouping(protToPeps):
    """Performs protein grouping based only on protein to peptide mappings.

    The proteins are split into clusters by ``_findProteinClusters()`` and
    every cluster is processed on its own in five steps:

    1. sameset, subset and redundant (subsumable) proteins are determined
       with the respective ``_find...`` helper functions
    2. a protein group is created for every group initiating protein, then
       subsumable and subset proteins are assigned to the groups
    3. peptides are classified as unique, group unique, group subsumable or
       shared
    4. a ``Protein`` entry is generated for every protein id and added to the
       inference object
    5. the cluster information is stored in the inference object

    :param protToPeps: dict, for each protein (=key) contains a set of
        associated peptides (=value). For Example {protein: {peptide, ...}, ...}

    #TODO: REFACTORING!!!

    :returns: a ProteinInference object
    :raises AssertionError: if one of the internal consistency checks fails
    """
    inference = ProteinInference(protToPeps)
    pepToProts = inference.pepToProts

    proteinClusters = _findProteinClusters(protToPeps, pepToProts)
    # NOTE(review): this local is never read within this function
    proteins = {}
    # Cluster ids start at 1
    for clusterId, proteinCluster in enumerate(proteinClusters, 1):
        clusterProtToPeps = {p: protToPeps[p] for p in proteinCluster}

        #Find sameset proteins, define unique and non unique sameset proteins
        #NOTE: already unique proteins could be excluded to find sameset proteins
        samesetProteins = _findSamesetProteins(clusterProtToPeps)
        mergedProtToPeps = _mergeProteinEntries(samesetProteins,
                                                clusterProtToPeps)
        mergedPepToProts = _invertMapping(mergedProtToPeps)
        uniqueProteins = _findUniqueMappingValues(mergedPepToProts)
        remainingProteins = set(mergedProtToPeps).difference(uniqueProteins)

        # Remove subset proteins and check if remaining proteins become unique
        subsetProteinInfo = _findSubsetProteins(remainingProteins,
                                                mergedProtToPeps,
                                                mergedPepToProts)
        # Each entry of subsetProteinInfo is a (protein, supersetProteins) pair
        subsetProteins = [p for p, _ in subsetProteinInfo]
        subsetRemovedProtToPeps = _reducedProtToPeps(mergedProtToPeps,
                                                     subsetProteins)
        subsetRemovedPepToProts = _invertMapping(subsetRemovedProtToPeps)
        uniqueSubsetRemoved = _findUniqueMappingValues(subsetRemovedPepToProts)
        remainingProteins = remainingProteins.difference(subsetProteins)
        remainingProteins = remainingProteins.difference(uniqueSubsetRemoved)

        # Find redundant proteins #
        subsumableProteins = _findRedundantProteins(subsetRemovedProtToPeps,
                                                    subsetRemovedPepToProts)
        remainingNonRedundant = remainingProteins.difference(
            subsumableProteins)
        groupInitiatingProteins = uniqueSubsetRemoved.union(
            remainingNonRedundant)

        # - Generate protein groups and assign proteins to groups - #
        #Generate protein groups
        clusterGroupIds = set()
        for protein in groupInitiatingProteins:
            # presumably a merged sameset entry holds several protein ids and
            # toList() yields a list in either case - TODO confirm
            proteinIds = AUX.toList(protein)

            # The first protein id is passed on to create the group
            groupId = inference.addProteinGroup(proteinIds[0])
            inference.addLeadingToGroups(proteinIds, groupId)
            clusterGroupIds.add(groupId)

        #Add redundant proteins here (must be subsumable I guess)
        for protein in subsumableProteins:
            proteinIds = AUX.toList(protein)

            # Look up the groups of all proteins that are connected to this
            # protein via its peptides
            connectedProteins = _mappingGetValueSet(mergedPepToProts,
                                                    mergedProtToPeps[protein])
            flatConnectedProteins = _flattenMergedProteins(connectedProteins)
            groupIds = _mappingGetValueSet(inference._proteinToGroupIds,
                                           flatConnectedProteins)
            inference.addSubsumableToGroups(proteinIds, groupIds)
            # Consistency check: more than one group id is expected here
            assert len(groupIds) > 1

        #Add subgroup proteins to the respective groups
        #NOTE: proteins that are only a subset of subsumable proteins are not
        #to be added as subset proteins to a group but as subsumable proteins.
        for protein, supersetProteins in subsetProteinInfo:
            proteinIds = AUX.toList(protein)

            #If the protein is a subset of at least one protein, that is not a
            #subsumable protein, then it should be added to the group as subset.
            leadingSuperProteins = supersetProteins.intersection(
                groupInitiatingProteins)
            if leadingSuperProteins:
                flatSupersetProteins = _flattenMergedProteins(
                    leadingSuperProteins)
                superGroupIds = _mappingGetValueSet(
                    inference._proteinToGroupIds, flatSupersetProteins)
                inference.addSubsetToGroups(proteinIds, superGroupIds)
            #However, if all its super proteins are subsumable, the protein
            #itself is a subsumable protein.
            else:
                flatSupersetProteins = _flattenMergedProteins(supersetProteins)
                superGroupIds = _mappingGetValueSet(
                    inference._proteinToGroupIds, flatSupersetProteins)
                inference.addSubsumableToGroups(proteinIds, superGroupIds)
                subsumableProteins.update(proteinIds)
            # Consistency check: at least one group id must have been found
            assert superGroupIds

        # - Define peptide properties - #
        groupToPeps = dict()
        allSubsumablePeps = set()
        for groupId in clusterGroupIds:
            group = inference.groups[groupId]
            if group.subsumableProteins:
                subsumablePeptides = _mappingGetValueSet(
                    protToPeps, group.subsumableProteins)
                allSubsumablePeps.update(subsumablePeptides)

            groupPeptides = _mappingGetValueSet(protToPeps, group.proteins)
            groupToPeps[groupId] = groupPeptides
        pepToGroups = _invertMapping(groupToPeps)

        #Get unique peptides from peptide to protein mapping
        uniquePeptides = _findUniqueMappingKeys(mergedPepToProts)
        #Shared peptides have a groupPeptideCount > 1
        nonSharedPeptides = _findUniqueMappingKeys(pepToGroups)
        sharedPeptides = set(pepToGroups).difference(nonSharedPeptides)
        #Subsumable peptides are peptides from subsumable proteins that
        #are not shared peptides of multiple groups
        subsumablePeptides = allSubsumablePeps.difference(sharedPeptides)
        #groupUniquePeptides are the remaining ones (not shared with subsumable
        #proteins, groupPeptideCount == 1, not unique peptides)
        groupUniquePeptides = nonSharedPeptides.difference(subsumablePeptides)
        groupUniquePeptides = groupUniquePeptides.difference(uniquePeptides)

        # Add the peptide classes of this cluster to the inference object
        inference._uniquePeptides.update(uniquePeptides)
        inference._groupUniquePeptides.update(groupUniquePeptides)
        inference._groupSubsumablePeptides.update(subsumablePeptides)
        inference._sharedPeptides.update(sharedPeptides)

        # - Generate protein entries and add them to the inference object - #
        subsetProteinInfoDict = dict(subsetProteinInfo)
        for protein, peptides in viewitems(mergedProtToPeps):
            # The peptide classes are computed once per merged entry and are
            # assigned to every protein id of that entry
            _uniquePeptides = peptides.intersection(uniquePeptides)
            _groupUniquePeptides = peptides.intersection(groupUniquePeptides)
            _subsumablePeptides = peptides.intersection(subsumablePeptides)
            _sharedPeptides = peptides.intersection(sharedPeptides)
            proteinIds = AUX.toList(protein)
            for proteinId in proteinIds:
                proteinEntry = Protein(proteinId, peptides)
                if protein in groupInitiatingProteins:
                    proteinEntry.isLeading = True
                elif protein in subsumableProteins:
                    proteinEntry.isSubsumable = True
                if protein in subsetProteins:
                    # isSubset is set to the flattened superset proteins
                    superset = subsetProteinInfoDict[protein]
                    proteinEntry.isSubset = _flattenMergedProteins(superset)
                if len(proteinIds) > 1:
                    # isSameset is set to all protein ids of the merged entry
                    proteinEntry.isSameset = set(proteinIds)
                inference.proteins[proteinId] = proteinEntry

                #Add peptides to protein entry
                proteinEntry.uniquePeptides = _uniquePeptides
                proteinEntry.groupUniquePeptides = _groupUniquePeptides
                proteinEntry.groupSubsumablePeptides = _subsumablePeptides
                proteinEntry.sharedPeptides = _sharedPeptides

        # - Save cluster information - #
        for proteinId in proteinCluster:
            inference._proteinToClusterId[proteinId] = clusterId
        inference.clusters[clusterId] = clusterGroupIds

    # Consistency check: the number of distinct proteins found in the groups
    # must be equal to the number of proteins in "protToPeps"
    allProteins = set()
    for proteinGroup in viewvalues(inference.groups):
        allProteins.update(proteinGroup.proteins)
        allProteins.update(proteinGroup.subsumableProteins)
    assert len(allProteins) == len(protToPeps)
    return inference
Exemplo n.º 18
0
def mappingBasedGrouping(protToPeps):
    """Performs protein grouping based only on protein to peptide mappings.

    The proteins are split into clusters by ``_findProteinClusters()`` and
    every cluster is processed on its own in five steps:

    1. sameset, subset and redundant (subsumable) proteins are determined
       with the respective ``_find...`` helper functions
    2. a protein group is created for every group initiating protein, then
       subsumable and subset proteins are assigned to the groups
    3. peptides are classified as unique, group unique, group subsumable or
       shared
    4. a ``Protein`` entry is generated for every protein id and added to the
       inference object
    5. the cluster information is stored in the inference object

    :param protToPeps: dict, for each protein (=key) contains a set of
        associated peptides (=value). For Example {protein: {peptide, ...}, ...}

    #TODO: REFACTORING!!!

    :returns: a ProteinInference object
    :raises AssertionError: if one of the internal consistency checks fails
    """
    inference = ProteinInference(protToPeps)
    pepToProts = inference.pepToProts

    proteinClusters = _findProteinClusters(protToPeps, pepToProts)
    # NOTE(review): this local is never read within this function
    proteins = {}
    # Cluster ids start at 1
    for clusterId, proteinCluster in enumerate(proteinClusters, 1):
        clusterProtToPeps = {p: protToPeps[p] for p in proteinCluster}

        #Find sameset proteins, define unique and non unique sameset proteins
        #NOTE: already unique proteins could be excluded to find sameset proteins
        samesetProteins = _findSamesetProteins(clusterProtToPeps)
        mergedProtToPeps = _mergeProteinEntries(samesetProteins,
                                                clusterProtToPeps)
        mergedPepToProts = _invertMapping(mergedProtToPeps)
        uniqueProteins = _findUniqueMappingValues(mergedPepToProts)
        remainingProteins = set(mergedProtToPeps).difference(uniqueProteins)

        # Remove subset proteins and check if remaining proteins become unique
        subsetProteinInfo = _findSubsetProteins(remainingProteins,
                                                mergedProtToPeps,
                                                mergedPepToProts)
        # Each entry of subsetProteinInfo is a (protein, supersetProteins) pair
        subsetProteins = [p for p, _ in subsetProteinInfo]
        subsetRemovedProtToPeps = _reducedProtToPeps(mergedProtToPeps,
                                                     subsetProteins)
        subsetRemovedPepToProts = _invertMapping(subsetRemovedProtToPeps)
        uniqueSubsetRemoved = _findUniqueMappingValues(subsetRemovedPepToProts)
        remainingProteins = remainingProteins.difference(subsetProteins)
        remainingProteins = remainingProteins.difference(uniqueSubsetRemoved)

        # Find redundant proteins #
        subsumableProteins = _findRedundantProteins(subsetRemovedProtToPeps,
                                                    subsetRemovedPepToProts)
        remainingNonRedundant = remainingProteins.difference(subsumableProteins)
        groupInitiatingProteins = uniqueSubsetRemoved.union(remainingNonRedundant)

        # - Generate protein groups and assign proteins to groups - #
        #Generate protein groups
        clusterGroupIds = set()
        for protein in groupInitiatingProteins:
            # presumably a merged sameset entry holds several protein ids and
            # toList() yields a list in either case - TODO confirm
            proteinIds = AUX.toList(protein)

            # The first protein id is passed on to create the group
            groupId = inference.addProteinGroup(proteinIds[0])
            inference.addLeadingToGroups(proteinIds, groupId)
            clusterGroupIds.add(groupId)

        #Add redundant proteins here (must be subsumable I guess)
        for protein in subsumableProteins:
            proteinIds = AUX.toList(protein)

            # Look up the groups of all proteins that are connected to this
            # protein via its peptides
            connectedProteins = _mappingGetValueSet(
                mergedPepToProts, mergedProtToPeps[protein]
            )
            flatConnectedProteins = _flattenMergedProteins(connectedProteins)
            groupIds = _mappingGetValueSet(
                inference._proteinToGroupIds, flatConnectedProteins
            )
            inference.addSubsumableToGroups(proteinIds, groupIds)
            # Consistency check: more than one group id is expected here
            assert len(groupIds) > 1

        #Add subgroup proteins to the respective groups
        #NOTE: proteins that are only a subset of subsumable proteins are not
        #to be added as subset proteins to a group but as subsumable proteins.
        for protein, supersetProteins in subsetProteinInfo:
            proteinIds = AUX.toList(protein)

            #If the protein is a subset of at least one protein, that is not a
            #subsumable protein, then it should be added to the group as subset.
            leadingSuperProteins = supersetProteins.intersection(
                                                    groupInitiatingProteins)
            if leadingSuperProteins:
                flatSupersetProteins = _flattenMergedProteins(
                                                    leadingSuperProteins)
                superGroupIds = _mappingGetValueSet(
                    inference._proteinToGroupIds, flatSupersetProteins
                )
                inference.addSubsetToGroups(proteinIds, superGroupIds)
            #However, if all its super proteins are subsumable, the protein
            #itself is a subsumable protein.
            else:
                flatSupersetProteins = _flattenMergedProteins(supersetProteins)
                superGroupIds = _mappingGetValueSet(
                    inference._proteinToGroupIds, flatSupersetProteins
                )
                inference.addSubsumableToGroups(proteinIds, superGroupIds)
                subsumableProteins.update(proteinIds)
            # Consistency check: at least one group id must have been found
            assert superGroupIds

        # - Define peptide properties - #
        groupToPeps = dict()
        allSubsumablePeps = set()
        for groupId in clusterGroupIds:
            group = inference.groups[groupId]
            if group.subsumableProteins:
                subsumablePeptides = _mappingGetValueSet(
                    protToPeps, group.subsumableProteins
                )
                allSubsumablePeps.update(subsumablePeptides)

            groupPeptides = _mappingGetValueSet(protToPeps, group.proteins)
            groupToPeps[groupId] = groupPeptides
        pepToGroups = _invertMapping(groupToPeps)

        #Get unique peptides from peptide to protein mapping
        uniquePeptides = _findUniqueMappingKeys(mergedPepToProts)
        #Shared peptides have a groupPeptideCount > 1
        nonSharedPeptides = _findUniqueMappingKeys(pepToGroups)
        sharedPeptides = set(pepToGroups).difference(nonSharedPeptides)
        #Subsumable peptides are peptides from subsumable proteins that
        #are not shared peptides of multiple groups
        subsumablePeptides = allSubsumablePeps.difference(sharedPeptides)
        #groupUniquePeptides are the remaining ones (not shared with subsumable
        #proteins, groupPeptideCount == 1, not unique peptides)
        groupUniquePeptides = nonSharedPeptides.difference(subsumablePeptides)
        groupUniquePeptides = groupUniquePeptides.difference(uniquePeptides)

        # Add the peptide classes of this cluster to the inference object
        inference._uniquePeptides.update(uniquePeptides)
        inference._groupUniquePeptides.update(groupUniquePeptides)
        inference._groupSubsumablePeptides.update(subsumablePeptides)
        inference._sharedPeptides.update(sharedPeptides)

        # - Generate protein entries and add them to the inference object - #
        subsetProteinInfoDict = dict(subsetProteinInfo)
        for protein, peptides in viewitems(mergedProtToPeps):
            # The peptide classes are computed once per merged entry and are
            # assigned to every protein id of that entry
            _uniquePeptides = peptides.intersection(uniquePeptides)
            _groupUniquePeptides = peptides.intersection(groupUniquePeptides)
            _subsumablePeptides = peptides.intersection(subsumablePeptides)
            _sharedPeptides = peptides.intersection(sharedPeptides)
            proteinIds = AUX.toList(protein)
            for proteinId in proteinIds:
                proteinEntry = Protein(proteinId, peptides)
                if protein in groupInitiatingProteins:
                    proteinEntry.isLeading = True
                elif protein in subsumableProteins:
                    proteinEntry.isSubsumable = True
                if protein in subsetProteins:
                    # isSubset is set to the flattened superset proteins
                    superset = subsetProteinInfoDict[protein]
                    proteinEntry.isSubset = _flattenMergedProteins(superset)
                if len(proteinIds) > 1:
                    # isSameset is set to all protein ids of the merged entry
                    proteinEntry.isSameset = set(proteinIds)
                inference.proteins[proteinId] = proteinEntry

                #Add peptides to protein entry
                proteinEntry.uniquePeptides = _uniquePeptides
                proteinEntry.groupUniquePeptides = _groupUniquePeptides
                proteinEntry.groupSubsumablePeptides = _subsumablePeptides
                proteinEntry.sharedPeptides = _sharedPeptides

        # - Save cluster information - #
        for proteinId in proteinCluster:
            inference._proteinToClusterId[proteinId] = clusterId
        inference.clusters[clusterId] = clusterGroupIds

    # Consistency check: the number of distinct proteins found in the groups
    # must be equal to the number of proteins in "protToPeps"
    allProteins = set()
    for proteinGroup in viewvalues(inference.groups):
        allProteins.update(proteinGroup.proteins)
        allProteins.update(proteinGroup.subsumableProteins)
    assert len(allProteins) == len(protToPeps)
    return inference
Exemplo n.º 19
0
def expectedLabelPosition(peptide,
                          labelStateInfo,
                          sequence=None,
                          modPositions=None):
    """Return the label modifications expected for one label state of a
    peptide.

    :param peptide: peptide sequence for which the expected label
        modifications are determined
    :param labelStateInfo: an entry of :attr:`LabelDescriptor.labels` that
        describes a label state; its ``'aminoAcidLabels'`` and
        ``'excludingModifications'`` entries are read here
    :param sequence: unmodified amino acid sequence of ``peptide``, if None
        it is generated by :func:`maspy.peptidemethods.removeModifications()`
    :param modPositions: dictionary describing the modification state of
        ``peptide``, if None it is generated by
        :func:`maspy.peptidemethods.returnModPositions()` with
        ``indexStart=0``

    :returns: {sequence position: sorted list of expected label modifications
                  on that position, ...
               }
    """
    if modPositions is None:
        modPositions = maspy.peptidemethods.returnModPositions(peptide,
                                                               indexStart=0)
    if sequence is None:
        sequence = maspy.peptidemethods.removeModifications(peptide)

    # Collect every label symbol expected at each sequence position.
    expected = dict()
    for target, symbols in viewitems(labelStateInfo['aminoAcidLabels']):
        symbols = aux.toList(symbols)
        if symbols == ['']:
            # An empty symbol means this target carries no label.
            continue
        if target == 'nTerm':
            # N-terminal labels are stored at position 0.
            positions = [0]
        else:
            positions = aux.findAllSubstrings(sequence, target)
        for position in positions:
            expected.setdefault(position, list()).extend(symbols)

    # Drop a label wherever an excluding modification sits on the same
    # position; a position left without labels is removed entirely.
    exclusions = labelStateInfo['excludingModifications']
    if exclusions is not None:
        for blockingMod, blockedSymbol in viewitems(exclusions):
            if blockingMod not in modPositions:
                continue
            for position in modPositions[blockingMod]:
                symbolsHere = expected.get(position)
                if symbolsHere is None or blockedSymbol not in symbolsHere:
                    continue
                # Only the first occurrence of the symbol is removed.
                symbolsHere.remove(blockedSymbol)
                if not symbolsHere:
                    del expected[position]

    return {
        position: sorted(symbolsHere)
        for position, symbolsHere in viewitems(expected)
    }
Exemplo n.º 20
0
 def test_toList(self):
     """Check ``MODULE.toList`` against a tuple, a str and an int input.

     The tuple is expected back as an equal tuple, while the str and the int
     are each expected to be wrapped in a single-element list.
     """
     cases = [
         ((1, 2, 3, 'A'), (1, 2, 3, 'A')),
         ('A', ['A']),
         (123, [123]),
     ]
     for given, expected in cases:
         self.assertEqual(MODULE.toList(given), expected)
Exemplo n.º 21
0
def matchToFeatures(fiContainer, specContainer, specfiles=None, fMassKey='mz',
                    sMassKey='obsMz', isotopeErrorList=(0),
                    precursorTolerance=5, toleranceUnit='ppm',
                    rtExpansionUp=0.10, rtExpansionDown=0.05, matchCharge=True,
                    scoreKey='pep', largerBetter=False):
    """Annotate :class:`Fi <maspy.core.Fi>` (Feature items) by matching
    :class:`Si <maspy.core.Si>` (Spectrum items) or :class:`Sii
    <maspy.core.Sii>` (Spectrum identification items).

    Matched features are modified in place: the spectrum id is appended to
    their ``siIds`` or ``siiIds`` list and ``isMatched`` is set to True. If
    ``specContainer`` is a ``SiiContainer`` the attributes ``isAnnotated``,
    ``score``, ``peptide`` and ``sequence`` are set as well. Matching
    statistics are printed for every processed specfile. Nothing is returned.

    :param fiContainer: :class:`maspy.core.FeatureContainer`, contains ``Fi``.
    :param specContainer: :class:`maspy.core.MsrunContainer` or
        :class:`maspy.core.SiiContainer`, contains ``Si`` or ``Sii``.
    :param specfiles: filenames of ms-run files, if specified consider only
        items from those files. Only specfiles that are also present in
        ``specContainer.info`` are processed.
    :type specfiles: str, list or None
    :param fMassKey: mass attribute key in :attr:`Fi.__dict__`
    :param sMassKey: mass attribute key in :attr:`Si.__dict__` or
        :attr:`Sii.__dict__` (eg 'obsMz', 'excMz')
    :param isotopeErrorList: allowed isotope errors relative to the spectrum
        mass, for example "0" or "1". If no feature has been matched with
        isotope error 0, the spectrum mass is increased by the mass difference
        of carbon isotopes 12 and 13 and matched again. The different isotope
        error values are tested in the specified order therefore "0" should
        normally be the first value of the list. A single int is accepted as
        well (the default ``(0)`` is the int 0), it is passed through
        ``aux.toList()`` before use.
    :type isotopeErrorList: int, list or tuple of int
    :param precursorTolerance: the largest allowed mass deviation of ``Si`` or
        ``Sii`` relative to ``Fi``
    :param toleranceUnit: defines how the ``precursorTolerance`` is applied to
        the mass value of ``Fi``. ``"ppm": mass * (1 +/- tolerance*1E-6)`` or
        ``"da": mass +/- value``. The value is compared case insensitively.
    :param rtExpansionUp: relative upper expansion of ``Fi`` retention time
        area. ``limitHigh = Fi.rtHigh + (Fi.rtHigh - Fi.rtLow) * rtExpansionUp``
    :param rtExpansionDown: relative lower expansion of ``Fi`` retention time
        area. ``limitLow = Fi.rtLow - (Fi.rtHigh - Fi.rtLow) * rtExpansionDown``
    :param matchCharge: bool, True if ``Fi`` and ``Si`` or ``Sii`` must have the
        same ``charge`` state to be matched.
    :param scoreKey: ``Sii`` attribute name used for scoring the identification
        reliability
    :param largerBetter: bool, True if higher score value means a better
        identification reliability

    :raises ValueError: if ``toleranceUnit`` is neither "ppm" nor "da"

    .. note::
        Concerning the feature retention area expansion. If ``Si`` or ``Sii`` is
        matched to multiple ``Fi`` the rt expansion is removed and the matching
        is repeated.

    .. note::
        If the ``specContainer`` is a ``SiiContainer`` then matched ``Fi`` are
        annotated with :attr:`Sii.peptide`, if multiple ``Sii`` are matched to
        ``Fi`` the one with the best score is used.

    #TODO: this function is nested pretty badly and should maybe be rewritten
    #TODO: replace tolerance unit "ppm" by tolerance mode "relative" and change
        respective calculations
    """
    # Reject unsupported units up front; otherwise the mass limits would be
    # left undefined (or stale from a previous spectrum) further below.
    toleranceMode = toleranceUnit.lower()
    if toleranceMode not in ('ppm', 'da'):
        raise ValueError('toleranceUnit must be "ppm" or "da", got %r'
                         % (toleranceUnit, ))

    # Mass added per isotope error step (divided by the charge state below).
    isotopeMassShift = 1.003355
    # Retention time windows in the order they are tried: expanded first,
    # the unexpanded one only if the expanded window is ambiguous.
    rtWindowKeys = [('rtHighExpanded', 'rtLowExpanded'), ('rtHigh', 'rtLow')]

    isotopeErrorList = aux.toList(isotopeErrorList)

    if specContainer.__class__.__name__ == 'MsrunContainer':
        listKeySpecIds = 'siIds'
    else:
        listKeySpecIds = 'siiIds'
    specContainerSpecfiles = list(viewkeys(specContainer.info))

    if specfiles is not None:
        specfiles = aux.toList(specfiles)
    else:
        specfiles = list(viewkeys(fiContainer.info))
    specfiles = list(set(specfiles).intersection(set(specContainerSpecfiles)))

    for specfile in specfiles:
        multiMatchCounter = 0
        isotopeErrorMatchCounter = 0
        # NOTE(review): the masks and slices below assume getArrays() returns
        # numpy arrays, with featureArrays sorted ascending by fMassKey (which
        # bisect requires) -- confirm against the container implementation.
        specArrays = specContainer.getArrays(
            [sMassKey, 'rt', 'charge', 'msLevel'], specfiles=specfile)
        featureArrays = fiContainer.getArrays(
            ['rtHigh', 'rtLow', 'charge', fMassKey], specfiles=specfile,
            sort=fMassKey)
        rtWidth = featureArrays['rtHigh'] - featureArrays['rtLow']
        featureArrays['rtHighExpanded'] = (featureArrays['rtHigh'] +
                                           rtWidth * rtExpansionUp)
        featureArrays['rtLowExpanded'] = (featureArrays['rtLow'] -
                                          rtWidth * rtExpansionDown)

        specFeatureDict = dict()  # key = specId, value = matched featureId
        featureSpecDict = dict()  # key = featureId, value = last matched specId

        for specPos, specId in enumerate(specArrays['id']):
            specZ = specArrays['charge'][specPos]
            if specZ is None:
                continue
            specMass = specArrays[sMassKey][specPos]
            specRt = specArrays['rt'][specPos]

            matchComplete = False
            matchedFeatureIds = ()

            # Try the isotope errors in the given order and stop at the first
            # one that yields any match; an empty list matches nothing.
            for isotopeError in isotopeErrorList:
                shiftedMass = specMass + isotopeError * isotopeMassShift / specZ
                if toleranceMode == 'ppm':
                    specMassHigh = shiftedMass * (1 + precursorTolerance * 1E-6)
                    specMassLow = shiftedMass * (1 - precursorTolerance * 1E-6)
                else:
                    specMassHigh = shiftedMass + precursorTolerance
                    specMassLow = shiftedMass - precursorTolerance

                # Candidate features are those inside the mass window.
                posL = bisect.bisect_left(featureArrays[fMassKey], specMassLow)
                posR = bisect.bisect_right(featureArrays[fMassKey],
                                           specMassHigh)
                candidateIds = featureArrays['id'][posL:posR]
                if matchCharge:
                    chargeMask = (featureArrays['charge'][posL:posR] == specZ)

                for fRtHighKey, fRtLowKey in rtWindowKeys:
                    rtMask = ((featureArrays[fRtLowKey][posL:posR] <= specRt) &
                              (featureArrays[fRtHighKey][posL:posR] >= specRt))
                    if matchCharge:
                        matchedFeatureIds = candidateIds[rtMask & chargeMask]
                    else:
                        matchedFeatureIds = candidateIds[rtMask]
                    if len(matchedFeatureIds) <= 1:
                        break

                if len(matchedFeatureIds) == 1:
                    matchComplete = True
                    if isotopeError != 0:
                        isotopeErrorMatchCounter += 1
                    break
                if len(matchedFeatureIds) > 1:
                    # Stop if spectrum can be matched to multiple features
                    multiMatchCounter += 1
                    break

            if matchComplete:
                for featureId in matchedFeatureIds:
                    feature = fiContainer.container[specfile][featureId]
                    getattr(feature, listKeySpecIds).append(specId)
                    feature.isMatched = True
                    specFeatureDict[specId] = featureId
                    featureSpecDict[featureId] = specId

        stats = dict()
        stats['totalFeatures'] = len(featureArrays['id'])
        stats['matchedFeatures'] = len(featureSpecDict)
        stats['totalSpectra'] = len(
            specArrays['id'][(specArrays['msLevel'] != 1)])
        stats['matchedSpectra'] = len(specFeatureDict)
        # Report 0.0 % instead of raising ZeroDivisionError when a specfile
        # has no features or no spectra with msLevel != 1.
        stats['relMatchedFeatures'] = (
            round(100.0 * stats['matchedFeatures'] / stats['totalFeatures'], 1)
            if stats['totalFeatures'] else 0.0)
        stats['relMatchedSpectra'] = (
            round(100.0 * stats['matchedSpectra'] / stats['totalSpectra'], 1)
            if stats['totalSpectra'] else 0.0)

        print('------', specfile, '------')
        print('Annotated features:\t\t\t', stats['matchedFeatures'], '/',
              stats['totalFeatures'], '=', stats['relMatchedFeatures'], '%')
        print('Spectra matched to features:\t\t', stats['matchedSpectra'], '/',
              stats['totalSpectra'], '=', stats['relMatchedSpectra'], '%')
        if multiMatchCounter != 0:
            print('Discarded because of multiple matches:\t',
                  multiMatchCounter)
        if isotopeErrorMatchCounter != 0:
            print('Isotope error matched spectra:\t\t',
                  isotopeErrorMatchCounter)

        # annotate feature with sii information (peptide, sequence, score)
        if isinstance(specContainer, maspy.core.SiiContainer):
            for featureId in viewkeys(featureSpecDict):
                feature = fiContainer.container[specfile][featureId]
                matches = list()
                for specId in feature.siiIds:
                    _sii = specContainer.getValidItem(specfile, specId)
                    matches.append([getattr(_sii, scoreKey), _sii.peptide,
                                    _sii.sequence])
                # Best score first: descending if larger is better.
                matches.sort(reverse=largerBetter)

                feature.isAnnotated = True
                feature.score = matches[0][0]
                feature.peptide = matches[0][1]
                feature.sequence = matches[0][2]
Exemplo n.º 22
0
def matchToFeatures(fiContainer,
                    specContainer,
                    specfiles=None,
                    fMassKey='mz',
                    sMassKey='obsMz',
                    isotopeErrorList=(0),
                    precursorTolerance=5,
                    toleranceUnit='ppm',
                    rtExpansionUp=0.10,
                    rtExpansionDown=0.05,
                    matchCharge=True,
                    scoreKey='pep',
                    largerBetter=False):
    """Annotate :class:`Fi <maspy.core.Fi>` (Feature items) by matching
    :class:`Si <maspy.core.Si>` (Spectrum items) or :class:`Sii
    <maspy.core.Sii>` (Spectrum identification items).

    Matched features are modified in place: the spectrum id is appended to
    their ``siIds`` or ``siiIds`` list and ``isMatched`` is set to True. If
    ``specContainer`` is a ``SiiContainer`` the attributes ``isAnnotated``,
    ``score``, ``peptide`` and ``sequence`` are set as well. Matching
    statistics are printed for every processed specfile. Nothing is returned.

    :param fiContainer: :class:`maspy.core.FeatureContainer`, contains ``Fi``.
    :param specContainer: :class:`maspy.core.MsrunContainer` or
        :class:`maspy.core.SiiContainer`, contains ``Si`` or ``Sii``.
    :param specfiles: filenames of ms-run files, if specified consider only
        items from those files. Only specfiles that are also present in
        ``specContainer.info`` are processed.
    :type specfiles: str, list or None
    :param fMassKey: mass attribute key in :attr:`Fi.__dict__`
    :param sMassKey: mass attribute key in :attr:`Si.__dict__` or
        :attr:`Sii.__dict__` (eg 'obsMz', 'excMz')
    :param isotopeErrorList: allowed isotope errors relative to the spectrum
        mass, for example "0" or "1". If no feature has been matched with
        isotope error 0, the spectrum mass is increased by the mass difference
        of carbon isotopes 12 and 13 and matched again. The different isotope
        error values are tested in the specified order therefore "0" should
        normally be the first value of the list. A single int is accepted as
        well (the default ``(0)`` is the int 0), it is passed through
        ``aux.toList()`` before use.
    :type isotopeErrorList: int, list or tuple of int
    :param precursorTolerance: the largest allowed mass deviation of ``Si`` or
        ``Sii`` relative to ``Fi``
    :param toleranceUnit: defines how the ``precursorTolerance`` is applied to
        the mass value of ``Fi``. ``"ppm": mass * (1 +/- tolerance*1E-6)`` or
        ``"da": mass +/- value``. The value is compared case insensitively.
    :param rtExpansionUp: relative upper expansion of ``Fi`` retention time
        area. ``limitHigh = Fi.rtHigh + (Fi.rtHigh - Fi.rtLow) * rtExpansionUp``
    :param rtExpansionDown: relative lower expansion of ``Fi`` retention time
        area. ``limitLow = Fi.rtLow - (Fi.rtHigh - Fi.rtLow) * rtExpansionDown``
    :param matchCharge: bool, True if ``Fi`` and ``Si`` or ``Sii`` must have the
        same ``charge`` state to be matched.
    :param scoreKey: ``Sii`` attribute name used for scoring the identification
        reliability
    :param largerBetter: bool, True if higher score value means a better
        identification reliability

    :raises ValueError: if ``toleranceUnit`` is neither "ppm" nor "da"

    .. note::
        Concerning the feature retention area expansion. If ``Si`` or ``Sii`` is
        matched to multiple ``Fi`` the rt expansion is removed and the matching
        is repeated.

    .. note::
        If the ``specContainer`` is a ``SiiContainer`` then matched ``Fi`` are
        annotated with :attr:`Sii.peptide`, if multiple ``Sii`` are matched to
        ``Fi`` the one with the best score is used.

    #TODO: this function is nested pretty badly and should maybe be rewritten
    #TODO: replace tolerance unit "ppm" by tolerance mode "relative" and change
        respective calculations
    """
    # Reject unsupported units up front; otherwise the mass limits would be
    # left undefined (or stale from a previous spectrum) further below.
    toleranceMode = toleranceUnit.lower()
    if toleranceMode not in ('ppm', 'da'):
        raise ValueError('toleranceUnit must be "ppm" or "da", got %r'
                         % (toleranceUnit, ))

    # Mass added per isotope error step (divided by the charge state below).
    isotopeMassShift = 1.003355
    # Retention time windows in the order they are tried: expanded first,
    # the unexpanded one only if the expanded window is ambiguous.
    rtWindowKeys = [('rtHighExpanded', 'rtLowExpanded'), ('rtHigh', 'rtLow')]

    isotopeErrorList = aux.toList(isotopeErrorList)

    if specContainer.__class__.__name__ == 'MsrunContainer':
        listKeySpecIds = 'siIds'
    else:
        listKeySpecIds = 'siiIds'
    specContainerSpecfiles = list(viewkeys(specContainer.info))

    if specfiles is not None:
        specfiles = aux.toList(specfiles)
    else:
        specfiles = list(viewkeys(fiContainer.info))
    specfiles = list(set(specfiles).intersection(set(specContainerSpecfiles)))

    for specfile in specfiles:
        multiMatchCounter = 0
        isotopeErrorMatchCounter = 0
        # NOTE(review): the masks and slices below assume getArrays() returns
        # numpy arrays, with featureArrays sorted ascending by fMassKey (which
        # bisect requires) -- confirm against the container implementation.
        specArrays = specContainer.getArrays(
            [sMassKey, 'rt', 'charge', 'msLevel'], specfiles=specfile)
        featureArrays = fiContainer.getArrays(
            ['rtHigh', 'rtLow', 'charge', fMassKey],
            specfiles=specfile,
            sort=fMassKey)
        rtWidth = featureArrays['rtHigh'] - featureArrays['rtLow']
        featureArrays['rtHighExpanded'] = (
            featureArrays['rtHigh'] + rtWidth * rtExpansionUp)
        featureArrays['rtLowExpanded'] = (
            featureArrays['rtLow'] - rtWidth * rtExpansionDown)

        specFeatureDict = dict()  # key = specId, value = matched featureId
        featureSpecDict = dict()  # key = featureId, value = last matched specId

        for specPos, specId in enumerate(specArrays['id']):
            specZ = specArrays['charge'][specPos]
            if specZ is None:
                continue
            specMass = specArrays[sMassKey][specPos]
            specRt = specArrays['rt'][specPos]

            matchComplete = False
            matchedFeatureIds = ()

            # Try the isotope errors in the given order and stop at the first
            # one that yields any match; an empty list matches nothing.
            for isotopeError in isotopeErrorList:
                shiftedMass = specMass + isotopeError * isotopeMassShift / specZ
                if toleranceMode == 'ppm':
                    specMassHigh = shiftedMass * (1 + precursorTolerance * 1E-6)
                    specMassLow = shiftedMass * (1 - precursorTolerance * 1E-6)
                else:
                    specMassHigh = shiftedMass + precursorTolerance
                    specMassLow = shiftedMass - precursorTolerance

                # Candidate features are those inside the mass window.
                posL = bisect.bisect_left(featureArrays[fMassKey], specMassLow)
                posR = bisect.bisect_right(featureArrays[fMassKey],
                                           specMassHigh)
                candidateIds = featureArrays['id'][posL:posR]
                if matchCharge:
                    chargeMask = (featureArrays['charge'][posL:posR] == specZ)

                for fRtHighKey, fRtLowKey in rtWindowKeys:
                    rtMask = ((featureArrays[fRtLowKey][posL:posR] <= specRt) &
                              (featureArrays[fRtHighKey][posL:posR] >= specRt))
                    if matchCharge:
                        matchedFeatureIds = candidateIds[rtMask & chargeMask]
                    else:
                        matchedFeatureIds = candidateIds[rtMask]
                    if len(matchedFeatureIds) <= 1:
                        break

                if len(matchedFeatureIds) == 1:
                    matchComplete = True
                    if isotopeError != 0:
                        isotopeErrorMatchCounter += 1
                    break
                if len(matchedFeatureIds) > 1:
                    # Stop if spectrum can be matched to multiple features
                    multiMatchCounter += 1
                    break

            if matchComplete:
                for featureId in matchedFeatureIds:
                    feature = fiContainer.container[specfile][featureId]
                    getattr(feature, listKeySpecIds).append(specId)
                    feature.isMatched = True
                    specFeatureDict[specId] = featureId
                    featureSpecDict[featureId] = specId

        stats = dict()
        stats['totalFeatures'] = len(featureArrays['id'])
        stats['matchedFeatures'] = len(featureSpecDict)
        stats['totalSpectra'] = len(
            specArrays['id'][(specArrays['msLevel'] != 1)])
        stats['matchedSpectra'] = len(specFeatureDict)
        # Report 0.0 % instead of raising ZeroDivisionError when a specfile
        # has no features or no spectra with msLevel != 1.
        stats['relMatchedFeatures'] = (
            round(100.0 * stats['matchedFeatures'] / stats['totalFeatures'], 1)
            if stats['totalFeatures'] else 0.0)
        stats['relMatchedSpectra'] = (
            round(100.0 * stats['matchedSpectra'] / stats['totalSpectra'], 1)
            if stats['totalSpectra'] else 0.0)

        print('------', specfile, '------')
        print('Annotated features:\t\t\t', stats['matchedFeatures'], '/',
              stats['totalFeatures'], '=', stats['relMatchedFeatures'], '%')
        print('Spectra matched to features:\t\t', stats['matchedSpectra'], '/',
              stats['totalSpectra'], '=', stats['relMatchedSpectra'], '%')
        if multiMatchCounter != 0:
            print('Discarded because of multiple matches:\t',
                  multiMatchCounter)
        if isotopeErrorMatchCounter != 0:
            print('Isotope error matched spectra:\t\t',
                  isotopeErrorMatchCounter)

        # annotate feature with sii information (peptide, sequence, score)
        if isinstance(specContainer, maspy.core.SiiContainer):
            for featureId in viewkeys(featureSpecDict):
                feature = fiContainer.container[specfile][featureId]
                matches = list()
                for specId in feature.siiIds:
                    _sii = specContainer.getValidItem(specfile, specId)
                    matches.append([getattr(_sii, scoreKey), _sii.peptide,
                                    _sii.sequence])
                # Best score first: descending if larger is better.
                matches.sort(reverse=largerBetter)

                feature.isAnnotated = True
                feature.score = matches[0][0]
                feature.peptide = matches[0][1]
                feature.sequence = matches[0][2]