Example #1
 def __init__(self, path, *args, **kwargs):
     """
     Imports mzML file, adds the chromatogram into a single spectrum.
     :param path: .mzML file path
     :param args: arguments (unused)
     :param kwargs: keywords (unused)
     :return: mzMLimporter object
     """
     # TODO make this work
     """
     del sys.modules[mzFile]
     print "Breaking"
     if mzFile not in sys.modules or merge_spectra not in sys.modules:
         dlg = wx.MessageDialog(parent=None, message='Please install multiplierz and MSFileReader',
                                caption='Error', style=wx.OK)
         dlg.ShowModal()
         return
     """
     print "Reading Data:", path
     self.msrun = mzFile(path)
     self.scanrange = self.msrun.scan_range()
     self.scans = np.arange(self.scanrange[0], self.scanrange[1] + 1)
     self.times = []
     self.data = []
     for s in self.scans:
         impdat = np.array(self.msrun.scan(s))
         impdat = impdat[impdat[:, 0] > 10]
         self.data.append(impdat)
         self.times.append(self.msrun.scan_time_from_scan_name(s))
     self.times = np.array(self.times)
     self.data = np.array(self.data)
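A minimal usage sketch, assuming the __init__ above belongs to a class named mzMLimporter (as its docstring states); the file path is a placeholder:

# Hedged sketch: mzMLimporter is assumed to be the class owning the
# __init__ above; 'sample.mzML' is a placeholder path.
importer = mzMLimporter('sample.mzML')
print(importer.times)    # one retention time per scan
print(importer.data[0])  # first scan as an (N, 2) array of (m/z, intensity)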
Example #2
def getAcqPoints(datafile, resultFile):
    data = mzFile(datafile)
    scans = data.scan_info(0, 999999)
    ms2toms1 = {}
    ms1 = scans[0][2]
    ms2s = []
    assert scans[0][3] == 'MS1'
    for scan in scans:
        if scan[3] == 'MS1':
            for ms2 in ms2s:
                ms2toms1[ms2] = ms1
            ms1 = scan[2]
            ms2s = []
        elif scan[3] == 'MS2':
            ms2s.append(scan[2])
        else:
            raise Exception("Unidentified scan type of %s" % scan[3])
    for ms2 in ms2s:
        ms2toms1[ms2] = ms1

    acqPoints = []
    for result in resultFile:
        mz = spectrumDescriptionToMZ(result['Spectrum Description'])
        scan = spectrumDescriptionToScanNumber(result['Spectrum Description'])
        scan = data.timeForScan(ms2toms1[scan])
        acqPoints.append((mz, scan))

    return acqPoints
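A hedged usage sketch for getAcqPoints: resultFile is assumed to be an iterable of PSM rows with a 'Spectrum Description' field, such as the rows yielded by multiplierz's spreadsheet reader; the paths are placeholders.

from multiplierz.mzReport import reader

results = list(reader('results.xlsx'))
points = getAcqPoints('data.raw', results)
for mz, rt in points[:5]:
    print('%.4f m/z acquired near MS1 at %.2f min' % (mz, rt))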
Example #3
    def openDataFile(self, event):
        self.set_status("Opening MS data file...", 0)

        self.data = mzFile(self.dataCtrl.GetValue())

        self.display.setData(self.data,
                             lambda x: self.scanCtrl.SetValue(str(x)))
        self.set_status("Ready.", 0)
Example #4
 def __init__(self, path, *args, **kwargs):
     """
     Imports mzML file, adds the chromatogram into a single spectrum.
     :param path: .mzML file path
     :param args: arguments (unused)
     :param kwargs: keywords (unused)
     :return: mzMLimporter object
     """
     # TODO make this work
     """
     del sys.modules[mzFile]
     print "Breaking"
     if mzFile not in sys.modules or merge_spectra not in sys.module:
         dlg = dlg = wx.MessageDialog(parent=None, message='Please install multiplierz and MSFileReader',
                            caption='Error', style=wx.OK)
         dlg.ShowModal()
         return
         """
     print("Reading Data:", path)
     try:
         self.msrun = mzFile(path)
     except Exception:
         register()
         self.msrun = mzFile(path)
     self.scanrange = self.msrun.scan_range()
     # print(self.scanrange)
     self.scans = np.arange(self.scanrange[0], self.scanrange[1])
     self.times = []
     self.data = None
     for s in self.scans:
         s = s - 1
         try:
             self.times.append(self.msrun.scan_time_from_scan_name(s))
         except Exception as e:
             try:
                 t = self.msrun.info[s][0]
                 self.times.append(t)
             except Exception as e2:
                 try:
                     t = self.msrun.scan_info()[s][0]
                     self.times.append(t)
                 except Exception as e3:
                     print("Error getting scan times:", e, e2, e3)
                     print("Using Scan rather than Time")
                     self.times.append(s)
     self.times = np.array(self.times)
Example #5
def async_mzFile_internal(datafile, size_cap, input, output):
    try:
        from collections import deque
        from Queue import Empty

        data = mzFile(datafile)
        # Note: size_cap and cacheRecord go unused in this version, so the
        # cache below grows without eviction.
        cacheRecord = deque()
        cache = {}
        commands = []

        while True:
            if not commands:
                commands = [input.get(block=True)]

            while True:
                try:
                    commands.append(input.get_nowait())
                except Empty:
                    break

            if any(x[0] == 'close' for x in commands):
                break

            call = next((x for x in commands if x[0] == 'call'), None)
            if call:
                commands.remove(call)

                _, method, argkwarg = call
                callhash = arghash(method, argkwarg)
                if callhash in cache:
                    output.put((call, cache[callhash]))
                else:
                    args, kwargs = argkwarg
                    returnval = getattr(data, method)(*args, **kwargs)
                    cache[callhash] = returnval
                    output.put((call, returnval))

            else:
                assert commands
                com = commands[0]
                _, method, argkwarg = com
                callhash = arghash(method, argkwarg)
                commands = commands[1:]
                if callhash in cache:
                    continue
                args, kwargs = argkwarg
                returnval = getattr(data, method)(*args, **kwargs)
                cache[callhash] = returnval

        return
    except Exception as err:
        import traceback
        traceback.print_exc()
        print '------------------'
        raise err
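A hedged client sketch for this worker. The command tuples mirror what the loop above consumes: ('call', method_name, (args, kwargs)) requests a cached method call on the underlying mzFile, and any tuple starting with 'close' shuts the worker down; the path and size_cap value are placeholders.

import multiprocessing

inq = multiprocessing.Queue()
outq = multiprocessing.Queue()
worker = multiprocessing.Process(target=async_mzFile_internal,
                                 args=('data.raw', 100, inq, outq))
worker.start()

inq.put(('call', 'scan_range', ((), {})))  # request data.scan_range()
command, scan_range = outq.get()           # worker replies with (command, value)
print(scan_range)

inq.put(('close',))
worker.join()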
Example #6
def dataReaderProc(datafile, que, scanNumbers):
    try:
        data = mzFile(datafile)

        for scanNum in scanNumbers:
            scan = data.scan(scanNum, centroid=True)
            que.put((scanNum, scan), block=True)

        que.put('done')
        data.close()
    except Exception as err:
        import traceback
        print "READ THREAD ERROR."
        traceback.print_exc()
        print '------------------'
        raise err
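The queue protocol here is a sentinel pattern: (scanNum, scan) tuples followed by the literal string 'done'. A minimal driver sketch, the same wiring Example 7 uses ('data.raw' and the scan numbers are placeholders):

import multiprocessing

que = multiprocessing.Queue(maxsize=20)
reader = multiprocessing.Process(target=dataReaderProc,
                                 args=('data.raw', que, [1, 2, 3]))
reader.start()

item = que.get(block=True)
while item != 'done':
    scanNum, scan = item
    # ... process the centroided scan here ...
    item = que.get(block=True)
reader.join()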
Example #7
def detect_features(datafile, **constants):
    """
    Runs the feature detection algorithm on the target data file (currently,
    only Thermo .RAW is supported.)  Returns the path to the feature data
    file.
    
    Optional arguments:
    - tolerance (default 10): MZ tolerance in parts-per-million for all determinations
    of peak identity.  Should usually correspond to the mass precision of the
    source instrument.
    - force (default False): If True, feature detection is run even if a
    feature data file already exists for the target data.
    """

    if 'outputfile' in constants:
        featurefile = constants['outputfile']
    else:
        featurefile = datafile + '.features'

    if 'tolerance' in constants and constants['tolerance']:
        global tolerance
        tolerance = constants['tolerance']
        if tolerance < 1:
            print "\n\n\nWARNING- tolerance value for SILAC analysis should now be in PPM!\n\n\n"
    else:
        tolerance = 10

    if 'partial' in constants:
        # This is primarily for testing purposes only.
        scanrange = constants['partial']
    else:
        scanrange = None

    if 'force' in constants:
        force = constants['force']
    else:
        force = False

    if 'whitelist_psms' in constants:
        whitelist_mzs = constants['whitelist_psms']
        featurefile = datafile + '.partial%s.features' % (str(
            hash(frozenset(whitelist_mzs)))[:5])
    else:
        whitelist_mzs = None

    if 'peak_picking_params' in constants:
        peak_pick_params = constants['peak_picking_params']
    elif 'tolerance' in constants and constants['tolerance']:
        peak_pick_params = {'tolerance': constants['tolerance']}
    else:
        peak_pick_params = {'tolerance': 10}

    if os.path.exists(featurefile) and not force:
        vprint("Feature data file already exists: %s" % featurefile)
        return featurefile

    setGlobals(constants)

    times = []
    times.append(time.clock())
    data = mzFile(datafile)

    times.append(time.clock())
    vprint("Opened data file; getting isotopes...")

    scaninfo = [x for x in data.scan_info(0, 99999999) if x[3] == 'MS1']
    rtLookup = dict([(x[2], x[0]) for x in scaninfo])
    scaninfo = [x[2] for x in scaninfo]

    if scanrange:
        scaninfo = [x for x in scaninfo if scanrange[0] < x < scanrange[1]]

    data.close()

    que = multiprocessing.Queue(maxsize=20)
    reader = multiprocessing.Process(target=dataReaderProc,
                                     args=(datafile, que, scaninfo))
    reader.start()

    isotopeData = deque()
    thing = que.get(block=True)
    bar = 0
    while thing != 'done':
        scanNum, scan = thing
        foo = time.clock()
        isotopeData.append((scanNum, peak_pick_PPM(scan,
                                                   **peak_pick_params)[0]))
        bar += time.clock() - foo

        thing = que.get(block=True)

        if verbose_mode and len(isotopeData) % 100 == 0:
            print len(isotopeData)  # Shielded by explicit verbose_mode check.

    reader.join()
    # Could just discard the un-feature'd peaks immediately.
    vprint("Isotopic features acquired; finding features over time...")

    times.append(time.clock())

    ms1ToIndex = {}
    indexToMS1 = {}
    for index, scanNum in enumerate(scaninfo):
        ms1ToIndex[scanNum] = index
        indexToMS1[index] = scanNum

    isotopesByChargePoint = defaultdict(lambda: defaultdict(
        lambda: ProximityIndexedSequence([], lambda x: x[0][0])))
    allIsotopes = []
    for scanNum, isotopesByCharge in isotopeData:
        scanIndex = ms1ToIndex[scanNum]
        for charge, isotopes in isotopesByCharge.items():
            for isoSeq in isotopes:
                isotopesByChargePoint[charge][scanIndex].add(isoSeq)
                allIsotopes.append((isoSeq, scanIndex, charge))

    del isotopeData

    for scanlookup in isotopesByChargePoint.values():
        for proxseq in scanlookup.values():
            proxseq.rebalance()

    if whitelist_mzs:
        vprint("Screening out irrelevant MZs; starting with %s..." %
               len(allIsotopes))
        allIsotopes.sort(key=lambda x: x[0][0][0])
        whitelist_mzs = sorted(list(set([round(x, 2) for x in whitelist_mzs])))
        isoAcc = []
        whitemz = whitelist_mzs.pop()
        while allIsotopes:
            iso = allIsotopes.pop()
            mz = iso[0][0][0]
            while whitelist_mzs and whitemz - mz > whitelist_tol:
                whitemz = whitelist_mzs.pop()
            if abs(whitemz - mz) < whitelist_tol:
                isoAcc.append(iso)

        allIsotopes = isoAcc
        vprint("...%s remain." % len(allIsotopes))

    allIsotopes.sort(key=lambda x: x[0][0][1])

    times.append(time.clock())

    seenIsotopes = set()
    # Can assume isotopic sequences are unique because floats.
    # (But it may not be a valid assumption, because detectors
    # and floating point approximations!)

    featureList = []
    while allIsotopes:
        highIso, highScan, highChg = allIsotopes.pop()
        if tuple(highIso) in seenIsotopes:
            continue

        centerIndex, (centerMZ, _) = max(enumerate(highIso),
                                         key=lambda x: x[1][1])

        newFeature = [[highScan, highIso]]
        curScan = highScan
        continuing = True
        lastSeen = rtLookup[indexToMS1[curScan]]
        while continuing:  # Trailing the feature backwards.
            curScan -= 1
            try:
                curRT = rtLookup[indexToMS1[curScan]]
            except KeyError:
                assert curScan < max(indexToMS1.keys())
                break

            scanSeqs = isotopesByChargePoint[highChg][curScan].returnRange(
                centerMZ - 2, centerMZ + 1.5)
            scanSeqs.sort(key=lambda x: x[centerIndex][1], reverse=True)

            found = False
            for iso in scanSeqs:  # These are known to have centerMZ in common.
                # The indexes between iso and highIso may not be equivalent
                # if there's sub-C12 peak(s) in either.  For a first draft
                # this can be considered a feature, since C12s should be
                # consistent throughout features, but in some cases like
                # single-scan-dropouts of the C12 this is insufficient
                # and such discrepancies should be accounted for.

                if (inPPM(tolerance, iso[0][0], highIso[0][0])
                        and inPPM(tolerance, iso[1][0], highIso[1][0])
                        and tuple(iso) not in seenIsotopes):
                    newFeature.append([curScan, iso])
                    found = True
                    break  # From "for iso in scanSeqs"

            if found:
                lastSeen = curRT
            elif abs(curRT - lastSeen) > dropoutTimeTolerance:
                continuing = False

        curScan = highScan
        continuing = True
        lastSeen = rtLookup[indexToMS1[curScan]]
        while continuing:  # Trailing the feature forwards; mostly repeat code.
            curScan += 1
            try:
                curRT = rtLookup[indexToMS1[curScan]]
            except KeyError:
                assert curScan > max(indexToMS1.keys())
                break

            scanSeqs = isotopesByChargePoint[highChg][curScan].returnRange(
                centerMZ - 2, centerMZ + 1.5)
            scanSeqs.sort(key=lambda x: x[centerIndex][1], reverse=True)

            found = False
            for iso in scanSeqs:  # These are known to have centerMZ in common.
                # Ditto.

                if (inPPM(tolerance, iso[0][0], highIso[0][0])
                        and inPPM(tolerance, iso[1][0], highIso[1][0])
                        and tuple(iso) not in seenIsotopes):
                    newFeature.append([curScan, iso])
                    found = True
                    break  # From "for iso in scanSeqs"

            if found:
                lastSeen = curRT
            elif abs(curRT - lastSeen) > dropoutTimeTolerance:
                continuing = False

        if len(newFeature) > 1:
            featureList.append((highChg, newFeature))

        for _, iso in newFeature:
            seenIsotopes.add(tuple(iso))
    times.append(time.clock())

    for chg, feature in featureList:
        for stage in feature:
            stage[0] = indexToMS1[stage[0]]

    class idLookup():
        # Identity lookup: scan indices were already mapped back to scan
        # numbers above, so calculate_bounds gets a pass-through table.
        def __getitem__(self, thing):
            return thing

    lookup = idLookup()

    if scanrange:
        featurefile = datafile + ('%s-%s.features' % scanrange)

    featureObjects = []
    for chg, feature in featureList:
        newfeature = Feature()
        for scan, envelope in feature:
            newfeature.add(envelope, scan, chg)

        newfeature.calculate_bounds(lookup)

        #newfeature.prepareBoxes(lookup)
        #newfeature.prepareBoxes() # It's entirely different, for some reason?

        #test = Feature()
        #for scan, envelope in feature:
        #test.add(envelope, scan, chg)
        #test.calculate_bounds(lookup)

        #assert test.mz == newfeature.mz and test.charge == newfeature.charge

        featureObjects.append(newfeature)
    save_feature_database(featureObjects, featurefile)

    vprint("Saved feature file.")
    times.append(time.clock())

    return featurefile
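A hedged call sketch based on the options the docstring documents; 'sample.raw' is a placeholder path:

# tolerance is in PPM per the docstring; force=True reruns detection even
# if a .features file already exists for this data file.
featurefile = detect_features('sample.raw', tolerance=10, force=True)
print(featurefile)  # 'sample.raw.features' unless outputfile was given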
Example #8
def binByFullFeature(datafile, featureDB, results):
    data = mzFile(datafile)

    scans = data.scan_info(0, 999999)
    ms2toms1 = {}
    ms1 = None
    ms2s = []
    # MS2s are dropped until the first MS1.
    for scan in scans:
        if scan[3] == 'MS1':
            for ms2 in ms2s:
                ms2toms1[ms2] = ms1
            ms1 = scan[2]
            ms2s = []
        elif scan[3] == 'MS2':
            if ms1 is not None:
                ms2s.append(scan[2])
        else:
            raise Exception("Unidentified scan type of %s" % scan[3])
    for ms2 in ms2s:
        ms2toms1[ms2] = ms1

    matchesToSplits = 0
    matchesToUnsplit = 0
    featureItems = defaultdict(list)
    edgeItems = defaultdict(list)
    inexplicableItems = []
    for result in results:
        #mz = spectrumDescriptionToMZ(result['Spectrum Description'])
        #scan = spectrumDescriptionToScanNumber(result['Spectrum Description'])
        mz = mzFromPSM(result)
        scan = scanFromPSM(result)
        charge = int(result['Charge'])
        try:
            scan = ms2toms1[scan]
        except KeyError:
            # MS2 scans that precede the first MS1 have no parent to bin by.
            continue

        features = [(i, x)
                    for i, x in featureDB.mz_range(mz - 0.01, mz + 0.01)
                    if x.containsPoint(mz, scan, charge)]
        if features:
            index, feature = min(features, key=lambda x: abs(x[1].mz - mz))
            scans = min(feature.scans), max(feature.scans)
            intensity = feature.c12Intensity()
            kurtosis = feature.kurtosis
            skew = feature.skewness
            featureItems[index].append(
                (result, scans, intensity, kurtosis, skew))
        else:
            features = [(i, x) for i, x in featureDB.mz_range(mz - 1, mz + 1)
                        if x.bordersPoint(mz, scan, charge)]
            if features:
                index, feature = min(features, key=lambda x: abs(x[1].mz - mz))
                edge = feature.bordersPoint(mz, scan, charge)
                scans = min(feature.scans), max(feature.scans)
                intensity = feature.c12Intensity()
                kurtosis = feature.kurtosis
                skew = feature.skewness
                edgeItems[index].append(
                    (result, edge, scans, intensity, kurtosis, skew))
            else:
                inexplicableItems.append(result)

    groupedResults = []
    overFitCount = 0
    for feature, results in featureItems.items():
        try:
            pep = results[0][0]['Peptide Sequence']
            if not all(
                [x['Peptide Sequence'] == pep for x, s, i, k, sk in results]):
                overFitCount += 1
        except KeyError:
            pep = results[0][0]['Annotated Sequence']
            if not all(
                [x['Annotated Sequence'] == pep
                 for x, s, i, k, sk in results]):
                overFitCount += 1

        for result, scans, intensity, kurtosis, skew in results:
            result['Feature'] = feature
            result['feature error'] = '-'
            result['feature start scan'] = scans[0]
            result['feature end scan'] = scans[1]
            result['feature start time'] = data.timeForScan(
                scans[0]) if scans[0] else '-'
            result['feature end time'] = data.timeForScan(
                scans[1]) if scans[1] else '-'
            result['feature intensity'] = intensity
            result['feature kurtosis'] = kurtosis
            result['feature skewness'] = skew
            groupedResults.append(result)
    for feature, resultEdges in edgeItems.items():
        for result, edge, scans, intensity, kurtosis, skew in resultEdges:
            result['Feature'] = '-'
            result['feature error'] = str(feature) + " " + edge
            result['feature start scan'] = scans[0]
            result['feature end scan'] = scans[1]
            result['feature start time'] = data.timeForScan(
                scans[0]) if scans[0] else '-'
            result['feature end time'] = data.timeForScan(
                scans[1]) if scans[1] else '-'
            result['feature intensity'] = intensity
            result['feature kurtosis'] = kurtosis
            result['feature skewness'] = skew
            groupedResults.append(result)
    for result in inexplicableItems:
        result['Feature'] = '-'
        result['feature error'] = 'Feature not found'
        result['feature start scan'] = '-'
        result['feature end scan'] = '-'
        result['feature start time'] = '-'
        result['feature end time'] = '-'
        result['feature intensity'] = '-'
        result['feature kurtosis'] = '-'
        result['feature skewness'] = '-'
        groupedResults.append(result)

    data.close()
    return groupedResults
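A usage sketch under assumptions: featureDB is a FeatureInterface over a previously written feature file (as in Example 13), and results holds PSM rows carrying 'Charge' plus the fields mzFromPSM and scanFromPSM read; all paths are placeholders.

from multiplierz.mzReport import reader

results = list(reader('results.xlsx'))
featureDB = FeatureInterface('data.raw.features')  # assumed feature index
annotated = binByFullFeature('data.raw', featureDB, results)
for row in annotated[:3]:
    print(row['Feature'], row['feature intensity'])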
Example #9
def extract(datafile,
            outputfile=None,
            default_charge=2,
            centroid=True,
            scan_type=None,
            deisotope_and_reduce_charge=True,
            maximum_precursor_mass=15999,
            long_ms1=False,
            derive_precursor_via='All',
            deisotope_and_reduce_MS1_args={},
            deisotope_and_reduce_MS2_args={},
            min_mz=140,
            precursor_tolerance=0.005,
            isobaric_labels=None,
            label_tolerance=0.01,
            channel_corrections=None,
            prec_info_file=None,
            region_based_labels=False):
    """
    Converts a mzAPI-compatible data file to MGF.
    
    Writes only MS2 spectra where these can be determined, otherwise takes
    every spectrum in the file.  Likewise writes the precursor charge
    and mass if these can be determined.
    
    deisotope_and_reduce_charge deisotopes and charge-reduces each MS2
    spectrum, which generally improves results from peptide database search
    algorithms. However, it should be disabled for very low-resolution scans.
    """

    for key, val in [('tolerance', 0.01), ('min_peaks', 2),
                     ('enforce_isotopic_ratios', True)]:
        if key not in deisotope_and_reduce_MS1_args:
            deisotope_and_reduce_MS1_args[key] = val

    if not outputfile:
        outputfile = datafile + '.mgf'

    if os.path.exists(outputfile):
        assert outputfile.lower().endswith('mgf'), (
            "Overwriting a non-MGF file %s with "
            "the MGF extractor is probably a mistake." % outputfile)

    data = mzFile(datafile)
    from multiplierz.mgf.extraction import _extractor_
    extractor = _extractor_(data, datafile, default_charge, centroid,
                            scan_type, deisotope_and_reduce_charge,
                            derive_precursor_via, maximum_precursor_mass,
                            long_ms1, deisotope_and_reduce_MS1_args,
                            deisotope_and_reduce_MS2_args, min_mz,
                            precursor_tolerance, isobaric_labels,
                            label_tolerance, channel_corrections,
                            prec_info_file, region_based_labels)
    writer = MGF_Writer(outputfile)

    for scan, title, mz, charge in extractor.run():
        writer.write(scan, title, mass=mz, charge=charge)
    writer.close()

    if extractor.inconsistent_precursors:
        vprint("Precursor inconsistencies: %s/%s" %
               (extractor.inconsistent_precursors, extractor.scans_written))

    return outputfile
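A hedged sketch of a typical call; 'sample.raw' is a placeholder, and the keywords shown are just the signature defaults made explicit:

mgf_path = extract('sample.raw',
                   default_charge=2,
                   centroid=True,
                   deisotope_and_reduce_charge=True)
print(mgf_path)  # 'sample.raw.mgf' unless outputfile was given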
Example #10
    def writeIonAnnotations(self, datafile=None, in_place=False):
        for spectrumList in self.root.getiterator(
                self.pfx + "SpectrumIdentificationList"):
            try:
                fragtab = [
                    x for x in spectrumList
                    if x.tag == self.pfx + "FragmentationTable"
                ][0]
            except IndexError:
                fragtab = xml.SubElement(spectrumList, "FragmentationTable")

            intMeasure = xml.SubElement(fragtab, "Measure")
            intMeasure.set("id", "m_intensity")
            intMeasureKind = xml.SubElement(intMeasure, "cvParam")
            intMeasureKind.set("cvRef", "PSI-MS")
            intMeasureKind.set("accession", "MS:1001226")
            intMeasureKind.set("name", "product ion intensity")

            mzMeasure = xml.SubElement(fragtab, "Measure")
            mzMeasure.set("id", "m_mz")
            mzMeasureKind = xml.SubElement(mzMeasure, "cvParam")
            mzMeasureKind.set("cvRef", "PSI-MS")
            mzMeasureKind.set("accession", "MS:1001225")
            mzMeasureKind.set("name", "product ion m/z")

        for spectrumResult in self.root.getiterator(
                self.pfx + "SpectrumIdentificationResult"):
            dataEl = self.fileLookup[spectrumResult.get("spectraData_ref")]

            spectrumTitle = self.giveCVs(spectrumResult)['spectrum title']
            derivedData, scanNum = parseSpectrumTitle(spectrumTitle)
            if not datafile:
                #datafile = dataEl.get("location")
                datafile = derivedData
            try:
                data = self.filePointers[datafile]
            except KeyError:
                data = mzFile(datafile)
                self.filePointers[datafile] = data
            #rT = float(self.giveCVs(spectrumResult)["MS:1001114"]) / 60.0

            #scanName = spectrumResult.get("spectrumID") # Perhaps?  Not entirely clear.
            #try:
            #scanNum = int(scanName)
            #except ValueError:
            #scanNum = int(scanName.split("=")[1])

            #scan = data.cscan(data.scan_time_from_scan_name(scanNum))

            #for spectrumItem in [x for x in spectrumResult
            #if x.tag == (self.pfx + 'SpectrumIdentificationItem')]:
            for spectrumItem in spectrumResult.getiterator(
                    self.pfx + 'SpectrumIdentificationItem'):
                #mz = float(spectrumItem.get("experimentalMassToCharge"))
                #scanHeader = min([x for x in data.scan_info(rT - 0.1, rT + 0.1, mz - 1, mz + 1)
                #if x[3] == 'MS2'],
                #key = lambda x: abs(x[1] - mz))
                #scan = data.scan(scanHeader[0], centroid = True)
                scan = data.scan(scanNum, centroid=True)

                if len(scan) > 500:
                    scan = sorted(scan, key=lambda x: x[1], reverse=True)[:500]

                #try:
                #scans = self.dataFileScans[datafile]
                #except KeyError:
                #scans = data.scan_info()
                #self.dataFileScans[datafile] = scans
                try:
                    #fragmentation = [x for x in spectrumItem
                    #if x.tag == (self.pfx + "Fragmentation")][0]
                    fragmentation = spectrumItem.getiterator(
                        self.pfx + 'Fragmentation').next()
                except StopIteration:
                    fragmentation = xml.SubElement(spectrumItem,
                                                   self.pfx + "Fragmentation")
                    #fragmentation = [x for x in spectrumItem
                    #if x.tag == (self.pfx + "Fragmentation")][0]
                iontype = xml.SubElement(fragmentation, self.pfx + "IonType")
                iontype.set("index", "0 " * len(scan))
                iontype.set("charge", "0")

                ionKind = xml.SubElement(iontype, self.pfx + "cvParam")
                ionKind.set("cvRef", "PSI-MS")
                ionKind.set("accession", "MS:1001240")
                ionKind.set("name", "non-identified ion")

                def listStr(thing):
                    out = ""
                    for x in thing:
                        out += (str(x) + " ")
                    return out

                mzArray = xml.SubElement(iontype, self.pfx + "FragmentArray")
                mzArray.set("values", listStr(unzip(scan)[0]))
                mzArray.set("measure_ref", "m_mz")

                intArray = xml.SubElement(iontype, self.pfx + "FragmentArray")
                intArray.set("values", listStr(unzip(scan)[1]))
                intArray.set("measure_ref", "m_intensity")

        if not in_place:
            outputFile = self.filename[:-5] + "_annotated.mzid"
        else:
            outputFile = self.filename

        softwareUsed = self.root.getiterator(self.pfx +
                                             "AnalysisSoftwareList").next()
        mzDesktopEl = xml.SubElement(softwareUsed,
                                     self.pfx + "AnalysisSoftware")
        mzDesktopEl.set("id", "DFCI Multiplierz v1.1.0")
        mzDesktopEl.set("name", "Multiplierz")
        mzDesktopEl.set("uri", "http://sourceforge.net/projects/multiplierz/")
        mzDesktopEl.set("version", __version__)
        softwareName = xml.SubElement(mzDesktopEl, "SoftwareName")
        nameParam = xml.SubElement(softwareName, "userParam")
        nameParam.set("name", "Multiplierz")

        self.mzid.close()

        output = open(outputFile, "w")
        self.tree.write(output)
        output.close()

        self.mzid = open(outputFile, "r")
Example #11
def extract(datafile,
            outputfile=None,
            default_charge=2,
            centroid=True,
            scan_type=None,
            deisotope_and_reduce_charge=True,
            deisotope_and_reduce_args={},
            min_mz=140,
            precursor_tolerance=0.005,
            isobaric_labels=None,
            label_tolerance=0.01):
    """
    Converts a mzAPI-compatible data file to MGF.
    
    Writes only MS2 spectra where these can be determined, otherwise takes
    every spectrum in the file.  Likewise writes the precursor charge
    and mass if these can be determined.
    
    deisotope_and_reduce_charge deisotopes and charge-reduces each MS2
    spectrum, which generally improves results from peptide database search
    algorithms. However, it should be disabled for very low-resolution scans.
    """
    # Currently doesn't compensate for injection time! Would be required in
    # order to deal with iTRAQ/TMT labels.

    from multiplierz.spectral_process import deisotope_reduce_scan, peak_pick
    from multiplierz.spectral_process import centroid as centroid_func  # Distinct from 'centroid' argument.

    def _get_precursor(mz, possible_prec, charge):
        try:
            return min([
                x for x in possible_prec if (charge == None or x[1] == charge)
            ],
                       key=lambda x: abs(x[0] - mz))
        except ValueError:
            return None, None

    if not outputfile:
        outputfile = datafile + '.mgf'

    if os.path.exists(outputfile):
        assert outputfile.lower().endswith('mgf'), (
            "Overwriting a non-MGF file %s with "
            "the MGF extractor is probably a mistake." % outputfile)

    writer = MGF_Writer(outputfile)

    data = mzFile(datafile)
    scanInfo = data.scan_info()

    # Coerce that scanInfo be in order of time, so that for .WIFF files
    # we can still use the previous-MS1 method to look up precursor charges.
    scanInfo.sort(key=lambda x: x[0])

    if datafile.lower().endswith('.raw'):  # May also exist for WIFF?
        filters = dict(data.filters())

        # For RAW files only, there's the option to filter by a given
        # scan type.  (It would be more efficient in many cases to
        # actually split files in a single run, though.)
        if scan_type:
            assert (scan_type.lower()
                    in ['cid', 'hcd', 'etd',
                        'etdsa']), ("Invalid scan type %s, must be one "
                                    "of (CID, HCD, ETD, ETDSA).") % scan_type
            typestr = "@%s" % scan_type.lower()

            scanInfo = [
                x for x in scanInfo
                if x[3] == 'MS1' or typestr in filters[x[0]]
            ]
    else:
        filters = None
        assert not scan_type, "Scan type filtering only enabled with .RAW format files."

    if isobaric_labels:
        assert centroid, "Isobaric tags can only be read from centroided data; set 'centroid' to True."

    if not isobaric_labels:
        labels = []
    elif isobaric_labels == 4 or isobaric_labels == '4plex':
        labels = zip([114, 115, 116, 117], [114.11, 115.11, 116.11, 117.12])
    elif isobaric_labels == 6 or isobaric_labels == '6plex':
        labels = zip([126, 127, 128, 129, 130, 131],
                     [126.127, 127.131, 128.134, 129.138, 130.141, 131.138])
    elif isobaric_labels == 8 or isobaric_labels == '8plex':
        labels = zip(
            [113, 114, 115, 116, 117, 118, 119, 121],
            [113.11, 114.11, 115.11, 116.11, 117.12, 118.12, 119.12, 121.12])
    elif isobaric_labels == 10 or isobaric_labels == '10plex':
        labels = zip([
            '126', '127N', '127C', '128N', '128C', '129N', '129C', '130N',
            '130C', '131'
        ], [
            126.127726, 127.124761, 127.131081, 128.128116, 128.134436,
            129.131471, 129.137790, 130.134825, 130.141145, 131.138180
        ])

        assert label_tolerance < 0.005, (
            "label_tolerance must be lower "
            "than 0.005 for 10-plex experiments! (Currently %s)" %
            label_tolerance)
    else:
        raise NotImplementedError(("Labels of type %s not recognized.\n"
                                   "Should be one of [4,6,8,10] or None.")
                                  % isobaric_labels)

    def read_labels(scan):
        partscan = [x for x in scan if x[0] < labels[-1][1] + 3]
        if not partscan:
            return dict([(str(l), '0') for l in zip(*labels)[0]])

        # This should probably actually sum all points within
        # the tolerance range.
        scan_values = {}
        for label, mz in labels:
            nearpt = min(partscan, key=lambda x: abs(x[0] - mz))
            if abs(nearpt[0] - mz) < label_tolerance:
                scan_values[str(label)] = '%.3f' % nearpt[1]
            else:
                scan_values[str(label)] = '0'  # Report noise value?

        return scan_values

    inconsistent_precursors = 0
    scans_written = 0

    lastMS1 = None
    lastMS1ScanName = None
    recal_factor = 1
    calibrant = RAW_CAL_MASS
    for time, mz, scanNum, scanLevel, scanMode in scanInfo:
        scanName = scanNum if isinstance(scanNum, int) else time

        if scanLevel == 'MS1':
            lastMS1ScanName = scanName

            possible_precursors = None

            def calculate_precursors(calibrant):
                if data.format == 'raw':
                    lastMS1 = data.lscan(lastMS1ScanName)
                    lastMS1, calibrant = raw_scan_recalibration(
                        lastMS1, calibrant)
                else:
                    try:
                        lastMS1 = data.scan(lastMS1ScanName, centroid=True)
                    except NotImplementedError:
                        lastMS1 = centroid_func(data.scan(lastMS1ScanName))

                envelopes = peak_pick(lastMS1,
                                      tolerance=0.01,
                                      min_peaks=2,
                                      enforce_isotopic_ratios=True)[0]
                return sum([[(x[0][0], c) for x in xs]
                            for c, xs in envelopes.items()], []), calibrant

            continue
        elif scanLevel == 'MS3':
            continue
        elif lastMS1ScanName is None:
            continue

        # Each file type handles centroiding differently (or not at all.)
        if data.format == 'raw':
            scan = data.scan(scanName, centroid=centroid)

            scan, calibrant = raw_scan_recalibration(scan, calibrant)
        elif data.format == 'wiff':
            # explicit_numbering, of course, can't be active here.
            scan = data.scan(scanName)
            if centroid:
                scan = centroid_func(scan)
        elif data.format == 'd':
            scan = data.scan(scanName, centroid=centroid)
            if centroid and not scan:
                # mzAPI.D returns empty if centroid data is not present in
                # the file, but that can be corrected by external centroiding.
                scan = centroid_func(data.scan(scanName, centroid=False))
        else:
            raise NotImplementedError("Extractor does not handle type %s" % data.format)

        if filters and not mz:
            mz = float(filters[time].split('@')[0].split(' ')[-1])

        mzP = None
        chargeP = None
        if "scanPrecursor" in dir(data):
            assert isinstance(scanName, int)
            mzP, chargeP = data.scanPrecursor(scanName)

        if not mzP:  # .scanPrecursor sometimes returns charge and not mzP.
            if possible_precursors is None:
                possible_precursors, calibrant = calculate_precursors(
                    calibrant)

            mzP, chargeP = _get_precursor(mz, possible_precursors, chargeP)
            if not mzP:
                # Release presumed charge possibly obtained from scanPrecursor.
                mzP, chargeP = _get_precursor(mz, possible_precursors, None)
                if mz and chargeP:
                    inconsistent_precursors += 1

        if mzP and (abs(mz - mzP) < 2 or not mz):
            mz = mzP
            charge = chargeP
        else:
            charge = default_charge

        if not charge:
            charge = default_charge

        if not mz:
            import warnings
            errmgf = os.path.abspath(datafile)
            warnings.warn('Unable to recover all precursor masses from %s' %
                          errmgf)
        else:
            if labels:
                scan_labels = read_labels(scan)
            else:
                scan_labels = {}

            title = standard_title_write(datafile,
                                         rt=time,
                                         mz=mz,
                                         mode=scanMode,
                                         scan=scanNum,
                                         **scan_labels)

            # Should expand extract() call to include arguments to this.
            if deisotope_and_reduce_charge and centroid:
                if ('tolerance' not in deisotope_and_reduce_args
                        or not deisotope_and_reduce_args['tolerance']):
                    deisotope_and_reduce_args[
                        'tolerance'] = precursor_tolerance
                scan = deisotope_reduce_scan(scan, **deisotope_and_reduce_args)
            scan = [x for x in scan if x[0] > min_mz]
            assert charge, title
            writer.write(scan, title, mass=mz, charge=charge)
            scans_written += 1

    writer.close()

    if inconsistent_precursors:
        vprint("Precursor inconsistencies: %s/%s" %
               (inconsistent_precursors, scans_written))

    return outputfile
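For the isobaric path in this version, centroiding must be on and 10-plex requires label_tolerance below 0.005 (both enforced by asserts above). A hedged sketch with a placeholder path:

# Reporter-channel intensities are folded into each spectrum title
# via standard_title_write's keyword arguments.
mgf_path = extract('tmt_run.raw',
                   centroid=True,
                   isobaric_labels=10,
                   label_tolerance=0.003)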
Example #12
    def writeIonAnnotations(self, datafile=None, in_place=False):
        for spectrumList in self.root.getiterator(
                self.pfx + "SpectrumIdentificationList"):
            try:
                fragtab = [
                    x for x in spectrumList
                    if x.tag == self.pfx + "FragmentationTable"
                ][0]
            except IndexError:
                fragtab = xml.SubElement(spectrumList, "FragmentationTable")

            intMeasure = xml.SubElement(fragtab, "Measure")
            intMeasure.set("id", "m_intensity")
            intMeasureKind = xml.SubElement(intMeasure, "cvParam")
            intMeasureKind.set("cvRef", "PSI-MS")
            intMeasureKind.set("accession", "MS:1001226")
            intMeasureKind.set("name", "product ion intensity")

            mzMeasure = xml.SubElement(fragtab, "Measure")
            mzMeasure.set("id", "m_mz")
            mzMeasureKind = xml.SubElement(mzMeasure, "cvParam")
            mzMeasureKind.set("cvRef", "PSI-MS")
            mzMeasureKind.set("accession", "MS:1001225")
            mzMeasureKind.set("name", "product ion m/z")

        for spectrumResult in self.root.getiterator(
                self.pfx + "SpectrumIdentificationResult"):
            dataEl = self.fileLookup[spectrumResult.get("spectraData_ref")]

            spectrumTitle = self.giveCVs(spectrumResult)['spectrum title']
            derivedData, scanNum = parseSpectrumTitle(spectrumTitle)
            if not datafile:
                datafile = derivedData
            try:
                data = self.filePointers[datafile]
            except KeyError:
                data = mzFile(datafile)
                self.filePointers[datafile] = data

            for spectrumItem in spectrumResult.getiterator(
                    self.pfx + 'SpectrumIdentificationItem'):
                scan = data.scan(scanNum, centroid=True)

                if len(scan) > 500:
                    scan = sorted(scan, key=lambda x: x[1], reverse=True)[:500]

                try:
                    fragmentation = next(
                        spectrumItem.getiterator(self.pfx + 'Fragmentation'))
                except StopIteration:
                    fragmentation = xml.SubElement(spectrumItem,
                                                   self.pfx + "Fragmentation")
                iontype = xml.SubElement(fragmentation, self.pfx + "IonType")
                iontype.set("index", "0 " * len(scan))
                iontype.set("charge", "0")

                ionKind = xml.SubElement(iontype, self.pfx + "cvParam")
                ionKind.set("cvRef", "PSI-MS")
                ionKind.set("accession", "MS:1001240")
                ionKind.set("name", "non-identified ion")

                def listStr(thing):
                    out = ""
                    for x in thing:
                        out += (str(x) + " ")
                    return out

                mzArray = xml.SubElement(iontype, self.pfx + "FragmentArray")
                mzArray.set("values", listStr(unzip(scan)[0]))
                mzArray.set("measure_ref", "m_mz")

                intArray = xml.SubElement(iontype, self.pfx + "FragmentArray")
                intArray.set("values", listStr(unzip(scan)[1]))
                intArray.set("measure_ref", "m_intensity")

        if not in_place:
            outputFile = self.filename[:-5] + "_annotated.mzid"
        else:
            outputFile = self.filename

        softwareUsed = next(
            self.root.getiterator(self.pfx + "AnalysisSoftwareList"))
        mzDesktopEl = xml.SubElement(softwareUsed,
                                     self.pfx + "AnalysisSoftware")
        mzDesktopEl.set("id", "DFCI Multiplierz v1.1.0")
        mzDesktopEl.set("name", "Multiplierz")
        mzDesktopEl.set("uri", "http://sourceforge.net/projects/multiplierz/")
        mzDesktopEl.set("version", __version__)
        softwareName = xml.SubElement(mzDesktopEl, "SoftwareName")
        nameParam = xml.SubElement(softwareName, "userParam")
        nameParam.set("name", "Multiplierz")

        self.mzid.close()

        output = open(outputFile, "w")
        self.tree.write(output)
        output.close()

        self.mzid = open(outputFile, "r")
Example #13
def evaluateMascotFile(resultfile, datafile = None, featurefile = None, outputfile = None):
    #assert datafile or featurefile, "Either raw data or feature data must be given!"
    
    header = [list(x.values()) for x in list(reader(resultfile, sheet_name = 'Mascot_Header'))]
    
    def retrieveHeaderValue(key):
        try:
            return [[x for x in xs if x != key] for xs in header if key in xs][0][0]
        except IndexError:
            return ''
    quant = retrieveHeaderValue('Quantitation method')
    varmods = retrieveHeaderValue('Variable modifications')
    
    assert ('SILAC' in quant) or ('plex' in varmods), "Label method not recognized!"
    
    if not featurefile:
        featurefile = detectFeatures(datafile, signalToNoiseThreshold = 15)
    features = FeatureInterface(featurefile)
    
    print("Matching features to PSMs...")
    results = reader(resultfile)
    columns = results.columns
    results = list(results)
    
    
    data = mzFile(datafile)
    ms1map = {}
    ms2s = []
    ms1 = None
    for _, _, scan, level, _ in data.scan_info(0, 999999):
        if level == 'MS1':
            for ms2 in ms2s:
                ms1map[ms2] = ms1
            ms1 = scan
            ms2s = []
        elif level == 'MS2':
            ms2s.append(scan)
    for ms2 in ms2s:
        ms1map[ms2] = ms1
    data.close()
    
    featureIntMap = {}
    for psm in results:
        mz = psm['Experimental mz']
        scan = int(psm['Spectrum Description'].split('.')[1])
        charge = int(psm['Charge'])
        for index, feature in features.mz_range(mz - 1, mz + 1):
            if feature.containsPoint(mz, ms1map[scan], charge):
                featureIntMap[scan] = feature.c12Intensity()
                break
    
    del features    
    
    if not outputfile:
        outputfile = '.'.join(resultfile.split('.')[:-1]) + '_LABEL_EVALUATION.xlsx'
    
    if 'SILAC' in quant:
        return evaluateSILAC(outputfile, columns, results, featureIntMap), outputfile
    elif 'plex' in varmods:
        return evaluateTMTiTRAQ(outputfile, columns, results, featureIntMap), outputfile
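A hedged call sketch; per the asserts above, the Mascot header must report a SILAC quantitation method or 'plex' variable modifications, and the paths are placeholders:

# If featurefile is omitted, features are first detected from the raw data.
evaluation, outpath = evaluateMascotFile('mascot_results.xlsx',
                                         datafile='data.raw')
print(outpath)  # defaults to '<resultfile>_LABEL_EVALUATION.xlsx'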
Example #14
def psm_XIC_localized(directory, subdirs):
    """
    A peptide may appear in multiple fractions due to various factors, but for
    the purpose of this analysis it is useful to consider a peptide as
    "belonging" only to the fraction in which the main bulk of the elution
    occurred. For each fraction in which a given peptide appeared, we take
    XICs over the m/z values for a set of possible charges and compare their
    total intensity; the fraction with the most intense XIC(s) is assigned
    that peptide for the final count.
    """

    tolerance = 0.1
    time_tolerance = 15

    rawfiles = dict([(x.split('.')[0], mzFile(os.path.join(directory, x)))
                     for x in os.listdir(directory)
                     if x.lower().endswith('raw')])
    columns = None

    start = time.clock()
    for subdir in subdirs:
        resultfiles = typeInDir(os.path.join(directory, subdir), 'xlsx')
        resultfiles = [x for x in resultfiles if 'XIC_localized' not in x]

        peptidesForFile = defaultdict(dict)
        for resultfile in resultfiles:
            rdr = reader(resultfile)
            columns = rdr.columns
            psmsByPeptide = collectByCriterion(
                list(rdr), lambda x:
                (x['Peptide Sequence'], x['Variable Modifications']))
            for peptide, psms in psmsByPeptide.items():
                peptidesForFile[peptide][resultfile] = psms

        outputByFile = defaultdict(list)
        for peptide, psmsByFile in peptidesForFile.items():
            xicsByFile = []

            allPSMs = sum(psmsByFile.values(), [])
            mass = allPSMs[0]['Predicted mr']
            assert len(set(x['Predicted mr'] for x in allPSMs)) == 1

            charges = set(x['Charge'] for x in allPSMs)
            allScans = set([
                tuple(x['Spectrum Description'].split('.')[:2])
                for x in allPSMs
            ])
            allRTs = set(rawfiles[x[0]].scan_time_from_scan_name(int(x[1]))
                         for x in allScans)
            minRT, maxRT = min(allRTs), max(allRTs)

            for resultfile, psms in psmsByFile.items():
                rawfile = rawfiles[os.path.basename(resultfile.split('.')[0])]
                xicInt = 0
                for charge in charges:
                    mz = (mass + (1.0072764 * charge)) / charge
                    xic = rawfile.xic(minRT - time_tolerance,
                                      maxRT + time_tolerance, mz - tolerance,
                                      mz + tolerance)
                    xicInt += sum(zip(*xic)[1])

                xicsByFile.append((xicInt, resultfile))

            highIntFile = max(xicsByFile, key=lambda x: x[0])[1]
            outputByFile[highIntFile].append(psmsByFile[highIntFile][0])

        for resultfile, psms in outputByFile.items():
            outputfile = resultfile[:-5] + '.XIC_localized.xlsx'
            output = writer(outputfile, columns=columns)
            for psm in psms:
                output.write(psm)
            output.close()
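A hedged usage sketch; the directory is assumed to hold the .raw files while each listed subdirectory holds that fraction's .xlsx result files:

# Writes '<name>.XIC_localized.xlsx' beside each input result file,
# keeping each peptide only in its most intense fraction.
psm_XIC_localized('C:/experiment', ['frac1', 'frac2'])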