Example #1
    def mascot_headers(self, report_file, mascot_headers):
        '''Combines a list of Mascot header pages into a single page (XLS or MZD)'''

        logger_message(30, 'Adding Mascot Headers...')

        first_columns = set()
        columns = defaultdict(dict)

        report_files = [f for f, mh in mascot_headers]

        for f, mascot_header in mascot_headers:
            first_col = []

            for line in mascot_header[1:]:
                if line[0] == ' ' or (isinstance(line[1], (str, unicode))
                                      and line[1].startswith('-----')):
                    continue
                columns[f][line[0]] = line[1]
                first_col.append(line[0])

            first_columns.add(tuple(first_col))

        if len(first_columns) > 1:
            logger_message(20,
                           'Headers seem different, will try to merge them')

            main_h = list(max(first_columns, key=len))
            main_h += sorted(
                reduce(set.union, first_columns, set()).difference(main_h))
        else:
            main_h = list(first_columns.pop())

        cols = ['Header'] + [(os.path.basename(f) if f else ('-' * 50))
                             for f in report_files]

        if report_file.lower().endswith('.mzd'):
            mascot_rep = mzReport.writer(report_file,
                                         columns=cols,
                                         table_name='MascotHeader')
        else:
            mascot_rep = mzReport.writer(report_file,
                                         columns=cols,
                                         sheet_name='Mascot_Header')

        for col in main_h:
            row = [col]

            for f in report_files:
                if col in columns[f]:
                    row.append(columns[f][col])
                else:
                    row.append(None)

            mascot_rep.write(row)

        mascot_rep.close()
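
The extension dispatch above — table_name for .mzd database output, sheet_name for spreadsheet output — recurs throughout these examples. A minimal sketch of the same pattern as a standalone helper (hypothetical; not part of the multiplierz API):

    def open_header_writer(report_file, cols):
        # .mzd files are SQLite databases and take a table name;
        # XLS/XLSX/CSV outputs take a sheet name instead.
        if report_file.lower().endswith('.mzd'):
            return mzReport.writer(report_file, columns=cols,
                                   table_name='MascotHeader')
        return mzReport.writer(report_file, columns=cols,
                               sheet_name='Mascot_Header')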
Example #2
    def prot_report(self, report_file, prot_report):
        '''Adds a protein page to a report (XLS or MZD)'''

        logger_message(30, 'Adding Protein Info...')

        if report_file.lower().endswith('.mzd'):
            prot_rep = mzReport.mzDB.sqlite3.connect(report_file)
            prot_rep.execute(
                'create view ProteinData as select '
                '"Protein Rank","Accession Number","Protein Description",'
                '"Protein Mass","Protein Matches","Protein Score",'
                'count(distinct "Peptide Sequence") as "Unique Peptides"'
                ' from PeptideData group by "Protein Rank","Accession Number"')

            prot_rep.close()
        else:
            cols = [
                'Protein Rank', 'Accession Number', 'Protein Description',
                'Protein Mass', 'Protein Matches', 'Protein Score',
                'Unique Peptides'
            ]

            prot_rep = mzReport.writer(report_file,
                                       columns=cols,
                                       sheet_name='Protein')

            for line in prot_report:
                prot_rep.write(line)

            prot_rep.close()
Example #3
    def calculate(self, event):
        datafile = self.ionFileCtrl.GetValue()
        ionStr = self.ionSelectCtrl.GetValue()
        ions = [float(x) for x in ionStr.split()]

        tolerance = float(self.toleranceCtrl.GetValue())
        threshold = float(self.thresholdCtrl.GetValue())

        saveOutput = self.saveAsCheck.GetValue()
        if saveOutput:
            outputFile = self.saveAsCtrl.GetValue()
            if not outputFile:
                raise IOError("Output file not selected.")

        results, resultColumns = findIonsInData(datafile,
                                                ions,
                                                tolerance,
                                                threshold,
                                                includeColumns=True)

        if saveOutput:
            output = writer(outputFile, columns=resultColumns)
            for row in results:
                output.write(row)
            output.close()
        else:
            resultDisplay = FoundIonsDisplay(self, -1, resultColumns, results)
            resultDisplay.Show()
Example #4
def convert_report(reportfile, outputfile=None):
    # Reads .mzid file to generate a Mascot-like report which has most
    # important fields; doesn't report protein-level statistics, missed cleavages,
    # or Mascot-specific fields (e.g., query.)

    from multiplierz.mzTools.mzIdentMLAPI import mzIdentML
    from multiplierz.mass_biochem import remove_protons
    from multiplierz.mzReport import reader, writer

    if not outputfile:
        outputfile = reportfile + '.csv'

    report = mzIdentML(reportfile)
    out = writer(outputfile,
                 columns=[
                     'File', 'Rank', 'Accession Number', 'Protein Description',
                     'Peptide Sequence', 'Variable Modifications',
                     'Experimental mz', 'Predicted mr', 'Charge', 'Delta',
                     'Peptide Score', 'Expectation Value', 'Start Position',
                     'End Position', 'Preceding Residue', 'Following Residue',
                     'Spectrum Description'
                 ])
    for row in report.peptideSummary():
        row['Predicted mr'] = remove_protons(row['Calculated mz'],
                                             int(row['Charge']))
        del row['Calculated mz']
        del row['Passed Threshold']
        del row['Peptide ID']
        del row['Spectrum ID']
        out.write(row)
    out.close()
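
A usage sketch for convert_report; the input file name is hypothetical, and the output defaults to '<input>.csv' as in the code above:

    csvfile = convert_report('search_results.mzid')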
Example #5
def detect_matches(file_names,
                   fields=[],
                   tol_field=None,
                   tolerance=0.0,
                   save_file=''):
    #mzTools.logger_message(30,'Detecting Matches...')

    #fields = fields or [mzReport.multiplierzHeaders[k]
    #for k in ['acc','seq','var_mods']]

    detect_gen = _detect_matches(file_names, fields, tol_field, tolerance)

    cols = detect_gen.next()
    match_out = [cols]

    if save_file:
        writer = mzReport.writer(save_file, columns=cols)

    for line in detect_gen:
        match_out.append(line)
        if save_file:
            writer.write(line)

    detect_gen.close()

    if save_file:
        writer.close()

    #mzTools.logger_message(20,'Matches Detected')

    return match_out
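
A hedged usage sketch for detect_matches; the field and file names are hypothetical, and _detect_matches is assumed to yield the column list first, as the code above implies:

    matches = detect_matches(['reportA.xls', 'reportB.xls'],
                             fields=['Accession Number', 'Peptide Sequence'],
                             tol_field='Experimental mz',
                             tolerance=0.01,
                             save_file='matches.xls')
    # matches[0] is the column list; the remaining entries are match rows.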
Example #6
def combine_peptides(reportfile, isobaric=None, outputfile=None):
    from multiplierz.mzReport import reader, writer
    from multiplierz.mgf import standard_title_parse

    isobaric_labels = {
        None: [],
        4: ['114', '115', '116', '117'],
        6: ['126', '127', '128', '129', '130', '131'],
        8: ['113', '114', '115', '116', '117', '118', '119', '121'],
        10: [
            '126', '127N', '127C', '128N', '128C', '129N', '129C', '130N',
            '130C', '131'
        ]
    }

    def _byPeptide(row):
        # Not counting charge.
        varmodset = frozenset(
            [x.strip() for x in row['Variable Modifications'].split(';')])
        return row['Peptide Sequence'], varmodset

    def _getReporters(row):
        attrib = standard_title_parse(row['Spectrum Description'])
        return [float(attrib[x.lower()]) for x in isobaric_labels[isobaric]]

    assert isobaric in isobaric_labels

    psms = reader(reportfile)
    rowsByPeptide = collectByCriterion(psms, _byPeptide)

    sum_cols = ['Sum%s' % x for x in isobaric_labels[isobaric]]
    top_cols = ['Max%s' % x for x in isobaric_labels[isobaric]]
    if not outputfile:
        outputfile = insert_tag(reportfile, 'peptide_combined')
    output = writer(outputfile,
                    columns=(psms.columns + sum_cols + top_cols + ['PSMs']))

    for pep, psms in rowsByPeptide.items():
        outrow = max(psms, key=lambda x: x['Peptide Score'])
        outrow['PSMs'] = len(psms)

        if isobaric:
            repsets = [_getReporters(x) for x in psms]
            toprepset = max(repsets, key=lambda x: sum(x))
            sumrepset = [sum(x) for x in zip(*repsets)]

            for rep, col in zip(toprepset, top_cols):
                outrow[col] = rep
            for rep, col in zip(sumrepset, sum_cols):
                outrow[col] = rep

        output.write(outrow)

    output.close()

    return outputfile
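
A usage sketch, assuming a TMT 10-plex PSM report whose spectrum descriptions carry reporter intensities in the standard multiplierz title format (file name hypothetical):

    # One row per (sequence, modification set), with Sum/Max reporter
    # columns and a PSM count appended.
    combined = combine_peptides('tmt10_psms.xlsx', isobaric=10)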
Example #7
    def mascot_header(self, report_file, mascot_header):
        '''Adds a Mascot Header page to a report (XLS or MZD)'''

        logger_message(30, 'Adding Mascot Header...')

        if report_file.lower().endswith('.mzd'):
            mascot_rep = mzReport.writer(report_file,
                                         columns=mascot_header[0],
                                         table_name='MascotHeader')
        else:
            mascot_rep = mzReport.writer(report_file,
                                         columns=mascot_header[0],
                                         sheet_name='Mascot_Header')

        for line in mascot_header[1:]:
            mascot_rep.write(line)

        mascot_rep.close()

        logger_message(20, 'Mascot Header Complete')
Example #8
def concatenate_reports(reportfiles, outputfile):
    from multiplierz.mzReport import reader, writer
    readers = map(reader, reportfiles)
    allcols = sorted(set.intersection(*map(set, [x.columns for x in readers])),
                     key = lambda x: readers[0].columns.index(x))
    if not all(x.columns == allcols for x in readers):
        print "Warning- concatenation drops some columns!"
    output = writer(outputfile, columns = allcols)
    for report in readers:
        for row in report:
            output.write(row, ignore_extra = True)
    output.close()
    return outputfile
Example #9
    def dispatchModes(self, event):
        self.runButton.Enable(False)

        mode = self.modeCtrl.GetString(self.modeCtrl.GetSelection())
        self.criteria = self.fieldsCtrl.GetCheckedStrings()
        self.inputfiles = [(x, reader(x)) for x in self.fileList.GetStrings()]

        outputfile = self.outputCtrl.GetValue()
        if not outputfile:
            outputfile = 'combined_output_file'
        if not outputfile.split('.')[-1].lower() in ('xls', 'xlsx', 'csv',
                                                     'mzd'):
            outputfile += '.xlsx'
        if not os.path.isabs(outputfile):
            outdir = os.path.dirname(self.inputfiles[0][0])
            outputfile = os.path.join(outdir, outputfile)

        if mode in [
                'Concatenate All', 'Unique-by-File Report',
                'Entries-in-Common Report'
        ]:
            columnsets = [x[1].columns for x in self.inputfiles]
            columnIntersection = reduce(set.intersection, columnsets,
                                        set(columnsets[0]))
            self.outcolumns = ['Source'] + [
                x for x in columnsets[0] if x in columnIntersection
            ]

            if mode != 'Concatenate All':
                assert all([x in self.outcolumns for x in self.criteria])
        elif mode in ['Cross-Report Key']:
            self.outcolumns = ['Key'] + [x[0] for x in self.inputfiles]
        else:
            raise Exception

        self.output = writer(outputfile, columns=self.outcolumns)

        if mode == 'Concatenate All':
            self.concatenate()
        elif mode == 'Cross-Report Key':
            self.cross_report_key()
        elif mode == 'Unique-by-File Report':
            self.unique_by_file()
        elif mode == 'Entries-in-Common Report':
            self.entries_in_common()
        else:
            raise Exception

        self.output.close()
        print "Wrote %s" % outputfile
        self.runButton.Enable(True)
Example #10
def combine_accessions(reportfile, outputfile = None):
    """
    Given a Mascot-style PSM report, this combines all protein hypotheses for a given
    MS2 spectrum into a single PSM.
    
    outputfile may be safely specified to be the same as the input file, in
    order to overwrite the original file.
    """
    
    
    from multiplierz.mzReport import reader, writer
    
    report = reader(reportfile)
    columns = report.columns

    molecules = defaultdict(list)
    for row in report:
        molecules[row['Spectrum Description']].append(row)

    outputData = []
    for rows in molecules.values():
        accessions = [x['Accession Number'] for x in rows]
        newRow = max(rows, key = lambda x: x['Peptide Score'])
        
        if 'Accession Number' in columns:
            newRow['Accession Number'] = '; '.join([x['Accession Number'] for x in rows])
        if 'Protein Description' in columns:
            newRow['Protein Description'] = '; '.join([x['Protein Description'] for x in rows])
        if 'Protein Masses' in columns:
            newRow['Protein Masses'] = '; '.join([str(x['Protein Mass']) for x in rows])
        newRow['Protein Redundancy'] = len(rows)
        outputData.append(newRow)
    
    try:
        columns = [x for x in columns + ['Protein Masses'] if x in newRow]
    except UnboundLocalError:
        pass # Means there was no newRow, and thus no rows, so it's pretty arbitrary.
    
    if not outputfile:
        outputfile = insert_tag(reportfile, 'combined_accessions')
    
    output = writer(outputfile, columns = columns + ['Protein Redundancy'])
    report.close()
    for row in outputData:
        output.write(row)
    output.close()

    return outputfile
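
A usage sketch; per the docstring, the output may safely overwrite the input:

    combine_accessions('psm_report.xlsx', outputfile='psm_report.xlsx')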
Example #11
def psm_intersection(directory, mode_subdirs):
    """
    To give a more accurate depiction of the relative elution profile of each
    label state, the final results will only consider peptides that appear in
    the results for all four states. This determines the overlapping peptide
    repertoire detected across all four experiments, and produces subset
    result files that only include these peptides.
    """

    psmByCondition = defaultdict(list)
    for mode, subdir, par in mode_subdirs:
        files = typeInDir(os.path.join(directory, subdir), 'xlsx')
        conditionPSMs = []
        for resultfile in files:
            if not 'FDR' in resultfile:
                continue
            conditionPSMs += list(reader(resultfile))
        psmByCondition[subdir] = collectByCriterion(conditionPSMs, peptideKey)

    consistentPSMs = reduce(set.intersection,
                            [set(x.keys()) for x in psmByCondition.values()],
                            set(psmByCondition.values()[0].keys()))

    newSubdirs = []
    for mode, subdir, par in mode_subdirs:
        newSubdir = subdir + '_intersection_sheets'
        newSubdirs.append((mode, newSubdir))
        try:
            os.mkdir(os.path.join(directory, newSubdir))
        except:
            pass
        files = typeInDir(os.path.join(directory, subdir), 'xlsx')
        for filename in files:
            alreadySeenPeptides = set()
            if not 'FDR' in filename:
                continue
            psms = reader(filename)
            filterfile = writer(os.path.join(directory, newSubdir,
                                             os.path.basename(filename)),
                                columns=psms.columns)
            for psm in psms:
                pepKey = peptideKey(psm)
                if pepKey in consistentPSMs and pepKey not in alreadySeenPeptides:
                    alreadySeenPeptides.add(pepKey)
                    filterfile.write(psm)
            filterfile.close()

    return newSubdirs
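
A hedged usage sketch; the (mode, subdirectory, parameter) triples are hypothetical, and only the FDR-filtered .xlsx sheets in each subdirectory are read, as above:

    mode_subdirs = [('light', 'light_results', None),
                    ('heavy', 'heavy_results', None)]
    new_subdirs = psm_intersection(r'C:\data\experiment', mode_subdirs)
    # Each returned (mode, subdir) pair holds sheets restricted to
    # peptides observed in every condition.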
Example #12
def concatenate_reports(reportfiles, outputfile, include_file_column=False):
    from multiplierz.mzReport import reader, writer
    readers = list(map(reader, reportfiles))
    allcols = sorted(
        set.intersection(*list(map(set, [x.columns for x in readers]))),
        key=lambda x: readers[0].columns.index(x))
    if not all(x.columns == allcols for x in readers):
        print("Warning- concatenation drops some columns!")
    if include_file_column:
        allcols = allcols + ['FILE']  # so the stamped file name isn't dropped by ignore_extra
    output = writer(outputfile, columns=allcols)
    for filename, report in zip(reportfiles, readers):
        filename = os.path.basename(filename)
        for row in report:
            if include_file_column:
                row['FILE'] = filename
            output.write(row, ignore_extra=True)
    output.close()
    return outputfile
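
A usage sketch for this Python 3 variant (file names hypothetical):

    combined = concatenate_reports(['run1.xlsx', 'run2.xlsx'],
                                   'combined.xlsx',
                                   include_file_column=True)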
Example #13
def combineFiles(files, outputFile, ext):
    if not os.path.isabs(outputFile):
        outputFile = os.path.join(os.path.dirname(files[0]),
                                  os.path.basename(outputFile))
        
    if not outputFile[-1*len(ext):] == ext:
        outputFile += ext
        
    print "Merging %s" % files
    columns = reader(files[0]).columns
    output = writer(outputFile, columns = ['Source'] + columns)
    
    for filename in files:
        for row in reader(filename):
            row['Source'] = os.path.basename(filename)
            output.write(row)
    
    output.close()
    print "Wrote %s !" % outputFile    
Example #14
def psm_XIC_localized(directory, subdirs):
    """
    A peptide may appear in multiple fractions due to various factors, but for
    the purpose of this analysis it is useful to consider a peptide as
    "belonging" only to the fraction in which the main bulk of the elution
    occurred. For each fraction in which a given peptide appeared, we take
    XICs over the m/z values for a set of possible charges and compare their
    total intensity; the fraction with the most intense XIC(s) is assigned
    that peptide for the final count.
    """

    tolerance = 0.1
    time_tolerance = 15

    rawfiles = dict([(x.split('.')[0], mzFile(os.path.join(directory, x)))
                     for x in os.listdir(directory)
                     if x.lower().endswith('raw')])
    columns = None

    start = time.clock()
    for subdir in subdirs:
        resultfiles = typeInDir(os.path.join(directory, subdir), 'xlsx')
        resultfiles = [x for x in resultfiles if 'XIC_localized' not in x]

        peptidesForFile = defaultdict(dict)
        for resultfile in resultfiles:
            rdr = reader(resultfile)
            columns = rdr.columns
            psmsByPeptide = collectByCriterion(
                list(rdr), lambda x:
                (x['Peptide Sequence'], x['Variable Modifications']))
            for peptide, psms in psmsByPeptide.items():
                peptidesForFile[peptide][resultfile] = psms

        outputByFile = defaultdict(list)
        for peptide, psmsByFile in peptidesForFile.items():
            xicsByFile = []

            allPSMs = sum(psmsByFile.values(), [])
            mass = allPSMs[0]['Predicted mr']
            assert len(set(x['Predicted mr'] for x in allPSMs)) == 1

            charges = set(x['Charge'] for x in allPSMs)
            allScans = set([
                tuple(x['Spectrum Description'].split('.')[:2])
                for x in allPSMs
            ])
            allRTs = set(rawfiles[x[0]].scan_time_from_scan_name(int(x[1]))
                         for x in allScans)
            minRT, maxRT = min(allRTs), max(allRTs)

            for resultfile, psms in psmsByFile.items():
                rawfile = rawfiles[os.path.basename(resultfile.split('.')[0])]
                xicInt = 0
                for charge in charges:
                    mz = (mass + (1.0072764 * charge)) / charge
                    xic = rawfile.xic(minRT - time_tolerance,
                                      maxRT + time_tolerance, mz - tolerance,
                                      mz + tolerance)
                    xicInt += sum(zip(*xic)[1])

                xicsByFile.append((xicInt, resultfile))

            highIntFile = max(xicsByFile, key=lambda x: x[0])[1]
            outputByFile[highIntFile].append(psmsByFile[highIntFile][0])

        for resultfile, psms in outputByFile.items():
            outputfile = resultfile[:-5] + '.XIC_localized.xlsx'
            output = writer(outputfile, columns=columns)
            for psm in psms:
                output.write(psm)
            output.close()
Example #15
def evaluateTMTiTRAQ(outputfile, columns, results, resultIntMap):
    peptides = defaultdict(list)
    for psm in results:
        try:
            varmods = [x for x in psm['Variable Modifications'].split(';') if 'plex' not in x]
        except AttributeError:
            varmods = ''
        peptides[psm['Peptide Sequence'], '; '.join(varmods)].append(psm)
    
    output = writer(outputfile, sheet_name = 'Data',
                    columns = columns + ['Lysines','Lysine Labels','N-term Label',
                                         'Fully Labelled', 'Intensity'])
    
    evaluation = {'total peptides':0,
                  'total lysines':0,
                  'fully labelled':0,
                  'nterm labelled':0,
                  'lysine labelled':0}
    
    labelSummary = []
    for key, pepSet in list(peptides.items()):
        fullLabelledInt = 0
        partLabelledInt = 0
        
        seq = key[0]
        lysCount = seq.count('K')
        
        for psm in pepSet:
            assert seq == psm['Peptide Sequence']
            
            # Count isobaric ('plex') labels on lysines and on the N-terminus.
            lysLabelCount = len([x for x in psm['Variable Modifications'].split('; ')
                                 if x and x[0] == 'K' and 'plex' in x])
            ntermLabel = any([(x[:len('N-term')] == 'N-term' and 'plex' in x) for
                              x in psm['Variable Modifications'].split('; ')])
            
            psm['Lysines'] = lysCount
            psm['Lysine Labels'] = lysLabelCount
            psm['N-term Label'] = ntermLabel
            
            try:
                intensity = resultIntMap[int(psm['Spectrum Description'].split('.')[1])]
            except KeyError:
                intensity = 0
            
            if (lysLabelCount == lysCount) and ntermLabel:
                fullLabelledInt += intensity
                psm['Fully Labelled'] = True
            else:
                partLabelledInt += intensity
                psm['Fully Labelled'] = False
            
            psm['Intensity'] = intensity
            output.write(psm)
            
            evaluation['total peptides'] += 1
            evaluation['total lysines'] += lysCount
            evaluation['fully labelled'] += 1 if not partLabelledInt else 0
            evaluation['nterm labelled'] += int(ntermLabel)
            evaluation['lysine labelled'] += lysLabelCount
        
        labelSummary.append((key, fullLabelledInt, partLabelledInt))
    output.close()
    
    summaryOutput = writer(outputfile, sheet_name = 'Label Report',
                           columns = ['Peptide Sequence', 'Non-Label Mods',
                                      '% Labelling'])
    for (seq, varmods), fullLabelledInt, partLabelledInt in labelSummary:
        row = {}
        row['Peptide Sequence'] = seq
        row['Non-Label Mods'] = varmods
        if partLabelledInt + fullLabelledInt:
            row['% Labelling'] = fullLabelledInt / (partLabelledInt + fullLabelledInt)
        else:
            row['% Labelling'] = ''
        summaryOutput.write(row)
    
    summaryOutput.close()         
    
    return evaluation
Example #16
def filterJoin(filenames,
               matchColumns,
               returnMode,
               outputKeyFile,
               combinedOutputFile=None,
               outputFileType='.xlsx',
               outputTag=None,
               tolerance=None,
               toleranceColumn=None):
    """
    Produces a joined file, filtering out either repeat or unique
    PSMs.

    If returnMode is 'matched', the output file contains one instance
    of each PSM group;
    if returnMode is 'unmatched', the output file contains every
    PSM that wasn't part of a larger PSM group.
    (Where 'PSM group' is a set of PSMs that are identical based on
    the given matchColumns + toleranceColumn.)
    If 'both', both kinds of output are produced.  
    
    Returns a list of the output file name(s).
    """

    assert returnMode in ['matched', 'unmatched', 'both']
    #if not outputFileBase:
    #outputFileBase = filenames[0]

    data = []
    columnLists = []
    for filename in filenames:
        subdata = []
        inputfile = reader(filename)
        columnLists.append(inputfile.columns)
        for psm in inputfile:
            psm['Source'] = filename
            subdata.append(psm)
        data.append(subdata)
        inputfile.close()

    assert all([columnLists[0] == x
                for x in columnLists]), "Heterogeneous data columns!"

    datadict = defaultdict(list)
    for subdata in data:
        for psm in subdata:
            signature = tuple([psm[x] for x in matchColumns])
            datadict[signature].append(psm)

    if toleranceColumn:
        toldatadict = {}
        for signature, sigGroup in datadict.items():
            subGroups = []
            for psm in sigGroup:
                match = False
                for subGroup in subGroups:
                    if all([
                            abs(psm[toleranceColumn] - subpsm[toleranceColumn])
                            < tolerance for subpsm in subGroup
                    ]):
                        match = True
                        subGroup.append(psm)
                        break

                if not match:
                    subGroups.append([psm])

            for index, subGroup in enumerate(subGroups):
                subSig = tuple(list(signature) + [index])
                toldatadict[subSig] = subGroup

        datadict = toldatadict

    if outputKeyFile:
        keyfile = writer(outputKeyFile, columns=['PSM Key'] + filenames)
        for signature, psmGroup in datadict.items():
            line = {}
            line['PSM Key'] = '|'.join([str(x) for x in signature])
            line.update([(x, len([y for y in psmGroup if y['Source'] == x]))
                         for x in filenames])
            keyfile.write(line)

        keyfile.close()

    outputpsms = []
    if returnMode == 'matched' or returnMode == 'both':
        #outputFileName = outputFileBase + '_matchedPSMs' + outputFileType
        #outputfile = writer(outputFileName, columns = ['Source'] + columnLists[0])

        for psmGroup in datadict.values():
            if len(psmGroup) > 1:
                exemplar = psmGroup[0]
                sourceFiles = '; '.join(set([x['Source'] for x in psmGroup]))
                exemplar['source'] = sourceFiles
                #outputfile.write(exemplar)
                outputpsms.append(exemplar)
        #outputfile.close()

    if returnMode == 'unmatched' or returnMode == 'both':
        #outputFileName = outputFileBase + '_uniquePSMs' + outputFileType
        #outputfile = writer(outputFileName, columns = ['Source'] + columnLists[0])

        for psmGroup in datadict.values():
            if len(psmGroup) == 1:
                #outputfile.write(psmGroup[0])
                outputpsms.append(psmGroup[0])

        #outputfile.close()

    outputs = []
    if outputTag:
        outputfiles = [
            (x, '.'.join(x.split('.')[:-1] + [outputTag, outputFileType.lstrip('.')]))
            for x in filenames
        ]
        for filename, outputfile in outputfiles:
            output = writer(outputfile, columns=['Source'] + columnLists[0])
            for psm in [x for x in outputpsms if x['Source'] == filename]:
                output.write(psm)
            output.close()

        outputs = [x[1] for x in outputfiles]

    if combinedOutputFile:
        output = writer(combinedOutputFile,
                        columns=['Source'] + columnLists[0])
        for psm in outputpsms:
            output.write(psm)
        output.close()
        outputs.append(combinedOutputFile)

    return outputs
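
A hedged usage sketch for filterJoin; column and file names are hypothetical:

    # Group PSMs across two reports by sequence and charge, write a
    # per-file key sheet, and emit one combined file of kept PSMs.
    outputs = filterJoin(['runA.xlsx', 'runB.xlsx'],
                         matchColumns=['Peptide Sequence', 'Charge'],
                         returnMode='both',
                         outputKeyFile='psm_key.xlsx',
                         combinedOutputFile='joined.xlsx')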
Example #17
def filter_join(file_names,
                key_source_file,
                exclude=False,
                append=False,
                save_file_suffix='_filtered',
                ext='.xls'):
    if (not append) and (not save_file_suffix):
        raise ValueError(
            'Save_file_suffix cannot be empty string when not combining output.'
        )

    filtered_files = _filter_join(file_names, key_source_file, exclude,
                                  save_file_suffix)
    #if not filtered_files:
    #return

    cols = filtered_files.next()

    #Convert csv files to xls and append to manipulations log
    if append:
        combo_out_file = os.path.join(os.path.dirname(key_source_file),
                                      'Combined%s%s' % (save_file_suffix, ext))
        if os.path.exists(combo_out_file):
            os.remove(combo_out_file)

        # union of file columns--checking to see if files have the same columns
        #union_cols = reduce(set.union, [set(f[1]) for f in filtered_files])
        union_cols = reduce(set.union, [set(c) for c in cols])
        # if the union is the size of the smallest set, they are all equal
        #if len(union_cols) == min(len(f[1]) for f in filtered_files):
        if len(union_cols) == min(len(c) for c in cols):
            same_cols = True
            if 'File' not in union_cols:
                #newcols = ['File'] + filtered_files[0][1] # they're all the same so use the first one
                newcols = ['File'] + list(
                    cols[0])  # they're all the same so use the first one
            else:
                #newcols = filtered_files[0][1]
                newcols = list(cols[0])
        else:
            # files have different columns
            same_cols = False
            if 'File' not in union_cols:
                # use the largest set of columns as initial template
                #newcols = ['File'] + max((f[1] for f in filtered_files), key=len)
                newcols = ['File'] + list(max(cols, key=len))
                # stick the remaining columns on the end, sorted as strings
                newcols += sorted(union_cols.difference(newcols))
            else:
                # use the largest set of columns as initial template
                #newcols = max((f[1] for f in filtered_files), key=len)
                newcols = list(max(cols, key=len))
                # stick the remaining columns on the end, sorted as strings
                newcols += sorted(union_cols.difference(newcols))

        if 'Filter Key' not in newcols:
            newcols.insert(1, 'Filter Key')

        combo_report = mzReport.writer(combo_out_file, columns=newcols)

        # if the columns are all the same, this is straightforward, just write them all out
        if same_cols:
            for (filename, columns, filedata) in filtered_files:
                for row in filedata:
                    row['File'] = filename  # will either add File or overwrite it
                    combo_report.write(row)
        # if not, it's slightly more complicated--create dictionary for each row with None
        # if a value is missing
        else:
            blank_row = dict((c.lower(), None) for c in newcols)
            for (filename, columns, filedata) in filtered_files:
                for row in filedata:
                    new_row = blank_row.copy()
                    new_row.update(row)
                    new_row['file'] = filename  # will either add File or overwrite it
                    combo_report.write(new_row)

        combo_report.close()
    else:
        for (filename, columns, filedata) in filtered_files:
            # write each sheet to a separate file
            if 'Filter Key' not in columns:
                columns = ['Filter Key'] + columns

            rep = mzReport.writer(save_file_suffix.join(
                os.path.splitext(filename)),
                                  columns=columns)
            for row in filedata:
                rep.write(row)

            rep.close()
Example #18
def calculate_FDR(reportfile, outputfile = None, threshold = 0.01,
                  decoyString = 'rev_', includeStatisticsSheet = True,
                  includeDuplicates = True, separateDuplicateSheet = True,
                  includeFailedSheet = True, includeReverseSheet = True,
                  single_cutoff = True):
    """
    Performs Forward/Reverse database filtering on the target file, giving back
    the true PSMs over the specified statistical threshold as well as removed decoy
    and below-threshold PSMs, in respective sheets.

    All entries in the decoy (reverse) database must have accessions that begin with
    some uniform prefix; by default, "rev_" (so that gi|198292342|X7823_EXTRA becomes
    rev_gi|198292342|X7823_EXTRA.)
    
    outputfile may be safely specified to be the same as the input file, in
    order to overwrite the original file.
    """

    from multiplierz.mzReport import reader, writer

    reportReader = reader(reportfile)
    reportRows = list(reportReader)
    columns = reportReader.columns + ['FDR']
    reportReader.close()

    reportRows.sort(key = lambda x: x['Peptide Score'], reverse = True)

    seenSpectra = {}

    passedRows = []
    failedRows = []
    duplicateRows = []
    reverseRows = []

    reverses = 0.0
    forwards = 0.0
    duplicates = 0
    passed = 0
    failed = 0
    lowPass = 999999999
    highRev = 0
    for row in reportRows:
        specDesc = row['Spectrum Description']
        if specDesc in seenSpectra:
            duplicates += 1
            fdr = seenSpectra[specDesc]
            row['FDR'] = fdr
            if includeDuplicates and not separateDuplicateSheet:
                if fdr < threshold:
                    passedRows.append(row)
                else:
                    failedRows.append(row)
            else:
                duplicateRows.append(row)
            continue

        #if decoyString in row['Accession Number'].lower():
        # Turns out that produced awful results, since high-scoring peptides
        # could just happen to be duplicated in the reverse database.
        # So instead:
        if all([decoyString in x.lower() for x in row['Accession Number'].split(';')]):
            reverses += 1
            if forwards:
                fdr = reverses / forwards
            else:
                fdr = 100
            row['FDR'] = fdr      

            if float(row['Peptide Score']) > highRev:
                highRev = float(row['Peptide Score'])

            seenSpectra[specDesc] = fdr
            reverseRows.append(row)
        else:
            forwards += 1
            fdr = reverses / forwards
            row['FDR'] = fdr            

            seenSpectra[specDesc] = fdr
            if fdr < threshold:
                passed += 1
                passedRows.append(row)
                if float(row['Peptide Score']) < lowPass:
                    lowPass = float(row['Peptide Score'])
            else:
                failed += 1
                failedRows.append(row)

    if single_cutoff:
        recovered = [x for x in failedRows if x['Peptide Score'] > lowPass]
        failedRows = [x for x in failedRows if x['Peptide Score'] <= lowPass]
        passedRows += recovered

    if not outputfile: 
        # Output format must support sheets.
        if reportfile.lower().endswith('xlsx') or reportfile.lower().endswith('xls'):
            outputfile = insert_tag(reportfile, 'FDR_filtered')
        else:
            outputfile = '.'.join(reportfile.split('.')[:-1] + ['FDR_filtered.xlsx'])

    percentage = round(threshold * 100)

    if includeFailedSheet:
        failedOutput = writer(outputfile, columns = columns,
                              sheet_name = "Failed %s%% FDR" % percentage)
        for row in failedRows:
            failedOutput.write(row)
        failedOutput.close()

    if separateDuplicateSheet:
        duplicateOutput = writer(outputfile, columns = columns,
                                 sheet_name = "Duplicate Rows")
        for row in duplicateRows:
            duplicateOutput.write(row)
        duplicateOutput.close()

    if includeReverseSheet:
        reverseOutput = writer(outputfile, columns = columns,
                               sheet_name = 'Reverse Hits')
        for row in reverseRows:
            reverseOutput.write(row)
        reverseOutput.close()

    if includeStatisticsSheet:
        statOutput = writer(outputfile, columns = ['FDR Calculation Statistics', '--------------'],
                            sheet_name = "FDR Statistics")
        statOutput.write(['', ''])
        statOutput.write(['Total Spectra', str(len(reportRows))])
        statOutput.write(['Passed %s%% FDR' % percentage, str(passed)])
        statOutput.write(['Lowest Passing Score', str(lowPass)])
        statOutput.write(['Reverse Hits', str(reverses)])
        statOutput.write(['Highest Scoring Reverse Hit', str(highRev)])
        statOutput.write(['Number of Duplicates', str(duplicates)])
        statOutput.close()

    passedOutput = writer(outputfile, columns = columns, sheet_name = "Data")
    for row in passedRows:
        passedOutput.write(row)
    passedOutput.close()   

    return outputfile
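
A usage sketch; the threshold and decoy prefix shown are the defaults from the signature above (file name hypothetical):

    # Filter at 1% FDR against 'rev_'-prefixed decoy accessions; the
    # output workbook gains Data, Failed, Reverse, and Statistics sheets.
    filtered = calculate_FDR('psm_report.xlsx', threshold=0.01,
                             decoyString='rev_')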
Example #19
def evaluateSILAC(outputfile, columns, results, featureIntMap):
    peptides = defaultdict(list)
    for psm in results:
        if psm['Variable Modifications']:
            varmods = [x for x in psm['Variable Modifications'].split(';') if 'Label' not in x]
        else:
            varmods = ''
        peptides[psm['Peptide Sequence'], '; '.join(varmods)].append(psm)
    
    
    output = writer(outputfile, sheet_name = 'Data',
                    columns = columns + ['Lysines', 'Arginines', 'Lysine Labels',
                                         'Arginine Labels', 'Fully Labelled',
                                         'Intensity'])    
    
    evaluation = {'total peptides':0,
                  'total lysines':0,
                  'total arginines':0,
                  'fully labelled':0,
                  'lysine labelled':0,
                  'arginine labelled':0}    
    
    labelSummary = []
    for key, pepSet in list(peptides.items()):
        fullLabelledInt = 0
        partLabelledInt = 0
        
        seq = key[0]
        lysCount = seq.count('K')
        argCount = seq.count('R')        
        
        if not (lysCount + argCount):
            for psm in pepSet:
                psm['Lysines'] = 0
                psm['Arginines'] = 0
                psm['Lysine Labels'] = 0
                psm['Arginine Labels'] = 0
                psm['Fully Labelled'] = 'N/A'
                psm['Intensity'] = ''
                output.write(psm)
            
            continue
        
        for psm in pepSet:
            assert seq == psm['Peptide Sequence']
            if psm['Variable Modifications']:
                varmods = psm['Variable Modifications'].split(';')
            else:
                varmods = ''
            
            lysLabelCount = len([x for x in varmods if x[0] == 'K' and 'Label' in x])
            argLabelCount = len([x for x in varmods if x[0] == 'R' and 'Label' in x])
            
            psm['Lysines'] = lysCount
            psm['Arginines'] = argCount
            psm['Lysine Labels'] = lysLabelCount
            psm['Arginine Labels'] = argLabelCount
            
            try:
                intensity = featureIntMap[int(psm['Spectrum Description'].split('.')[1])]
            except KeyError:
                intensity = 0

            if lysLabelCount == lysCount and argCount == argLabelCount:
                fullLabelledInt += intensity
                psm['Fully Labelled'] = True
            else:
                partLabelledInt += intensity
                psm['Fully Labelled'] = False
            
            psm['Intensity'] = intensity
            output.write(psm)
            
            evaluation['total peptides'] += 1
            evaluation['total lysines'] += lysCount
            evaluation['total arginines'] += argCount
            evaluation['fully labelled'] += int(psm['Fully Labelled'])
            evaluation['lysine labelled'] += lysLabelCount
            evaluation['arginine labelled'] += argLabelCount
        
        labelSummary.append((key, fullLabelledInt, partLabelledInt))
    output.close()
    
    summaryOutput = writer(outputfile, sheet_name = 'Label Report',
                           columns = ['Peptide Sequence', 'Non-Label Mods',
                                      '% Labelling'])
    for (seq, varmods), fullLabelledInt, partLabelledInt in labelSummary:
        row = {}
        row['Peptide Sequence'] = seq
        row['Non-Label Mods'] = varmods
        if partLabelledInt + fullLabelledInt:
            row['% Labelling'] = fullLabelledInt / (partLabelledInt + fullLabelledInt)
        else:
            row['% Labelling'] = ''
        summaryOutput.write(row)
    
    summaryOutput.close()
    
    return evaluation
Example #20
    def get_reports(self,
                    mascot_ids,
                    dates=None,
                    outputfile=None,
                    ext=None,
                    chosen_folder='',
                    **report_kwargs):

        if ext:
            ext = ext.lstrip('.')

        report_columns = mzReport.default_columns
        if (float(self.mascot.version) >= 2.3
                and 'Protein Database' not in report_columns):
            report_columns.insert(1, 'Protein Database')

        if dates:
            assert len(dates) == len(
                mascot_ids), "Mismatched date list provided."
            mascot_searches = zip(mascot_ids, dates)
        else:
            mascot_searches = [(x, None) for x in mascot_ids]

        reports = []
        for mascot_id, date in mascot_searches:
            header, psms = self.retrieve_report_data(mascot_id, report_columns,
                                                     date, **report_kwargs)
            datafilename = header[7][1] or mascot_id
            reports.append((mascot_id, datafilename, header, psms))

        imputed_output_file_name = False
        if outputfile and ext:
            if not outputfile.lower().endswith(ext):
                outputfile += '.' + ext
        elif not outputfile:
            if not ext:
                ext = 'xlsx'
            outputfile = '_'.join(mascot_ids) + '.' + ext.strip('.')
            imputed_output_file_name = True
            #outputfile = '.'.join([reports[0][1], ext.strip('.')])
        elif outputfile and not ext:
            ext = outputfile.split('.')[-1]
            assert ext in ['csv', 'xlsx', 'xls', 'mzd', 'mzid']

        assert outputfile
        if chosen_folder and not os.path.isabs(outputfile):
            outputfile = os.path.join(chosen_folder, outputfile)

        if ext and 'mzid' in ext:
            assert len(mascot_ids) == 1, ("Combined result file not supported "
                                          "for mzIdentML files.")
            self.mascot.download_mzid(mascot_id,
                                      save_file=outputfile,
                                      date=date)
        elif len(mascot_ids) == 1:
            mascot_id, datafilename, header, psms = reports[0]
            if imputed_output_file_name or not outputfile:
                outputfile = datafilename + '.' + ext
                if chosen_folder:
                    outputfile = os.path.join(chosen_folder, outputfile)
            output = mzReport.writer(outputfile,
                                     columns=header[0],
                                     sheet_name='Mascot_Header')
            for line in header[1:]:
                output.write(line)
            output.close()

            output = mzReport.writer(outputfile,
                                     columns=report_columns,
                                     sheet_name='Data')
            for psm in psms:
                output.write(psm)
            output.close()

        else:
            #report_columns.insert(0, 'File')
            if not outputfile:
                raise IOError("Combined report file name must be specified.")

            if (outputfile.lower().endswith('xls')
                    or outputfile.lower().endswith('xlsx')
                    or outputfile.lower().endswith('mzd')):
                for m_id, datafilepath, header, _ in reports:
                    datafilename = os.path.basename(datafilepath)
                    output = mzReport.writer(outputfile,
                                             columns=header[0],
                                             sheet_name='%s Mascot Header' %
                                             datafilename)
                    for line in header[1:]:
                        output.write(line)
                    output.close()
            else:
                extension = outputfile.split('.')[-1]
                print "Omitting header tables due to %s format." % extension

            output = mzReport.writer(outputfile,
                                     columns=['File'] + report_columns,
                                     sheet_name='Data')
            for _, datafilepath, _, psms in reports:
                datafilename = os.path.basename(datafilepath)
                for psm in psms:
                    psm['File'] = datafilename
                    output.write(psm)

            output.close()

        return outputfile
Example #21
def format_report(reportfile,
                  outputfile=None,
                  mgffile=None,
                  parameters=None,
                  most_rank=None,
                  most_exp=None):
    """
    Renders a native Comet output .txt file into an mzReport-compatible and
    prettier .xlsx format.
    
    (Native .txt output is incompatible mostly due to a space instead of
    underscore in the 'modified peptide' column; hopefully that will be fixed
    soon.)
    """
    if most_rank:
        most_rank = int(most_rank)
    if most_exp:
        most_exp = float(most_exp)

    if mgffile:
        from multiplierz.mgf import parse_to_generator
        mgfgen = parse_to_generator(mgffile)
        queryToDesc = dict(enumerate((x['title'] for x in mgfgen), start=1))
    else:
        queryToDesc = {}

    columns = []
    rows = []
    report = open(reportfile, 'r')

    headeritems = report.next().split('\t')
    header = {
        'Program': headeritems[0],
        'Data': headeritems[1],
        'Search Run Time': headeritems[2],
        'Database': headeritems[3].strip()
    }
    columnline = report.next()

    # Fix for presumed bug; omit if this column title is changed in later Comet versions.
    columnline = columnline.replace('peptide\tmodifications',
                                    'peptide_modifications')

    def tryNum(thing):
        try:
            return int(thing)
        except ValueError:
            try:
                return float(thing)
            except ValueError:
                return thing

    columns = [
        toStandardPSMConversions.get(x, x)
        for x in columnline.strip().split('\t')
    ]
    for line in report:
        values = [tryNum(x.strip()) for x in line.split('\t')]
        row = dict(zip(columns, values))
        row = convertVarmods(row)
        row['Spectrum Description'] = queryToDesc.get(row['Query'], 'Unknown')
        rows.append(row)

    report.close()
    if not outputfile:
        outputfile = '.'.join(reportfile.split('.')[:-1] + ['xlsx'])

    if outputfile.lower().endswith('xlsx') or outputfile.lower().endswith(
            'xls'):
        headerwriter = writer(
            outputfile,
            columns=['Program', 'Data', 'Search Run Time', 'Database'],
            sheet_name='Comet_Header')
        headerwriter.write(header)
        headerwriter.write(['', '', '', ''])
        if parameters:
            for setting, value in sorted(parameters.items()):
                headerwriter.write({
                    'Program': setting,
                    'Data': value,
                    'Search Run Time': '',
                    'Database': ''
                })
        headerwriter.close()
    mainwriter = writer(outputfile,
                        columns=['Spectrum Description'] + columns,
                        sheet_name='Data')
    for row in rows:
        if most_rank and row['Peptide Rank'] > most_rank:
            continue
        if most_exp and row['Expectation Value'] > most_exp:
            continue
        mainwriter.write(row)
    mainwriter.close()

    return outputfile
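
A usage sketch, assuming a Comet .txt result and the .mgf searched to produce it (file names hypothetical):

    # Render Comet output as .xlsx, recovering spectrum titles from the
    # MGF and keeping only rank-1 hits.
    xlsxfile = format_report('comet_results.txt',
                             mgffile='fractions.mgf',
                             most_rank=1)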
Example #22
def feature_analysis(datafile,
                     resultFiles,
                     featureFile=None,
                     tolerance=None,
                     mzRegex=None,
                     scanRegex=None,
                     **constants):
    """
    Performs feature-detection analysis on the given .RAW file and PSM
    reports. The output files group the given PSMs by feature, with the
    addition of source feature extent and intensity information.
    
    """

    import os

    if mzRegex:
        import re
        global spectrumDescriptionToMZ

        mzRegCompiled = re.compile(mzRegex)

        def newParser(description):
            return float(mzRegCompiled.search(description).group())

        spectrumDescriptionToMZ = newParser

    if scanRegex:
        import re
        global spectrumDescriptionToScanNumber

        scanRegCompiled = re.compile(scanRegex)

        def newParser(description):
            return int(scanRegCompiled.search(description).group())

        spectrumDescriptionToScanNumber = newParser

    #if tolerance:
    #global peakFindTolerance
    #peakFindTolerance = tolerance

    #if signalNoise:
    #global signalToNoiseThreshold
    #signalToNoiseThreshold = signalNoise

    assert os.path.exists(datafile), "%s not found!" % datafile
    for resultfile in resultFiles:
        assert os.path.exists(resultfile), "%s not found!" % resultfile
    assert datafile.lower().endswith(
        '.raw'), "Only .raw files are currently supported."

    if featureFile:
        assert os.path.exists(
            featureFile
        ), "Specified feature data file %s not found!" % featureFile
    else:
        featureFile = detect_features(datafile,
                                      tolerance=tolerance,
                                      **constants)
    features = FeatureInterface(featureFile)

    outputfiles = []
    if resultFiles:
        print resultFiles
        print "Categorizing search results by file."
        for resultfile in resultFiles:
            resultfile = os.path.abspath(resultfile)
            inputResults = mzReport.reader(resultfile)
            outputfile = '.'.join(
                resultfile.split('.')[:-1] + ['featureDetect', 'xlsx'])
            outputfiles.append(outputfile)

            resultsByFeature = binByFullFeature(datafile, features,
                                                inputResults)

            output = mzReport.writer(
                outputfile,
                columns=inputResults.columns + [
                    'Feature', 'feature error', 'feature start scan',
                    'feature end scan', 'feature start time',
                    'feature end time', 'feature intensity',
                    'feature kurtosis', 'feature skewness'
                ])

            for result in resultsByFeature:
                output.write(result)

            output.close()

            print "Output saved to %s ." % outputfile
    else:
        print "No PSM data given; skipping annotation step."

    return featureFile, outputfiles
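
A usage sketch; the .raw and PSM report names are hypothetical, and feature detection falls back to detect_features when no featureFile is given, as above:

    featurefile, outputs = feature_analysis('sample.raw',
                                            ['sample_psms.xlsx'])
    # Each output report gains feature extent and intensity columns.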
Example #23
    def on_convert(self, event):
        if not self.file_list.GetStrings():
            wx.MessageBox('No files selected', 'Error')
            return

        #show hourglass
        wx.BeginBusyCursor(wx.HOURGLASS_CURSOR)

        files = self.file_list.GetStrings()
        input_format = self.input_format.GetSelection()
        output_format = self.output_format.GetSelection()
        output_ext = { 0:'.xls', 1:'.xlsx', 2:'.csv', 3:'.mzd' }[output_format]

        #update statusbar
        self.set_status("Converting...", 0)
        self.set_status("", 1)

        if self.combineCheck.GetValue():
            if input_format not in [0, 6]:
                wx.MessageBox("Only tabular/Excel files can currently be merged.")
                wx.EndBusyCursor()
                return
            combineFiles(self.file_list.GetStrings(),
                         self.combineCtrl.GetValue(),
                         output_ext)
            wx.EndBusyCursor()        
            self.set_status("Ready", 0)
            self.set_status("Done", 1)            
            return

        if input_format == 0: # Mascot CSV
            mascot_converter = mascot.mascot(version=settings.mascot_version)

            for file_name in files:
                self.set_status(file_name, 1)

                #Run MascotCSV program
                clean_csv_file = '_clean'.join(os.path.splitext(file_name))

                rep_file = os.path.splitext(clean_csv_file)[0] + output_ext
                if os.path.exists(rep_file):
                    os.remove(rep_file)

                mascot_converter.clean_csv(file_name, export_file=clean_csv_file, ion_list=False)

                repreader = mzReport.reader(clean_csv_file)
                repwriter = mzReport.writer(rep_file, columns=repreader.columns)

                for row in repreader:
                    repwriter.write(row)

                repreader.close()
                repwriter.close()

                #if os.path.splitext(rep_file)[1].lower() in ('.xls', '.xlsx', 'mzd'):
                    #mascot_reporter.mascot_header(rep_file, file_name)

                os.remove(clean_csv_file)

        elif input_format == 1: # Mascot DAT
            mascot_reporter = mzTools.MascotReport()

            _mascot_options = dict(max_hits=1000, ion_cutoff=20, bold_red=True,
                                   unassigned_queries=False, show_query_data=True,
                                   show_same_set=False, show_sub_set=False, quant=False)

            for file_name in files:
                self.set_status(file_name, 1)

                mascot_dat_file = mascot.MascotDatFile(file_name, **_mascot_options)
                mascot_header = mascot_dat_file.mascot_header()
                #mascot_header, prot_report, pep_report = mascot.parse_dat_file(file_name, **_mascot_options)

                ms_file_name = mascot_header[7][1] or (os.path.splitext(os.path.basename(file_name))[0])
                report_file = os.path.join(os.path.dirname(file_name),
                                           os.path.basename(ms_file_name) + output_ext)

                if os.path.exists(report_file):
                    os.remove(report_file)

                if output_ext in ('.xls', '.xlsx', '.mzd'):
                    mascot_reporter.mascot_header(report_file, mascot_header)
                    #mascot_reporter.mascot_header(report_file, mascot_header)

                if mascot_dat_file.res_file.getMascotVer() >= '2.3':
                    report = mzReport.writer(report_file,
                                             columns=(mzReport.default_columns[:1]
                                                      + ['Protein Database']
                                                      + mzReport.default_columns[1:]))
                else:
                    report = mzReport.writer(report_file, default_columns=True)

                #for row in pep_report:
                for row in mascot_dat_file.peptide_report():
                    report.write(row)

                mascot_dat_file.close()
                report.close()

        #elif input_format == 2: # Mascot mzIdentML
            #for file_name in files:
                #mzid = mzIdentML(file_name)
                #report_file = os.path.splitext(file_name)[0] + output_ext

                #if os.path.exists(report_file):
                    #os.remove(report_file)

                #report = mzReport.writer(report_file, default_columns=True)

                #for row in mzid:
                    #report.write(row)

                #report.close()

        elif input_format == 2: # Mascot mzIdentML
            for file_name in files:
                mzid = mzIdentML(file_name)
                data = mzid.peptideSummary()
                header = data[0].keys()
                
                report_file = os.path.splitext(file_name)[0] + output_ext
                
                if os.path.exists(report_file):
                    os.remove(report_file)

                report = mzReport.writer(report_file, columns=header)

                for row in data:
                    writeRow = []
                    for column in header:
                        thing = row[column]
                        # list-valued fields (e.g. several accessions for one
                        # peptide) are joined into a single delimited cell
                        if isinstance(thing, list):
                            thing = "; ".join(thing)
                        writeRow.append(thing)
                    report.write(writeRow)

                report.close()
                
        elif input_format == 3: # Protein Pilot
            for file_name in files:
                self.set_status(file_name, 1)
                pilot = ProteinPilot(file_name)
                pilot.format(str(os.path.splitext(file_name)[0] + output_ext))

        elif input_format == 4: # OMSSA
            for file_name in files:
                self.set_status(file_name, 1)
                omssa = OMSSA_CSV(file_name)
                omssa.format(str(os.path.splitext(file_name)[0] + output_ext))

        elif input_format == 5: # X!Tandem XML
            for file_name in files:
                report_file = os.path.splitext(file_name)[0] + output_ext

                format_XML(file_name, report_file)

        elif input_format == 6: # other mzReport
            # legacy per-format converters, superseded by the generic
            # reader/writer copy below (see the commented-out call at the
            # end of this branch)
            output_method = {'.xls': mzReport.toXLS,
                             '.xlsx': mzReport.toXLS,
                             '.csv': mzReport.toCSV,
                             '.mzd': mzReport.toMZD}[output_ext]

            for file_name in files:
                self.set_status(file_name, 1)
                
                rdr = reader(file_name)
                outputname = '.'.join(file_name.split('.')[:-1]) + output_ext
                wtr = writer(outputname, columns=rdr.columns)
                
                for row in rdr:
                    wtr.write(row)
                wtr.close()
                rdr.close()
                
                #if output_ext.startswith('.xls'):
                    #output_method(file_name, output_ext == '.xlsx')
                #else:
                    #output_method(file_name)

        #hide hourglass
        wx.EndBusyCursor()

        self.set_status("Ready", 0)
        self.set_status("Done", 1)
Example #24
0
def add_gene_ids(target_files,
                 p2g_database,
                 target_sheet=None,
                 outputfile=None,
                 inPlace=False,
                 leucine_equals_isoleucine=True,
                 legacy_columns=True):
    starttime = time.perf_counter()

    if isinstance(target_files, str):
        return_list = False
        target_files = [target_files]
    else:
        return_list = True

    dataRdr = open(p2g_database, 'rb')
    data = pickle.load(dataRdr)
    k_len = None
    if isinstance(data, tuple) and len(data) == 6:
        k_len, seqLookup, fmerLookup, geneLookup, isoSeqLookup, isoFmerLookup = data
    elif isinstance(data, tuple):
        raise ValueError("Unrecognized Pep2Gene database format: "
                         "tuple of length %d" % len(data))
    else:
        print('Legacy mode P2G database detected!')
        seqLookup = data
        fmerLookup = pickle.load(dataRdr)
        geneLookup = pickle.load(dataRdr)
        try:
            isoSeqLookup = pickle.load(dataRdr)
            isoFmerLookup = pickle.load(dataRdr)
        except EOFError:
            distinguish_leucine = False
            isoSeqLookup = None
            isoFmerLookup = None
    dataRdr.close()

    if isinstance(list(geneLookup.values())[0], tuple):
        print("Legacy mode gene names detected.")
        oldTupleInstance = list(geneLookup.values())[0]
        nameIndex = 0 if oldTupleInstance[0] and any(
            x.isalpha() for x in oldTupleInstance[0]) else 1
        for k, v in list(geneLookup.items()):
            geneLookup[k] = v[nameIndex]

    if leucine_equals_isoleucine:
        assert isoFmerLookup, (
            "Pep2Gene database does not contain leucine-isoleucine "
            "ambiguity data; re-compile database or "
            "select leucine_equals_isoleucine = False .")
    if k_len:
        assert k_len == K, "Pep2Gene database created with kmers of length %s, not %s" % (
            k_len, K)

    print("P2G database loaded: %.2f\n\n" % (time.clock() - starttime))
    prevtime = time.clock()

    outputfiles = []
    for target_file in target_files:
        try:
            rdr = reader(target_file, sheet_name=target_sheet)
        except TypeError:
            rdr = reader(target_file)  # Not an Excel file.

        add_legacy_cols = [
            "pro_count",
            "pro_list",
            "gene_count",
            "gene_symbols",
        ]

        add_cols = ["Protein Count", "Proteins", "Gene Count", "Gene Symbols"]
        if legacy_columns:
            new_cols = add_legacy_cols
            colname = dict(list(zip(add_cols, add_legacy_cols)))
        else:
            new_cols = add_cols
            colname = dict(list(zip(add_cols, add_cols)))

        iso_legacy_cols = [
            'IL Ambiguity pro_count', 'IL Ambiguity pro_list',
            "IL Ambiguity gene_count", "IL Ambiguity gene_symbols"
        ]
        iso_cols = [
            'I<->L Protein Count', 'I<->L Proteins', 'I<->L Gene Count',
            'I<->L Gene Symbols'
        ]
        if legacy_columns and leucine_equals_isoleucine:
            new_cols += iso_legacy_cols
            colname.update(dict(list(zip(iso_cols, iso_legacy_cols))))
        elif leucine_equals_isoleucine:
            new_cols += iso_cols
            colname.update(dict(list(zip(iso_cols, iso_cols))))

        if (not outputfile) or return_list:
            ext = target_file.split('.')[-1]
            outputfile = '.'.join(target_file.split('.')[:-1] + ['GENES', ext])
        output = writer(outputfile, columns=rdr.columns + new_cols)

        pepToProts = {}
        isoPepToProts = {}
        for counter, row in enumerate(rdr):
            if counter % 1000 == 0:
                print_progress(counter)
            try:
                pep = row['Peptide Sequence'].upper()
            except KeyError:
                pep = row['Peptide'].upper()

            pep = ''.join([x for x in pep if x.isalpha()])

            if len(pep) <= K:
                continue  # No 4-mers in a 3-mer!

            isoPep = pep.replace('I', 'L')
            if pep not in pepToProts:
                candidate_prots = reduce(set.intersection,
                                         (fmerLookup[pep[x:x + K]]
                                          for x in range(len(pep) - K)))
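                # note: range(len(pep) - K) never consults the final k-mer, so
                # the candidate set can be slightly loose; the boundary regex
                # below removes any false positives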
                # pep_find could be replaced by giving the p2g database a pre-made set of
                # hashes of all tryptic peptides in a protein, and seeing if the hash of the
                # pep is present in the set.
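                # the pattern enforces tryptic boundaries: the match must be
                # preceded by the protein N-terminus (optionally its initiator
                # Met) or by K/R not followed by P, and followed by the
                # C-terminus or, when the peptide ends in K/R, by a
                # non-proline residue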
                pep_find = re.compile(
                    '((^M?)|[KR](?=[^P]))%s(((?<=[KR])[^P])|$)' % pep)
                pepToProts[pep] = set(prot for prot in candidate_prots
                                      if pep_find.search(seqLookup[prot]))

                if leucine_equals_isoleucine and isoPep not in isoPepToProts:
                    iso_candidate_prots = reduce(
                        set.intersection, (isoFmerLookup[isoPep[x:x + K]]
                                           for x in range(len(isoPep) - K)))
                    pep_find = re.compile(
                        '((^M?)|[KR](?=[^P]))%s(((?<=[KR])[^P])|$)' % isoPep)
                    isoPepToProts[isoPep] = set(
                        prot for prot in iso_candidate_prots
                        if pep_find.search(isoSeqLookup[prot]))

            proteins = '; '.join(pepToProts[pep])
            proteinCount = len(pepToProts[pep])

            geneList = set(geneLookup[x] for x in pepToProts[pep]
                           if x in geneLookup)
            geneIds = '; '.join(geneList)  # geneList is already a set
            #geneSymbols = '; '.join(set(s for _, s in geneList))
            geneCount = len(geneList)

            row[colname['Protein Count']] = proteinCount
            row[colname['Proteins']] = proteins
            row[colname['Gene Count']] = geneCount
            row[colname['Gene Symbols']] = geneIds
            #row[colname['Gene IDs']] =

            if leucine_equals_isoleucine:
                isoProteins = '; '.join(isoPepToProts[isoPep])
                isoProteinCount = len(isoPepToProts[isoPep])

                isoGeneList = set(geneLookup[x] for x in isoPepToProts[isoPep]
                                  if x in geneLookup)
                isoGeneIds = '; '.join(isoGeneList)  # isoGeneList is already a set
                #isoGeneSymbols = '; '.join(set(s for _, s in isoGeneList))
                isoGeneCount = len(isoGeneList)

                row[colname['I<->L Protein Count']] = isoProteinCount
                row[colname['I<->L Proteins']] = isoProteins
                row[colname['I<->L Gene Count']] = isoGeneCount
                row[colname['I<->L Gene Symbols']] = isoGeneIds
                #row[colname['I<->L Gene IDs']] =

            output.write(row)

        print("\nGene lookup completed: %.2f" % (time.clock() - prevtime))
        prevtime = time.clock()
        rdr.close()
        output.close()
        print("Output written: %.2f" % (time.clock() - prevtime))
        outputfiles.append(outputfile)
    if return_list:
        return outputfiles
    else:
        return outputfile
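
The comment beside pep_find above suggests a faster design: precompute, for
each protein, the set of its tryptic peptides so the regex scan becomes a set
membership test. A rough sketch of that idea, assuming Python 3.7+ (re.split on
a zero-width pattern) and standard tryptic rules; tryptic_peptide_set and
pepSets are illustrative names, not part of the Pep2Gene database format:

import re

def tryptic_peptide_set(sequence, max_missed=2):
    '''All tryptic peptides of a protein (cleave after K/R unless before P),
    allowing up to max_missed missed cleavages, as a plain Python set.'''
    pieces = [p for p in re.split(r'(?<=[KR])(?!P)', sequence) if p]
    peps = set()
    for i in range(len(pieces)):
        for j in range(i + 1, min(i + 2 + max_missed, len(pieces) + 1)):
            pep = ''.join(pieces[i:j])
            peps.add(pep)
            if i == 0 and pep.startswith('M'):
                peps.add(pep[1:])  # the initiator Met may be cleaved off
    return peps

# Built once when the database is compiled:
#   pepSets = dict((prot, tryptic_peptide_set(seq))
#                  for prot, seq in seqLookup.items())
# ...after which the inner loop of add_gene_ids reduces to:
#   pepToProts[pep] = set(p for p in candidate_prots if pep in pepSets[p])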
Example #25
0
    def legacy_get_reports(self,
                           mascot_ids,
                           dates=None,
                           chosen_folder=None,
                           combined_file=False,
                           rank_one=False,
                           protein_report=False,
                           mascot_options=None,
                           peaks=False,
                           peaks_options=None,
                           mascot_web=False,
                           mascot_web_options=None,
                           mascot_prot_cov=False,
                           ext='.xlsx',
                           local_dat_files=None,
                           mascotIDInResultName=False,
                           percolatorDirectory=None,
                           **kwargs):

        # mascot_ids should be a list/tuple of IDs; dates should be a matching
        # list/tuple of dates, or None. combined_file should be false-y for
        # individual files, or an output file name for a single merged report.

        # mascot options: (max_hits, ion_cutoff, bold_red, unassigned_queries,
        #                  show_query_data, show_same_set, show_sub_set, quant) + mascot_id, date
        # mascot_web options: (ms2_img, mascot_ms2, mascot_var_mods,
        #                      draw_pep, instrument, im_size) + mascot_id, date
        # mascot_prot_cov options: ion_cutoff, mascot_id, date

        # defaults and overrides. The priority is:  keyword > option_dict > default

        # Using local .DATs means you don't have access to certain fancy
        # Mascot features.
        if local_dat_files:
            mascot_web = False
            mascot_prot_cov = False

        assert not peaks, (
            "precursor_peaks and images in result files are no longer supported; "
            "peaks argument to get_reports must be False.")

        # defaults
        _mascot_options = dict(max_hits=1000,
                               ion_cutoff=20,
                               bold_red=True,
                               unassigned_queries=False,
                               show_query_data=True,
                               show_same_set=False,
                               show_sub_set=False,
                               quant=False)
        # option_dict
        if mascot_options:
            _mascot_options.update(mascot_options)
        # keywords
        _mascot_options.update(
            (k, kwargs[k]) for k in kwargs if k in _mascot_options)

        if peaks:  # unreachable: the assert above forces peaks to be False
            # defaults
            _peaks_options = dict(time_window=(0.5, 0.5),
                                  mz_window=(0.1, 0.1),
                                  plot_ms1=False,
                                  plot_xic=False,
                                  plot_ms2=False,
                                  peak_area=False,
                                  reporter_ions=False,
                                  peakfilter=None,
                                  ion_list=['b', 'y'],
                                  instrument='ESI-TRAP',
                                  im_size=(8.0, 6.0))
            # option_dict
            if peaks_options:
                _peaks_options.update(peaks_options)
            # keywords
            _peaks_options.update(
                (k, kwargs[k]) for k in kwargs
                if (k in _peaks_options or k == 'peak_data_path'))

            # need a path (file or directory) to actually do this,
            # so we raise an exception if it's not present
            if 'peak_data_path' not in _peaks_options:
                raise ValueError(
                    'peak_data_path value is required for peak extraction')

        if mascot_web:
            # defaults
            _mascot_web_options = dict(ms2_img=True,
                                       mascot_ms2=True,
                                       mascot_var_mods=True,
                                       instrument='ESI-TRAP',
                                       im_size=(8.0, 6.0))
            # option_dict
            if mascot_web_options:
                _mascot_web_options.update(mascot_web_options)
            # keywords
            _mascot_web_options.update(
                (k, kwargs[k]) for k in kwargs if k in _mascot_web_options)

        # if version is 2.2+, mod positions are extracted automatically
        if mascot_web and self.mascot.version >= '2.2':
            _mascot_web_options['mascot_var_mods'] = False
            if not _mascot_web_options['ms2_img']:
                mascot_web = False

        # the 'instrument' value must agree between the two option dictionaries
        if peaks and mascot_web:
            if _peaks_options['instrument'] != _mascot_web_options['instrument']:
                raise ValueError(
                    'instrument value must be consistent; input dictionaries disagree')

        # Getting both of these would be redundant, so force at most one
        if mascot_web and peaks and _mascot_web_options['ms2_img']:
            _peaks_options['plot_ms2'] = False

        if chosen_folder is None:
            chosen_folder = myData

        # if creating a single file, we'll create the writer now
        if combined_file:
            # figuring out the report columns. start with defaults...
            repcols = mzReport.default_columns[:]

            # mascot 2.3 can have multiple databases so add a column for that
            if self.mascot.version >= '2.3':
                repcols.insert(1, 'Protein Database')

            # these are the columns coming out of the dat file, need them separate
            res_cols = repcols[:]

            # add columns for peak extraction
            if peaks:
                repcols.extend(c for c in [
                    'MS2 Time', 'Peak Time', 'Peak Intensity',
                    'Peak Width (sec)', 'Peak Comment'
                ] if c not in repcols)
                if _peaks_options['peak_area'] and 'Peak Area' not in repcols:
                    repcols.append('Peak Area')
                if _peaks_options['reporter_ions']:
                    repcols.extend(
                        c for c in ['Rep114', 'Rep115', 'Rep116', 'Rep117']
                        if c not in repcols)

            repcols.insert(0, 'File')

            report_file = os.path.join(chosen_folder, combined_file)

            if os.path.exists(report_file):
                os.remove(report_file)

            report = mzReport.writer(report_file, columns=repcols)
            isMZD = isinstance(report, mzReport.mzDB.SQLiteWriter)

            mascot_headers = []
        else:
            report_files = []

        if dates:
            mid_d = zip(mascot_ids, dates, [None] * len(mascot_ids))
        elif local_dat_files:
            mid_d = zip(["Local File"] * len(local_dat_files),
                        [None] * len(local_dat_files), local_dat_files)
        else:
            mid_d = [(mid, None, None) for mid in mascot_ids]

        for mascot_id, date, local in mid_d:
            mascot_id = str(mascot_id)

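            # an ID given as 'jobID:date' carries its own date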
            if ':' in mascot_id:
                (mascot_id, date) = mascot_id.split(':', 1)

            mascot_id = str(mascot_id).zfill(6)

            if not (date or local):
                date = self.mascot.get_date(mascot_id)

            logger_message(
                30, 'Generating Multiplierz-Mascot Report for JobID %s...' %
                mascot_id)

            if ext == '.mzid':
                logger_message(30, 'Downloading mzIdentML File...')
                destination = chosen_folder if chosen_folder else myData

                reportfilename = "F%s.mzid" % mascot_id
                outputfile = os.path.join(destination, reportfilename)
                report_file = self.mascot.download_mzid(mascot_id,
                                                        save_file=outputfile,
                                                        date=date)
                assert report_file == outputfile

                report_files.append(report_file)
                continue
                # mzIdentML files skip the rest of this function; their
                # content is essentially fixed as far as multiplierz is
                # concerned.

            if not local:
                logger_message(30, 'Downloading Mascot DAT File...')
                dat_file = self.mascot.download_dat(chosen_folder, mascot_id,
                                                    date)
            else:
                dat_file = os.path.abspath(local)
                mascot_id = os.path.basename(local).split('.')[0]

            if dat_file:
                logger_message(20, 'Mascot DAT File Downloaded!')
                mascot_dat_file = interface.MascotDatFile(
                    dat_file, **_mascot_options)

                if percolatorDirectory and mascot_dat_file.hasDecoyHits():
                    print "Running Mascot Percolator..."
                    mascot_dat_file.close()
                    percolatedDatFile = runPercolator(dat_file,
                                                      percolatorDirectory)
                    # reopen the percolated .dat (not the original) with the
                    # merged option dict rather than the raw argument
                    mascot_dat_file = interface.MascotDatFile(
                        percolatedDatFile, **_mascot_options)

                    if self.cleanup:
                        os.remove(dat_file)
                    dat_file = percolatedDatFile

            else:
                logger_message(
                    40, 'Failed to download DAT file for %s' % mascot_id)
                continue

            dat_version = mascot_dat_file.res_file.getMascotVer()[:len(self.mascot.version)]
            if self.mascot.version != dat_version:
                print("Mascot version mismatch detected; "
                      "changing version from %s to %s"
                      % (self.mascot.version, dat_version))
                self.mascot.version = dat_version

            if not combined_file:
                # Report column stuff moved from above, in order to handle version dependency.  (Heavy sigh.)
                # figuring out the report columns. start with defaults...
                repcols = mzReport.default_columns[:]

                # mascot 2.3 can have multiple databases so add a column for that
                if self.mascot.version >= '2.3':
                    repcols.insert(1, 'Protein Database')

                # these are the columns coming out of the dat file, need them separate
                res_cols = repcols[:]

                # add columns for peak extraction
                if peaks:
                    repcols.extend(c for c in [
                        'MS2 Time', 'Peak Time', 'Peak Intensity',
                        'Peak Width (sec)', 'Peak Comment'
                    ] if c not in repcols)
                    if _peaks_options[
                            'peak_area'] and 'Peak Area' not in repcols:
                        repcols.append('Peak Area')
                    if _peaks_options['reporter_ions']:
                        repcols.extend(
                            c
                            for c in ['Rep114', 'Rep115', 'Rep116', 'Rep117']
                            if c not in repcols)

                if mascot_prot_cov:
                    repcols.append('Protein Coverage')

            #Get MS File Name
            mascot_header = mascot_dat_file.mascot_header()

            ms_file_name = mascot_header[7][1] or ('F%s' % mascot_id)

            if not combined_file:
                filename = os.path.basename(ms_file_name)
                if mascotIDInResultName and filename.endswith('.mgf'):
                    filename = filename[:-4] + "." + mascot_id

                report_file = os.path.join(chosen_folder, filename + ext)

                if os.path.exists(report_file):
                    os.remove(report_file)

                report = mzReport.writer(report_file, columns=repcols)
                isMZD = isinstance(report, mzReport.mzDB.SQLiteWriter)

            if mascot_web and (_mascot_web_options['ms2_img']
                               or _mascot_web_options['mascot_var_mods']):
                gen_options = {}
                try:
                    gen_options['ms2_img'] = _mascot_web_options['ms2_img']
                except KeyError:
                    pass
                try:
                    gen_options['mascot_var_mods'] = _mascot_web_options[
                        'mascot_var_mods']
                except KeyError:
                    pass

                mascot_web_gen = self.mascot_web(
                    mascot_id,
                    date=date,
                    dat_file=(dat_file
                              if _mascot_web_options['mascot_ms2'] else None),
                    isMZD=isMZD,
                    **gen_options)
                next(mascot_web_gen)  # prime the generator to receive send()

            if mascot_prot_cov:
                prot_cov_gen = self.mascot_prot_coverage(
                    mascot_id, _mascot_options['ion_cutoff'], date)
                next(prot_cov_gen)

            prot_desc_dict = {}

            # self.mascot.version was synced to the dat file above, so this
            # re-check is a defensive guard
            dat_version = mascot_dat_file.res_file.getMascotVer()[:len(self.mascot.version)]
            if self.mascot.version != dat_version:
                raise TypeError("Incorrect version of Mascot selected. %s %s"
                                % (self.mascot.version, dat_version))

            missing_desc_count = 0
            for row in mascot_dat_file.peptide_report():
                row = mzReport.ReportEntry(res_cols, row)

                if rank_one and row['Peptide Rank'] != 1:
                    continue

                if (not local) and not (row['Protein Description']
                                        or row['Protein Mass']):
                    if row['Accession Number'] not in prot_desc_dict:
                        missing_desc_count += 1
                        # Very slow!
                        #prot_desc_dict[row['Accession Number']] = self.mascot.get_description(row['Accession Number'],
                        #row.get('protein database', '1').split('::')[0],
                        #mascot_id,
                        #date)
                    row['Protein Description'], row[
                        'Protein Mass'] = prot_desc_dict.get(
                            row['Accession Number'], ('-', '-'))

                md = []
                #if peaks:
                #(new_row, img_tuples) = peak_gen.send(row)
                #row.update(new_row)
                #md.extend(img_tuples)

                if mascot_web and (_mascot_web_options['ms2_img']
                                   or _mascot_web_options['mascot_var_mods']):
                    (vartext, img_tup) = mascot_web_gen.send(row)
                    if _mascot_web_options['mascot_var_mods']:
                        row['Variable Modifications'] = vartext
                    if _mascot_web_options['ms2_img']:
                        md.append(img_tup)

                if mascot_prot_cov:
                    (prot_cov, md_tup) = prot_cov_gen.send(row)
                    row['Protein Coverage'] = prot_cov
                    md.append(md_tup)

                if combined_file:
                    row['File'] = ms_file_name

                report.write(row, metadata=md)
            if missing_desc_count:
                print("Missing protein info for %d PSMs." % missing_desc_count)

            if peaks:
                # unreachable: the assert above forces peaks to be False, and
                # peak_gen's creation is commented out
                peak_gen.close()
            if mascot_web and (_mascot_web_options['ms2_img']
                               or _mascot_web_options['mascot_var_mods']):
                mascot_web_gen.close()
            if mascot_prot_cov:
                prot_cov_gen.close()

            # Mascot-decoy-data finder!
            if mascot_dat_file.hasDecoyHits():
                decoy_dat_file = interface.MascotDatFile(dat_file,
                                                         decoyMode=True,
                                                         **_mascot_options)
                for row in decoy_dat_file.peptide_report():
                    report.write(row)
                decoy_dat_file.close()

            if not combined_file:
                if os.path.splitext(report_file)[1].lower() in ('.xls',
                                                                '.xlsx',
                                                                '.mzd'):
                    report.close()

                    self.mascot_headers(report_file, [(None, mascot_header)])
                    if protein_report:
                        self.prot_report(report_file,
                                         mascot_dat_file.protein_report())
                else:
                    report.close()

                report_files.append(report_file)
            else:
                if os.path.splitext(report_file)[1].lower() in ('.xls',
                                                                '.xlsx',
                                                                '.mzd'):
                    mascot_headers.append((ms_file_name, mascot_header))

            mascot_dat_file.close()
            if self.cleanup and not local_dat_files:
                os.remove(dat_file)

            logger_message(
                30, 'Multiplierz-Mascot Report for JobID %s Generated!' %
                mascot_id)

        if combined_file:
            if os.path.splitext(report_file)[1].lower() in ('.xls', '.xlsx',
                                                            '.mzd'):
                report.close()

                self.mascot_headers(report_file, mascot_headers)
                # not supported right now: protein reports for XLS.
                if isMZD and protein_report:
                    self.prot_report(report_file, None)
            else:
                report.close()

        return [report_file] if combined_file else report_files
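
For reference, a hypothetical call matching the parameter comments at the top
of this method (the reporter object, IDs and paths are illustrative only):

# reports = reporter.legacy_get_reports(['123456', '123457:20200101'],
#                                       chosen_folder=r'C:\reports',
#                                       combined_file='merged_results.xlsx',
#                                       rank_one=True,
#                                       max_hits=500)  # keyword > option_dict > default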