def write_out_matrix_file(matrix_outfile, allruns, multipeptides, fraction_needed_selected,
                          style="none", write_requant=True, aligner_mscore_treshold=1.0):
    """Write a matrix with one row per transition group and columns per run.

    The header is "Peptide", "Protein", then for every run an
    "Intensity_<file>_<runid>" column (plus "RT_..." and/or "score_..."
    columns depending on ``style``), and finally "RT_mean", "RT_std" and
    "pg_pvalue".

    Args:
        matrix_outfile: Passed unchanged to ``getwriter`` to obtain the
            writer object (must offer ``write(value, color=...)`` and
            ``newline()``).
        allruns: Runs defining the per-run columns; each must provide
            ``get_id()`` and ``orig_filename``.
        multipeptides: Iterable of objects providing
            ``getPrecursorGroups()``, ``get_selected_peakgroups()`` and
            ``find_best_peptide_pg()``.
        fraction_needed_selected: Minimal fraction of the runs in
            ``allruns`` that must have a selected peakgroup for a row to be
            written.
        style: "none" writes intensities only; "RT" adds a retention time
            column per run, "score" adds a score column per run, "full"
            adds both.
        write_requant: If False, peakgroups whose FDR score is above 1.0
            are written as "NA" (presumably these are the requantified
            values -- TODO confirm against the producer of those scores).
        aligner_mscore_treshold: FDR score above which (but not above 1.0)
            a cell is written with color 'b' instead of 'd'.

    Raises:
        Exception: If a transition group has more than one selected
            peakgroup in the same run.
    """
    matrix_writer = getwriter(matrix_outfile)

    with_rt = style in ("RT", "full")
    with_score = style in ("score", "full")

    run_ids = [r.get_id() for r in allruns]
    header = ["Peptide", "Protein"]
    for r in allruns:
        fname = "%s_%s" % (os.path.basename(r.orig_filename), r.get_id())
        header.append("Intensity_%s" % fname)
        if with_rt:
            header.append("RT_%s" % fname)
        if with_score:
            header.append("score_%s" % fname)

    header.extend(["RT_mean", "RT_std", "pg_pvalue"])

    for column_name in header:
        matrix_writer.write(column_name)
    matrix_writer.newline()

    for multipep in multipeptides:

        # Retrieve all transition group ids available for this precursor group
        # and write one line per transition group id.
        trgr_ids = set(trgr.get_id()
                       for prgr in multipep.getPrecursorGroups() for trgr in prgr)
        for trgr_id in trgr_ids:

            # Collect the selected peakgroups of the current transition group
            # once and make sure no run contributes more than one of them.
            matching = [pg for pg in multipep.get_selected_peakgroups()
                        if pg.peptide.get_id() == trgr_id]
            # Deliberately NOT named "allruns": rebinding the parameter made
            # the fraction check below compare the selection with itself.
            selected_run_ids = [pg.peptide.run.get_id() for pg in matching]
            if len(selected_run_ids) != len(set(selected_run_ids)):
                raise Exception("Error when writing out matrix, found more than one peakgroup for a run %s" % selected_run_ids)

            selected_peakgroups = dict(zip(selected_run_ids, matching))

            # Skip empty lines or lines that have too few entries. The
            # fraction is relative to the number of input runs; the check is
            # skipped when there are no runs to avoid a division by zero.
            if not selected_peakgroups:
                continue
            if run_ids and (len(selected_peakgroups) / float(len(run_ids))
                            < fraction_needed_selected):
                continue

            # Write first two columns of the matrix
            matrix_writer.write(trgr_id)
            matrix_writer.write(multipep.find_best_peptide_pg().peptide.protein_name)

            # Write other columns (one to three per run, depending on style)
            rts = []
            for rid in run_ids:
                pg = selected_peakgroups.get(rid)

                # Suppress values with an FDR score above 1.0 if requested.
                if not write_requant and pg is not None and pg.get_fdr_score() > 1.0:
                    pg = None

                if pg is None:
                    matrix_writer.write("NA")
                    if with_rt:
                        matrix_writer.write("NA")
                    if with_score:
                        matrix_writer.write("NA")
                    continue

                # The color code is handed to the writer; how (or whether)
                # it is rendered depends on the writer implementation.
                if pg.get_fdr_score() > 1.0:
                    color = 'r'
                elif pg.get_fdr_score() > aligner_mscore_treshold:
                    color = 'b'
                else:
                    color = 'd'

                matrix_writer.write(pg.get_intensity(), color=color)
                if with_rt:
                    matrix_writer.write(pg.get_normalized_retentiontime(), color=color)
                if with_score:
                    matrix_writer.write(pg.get_fdr_score(), color=color)

                rts.append(pg.get_normalized_retentiontime())

            # The d_score is a z-score which is computed on the null / decoy
            # distribution which is (assumed) gaussian with u = 0, sigma = 1
            # -> we thus compute a p-value from the z-score and, assuming
            # independent measurements, we multiply the p-values to compute a
            # peakgroup p-value.
            # We use norm.sf (1-cdf) on the vector of z-scores.
            # Note: this uses every selected peakgroup, including those that
            # were written as "NA" because of write_requant=False.
            dscores = [float(pg.get_dscore()) for pg in selected_peakgroups.values()
                       if pg.get_dscore() is not None]
            pvalue = numpy.prod(scipy.stats.norm.sf(dscores))

            for summary_value in [numpy.mean(rts), numpy.std(rts), pvalue]:
                matrix_writer.write(summary_value)
            matrix_writer.newline()

    # Drop the reference to the writer; presumably it flushes/closes its
    # output on destruction -- TODO confirm against the writer class.
    del matrix_writer
# Example no. 2
# 0
def write_out_matrix_file(matrix_outfile,
                          allruns,
                          multipeptides,
                          fraction_needed_selected,
                          style="none",
                          write_requant=True,
                          aligner_mscore_treshold=1.0):
    """Write a matrix with one row per transition group and columns per run.

    The header is "Peptide", "Protein", then for every run an
    "Intensity_<file>_<runid>" column (plus "RT_..." and/or "score_..."
    columns depending on ``style``), and finally "RT_mean", "RT_std" and
    "pg_pvalue".

    Args:
        matrix_outfile: Passed unchanged to ``getwriter`` to obtain the
            writer object (must offer ``write(value, color=...)`` and
            ``newline()``).
        allruns: Runs defining the per-run columns; each must provide
            ``get_id()`` and ``orig_filename``.
        multipeptides: Iterable of objects providing
            ``getPrecursorGroups()``, ``get_selected_peakgroups()`` and
            ``find_best_peptide_pg()``.
        fraction_needed_selected: Minimal fraction of the runs in
            ``allruns`` that must have a selected peakgroup for a row to be
            written.
        style: "none" writes intensities only; "RT" adds a retention time
            column per run, "score" adds a score column per run, "full"
            adds both.
        write_requant: If False, peakgroups whose FDR score is above 1.0
            are written as "NA" (presumably these are the requantified
            values -- TODO confirm against the producer of those scores).
        aligner_mscore_treshold: FDR score above which (but not above 1.0)
            a cell is written with color 'b' instead of 'd'.

    Raises:
        Exception: If a transition group has more than one selected
            peakgroup in the same run.
    """
    matrix_writer = getwriter(matrix_outfile)

    with_rt = style in ("RT", "full")
    with_score = style in ("score", "full")

    run_ids = [r.get_id() for r in allruns]
    header = ["Peptide", "Protein"]
    for r in allruns:
        fname = "%s_%s" % (os.path.basename(r.orig_filename), r.get_id())
        header.append("Intensity_%s" % fname)
        if with_rt:
            header.append("RT_%s" % fname)
        if with_score:
            header.append("score_%s" % fname)

    header.extend(["RT_mean", "RT_std", "pg_pvalue"])

    for column_name in header:
        matrix_writer.write(column_name)
    matrix_writer.newline()

    for multipep in multipeptides:

        # Retrieve all transition group ids available for this precursor group
        # and write one line per transition group id.
        trgr_ids = set(
            trgr.get_id()
            for prgr in multipep.getPrecursorGroups()
            for trgr in prgr
        )
        for trgr_id in trgr_ids:

            # Collect the selected peakgroups of the current transition group
            # once and make sure no run contributes more than one of them.
            matching = [
                pg for pg in multipep.get_selected_peakgroups()
                if pg.peptide.get_id() == trgr_id
            ]
            # Deliberately NOT named "allruns": rebinding the parameter made
            # the fraction check below compare the selection with itself.
            selected_run_ids = [pg.peptide.run.get_id() for pg in matching]
            if len(selected_run_ids) != len(set(selected_run_ids)):
                raise Exception(
                    "Error when writing out matrix, found more than one peakgroup for a run %s"
                    % selected_run_ids)

            selected_peakgroups = dict(zip(selected_run_ids, matching))

            # Skip empty lines or lines that have too few entries. The
            # fraction is relative to the number of input runs; the check is
            # skipped when there are no runs to avoid a division by zero.
            if not selected_peakgroups:
                continue
            if run_ids and (len(selected_peakgroups) / float(len(run_ids)) <
                            fraction_needed_selected):
                continue

            # Write first two columns of the matrix
            matrix_writer.write(trgr_id)
            matrix_writer.write(
                multipep.find_best_peptide_pg().peptide.protein_name)

            # Write other columns (one to three per run, depending on style)
            rts = []
            for rid in run_ids:
                pg = selected_peakgroups.get(rid)

                # Suppress values with an FDR score above 1.0 if requested.
                if (not write_requant and pg is not None
                        and pg.get_fdr_score() > 1.0):
                    pg = None

                if pg is None:
                    matrix_writer.write("NA")
                    if with_rt:
                        matrix_writer.write("NA")
                    if with_score:
                        matrix_writer.write("NA")
                    continue

                # The color code is handed to the writer; how (or whether)
                # it is rendered depends on the writer implementation.
                if pg.get_fdr_score() > 1.0:
                    color = 'r'
                elif pg.get_fdr_score() > aligner_mscore_treshold:
                    color = 'b'
                else:
                    color = 'd'

                matrix_writer.write(pg.get_intensity(), color=color)
                if with_rt:
                    matrix_writer.write(pg.get_normalized_retentiontime(),
                                        color=color)
                if with_score:
                    matrix_writer.write(pg.get_fdr_score(), color=color)

                rts.append(pg.get_normalized_retentiontime())

            # The d_score is a z-score which is computed on the null / decoy
            # distribution which is (assumed) gaussian with u = 0, sigma = 1
            # -> we thus compute a p-value from the z-score and, assuming
            # independent measurements, we multiply the p-values to compute a
            # peakgroup p-value.
            # We use norm.sf (1-cdf) on the vector of z-scores.
            # Note: this uses every selected peakgroup, including those that
            # were written as "NA" because of write_requant=False.
            dscores = [
                float(pg.get_dscore())
                for pg in selected_peakgroups.values()
                if pg.get_dscore() is not None
            ]
            pvalue = numpy.prod(scipy.stats.norm.sf(dscores))

            for summary_value in [numpy.mean(rts), numpy.std(rts), pvalue]:
                matrix_writer.write(summary_value)
            matrix_writer.newline()

    # Drop the reference to the writer; presumably it flushes/closes its
    # output on destruction -- TODO confirm against the writer class.
    del matrix_writer