Exemplo n.º 1
0
def analyzeMali(mali, options, prefix_row=""):
    """Write one line of occupancy statistics for a multiple alignment.

    For every row (``entry.mString`` of ``mali.values()``) and every column
    (``mali.getColumns()``) a ``Mali.MaliData`` object is built with
    ``options.gap_chars`` and ``options.mask_chars``; its ``mNChars``
    attribute is the per-row / per-column count that is summarised.

    The line written to ``options.stdout`` consists of `prefix_row`
    followed by eight tab-separated fields and a newline:

    * mean of the column counts, the same as a percentage of
      ``mali.getLength()``, median of the column counts, the same as a
      percentage of ``mali.getLength()``;
    * the same four values for the row counts, as percentages of
      ``mali.getWidth()``.

    Medians are printed as integers when they are whole numbers and with
    one decimal otherwise.

    Returns ``True`` when a line was written and ``False`` when there are
    no rows or no columns to analyse (nothing is written in that case).

    Raises ``ValueError`` when ``len(mali)`` is zero.
    """

    if len(mali) == 0:
        # A string cannot be raised as an exception; use a real exception
        # type so that the message actually reaches the caller.
        raise ValueError("not analyzing empty multiple alignment")

    # Build real lists (not lazy iterators) because they are measured with
    # len()/truthiness and iterated more than once below.
    row_data = [
        Mali.MaliData(entry.mString, options.gap_chars, options.mask_chars)
        for entry in mali.values()
    ]
    col_data = [
        Mali.MaliData(column, options.gap_chars, options.mask_chars)
        for column in mali.getColumns()
    ]

    if not row_data or not col_data:
        return False

    if options.loglevel >= 2:
        for row in row_data:
            options.stdlog.write("# row: %s\n" % str(row))
        for col in col_data:
            options.stdlog.write("# col: %s\n" % str(col))

    def format_occupancy(data, total):
        """Return 'mean, mean %, median, median %' for the mNChars of data."""
        counts = [item.mNChars for item in data]
        mean = scipy.mean(counts)
        median = scipy.median(counts)
        # whole-number medians are shown without decimals
        if float(int(median)) == median:
            median_format = "%i"
        else:
            median_format = "%5.1f"
        template = "%5.2f\t%5.2f\t" + median_format + "\t%5.2f"
        return template % (mean, 100.0 * mean / total, median,
                           100.0 * median / total)

    options.stdout.write(prefix_row)

    # column occupancy, relative to the value of mali.getLength()
    options.stdout.write(format_occupancy(col_data, mali.getLength()))

    # row occupancy, relative to the value of mali.getWidth()
    options.stdout.write("\t" + format_occupancy(row_data, mali.getWidth()))

    options.stdout.write("\n")

    return True
Exemplo n.º 2
0
def ProcessResult(result, options, mali=None, prefix=None, p_value=None):
    """Write a report about `result` to ``options.stdout``.

    The kind of report is selected by ``options.method``:

    ``summary-slr``
        One tab-separated line with tree length, omega, kappa, log
        likelihood, number of sites, synonymous sites, gap plus
        single-character sites, and the positive and negative site counts
        at the thresholds "95%", "99%", "95% corrected", "99% corrected".

    ``summary-filtered``
        One tab-separated line of counts. A site is counted as filtered
        when the first alignment column of its codon has ``mNChars``
        different from ``mali.getLength()``; the remaining sites are
        classified with ``isPositive`` / ``isNegative`` / ``isSynonymous``.

    ``positive-site-table``, ``negative-site-table``, ``neutral-site-table``
        One line per alignment row listing all selected sites.

    ``positive-site-list``, ``negative-site-list``, ``neutral-site-list``
        One line per alignment row and selected site.

    Any other method writes nothing. ``options.stdout`` is flushed in
    every case.

    `prefix`, when given, is written as first column of each output line.
    `p_value` is reported in the site tables ("na" when it is ``None``).

    Returns the ``Result(nfiltered, ntotal, nsynonymous, nnegative,
    npositive)`` object for ``summary-filtered`` and ``None`` otherwise.

    Raises ``ValueError`` if a method that needs the alignment is used
    without `mali`, or if the alignment width is not three times the
    number of sites.
    """

    counts = None
    method = options.method

    site_table_methods = ("positive-site-table", "negative-site-table",
                          "neutral-site-table")
    site_list_methods = ("positive-site-list", "negative-site-list",
                         "neutral-site-list")

    if method == "summary-slr":

        thresholds = ("95%", "99%", "95% corrected", "99% corrected")

        if prefix:
            options.stdout.write("%s\t" % prefix)

        options.stdout.write("%5.2f\t%5.2f\t%5.2f\t%6.4f\t%i\t%i\t%i\t" % (
            result.mTreeLength,
            result.mOmega,
            result.mKappa,
            result.mLogLikelihood,
            len(result.mSites),
            result.mNSitesSynonymous,
            result.mNSitesGaps + result.mNSitesSingleChar,
        ))
        options.stdout.write("\t".join(
            ["%i" % result.mNPositiveSites[t][0] for t in thresholds]))
        options.stdout.write("\t")
        options.stdout.write("\t".join(
            ["%i" % result.mNNegativeSites[t] for t in thresholds]))
        options.stdout.write("\n")

    elif (method == "summary-filtered" or method in site_table_methods
          or method in site_list_methods):

        if mali is None:
            raise ValueError(
                "method '%s' requires a multiple alignment" % method)

        mali_length = mali.getLength()
        mali_width = mali.getWidth()
        # a real list is needed: columns are accessed by index below
        column_data = [
            Mali.MaliData(column, gap_chars="Nn", mask_chars="-.")
            for column in mali.getColumns()
        ]

        # sanity check: do lengths of mali and # of sites correspond
        if len(result.mSites) * 3 != mali_width:
            raise ValueError(
                "mali (%i) and # of sites (%i) do not correspond." %
                (mali_width, len(result.mSites)))

        if method == "summary-filtered":
            # count sites, but filter with multiple alignment
            ntotal = 0
            npositive = 0
            nnegative = 0
            nfiltered = 0
            nsynonymous = 0

            if prefix:
                options.stdout.write("%s\t" % prefix)

            for index, site in enumerate(result.mSites):
                # each site spans three alignment columns; the first one
                # is used for the filter
                column = column_data[index * 3]

                if column.mNChars != mali_length:
                    nfiltered += 1
                    continue

                if site.isPositive(options.significance_threshold,
                                   options.use_adjusted):
                    npositive += 1
                elif site.isNegative(options.significance_threshold,
                                     options.use_adjusted):
                    nnegative += 1

                if site.isSynonymous():
                    nsynonymous += 1

                ntotal += 1

            options.stdout.write(
                "%5.2f\t%5.2f\t%5.2f\t%6.4f\t%i\t%i\t%i\t%i\t%i\t%i\n" %
                (result.mTreeLength, result.mOmega, result.mKappa,
                 result.mLogLikelihood, len(result.mSites), nfiltered, ntotal,
                 nsynonymous, nnegative, npositive))
            counts = Result(nfiltered, ntotal, nsynonymous, nnegative,
                            npositive)

        else:
            # one of the site table / site list methods
            select_positive_sites = method in ("positive-site-table",
                                               "positive-site-list")
            select_negative_sites = method in ("negative-site-table",
                                               "negative-site-list")
            # NOTE(review): for the neutral-site-* methods neither flag is
            # set, so no site is ever selected and only empty rows are
            # reported - confirm whether neutral sites should be selected.

            identifiers = mali.getIdentifiers()
            chars_per_row = [[] for _ in range(mali_length)]

            # collect (site index, site) for all sites under selection
            sites = []
            for col, site in enumerate(result.mSites):

                if column_data[col * 3].mNChars != mali_length:
                    continue

                is_selected = (
                    (select_positive_sites and site.isPositive(
                        options.significance_threshold, options.use_adjusted))
                    or
                    (select_negative_sites and site.isNegative(
                        options.significance_threshold, options.use_adjusted)))

                if is_selected:
                    sites.append((col, site))

            # reported count is taken before any truncation
            nsites = len(sites)

            if options.truncate_sites_list:
                # truncate sites list, sort by significance
                sites.sort(key=lambda entry: entry[1].mPValue)
                sites = sites[:options.truncate_sites_list]

            for col, _ in sites:

                xcol = col * 3
                # window of context_size codons on either side, clipped to
                # the alignment
                start = max(xcol - options.context_size * 3, 0)
                end = min(xcol + 3 + options.context_size * 3, mali_width)

                for row in range(mali_length):
                    identifier = identifiers[row]
                    segment = mali[identifier][start:end]
                    codon = mali[identifier][xcol:xcol + 3]
                    # nucleotide position -> codon position (floor division)
                    pos = mali.getResidueNumber(identifier, xcol) // 3

                    # save as real-world coordinates
                    chars_per_row[row].append(
                        PositionInformation(
                            Genomics.MapCodon2AA(codon), pos + 1, xcol,
                            Genomics.TranslateDNA2Protein(segment).upper()))

            if p_value is not None:
                pp_value = p_value
            else:
                pp_value = "na"

            if method in site_table_methods:

                if options.context_size:
                    def describe(info):
                        return "%s%i in %s" % (
                            info.mAA, info.mSequencePosition, info.mContext)
                else:
                    def describe(info):
                        return "%s%i" % (info.mAA, info.mSequencePosition)

                for row in range(mali_length):
                    if prefix:
                        options.stdout.write("%s\t" % prefix)

                    options.stdout.write(
                        "%s\t%i\t%s\t%s\n" %
                        (identifiers[row], nsites, pp_value, ";".join(
                            [describe(info) for info in chars_per_row[row]])))

            else:
                # site list methods

                for row in range(mali_length):

                    if prefix:
                        xprefix = "%s\t%s" % (prefix, identifiers[row])
                    else:
                        xprefix = "%s" % (identifiers[row])

                    for rank, info in enumerate(chars_per_row[row], 1):
                        options.stdout.write(
                            "%s\t%i\t%s\t%i\t%i\t%s\n" %
                            (xprefix, rank, info.mAA, info.mSequencePosition,
                             info.mMaliPosition, info.mContext))

    options.stdout.flush()

    return counts
Exemplo n.º 3
0
def selectPositiveSites(results, selection_mode, options, mali=None):
    """Return the positively selected sites found in `results`.

    For every result and every model in ``options.models`` the positive
    sites of the NEB and/or BEB analysis (depending on
    ``options.analysis``) are collected if they pass the probability and
    omega filter. How the per-model sets are combined depends on
    `selection_mode`:

    'all': union - all positive sites are returned.
    'consistent': intersection - only sites positive in all models and runs.
    'emes': only sites with probability >= 0.5 in all models and runs and
        a maximum probability > 0.9 in at least one of them.

    For modes other than 'emes' the probability threshold is
    ``options.filter_probability``; the omega threshold is always
    ``options.filter_omega``.

    If `mali` is given and ``options.filter_mali`` is "gaps", sites whose
    alignment column does not have ``mNChars`` equal to
    ``mali.getLength()`` are removed.

    Returns a tuple ``(total_sites, max_per_site)``: the set of selected
    residue numbers and a dictionary mapping each residue that passed the
    filter at least once to its maximum probability.
    """

    # filter function; the EMES mode starts from a lenient threshold
    if selection_mode == "emes":
        min_probability = 0.5
    else:
        min_probability = options.filter_probability

    def passes_filter(site):
        return (site.mProbability >= min_probability
                and site.mOmega >= options.filter_omega)

    # maximum significance per site (for emes)
    max_per_site = {}

    total_sites = set()

    first = True

    for result in results:

        for model in options.models:

            sites = result.mSites[model]

            # residues passing the filter in this model/run
            selected = set()
            for analysis, attribute in (("neb", "mNEB"), ("beb", "mBEB")):
                if analysis not in options.analysis:
                    continue
                for site in getattr(sites, attribute).mPositiveSites:
                    if not passes_filter(site):
                        continue
                    selected.add(site.mResidue)
                    max_per_site[site.mResidue] = max(
                        site.mProbability,
                        max_per_site.get(site.mResidue, 0))

            if first:
                total_sites = selected
                first = False
            elif selection_mode == "all":
                total_sites = total_sites.union(selected)
            elif selection_mode in ("consistent", "emes"):
                total_sites = total_sites.intersection(selected)

    if selection_mode == "emes":
        if options.loglevel >= 2:
            options.stdlog.write(
                "# before EMES filtering %i positive sites: mode %s, P>%5.2f\n"
                % (len(total_sites), selection_mode, 0.5))

        # filter according to emes: maximum significance larger than 0.9
        total_sites = set(x for x in total_sites if max_per_site[x] > 0.9)

        if options.loglevel >= 2:
            options.stdlog.write(
                "# after EMES filtering %i positive sites: mode %s, P>%5.2f\n"
                % (len(total_sites), selection_mode, 0.9))

    else:
        if options.loglevel >= 2:
            options.stdlog.write(
                "# extracted %i positive sites: mode %s, P>%5.2f\n" %
                (len(total_sites), selection_mode, options.filter_probability))

    if mali and options.filter_mali:
        if options.filter_mali == "gaps":
            nfiltered = 0
            mali_length = mali.getLength()

            # a real list is needed: columns are accessed by index below
            column_data = [
                Mali.MaliData(column, gap_chars="Nn", mask_chars="-.")
                for column in mali.getColumns()
            ]
            new_sites = set()

            for x in total_sites:

                # PAML uses one-based coordinates
                column = column_data[x - 1]

                if column.mNChars != mali_length:
                    nfiltered += 1
                    if options.loglevel >= 3:
                        options.stdlog.write(
                            "# rejected position %i due to mali\n" % x)
                    continue

                new_sites.add(x)

            total_sites = new_sites

        if options.loglevel >= 2:
            options.stdlog.write("# after MALI filtering %i positive sites\n" %
                                 (len(total_sites)))

    return total_sites, max_per_site