Exemplo n.º 1
0
def main(argv=None):
    """Group alignment rows read from stdin by identifier and write profiles.

    Every input line that does not start with ``#`` must consist of four
    tab-separated fields: ``start``, ``ali``, ``end`` and ``id``.  Runs of
    consecutive lines sharing the same ``id`` are added to one
    ``Mali.SequenceCollection``.  The collection is named after the
    identifier and written to ``sys.stdout`` with ``format="profile"``
    whenever the identifier changes, and once more at end of input.

    A summary of the counters is written to ``options.stdlog`` when
    ``options.loglevel >= 1``.

    Parameters
    ----------
    argv : list, optional
        Command line arguments; ``sys.argv`` is used when omitted.
        NOTE(review): *argv* is not forwarded to ``E.Start`` here --
        confirm whether that call should receive it.

    Raises
    ------
    ValueError
        If a data line does not split into exactly four fields.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: malis2profiles.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.set_defaults()

    (options, args) = E.Start(parser)

    mali = Mali.SequenceCollection()
    last_id = None
    # nskipped is part of the summary line, but nothing below skips a record.
    ninput, noutput, nskipped = 0, 0, 0

    for line in sys.stdin:
        if line.startswith("#"):
            continue

        # Strip only the line terminator.  Unconditionally slicing off the
        # last character would truncate the identifier of a final line that
        # has no trailing newline.
        start, ali, end, seq_id = line.rstrip("\n").split("\t")
        ninput += 1

        if seq_id != last_id:
            # The identifier changed: flush the collection built so far
            # (if any) and start a fresh one.
            if last_id:
                mali.setName(last_id)
                mali.writeToFile(sys.stdout, format="profile")
                noutput += 1
            mali = Mali.SequenceCollection()
            last_id = seq_id

        mali.addSequence(seq_id, start, end, ali)

    # Flush the final group, which has no following identifier change.
    if last_id:
        mali.setName(last_id)
        mali.writeToFile(sys.stdout, format="profile")
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i.\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
Exemplo n.º 2
0
        xrate_min_increment=0.000001,
        with_rho=True,
        separator="|",
        single_omega=False,
        shared_frequencies=False,
        shared_rates=False,
        block_size=None,
        replicates=None,
    )

    (options, args) = Experiment.Start(parser)

    if options.replicates != None:
        # read a sequence collection with possible duplicate names
        # used for benchmarking
        mali = Mali.SequenceCollection()
    else:
        mali = Mali.Mali()

    mali.readFromFile(sys.stdin, format=options.input_format)

    options.stdout.write(
        "seq1\tseq2\tdN\tdS\tdNdS\tN\tS\tdN_err\tdS_err\tkappa\tlnL\ttau\tlen")

    if options.with_rho:
        options.stdout.write("\trN\trS\tt\trN0\trS0\tt0")

    options.stdout.write("\terror_str\n")

    if options.replicates != None:
        ids = mali.getIdentifiers()
Exemplo n.º 3
0
def main(argv=None):
    """Script entry point: compute pairwise rates for an alignment on stdin.

    Builds the option parser, reads a multiple alignment from ``sys.stdin``
    in ``options.input_format``, writes a tab-separated header line to
    ``options.stdout`` and hands the alignment to ``processMali``.

    When ``--replicates`` is given, the identifiers of the input are split
    into consecutive groups of ``len(ids) // replicates`` entries and every
    group is processed as its own ``Mali.Mali``.

    Parameters
    ----------
    argv : list, optional
        Command line arguments; ``sys.argv`` is used when omitted.
        NOTE(review): *argv* is not forwarded to ``E.Start`` here --
        confirm whether that call should receive it.

    Raises
    ------
    ValueError
        If the alignment is empty, or if the number of sequences is not a
        multiple of ``--replicates``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2kaks.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("--set-omega",
                      dest="omega",
                      type="float",
                      help="initial omega value.")

    parser.add_option("--set-kappa",
                      dest="kappa",
                      type="float",
                      help="initial kappa value.")

    parser.add_option("--fix-kappa",
                      dest="fix_kappa",
                      action="store_true",
                      help="do not estimate kappa.")

    parser.add_option("--fix-omega",
                      dest="fix_omega",
                      action="store_true",
                      help="do not estimate omega.")

    parser.add_option("--set-codon-frequencies",
                      dest="codon_frequencies",
                      type="choice",
                      choices=("uniform", "fequal", "f3x4", "f1x4", "f61"),
                      help="set codon frequencies.")

    parser.add_option("--set-method",
                      dest="paml_method",
                      type="int",
                      help="set paml optimization method [0|1].")

    parser.add_option("--set-sequence-type",
                      dest="seqtype",
                      type="choice",
                      choices=("codon", "aa", "trans"),
                      help="sequence type.")

    parser.add_option(
        "--set-clean-data",
        dest="clean_data",
        type="choice",
        choices=("0", "1"),
        help=
        "PAML should cleanup data:  0=only gaps within pair are removed, 1=columns in the mali with gaps are removed."
    )

    parser.add_option("--dump",
                      dest="dump",
                      action="store_true",
                      help="dump raw output [%default].")

    parser.add_option("--set-optimization-threshold",
                      dest="optimization_threshold",
                      type="string",
                      help="set paml optimization threshold [%default].")

    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm",
                               "phylip"),
                      help="input format of multiple alignment [%default].")

    parser.add_option("--pairwise",
                      dest="pairwise",
                      action="store_true",
                      help="force pairwise comparison [%default].")

    parser.add_option("--iteration",
                      dest="iteration",
                      type="choice",
                      choices=("all-vs-all", "first-vs-all", "pairwise",
                               "tree"),
                      help="iteration mode [%default].")

    parser.add_option(
        "--no-clean",
        dest="clean_mali",
        action="store_false",
        help=
        "do not clean multiple alignment before submitting to codeml. It might take too long for very large sequences."
    )

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("paml", "xrate"),
                      help="choose method for rate computation [%default]")

    parser.add_option("--xrate-model",
                      dest="xrate_model",
                      type="choice",
                      choices=("f3x4-two", "f3x4-four", "sn", "akaksgc",
                               "ef3x4-four", "f3x4-fourproducts"),
                      help="models to use [%default].")

    parser.add_option("-w",
                      "--write",
                      dest="write",
                      type="choice",
                      action="append",
                      choices=("input_fixed", "trained_fixed",
                               "input_variable", "trained_variable", "all"),
                      help="output sections to write [%default].")

    parser.add_option("-o",
                      "--output-pattern",
                      dest="output_pattern",
                      type="string",
                      help="output pattern for output files [%default].")

    # The next two pairs of options write to the same destination; the
    # later flag on the command line wins.
    parser.add_option("--xrate-insert-frequencies",
                      dest="xrate_insert_frequencies",
                      action="store_true",
                      help="estimate codon frequencies from input [%default].")

    parser.add_option("--xrate-uniform-frequencies",
                      dest="xrate_insert_frequencies",
                      action="store_false",
                      help="use uniform codon frequencies [%default].")

    parser.add_option("--xrate-fix-frequencies",
                      dest="xrate_fix_frequencies",
                      action="store_true",
                      help="set initial frequencies to const [%default].")

    parser.add_option("--xrate-estimate-frequencies",
                      dest="xrate_fix_frequencies",
                      action="store_false",
                      help="estimate nucleotide frequencies [%default].")

    parser.add_option(
        "--xrate-fix-rates",
        dest="fix_rates",
        type="string",
        help=
        """fix rates to specified values. Note that the number of rates has to match the ones
in the model. Provide values in a comma-separated list [%default].""")

    parser.add_option(
        "--xrate-min-increment",
        dest="xrate_min_increment",
        type="float",
        help="minimum increment to stop iteration in xrate [%default].")

    parser.add_option(
        "--min-overlap",
        dest="min_overlap",
        type="int",
        help="minimum overlap between a sequence pair in residues [%default].")

    parser.add_option(
        "--with-rho",
        dest="with_rho",
        action="store_true",
        help=
        "output rho values (substitution rates per codon). This requires a patched version of PAML [%default]."
    )

    parser.add_option(
        "--with-counts",
        dest="with_counts",
        action="store_true",
        help=
        "output counts of aligned positions, transitions and transversions [%default]."
    )

    parser.add_option("--remove-stops",
                      dest="remove_stops",
                      action="store_true",
                      help="remove stop codons [%default].")

    parser.add_option(
        "--replicates",
        dest="replicates",
        type="int",
        help="in benchmarking mode expect ## replicates [%default].")

    parser.add_option("--tree",
                      dest="tree",
                      type="string",
                      help="use tree for estimation [%default].")

    parser.set_defaults(
        input_format="fasta",
        omega=None,
        codon_frequencies=None,
        paml_method=None,
        optimization_threshold=None,
        seqtype="codon",
        dump=False,
        clean_data=False,
        min_overlap=60,
        gap_chars="-.",
        mask_chars="nN",
        pairwise=False,
        kappa=None,
        fix_kappa=False,
        fix_omega=False,
        clean_mali=True,
        method="paml",
        report_step=1000,
        loglevel=1,
        xrate_insert_frequencies=False,
        xrate_fix_frequencies=False,
        write=[],
        output_pattern="%s.eg",
        value_format="%6.4f",
        fix_rates=None,
        xrate_from_parameters=False,
        xrate_model="f3x4-four",
        with_rho=False,
        with_counts=False,
        iteration="all-vs-all",
        remove_stops=False,
        xrate_min_increment=0.000001,
        replicates=None,
        tree=None,
    )

    (options, args) = E.Start(parser)

    if options.method == "xrate":
        # imports for xrate computation; kept local so that the optional
        # packages are only required when xrate is actually selected.
        from XGram.Generator.Prebuilt import Codons
        from XGram.Model import Annotation
        import XGram.Run
        import Bio.Data.CodonTable

        # paml like estimation using xrate
        if options.codon_frequencies == "uniform":
            options.xrate_fix_frequencies = True
            options.xrate_insert_frequencies = False
        elif options.codon_frequencies == "f3x4":
            options.xrate_fix_frequencies = True
            options.xrate_insert_frequencies = True
    elif options.method == "paml":
        if not options.codon_frequencies:
            options.codon_frequencies = "F3X4"

    if options.fix_rates:
        # Materialise as a list: a bare map() is a one-shot iterator on
        # Python 3 and could be consumed only once downstream.
        options.fix_rates = [float(x) for x in options.fix_rates.split(",")]

    if options.pairwise or options.replicates:
        # read sequences, but not as a multiple alignment. This permits
        # multiple names.
        mali = Mali.SequenceCollection()
    else:
        mali = Mali.Mali()

    mali.readFromFile(sys.stdin, format=options.input_format)

    E.info("read multiple alignment")

    if mali.getLength() == 0:
        # String exceptions are not valid; raise a real exception instead.
        raise ValueError("refusing to process empty alignment.")

    ################################################################
    ################################################################
    ################################################################
    ## setup methods
    ################################################################

    options.stdout.write(
        "seq1\tseq2\tdN\tdS\tdNdS\tN\tS\tdN_err\tdS_err\tkappa\tlnL\ttau")

    if options.with_rho:
        options.stdout.write("\trN\trS\tt\trN0\trS0\tt0")

    if options.with_counts:
        options.stdout.write("\t%s" % Genomics.SequencePairInfo().getHeader())

    options.stdout.write("\terror_str\n")

    if options.replicates is not None:
        ids = mali.getIdentifiers()
        # Explicit check rather than assert, which is stripped under -O.
        if len(ids) % options.replicates != 0:
            raise ValueError(
                "number of sequences (%i) is not a multiple of the number "
                "of replicates (%i)" % (len(ids), options.replicates))
        # Integer division: the group size is used as a range() step.
        group_size = len(ids) // options.replicates
        for x in range(0, len(ids), group_size):
            m = Mali.Mali()
            for identifier in ids[x:x + group_size]:
                m.addEntry(mali.getEntry(identifier))
            processMali(m, options)
    else:
        processMali(mali, options)

    E.Stop()
Exemplo n.º 4
0
def main(argv=sys.argv):
    """Script entry point: apply a chain of edits to a multiple alignment.

    Reads a multiple alignment from ``options.stdin``, applies every method
    named in the comma-separated ``--method`` option in order, and writes
    the result to ``options.stdout`` in ``options.output_format``.

    Methods that need arguments take them from the front of the
    comma-separated ``--parameters`` list and remove what they consumed, so
    parameters must be supplied in the same order as the methods using them.

    Parameters
    ----------
    argv : list, optional
        Command line arguments.  NOTE(review): *argv* is not referenced in
        this function body -- confirm whether ``E.Start`` should receive it.

    Raises
    ------
    ValueError
        If the alignment read from input contains no entries.
    """

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2mali.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-i",
        "--input-format",
        dest="input_format",
        type="choice",
        choices=("plain", "fasta", "clustal", "stockholm", "phylip"),
        help="input format of multiple alignment [default=%default].")

    parser.add_option(
        "-o",
        "--output-format",
        dest="output_format",
        type="choice",
        choices=("plain", "fasta", "stockholm", "phylip", "nexus",
                 "plain-fasta"),
        help="output format of multiple alignment [default=%default].")

    parser.add_option(
        "--with-ranges",
        dest="with_ranges",
        action="store_true",
        help=
        "output alignment ranges (suffix /from-to after identifier) [default=%default]."
    )

    parser.add_option(
        "--without-ranges",
        dest="with_ranges",
        action="store_false",
        help=
        "do not output alignment ranges (suffix /from-to after identifier) [default=%default]."
    )

    parser.add_option("-u",
                      "--allow-duplicates",
                      dest="allow_duplicates",
                      action="store_true",
                      help="permit duplicate entries [default=%default].")

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="string",
        help=
        """methods to apply. Several methods can be specified in a ','-separated list [default=%default]."""
    )

    parser.add_option(
        "-p",
        "--parameters",
        dest="parameters",
        type="string",
        help="parameter stack for methods that require one [default=%default]."
    )

    parser.add_option(
        "-a",
        "--mask-char",
        dest="mask_char",
        type="string",
        help="character to identify/set masked characters [default=%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        methods="",
        parameters="",
        mask_char="x",
        gap_chars="-.nN",
        with_ranges=True,
        allow_duplicates=False,
    )

    (options, args) = E.Start(parser)

    options.methods = options.methods.split(",")
    options.parameters = options.parameters.split(",")

    # 1. read multiple alignment in various formats
    if options.allow_duplicates:
        mali = Mali.SequenceCollection()
    else:
        mali = Mali.Mali()

    t1 = time.time()

    mali.readFromFile(options.stdin, format=options.input_format)

    E.info("read mali with %i entries in %i seconds." %
           (len(mali), time.time() - t1))

    if len(mali) == 0:
        raise ValueError("empty multiple alignment")

    # 2. apply the requested methods in order
    for method in options.methods:

        t1 = time.time()

        if method == "remove-unaligned-ends":
            mali.removeUnalignedEnds()
        elif method == "remove-end-gaps":
            mali.removeEndGaps()
        elif method == "remove-all-gaps":
            mali.removeGaps(minimum_gaps=len(mali))
        elif method == "remove-any-gaps":
            mali.removeGaps(minimum_gaps=1)
        elif method == "remove-some-gaps":
            minimum_gaps = int(options.parameters[0])
            del options.parameters[0]
            mali.removeGaps(minimum_gaps=minimum_gaps)
        elif method == "remove-empty-sequences":
            mali.removeEmptySequences()
        elif method == "upper":
            mali.upperCase()
        elif method == "lower":
            mali.lowerCase()
        elif method == "mark-codons":
            mali.markCodons()
        elif method == "remove-stops":
            mali.removePattern(lambda x: x.upper() in ("TAG", "TAA", "TGA"),
                               allowed_matches=0,
                               minimum_matches=1,
                               delete_frame=3,
                               search_frame=3)
        elif method == "shift-alignment":
            # The returned mapping is built before the file is closed.
            with open(options.parameters[0], "r") as infile:
                map_id2offset = IOTools.ReadMap(infile,
                                                map_functions=(str, int))
            del options.parameters[0]
            mali.shiftAlignment(map_id2offset)
        elif method == "propagate-masks":
            mali.propagateMasks(mask_char=options.mask_char)

        elif method == "recount":
            mali.recount()

        elif method in ("mark-transitions", "filter-odd-transitions",
                        "filter-even-transitions", "keep-even-segments",
                        "keep-odd-segments"):

            # The parameter is either a file with per-sequence transitions
            # or a ':'-separated list of positions stored under the key
            # "mali".
            if os.path.exists(options.parameters[0]):
                with open(options.parameters[0], "r") as infile:
                    map_id2transitions = IOTools.readMultiMap(
                        infile, map_functions=(str, int))
            else:
                map_id2transitions = {}
                # sorted() returns a list on both Python 2 and 3, whereas
                # map(...).sort() fails on Python 3.
                map_id2transitions["mali"] = sorted(
                    map(int, options.parameters[0].split(':')))

            del options.parameters[0]
            if method == "mark-transitions":
                mali.markTransitions(map_id2transitions)
            elif method in ("filter-odd-transitions", "keep-even-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-odd")
            elif method in ("filter-even-transitions", "keep-odd-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-even")

        elif method == "propagate-transitions":
            mali.propagateTransitions()

        elif method == "map-annotation":
            # map annotations in one mali (stockholm-format) to the annotations in another.
            # Note: the first two sequence identifiers must be shared and the sequence of the
            # same length
            other_mali = Mali.Mali()
            # assumes readFromFile consumes the file before returning --
            # TODO confirm
            with open(options.parameters[0], "r") as infile:
                other_mali.readFromFile(infile, format="stockholm")
            del options.parameters[0]
            mali.copyAnnotations(other_mali)

        elif method == "add-annotation":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            AddAnnotation(mali, annotation_type, annotation_file)

        elif method == "mask-columns":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            maskColumns(mali, annotation_type, annotation_file)

        elif method == "remove-unaligned-pairs":
            removeUnalignedPairs(mali, options)

        elif method == "filter-3rd":
            filterMali(mali, "3rd")

        elif method == "filter-4d":
            filterMali(mali, "4d")

        elif method in ("mask-seg", "mask-bias"):
            # the part after the dash ("seg" or "bias") selects the masker
            maskMali(mali, method.split("-")[1])

        elif method == "exclude-with-stop":
            mali.filter(method="with-stop")

        # This branch previously repeated the "exclude-with-stop" test and
        # could therefore never be reached.
        elif method == "exclude-with-frameshift":
            mali.filter(method="with-frameshift")

        E.info("applied method %s in %i seconds." % (method, time.time() - t1))

    mali.writeToFile(options.stdout,
                     format=options.output_format,
                     write_ranges=options.with_ranges)

    E.Stop()
Exemplo n.º 5
0
def main(argv=None):
    """Script entry point: compute pairwise distances/rates for an alignment.

    Reads a multiple alignment from ``options.filename`` (``-`` means
    ``sys.stdin``), optionally auto-detects the alphabet, builds the list of
    sequence pairs according to ``options.iteration`` and dispatches to the
    selected backend:

    * nucleotide input: ``runBaseML``, ``runDNADIST``, ``runXrate`` or the
      built-in T92/JC69/PID/POVL calculations;
    * amino acid input: phylip ``protdist`` for JTT/PMB/PAM/Kimura/
      CategoriesModel, otherwise the built-in PID/POVL calculations.

    Results go to ``options.stdout``; a summary of counters goes to
    ``options.stdlog`` when ``options.loglevel >= 1``.

    Parameters
    ----------
    argv : list, optional
        Command line arguments; ``sys.argv`` is used when omitted.
        NOTE(review): *argv* is not forwarded to ``E.Start`` here --
        confirm whether that call should receive it.

    Raises
    ------
    ValueError
        If pairwise iteration is requested with an odd number of sequences,
        if ``--sites`` is not ``d4`` for T92/JC69, or if the requested
        distance is not supported by the built-in calculations.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2rates.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm",
                               "phylip"),
                      help="input format of multiple alignment")

    parser.add_option(
        "-s",
        "--sites",
        dest="sites",
        type="string",
        help="sites to use [default=%default].",
    )

    parser.add_option(
        "-f",
        "--file",
        dest="filename",
        type="string",
        help="filename of multiple alignment (- for stdin) [default=%default].",
        metavar="FILE")

    parser.add_option("-o",
                      "--format",
                      dest="format",
                      type="string",
                      help="format [default=%default].",
                      metavar="format")

    parser.add_option(
        "-d",
        "--distance",
        dest="distance",
        type="choice",
        choices=("PID", "T92", "JC69", "POVL", "F84", "LogDet", "K80", "F81",
                 "HKY85", "TN93", "REV", "UNREST", "REVU", "UNRESTU", "JTT",
                 "PMB", "PAM", "Kimura", "CategoriesModel"),
        help="method to use for distance calculation [default=%default].")

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("phylip", "baseml", "own", "xrate"),
                      help="program to use for rate calculation.")

    parser.add_option("--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("list", "tree"),
                      help="output format.")

    parser.add_option(
        "-m",
        "--min-sites",
        dest="min_sites",
        type="int",
        help="minimum number of sites for output[default=%default].",
    )

    parser.add_option(
        "-a",
        "--alphabet",
        dest="alphabet",
        type="choice",
        choices=("aa", "na", "auto"),
        help="alphabet to use.",
    )

    parser.add_option("-t",
                      "--filename-tree",
                      dest="filename_tree",
                      type="string",
                      help="filename with tree information.")

    parser.add_option("--set-alpha",
                      dest="alpha",
                      type="float",
                      help="initial alpha value.")

    parser.add_option("--fix-alpha",
                      dest="fix_alpha",
                      action="store_true",
                      help="do not estimate alpha.")

    parser.add_option("--set-kappa",
                      dest="kappa",
                      type="float",
                      help="initial kappa value.")

    parser.add_option("--fix-kappa",
                      dest="fix_kappa",
                      action="store_true",
                      help="do not estimate kappa.")

    parser.add_option("--dump",
                      dest="dump",
                      action="store_true",
                      help="dump output.")

    parser.add_option("--test",
                      dest="test",
                      action="store_true",
                      help="test run - does not clean up.")

    parser.add_option("--pairwise",
                      dest="pairwise",
                      action="store_true",
                      help="force pairwise comparison.")

    parser.add_option(
        "--set-clean-data",
        dest="clean_data",
        type="choice",
        choices=("0", "1"),
        help=
        "PAML should cleanup data:  0=only gaps within pair are removed, 1=columns in the mali with gaps are removed."
    )

    parser.add_option(
        "--with-counts",
        dest="with_counts",
        action="store_true",
        help=
        "output counts of aligned positions, transitions and transversions.")

    parser.add_option("-w",
                      "--write",
                      dest="write",
                      type="choice",
                      action="append",
                      choices=("input", "trained", "all"),
                      help="output sections to write for xrate.")

    parser.add_option("--output-pattern",
                      dest="output_pattern",
                      type="string",
                      help="output pattern for output files.")

    parser.add_option("--xrate-min-increment",
                      dest="xrate_min_increment",
                      type="float",
                      help="minimum increment to stop iteration in xrate.")

    parser.set_defaults(
        input_format="fasta",
        filename_tree=None,
        with_counts=False,
        sites="d4",
        distance="T92",
        min_sites=1,
        filename="-",
        alphabet="auto",
        format="%6.4f",
        method="phylip",
        kappa=None,
        fix_kappa=False,
        alpha=None,
        fix_alpha=False,
        dump=False,
        clean_data=None,
        output_format="list",
        iteration="all-vs-all",
        pairwise=False,
        report_step=1000,
        output_pattern="%s.eg",
        write=[],
        test_xrate=False,
        xrate_min_increment=None,
        is_codons=False,
    )

    (options, args) = E.Start(parser)

    if options.filename != "-":
        infile = open(options.filename, "r")
    else:
        infile = sys.stdin

    # read multiple alignment
    if options.pairwise:
        # read sequences, but not as a multiple alignment. This permits
        # multiple names.
        mali = Mali.SequenceCollection()
        options.iteration = "pairwise"
    else:
        mali = Mali.Mali()

    mali.readFromFile(infile, format=options.input_format)

    ids = mali.getIdentifiers()

    if options.alphabet == "auto":
        # Treat the input as nucleotides when fewer than 10% of all
        # characters remain after removing a, c, g, t, x and n.
        s = "".join(x.mString for x in mali.values()).lower()
        ss = re.sub("[acgtxn]", "", s)
        if float(len(ss)) < (len(s) * 0.1):
            options.alphabet = "na"
            # a column count divisible by three is taken as codon data
            if mali.getNumColumns() % 3 == 0:
                options.is_codons = True
        else:
            options.alphabet = "aa"

        if options.loglevel >= 1:
            options.stdlog.write("# autodetected alphabet: %s\n" %
                                 options.alphabet)

    if options.filename != "-":
        infile.close()

    npairs = 0
    nskipped_length = 0
    nskipped_distance = 0

    # Build the list of (index, index) pairs to compare.
    pairs = []
    if options.iteration == "all-vs-all":
        for x in range(len(ids) - 1):
            for y in range(x + 1, len(ids)):
                pairs.append((x, y))
    elif options.iteration == "first-vs-all":
        for y in range(1, len(ids)):
            pairs.append((0, y))
    elif options.iteration == "pairwise":
        if len(ids) % 2 != 0:
            # String exceptions are not valid; raise a real exception.
            raise ValueError(
                "uneven number of sequences (%i) not compatible with --iteration=pairwise"
                % len(ids))
        for x in range(0, len(ids), 2):
            pairs.append((x, x + 1))

    if options.alphabet == "na":

        if options.method == "baseml":
            runBaseML(mali, pairs, options)
        elif options.method == "phylip" and options.distance in ("F84", "K80",
                                                                 "JC69",
                                                                 "LogDet"):
            runDNADIST(mali, pairs, options)
        elif options.method == "xrate":
            runXrate(mali, pairs, options)
        else:
            if options.is_codons:
                h = Genomics.SequencePairInfoCodons().getHeader()
            else:
                h = Genomics.SequencePairInfo().getHeader()
            options.stdout.write("seq1\tseq2\tdist\tvar\t%s\n" % (h))

            for x, y in pairs:
                id_x = ids[x]
                npairs += 1

                id_y = ids[y]

                info = Genomics.CalculatePairIndices(
                    mali[id_x], mali[id_y], with_codons=options.is_codons)

                if options.distance in ("T92", "JC69"):
                    if options.sites == "d4":
                        seq1, seq2 = Genomics.GetDegenerateSites(mali[id_x],
                                                                 mali[id_y],
                                                                 position=3,
                                                                 degeneracy=4)

                        # the degenerate sites are only used to decide
                        # whether the pair is long enough to report
                        if len(seq1) < options.min_sites:
                            nskipped_length += 1
                            continue
                    else:
                        raise ValueError("unknown sites %s" % options.sites)

                if options.distance == "T92":
                    distance, variance = CalculateDistanceT92(info)
                elif options.distance == "JC69":
                    distance, variance = CalculateDistanceJC69(info)
                elif options.distance == "PID":
                    distance, variance = CalculateDistancePID(
                        mali[id_x], mali[id_y])
                elif options.distance == "POVL":
                    distance, variance = CalculateDistancePOVL(
                        mali[id_x], mali[id_y])
                else:
                    # Without this branch ``distance`` would be unbound (or
                    # stale from the previous pair) below.
                    raise ValueError(
                        "distance %s is not supported for method %s on "
                        "nucleotide input" %
                        (options.distance, options.method))

                if distance >= 0:
                    options.stdout.write("\t".join(
                        map(str, (id_x, id_y, options.format % distance,
                                  options.format % variance, info))) + "\n")
                else:
                    nskipped_distance += 1

    elif options.alphabet == "aa":

        if options.distance in ("JTT", "PMB", "PAM", "Kimura",
                                "CategoriesModel"):

            # use phylip for these
            phylip = WrapperPhylip.Phylip()
            phylip.setProgram("protdist")
            phylip.setMali(mali)

            # The number of "D" entries grows with each model after JTT,
            # which gets none.  The first test used to read "PMG", which
            # is not among the option choices, so PMB silently received
            # no "D" entries.
            phylip_options = []
            if options.distance == "PMB":
                phylip_options += ["D"] * 1
            elif options.distance == "PAM":
                phylip_options += ["D"] * 2
            elif options.distance == "Kimura":
                phylip_options += ["D"] * 3
            elif options.distance == "CategoriesModel":
                phylip_options += ["D"] * 4

            phylip_options.append("Y")
            phylip.setOptions(phylip_options)
            result = phylip.run()

            writePhylipResult(result, options)

        else:
            options.stdout.write("id1\tid2\tdist\tvar\n")

            # iterate over all pairs of sequences
            for x, y in pairs:
                id_x = ids[x]
                npairs += 1

                id_y = ids[y]

                if options.distance == "PID":
                    distance, variance = CalculateDistancePID(
                        mali[id_x], mali[id_y])
                elif options.distance == "POVL":
                    # percentage overlap
                    distance, variance = CalculateDistancePOVL(
                        mali[id_x], mali[id_y])
                else:
                    # Without this branch ``distance`` would be unbound (or
                    # stale from the previous pair) below.
                    raise ValueError(
                        "distance %s is not supported for amino acid input"
                        % options.distance)

                if distance >= 0:
                    options.stdout.write("\t".join(
                        (id_x, id_y, options.format % distance,
                         options.format % variance)) + "\n")
                else:
                    nskipped_distance += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# nseqs=%i, npairs=%i, nskipped_length=%i, nskipped_distance=%i\n"
            % (len(ids), npairs, nskipped_length, nskipped_distance))

    E.Stop()
Exemplo n.º 6
0
def main(argv=sys.argv):
    """Script entry point: tabulate per-column properties of an alignment.

    Reads a multiple alignment from ``options.stdin`` and writes one
    tab-separated row per alignment column to ``options.stdout``.  Each row
    starts with the column index, followed by the fields of every section
    requested with ``--sections`` (``length``, ``composition``,
    ``entropy``; ``all`` expands to all three).  The property classes used
    depend on ``--alphabet``.

    Raises ``ValueError`` if no section was requested or if a section name
    is not recognised.
    """

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--input-format",
        dest="input_format",
        type="choice",
        choices=("plain", "fasta", "clustal", "stockholm"),
        help="input format of multiple alignment")

    parser.add_option(
        "-a", "--alphabet",
        dest="alphabet",
        type="choice",
        choices=("aa", "na"),
        help="alphabet to use [default=%default].")

    parser.add_option(
        "-s", "--sections",
        dest="sections",
        type="choice",
        action="append",
        choices=("length", "composition", "entropy", "all"),
        help="which sections to output")

    parser.add_option(
        "-u", "--allow-duplicates",
        dest="allow_duplicates",
        action="store_true",
        help="permit duplicate entries [default=%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        mask_chars="nN",
        gap_chars="-.",
        alphabet="na",
        sections=[],
        allow_duplicates=False,
    )

    options, args = E.Start(parser)

    if not options.sections:
        raise ValueError("please supply at least one method.")

    if "all" in options.sections:
        options.sections = ["length", "composition", "entropy"]

    def getCounter(section):
        """Return a fresh property counter for *section* and the alphabet."""
        # Factories are wrapped in lambdas so that each call builds a new
        # object and class names are only looked up when actually needed.
        if options.alphabet == "na":
            factories = {
                "length": lambda: SequencePropertiesLength(),
                "composition": lambda: SequencePropertiesNA(),
                "entropy": lambda: SequencePropertiesEntropy("ACGT"),
            }
        elif options.alphabet == "aa":
            factories = {
                "length": lambda: SequencePropertiesLength(),
                "composition": lambda: SequencePropertiesAminoAcids(),
                "entropy": lambda: SequencePropertiesEntropy(
                    "ACDEFGHIKLMNPQRSTVWY"),
            }
        if section not in factories:
            raise ValueError("unknown section %s" % section)
        return factories[section]()

    # Read the multiple alignment; duplicates require the collection type.
    mali = (Mali.SequenceCollection()
            if options.allow_duplicates else Mali.Mali())
    mali.readFromFile(options.stdin, format=options.input_format)

    out = options.stdout

    # Header row. "col" rather than "column", as the latter is a reserved
    # word in sql.
    out.write("col")
    for name in options.sections:
        headers = getCounter(name).getHeaders()
        out.write("\t" + "\t".join(headers))
    out.write("\n")

    tally = E.Counter()

    for index, residues in enumerate(mali.getColumns()):
        tally.input += 1
        column_string = "".join(residues)
        out.write("%i" % index)

        for name in options.sections:
            properties = getCounter(name)
            properties.loadSequence(column_string)
            out.write("\t" + "\t".join(properties.getFields()))

        out.write("\n")
        tally.output += 1

    E.info("%s" % str(tally))

    E.Stop()