Example #1
    def loadSequence(self, sequence, seqtype="na"):
        """load sequence properties from a sequence."""

        SequencePropertiesLength.loadSequence(self, sequence, seqtype)
        if len(sequence) % 3:
            raise ValueError(
                '''sequence length is not a multiple of 3 (length=%i)''' %
                (len(sequence)))

        # uppercase all letters
        sequence = sequence.upper()

        self.mNStopCodons = 0

        # setup counting arrays
        # nucleotide counts for each position (this is not the sum of the
        # counts per degenerate site, as a codon may be ambiguous, e.g. GNN).
        self.mCounts = [{'A': 0, 'C': 0, 'G': 0, 'T': 0, 'X': 0, 'N': 0},
                        {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'X': 0, 'N': 0},
                        {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'X': 0, 'N': 0}]

        # nucleotide counts for each position per degeneracy
        self.mCountsDegeneracy = []

        for x in (0, 1, 2):
            xx = []
            for y in range(5):
                yy = {}
                for z in Bio.Alphabet.IUPAC.extended_dna.letters:
                    yy[z] = 0
                xx.append(yy)
            self.mCountsDegeneracy.append(xx)

        # use a generator rather than a list to save memory
        for codon in (sequence[x:x + 3] for x in range(0, len(sequence), 3)):

            for x in (0, 1, 2):
                self.mCounts[x][codon[x]] += 1

            if Genomics.IsStopCodon(codon):
                self.mNStopCodons += 1
                continue

            try:
                aa, deg1, deg2, deg3 = Genomics.GetDegeneracy(codon)
                degrees = (deg1, deg2, deg3)
                for x in range(len(degrees)):
                    self.mCountsDegeneracy[x][degrees[x]][codon[x]] += 1

            except KeyError:
                pass
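
The counting logic above depends on the CGAT SequenceProperties hierarchy, Genomics and Bio.Alphabet. A minimal standalone sketch of the same per-position counting idea, with a hard-coded stop-codon set and alphabet standing in as assumptions for those dependencies (the degeneracy bookkeeping is omitted):

STOP_CODONS = {"TAA", "TAG", "TGA"}  # assumption: standard genetic code


def count_codon_positions(sequence):
    """Count nucleotides per codon position and the number of stop codons."""
    sequence = sequence.upper()
    if len(sequence) % 3:
        raise ValueError("sequence length is not a multiple of 3 (length=%i)" %
                         len(sequence))

    counts = [{c: 0 for c in "ACGTXN"} for _ in range(3)]
    nstops = 0

    # use a generator rather than a list to save memory
    for codon in (sequence[x:x + 3] for x in range(0, len(sequence), 3)):
        for pos in range(3):
            # characters outside the alphabet are lumped into 'N'
            key = codon[pos] if codon[pos] in counts[pos] else "N"
            counts[pos][key] += 1
        if codon in STOP_CODONS:
            nstops += 1
    return counts, nstops


counts, nstops = count_codon_positions("ATGAAATAG")
print(counts[0])   # {'A': 2, 'C': 0, 'G': 0, 'T': 1, 'X': 0, 'N': 0}
print(nstops)      # 1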
Example #2
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-w",
        "--weights-tsv-file",
        dest="filename_weights",
        type="string",
        help="filename with codon frequencies. Multiple filenames "
        "can be separated by comma.")

    parser.add_option("-s",
                      "--section",
                      dest="sections",
                      type="choice",
                      action="append",
                      choices=("length", "sequence", "hid", "na", "aa", "cpg",
                               "dn", "degeneracy", "gaps", "codons",
                               "codon-usage", "codon-translator",
                               "codon-bias"),
                      help="which sections to output [%default]")

    parser.add_option(
        "-t",
        "--sequence-type",
        dest="seqtype",
        type="choice",
        choices=("na", "aa"),
        help="type of sequence: na=nucleotides, aa=amino acids [%default].")

    parser.add_option(
        "-e",
        "--regex-identifier",
        dest="regex_identifier",
        type="string",
        help="regular expression to extract identifier from fasta "
        "description line.")

    parser.add_option("--split-fasta-identifier",
                      dest="split_id",
                      action="store_true",
                      help="split fasta description line (starting >) and use "
                      "only text before first space")

    parser.add_option(
        "--add-total",
        dest="add_total",
        action="store_true",
        help="add a row with column totals at the end of the table"
        "[%default]")

    parser.set_defaults(
        filename_weights=None,
        pseudocounts=1,
        sections=[],
        regex_identifier="(.+)",
        seqtype="na",
        gap_chars='xXnN',
        split_id=False,
        add_total=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    rx = re.compile(options.regex_identifier)

    reference_codons = []
    if options.filename_weights:
        options.filename_weights = options.filename_weights.split(",")
        for filename in options.filename_weights:
            if filename == "uniform":
                reference_codons.append(Genomics.GetUniformCodonUsage())
            else:
                reference_codons.append(
                    IOTools.ReadMap(IOTools.openFile(filename, "r"),
                                    has_header=True,
                                    map_functions=(str, float)))

        # print codon table differences
        options.stdlog.write(
            "# Difference between supplied codon usage preferences.\n")
        for x in range(0, len(reference_codons)):
            for y in range(0, len(reference_codons)):
                if x == y:
                    continue
                # calculate KL distance
                a = reference_codons[x]
                b = reference_codons[y]
                d = 0
                for codon, p in a.items():
                    if Genomics.IsStopCodon(codon):
                        continue
                    d += b[codon] * math.log(b[codon] / p)

                options.stdlog.write("# tablediff\t%s\t%s\t%f\n" %
                                     (options.filename_weights[x],
                                      options.filename_weights[y], d))

    iterator = FastaIterator.FastaIterator(options.stdin)

    def getCounter(section):

        if options.seqtype == "na":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "na":
                s = SequenceProperties.SequencePropertiesNA()
            elif section == "gaps":
                s = SequenceProperties.SequencePropertiesGaps(
                    options.gap_chars)
            elif section == "cpg":
                s = SequenceProperties.SequencePropertiesCpg()
            elif section == "dn":
                s = SequenceProperties.SequencePropertiesDN()
            # these sections require the sequence length to be a multiple of 3
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAA()
            elif section == "degeneracy":
                s = SequenceProperties.SequencePropertiesDegeneracy()
            elif section == "codon-bias":
                s = SequenceProperties.SequencePropertiesBias(reference_codons)
            elif section == "codons":
                s = SequenceProperties.SequencePropertiesCodons()
            elif section == "codon-usage":
                s = SequenceProperties.SequencePropertiesCodonUsage()
            elif section == "codon-translator":
                s = SequenceProperties.SequencePropertiesCodonTranslator()
            else:
                raise ValueError("unknown section %s" % section)
        elif options.seqtype == "aa":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAminoAcids()
            else:
                raise ValueError("unknown section %s" % section)
        return s

    # setup totals
    totals = {}
    for section in options.sections:
        totals[section] = getCounter(section)

    options.stdout.write("id")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getHeaders()))

    options.stdout.write("\n")
    options.stdout.flush()

    s = getCounter("hid")
    s.loadSequence("AAAAAAAAA", "na")

    for cur_record in iterator:

        sequence = re.sub(" ", "", cur_record.sequence).upper()

        if len(sequence) == 0:
            raise ValueError("empty sequence %s" % cur_record.title)

        id = rx.search(cur_record.title).groups()[0]

        if options.split_id is True:
            options.stdout.write("%s" % id.split()[0])
        else:
            options.stdout.write("%s" % id)
        options.stdout.flush()

        for section in options.sections:
            s = getCounter(section)
            s.loadSequence(sequence, options.seqtype)
            totals[section].addProperties(s)

            options.stdout.write("\t" + "\t".join(s.getFields()))

        options.stdout.write("\n")

    if options.add_total:
        options.stdout.write("total")
        for section in options.sections:
            options.stdout.write("\t" + "\t".join(totals[section].getFields()))
        options.stdout.write("\n")

    E.Stop()
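
The tablediff block above reports, for each ordered pair of weight tables, the Kullback-Leibler divergence sum(b[c] * log(b[c] / a[c])) over non-stop codons. A minimal standalone sketch of that computation, with a hard-coded stop-codon set standing in for Genomics.IsStopCodon:

import math

STOP_CODONS = {"TAA", "TAG", "TGA"}  # assumption: standard genetic code


def codon_table_divergence(a, b):
    """Kullback-Leibler divergence D(b || a) between two codon-frequency
    dictionaries, skipping stop codons (mirrors the tablediff loop above)."""
    d = 0.0
    for codon, p in a.items():
        if codon in STOP_CODONS:
            continue
        d += b[codon] * math.log(b[codon] / p)
    return d


# toy two-codon tables; real tables cover all 61 sense codons
a = {"AAA": 0.6, "AAG": 0.4}
b = {"AAA": 0.5, "AAG": 0.5}
print("tablediff\t%f" % codon_table_divergence(a, b))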
Example #3
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="choice",
        action="append",
        choices=("translate", "translate-to-stop", "truncate-at-stop",
                 "back-translate", "mark-codons", "apply-map", "build-map",
                 "pseudo-codons", "filter", "interleaved-codons", "map-codons",
                 "remove-gaps", "mask-seg", "mask-bias", "mask-codons",
                 "mask-incomplete-codons", "mask-stops", "mask-soft",
                 "remove-stops", "upper", "lower", "reverse-complement",
                 "sample", "shuffle"),
        help="method to apply to sequences.")

    parser.add_option("-p",
                      "--parameters",
                      dest="parameters",
                      type="string",
                      help="parameter stack for methods that require one "
                      "[default=%default].")

    parser.add_option("-x",
                      "--ignore-errors",
                      dest="ignore_errors",
                      action="store_true",
                      help="ignore errors [default = %default].")

    parser.add_option("--sample-proportion",
                      dest="sample_proportion",
                      type="float",
                      help="sample proportion [default = %default].")

    parser.add_option("--exclude-pattern",
                      dest="exclude_pattern",
                      type="string",
                      help="exclude all sequences with ids matching pattern "
                      "[default = %default].")

    parser.add_option("--include-pattern",
                      dest="include_pattern",
                      type="string",
                      help="include only sequences with ids matching pattern "
                      "[default = %default].")

    parser.add_option("--filter-method",
                      dest="filter_methods",
                      type="string",
                      action="append",
                      help="filtering methods to apply "
                      "[default = %default].")

    parser.add_option(
        "-t",
        "--sequence-type",
        dest="type",
        type="choice",
        choices=("aa", "na"),
        help="sequence type (aa or na) [%default]. This option determines "
        "which characters to use for masking [default = %default].")

    parser.add_option(
        "-l",
        "--template-identifier",
        dest="template_identifier",
        type="string",
        help="template for numerical identifier [default = %default] "
        "for the operation --build-map. A %i is replaced by the position "
        "of the sequence in the file.")

    parser.set_defaults(
        methods=[],
        parameters="",
        type="na",
        aa_mask_chars="xX",
        aa_mask_char="x",
        na_mask_chars="nN",
        na_mask_char="n",
        gap_chars="-.",
        gap_char="-",
        template_identifier="ID%06i",
        ignore_errors=False,
        exclude_pattern=None,
        include_pattern=None,
        sample_proportion=None,
        filter_methods=[],
    )

    (options, args) = E.Start(parser)
    options.parameters = options.parameters.split(",")

    rx_include, rx_exclude = None, None
    if options.include_pattern:
        rx_include = re.compile(options.include_pattern)
    if options.exclude_pattern:
        rx_exclude = re.compile(options.exclude_pattern)

    iterator = FastaIterator.FastaIterator(options.stdin)

    nseq = 0

    map_seq2nid = {}

    if "apply-map" in options.methods:
        map_seq2nid = IOTools.ReadMap(open(options.parameters[0], "r"))
        del options.parameters[0]

    if options.type == "na":
        mask_chars = options.na_mask_chars
        mask_char = options.na_mask_char
    else:
        mask_chars = options.aa_mask_chars
        mask_char = options.aa_mask_char

    if "map-codons" in options.methods:
        map_codon2code = IOTools.ReadMap(open(options.parameters[0], "r"))
        del options.parameters[0]

    if "mask-soft" in options.methods:
        f = options.parameters[0]
        del options.parameters[0]
        hard_masked_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "mask-codons" in options.methods or "back-translate" in options.methods:

        # open a second stream to read sequences from
        f = options.parameters[0]
        del options.parameters[0]

        other_iterator = FastaIterator.FastaIterator(open(f, "r"))

    ninput, noutput, nerrors, nskipped = 0, 0, 0, 0

    if "sample" in options.methods:
        if not options.sample_proportion:
            raise ValueError("specify a sample proportion")
        sample_proportion = options.sample_proportion
    else:
        sample_proportion = None

    filter_min_sequence_length = None
    filter_max_sequence_length = None
    filter_id_list = None
    for f in options.filter_methods:
        if f.startswith("min-length"):
            filter_min_sequence_length = int(f.split("=")[1])
        elif f.startswith("max-length"):
            filter_max_sequence_length = int(f.split("=")[1])
        elif f.startswith("id-file"):
            filter_id_list = [
                line[:-1] for line in IOTools.openFile(f.split("=")[1])
            ]

    def raiseIfNotCodon(l, title):
        '''raise ValueError if sequence length l is not divisible by
        3'''

        if l % 3 != 0:
            raise ValueError("length of sequence %s not divisible by 3" %
                             (title))

    while 1:
        try:
            cur_record = next(iterator)
        except StopIteration:
            break

        if cur_record is None:
            break
        nseq += 1
        ninput += 1

        sequence = re.sub(" ", "", cur_record.sequence)
        l = len(sequence)

        if rx_include and not rx_include.search(cur_record.title):
            nskipped += 1
            continue

        if rx_exclude and rx_exclude.search(cur_record.title):
            nskipped += 1
            continue

        if sample_proportion:
            if random.random() > sample_proportion:
                continue

        if not (filter_id_list is None or cur_record.title in filter_id_list):
            nskipped += 1
            continue

        for method in options.methods:

            if method == "translate":
                # translate such that gaps are preserved
                seq = []

                ls = len(re.sub('[%s]' % options.gap_chars, "", sequence))

                if ls % 3 != 0:
                    msg = "length of sequence %s (%i) not divisible by 3" % (
                        cur_record.title, ls)
                    nerrors += 1
                    if options.ignore_errors:
                        E.warn(msg)
                        continue
                    else:
                        raise ValueError(msg)

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "back-translate":
                # translate from an amino acid alignment to codon alignment
                seq = []

                try:
                    other_record = next(other_iterator)
                except StopIteration:
                    raise ValueError("run out of sequences")

                if cur_record.title != other_record.title:
                    raise "sequence titles don't match: %s %s" % (
                        cur_record.title, other_record.title)

                other_sequence = re.sub("[ %s]" % options.gap_chars, "",
                                        other_record.sequence)

                if len(other_sequence) % 3 != 0:
                    raise ValueError(
                        "length of sequence %s not divisible by 3" %
                        (other_record.title))

                r = re.sub("[%s]" % options.gap_chars, "", sequence)
                if len(other_sequence) != len(r) * 3:
                    raise ValueError(
                        "length of sequences do not match: %i vs %i" %
                        (len(other_sequence), len(r)))

                x = 0
                for aa in sequence:
                    if aa in options.gap_chars:
                        c = options.gap_char * 3
                    else:
                        c = other_sequence[x:x + 3]
                        x += 3
                    seq.append(c)

                sequence = "".join(seq)

            elif method == "pseudo-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "   ".join(seq)

            elif method == "reverse-complement":
                sequence = sequence.translate(
                    str.maketrans("ACGTacgt", "TGCAtgca"))[::-1]

            elif method in ("mask-stops", "remove-stops"):
                c = []
                codon = []
                new_sequence = []

                if method == "mask-stops":
                    char = options.na_mask_char
                elif method == "remove-stops":
                    char = options.gap_char

                for x in sequence:

                    if x not in options.gap_chars:
                        codon.append(x.upper())

                    c.append(x)

                    if len(codon) == 3:
                        codon = "".join(codon).upper()
                        # mask all non-gaps
                        if Genomics.IsStopCodon(codon):

                            for x in c:
                                if x in options.gap_chars:
                                    new_sequence.append(x)
                                else:
                                    new_sequence.append(char)
                        else:
                            new_sequence += c

                        c = []
                        codon = []

                new_sequence += c

                sequence = "".join(new_sequence)

            elif method == "mask-soft":
                # Get next hard masked record and extract sequence and length
                try:
                    cur_hm_record = next(hard_masked_iterator)
                except StopIteration:
                    break
                hm_sequence = re.sub(" ", "", cur_hm_record.sequence)
                lhm = len(hm_sequence)
                new_sequence = []

                # Check that the unmasked and hard masked sequences have the
                # same length
                if l != lhm:
                    raise ValueError(
                        "length of unmasked and hard masked sequences not "
                        "identical for record %s" % (cur_record.title))

                # Where the hard masked sequence contains repeats (N), replace
                # them with lowercase bases from the unmasked sequence; leave
                # the record unchanged if the two sequences are identical.
                if sequence != hm_sequence:
                    for x, y in zip_longest(sequence, hm_sequence):
                        if y == "N":
                            new_sequence.append(x.lower())
                        else:
                            new_sequence.append(x.upper())
                    sequence = "".join(new_sequence)

            elif method == "map-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in (sequence[x:x + 3].upper()
                              for x in range(0, l, 3)):

                    if codon not in map_codon2code:
                        aa = "X"
                    else:
                        aa = map_codon2code[codon]
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "interleaved-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append("%s:%s" % (aa, codon))

                sequence = " ".join(seq)

            elif method == "translate-to-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "truncate-at-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break
                    seq.append(codon)

                sequence = "".join(seq)

            elif method == "remove-gaps":

                seq = []
                for s in sequence:
                    if s in options.gap_chars:
                        continue
                    seq.append(s)

                sequence = "".join(seq)

            elif method == "upper":
                sequence = sequence.upper()

            elif method == "lower":
                sequence = sequence.lower()

            elif method == "mark-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                sequence = " ".join(
                    [sequence[x:x + 3] for x in range(0, l, 3)])

            elif method == "apply-map":
                id = re.match(r"^(\S+)", cur_record.title).groups()[0]
                if id in map_seq2nid:
                    rest = cur_record.title[len(id):]
                    cur_record.title = map_seq2nid[id] + rest

            elif method == "build-map":
                # build a map of identifiers
                id = re.match(r"^(\S+)", cur_record.title).groups()[0]
                new_id = options.template_identifier % nseq
                if id in map_seq2nid:
                    raise "duplicate fasta entries - can't map those: %s" % id
                map_seq2nid[id] = new_id
                cur_record.title = new_id

            elif method == "mask-bias":
                masker = Masker.MaskerBias()
                sequence = masker(sequence)

            elif method == "mask-seg":
                masker = Masker.MaskerSeg()
                sequence = masker(sequence)

            elif method == "shuffle":
                s = list(sequence)
                random.shuffle(s)
                sequence = "".join(s)

            elif method == "mask-incomplete-codons":
                seq = list(sequence)
                for x in range(0, l, 3):
                    nm = len([c for c in seq[x:x + 3] if c in mask_chars])
                    if 0 < nm < 3:
                        seq[x:x + 3] = [mask_char] * 3
                sequence = "".join(seq)

            elif method == "mask-codons":
                # mask codons based on amino acids given as reference
                # sequences.
                other_record = next(other_iterator)

                if other_record is None:
                    raise ValueError("run out of sequences.")

                if cur_record.title != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" %
                                     (cur_record.title, other_record.title))

                other_sequence = re.sub(" ", "", other_record.sequence)

                if len(other_sequence) * 3 != len(sequence):
                    raise ValueError(
                        "sequences for %s don't have matching lengths %i - %i"
                        % (cur_record.title, len(other_sequence) * 3,
                           len(sequence)))

                seq = list(sequence)
                c = 0
                for x in other_sequence:
                    if x in options.aa_mask_chars:
                        if x.isupper():
                            seq[c:c + 3] = [options.na_mask_char.upper()] * 3
                        else:
                            seq[c:c + 3] = [options.na_mask_char.lower()] * 3
                    c += 3

                sequence = "".join(seq)

        l = len(sequence)
        if filter_min_sequence_length is not None and \
           l < filter_min_sequence_length:
            nskipped += 1
            continue

        if filter_max_sequence_length is not None and \
           l > filter_max_sequence_length:
            nskipped += 1
            continue

        options.stdout.write(">%s\n%s\n" % (cur_record.title, sequence))
        noutput += 1

    if "build-map" in options.methods:
        p = options.parameters[0]
        if p:
            outfile = IOTools.openFile(p, "w")
        else:
            outfile = options.stdout

        outfile.write("old\tnew\n")
        for old_id, new_id in list(map_seq2nid.items()):
            outfile.write("%s\t%s\n" % (old_id, new_id))
        if p:
            outfile.close()

    E.info("ninput=%i, noutput=%i, nskipped=%i, nerrors=%i" %
           (ninput, noutput, nskipped, nerrors))

    E.Stop()
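
Most of the methods above are short, self-contained string transformations once Genomics.IsStopCodon is replaced by a hard-coded stop-codon set; a minimal sketch of 'truncate-at-stop' and the Python 3 form of 'reverse-complement' (STOP_CODONS is an assumption):

STOP_CODONS = {"TAA", "TAG", "TGA"}  # assumption: standard genetic code


def truncate_at_stop(sequence):
    """Keep codons up to, but excluding, the first stop codon."""
    codons = []
    for x in range(0, len(sequence), 3):
        codon = sequence[x:x + 3]
        if codon.upper() in STOP_CODONS:
            break
        codons.append(codon)
    return "".join(codons)


def reverse_complement(sequence):
    """Reverse-complement a nucleotide sequence, preserving case."""
    return sequence.translate(str.maketrans("ACGTacgt", "TGCAtgca"))[::-1]


print(truncate_at_stop("ATGAAATAGCCC"))   # -> ATGAAA
print(reverse_complement("ATGCat"))       # -> atGCAT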
Example #4
def processMali(mali, options):

    map_new2old = mali.mapIdentifiers()
    ids = mali.getIdentifiers()

    invalid_chars = options.gap_chars + options.mask_chars

    has_non_overlaps = False

    pairs = []

    if options.iteration == "all-vs-all":
        for x in range(len(ids)):
            for y in range(0, x):
                pairs.append((x, y))
    elif options.iteration == "first-vs-all":
        for y in range(1, len(ids)):
            pairs.append((0, y))
    elif options.iteration == "pairwise":
        if len(ids) % 2 != 0:
            raise ValueError(
                "uneven number of sequences (%i) not compatible with "
                "--iteration=pairwise" % len(ids))
        for x in range(0, len(ids), 2):
            pairs.append((x, x + 1))
    elif options.iteration == "tree":
        pairs = []
    else:
        raise "unknown iteration mode: %s" % (options.iteration)

    if options.remove_stops:
        for id, entry in mali.items():
            s = entry.mString.upper()
            fragments = []
            for x in range(0, len(s), 3):
                codon = s[x:x + 3]
                if Genomics.IsStopCodon(codon):
                    codon = "NNN"

                fragments.append(codon)

            entry.mString = "".join(fragments)

    for x, y in pairs:
        noverlap = 0
        for a, b in zip(mali[ids[x]], mali[ids[y]]):
            if a not in invalid_chars and b not in invalid_chars:
                noverlap += 1
                if noverlap >= options.min_overlap:
                    break
        else:
            has_non_overlaps = True
            break

    if options.tree:
        tree = TreeTools.Newick2Nexus(options.tree).trees[0]
        map_old2new = IOTools.getInvertedDictionary(map_new2old,
                                                    make_unique=True)
        tree.relabel(map_old2new)
    else:
        tree = None

    if options.method == "paml":
        runCodeML(mali, tree, has_non_overlaps, pairs, map_new2old, options)

    elif options.method == "xrate":
        runXrate(mali, has_non_overlaps, pairs, map_new2old, options)
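
The three explicit iteration modes above differ only in how the index pairs are built; a minimal standalone sketch of that pair construction (the 'tree' mode, which produces no explicit pairs, is omitted):

def build_pairs(n, iteration):
    """Return index pairs for n aligned sequences (mirrors processMali)."""
    pairs = []
    if iteration == "all-vs-all":
        for x in range(n):
            for y in range(x):
                pairs.append((x, y))
    elif iteration == "first-vs-all":
        for y in range(1, n):
            pairs.append((0, y))
    elif iteration == "pairwise":
        if n % 2 != 0:
            raise ValueError(
                "uneven number of sequences (%i) not compatible with "
                "--iteration=pairwise" % n)
        for x in range(0, n, 2):
            pairs.append((x, x + 1))
    else:
        raise ValueError("unknown iteration mode: %s" % iteration)
    return pairs


print(build_pairs(4, "all-vs-all"))  # [(1, 0), (2, 0), (2, 1), (3, 0), (3, 1), (3, 2)]
print(build_pairs(4, "pairwise"))    # [(0, 1), (2, 3)]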
Example #5
def prepareGrammar(xgram, mali, options):
    """prepare grammar for custom grammars."""

    ids = mali.getIdentifiers()

    fh, filename = tempfile.mkstemp()
    os.close(fh)
    outfile = open(filename, "w")
    mali.writeToFile(outfile,
                     format="stockholm",
                     write_ranges=False,
                     options=("#=GF NH (%s:1.0)%s;" % tuple(ids), ))
    outfile.close()

    if options.xrate_model == "sn":
        infile = open(XGram.PATH_DATA + "/sn.eg", "r")
        input_model = XGram.Parser.parseGrammar(infile.readlines())

    elif options.xrate_model == "akaksgc":
        infile = open(XGram.PATH_DATA + "/akaksgc.eg", "r")
        input_model = XGram.Parser.parseGrammar(infile.readlines())

    elif options.xrate_model in ("f3x4-two", "f3x4-four", "f3x4-fourproducts"):
        input_model = Codons.buildCodonML(codon_model=options.xrate_model,
                                          fix_kappa=options.fix_kappa,
                                          fix_omega=options.fix_omega)

    if options.xrate_model in ("ef3x4-four", ):

        sequences = getSequencesFromStk(filename)
        frequencies = Codons.getFrequenciesPerCodonPosition(sequences.values())

        codon_frequencies = {}
        if options.xrate_insert_frequencies:
            for c1 in ('A', 'C', 'G', 'T'):
                for c2 in ('A', 'C', 'G', 'T'):
                    for c3 in ('A', 'C', 'G', 'T'):
                        codon = "".join((c1, c2, c3))
                        if not Genomics.IsStopCodon(codon):
                            codon_frequencies[codon] = (
                                frequencies[0][c1] * frequencies[1][c2] *
                                frequencies[2][c3])

            total = sum(codon_frequencies.values())
            for k, v in codon_frequencies.items():
                codon_frequencies[k] /= total
        else:
            for c1 in ('A', 'C', 'G', 'T'):
                for c2 in ('A', 'C', 'G', 'T'):
                    for c3 in ('A', 'C', 'G', 'T'):
                        codon = "".join((c1, c2, c3))
                        codon_frequencies[codon] = 1 / 61.0

        input_model = Codons.buildCodonML(codon_model="codons-four",
                                          codon_frequencies=codon_frequencies,
                                          fix_kappa=options.fix_kappa,
                                          fix_omega=options.fix_omega)

    else:

        if options.xrate_insert_frequencies:
            setFrequencies(input_model, filename)

        if options.xrate_fix_frequencies:
            for char in ('a', 'c', 'g', 't'):
                for x in (0, 1, 2):
                    param = "p%s%i" % (char, x)
                    input_model.mGrammar.moveVariableToConst(param)

    if options.dump:
        options.stdlog.write("## input model:\n%s\n" %
                             input_model.getGrammar())

    writeModel(input_model, "input", options)

    t1 = time.time()

    result = xgram.train(input_model, filename)

    t2 = time.time()

    trained_model = result.getModel()

    if options.dump:
        options.stdlog.write("## trained model:\n%s\n" %
                             trained_model.getGrammar())

    writeModel(trained_model, "trained", options)

    return result, mali, ids
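
The codon frequencies built in the 'ef3x4-four' branch follow the F3x4 construction: the product of position-specific nucleotide frequencies, renormalised over the 61 sense codons. A minimal standalone sketch, with a hard-coded stop-codon set standing in for Genomics.IsStopCodon and uniform per-position frequencies used only as a placeholder for the values returned by Codons.getFrequenciesPerCodonPosition:

STOP_CODONS = {"TAA", "TAG", "TGA"}  # assumption: standard genetic code


def f3x4_codon_frequencies(position_frequencies):
    """Build codon frequencies as the product of per-position nucleotide
    frequencies, normalised over non-stop codons (the F3x4 construction)."""
    codon_frequencies = {}
    for c1 in "ACGT":
        for c2 in "ACGT":
            for c3 in "ACGT":
                codon = c1 + c2 + c3
                if codon in STOP_CODONS:
                    continue
                codon_frequencies[codon] = (position_frequencies[0][c1] *
                                            position_frequencies[1][c2] *
                                            position_frequencies[2][c3])
    total = sum(codon_frequencies.values())
    return {k: v / total for k, v in codon_frequencies.items()}


# placeholder input: uniform nucleotide usage at each codon position
uniform = [{c: 0.25 for c in "ACGT"} for _ in range(3)]
freqs = f3x4_codon_frequencies(uniform)
print(len(freqs), round(sum(freqs.values()), 6))  # 61 codons, frequencies sum to 1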
Example #6
    def Load(self, in_sequence):
        """load sequence properties from a sequence."""

        # uppercase all letters
        sequence = in_sequence.upper()

        self.mNCodons = len(sequence) // 3

        self.mNStopCodons = 0

        # setup counting arrays
        # counts of amino acids
        self.mCountsAA = {}

        for x in Bio.Alphabet.IUPAC.extended_protein.letters:
            self.mCountsAA[x] = 0

        # nucleotide counts for each position (this is not the sum of the
        # counts per degenerate site, as a codon may be ambiguous, e.g. GNN).
        self.mCounts = [{'A': 0, 'C': 0, 'G': 0, 'T': 0, 'X': 0, 'N': 0},
                        {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'X': 0, 'N': 0},
                        {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'X': 0, 'N': 0}]

        # nucleotide counts for each position per degeneracy
        self.mCountsDegeneracy = []

        self.mLength = len(sequence)

        for x in (0, 1, 2):
            xx = []
            for y in range(5):
                yy = {}
                for z in Bio.Alphabet.IUPAC.extended_dna.letters:
                    yy[z] = 0
                xx.append(yy)
            self.mCountsDegeneracy.append(xx)

        for codon in [sequence[x:x + 3] for x in range(0, len(sequence), 3)]:

            for x in (0, 1, 2):
                self.mCounts[x][codon[x]] += 1

            if Genomics.IsStopCodon(codon):
                self.mNStopCodons += 1
                continue

            try:
                aa, deg1, deg2, deg3 = Genomics.GetDegeneracy(codon)
                degrees = (deg1, deg2, deg3)
                for x in range(len(degrees)):
                    self.mCountsDegeneracy[x][degrees[x]][codon[x]] += 1
                self.mCountsAA[aa] += 1

            except KeyError:
                pass

        self.Update()
Example #7
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-w",
        "--filename-weights",
        dest="filename_weights",
        type="string",
        help="filename with codon frequencies. Multiple filenames can be "
        "separated by comma [default=%default].")

    parser.add_option("-s",
                      "--sections",
                      dest="sections",
                      type="choice",
                      action="append",
                      choices=("length", "hid", "na", "aa", "degeneracy",
                               "bias", "codons", "codon-usage",
                               "codon-translator"),
                      help="which sections to output [default=%default]")

    parser.add_option(
        "-t",
        "--type",
        dest="seqtype",
        type="choice",
        choices=("na", "aa"),
        help="type of sequence: na=nucleotides, aa=amino acids "
        "[default=%default].")

    parser.add_option(
        "-e",
        "--regex-identifier",
        dest="regex_identifier",
        type="string",
        help="regular expression to extract identifier from fasta "
        "description line [default=%default].")

    parser.set_defaults(
        filename_weights="uniform",
        pseudocounts=1,
        sections=[],
        regex_identifier="(.+)",
        seqtype="na",
    )

    (options, args) = E.Start(parser, argv=argv)
    options.filename_weights = options.filename_weights.split(",")

    rx = re.compile(options.regex_identifier)

    reference_codons = []
    if options.filename_weights:
        for filename in options.filename_weights:
            if filename == "uniform":
                reference_codons.append(Genomics.GetUniformCodonUsage())
            else:
                reference_codons.append(
                    IOTools.ReadMap(open(filename, "r"),
                                    has_header=True,
                                    map_functions=(str, float)))

        # print codon table differences
        E.info("difference between supplied codon usage preferences.")
        for x in range(0, len(reference_codons)):
            for y in range(0, len(reference_codons)):
                if x == y:
                    continue
                # calculate KL distance
                a = reference_codons[x]
                b = reference_codons[y]
                d = 0
                for codon, p in a.items():
                    if Genomics.IsStopCodon(codon):
                        continue
                    d += b[codon] * math.log(b[codon] / p)
                E.info("tablediff\t%s\t%s\t%f" %
                       (options.filename_weights[x],
                        options.filename_weights[y], d))

    iterator = FastaIterator.FastaIterator(options.stdin)

    def getCounter(section):

        if options.seqtype == "na":
            if section == "length":
                s = SequencePropertiesLength()
            elif section == "hid":
                s = SequencePropertiesHid()
            elif section == "na":
                s = SequencePropertiesNA()
            elif section == "aa":
                s = SequencePropertiesAA()
            elif section == "degeneracy":
                s = SequencePropertiesDegeneracy()
            elif section == "bias":
                s = SequencePropertiesBias(reference_codons)
            elif section == "codons":
                s = SequencePropertiesCodons()
            elif section == "codon-usage":
                s = SequencePropertiesCodonUsage()
            elif section == "codon-translator":
                s = SequencePropertiesCodonTranslator()
            else:
                raise ValueError("unknown section %s" % section)
        elif options.seqtype == "aa":
            if section == "length":
                s = SequencePropertiesLength()
            elif section == "hid":
                s = SequencePropertiesHid()
            elif section == "aa":
                s = SequencePropertiesAminoAcids()
            else:
                raise ValueError("unknown section %s" % section)
        return s

    # setup totals
    totals = {}
    for section in options.sections:
        totals[section] = getCounter(section)

    options.stdout.write("id")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getHeaders()))

    options.stdout.write("\n")
    options.stdout.flush()

    for cur_record in iterator:

        sequence = re.sub(" ", "", cur_record.sequence).upper()

        if len(sequence) == 0:
            E.warn("empty sequence %s" % cur_record.title)
            continue

        id = rx.search(cur_record.title).groups()[0]

        options.stdout.write("%s" % id)
        options.stdout.flush()

        for section in options.sections:
            s = getCounter(section)
            s.loadSequence(sequence)
            totals[section].addProperties(s)

            options.stdout.write("\t" + "\t".join(s.getFields()))

        options.stdout.write("\n")

    options.stdout.write("total")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getFields()))
    options.stdout.write("\n")

    E.Stop()