Exemplo n.º 1
0
    def loadSequence(self, sequence, seqtype="na"):
        """Load a sequence and tally the residues its codons map to.

        The parent ``SequenceProperties.loadSequence`` runs first. The
        sequence is then cut into consecutive triplets, each triplet is
        mapped with ``Genomics.MapCodon2AA`` and the per-residue totals are
        stored in ``self.mCountsAA``.

        Raises ValueError if the length of ``sequence`` is not a multiple
        of three.
        """

        SequenceProperties.loadSequence(self, sequence, seqtype)

        n_bases = len(sequence)
        if n_bases % 3:
            raise ValueError(
                '''sequence length is not a multiple of 3 (length=%i)''' %
                n_bases)

        # One zeroed slot per letter of the extended protein alphabet.
        # NOTE(review): a mapped residue outside these letters raises
        # KeyError in the tally below.
        self.mCountsAA = {}
        for letter in Bio.Alphabet.IUPAC.extended_protein.letters:
            self.mCountsAA[letter] = 0

        for offset in range(0, n_bases, 3):
            triplet = sequence[offset:offset + 3]
            residue = Genomics.MapCodon2AA(triplet)
            self.mCountsAA[residue] += 1
Exemplo n.º 2
0
def main(argv=None):
    """Apply a chain of transformations to the records of a FASTA file.

    Records are read with ``pysam.FastxFile`` from the first positional
    argument (``-`` when none is given). Each ``--method`` is applied in
    command-line order to the record's sequence, optional include/exclude,
    sampling, id-list and length filters are honoured, and surviving
    records are written to ``args.stdout``. Counts of input, skipped and
    output records are collected in an ``E.Counter`` and logged at the end.

    Methods that need extra input ("map-codons", "mask-soft",
    "mask-codons", "back-translate", "build-map") take it from the
    comma-separated ``--parameters`` stack, consumed in the order in which
    those methods are checked below.

    Arguments
    ---------
    argv : list, optional
        Defaults to ``sys.argv``.
        NOTE(review): it is not forwarded to ``E.start`` here - confirm
        whether that is intended.

    Raises
    ------
    ValueError
        On inconsistent options (missing sample proportion or map file),
        on sequences whose length is not divisible by three where codons
        are required, and on mismatching or exhausted companion sequences.
    """
    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-m",
        "--method",
        dest="methods",
        type=str,
        action="append",
        choices=("translate", "translate-to-stop", "truncate-at-stop",
                 "back-translate", "mark-codons", "apply-map", "build-map",
                 "pseudo-codons", "filter", "interleaved-codons", "map-codons",
                 "remove-gaps", "mask-seg", "mask-bias", "mask-codons",
                 "mask-incomplete-codons", "mask-stops", "mask-soft",
                 "map-identifier", "nop", "remove-stops", "upper", "lower",
                 "reverse-complement", "sample", "shuffle"),
        help="method to apply to sequences.")

    parser.add_argument("-p",
                        "--parameters",
                        dest="parameters",
                        type=str,
                        help="parameter stack for methods that require one ")

    parser.add_argument("-x",
                        "--ignore-errors",
                        dest="ignore_errors",
                        action="store_true",
                        help="ignore errors.")

    parser.add_argument("--sample-proportion",
                        dest="sample_proportion",
                        type=float,
                        help="sample proportion.")

    parser.add_argument(
        "--exclude-pattern",
        dest="exclude_pattern",
        type=str,
        help="exclude all sequences with ids matching pattern ")

    parser.add_argument(
        "--include-pattern",
        dest="include_pattern",
        type=str,
        help="include only sequences with ids matching pattern ")

    parser.add_argument("--filter-method",
                        dest="filter_methods",
                        type=str,
                        action="append",
                        help="filtering methods to apply ")

    parser.add_argument(
        "-t",
        "--sequence-type",
        dest="type",
        type=str,
        choices=("aa", "na"),
        help="sequence type (aa or na) . This option determines "
        "which characters to use for masking.")

    # The percent sign is doubled because argparse %-formats help strings;
    # a bare "%i" makes help rendering fail.
    parser.add_argument(
        "-l",
        "--template-identifier",
        dest="template_identifier",
        type=str,
        help="template for numerical identifier "
        "for the operation --build-map. A %%i is replaced by the position "
        "of the sequence in the file.")

    parser.add_argument(
        "--map-tsv-file",
        dest="map_tsv_file",
        type=str,
        help=
        "input filename with map for identifiers. The first row is a header")

    parser.add_argument("--fold-width",
                        dest="fold_width",
                        type=int,
                        help="fold width for sequence output. 0 is unfolded ")

    parser.set_defaults(methods=[],
                        parameters="",
                        type="na",
                        aa_mask_chars="xX",
                        aa_mask_char="x",
                        na_mask_chars="nN",
                        na_mask_char="n",
                        gap_chars="-.",
                        gap_char="-",
                        template_identifier="ID%06i",
                        ignore_errors=False,
                        exclude_pattern=None,
                        include_pattern=None,
                        sample_proportion=None,
                        filter_methods=[],
                        input_filename_fasta="-",
                        input_filename_map=None,
                        fold_width=80)

    (args, unknown) = E.start(parser, unknowns=True)

    if unknown:
        args.input_filename_fasta = unknown[0]

    args.parameters = args.parameters.split(",")

    rx_include, rx_exclude = None, None
    if args.include_pattern:
        rx_include = re.compile(args.include_pattern)
    if args.exclude_pattern:
        rx_exclude = re.compile(args.exclude_pattern)

    # old identifier -> new identifier, filled by "build-map" and read by
    # "apply-map".
    map_seq2nid = {}

    map_identifier = ("apply-map" in args.methods
                      or "map-identifier" in args.methods)
    if map_identifier:
        # --map-tsv-file is stored in ``map_tsv_file``; the
        # ``input_filename_map`` default is kept as an override.
        map_filename = args.input_filename_map or args.map_tsv_file
        if not map_filename:
            raise ValueError("for method=map-identifier use --map-tsv-file")
        with iotools.open_file(map_filename) as infile:
            map_identifier = iotools.read_map(infile, has_header=True)
        # NOTE(review): the loaded map is not consulted by any method
        # below ("apply-map" only reads map_seq2nid) - confirm the intent.

    if args.type == "na":
        mask_chars = args.na_mask_chars
        mask_char = args.na_mask_char
    else:
        mask_chars = args.aa_mask_chars
        mask_char = args.aa_mask_char

    if "map-codons" in args.methods:
        with open(args.parameters[0], "r") as infile:
            map_codon2code = iotools.read_map(infile)
        del args.parameters[0]

    if "mask-soft" in args.methods:
        f = args.parameters[0]
        del args.parameters[0]
        # This handle stays open while records are processed.
        hard_masked_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "mask-codons" in args.methods or "back-translate" in args.methods:

        # open a second stream to read sequences from
        f = args.parameters[0]
        del args.parameters[0]

        other_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "sample" in args.methods:
        if not args.sample_proportion:
            raise ValueError("specify a sample proportion")
        sample_proportion = args.sample_proportion
    else:
        sample_proportion = None

    filter_min_sequence_length = None
    filter_max_sequence_length = None
    filter_id_list = None
    for f in args.filter_methods:
        if f.startswith("min-length"):
            filter_min_sequence_length = int(f.split("=")[1])
        elif f.startswith("max-length"):
            filter_max_sequence_length = int(f.split("=")[1])
        elif f.startswith("id-file"):
            # A set gives constant-time membership tests per record.
            with iotools.open_file(f.split("=")[1]) as infile:
                filter_id_list = {line.rstrip("\n") for line in infile}

    def raiseIfNotCodon(l, title):
        '''raise ValueError if sequence length l is not divisible by
        3'''

        if l % 3 != 0:
            raise ValueError("length of sequence %s not divisible by 3" %
                             (title))

    iterator = pysam.FastxFile(args.input_filename_fasta)

    c = E.Counter()

    fold_width = args.fold_width

    def fold(s, w):
        '''break s into lines of at most w characters; w <= 0 leaves s
        unfolded.'''
        if w <= 0:
            return s
        return "\n".join([s[x:x + w] for x in range(0, len(s), w)])

    for record in iterator:
        c.nseq += 1
        c.input += 1

        sequence = re.sub(" ", "", record.sequence)

        if rx_include and not rx_include.search(record.name):
            c.skipped += 1
            continue

        if rx_exclude and rx_exclude.search(record.name):
            c.skipped += 1
            continue

        if sample_proportion and random.random() > sample_proportion:
            continue

        if not (filter_id_list is None or record.name in filter_id_list):
            c.skipped += 1
            continue

        for method in args.methods:

            # Refresh per method: an earlier method may have changed the
            # sequence length.
            l = len(sequence)

            if method == "translate":
                # translate such that gaps are preserved
                seq = []

                # length of the sequence without gap characters
                ls = len(re.sub('[%s]' % args.gap_chars, "", sequence))

                if ls % 3 != 0:
                    msg = "length of sequence %s (%i) not divisible by 3" % (
                        record.name, ls)
                    c.errors += 1
                    if args.ignore_errors:
                        E.warn(msg)
                        continue
                    else:
                        raise ValueError(msg)

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "back-translate":
                # translate from an amino acid alignment to codon alignment
                seq = []

                try:
                    other_record = next(other_iterator)
                except StopIteration:
                    raise ValueError("run out of sequences")

                if record.name != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" % (
                        record.name, other_record.title))

                other_sequence = re.sub("[ %s]" % args.gap_chars, "",
                                        other_record.sequence)

                if len(other_sequence) % 3 != 0:
                    raise ValueError(
                        "length of sequence %s not divisible by 3" %
                        (other_record.title))

                r = re.sub("[%s]" % args.gap_chars, "", sequence)
                if len(other_sequence) != len(r) * 3:
                    raise ValueError(
                        "length of sequences do not match: %i vs %i" %
                        (len(other_sequence), len(r)))

                # Each residue consumes the next codon of the companion
                # sequence; each gap becomes a gap triplet.
                x = 0
                for aa in sequence:
                    if aa in args.gap_chars:
                        codon = args.gap_char * 3
                    else:
                        codon = other_sequence[x:x + 3]
                        x += 3
                    seq.append(codon)

                sequence = "".join(seq)

            elif method == "pseudo-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "   ".join(seq)

            elif method == "reverse-complement":
                sequence = sequence.translate(
                    str.maketrans("ACGTacgt", "TGCAtgca"))[::-1]

            elif method in ("mask-stops", "remove-stops"):
                pending = []      # all characters since the last codon
                codon_chars = []  # the non-gap characters among them
                new_sequence = []

                if method == "mask-stops":
                    char = args.na_mask_char
                else:
                    char = args.gap_char

                for x in sequence:

                    if x not in args.gap_chars:
                        codon_chars.append(x.upper())

                    pending.append(x)

                    if len(codon_chars) == 3:
                        if Genomics.IsStopCodon("".join(codon_chars)):
                            # replace all non-gaps, keep gaps in place
                            new_sequence.extend(
                                y if y in args.gap_chars else char
                                for y in pending)
                        else:
                            new_sequence.extend(pending)

                        pending = []
                        codon_chars = []

                # trailing characters that do not complete a codon
                new_sequence.extend(pending)

                sequence = "".join(new_sequence)

            elif method == "mask-soft":
                # Get next hard masked record and extract sequence and length
                try:
                    cur_hm_record = next(hard_masked_iterator)
                except StopIteration:
                    break
                hm_sequence = re.sub(" ", "", cur_hm_record.sequence)
                lhm = len(hm_sequence)

                # Check lengths of unmasked and soft masked sequences the same
                if l != lhm:
                    raise ValueError(
                        "length of unmasked and hard masked sequences not "
                        "identical for record %s" % (record.name))

                # Where the hard masked sequence has an N, emit the
                # unmasked base in lower case, otherwise in upper case.
                # Identical sequences are passed through unchanged.
                if sequence != hm_sequence:
                    sequence = "".join(
                        x.lower() if y == "N" else x.upper()
                        for x, y in zip(sequence, hm_sequence))

            elif method == "map-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                for codon in (sequence[x:x + 3].upper()
                              for x in range(0, l, 3)):

                    if codon not in map_codon2code:
                        aa = "X"
                    else:
                        aa = map_codon2code[codon]
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "interleaved-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append("%s:%s" % (aa, codon))

                sequence = " ".join(seq)

            elif method == "translate-to-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "truncate-at-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break
                    seq.append(codon)

                sequence = "".join(seq)

            elif method == "remove-gaps":
                sequence = "".join(
                    [s for s in sequence if s not in args.gap_chars])

            elif method == "upper":
                sequence = sequence.upper()

            elif method == "lower":
                sequence = sequence.lower()

            elif method == "mark-codons":
                raiseIfNotCodon(l, record.name)

                sequence = " ".join(
                    [sequence[x:x + 3] for x in range(0, l, 3)])

            elif method == "apply-map":
                old_id = re.match(r"^(\S+)", record.name).groups()[0]
                if old_id in map_seq2nid:
                    rest = record.name[len(old_id):]
                    record.name = map_seq2nid[old_id] + rest

            elif method == "build-map":
                # build a map of identifiers, numbered by the position
                # of the record in the input
                old_id = re.match(r"^(\S+)", record.name).groups()[0]
                new_id = args.template_identifier % c.nseq
                if old_id in map_seq2nid:
                    raise ValueError(
                        "duplicate fasta entries - can't map those: %s" %
                        old_id)
                map_seq2nid[old_id] = new_id
                record.name = new_id

            elif method == "mask-bias":
                masker = Masker.MaskerBias()
                sequence = masker(sequence)

            elif method == "mask-seg":
                masker = Masker.MaskerSeg()
                sequence = masker(sequence)

            elif method == "shuffle":
                s = list(sequence)
                random.shuffle(s)
                sequence = "".join(s)

            elif method == "mask-incomplete-codons":
                seq = list(sequence)
                for x in range(0, l, 3):
                    nm = len([y for y in seq[x:x + 3] if y in mask_chars])
                    if 0 < nm < 3:
                        seq[x:x + 3] = [mask_char] * 3
                sequence = "".join(seq)

            elif method == "mask-codons":
                # mask codons based on amino acids given as reference
                # sequences.
                other_record = next(other_iterator, None)

                if other_record is None:
                    raise ValueError("run out of sequences.")

                if record.name != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" %
                                     (record.name, other_record.title))

                other_sequence = re.sub(" ", "", other_record.sequence)

                if len(other_sequence) * 3 != len(sequence):
                    raise ValueError(
                        "sequences for %s don't have matching lengths %i - %i"
                        %
                        (record.name, len(other_sequence) * 3, len(sequence)))

                seq = list(sequence)
                pos = 0
                for x in other_sequence:
                    if x in args.aa_mask_chars:
                        # the case of the mask follows the case of the
                        # masked residue
                        if x.isupper():
                            seq[pos:pos + 3] = [args.na_mask_char.upper()] * 3
                        else:
                            seq[pos:pos + 3] = [args.na_mask_char.lower()] * 3
                    pos += 3

                sequence = "".join(seq)

        l = len(sequence)
        if filter_min_sequence_length is not None and \
           l < filter_min_sequence_length:
            c.skipped += 1
            continue

        if filter_max_sequence_length is not None and \
           l > filter_max_sequence_length:
            c.skipped += 1
            continue

        record.sequence = sequence
        if fold_width >= 0:
            if record.comment:
                args.stdout.write(">{} {}\n{}\n".format(
                    record.name, record.comment,
                    fold(record.sequence, fold_width)))
            else:
                args.stdout.write(">{}\n{}\n".format(
                    record.name, fold(record.sequence, fold_width)))
        else:
            args.stdout.write(str(record) + "\n")

        c.output += 1

    if "build-map" in args.methods:
        # The parameter stack may be empty once other methods have
        # consumed their entries; the map then goes to stdout.
        p = args.parameters[0] if args.parameters else ""

        table = ["old\tnew\n"]
        table.extend("%s\t%s\n" % (old_id, new_id)
                     for old_id, new_id in map_seq2nid.items())

        if p:
            with iotools.open_file(p, "w") as outfile:
                outfile.write("".join(table))
        else:
            args.stdout.write("".join(table))

    E.info(c)
    E.stop()