Exemplo n.º 1
0
Arquivo: bam2bam.py Projeto: SCV/cgat
def main(argv=None):
    """Script entry point: modify or filter the reads of BAM files.

    Parses command line options in sys.argv, unless *argv* is given.

    Input is read from stdin (``-``), from the file bound to
    ``options.stdin`` or, with ``--inplace``, additionally from the files
    named as positional arguments.  Each input is either filtered through
    ``_bam2bam.filter_bam`` (method ``filter``) or streamed through a
    chain of generators, one per selected method, each of which modifies
    the reads passing through it.

    With ``--inplace`` the output is written to a temporary file that
    replaces the input and is re-indexed; otherwise the output is
    written to stdout.

    Raises
    ------
    ValueError
        if ``--inplace`` is given without any bam file or together with
        stdin, if mismatch filtering is requested without a reference
        bam file, if ``unstrip`` is requested without a fastq file, or
        if a fastq file contains a duplicated read name.
    OSError
        if a fastq file given for unstripping does not exist.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--methods", dest="methods", type="choice",
                      action="append",
                      choices=("filter",
                               "keep-first-base",
                               "set-nh",
                               "set-sequence",
                               "strip-sequence",
                               "strip-quality",
                               "unstrip",
                               "unset-unmapped-mapq"),
                      help="methods to apply [%default]")

    parser.add_option("--strip-method", dest="strip_method", type="choice",
                      choices=("all", "match"),
                      help="define which sequences/qualities to strip. "
                      "match means that stripping only applies to entries "
                      "without mismatches (requires NM tag to be present). "
                      "[%default]")

    parser.add_option("--filter-method", dest="filter_methods",
                      action="append", type="choice",
                      choices=('NM', 'CM', 'mapped', 'unique', "non-unique"),
                      help="filter method to apply to remove alignments "
                      "from a bam file. Multiple methods can be supplied "
                      "[%default]")

    parser.add_option("--reference-bam-file", dest="reference_bam",
                      type="string",
                      help="bam-file to filter with [%default]")

    parser.add_option("--force-output", dest="force", action="store_true",
                      help="force processing. Some methods such "
                      "as strip/unstrip will stop processing if "
                      "they think it not necessary "
                      "[%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.add_option("--inplace", dest="inplace", action="store_true",
                      help="modify bam files in-place. Bam files need "
                      "to be given "
                      "as arguments. Temporary bam files are written "
                      "to /tmp [%default]")

    parser.add_option(
        "--first-fastq-file", "-1", dest="fastq_pair1", type="string",
        help="fastq file with read information for first "
        "in pair or unpaired. Used for unstripping sequence "
        "and quality scores [%default]")

    parser.add_option(
        "--second-fastq-file", "-2", dest="fastq_pair2", type="string",
        help="fastq file with read information for second "
        "in pair. Used for unstripping sequence "
        "and quality scores  [%default]")

    parser.set_defaults(
        methods=[],
        output_sam=False,
        reference_bam=None,
        filter_methods=[],
        strip_method="all",
        force=False,
        inplace=False,
        fastq_pair1=None,
        fastq_pair2=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    bamfiles = []

    # a file bound to options.stdin is processed by its name
    if options.stdin != sys.stdin:
        bamfiles.append(options.stdin.name)

    if options.inplace:
        bamfiles.extend(args)
        if not bamfiles:
            raise ValueError(
                "please one or more bam-files as command line arguments")

        if "-" in bamfiles:
            raise ValueError(
                "can not read from stdin if ``--inplace`` is selected")

    if not bamfiles:
        bamfiles = ["-"]

    for bamfile in bamfiles:

        E.info('processing %s' % bamfile)

        if os.path.islink(bamfile):
            E.warn('ignoring link %s' % bamfile)
            continue

        if IOTools.isEmpty(bamfile):
            E.warn('ignoring empty file %s' % bamfile)
            continue

        # reading bam from stdin does not work with only the "r" tag
        pysam_in = pysam.Samfile(bamfile, "rb")

        # The temporary file is only consumed by the in-place replacement
        # at the end of the loop body, so it is only created for
        # ``--inplace``.  Everything else is written to stdout; previously
        # a named input without ``--inplace`` ended up in an orphaned
        # temporary file.
        tmpfile = None
        if options.inplace:
            tmpfile = tempfile.NamedTemporaryFile(delete=False, prefix="ctmp")
            tmpfile.close()

            E.debug("writing temporary bam-file to %s" % tmpfile.name)
            pysam_out = pysam.Samfile(tmpfile.name, "wb", template=pysam_in)
        else:
            out_mode = "wh" if options.output_sam else "wb"
            pysam_out = pysam.Samfile("-", out_mode, template=pysam_in)

        if "filter" in options.methods:

            remove_mismatches, colour_mismatches = False, False

            # NM takes precedence over CM if both are given
            if "NM" in options.filter_methods:
                remove_mismatches = True

            elif "CM" in options.filter_methods:
                remove_mismatches = True
                colour_mismatches = True

            if remove_mismatches:
                if not options.reference_bam:
                    raise ValueError(
                        "requiring reference bam file for removing by "
                        "mismatches")

                pysam_ref = pysam.Samfile(options.reference_bam, "rb")
            else:
                pysam_ref = None

            # filter and flags are the opposite way around
            c = _bam2bam.filter_bam(
                pysam_in, pysam_out, pysam_ref,
                remove_nonunique="unique" in options.filter_methods,
                remove_unique="non-unique" in options.filter_methods,
                remove_contigs=None,
                remove_unmapped="mapped" in options.filter_methods,
                remove_mismatches=remove_mismatches,
                colour_mismatches=colour_mismatches)

            if pysam_ref is not None:
                pysam_ref.close()

            # NOTE(review): if stdlog and the alignment output both point
            # at stdout this table ends up inside the output stream -
            # confirm where stdlog is directed.
            options.stdlog.write("category\tcounts\n%s\n" % c.asTable())
        else:

            # set up the modifying iterators; each selected method wraps
            # the previous iterator in a generator
            it = pysam_in.fetch(until_eof=True)

            # function to check if processing should start; it receives
            # the list of leading reads and returns a message if
            # processing is unnecessary, None otherwise
            pre_check_f = lambda x: None

            if "unset-unmapped-mapq" in options.methods:
                def unset_unmapped_mapq(i):
                    """set the mapping quality of unmapped reads to 0."""
                    for read in i:
                        if read.is_unmapped:
                            read.mapq = 0
                        yield read
                it = unset_unmapped_mapq(it)

            if "set-sequence" in options.methods:
                def set_sequence(i):
                    """replace sequence with 'A's and quality with 'F's."""
                    for read in i:
                        # can't get at length of unmapped reads
                        if read.is_unmapped:
                            read.seq = "A"
                            read.qual = "F"
                        else:
                            read.seq = "A" * read.inferred_length
                            read.qual = "F" * read.inferred_length

                        yield read
                it = set_sequence(it)

            if "strip-sequence" in options.methods or "strip-quality" in \
               options.methods:
                def strip_sequence(i):
                    """remove the sequence from every read."""
                    for read in i:
                        read.seq = None
                        yield read

                def check_sequence(reads):
                    """report if the first read has no sequence."""
                    # a file without reads gives an empty list - nothing
                    # to object to in that case
                    if reads and reads[0].seq is None:
                        return 'no sequence present'
                    return None

                def strip_quality(i):
                    """remove the quality string from every read."""
                    for read in i:
                        read.qual = None
                        yield read

                def check_quality(reads):
                    """report if the first read has no quality string."""
                    if reads and reads[0].qual is None:
                        return 'no quality information present'
                    return None

                def strip_match(i):
                    """remove the sequence of reads without mismatches."""
                    for read in i:
                        try:
                            nm = read.opt('NM')
                        except KeyError:
                            # no NM tag: treat as mismatching, keep sequence
                            nm = 1
                        if nm == 0:
                            read.seq = None
                        yield read

                if options.strip_method == "all":
                    if "strip-sequence" in options.methods:
                        it = strip_sequence(it)
                        pre_check_f = check_sequence
                    elif "strip-quality" in options.methods:
                        it = strip_quality(it)
                        pre_check_f = check_quality
                elif options.strip_method == "match":
                    it = strip_match(it)

            if "unstrip" in options.methods:
                def buildReadDictionary(filename):
                    """map read name to (sequence, quality) for a fastq file.

                    Raises OSError if *filename* does not exist and
                    ValueError if a read name occurs more than once.
                    """
                    if not os.path.exists(filename):
                        raise OSError("file not found: %s" % filename)
                    fastqfile = pysam.FastxFile(filename)
                    fastq2sequence = {}
                    for x in fastqfile:
                        if x.name in fastq2sequence:
                            raise ValueError(
                                "read %s duplicate - can not unstrip" % x.name)

                        fastq2sequence[x.name] = (x.sequence, x.quality)
                    return fastq2sequence

                if not options.fastq_pair1:
                    raise ValueError(
                        "please supply fastq file(s) for unstripping")
                fastq2sequence1 = buildReadDictionary(options.fastq_pair1)
                if options.fastq_pair2:
                    fastq2sequence2 = buildReadDictionary(options.fastq_pair2)

                def unstrip_unpaired(i):
                    """restore sequence and quality from the first fastq."""
                    for read in i:
                        read.seq, read.qual = fastq2sequence1[read.qname]
                        yield read

                def unstrip_pair(i):
                    """restore sequence and quality from the mate's fastq."""
                    for read in i:
                        if read.is_read1:
                            read.seq, read.qual = fastq2sequence1[read.qname]
                        else:
                            read.seq, read.qual = fastq2sequence2[read.qname]
                        yield read

                if options.fastq_pair2:
                    it = unstrip_pair(it)
                else:
                    it = unstrip_unpaired(it)

            if "set-nh" in options.methods:
                it = _bam2bam.SetNH(it)

            # keep first base of reads by changing the cigarstring to
            # '1M' and, in reads mapping to the reverse strand,
            # changes the pos to aend - 1
            # Needs to be refactored to make it more general
            # (last base, midpoint, ..)
            # The method name must match the hyphenated option choice;
            # the former test for "keep_first_base" could never succeed.
            if "keep-first-base" in options.methods:
                def keep_first_base(i):
                    """truncate every mapped read to its first base."""
                    for read in i:
                        # unmapped reads pass through untouched, even if
                        # flagged as reverse - presumably they have no
                        # alignment end to compute a position from
                        if not read.is_unmapped:
                            if read.is_reverse:
                                # pos must be set before the cigar is
                                # replaced as aend is read first
                                read.pos = read.aend - 1
                            read.cigarstring = '1M'
                        yield read
                it = keep_first_base(it)

            # read first read and check if processing should continue
            # only possible when not working from stdin
            # Refactoring: use cache to also do a pre-check for
            # stdin input.
            if bamfile != "-":
                # get first read for checking pre-conditions
                first_reads = list(pysam_in.head(1))

                msg = pre_check_f(first_reads)
                if msg is not None:
                    if options.force:
                        E.warn('proccessing continues, though: %s' % msg)
                    else:
                        E.warn('processing not started: %s' % msg)
                        pysam_in.close()
                        pysam_out.close()
                        # the input is left untouched, so discard the
                        # unused temporary output
                        if tmpfile is not None:
                            os.unlink(tmpfile.name)
                        continue

            # continue processing till end
            for read in it:
                pysam_out.write(read)

        # close on both the filter and the iterator path so that the
        # output is complete before it is moved and indexed below
        pysam_in.close()
        pysam_out.close()

        if options.inplace:
            # set date and file permissions according to original
            # Note: currently it will not update user and group.
            original = os.stat(bamfile)
            os.utime(tmpfile.name, (original.st_atime, original.st_mtime))
            os.chmod(tmpfile.name, original.st_mode)
            # move new file over original copy
            shutil.move(tmpfile.name, bamfile)
            # re-index
            pysam.index(bamfile)

    # write footer and output benchmark information.
    E.Stop()
Exemplo n.º 2
0
def main(argv=None):
    """Script entry point: modify, filter or downsample BAM files.

    Parses command line options in sys.argv, unless *argv* is given.

    Input is read from stdin (``-``), from the file bound to
    ``options.stdin`` or, with ``--inplace``, additionally from the files
    named as positional arguments.  Each input is either filtered through
    ``_bam2bam.filter_bam`` (method ``filter``) or streamed through a
    chain of iterators, one per selected method, each of which modifies
    or subsets the reads passing through it.

    Output for stdin and for the file bound to ``options.stdin`` is
    written to stdout.  All other files are written to a temporary file
    which, with ``--inplace``, replaces the input and is re-indexed.

    Raises
    ------
    ValueError
        if ``--inplace`` is given without any bam file or together with
        stdin, if mismatch filtering is requested without a reference
        bam file, if ``unstrip`` is requested without a fastq file, if a
        fastq file contains a duplicated read name, or if downsampling
        is requested without ``--downsample``.
    OSError
        if a fastq file given for unstripping does not exist.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--methods",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("filter", "keep-first-base", "set-nh",
                               "set-sequence", "strip-sequence",
                               "strip-quality", "unstrip",
                               "unset-unmapped-mapq", "downsample-single",
                               "downsample-paired"),
                      help="methods to apply [%default]")

    parser.add_option("--strip-method",
                      dest="strip_method",
                      type="choice",
                      choices=("all", "match"),
                      help="define which sequences/qualities to strip. "
                      "match means that stripping only applies to entries "
                      "without mismatches (requires NM tag to be present). "
                      "[%default]")

    parser.add_option("--filter-method",
                      dest="filter_methods",
                      action="append",
                      type="choice",
                      choices=('NM', 'CM', 'mapped', 'unique', "non-unique"),
                      help="filter method to apply to remove alignments "
                      "from a bam file. Multiple methods can be supplied "
                      "[%default]")

    parser.add_option("--reference-bam-file",
                      dest="reference_bam",
                      type="string",
                      help="bam-file to filter with [%default]")

    parser.add_option("--force-output",
                      dest="force",
                      action="store_true",
                      help="force processing. Some methods such "
                      "as strip/unstrip will stop processing if "
                      "they think it not necessary "
                      "[%default]")

    parser.add_option("--output-sam",
                      dest="output_sam",
                      action="store_true",
                      help="output in sam format [%default]")

    parser.add_option("--inplace",
                      dest="inplace",
                      action="store_true",
                      help="modify bam files in-place. Bam files need "
                      "to be given "
                      "as arguments. Temporary bam files are written "
                      "to /tmp [%default]")

    parser.add_option("--first-fastq-file",
                      "-1",
                      dest="fastq_pair1",
                      type="string",
                      help="fastq file with read information for first "
                      "in pair or unpaired. Used for unstripping sequence "
                      "and quality scores [%default]")

    parser.add_option("--second-fastq-file",
                      "-2",
                      dest="fastq_pair2",
                      type="string",
                      help="fastq file with read information for second "
                      "in pair. Used for unstripping sequence "
                      "and quality scores  [%default]")

    parser.add_option("--downsample",
                      dest="downsample",
                      type="int",
                      help="Number of reads to downsample to")

    parser.set_defaults(methods=[],
                        output_sam=False,
                        reference_bam=None,
                        filter_methods=[],
                        strip_method="all",
                        force=False,
                        inplace=False,
                        fastq_pair1=None,
                        fastq_pair2=None,
                        downsample=None,
                        random_seed=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)
    # random.seed(options.random_seed)
    bamfiles = []

    # a file bound to options.stdin is processed by its name
    from_stdin = options.stdin != sys.stdin
    if from_stdin:
        bamfiles.append(options.stdin.name)

    if options.inplace:
        bamfiles.extend(args)
        if not bamfiles:
            raise ValueError(
                "please one or more bam-files as command line arguments")

        if "-" in bamfiles:
            raise ValueError(
                "can not read from stdin if ``--inplace`` is selected")

    if not bamfiles:
        bamfiles = ["-"]

    for bamfile in bamfiles:

        E.info('processing %s' % bamfile)

        if os.path.islink(bamfile):
            E.warn('ignoring link %s' % bamfile)
            continue

        if IOTools.isEmpty(bamfile):
            E.warn('ignoring empty file %s' % bamfile)
            continue

        # reading bam from stdin does not work with only the "r" tag
        pysam_in = pysam.AlignmentFile(bamfile, "rb")

        # tmpfile stays None whenever the output goes to stdout, so that
        # the in-place replacement below can tell for each file whether
        # there is a temporary file to move.
        tmpfile = None
        if bamfile == "-" or (from_stdin and bamfile == options.stdin.name):
            out_mode = "wh" if options.output_sam else "wb"
            pysam_out = pysam.AlignmentFile("-", out_mode, template=pysam_in)
        else:
            tmpfile = tempfile.NamedTemporaryFile(delete=False, prefix="ctmp")
            tmpfile.close()

            E.debug("writing temporary bam-file to %s" % tmpfile.name)
            pysam_out = pysam.AlignmentFile(tmpfile.name,
                                            "wb",
                                            template=pysam_in)

        if "filter" in options.methods:

            remove_mismatches, colour_mismatches = False, False

            # NM takes precedence over CM if both are given
            if "NM" in options.filter_methods:
                remove_mismatches = True

            elif "CM" in options.filter_methods:
                remove_mismatches = True
                colour_mismatches = True

            if remove_mismatches:
                if not options.reference_bam:
                    raise ValueError(
                        "requiring reference bam file for removing by "
                        "mismatches")

                pysam_ref = pysam.AlignmentFile(options.reference_bam, "rb")
            else:
                pysam_ref = None

            # filter and flags are the opposite way around
            c = _bam2bam.filter_bam(pysam_in,
                                    pysam_out,
                                    pysam_ref,
                                    remove_nonunique="unique"
                                    in options.filter_methods,
                                    remove_unique="non-unique"
                                    in options.filter_methods,
                                    remove_contigs=None,
                                    remove_unmapped="mapped"
                                    in options.filter_methods,
                                    remove_mismatches=remove_mismatches,
                                    colour_mismatches=colour_mismatches)

            if pysam_ref:
                pysam_ref.close()

            # do not write to stdlog in the middle of a SAM/BAM stdout stream.
            if options.stdlog != options.stdout:
                E.info("category\tcounts\n%s\n" % c.asTable())
        else:

            # set up the modifying iterators; each selected method wraps
            # the previous iterator
            it = pysam_in.fetch(until_eof=True)

            # function to check if processing should start; it receives
            # the list of leading reads and returns a message if
            # processing is unnecessary, None otherwise
            pre_check_f = lambda x: None

            if "unset-unmapped-mapq" in options.methods:

                def unset_unmapped_mapq(i):
                    """set the mapping quality of unmapped reads to 0."""
                    for read in i:
                        if read.is_unmapped:
                            read.mapq = 0
                        yield read

                it = unset_unmapped_mapq(it)

            if "set-sequence" in options.methods:

                def set_sequence(i):
                    """replace sequence with 'A's and quality with 'F's."""
                    for read in i:
                        # can't get at length of unmapped reads
                        if read.is_unmapped:
                            read.seq = "A"
                            read.qual = "F"
                        else:
                            read.seq = "A" * read.inferred_length
                            read.qual = "F" * read.inferred_length

                        yield read

                it = set_sequence(it)

            if "strip-sequence" in options.methods or "strip-quality" in \
               options.methods:

                def strip_sequence(i):
                    """remove the sequence from every read."""
                    for read in i:
                        read.seq = None
                        yield read

                def check_sequence(reads):
                    """report if the first read has no sequence."""
                    # a file without reads gives an empty list - nothing
                    # to object to in that case
                    if reads and reads[0].seq is None:
                        return 'no sequence present'
                    return None

                def strip_quality(i):
                    """remove the quality string from every read."""
                    for read in i:
                        read.qual = None
                        yield read

                def check_quality(reads):
                    """report if the first read has no quality string."""
                    if reads and reads[0].qual is None:
                        return 'no quality information present'
                    return None

                def strip_match(i):
                    """remove the sequence of reads without mismatches."""
                    for read in i:
                        try:
                            nm = read.opt('NM')
                        except KeyError:
                            # no NM tag: treat as mismatching, keep sequence
                            nm = 1
                        if nm == 0:
                            read.seq = None
                        yield read

                if options.strip_method == "all":
                    if "strip-sequence" in options.methods:
                        it = strip_sequence(it)
                        pre_check_f = check_sequence
                    elif "strip-quality" in options.methods:
                        it = strip_quality(it)
                        pre_check_f = check_quality
                elif options.strip_method == "match":
                    it = strip_match(it)

            if "unstrip" in options.methods:

                def buildReadDictionary(filename):
                    """map read name to (sequence, quality) for a fastq file.

                    Raises OSError if *filename* does not exist and
                    ValueError if a read name occurs more than once.
                    """
                    if not os.path.exists(filename):
                        raise OSError("file not found: %s" % filename)
                    fastqfile = pysam.FastxFile(filename)
                    fastq2sequence = {}
                    for x in fastqfile:
                        if x.name in fastq2sequence:
                            raise ValueError(
                                "read %s duplicate - can not unstrip" % x.name)

                        fastq2sequence[x.name] = (x.sequence, x.quality)
                    return fastq2sequence

                if not options.fastq_pair1:
                    raise ValueError(
                        "please supply fastq file(s) for unstripping")
                fastq2sequence1 = buildReadDictionary(options.fastq_pair1)
                if options.fastq_pair2:
                    fastq2sequence2 = buildReadDictionary(options.fastq_pair2)

                def unstrip_unpaired(i):
                    """restore sequence and quality from the first fastq."""
                    for read in i:
                        read.seq, read.qual = fastq2sequence1[read.qname]
                        yield read

                def unstrip_pair(i):
                    """restore sequence and quality from the mate's fastq."""
                    for read in i:
                        if read.is_read1:
                            read.seq, read.qual = fastq2sequence1[read.qname]
                        else:
                            read.seq, read.qual = fastq2sequence2[read.qname]
                        yield read

                if options.fastq_pair2:
                    it = unstrip_pair(it)
                else:
                    it = unstrip_unpaired(it)

            if "set-nh" in options.methods:
                it = _bam2bam.SetNH(it)

            # keep first base of reads by changing the cigarstring to
            # '1M' and, in reads mapping to the reverse strand,
            # changes the pos to aend - 1
            # Needs to be refactored to make it more general
            # (last base, midpoint, ..)
            # The method name must match the hyphenated option choice;
            # the former test for "keep_first_base" could never succeed.
            if "keep-first-base" in options.methods:

                def keep_first_base(i):
                    """truncate every mapped read to its first base."""
                    for read in i:
                        # unmapped reads pass through untouched, even if
                        # flagged as reverse - presumably they have no
                        # alignment end to compute a position from
                        if not read.is_unmapped:
                            if read.is_reverse:
                                # pos must be set before the cigar is
                                # replaced as aend is read first
                                read.pos = read.aend - 1
                            read.cigarstring = '1M'
                        yield read

                it = keep_first_base(it)

            # read first read and check if processing should continue
            # only possible when not working from stdin
            # Refactoring: use cache to also do a pre-check for
            # stdin input.
            if bamfile != "-":
                # get first read for checking pre-conditions
                first_reads = list(pysam_in.head(1))

                msg = pre_check_f(first_reads)
                if msg is not None:
                    if options.force:
                        E.warn('proccessing continues, though: %s' % msg)
                    else:
                        E.warn('processing not started: %s' % msg)
                        pysam_in.close()
                        pysam_out.close()
                        # the input is left untouched, so discard the
                        # unused temporary output
                        if tmpfile is not None:
                            os.unlink(tmpfile.name)
                        continue

            if "downsample-single" in options.methods:

                if not options.downsample:
                    raise ValueError("Please provide downsample size")

                down = SubsetBam(pysam_in=it,
                                 downsample=options.downsample,
                                 paired_end=None,
                                 single_end=True,
                                 random_seed=options.random_seed)
                it = down.downsample_single()

            if "downsample-paired" in options.methods:

                if not options.downsample:
                    raise ValueError("Please provide downsample size")

                down = SubsetBam(pysam_in=it,
                                 downsample=options.downsample,
                                 paired_end=True,
                                 single_end=None,
                                 random_seed=options.random_seed)
                it = down.downsample_paired()

            # continue processing till end
            for read in it:
                pysam_out.write(read)

        # close on both paths so that the output is complete before it
        # is moved and indexed below
        pysam_in.close()
        pysam_out.close()

        # files streamed to stdout have no temporary file to move
        if options.inplace and tmpfile is not None:
            # set date and file permissions according to original
            # Note: currently it will not update user and group.
            original = os.stat(bamfile)
            os.utime(tmpfile.name, (original.st_atime, original.st_mtime))
            os.chmod(tmpfile.name, original.st_mode)
            # move new file over original copy
            shutil.move(tmpfile.name, bamfile)
            # re-index
            pysam.index(bamfile)

    # write footer and output benchmark information.
    E.Stop()
Exemplo n.º 3
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Without ``--inplace`` a single bam file is read from stdin ("-") and
    the result is written to stdout. With ``--inplace`` every positional
    argument is treated as a bam file that is rewritten through a
    temporary file, moved over the original and re-indexed.

    If ``--filter`` is given, the reads are handed to
    ``_bam2bam.filter_bam``. Otherwise the selected modifications
    (``--unset-unmapped-mapq``, ``--set-sequence``, ``--strip``,
    ``--unstrip``, ``--set-nh``, ``--keep-first-base``) are chained as
    generators, in that order, and the resulting reads are written out.

    Raises
    ------
    ValueError
        if ``--inplace`` is given without bam files or together with
        "-", if a mismatch filter (NM/CM) is requested without
        ``--reference-bam``, if ``--unstrip`` is requested without
        ``--fastq1``, or if a fastq file contains a duplicate read name.
    OSError
        if a fastq file given for unstripping does not exist.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("--set-nh", dest="set_nh", action="store_true",
                      help="sets the NH flag. The file needs to be sorted by readname [%default]")

    parser.add_option("--unset-unmapped-mapq", dest="unset_unmapped_mapq", action="store_true",
                      help="sets the mapping quality of unmapped reads to 0 [%default]")

    parser.add_option("--set-sequence", dest="set_sequence", action="store_true",
                      help="sets the sequence to 'A's (a valid base) and the quality to 'F's"
                      ",which is defined in all fastq scoring schemes "
                      "[%default]")

    parser.add_option("--strip", dest="strip", type="choice",
                      choices=("sequence", "quality", "match"),
                      help="remove parts of the bam-file. Note that "
                      "stripping the sequence will "
                      "also strip the quality values [%default]")

    parser.add_option("--unstrip", dest="unstrip", action="store_true",
                      help="add sequence and quality into bam file [%default]")

    parser.add_option("--filter", dest="filter",
                      action="append", type="choice",
                      choices=('NM', 'CM', 'mapped', 'unique', "non-unique"),
                      help="filter bam file. The option denotes "
                      "the property that is  "
                      "used to determine better match [%default]")

    parser.add_option("--reference-bam", dest="reference_bam", type="string",
                      help="bam-file to filter with [%default]")

    parser.add_option("--sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.add_option("--inplace", dest="inplace", action="store_true",
                      help="modify bam files in-place. Bam files need "
                      "to be given "
                      "as arguments. Temporary bam files are written "
                      "to /tmp [%default]")

    parser.add_option("--fastq1", "-1", dest="fastq_pair1", type="string",
                      help="fastq file with read information for first in pair or unpaired [%default]")

    parser.add_option("--fastq2", "-2", dest="fastq_pair2", type="string",
                      help="fastq file with read information for second in pair [%default]")

    # the two literals are concatenated, hence the trailing space
    parser.add_option("--keep-first-base", dest="keep_first_base", action="store_true",
                      help="keep first base of reads such that gtf2table.py will only consider the "
                      "first base in its counts")

    parser.set_defaults(
        filter=[],
        set_nh=False,
        unset_unmapped_mapq=False,
        output_sam=False,
        reference_bam=None,
        strip=None,
        unstrip=None,
        set_sequence=False,
        inplace=False,
        fastq_pair1=None,
        fastq_pair2=None,
        keep_first_base=False
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.inplace:
        bamfiles = args
        if len(bamfiles) == 0:
            raise ValueError(
                "please one or more bam-files as command line arguments")

        if "-" in bamfiles:
            raise ValueError(
                "can not read from stdin if ``--inplace`` is selected")
    else:
        bamfiles = ["-"]

    for bamfile in bamfiles:

        E.info('processing %s' % bamfile)

        # reading bam from stdin does not work with only the "r" tag
        pysam_in = pysam.Samfile(bamfile, "rb")

        if bamfile == "-":
            if options.output_sam:
                pysam_out = pysam.Samfile("-", "wh", template=pysam_in)
            else:
                pysam_out = pysam.Samfile("-", "wb", template=pysam_in)
        else:
            # only the name of the temporary file is needed; pysam opens
            # the path itself
            tmpfile = tempfile.NamedTemporaryFile(delete=False, prefix="ctmp")
            tmpfile.close()

            E.debug("writing temporary bam-file to %s" % tmpfile.name)
            pysam_out = pysam.Samfile(tmpfile.name, "wb", template=pysam_in)

        if options.filter:

            remove_mismatches, colour_mismatches = False, False

            if "NM" in options.filter:
                remove_mismatches = True

            elif "CM" in options.filter:
                remove_mismatches = True
                colour_mismatches = True

            if remove_mismatches:
                if not options.reference_bam:
                    raise ValueError(
                        "requiring reference bam file for removing by "
                        "mismatches")

                pysam_ref = pysam.Samfile(options.reference_bam, "rb")
            else:
                pysam_ref = None

            # filter and flags are the opposite way around
            c = _bam2bam.filter_bam(
                pysam_in, pysam_out, pysam_ref,
                remove_nonunique="unique" in options.filter,
                remove_unique="non-unique" in options.filter,
                remove_contigs=None,
                remove_unmapped="mapped" in options.filter,
                remove_mismatches=remove_mismatches,
                colour_mismatches=colour_mismatches)

            # the reference is re-opened for every input file, so release
            # it as soon as the filter has finished
            if pysam_ref is not None:
                pysam_ref.close()

            options.stdlog.write("category\tcounts\n%s\n" % c.asTable())
        else:
            # set up the modifying iterators
            it = pysam_in.fetch(until_eof=True)

            if options.unset_unmapped_mapq:
                def unset_unmapped_mapq(i):
                    for read in i:
                        if read.is_unmapped:
                            read.mapq = 0
                        yield read
                it = unset_unmapped_mapq(it)

            if options.set_sequence:
                def set_sequence(i):
                    for read in i:
                        # can't get at length of unmapped reads
                        if read.is_unmapped:
                            read.seq = "A"
                            read.qual = "F"
                        else:
                            read.seq = "A" * read.inferred_length
                            read.qual = "F" * read.inferred_length

                        yield read
                it = set_sequence(it)

            if options.strip is not None:
                def strip_sequence(i):
                    for read in i:
                        read.seq = None
                        yield read

                def strip_quality(i):
                    for read in i:
                        read.qual = None
                        yield read

                def strip_match(i):
                    for read in i:
                        # reads without an NM tag are treated as having
                        # a mismatch and therefore keep their sequence
                        try:
                            nm = read.opt('NM')
                        except KeyError:
                            nm = 1
                        if nm == 0:
                            read.seq = None
                        yield read

                if options.strip == "sequence":
                    it = strip_sequence(it)
                elif options.strip == "quality":
                    it = strip_quality(it)
                elif options.strip == "match":
                    it = strip_match(it)

            if options.unstrip:
                def buildReadDictionary(filename):
                    # map read name to (sequence, quality) for one fastq
                    # file; the whole file is held in memory
                    if not os.path.exists(filename):
                        raise OSError("file not found: %s" % filename)
                    fastqfile = pysam.Fastqfile(filename)
                    fastq2sequence = {}
                    for x in fastqfile:
                        if x.name in fastq2sequence:
                            raise ValueError(
                                "read %s duplicate - can not unstrip" % x.name)

                        fastq2sequence[x.name] = (x.sequence, x.quality)
                    return fastq2sequence

                if not options.fastq_pair1:
                    raise ValueError(
                        "please supply fastq file(s) for unstripping")
                fastq2sequence1 = buildReadDictionary(options.fastq_pair1)
                if options.fastq_pair2:
                    fastq2sequence2 = buildReadDictionary(options.fastq_pair2)

                def unstrip_unpaired(i):
                    for read in i:
                        read.seq, read.qual = fastq2sequence1[read.qname]
                        yield read

                def unstrip_pair(i):
                    for read in i:
                        if read.is_read1:
                            read.seq, read.qual = fastq2sequence1[read.qname]
                        else:
                            read.seq, read.qual = fastq2sequence2[read.qname]
                        yield read

                if options.fastq_pair2:
                    it = unstrip_pair(it)
                else:
                    it = unstrip_unpaired(it)

            if options.set_nh:
                it = _bam2bam.SetNH(it)

            # keep first base of reads by changing the cigarstring to
            # '1M' and, in reads mapping to the reverse strand,
            # changes the pos to aend - 1
            if options.keep_first_base:
                def keep_first_base(i):
                    for read in i:
                        # unmapped reads are passed through untouched,
                        # even when the reverse-strand flag is set:
                        # pysam documents aend as None for them, so
                        # ``aend - 1`` would raise a TypeError
                        if not read.is_unmapped:
                            if read.is_reverse:
                                read.pos = read.aend - 1
                            read.cigarstring = '1M'
                        yield read
                it = keep_first_base(it)

            # read and output
            for read in it:
                pysam_out.write(read)

        # close the handles for both branches: the output has to be
        # closed before it is moved over the original and indexed below
        pysam_in.close()
        pysam_out.close()

        if options.inplace:
            # set date and file permissions according to original
            # Note: currently it will not update user and group.
            original = os.stat(bamfile)
            os.utime(tmpfile.name, (original.st_atime, original.st_mtime))
            os.chmod(tmpfile.name, original.st_mode)
            # move new file over original copy
            shutil.move(tmpfile.name, bamfile)
            # re-index
            pysam.index(bamfile)

    # write footer and output benchmark information.
    E.Stop()
Exemplo n.º 4
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Without ``--inplace`` a single bam file is read from stdin ("-") and
    the result is written to stdout. With ``--inplace`` every positional
    argument is treated as a bam file that is rewritten through a
    temporary file, moved over the original and re-indexed.

    If ``--filter`` is given, the reads are handed to
    ``_bam2bam.filter_bam``. Otherwise the selected modifications
    (``--unset-unmapped-mapq``, ``--set-sequence``, ``--strip``,
    ``--unstrip``, ``--set-nh``, ``--keep-first-base``) are chained as
    generators, in that order, and the resulting reads are written out.

    Raises
    ------
    ValueError
        if ``--inplace`` is given without bam files or together with
        "-", if a mismatch filter (NM/CM) is requested without
        ``--reference-bam``, if ``--unstrip`` is requested without
        ``--fastq1``, or if a fastq file contains a duplicate read name.
    OSError
        if a fastq file given for unstripping does not exist.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--set-nh",
        dest="set_nh",
        action="store_true",
        help=
        "sets the NH flag. The file needs to be sorted by readname [%default]")

    parser.add_option(
        "--unset-unmapped-mapq",
        dest="unset_unmapped_mapq",
        action="store_true",
        help="sets the mapping quality of unmapped reads to 0 [%default]")

    parser.add_option(
        "--set-sequence",
        dest="set_sequence",
        action="store_true",
        help="sets the sequence to 'A's (a valid base) and the quality to 'F's"
        ",which is defined in all fastq scoring schemes "
        "[%default]")

    parser.add_option(
        "--strip",
        dest="strip",
        type="choice",
        choices=("sequence", "quality", "match"),
        help=
        "remove parts of the bam-file. Note that stripping the sequence will "
        "also strip the quality values [%default]")

    parser.add_option("--unstrip",
                      dest="unstrip",
                      action="store_true",
                      help="add sequence and quality into bam file [%default]")

    parser.add_option(
        "--filter",
        dest="filter",
        action="append",
        type="choice",
        choices=('NM', 'CM', 'mapped', 'unique', "non-unique"),
        help="filter bam file. The option denotes the property that is  "
        "used to determine better match [%default]")

    parser.add_option("--reference-bam",
                      dest="reference_bam",
                      type="string",
                      help="bam-file to filter with [%default]")

    parser.add_option("--sam",
                      dest="output_sam",
                      action="store_true",
                      help="output in sam format [%default]")

    parser.add_option(
        "--inplace",
        dest="inplace",
        action="store_true",
        help="modify bam files in-place. Bam files need to be given "
        "as arguments. Temporary bam files are written to /tmp [%default]")

    parser.add_option(
        "--fastq1",
        "-1",
        dest="fastq_pair1",
        type="string",
        help=
        "fastq file with read information for first in pair or unpaired [%default]"
    )

    parser.add_option(
        "--fastq2",
        "-2",
        dest="fastq_pair2",
        type="string",
        help="fastq file with read information for second in pair [%default]")

    # the two literals are concatenated, hence the trailing space
    parser.add_option(
        "--keep-first-base",
        dest="keep_first_base",
        action="store_true",
        help=
        "keep first base of reads such that gtf2table.py will only consider the "
        "first base in its counts")

    parser.set_defaults(filter=[],
                        set_nh=False,
                        unset_unmapped_mapq=False,
                        output_sam=False,
                        reference_bam=None,
                        strip=None,
                        unstrip=None,
                        set_sequence=False,
                        inplace=False,
                        fastq_pair1=None,
                        fastq_pair2=None,
                        keep_first_base=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.inplace:
        bamfiles = args
        if len(bamfiles) == 0:
            raise ValueError(
                "please one or more bam-files as command line arguments")

        if "-" in bamfiles:
            raise ValueError(
                "can not read from stdin if ``--inplace`` is selected")
    else:
        bamfiles = ["-"]

    for bamfile in bamfiles:

        E.info('processing %s' % bamfile)

        # reading bam from stdin does not work with only the "r" tag
        pysam_in = pysam.Samfile(bamfile, "rb")

        if bamfile == "-":
            if options.output_sam:
                pysam_out = pysam.Samfile("-", "wh", template=pysam_in)
            else:
                pysam_out = pysam.Samfile("-", "wb", template=pysam_in)
        else:
            # only the name of the temporary file is needed; pysam opens
            # the path itself
            tmpfile = tempfile.NamedTemporaryFile(delete=False, prefix="ctmp")
            tmpfile.close()

            E.debug("writing temporary bam-file to %s" % tmpfile.name)
            pysam_out = pysam.Samfile(tmpfile.name, "wb", template=pysam_in)

        if options.filter:

            remove_mismatches, colour_mismatches = False, False

            if "NM" in options.filter:
                remove_mismatches = True

            elif "CM" in options.filter:
                remove_mismatches = True
                colour_mismatches = True

            if remove_mismatches:
                if not options.reference_bam:
                    raise ValueError(
                        "requiring reference bam file for removing by "
                        "mismatches")

                pysam_ref = pysam.Samfile(options.reference_bam, "rb")
            else:
                pysam_ref = None

            # filter and flags are the opposite way around
            c = _bam2bam.filter_bam(pysam_in,
                                    pysam_out,
                                    pysam_ref,
                                    remove_nonunique="unique"
                                    in options.filter,
                                    remove_unique="non-unique"
                                    in options.filter,
                                    remove_contigs=None,
                                    remove_unmapped="mapped" in options.filter,
                                    remove_mismatches=remove_mismatches,
                                    colour_mismatches=colour_mismatches)

            # the reference is re-opened for every input file, so release
            # it as soon as the filter has finished
            if pysam_ref is not None:
                pysam_ref.close()

            options.stdlog.write("category\tcounts\n%s\n" % c.asTable())
        else:
            # set up the modifying iterators
            it = pysam_in.fetch(until_eof=True)

            if options.unset_unmapped_mapq:

                def unset_unmapped_mapq(i):
                    for read in i:
                        if read.is_unmapped:
                            read.mapq = 0
                        yield read

                it = unset_unmapped_mapq(it)

            if options.set_sequence:

                def set_sequence(i):
                    for read in i:
                        # can't get at length of unmapped reads
                        if read.is_unmapped:
                            read.seq = "A"
                            read.qual = "F"
                        else:
                            read.seq = "A" * read.inferred_length
                            read.qual = "F" * read.inferred_length

                        yield read

                it = set_sequence(it)

            if options.strip is not None:

                def strip_sequence(i):
                    for read in i:
                        read.seq = None
                        yield read

                def strip_quality(i):
                    for read in i:
                        read.qual = None
                        yield read

                def strip_match(i):
                    for read in i:
                        # reads without an NM tag are treated as having
                        # a mismatch and therefore keep their sequence
                        try:
                            nm = read.opt('NM')
                        except KeyError:
                            nm = 1
                        if nm == 0:
                            read.seq = None
                        yield read

                if options.strip == "sequence":
                    it = strip_sequence(it)
                elif options.strip == "quality":
                    it = strip_quality(it)
                elif options.strip == "match":
                    it = strip_match(it)

            if options.unstrip:

                def buildReadDictionary(filename):
                    # map read name to (sequence, quality) for one fastq
                    # file; the whole file is held in memory
                    if not os.path.exists(filename):
                        raise OSError("file not found: %s" % filename)
                    fastqfile = pysam.Fastqfile(filename)
                    fastq2sequence = {}
                    for x in fastqfile:
                        if x.name in fastq2sequence:
                            raise ValueError(
                                "read %s duplicate - can not unstrip" % x.name)

                        fastq2sequence[x.name] = (x.sequence, x.quality)
                    return fastq2sequence

                if not options.fastq_pair1:
                    raise ValueError(
                        "please supply fastq file(s) for unstripping")
                fastq2sequence1 = buildReadDictionary(options.fastq_pair1)
                if options.fastq_pair2:
                    fastq2sequence2 = buildReadDictionary(options.fastq_pair2)

                def unstrip_unpaired(i):
                    for read in i:
                        read.seq, read.qual = fastq2sequence1[read.qname]
                        yield read

                def unstrip_pair(i):
                    for read in i:
                        if read.is_read1:
                            read.seq, read.qual = fastq2sequence1[read.qname]
                        else:
                            read.seq, read.qual = fastq2sequence2[read.qname]
                        yield read

                if options.fastq_pair2:
                    it = unstrip_pair(it)
                else:
                    it = unstrip_unpaired(it)

            if options.set_nh:
                it = _bam2bam.SetNH(it)

            # keep first base of reads by changing the cigarstring to
            # '1M' and, in reads mapping to the reverse strand,
            # changes the pos to aend - 1
            if options.keep_first_base:

                def keep_first_base(i):
                    for read in i:
                        # unmapped reads are passed through untouched,
                        # even when the reverse-strand flag is set:
                        # pysam documents aend as None for them, so
                        # ``aend - 1`` would raise a TypeError
                        if not read.is_unmapped:
                            if read.is_reverse:
                                read.pos = read.aend - 1
                            read.cigarstring = '1M'
                        yield read

                it = keep_first_base(it)

            # read and output
            for read in it:
                pysam_out.write(read)

        # close the handles for both branches: the output has to be
        # closed before it is moved over the original and indexed below
        pysam_in.close()
        pysam_out.close()

        if options.inplace:
            # set date and file permissions according to original
            # Note: currently it will not update user and group.
            original = os.stat(bamfile)
            os.utime(tmpfile.name, (original.st_atime, original.st_mtime))
            os.chmod(tmpfile.name, original.st_mode)
            # move new file over original copy
            shutil.move(tmpfile.name, bamfile)
            # re-index
            pysam.index(bamfile)

    # write footer and output benchmark information.
    E.Stop()