Example #1
    def test_merge(self):
        test_file = os.path.abspath('test_data/test_1500_merged_reads.bam')
        output_filtered_forward, output_filtered_reverse = filter_reads(
            self.arguments)
        output_tempfile = tempfile.NamedTemporaryFile(
            prefix='test_filtered_merged_',
            suffix='.bam',
            delete=False,
            dir=os.getcwd())
        output_tempfile.close()
        self.arguments.output = os.path.abspath(output_tempfile.name)
        merged_output = merge_bams(self.arguments, output_filtered_forward,
                                   output_filtered_reverse)
        self.assertEqual(merged_output, 0)
        save = pysam.set_verbosity(0)
        test_out_fh = pysam.AlignmentFile(self.arguments.output, 'r')
        test_cmp_fh = pysam.AlignmentFile(test_file, 'r')
        pysam.set_verbosity(save)
        print(self.arguments.output)
        # Compare each read individually since bellerophon.merge_bams()
        # adds a row to the @PG section of the SAM header, making the checksums differ.
        for output_read, test_read in zip(test_out_fh, test_cmp_fh):
            self.assertEqual(output_read, test_read)
        test_out_fh.close()
        test_cmp_fh.close()
        os.unlink(self.arguments.output)
        self.assertFalse(os.path.exists(self.arguments.output))
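Note on the loop above: merge_bams() appends a @PG record to the header, so whole-file checksums differ even when every alignment matches. If a header-level comparison were ever wanted instead, a minimal sketch (not part of the original test, assuming pysam's header.to_dict()) could drop the @PG records first:

# Sketch only: compare two BAM headers while ignoring @PG records.
import pysam

def headers_match_ignoring_pg(path_a, path_b):
    with pysam.AlignmentFile(path_a) as a, pysam.AlignmentFile(path_b) as b:
        dict_a, dict_b = a.header.to_dict(), b.header.to_dict()
        dict_a.pop('PG', None)
        dict_b.pop('PG', None)
        return dict_a == dict_b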
Example #2
def collapse_barcode(bam, out):
    logger.info(f'Deduplicating {bam} {size(bam)} by collapsing barcodes ...')
    verbosity = pysam.set_verbosity(0)
    with pysam.AlignmentFile(bam, 'rb') as b1, pysam.AlignmentFile(bam,
                                                                   'rb') as b2:
        results = {}
        for read1, read2 in zip(itertools.islice(b1, 0, None, 2),
                                itertools.islice(b2, 1, None, 2)):
            if read1.query_name != read2.query_name:
                raise ValueError(
                    f'Read names do not match: {read1.query_name} != {read2.query_name}.'
                )
            if read1.is_unmapped or read2.is_unmapped or read1.reference_name != read2.reference_name:
                continue
            if not read1.is_read1:
                read1, read2 = read2, read1
            randomer = read1.query_name.split(':')[0]
            start = read1.positions[-1] if read1.is_reverse else read1.pos
            stop = read2.positions[-1] if read2.is_reverse else read2.pos
            strand = '-' if read1.is_reverse else '+'
            location = (read1.reference_name, start, stop, strand, randomer)
            if location in results:
                continue
            results[location] = (read1, read2)
        with pysam.AlignmentFile(out, 'wb', template=b1) as o:
            for (read1, read2) in results.values():
                o.write(read1)
                o.write(read2)
        logger.info(
            f'Deduplicating {bam} {size(bam)} by collapsing barcodes complete.'
        )
    pysam.set_verbosity(verbosity)
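The double-open trick works because b1 and b2 are independent iterators over the same file: islice(b1, 0, None, 2) yields records 0, 2, 4, ... while islice(b2, 1, None, 2) yields records 1, 3, 5, ..., so zip() pairs each mate 1 with its mate 2 in a name-sorted BAM. A minimal sketch of the same pairing on a plain list:

import itertools

# Two independent iterators stand in for the two AlignmentFile handles.
records = ['A/1', 'A/2', 'B/1', 'B/2']
evens = itertools.islice(iter(records), 0, None, 2)
odds = itertools.islice(iter(records), 1, None, 2)
assert list(zip(evens, odds)) == [('A/1', 'A/2'), ('B/1', 'B/2')]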
Example #3
def run(args):
    # XXX https://github.com/pysam-developers/pysam/issues/939
    pysam.set_verbosity(0)  # pylint: disable=no-member
    if args.show_zmws:
        if [args.whitelist, args.blacklist, args.percentage].count(None) != 3:
            log.warning("Ignoring unused filtering arguments")
        show_zmws(args.input_bam)
        return 0
    try:
        return filter_reads(input_bam=args.input_bam,
                            output_bam=args.output_bam,
                            whitelist=args.whitelist,
                            blacklist=args.blacklist,
                            percentage=args.percentage,
                            count=args.count,
                            seed=args.seed,
                            ignore_metadata=args.ignore_metadata,
                            relative=args.relative,
                            anonymize=args.anonymize,
                            use_barcodes=args.barcodes,
                            sample_scraps=args.sample_scraps,
                            keep_original_uuid=args.keep_uuid,
                            use_subreads=args.subreads,
                            min_adapters=args.min_adapters)
    except UserError as e:
        log.error(str(e))
        return 1
Example #4
def barcode_collapse(bam, output, debug=False):
    """
    Deduplicate paired-end BAM by collapsing barcodes.

    :param bam: str, path to BAM file.
    :param output: str, path to the output file.
    :param debug: bool, set to True for invoking debug mode.
    """

    it.info(f'Deduplicating {bam} by collapsing barcodes ...')
    pysam.set_verbosity(1 if debug else 0)
    with pysam.AlignmentFile(bam, 'rb') as b1, pysam.AlignmentFile(bam, 'rb') as b2:
        results = {}
        for read1, read2 in zip(itertools.islice(b1, 0, None, 2), itertools.islice(b2, 1, None, 2)):
            if read1.query_name != read2.query_name:
                it.error_and_exit(f'Read names do not match: {read1.query_name} != {read2.query_name}.')
            if read1.is_unmapped or read2.is_unmapped or read1.reference_name != read2.reference_name:
                continue
            if not read1.is_read1:
                read1, read2 = read2, read1
            randomer = read1.query_name.split(':')[0]
            start = read1.positions[-1] if read1.is_reverse else read1.pos
            stop = read2.positions[-1] if read2.is_reverse else read2.pos
            strand = '-' if read1.is_reverse else '+'
            location = (read1.reference_name, start, stop, strand, randomer)
            if location in results:
                continue
            results[location] = (read1, read2)
        with pysam.AlignmentFile(output, 'wb', template=b1) as o:
            for (read1, read2) in results.values():
                o.write(read1)
                o.write(read2)
        it.info(f'Deduplicating {bam} by collapsing barcodes complete.')
Example #5
def run_consolidate(dataset_file,
                    output_file,
                    datastore_file,
                    consolidate,
                    n_files,
                    consolidate_f=lambda ds: ds.consolidate):
    # XXX https://github.com/pysam-developers/pysam/issues/939
    pysam.set_verbosity(0)  # pylint: disable=no-member
    datastore_files = []
    with openDataSet(dataset_file) as ds_in:
        if consolidate:
            if len(ds_in.toExternalFiles()) <= 0:
                raise ValueError(
                    "DataSet {} must contain one or more files!".format(
                        dataset_file))
            new_resource_file = bam_of_dataset(output_file)
            consolidate_f(ds_in)(new_resource_file,
                                 numFiles=n_files,
                                 useTmp=False)
            # always display the BAM/BAI if consolidation is enabled
            # XXX there is no uniqueness constraint on the sourceId, but this
            # seems sloppy nonetheless - unfortunately I don't know how else to
            # make view rule whitelisting work
            reads_name = get_reads_name(ds_in)
            for ext_res in ds_in.externalResources:
                if ext_res.resourceId.endswith(".bam"):
                    ds_file = DataStoreFile(ext_res.uniqueId,
                                            Constants.TOOL_ID + "-out-2",
                                            ext_res.metaType,
                                            ext_res.bam,
                                            name=reads_name,
                                            description=reads_name)
                    datastore_files.append(ds_file)
                    # Prevent duplicated index files being added to datastore, since consolidated
                    # dataset may contain multiple indices pointing to the same physical file
                    added_resources = set()
                    for index in ext_res.indices:
                        if (index.metaType in Constants.BAI_FILE_TYPES
                                and index.resourceId not in added_resources):
                            added_resources.add(index.resourceId)
                            ds_file = DataStoreFile(
                                index.uniqueId,
                                Constants.TOOL_ID + "-out-3",
                                index.metaType,
                                index.resourceId,
                                name="Index of {}".format(reads_name.lower()),
                                description="Index of {}".format(
                                    reads_name.lower()))
                            datastore_files.append(ds_file)
        ds_in.newUuid()
        ds_in.write(output_file)
    datastore = DataStore(datastore_files)
    datastore.write_json(datastore_file)
    return 0
Example #6
def check_if_equal(bam_path, gbam_path, no_check_fields=()):  # tuple avoids a mutable default
    # Suppress warnings to work with BAM files without index file.
    # https://github.com/pysam-developers/pysam/issues/939#issuecomment-669016051
    save = pysam.set_verbosity(0)
    bam_file = pysam.AlignmentFile(bam_path, "rb")
    pysam.set_verbosity(save)

    fields_to_check = [
        field for field in list(map(int, Fields))
        if field not in no_check_fields
    ]

    gbam_file = get_reader(gbam_path, get_parsing_tmpl(fields_to_check))
    from gbam_tools import GbamRecord

    i = 0
    while True:
        cur_gbam = gbam_file.next_record()
        cur_bam = next(bam_file, None)
        if i > 0 and i % 100000 == 0:
            print('%d records processed' % i)
        if cur_gbam is None or cur_bam is None:
            # Both files should run out of records at the same time.
            assert cur_gbam == cur_bam
            break

        for field in fields_to_check:
            if field == Fields.REFID:
                assert (cur_bam.reference_id == cur_gbam.refid)
            if field == Fields.POS:
                assert (cur_bam.reference_start == cur_gbam.pos)
            if field == Fields.MAPQ:
                assert (cur_bam.mapping_quality == cur_gbam.mapq)
            if field == Fields.BIN:
                assert (cur_bam.bin == cur_gbam.bin)
            if field == Fields.FLAGS:
                assert (cur_bam.flag == cur_gbam.flag)
            if field == Fields.NEXTREFID:
                assert (cur_bam.next_reference_id == cur_gbam.next_ref_id)
            if field == Fields.NEXTPOS:
                assert (cur_bam.next_reference_start == cur_gbam.next_pos)
            if field == Fields.TLEN:
                assert (cur_bam.template_length == cur_gbam.tlen)
            if field == Fields.READNAME:
                assert (list(bytearray(cur_bam.query_name,
                                       'utf8')) == cur_gbam.read_name[:-1])
            if field == Fields.RAWCIGAR:
                assert (cur_bam.cigarstring == cur_gbam.cigar)
            if field == Fields.RAWSEQUENCE:
                assert (cur_bam.query_sequence == cur_gbam.seq)
            if field == Fields.RAWQUAL:
                assert (cur_bam.query_qualities == array('B', cur_gbam.qual))
        i += 1
Example #7
def get_align_file(path: str,
                   mode='r',
                   template=None,
                   expectIndex=True,
                   threads=1):
    hts_ext = os.path.splitext(path)[-1]
    xam_type = HTS_EXT_TO_AF_MODE[hts_ext]
    if expectIndex is False:
        save = pysam.set_verbosity(0)
    af = pysam.AlignmentFile(path,
                             f'{mode}{xam_type}',
                             template=template,
                             threads=threads)
    if expectIndex is False:
        pysam.set_verbosity(save)
    return af
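HTS_EXT_TO_AF_MODE is not shown in this snippet. A hypothetical reconstruction, based on the mode suffixes pysam.AlignmentFile actually accepts ('' for SAM, 'b' for BAM, 'c' for CRAM), might look like:

# Hypothetical helper constant; not part of the original snippet.
HTS_EXT_TO_AF_MODE = {
    '.sam': '',    # 'r'  -> SAM
    '.bam': 'b',   # 'rb' -> BAM
    '.cram': 'c',  # 'rc' -> CRAM
}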
Example #8
    def __init__(self, regex=r"[^\|]+", is_bam=True):
        self.regex = regex
        # min percent identity to consider a valid read
        self.min_identity = 0.95

        # bam files without an index will generate a warning on
        # opening. since we don't need an index, setting the
        # verbosity will silence this message
        pysam.set_verbosity(0)

        if is_bam:
            self.read_mode = "rb"
            self.write_mode = "wb"
        else:
            self.read_mode = "r"
            self.write_mode = "w"
Example #9
def process_bamfile(alignment, min_qual, filtered_out):
    """Filter alignment BAM files

    Iterates over all the reads in the input BAM alignment file and keeps a
    read in the output if it is aligned with sufficient quality (mapping
    quality at or above the given threshold), saving only a few columns
    (ReadID, Contig, Position_start, Position_end, strand) to save memory.

    Parameters:
    -----------
    alignment : str
        Path to the input temporary alignment.
    min_qual : int
        Minimum mapping quality required to keep a Hi-C pair.
    filtered_out : str
        Path to the output temporary tsv alignment file.

    Returns:
    --------
    int:
        Number of reads aligned.
    """

    # Check the quality and status of each aligned fragment.
    aligned_reads = 0
    save = pysam.set_verbosity(0)
    temp_bam = pysam.AlignmentFile(alignment, "rb", check_sq=False)
    pysam.set_verbosity(save)
    with open(filtered_out, "a") as f:
        for r in temp_bam:
            # Check mapping quality
            if r.mapping_quality >= min_qual:
                # Check Mapping (0 or 16 flags are kept only)
                if r.flag == 0:
                    aligned_reads += 1
                    read = str(r.query_name + "\t" + r.reference_name + "\t" +
                               str(r.reference_start) + "\t" + "+" + "\n")
                    f.write(read)
                elif r.flag == 16:
                    aligned_reads += 1
                    read = str(r.query_name + "\t" + r.reference_name + "\t" +
                               str(r.reference_start) + "\t" + "-" + "\n")
                    f.write(read)
    temp_bam.close()

    return aligned_reads
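A possible invocation matching the documented signature (both paths are illustrative placeholders):

# Illustrative usage only; file names are placeholders.
n_aligned = process_bamfile('temp_alignment.bam', 30, 'filtered.tsv')
print(f'{n_aligned} reads kept')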
Example #10
def reheader_bam(bam_file_in,
                 bam_file_out,
                 biosample_name=None,
                 library_name=None):
    """
    Write a new BAM file identical to the input except for substitution or
    addition of SM and/or LB tags in the @RG header.  If the tags are already
    present and current no file will be written.

    :return: True if header was changed, False if header is already current
    """
    # XXX https://github.com/pysam-developers/pysam/issues/939
    pysam.set_verbosity(0)  # pylint: disable=no-member
    was_changed = False
    with pysam.AlignmentFile(bam_file_in, "rb", check_sq=False) as bam_in:  # pylint: disable=no-member
        header = dict(bam_in.header)
        for rg in header["RG"]:
            if biosample_name:
                if rg.get("SM", None) != biosample_name:
                    was_changed = True
                rg["SM"] = biosample_name
            if library_name:
                if rg.get("LB", None) != library_name:
                    was_changed = True
                rg["LB"] = library_name
        if not was_changed:
            return False
        log.debug("Writing modified header and records to %s", bam_file_out)
        with pysam.AlignmentFile(
                bam_file_out,  # pylint: disable=no-member
                "wb",
                header=header) as bam_out:
            for rec in bam_in:
                bam_out.write(rec)
        log.debug("Running pbindex")
        subprocess.check_call(["samtools", "index", bam_file_out])
        subprocess.check_call(["pbindex", bam_file_out])
    return True
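A possible call matching the docstring (file names are illustrative); the function returns False and writes nothing when the SM/LB tags are already current:

# Illustrative usage only.
changed = reheader_bam('in.bam', 'out.bam',
                       biosample_name='Sample_1',
                       library_name='Library_1')
if not changed:
    print('Header already current; no output written.')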
Example #11
def main():
    parser = argparse.ArgumentParser(description='Shard .bam file using the .pbi index', prog='shard_bam')
    parser.add_argument('-p', '--prefix', type=str, default="shard", help="Shard filename prefix")
    parser.add_argument('-n', '--num_shards', type=int, default=4, help="Number of shards")
    parser.add_argument('-t', '--num_threads', type=int, default=2, help="Number of threads to use during sharding")
    parser.add_argument('-x', '--exclude', type=str, help='Comma-separated list of tags to exclude '
                                                          '(note: removing ip and pw tags will break ccs)')
    parser.add_argument('-i', '--index', type=str, required=False, help="PBI index filename")
    parser.add_argument('bam', type=str, help="BAM")
    args = parser.parse_args()

    pbi = args.bam + ".pbi" if args.index is None else args.index

    # Silence message about the .bai file not being found.
    pysam.set_verbosity(0)

    # Decode PacBio .pbi file and determine the shard offsets.
    print(f"Reading index ({pbi}). This may take a few minutes...", flush=True)
    offsets, zmw_counts, read_count = compute_shard_offsets(pbi, args.num_shards)

    # Prepare a function with arguments partially filled in.
    tags_to_exclude = [] if args.exclude is None else args.exclude.split(",")
    func = partial(write_shard, args.bam, offsets, zmw_counts, tags_to_exclude, args.prefix)
    idx = list(range(0, len(offsets) - 1))

    # Write the shards using the specified number of threads.
    print(f"Writing {len(idx)} shards using {args.num_threads} threads...", flush=True)
    res = ThreadPool(args.num_threads).imap_unordered(func, idx)

    # Emit final stats on the sharding.
    all_num_reads_written = list(res)
    count = 0
    for i in range(len(all_num_reads_written)):
        count += all_num_reads_written[i]
        print(f'  - wrote {all_num_reads_written[i]} reads to {args.prefix}{i}.bam', flush=True)

    print(f'Sharded {count}/{read_count} reads across {len(idx)} shards.', flush=True)
Example #12
def RTag(sli, c):
    '''
    Add CL/HP tags to the BAM upon request (slows things down a bit).
    '''

    for s in sli:

        save = pysam.set_verbosity(0)
        bamfilein = pysam.AlignmentFile(s, mode='rb', require_index=False)
        pysam.set_verbosity(save)

        with pysam.AlignmentFile(s + '.tmp', mode='wb',
                                 template=bamfilein) as bamfileout:

            for r in bamfilein.fetch(until_eof=True):

                r.set_tag('CL', c.clonenumber, 'i')
                r.set_tag('HP', c.hapnumber, 'i')
                bamfileout.write(r)

        bamfilein.close()
        os.remove(s)
        os.rename(s + '.tmp', s)
Example #13
def main():
    parser = argparse.ArgumentParser(
        description=
        'Reset base qualities of reads in the CLR bam to the requested Phred base quality',
        prog='reset_clr_bam_bq')
    parser.add_argument('-q',
                        '--basequal',
                        type=int,
                        default=10,
                        help="Desired Phred base quality")
    parser.add_argument('-p',
                        '--prefix',
                        type=str,
                        default="barbequed",
                        help="Shard filename prefix")
    parser.add_argument('bam', type=str, help="BAM")
    args = parser.parse_args()

    # Silence message about the .bai file not being found.
    pysam.set_verbosity(0)

    if args.basequal < 0 or (args.basequal > 60 and args.basequal != 255):
        raise ValueError(f"Requested BQ value {args.basequal} isn't valid.")

    # https://pysam.readthedocs.io/en/latest/api.html#pysam.AlignedSegment.query_qualities
    # query_qualities takes raw Phred scores, so no +33 ASCII offset is applied.
    bq = args.basequal
    print(f"Setting base qualities to ASCII {chr(args.basequal + 33)}.")

    bf = pysam.Samfile(args.bam, 'rb', check_sq=False)
    with pysam.Samfile(f'{args.prefix}.bam', 'wb', header=bf.header) as out:
        for read in bf:
            sausage = copy.deepcopy(read)
            n = len(sausage.query_sequence)
            sausage.query_qualities = [bq] * n
            out.write(sausage)
Example #14
from cdispyutils.hmac4 import get_auth
import subprocess
import glob
import os
import sys
import requests
import json
import pysam
import numpy as np
import matplotlib.pyplot as plt
from operator import add

pysam.set_verbosity(0)

auth = ''

main_header_order = [
    'Sample', 'VCF File', 'Expectations', 'True-Positive', 'False-Positive',
    'Sensitivity', 'Specificity'
]

data_types = {
    'VCF': 'submitted_somatic_mutations',
    'FASTQ': 'submitted_unaligned_reads_files',
    'BAM': 'submitted_aligned_reads_files',
    'CNV': 'submitted_copy_number_files'
}

metadata_types = {'METADATA': 'experiment_metadata_files'}

Example #15
def process_bwa_bamfile(alignment, min_qual, contig_data, out_file):
    """Filter alignment BAM files

    Iterates over all the reads in the input BAM alignment file and keeps a
    read in the output if it is aligned with sufficient quality (mapping
    quality at or above the given threshold), saving only a few columns
    (ReadID, Contig, Position_start, Position_end, strand) to save memory.

    Parameters:
    -----------
    alignment : str
        Path to the input temporary alignment.
    min_qual : int
        Minimum mapping quality required to keep a Hi-C pair.
    contig_data : dict
        Dictionary of all the contigs from the assembly. The contig names are
        the keys, and each value holds the contig data under the keys "id",
        "length", "GC", "hit" and "coverage". Coverage is still at 0 here and
        needs to be updated later.
    out_file : str
        Path to the output pairs file.

    Returns:
    --------
    int:
        Number of pairs aligned.
    """

    # Read the bam file.
    n_pairs = 0
    save = pysam.set_verbosity(0)
    temp_bam = pysam.AlignmentFile(alignment, "rb", check_sq=False)
    pysam.set_verbosity(save)

    with open(out_file, "w") as merged:

        # Write header of the pairs file.
        merged.write("## pairs format v1.0\n")
        merged.write("#columns: readID chr1 pos1 chr2 pos2 strand1 strand2\n")
        merged.write("#sorted: readID\n")
        merged.write("#shape: upper triangle\n")
        for contig in contig_data:
            merged.write("#chromsize: {0} {1}\n".format(
                contig, contig_data[contig]["length"]))

        # Loop until the end of the file. Read the reads by two as the forward
        # and reverse reads should be interleaved.
        while n_pairs >= 0:
            try:
                for_read = next(temp_bam)
                while for_read.is_supplementary:
                    for_read = next(temp_bam)
                rev_read = next(temp_bam)
                while rev_read.is_supplementary:
                    rev_read = next(temp_bam)

                # Check mapping quality
                if (for_read.mapping_quality >= min_qual
                        and rev_read.mapping_quality >= min_qual):

                    # Check flag
                    if not (for_read.is_unmapped or rev_read.is_unmapped):
                        n_pairs += 1

                        # Safety check (forward and reverse are the same reads)
                        if for_read.query_name != rev_read.query_name:
                            logger.error(
                                "Reads should be paired - %s\t%s",
                                for_read.query_name,
                                rev_read.query_name,
                            )
                            raise ValueError

                        # Define pairs value.
                        name = for_read.query_name
                        contig1 = for_read.reference_name
                        contig2 = rev_read.reference_name
                        pos1 = for_read.pos + 1
                        pos2 = rev_read.pos + 1
                        strand1 = "+"
                        strand2 = "+"
                        if for_read.is_reverse:
                            strand1 = "-"
                        if rev_read.is_reverse:
                            strand2 = "-"

                        # Modify order to have an upper triangle and write
                        # the pair.
                        if (contig1 == contig2
                                and pos1 <= pos2) or contig_data[contig1][
                                    "id"] < contig_data[contig2]["id"]:
                            merged.write("\t".join([
                                name,
                                contig1,
                                str(pos1),
                                contig2,
                                str(pos2),
                                strand1,
                                strand2,
                            ]) + "\n")
                        else:
                            merged.write("\t".join([
                                name,
                                contig2,
                                str(pos2),
                                contig1,
                                str(pos1),
                                strand2,
                                strand1,
                            ]) + "\n")

            # Exit the loop if no more reads.
            except StopIteration:
                break

    # Close the bam file and return number of pairs
    temp_bam.close()
    return n_pairs
Example #16
def StrandSim(w, c):
    '''
	Perform first part of strand-seq simulations and re-align to the original haplotype
	'''

    hfa = pyfaidx.Fasta(c.ffile)

    if w.chrom not in hfa.keys():

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Warning] Chromosome ' + w.chrom +
              ' not found in ' + c.ffile + '. Skipped simulation')

    else:

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Preparing simulation from ' + c.ffile +
              '. Haplotype ' + str(c.hapnumber))

        chr_ = hfa[w.chrom]
        seq_ = chr_[w.start - 1:w.end].seq
        tmpfa = os.path.abspath(c.haplodir + '/' + 'htmp.fa')
        region = w.chrom + '_' + str(w.start) + '_' + str(w.end)

        with open(tmpfa,
                  'w') as tmpfout:  #write temporary fa for sampling reads

            tmpfout.write('>' + region + '\n' +
                          '\n'.join(re.findall('.{1,60}', seq_)) + '\n')

        Ns = seq_.count('N')  #normalize coverage on Ns
        Nreads = round(((c.regioncoverage * (len(seq_) - Ns)) / c.length) /
                       2)  #for paired-end sequencing
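        # Worked example (assuming a 1 Mb window with no Ns, a 30x target
        # and 150 bp reads): Nreads = round((30 * 1e6 / 150) / 2) = 100000
        # pairs, and hapcov below evaluates to 100000 * 150 * 2 / 1e6 = 30x.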

        mate1h = os.path.abspath(c.haplodir + '/hr1.tmp.fq')
        mate2h = os.path.abspath(c.haplodir + '/hr2.tmp.fq')

        hapcov = Nreads * c.length * 2 / ((w.end - w.start) - Ns)
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now +
              '][Message] Simulated coverage for this region will be ' +
              str(hapcov))

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Simulating')

        wgsim.core(r1=mate1h,
                   r2=mate2h,
                   ref=tmpfa,
                   err_rate=c.error,
                   mut_rate=c.mutation,
                   indel_frac=c.indels,
                   indel_ext=c.extindels,
                   N=Nreads,
                   dist=c.distance,
                   stdev=c.stdev,
                   size_l=c.length,
                   size_r=c.length,
                   max_n=0.05,
                   is_hap=0,
                   is_fixed=0,
                   seed=0)

        os.remove(tmpfa)

        mate1hnew = os.path.abspath(c.haplodir + '/hr1.fq')
        mate2hnew = os.path.abspath(c.haplodir + '/hr2.fq')

        with open(mate1hnew, 'w') as out1, open(mate2hnew, 'w') as out2:

            for (name1, seq1, qual1), (name2, seq2,
                                       qual2) in zip(mp.fastx_read(mate1h),
                                                     mp.fastx_read(mate2h)):

                #change name1/name2

                newname1 = '@c' + str(c.singlecellnum) + 'h' + str(
                    c.hapnumber) + 'fh_' + name1
                newname2 = '@c' + str(c.singlecellnum) + 'h' + str(
                    c.hapnumber) + 'fh_' + name2

                read1 = [newname1, seq1, '+', qual1]
                read2 = [newname2, seq2, '+', qual2]

                out1.write('\n'.join(read1) + '\n')
                out2.write('\n'.join(read2) + '\n')

        os.remove(mate1h)
        os.remove(mate2h)

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print(
            '[' + now +
            '][Message] Mapping simulated reads to the corresponding haplotype'
        )

        BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.srt.bam')

        sam_cmd = [
            'minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
            '-t',
            str(c.threads), c.ffile, mate1hnew, mate2hnew
        ]
        bam_cmd = [
            'samtools', 'sort', '-@',
            str(round(c.threads / 2)), '-o', BAM
        ]

        p1 = subprocess.Popen(sam_cmd,
                              stderr=open(os.devnull, 'wb'),
                              stdout=subprocess.PIPE)
        bout = open(BAM, 'wb')
        p2 = subprocess.run(bam_cmd,
                            stdin=p1.stdout,
                            stderr=open(os.devnull, 'wb'),
                            stdout=bout)
        bout.close()

        os.remove(mate1hnew)
        os.remove(mate2hnew)

        #now re-parse BAM file to keep only Watson/Crick reads
        #Watson reads: read1 forward, read2 reverse
        #Crick reads: read2 forward, read1 reverse

        ivf = None

        if len(c.sce_bedregion) != 0:

            sce_string = ''

            for s in c.sce_bedregion:

                if s[3] == c.cellid and s[4] == c.hapid:

                    sce_string += s.chrom + '\t' + str(s.start) + '\t' + str(
                        s.end) + '\n'

            if sce_string != '':

                sce_fromscratch = pybedtools.BedTool(sce_string.rstrip(),
                                                     from_string=True)
                ivf = sce_fromscratch.as_intervalfile(
                )  #intervals where to perform SCE events

                now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                print(
                    '[' + now +
                    '][Message] Detected one or more SCE events for current cell/haplotype'
                )

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print(
            '[' + now +
            '][Message] Extracting Watson (R1F,R2R) and Crick (R1R,R2F) reads')

        save = pysam.set_verbosity(0)
        bamstrand = pysam.AlignmentFile(
            BAM, 'rb', require_index=False)  #until-eof consumes the bamfile
        pysam.set_verbosity(save)
        Wreads = list(WR(bamstrand, ivf))
        bamstrand.close()

        save = pysam.set_verbosity(0)
        bamstrand = pysam.AlignmentFile(
            BAM, 'rb', require_index=False)  #re-open for second round
        pysam.set_verbosity(save)
        Creads = list(CR(bamstrand, ivf))
        bamstrand.close()

        os.remove(BAM)

        if c.noise > 0:

            now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
            print('[' + now + '][Message] Adding noise to strands')

            CtoW = random.sample(Creads, round(len(Wreads) / 100 * c.noise))
            Wreads += CtoW

            WtoC = random.sample(Wreads, round(len(Creads) / 100 * c.noise))
            Creads += WtoC

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Writing Watson and Crick FASTQ')

        w1 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.w1.fq')
        w2 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.w2.fq')

        c1 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.c1.fq')
        c2 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.c2.fq')

        with open(w1, 'w') as wout1, open(w2, 'w') as wout2:

            for r1, r2 in Wreads:

                if r1.get_tag('OS') == 'W':  #this is true W

                    read1 = [
                        '@' + r1.query_name, r1.query_sequence, '+',
                        '2' * c.length
                    ]
                    read2 = [
                        '@' + r2.query_name,
                        mp.revcomp(r2.query_sequence), '+', '2' * c.length
                    ]

                else:  #write to Watson, but is Crick

                    read1 = [
                        '@' + r1.query_name,
                        mp.revcomp(r1.query_sequence), '+', '2' * c.length
                    ]
                    read2 = [
                        '@' + r2.query_name, r2.query_sequence, '+',
                        '2' * c.length
                    ]

                wout1.write('\n'.join(read1) + '\n')
                wout2.write('\n'.join(read2) + '\n')

        with open(c1, 'w') as cout1, open(c2, 'w') as cout2:

            for r1, r2 in Creads:

                if r1.get_tag('OS') == 'C':  #this is true C

                    read1 = [
                        '@' + r1.query_name,
                        mp.revcomp(r1.query_sequence), '+', '2' * c.length
                    ]
                    read2 = [
                        '@' + r2.query_name, r2.query_sequence, '+',
                        '2' * c.length
                    ]

                else:  #write to Crick, but is Watson

                    read1 = [
                        '@' + r1.query_name, r1.query_sequence, '+',
                        '2' * c.length
                    ]
                    read2 = [
                        '@' + r2.query_name,
                        mp.revcomp(r2.query_sequence), '+', '2' * c.length
                    ]

                cout1.write('\n'.join(read1) + '\n')
                cout2.write('\n'.join(read2) + '\n')

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print(
            '[' + now +
            '][Message] Mapping Watson and Crick reads to the original reference'
        )

        BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) +
                              '.W.srt.bam')

        sam_cmd = [
            'minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
            '-t',
            str(c.threads), '-R', '@RG\\tID:illumina\\tSM:strand', c.REF, w1,
            w2
        ]
        bam_cmd = [
            'samtools', 'sort', '-@',
            str(round(c.threads / 2)), '-o', BAM
        ]

        p1 = subprocess.Popen(sam_cmd,
                              stderr=open(os.devnull, 'wb'),
                              stdout=subprocess.PIPE)
        bout = open(BAM, 'wb')
        p2 = subprocess.run(bam_cmd,
                            stdin=p1.stdout,
                            stderr=open(os.devnull, 'wb'),
                            stdout=bout)
        bout.close()

        os.remove(w1)
        os.remove(w2)

        BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) +
                              '.C.srt.bam')

        sam_cmd = [
            'minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
            '-t',
            str(c.threads), '-R', '@RG\\tID:illumina\\tSM:strand', c.REF, c1,
            c2
        ]
        bam_cmd = [
            'samtools', 'sort', '-@',
            str(round(c.threads / 2)), '-o', BAM
        ]

        p1 = subprocess.Popen(sam_cmd,
                              stderr=open(os.devnull, 'wb'),
                              stdout=subprocess.PIPE)
        bout = open(BAM, 'wb')
        p2 = subprocess.run(bam_cmd,
                            stdin=p1.stdout,
                            stderr=open(os.devnull, 'wb'),
                            stdout=bout)
        bout.close()

        os.remove(c1)
        os.remove(c2)
Example #17
    def categorize_outcomes(self, max_reads=None):
        # Record how long each categorization takes.
        times_taken = []

        if self.fns['outcomes_dir'].is_dir():
            shutil.rmtree(str(self.fns['outcomes_dir']))

        self.fns['outcomes_dir'].mkdir()

        outcome_to_qnames = defaultdict(list)

        bam_read_type = 'nonredundant'

        # iter wrap since tqdm objects are not iterators
        alignment_groups = iter(self.alignment_groups())

        if max_reads is not None:
            alignment_groups = itertools.islice(alignment_groups, max_reads)

        special_als = defaultdict(list)

        with self.fns['outcome_list'].open('w') as outcome_fh:

            for name, als in self.progress(alignment_groups,
                                           desc='Categorizing reads'):
                seq = als[0].get_forward_sequence()

                # Special handling of empty sequence.
                if seq is None:
                    seq = ''

                if seq in self.seq_to_outcome:
                    layout = self.seq_to_outcome[seq]
                    layout.query_name = name

                else:
                    layout = self.categorizer(als,
                                              self.target_info,
                                              error_corrected=self.has_UMIs,
                                              mode=self.layout_mode)

                    try:
                        layout.categorize()
                    except:
                        print()
                        print(self.sample_name, name)
                        raise

                if layout.special_alignment is not None:
                    special_als[layout.category, layout.subcategory].append(
                        layout.special_alignment)

                outcome_to_qnames[layout.category,
                                  layout.subcategory].append(name)

                outcome = self.final_Outcome.from_layout(layout)

                outcome_fh.write(f'{outcome}\n')

                times_taken.append(time.monotonic())

        # To make plotting easier, for each outcome, make a file listing all of
        # qnames for the outcome and a bam file (sorted by name) with all of the
        # alignments for these qnames.

        qname_to_outcome = {}

        bam_fn = self.fns_by_read_type['bam_by_name'][bam_read_type]
        header = sam.get_header(bam_fn)

        alignment_sorters = sam.multiple_AlignmentSorters(header, by_name=True)

        for outcome, qnames in outcome_to_qnames.items():
            outcome_fns = self.outcome_fns(outcome)
            outcome_fns['dir'].mkdir()

            alignment_sorters[outcome] = outcome_fns['bam_by_name'][
                bam_read_type]

            with outcome_fns['query_names'].open('w') as fh:
                for qname in qnames:
                    qname_to_outcome[qname] = outcome
                    fh.write(qname + '\n')

        with alignment_sorters:
            saved_verbosity = pysam.set_verbosity(0)
            with pysam.AlignmentFile(bam_fn) as full_bam_fh:
                for al in self.progress(full_bam_fh,
                                        desc='Making outcome-specific bams'):
                    if al.query_name in qname_to_outcome:
                        outcome = qname_to_outcome[al.query_name]
                        alignment_sorters[outcome].write(al)
            pysam.set_verbosity(saved_verbosity)

        # Make special alignments bams.
        for outcome, als in self.progress(
                special_als.items(), desc='Making special alignments bams'):
            outcome_fns = self.outcome_fns(outcome)
            bam_fn = outcome_fns['special_alignments']
            sorter = sam.AlignmentSorter(bam_fn, header)
            with sorter:
                for al in als:
                    sorter.write(al)

        return np.array(times_taken)
Example #18
File: bam.py  Project: rtcz/varlock
def open_bam(*args, **kwargs):
    # https://github.com/pysam-developers/pysam/issues/939
    pysam.set_verbosity(0)
    return AlignmentFile(*args, **kwargs)
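As written, open_bam() silences htslib globally and never restores the previous level. A hedged variant (a sketch, not the varlock code) using the save/restore idiom seen in the other examples:

# Sketch only: same effect, but restores the caller's verbosity level.
def open_bam_quietly(*args, **kwargs):
    save = pysam.set_verbosity(0)
    try:
        return AlignmentFile(*args, **kwargs)
    finally:
        pysam.set_verbosity(save)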
Example #19
    def code_block(input_bam: Bam, tags: Optional[String], out_bam: String):
        from sys import exit, stderr
        import pysam

        def commaSepList(inputStr):
            # Strip whitespace and drop empty items; note that "".split(",")
            # returns [""], so check the cleaned list, not the raw split.
            tmpList = [s.strip() for s in inputStr.split(",") if s.strip()]
            if len(tmpList) == 0:
                print("No input tags provided")
                exit(1)
            return tmpList

        if not tags:
            tags = "ZA,ZB,RX,QX"

        tags = commaSepList(tags)

        # Work around htslib warnings that don't impact the calculation
        save = pysam.set_verbosity(0)

        bamfh = pysam.AlignmentFile(input_bam, "rb")

        pysam.set_verbosity(save)

        umi_dict = dict()

        outfh = pysam.AlignmentFile(out_bam, "wb", template=bamfh)

        readcount = 0
        for read in bamfh:
            readcount += 1

            if read.query_name not in umi_dict:
                if sum([read.has_tag(t) for t in tags]) == len(tags):
                    if read.has_tag("XA"):
                        allTags = read.get_tags()
                        relTags = [(t, v) for t, v in allTags if t in tags]
                        umi_dict.update({read.query_name: relTags})
                    outfh.write(read)
                else:
                    missingTag = [t for t in tags if not read.has_tag(t)]
                    missingTagStr = ", ".join(missingTag)
                    errStr = "".join([
                        missingTagStr, " is missing for read ",
                        read.query_name, "\n"
                    ])
                    stderr.write(errStr)
            elif read.query_name in umi_dict:
                msg = "Accessing umi_dict\n"
                stderr.write(msg)
                if not read.has_tag("RX"):
                    curTags = read.get_tags()
                    curTags.extend(umi_dict[read.query_name])
                    read.set_tags(curTags)
                    outfh.write(read)
                else:
                    outfh.write(read)
            if readcount % 100000 == 0:
                msg = str(readcount) + " processed reads\n"
                stderr.write(msg)

        outfh.close()
        bamfh.close()

        return {"out": out_bam}
Example #20
def main():
    parser = argparse.ArgumentParser(
        description='Remove redundant alignment records from ONT BAM file',
        prog='remove_redundant_reads')
    parser.add_argument('-p',
                        '--prefix',
                        type=str,
                        default="shard",
                        help="Output prefix")
    parser.add_argument('-a',
                        '--annotations',
                        type=str,
                        help="Annotations on (potential) duplicate reads")

    parser.add_argument('bam', type=str, help="BAM")
    args = parser.parse_args()

    # create a dict of sets, a trick to avoid hash collisions
    guilty_dict_per_chr = dict()
    with open(args.annotations) as f:
        for line in f:
            arr = line.strip().split('\t')
            name = arr[0]
            chrom = arr[2]
            guilty_dict_per_chr.setdefault(chrom, set())
            guilty_dict_per_chr[chrom].add(name)

    # Silence message about the .bai file not being found.
    pysam.set_verbosity(0)

    num_alignments, num_dropped_alignments = 0, 0
    bf = pysam.Samfile(args.bam, 'rb', check_sq=False)
    with pysam.Samfile(f'{args.prefix}.bam', 'wb', header=bf.header) as out:
        # we rely on the observation that for coordinate sorted BAM,
        # duplicate records will appear in blocks, hence once we step off a position with duplicates, we start afresh
        current_position = -1
        current_signatures = set()
        for read in bf:
            num_alignments += 1

            chrom = read.reference_name
            n = read.query_name
            if n in guilty_dict_per_chr.get(chrom, set()):  # tolerate chromosomes with no annotated duplicates

                mq = read.mapping_quality
                sam_flag = read.flag
                pos = read.reference_start
                signature = f"{n}-{chrom}-{pos}-{mq}-{sam_flag}-"

                if current_position != pos:  # new position, let's write and reset
                    out.write(read)
                    current_position = pos
                    current_signatures = set()
                    current_signatures.add(signature)
                elif signature in current_signatures:  # duplicate signature at this position, drop it
                    num_dropped_alignments += 1
                else:  # you are a new group of duplicates that map to this location
                    out.write(read)
                    current_signatures.add(signature)
            else:
                out.write(read)

    print(f'num_alignments: {num_alignments}')
    print(f'num_dropped_alignments: {num_dropped_alignments}')
    print(f'num_kept_alignments: {num_alignments - num_dropped_alignments}')
Example #21
node = graph.add_vertex()
v_id[node] = "{idx}_{sample}".format(idx=0, sample=sample)
v_name[node] = ""
v_seq[node] = ""
v_q_qual[node] = ""

# add reads as vertices of the graph
if reads.endswith(".gz"):
    with gzip.open(reads, "rt") as _reads:
        graph = graph_operations.set_nodes(graph, _reads, format, sample)
else:
    with open(reads, "r") as _reads:  # "rU" mode was removed in Python 3.11
        graph = graph_operations.set_nodes(graph, _reads, format, sample)

# add edges from all-vs-all alignment of reads (please see rule minimap2)
verbose = pysam.set_verbosity(0)  # https://github.com/pysam-developers/pysam/issues/939
bam = pysam.AlignmentFile(bam, "rb")
pysam.set_verbosity(verbose)
for read in bam.fetch(until_eof=True):
    graph = graph_operations.set_edges(graph, read, threshold)
bam.close()
graph.remove_vertex(0)

# write log files
sys.stderr.write("graph construction summary for sample {}:"
                 "\n nodes:\t{}\n edges:\t{}\n".format(sample,
                                                       graph.num_vertices(),
                                                       graph.num_edges()))

graph_operations.save_and_draw_graph(graph,
                                     xml_out=graph_xml,
Example #22
def merge_bams(args, filtered_forward, filtered_reverse):
    previous = None
    save = pysam.set_verbosity(0)
    forward = pysam.AlignmentFile(filtered_forward, 'r', threads=args.threads)
    reverse = pysam.AlignmentFile(filtered_reverse, 'r', threads=args.threads)
    pysam.set_verbosity(save)
    new_header = OrderedDict(forward.header)
    if 'PG' in new_header:
        last_pg = new_header['PG'][-1]
        previous = last_pg['ID']
    command = 'bellerophon --forward %s --reverse %s --output %s --quality %s' % \
        (os.path.split(args.forward)[-1], os.path.split(args.reverse)[-1], os.path.split(args.output)[-1], args.quality)
    new_pg = dict(ID=__name__,
                  PN=__name__,
                  PP=None,
                  VN=__version__,
                  CL=command,
                  DS=__description__)
    if previous is not None:
        new_pg['PP'] = previous
        new_pg = new_header['PG'] + [OrderedDict(new_pg)]
    else:
        # No previous @PG record; 'PG' may be absent from the header entirely.
        new_pg = [
            OrderedDict(ID=__name__,
                        PN=__name__,
                        VN=__version__,
                        CL=command,
                        DS=__description__)
        ]
    new_header['PG'] = new_pg
    output_fh = pysam.AlignmentFile(
        args.output, 'wb', header=pysam.AlignmentHeader.from_dict(new_header))
    processed_reads = 0
    mismatched_reads = 0
    unmapped_reads = 0
    low_quality_reads = 0
    starttime = time.time()
    for forward_read, reverse_read in zip(forward, reverse):
        proper_pairs = 0
        # Skip pairs whose names don't match, that are unmapped, or whose mapping quality is below --quality.
        if forward_read.query_name != reverse_read.query_name:
            mismatched_reads += 1
            continue
        if forward_read.is_unmapped or reverse_read.is_unmapped:
            unmapped_reads += 1
            continue
        if forward_read.mapping_quality < args.quality or reverse_read.mapping_quality < args.quality:
            low_quality_reads += 1
            continue
        if not (forward_read.is_unmapped or reverse_read.is_unmapped):
            proper_pairs = 1
            # Get the proper distances and lengths, since they may be off now.
            if forward_read.reference_id == reverse_read.reference_id:
                distance = abs(forward_read.reference_start -
                               reverse_read.reference_start)
                if forward_read.reference_start >= reverse_read.reference_start:
                    forward_length = -1 * distance
                    reverse_length = distance
                else:
                    forward_length = distance
                    reverse_length = -1 * distance
            else:
                forward_length = 0
                reverse_length = 0

        else:
            proper_pairs = 0
            forward_length = 0
            reverse_length = 0
        # Zero the right flags for the forward and reverse reads.
        forward_read.is_secondary = 0
        reverse_read.is_secondary = 0
        forward_read.is_unmapped = 0
        reverse_read.is_unmapped = 0
        forward_read.is_supplementary = 0
        reverse_read.is_supplementary = 0
        # Make sure each one has the right flag for read number.
        forward_read.is_read1 = 1
        reverse_read.is_read2 = 1
        reverse_read.is_read1 = 0
        forward_read.is_read2 = 0
        # Swap the mapped and reverse attributes between reads.
        reverse_is_unmapped = reverse_read.is_unmapped
        forward_is_unmapped = forward_read.is_unmapped
        reverse_is_reverse = reverse_read.is_reverse
        forward_is_reverse = forward_read.is_reverse
        reverse_read.is_unmapped = forward_is_unmapped
        forward_read.is_unmapped = reverse_is_unmapped
        forward_read.mate_is_unmapped = forward_is_unmapped
        reverse_read.mate_is_unmapped = reverse_is_unmapped
        forward_read.mate_is_reverse = reverse_is_reverse
        reverse_read.mate_is_reverse = forward_is_reverse
        # Set them to paired and properly paired.
        forward_read.is_proper_pair = proper_pairs
        reverse_read.is_proper_pair = proper_pairs
        forward_read.is_paired = 1
        reverse_read.is_paired = 1
        # Set the next reference for the reads to each other.
        reverse_read.next_reference_id = forward_read.reference_id
        forward_read.next_reference_id = reverse_read.reference_id
        reverse_read.next_reference_start = forward_read.reference_start
        forward_read.next_reference_start = reverse_read.reference_start
        # And update the length that we calculated above.
        forward_read.template_length = forward_length
        reverse_read.template_length = reverse_length
        output_fh.write(forward_read)
        output_fh.write(reverse_read)
        processed_reads += 1
    log.info('Successfully merged %d read pairs in %f seconds.' %
             (processed_reads, time.time() - starttime))
    log.debug(
        'Skipped %d pairs with mismatched read names, %d unmapped reads, and %d with a mapping quality below %d.'
        % (mismatched_reads, unmapped_reads, low_quality_reads, args.quality))
    for filename in [filtered_forward, filtered_reverse]:
        os.unlink(filename)
    return 0
Example #23
def filter_reads(args):
    log.setLevel(args.log_level)
    retval = []
    save = pysam.set_verbosity(0)
    ffh = pysam.AlignmentFile(args.forward, 'r', threads=args.threads)
    rfh = pysam.AlignmentFile(args.reverse, 'r', threads=args.threads)
    pysam.set_verbosity(save)
    if ffh.header.references != rfh.header.references or ffh.header.lengths != rfh.header.lengths:
        log.error(
            'The input files do not have the same sequence names or lengths.')
        return 1
    for handle in [ffh, rfh]:
        filename = os.path.split(
            os.path.abspath(handle.filename.decode('utf-8')))[-1]
        log.info('Loading reads from %s...' % filename)
        processed_reads = 0
        written_reads = 0
        previous_read = None
        all_reads = []
        unmapped_reads = []
        five_reads = []
        three_reads = []
        mid_reads = []
        counter = 0
        come_in_here = re.compile(r'^[0-9]*M')
        dear_boy = re.compile(r'.*M$')
        have_a_cigar = re.compile(
            r'^[0-9]*[HS].*M.*[HS]$')  # You're gonna go far, you're gonna fly
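        # come_in_here matches CIGARs that begin with an aligned block
        # (e.g. '60M40S'); dear_boy matches CIGARs that end with one
        # (e.g. '40S60M'); have_a_cigar matches clip-M-clip patterns
        # (e.g. '20S60M20S'), i.e. mapped bases only in the middle.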
        output_tempfile = tempfile.NamedTemporaryFile(prefix='filtered_',
                                                      suffix='.bam',
                                                      delete=False,
                                                      dir=os.getcwd())
        retval.append(output_tempfile.name)
        output_tempfile.close()
        output_fh = pysam.AlignmentFile(output_tempfile.name,
                                        'wb',
                                        header=handle.header)
        starttime = time.time()
        for read in handle:
            processed_reads += 1
            # If this is 1. Not the first read, and 2. Not the previous read again:
            if previous_read is not None and read.query_name != previous_read:
                # If the finished batch held one or two reads and exactly one
                # read is on the 5´ side of a ligation junction.
                if counter in [1, 2] and len(five_reads) == 1:
                    # Serve it forth.
                    output_fh.write(five_reads[0])
                    written_reads += 1
                else:
                    # Get the most recent read, set the unmapped flag, and send it
                    # to the output file.
                    new_read = all_reads[0]
                    new_read.is_unmapped = 1
                    output_fh.write(new_read)
                    written_reads += 1
                # Reset these variables to their original values.
                counter = 0
                all_reads = []
                unmapped_reads = []
                five_reads = []
                three_reads = []
                mid_reads = []
            counter += 1
            all_reads.append(read)
            previous_read = read.query_name
            # Determine whether read is unmapped, or has mapped reads spanning a junction
            if read.is_unmapped:
                unmapped_reads.append(read)
            # If the read is aligned - and has mapped reads at the end, or it is
            # aligned + and has mapped reads at the beginning, it goes in the 5´
            # bin and is retained.
            elif (read.is_reverse and dear_boy.match(read.cigarstring)
                  is not None) or (not read.is_reverse and come_in_here.match(
                      read.cigarstring) is not None):
                five_reads.append(read)
            # If the read is aligned + and has mapped reads at the end, or it is
            # aligned - and has mapped reads at the beginning, it goes in the 3´
            # bin and is discarded.
            elif (read.is_reverse and come_in_here.match(read.cigarstring)
                  is not None) or (not read.is_reverse and dear_boy.match(
                      read.cigarstring) is not None):
                three_reads.append(read)
            # If it has mapped reads in the middle, put it in that list.
            elif have_a_cigar.match(read.cigarstring):
                mid_reads.append(read)
        # If we have a read.
        if counter == 1:
            # And it is on the 5´ side of a ligation junction
            if len(five_reads) == 1:
                # We send it to the output
                output_fh.write(five_reads[0])
                written_reads += 1
            else:
                # Otherwise we flag it unmapped and push it out.
                new_read = all_reads[0]
                new_read.is_unmapped = 1
                output_fh.write(new_read)
                written_reads += 1
        # Or if we have two reads and one of them is on the 5´ side of a junction.
        elif counter == 2 and len(five_reads) == 1:
            # We do.
            output_fh.write(five_reads[0])
            written_reads += 1
        else:
            # The same kind of thing.
            new_read = all_reads[0]
            new_read.is_unmapped = 1
            output_fh.write(new_read)
            written_reads += 1
        log.debug('Processed %d reads in %f seconds and output %d.' %
                  (processed_reads, time.time() - starttime, written_reads))
    # Send the filenames of the filtered alignments back to the caller.
    return retval
Example #24
    def getJunctionsFromBam(self,sample):
        """
        """
        min_length = self.args.min_length
        max_length = self.args.max_length
        min_reads = self.args.min_reads
        fasta = self.args.genome
        
        
        samplename, filename, metadata, condition, bedfilename = sample
        
        # Some BAM files lack @SQ header lines; reopen with check_sq=False in
        # that case, keeping htslib quiet while we probe the file.
        old_verbosity = pysam.set_verbosity(0)
        try:
            genome = pysam.AlignmentFile(filename)
        except ValueError:
            print("Using: pysam.AlignmentFile(filename, check_sq=False) with", filename)
            genome = pysam.AlignmentFile(filename, check_sq=False)
        pysam.set_verbosity(old_verbosity)
        counts = {}
        leftDiversity = {}
        rightDiversity = {}
        overhangs = {}
        for read in genome.fetch(until_eof=True):
            # The read2-based strand flip is disabled here ("if True" in the
            # original); every read is stranded by its own orientation.
            strand = "-" if read.is_reverse else "+"

            blocks = read.get_blocks()
            try:
                read_start = blocks[0][0]
            except IndexError:
                continue
            read_end = blocks[-1][1]
            for i in range(len(blocks)-1):
                junction = (read.reference_name,blocks[i][1],blocks[i+1][0],strand)
                length = junction[2] - junction[1]
                if length >= min_length and length <= max_length:
                    leftOH = blocks[i][1]-blocks[i][0]
                    rightOH = blocks[i+1][1]-blocks[i+1][0]
                    overhang = min(leftOH,rightOH)
                    try:
                        counts[junction] += 1
                        overhangs[junction] = max(overhang,overhangs[junction])
                        try:
                            leftDiversity[junction][read_start] += 1
                            rightDiversity[junction][read_end] += 1
                        except KeyError:
                            leftDiversity[junction][read_start] = 1
                            rightDiversity[junction][read_end] = 1
                    except KeyError:
                        counts[junction] = 1
                        overhangs[junction] = overhang
                        leftDiversity[junction] = {read_start:1}
                        rightDiversity[junction] = {read_end:1}

        filteredJunctions = []
        leftEntropy = {}
        rightEntropy = {}

        # Reuse the genome variable for the reference FASTA; without one,
        # motif lookups fall back to "NN".
        leftMotif = {}
        rightMotif = {}
        if fasta:
            genome.close()
            genome = pysam.FastaFile(fasta)
        else:
            genome = None
        for junction in sorted(counts):
            chromosome,left,right,strand = junction
            
            if genome:
                if (chromosome,left) not in leftMotif:
                    try:
                        leftMotif[(chromosome,left)] = genome.fetch(chromosome,left,left+2)
                    except KeyError:
                        leftMotif[(chromosome,left)] = "NN"
                if (chromosome,right) not in rightMotif:
                    try:
                        rightMotif[(chromosome,right)] = genome.fetch(chromosome,right-2,right)
                    except KeyError:
                        rightMotif[(chromosome,right)] = "NN"
            leftEntropy[junction] = 0
            total = sum(leftDiversity[junction].values())
            for species,count in leftDiversity[junction].items():
                prop = count/total
                leftEntropy[junction] -= (prop) * np.log(prop)
            rightEntropy[junction] = 0
            total = sum(rightDiversity[junction].values())
            for species,count in rightDiversity[junction].items():
                prop = count/total
                rightEntropy[junction] -= (prop) * np.log(prop)

            filteredJunctions.append(junction)

            
        if self.args.strands in ("inferCombine", "inferOnly"):
            
            opposite = {"+":"-", "-":"+"}
            plus_motifs = {"GT_AG","GC_AG","AT_AC"}
            minus_motifs = {"CT_AC","CT_GC","GT_AT"}
            
            firstFiltered = filteredJunctions
            filteredJunctions = []
                        
            for junction in firstFiltered:
                
                chromosome,left,right,strand = junction
                motif = f"{leftMotif.get((chromosome,left),'NN')}_{rightMotif.get((chromosome,right),'NN')}"
                complement = (chromosome,left,right,opposite[strand])
                
                if complement not in counts:
                    filteredJunctions.append(junction)
                elif (junction in self.annotated or
                     (strand == "+" and motif in plus_motifs) or
                     (strand == "-" and motif in minus_motifs)):

                    filteredJunctions.append(junction)
                    
                    if self.args.strands == "inferCombine":
                        counts[junction] += counts[complement]
                        
                elif (complement in self.annotated or
                     (strand == "-" and motif in plus_motifs) or
                     (strand == "+" and motif in minus_motifs)):
                    pass
                else:
                    filteredJunctions.append(junction)
                        


        
        with open(bedfilename,"w") as bedOut:
            for junction in filteredJunctions:
                chromosome,left,right,strand = junction
                name = (f"e:{leftEntropy[junction]:0.02f}:{rightEntropy[junction]:0.02f};"
                        f"o:{overhangs[junction]};"
                        f"m:{leftMotif.get((chromosome,left),'NN')}_{rightMotif.get((chromosome,right),'NN')};"
                        f"a:{self.annotated.get(junction,'?')}")
                bedOut.write(f"{chromosome}\t{left}\t{right}\t{name}\t{counts[junction]}\t{strand}\n")

        return filename, bedfilename, len(filteredJunctions)
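
The e: field written into the BED name above stores the left and right Shannon entropies (natural log) of the read start and end positions supporting each junction: 0 means every supporting read starts at the same coordinate (typical of PCR duplicates), while higher values indicate support from many distinct fragments. A standalone sketch of the same computation, using a hypothetical position_entropy helper:

import numpy as np

def position_entropy(position_counts):
    """Shannon entropy in nats of a {position: read_count} mapping."""
    counts = np.array(list(position_counts.values()), dtype=float)
    props = counts / counts.sum()
    return float(-(props * np.log(props)).sum())

print(position_entropy({100: 3}))                  # 0.00 - one start position
print(position_entropy({100: 1, 101: 1, 102: 1}))  # 1.10 - ln(3), three positions
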
Example #25
0
import pysam
from collections import defaultdict

# suppress incorrect error warning - https://github.com/pysam-developers/pysam/issues/939
save = pysam.set_verbosity(0)
# load and iterate through the PathSeq BAM file
pathseq_bam = pysam.AlignmentFile(snakemake.input[0], mode="rb")
# set verbosity back to original setting
pysam.set_verbosity(save)

output = []
UMI_dict = defaultdict(list)
# seg is an AlignedSegment object
for seg in pathseq_bam.fetch(until_eof=True):
    # not all records have the CB and UB tags, so check before using them
    if seg.has_tag("CB") and seg.has_tag("UB"):
        if (seg.get_tag(tag="CB") == snakemake.wildcards["cell"]):
            UMI_dict[seg.get_tag(tag="UB")].append(seg)

barcode_bam = pysam.AlignmentFile(snakemake.output[0],
                                  mode="wb",
                                  template=pathseq_bam)
for UMI in UMI_dict:
    # keep one read per UMI - the read with the highest mapping quality
    UMI_reads = UMI_dict[UMI]
    UMI_read = UMI_reads[0]
    for read in UMI_reads:
        if read.mapping_quality > UMI_read.mapping_quality:
            UMI_read = read
    barcode_bam.write(UMI_read)

barcode_bam.close()
pathseq_bam.close()
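
This save/restore dance around pysam.set_verbosity recurs throughout these examples. A small context manager - a sketch, not part of pysam's API - keeps the suppression scoped and restores the old level even if opening the file raises:

import contextlib

import pysam

@contextlib.contextmanager
def quiet_pysam(level=0):
    """Temporarily lower htslib's verbosity, restoring the previous level on exit."""
    old_verbosity = pysam.set_verbosity(level)
    try:
        yield
    finally:
        pysam.set_verbosity(old_verbosity)

# Usage: the spurious index warning is suppressed only inside the block.
# "example.bam" is a placeholder path.
with quiet_pysam():
    bam = pysam.AlignmentFile("example.bam", "rb")
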
Example #26
0
import pysam


def check_raw_alignments(df, args, pon):

    # get soft-clip position and direction
    clips = []
    for chrA, posA, contA, chrB, posB, contB, idx, svlen, spanning in zip(
            df.chrA, df.posA, df.contigA, df.chrB, df.posB, df.contigB,
            df.index, df.svlen, df.spanning):
        if spanning:
            clips.append((chrA, posA, 3, idx, chrA == chrB, svlen))
            clips.append((chrB, posB, 3, idx, chrA == chrB, svlen))
        else:
            if contA:
                start_lower = contA[0].islower()
                end_lower = contA[-1].islower()
                if start_lower and not end_lower:
                    clip_side = 0
                elif not start_lower and end_lower:
                    clip_side = 1
                else:  # clipped on both sides (or neither): allow any side
                    clip_side = 3  # any side
                clips.append((chrA, posA, clip_side, idx, chrA == chrB, svlen))
            if contB:
                start_lower = contB[0].islower()
                end_lower = contB[-1].islower()
                if start_lower and not end_lower:
                    clip_side = 0
                elif not start_lower and end_lower:
                    clip_side = 1
                else:
                    clip_side = 3
                clips.append((chrB, posB, clip_side, idx, chrA == chrB, svlen))

    clips = sorted(clips, key=lambda x: (x[0], x[1]))

    opts = {"bam": "rb", "cram": "rc", "sam": "r", "-": "rb", "stdin": "rb"}
    pad = 20
    found = set()
    for pth, _ in pon:
        # open alignment file
        kind = pth.split(".")[-1]
        bam_mode = opts[kind]

        # Suppress htslib's index warning while opening, then restore the
        # previous verbosity rather than assuming a hardcoded level.
        old_verbosity = pysam.set_verbosity(0)
        infile = pysam.AlignmentFile(
            pth,
            bam_mode,
            threads=1,
            reference_filename=None if kind != "cram" else args["ref"])
        pysam.set_verbosity(old_verbosity)

        for chrom, pos, cs, index, intra, svlen in clips:

            if index in found:
                continue

            for a in infile.fetch(chrom, pos - pad if pos - pad > 0 else 0,
                                  pos + pad):
                if not a.cigartuples:
                    continue
                if a.cigartuples[0][0] == 4 and cs != 1:
                    current_pos = a.pos
                    if abs(current_pos - pos) < 8:
                        found.add(index)
                        break
                if a.cigartuples[-1][0] == 4 and cs != 0:
                    current_pos = a.reference_end
                    if abs(current_pos - pos) < 8:
                        found.add(index)
                        break

    df = df.drop(found)

    return df
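
The cigartuples tests above rely on pysam's CIGAR encoding: each tuple is (operation, length), and operation code 4 means a soft clip (the S operator). A leading soft clip therefore puts the clip boundary at the alignment's reference_start, while a trailing one puts it at reference_end, which is why the function compares a.pos and a.reference_end against the expected breakpoint. A minimal sketch of the same check; "example.bam" and the region are placeholders, and any indexed BAM would do:

import pysam

BAM_CSOFT_CLIP = 4  # CIGAR operation code for 'S' in cigartuples

with pysam.AlignmentFile("example.bam", "rb") as bam:
    for aln in bam.fetch("chr1", 1000, 2000):
        if not aln.cigartuples:
            continue
        if aln.cigartuples[0][0] == BAM_CSOFT_CLIP:
            print(f"{aln.query_name}: left clip boundary at {aln.reference_start}")
        if aln.cigartuples[-1][0] == BAM_CSOFT_CLIP:
            print(f"{aln.query_name}: right clip boundary at {aln.reference_end}")
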