Example #1
    def __init__(self, fname, referenceFastaFname=None):
        self.filename = fname = abspath(expanduser(fname))
        self.peer = AlignmentFile(fname, "rb", check_sq=False)
        self._checkFileCompatibility()

        self._loadReferenceInfo()
        self._loadReadGroupInfo()
        self._loadProgramInfo()

        self.referenceFasta = None
        if referenceFastaFname is not None:
            if self.isUnmapped:
                raise ValueError("Unmapped BAM file--reference FASTA should not be given as argument to BamReader")
            self._loadReferenceFasta(referenceFastaFname)
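
For context, a minimal usage sketch of the constructor above; `BamReader` is the class name suggested by the error message, and the paths are placeholders:

reader = BamReader("aligned_reads.bam", referenceFastaFname="reference.fasta")
print(reader.filename)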
Example #2
def make_chrom_info(bam):
    base = make_basename(bam)
    chrom_info_filename = base + '.chrom_info'
    with AlignmentFile(bam, 'rb') as alignment, open(chrom_info_filename,
                                                     'wt') as chrom_info:
        for row in alignment.header['SQ']:
            name = row['SN']
            length = row['LN']
            chrom_info.write(name)
            chrom_info.write('\t')
            chrom_info.write(str(length))
            chrom_info.write(os.linesep)

    return chrom_info_filename
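
A hedged usage sketch for make_chrom_info: `make_basename` is assumed to be a helper from the same module, and the BAM path is a placeholder. The output file is a two-column name/length table:

chrom_info_path = make_chrom_info("sample.bam")
with open(chrom_info_path) as f:
    for row in f:
        name, length = row.rstrip().split('\t')
        print(name, length)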
Example #3
def test_anno_1(tmpdir):
    "test --simple version"

    make_bam(
        tmpdir.strpath, """
             123456789_123456789_12
        r1 + ...........
        r1 -      ......*....
        r2 +   .........*.
        r2 -       .....*.......
        r3 +       ...........
        r3 -            ....*......
        r4 +       ...........
        r4 -            ...........
             123456789_123456789_12
    """)

    sam = AlignmentFile(tmpdir.join("test.bam").strpath)

    o = Namespace(query=tmpdir.join("test.vcf").strpath,
                  cfdna=sam,
                  gdna=None,
                  simple=True,
                  verbos=False,
                  fast=False,
                  qual=20,
                  output=tmpdir.join("test_MrBam.vcf").strpath)

    anno(o)

    for line in open(tmpdir.join("test_MrBam.vcf").strpath):
        if line.startswith('#'):
            continue

        fields = line.split('\t')

        if fields[1] == '12':
            a, b, c, d = fields[-1].split(':')[-1].strip().split(',')
            assert a == '0'
            assert b == '0'
            assert c == '1'
            assert d == '1'
        elif fields[1] == '16':
            a, b, c, d = fields[-1].split(':')[-1].strip().split(',')
            assert a == '0'
            assert b == '0'
            assert c == '0'
            assert d == '0'
        else:
            raise Exception("unexpected variant call")
Example #4
def atac(args, logger):
    """

    """
    if not args.chrom_sizes:
        exit("Chrom sizes required for bam conversion")

    chrom_mods = build_transform(args.mod, logger)
    input_ = AlignmentFile(args.input, 'rb')

    header = update_header(input_.header.as_dict(), args.chrom_sizes)
    output = AlignmentFile(args.output, 'wb', header=header)

    curr_chrom = ""
    for line in input_:

        if input_.references[line.reference_id] != curr_chrom:
            curr_chrom = input_.references[line.reference_id]
            positions, deltas = get_positions_and_deltas(
                chrom_mods, curr_chrom, logger)
        if not line.is_reverse:
            start_delta = find_delta(positions, deltas,
                                     int(line.reference_start))
            line.reference_start = int(line.reference_start) + start_delta
        else:
            end_delta = find_delta(positions, deltas, int(line.reference_end))
            mapped_end = int(line.reference_end) + end_delta
            line.reference_start = mapped_end - len(
                line.seq)  # line.reference_length
        output.write(line)
Example #5
def parse(bamfile, minqual):
    bamhandle = AlignmentFile(bamfile, 'rb')

    positions = Positions()

    for read in bamhandle:
        if isclip(read) is False or read.mapping_quality < minqual:
            continue
        clip = ClipRead(read)
        pos = positions.getposition(bamhandle.get_reference_name(read.reference_id), \
                                    clip.getclippos())
        pos.addclipread(clip)

    bamhandle.close()
    return positions
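
A minimal usage sketch for parse, assuming `Positions`, `ClipRead`, and `isclip` are available from the surrounding module; the path and quality cutoff are placeholders:

positions = parse("sample.bam", minqual=20)  # clip positions indexed by reference name and coordinate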
Example #6
def run_project_alignments(args):
    """ Project mapped sam file"""

    sam = args.sam
    chromosomes = args.chromosomes.split(",")
    graph_dir = args.data_dir

    linear_ref_paths = {}
    haplotype_paths = {}

    out_sam = AlignmentFile(args.out_sam, "w", template=AlignmentFile(sam))

    logging.info("Reading linear paths")
    for chromosome in tqdm(chromosomes):
        linear_ref_paths[chromosome] = NumpyIndexedInterval.from_file(
            graph_dir + chromosome + "_linear_pathv2.interval")
        haplotype_paths[chromosome] = NumpyIndexedInterval.from_file(
            args.linear_paths_base_name + "_" + chromosome +
            ".intervalcollection.indexed")

    logging.info("Converting")
    n_unmapped = 0
    for sam_record in tqdm(read_sam(sam), total=number_of_lines_in_file(sam)):
        chromosome = sam_record.chromosome
        if chromosome is None:
            out_sam.write(sam_record.pysam_object)
            n_unmapped += 1
            continue
        length = len(sam_record.sequence)
        projected_start = convert_position_on_haplotype_to_position_on_linear_ref(
            linear_ref_paths[chromosome], haplotype_paths[chromosome],
            sam_record.start)
        sam_record.set_start(projected_start)
        out_sam.write(sam_record.pysam_object)

    logging.info("%d sam records missed chromosome (unmapped)" % n_unmapped)
Example #7
def test_pad_softclip_1(tmpdir):
    "it should memorize the result"

    make_bam(tmpdir.strpath, """
        r1 + __.*.......
        r1 -   .*.......__
    """)

    o = Namespace(verbos=False, mismatch_limit=-1)
    sam = AlignmentFile(tmpdir.join("test.bam").strpath)

    a = pad_softclip(sam)
    b = pad_softclip(sam)

    assert a is b
Example #8
def _recalibrate_reads(bam_path, reference_path, contig, start, end,
                       covariate_kwargs, **kwargs):
    # Recalibrate the reads in bam_path

    global joined_prob  # Global to share over multiprocessing
    # joined_prob contains P(error | d), where d is a descriptor generated by get_covariate_key

    o_path = f'out_{uuid4()}.bam'

    # Open source bam file:
    with AlignmentFile(bam_path) as alignments, FastaFile(
            reference_path) as fa:
        # @todo: extract only selected region from fasta file:
        reference = CachedFasta(fa)
        # Open target bam file:
        with AlignmentFile(o_path, header=alignments.header, mode='wb') as out:
            # Iterate all reads in the source bam file:
            for read in alignments.fetch(contig, start, end):
                recalibrate_base_calls(read, reference, joined_prob,
                                       covariate_kwargs)
                out.write(read)

    pysam.index(o_path)
    return o_path
Example #9
 def passes(self):
     from collections import Counter
     if self._passes is None:
         # for BAM of 1M reads, takes 10-15 seconds
         from pysam import AlignmentFile
         ccs = AlignmentFile(self.filename, check_sq=False)
         qnames = [a.qname for a in ccs]
         names = [int(qname.split("/")[1]) for qname in qnames]
         self.qnames = qnames
         lengths = []
         for qname in qnames:
             a, b = qname.split("/")[2].split("_")
             lengths.append(int(b) - int(a))
         self._lengths = lengths
         self._passes = list(Counter(names).values())
     return self._passes
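
The name parsing above assumes PacBio-style query names of the form movie/zmw/start_end; a small illustration with a made-up name:

qname = "m54006_160504_020705/4391/919_2731"
zmw = int(qname.split("/")[1])                      # 4391, groups subreads per molecule
start, end = (int(x) for x in qname.split("/")[2].split("_"))
print(end - start)                                  # 1812, the per-subread length collected above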
Example #10
def getAllFragmentSizes(bamfile, lower, upper, atac=1):
    sizes = np.zeros(upper - lower, dtype=float)
    # loop over samfile
    bamHandle = AlignmentFile(bamfile)
    for read in bamHandle:
        if read.is_proper_pair and not read.is_reverse:
            if atac:

                #get insert size
                #correct by 8 base pairs to be insertion to insertion
                ilen = abs(read.template_length) - 8
            else:
                ilen = abs(read.template_length)
            if ilen < upper and ilen >= lower:
                sizes[ilen - lower] += 1
    bamHandle.close()
    return sizes
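
A usage sketch; the path is a placeholder, and index i of the returned array holds the count of fragments of length lower + i:

sizes = getAllFragmentSizes("sample.bam", lower=50, upper=500, atac=1)
modal_length = 50 + int(sizes.argmax())  # fragment length with the highest count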
Example #11
def run_tagging_tasks(args: tuple):
    """ Run tagging for one or more tasks

    Args:
        args (tuple): (alignments_path, temp_dir, timeout_time), arglist

    """

    (alignments_path, temp_dir, timeout_time), arglist = args

    target_file = f"{temp_dir}/{uuid4()}.bam"

    timeout_tasks = []
    total_molecules = 0
    read_groups = dict()

    with AlignmentFile(alignments_path) as alignments:
        with sorted_bam_file(target_file, origin_bam=alignments, mode='wb', fast_compression=False,
                             read_groups=read_groups) as output:
            for task in arglist:
                try:
                    statistics = run_tagging_task(alignments, output, read_groups=read_groups, timeout_time=timeout_time, **task)
                    total_molecules += statistics.get('total_molecules_written', 0)
                except TimeoutError:
                    timeout_tasks.append(task)


    meta = {
        'timeout_tasks' : timeout_tasks,
        'total_molecules' : total_molecules,
    }

    if total_molecules > 0:
        return target_file, meta

    # No molecules were written: clean up the empty output file and its index
    try:
        remove(target_file)
        remove(f'{target_file}.bai')
    except Exception as e:
        print(f'Cleaning up failed for {target_file}')
        print(e)

    return None, meta
Example #12
def bamToBed(alignmentFile, outputBedFile):
    with AlignmentFile(alignmentFile) as inputFhd, \
        smart_out_open(outputBedFile, "w") as outputFhd:
        regions, name, strand = None, None, None
        for alignmentSegment in inputFhd:
            if alignmentSegment.is_unmapped:  # unmapped reads
                continue
            if alignmentSegment.qname != name:  # reads from a new fragment
                if name:
                    mergeAndOutpuBed(regions, outputFhd, name, strand)
                regions = [(alignmentSegment.reference_name, x[0], x[1])
                           for x in alignmentSegment.get_blocks()]
                name = alignmentSegment.qname
                # the fragment is on "-" when read1 is reverse or read2 is forward
                strand = "-" if alignmentSegment.is_reverse == alignmentSegment.is_read1 else "+"
            else:  # reads from the same fragment
                regions.extend([(alignmentSegment.reference_name, x[0], x[1])
                                for x in alignmentSegment.get_blocks()])
        mergeAndOutpuBed(regions, outputFhd, name, strand)
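
A usage sketch; `smart_out_open` and `mergeAndOutpuBed` are assumed to be helpers from the surrounding module. Because the qname-change test above flushes a fragment when the read name changes, the input BAM should be grouped by read name (e.g. name-sorted):

bamToBed("sample_name_sorted.bam", "sample.bed")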
Example #13
def fetch_count_read(alignment_file, seq_name, start, end):
    """
    Count the number of reads at least partly overlapping a specified chromosomal region
    @param alignment_file Path to a SAM or BAM file
    @param seq_name Name of the sequence the reads are aligned on
    @param start Start genomic coordinate of the area of alignment
    @param end End genomic coordinate of the area of alignment
    """
    # Specific imports
    from pysam import AlignmentFile

    al = AlignmentFile(alignment_file, "rb")

    # Count reads aligned at least partly on the specified region
    n = 0
    for i in al.fetch(seq_name, start, end):
        n += 1
    al.close()
    return n
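
A usage sketch; fetch() on a coordinate-sorted BAM requires an index (.bai) next to the file, and the path and coordinates here are placeholders:

n = fetch_count_read("sample.bam", "chr1", 100000, 101000)
print("{} reads overlap chr1:100000-101000".format(n))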
Example #14
def get_barcode_frequency_genomewide(bamfile, storage):
    """ This function obtains the barcode frequency
    and stores it in a table.

    Parameters
    ----------
    bamfile : str
        Path to a bamfile. The bamfile must be indexed.
    storage : str
        Path to the output tab-separated file, which contains the counts per barcode.
    """

    # Obtain the header information
    afile = AlignmentFile(bamfile, 'rb')

    if 'RG' in afile.header:
        use_group = True
    else:
        use_group = False

    barcodes = {}
    if use_group:
        # extract barcodes
        for idx, item in enumerate(afile.header['RG']):
            barcodes[item['ID']] = 0
    else:
        barcodes['dummy'] = 0
    print('found {} barcodes'.format(len(barcodes)))

    for aln in afile.fetch(until_eof=True):
        if aln.is_proper_pair and aln.is_read1:
            barcodes[aln.get_tag('RG') if use_group else 'dummy'] += 1

        if not aln.is_paired:
            barcodes[aln.get_tag('RG') if use_group else 'dummy'] += 1

    afile.close()

    names = [key for key in barcodes]
    counts = [barcodes[key] for key in barcodes]

    df = pd.DataFrame({'barcodes': names, 'counts': counts})

    df.to_csv(storage, sep='\t', header=True, index=False)
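
A usage sketch, assuming `pandas` is imported as `pd` as the function body requires; paths are placeholders:

get_barcode_frequency_genomewide("sample.bam", "barcode_counts.tsv")
counts = pd.read_csv("barcode_counts.tsv", sep='\t')
print(counts.sort_values('counts', ascending=False).head())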
Example #15
def main(bam,
         index,
         flags,
         flag_filter,
         min_quality,
         target,
         file=stdout,
         **kwargs):
    """Interpret arguments and dispatch data to subroutines"""
    if target == "cigar":
        chopper, integer_target = cigar_chopper, None
    else:
        chopper, integer_target = relative_chopper, interpret_flags(target)
    ecx = load_index(index)
    with AlignmentFile(bam) as alignment:
        print(str(alignment.header).rstrip("\n"), file=file)
        n_skipped = 0
        bam_iterator = progressbar(
            filter_bam(alignment, [flags, flag_filter, min_quality]),
            desc="Chopping",
            unit="read",
        )
        with errstate(invalid="ignore"):
            for entry in bam_iterator:
                if entry.query_sequence:
                    chopped_entry, error = chopper(
                        entry,
                        ecx,
                        integer_target,
                    )
                    if chopped_entry.query_sequence:
                        print(chopped_entry.to_string(), file=file)
                    else:
                        n_skipped += 1
    if n_skipped:
        msg_mask = "Skipped {} reads to be safe (unsure where to chop)"
        print(msg_mask.format(n_skipped), file=stderr)
    warning = [
        "WARNING: Read mapping positions were adjusted and retained;",
        "         this is needed to comply with the SAM spec.",
        "         Do not use these positions for analyses outside of edgeCase!",
    ]
    print("\n".join(warning), file=stderr)
    return 0
Example #16
def test_pad_softclip_3(tmpdir):
    "it should pad softclipped bases"

    make_bam(
        tmpdir.strpath, """
             123456789_123
        r1 + __.*.......
        r1 -   .*.........
        r2 - ...*.......
        r2 +   .*.......__
    """)

    o = Namespace(verbos=False, mismatch_limit=-1)
    sam = AlignmentFile(tmpdir.join("test.bam").strpath)

    adjusted_pos = pad_softclip(sam)

    assert adjusted_pos["r1"] == (0, 13)  # 0-based position
    assert adjusted_pos["r2"] == (0, 13)
Example #17
def test_pad_softclip_2(tmpdir):
    "it should ignore more than two reads which share the same name"

    make_bam(
        tmpdir.strpath, """
        r1 + __.*.......
        r1 -   .*.......__
        r1 -   .*.......__
        r2 +   .*.......__
        r2 -   .*.......__
    """)

    o = Namespace(verbos=False, mismatch_limit=-1)
    sam = AlignmentFile(tmpdir.join("test.bam").strpath)

    adjusted_pos = pad_softclip(sam)

    assert sum(1 for startpos, length in adjusted_pos.values()
               if startpos != -1) == 1
Example #18
def gather_sv_data(options, collection):
    # Read regions of interest BED file
    regions = BedTool(options.region_file)

    # Read BAM file
    bamfile = AlignmentFile(options.bam_file, "rb")

    # Intersect regions
    for reg in regions:
        for read in bamfile.fetch(reg.chrom, reg.start, reg.end):
            #print read
            if read.query_name.endswith("2d"):
                collection[read.query_name] = []
            if read.query_name.startswith("ctg"):
                collection[read.query_name] = []
                #print read.reference_id, read.reference_start, read.reference_end
                #print read.query_name, read.query_alignment_start, read.query_alignment_end

    bamfile.close()
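
A usage sketch; `options` only needs region_file and bam_file attributes, so an argparse.Namespace stands in here. Paths are placeholders and the BAM must be indexed for fetch():

from argparse import Namespace

collection = {}
gather_sv_data(Namespace(region_file="regions.bed", bam_file="sample.bam"), collection)
print(len(collection), "read/contig names collected")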
Example #19
def create_table(ctx, input_bam, output_table, alignment_haplotypes):
    """Convert a BAM file to a tabular format sorted by read for downstream analysis"""
    from pysam import AlignmentFile

    from pore_c import model

    tmp_table = output_table + ".tmp"
    logger.debug(f"Writing temporary unsorted data to {tmp_table}")
    af = AlignmentFile(input_bam)
    chrom_order = list(af.references)
    assert "NULL" not in chrom_order
    chrom_order.append("NULL")
    logger.debug(f"Chromosome order {chrom_order}")

    align_df = model.AlignmentRecord.to_dataframe(
        [model.AlignmentRecord.from_aligned_segment(a) for a in af],
        chrom_order=chrom_order)
    align_df = align_df.sort_values(["read_name"])
    num_aligns, num_reads = len(align_df), align_df.read_idx.nunique()
    logger.debug(
        f"Writing {num_aligns} alignments for {num_reads} reads to {output_table}"
    )

    if alignment_haplotypes:
        ht_df = pd.read_csv(alignment_haplotypes, sep="\t")
        align_df = model.AlignmentRecord.update_dataframe_with_haplotypes(
            align_df, ht_df)

    align_df.to_parquet(output_table,
                        engine=PQ_ENGINE,
                        index=False,
                        version=PQ_VERSION)
    g = align_df.groupby(["align_type"])
    summary = pd.concat({
        "num_reads": g["read_idx"].nunique(),
        "num_aligns": g.size()
    }).unstack(level=0)
    logger.info(f"Mapping summary:\n {summary}")

    haplotype_counts = (align_df.haplotype.value_counts().rename_axis(
        "haplotype").to_frame().rename(columns={"haplotype": "num_aligns"}))
    logger.info(f"Haplotype counts:\n {haplotype_counts}")
Example #20
def constructDistributions(bamName, lengths):
	'''
	Given a BAM file, constructs a coverage distribution for each long read
	Inputs
	- (str) bamName: BAM file name
	- (dict[(str) refName] = (int) read length) lengths: 
          returns the length of the long read given its read name
	Outputs
	- ( dict[(str) refName] = (numpy.array of ints) distribution ) dists: contains the coverage distributions 
          for each long read
	'''
	samfile = AlignmentFile(bamName, 'r')
	alignments = samfile.fetch()
	dists = {}
	for alignment in alignments:
		refName = alignment.reference_name
		start = int(alignment.reference_start)
		cigarTups = alignment.cigartuples
		updateDistribution(dists, lengths, refName, start, cigarTups)
	return dists
Example #21
def check_sam_header(input_file):
    ''' tries to parse the header of the sam or bam file used as input '''
    
    m = get_mode_string(input_file, write=False)
    try:
        with AlignmentFile(input_file, m) as af:
            # no tag for the program that generated the output
            if 'PG' not in af.header:
                return False, None
            # only one program tag -> processed only by bwa
            if len(af.header['PG']) != 1:
                return False, None
            if 'CL' not in af.header['PG'][0]:
                return False, None
            # check program call
            return 'bwa sampe' in af.header['PG'][0]['CL'], None
    
    # catch any errors caused by pysam being unable to read the file
    except Exception as e:
        return False, str(e)
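
A usage sketch; `get_mode_string` is assumed to be a helper from the same module. The function returns a (bool, error-message-or-None) pair:

is_bwa_sampe, err = check_sam_header("sample.bam")
if err:
    print("could not parse header:", err)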
Example #22
def extract_barcode(sam, barcode_file, outdir):

    # Create the hash set for cell names
    fin = open(barcode_file, 'r')
    barcodes_filtered = set()
    for line in fin:
        line = line.strip()
        barcodes_filtered.add(line)

    print(len(barcodes_filtered))

    sam_file = AlignmentFile(sam, mode='r')
    #filter_file = AlignmentFile("-", mode='wh', template=sam_file)
    track = sam_file.fetch(until_eof=True)
    for i, aln in enumerate(track):
        # if aln.is_unmapped:
        # continue
        # print(i)
        # Error to use query_alignment_sequence; use query_sequence instead
        reads_name, reads, cell_barcode, umi, quality = aln.qname, aln.query_sequence, aln.get_tag(
            'XC'), aln.get_tag('XM'), aln.qual
        # print(reads_name, reads, cell_barcode, umi, quality)
        # print(reads)
        # print(quality)

        if cell_barcode in barcodes_filtered:
            # print(reads_name, reads, cell_barcode, umi, quality)
            if len(reads) != len(aln.qual):
                print("Error, skipped:", reads, quality)
                continue

            with open(outdir + '/' + cell_barcode + '.umi', 'a+') as fout_umi:
                fout_umi.write(umi + '\n')

            with open(outdir + '/' + cell_barcode + '.fastq', 'a+') as fout_fq:
                fout_fq.write('@' + reads_name + '\n')
                fout_fq.write(reads + '\n')
                fout_fq.write('+\n')
                fout_fq.write(quality + '\n')
        if i % 100000 == 0:
            print(i / 209400000.0)
Example #23
File: io.py Project: dkurt/bonito
 def __init__(
         self, mode, iterator, aligner, fd=sys.stdout, min_coverage=0.90,
         min_accuracy=0.99, ref_fn=None, groups=None
 ):
     super().__init__()
     self.fd = fd
     self.log = []
     self.mode = mode
     self.aligner = aligner
     self.iterator = iterator
     self.min_coverage = min_coverage
     self.min_accuracy = min_accuracy
     self.output = AlignmentFile(
         fd, 'w' if self.mode == 'wfq' else self.mode, add_sam_header=self.mode != 'wfq',
         reference_filename=ref_fn,
         header=AlignmentHeader.from_references(
             reference_names=aligner.seq_names,
             reference_lengths=[len(aligner.seq(name)) for name in aligner.seq_names],
             text=sam_header(groups),
         )
     )
Example #24
 def __init__(self, mode, iterator, aligner, fd=sys.stdout, duplex=False, ref_fn=None, groups=None, group_key=None):
     super().__init__()
     self.fd = fd
     self.log = []
     self.mode = mode
     self.duplex = duplex
     self.aligner = aligner
     self.iterator = iterator
     self.fastq = mode == 'wfq'
     self.group_key = group_key
     self.output = AlignmentFile(
         fd, 'w' if self.fastq else self.mode, add_sam_header=not self.fastq,
         reference_filename=ref_fn,
         header=AlignmentHeader.from_references(
             reference_names=aligner.seq_names if aligner else [],
             reference_lengths=[
                 len(aligner.seq(name)) for name in aligner.seq_names
             ] if aligner else [],
             text=sam_header(groups),
         )
     )
Example #25
def bamtag(sam):
    ''' Convert a BAM/SAM with fastqtransformed read names to have UMI and
    cellular barcode tags
    '''
    from pysam import AlignmentFile

    start_time = time.time()

    sam_file = open_bamfile(sam)
    out_file = AlignmentFile("-", "wh", template=sam_file)
    track = sam_file.fetch(until_eof=True)

    # peek at first alignment to determine the annotations
    queryalignment = next(track)
    annotations = detect_alignment_annotations(queryalignment)
    track = itertools.chain([queryalignment], track)

    re_string = construct_transformed_regex(annotations)
    parser_re = re.compile(re_string)

    for count, aln in enumerate(track, start=1):
        if count and not count % 100000:
            logger.info("Processed %d alignments." % count)

        match = parser_re.match(aln.qname)
        tags = aln.tags

        if "cellular" in annotations:
            aln.tags += [('XC', match.group('CB'))]
        if "molecular" in annotations:
            aln.tags += [('RX', match.group('MB'))]
        if "sample" in annotations:
            aln.tags += [('XS', match.group('SB'))]

        out_file.write(aln)

    total_time = time.time() - start_time
    logger.info('BAM tag conversion done - {:.3}s, {:,} alns/min'.format(
        total_time, int(60. * count / total_time)))
    logger.info("Processed %d alignments." % count)
Example #26
def get_genome_size_from_bam(file):
    """ Extract chromosome sizes from a bam-file.

    Parameters
    ----------
    file : str
       bam-file

    Returns
    -------
    dict
        Dict with keys and values corresponding to chromosome names and lengths, respectively.
    """
    afile = AlignmentFile(file, 'rb')

    # extract genome size

    genomesize = {}
    for chrom, length in zip(afile.references, afile.lengths):
        genomesize[chrom] = length
    afile.close()
    return genomesize
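
A usage sketch; the references/lengths pairing used above mirrors the SQ lines of the BAM header. The path is a placeholder:

genomesize = get_genome_size_from_bam("sample.bam")
for chrom, length in sorted(genomesize.items()):
    print("{}\t{}".format(chrom, length))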
Example #27
def getFragmentSizesFromChunkList(chunks, bamfile, lower, upper, atac=1):
    sizes = np.zeros(upper - lower, dtype=float)
    # loop over samfile
    bamHandle = AlignmentFile(bamfile)
    for chunk in chunks:
        for read in bamHandle.fetch(chunk.chrom, max(0, chunk.start - upper),
                                    chunk.end + upper):
            if read.is_proper_pair and not read.is_reverse:
                if atac:
                    #get left position
                    l_pos = read.pos + 4
                    #get insert size
                    #correct by 8 base pairs to be insertion to insertion
                    ilen = abs(read.template_length) - 8
                else:
                    l_pos = read.pos
                    ilen = abs(read.template_length)
                center = l_pos + (ilen - 1) // 2
                if ilen < upper and ilen >= lower and center >= chunk.start and center < chunk.end:
                    sizes[ilen - lower] += 1
    bamHandle.close()
    return sizes
Example #28
def count_mapped_bp(args, tempdir, genes):
    """ Count number of bp mapped to each gene across pangenomes.
    Return number covered genes and average gene depth per species.
    Result contains only covered species, but being a defaultdict,
    would yield 0 for any uncovered species, which is appropriate.
    """
    bam_path = f"{tempdir}/pangenomes.bam"
    bamfile = AlignmentFile(bam_path, "rb")
    covered_genes = {}

    # loop over alignments, sum values per gene
    for aln in bamfile.fetch(until_eof=True):
        gene_id = bamfile.get_reference_name(aln.reference_id)
        gene = genes[gene_id]
        gene["aligned_reads"] += 1
        if keep_read(aln, args.aln_mapid, args.aln_readq, args.aln_mapq, args.aln_cov):
            gene["mapped_reads"] += 1
            gene["depth"] += len(aln.query_alignment_sequence) / float(gene["length"])
            covered_genes[gene_id] = gene

    tsprint("Pangenome count_mapped_bp:  total aligned reads: %s" % sum(g["aligned_reads"] for g in genes.values()))
    tsprint("Pangenome count_mapped_bp:  total mapped reads: %s" % sum(g["mapped_reads"] for g in genes.values()))

    # Filter to genes with non-zero depth, then group by species
    nonzero_gene_depths = defaultdict(list)
    for g in covered_genes.values():
        gene_depth = g["depth"]
        if gene_depth > 0:  # This should always pass, because args.aln_cov is always >0.
            species_id = g["species_id"]
            nonzero_gene_depths[species_id].append(gene_depth)

    # Compute number of covered genes per species, and average gene depth.
    num_covered_genes = defaultdict(int)
    mean_coverage = defaultdict(float)
    for species_id, non_zero_depths in nonzero_gene_depths.items():
        num_covered_genes[species_id] = len(non_zero_depths)
        mean_coverage[species_id] = np.mean(non_zero_depths)

    return num_covered_genes, mean_coverage, covered_genes
Example #29
    def _extract_sites(self, sample):
        """
        Loop through all positions and get pileup information.
        """

        if not self.sites:
            return sample

        # get the pileup

        bam = AlignmentFile(sample.sample_bam)
        pileup = pd.DataFrame()

        for site in self.sites:

            pileup_site = self._pileup(bam, site)

            pileup_site = self._get_genotype_info(pileup_site,
                                                  site['ref_allele'],
                                                  site['alt_allele'])

            pileup = pd.concat([pileup, pileup_site], ignore_index=True)

        pileup = pileup[[
            'chrom', 'pos', 'ref', 'alt', 'reads_all', 'matches', 'mismatches',
            'A', 'C', 'T', 'G', 'N', 'minor_allele_freq', 'genotype_class',
            'genotype'
        ]]

        for col in [
                'pos', 'A', 'C', 'T', 'G', 'N', 'matches', 'mismatches',
                'reads_all'
        ]:
            pileup[col] = pileup[col].astype(int)

        sample.pileup = pileup

        return sample
Example #30
def fragmentlength_from_bam(bamfile, regions, mapq, maxlen):
    """ Compute fragment lengths per region from a bam-file.

    Parameters
    ----------
    bamfile : str
        bam-file
    regions : str, BedTool
        Bed-file or BedTool object containing the regions.
    mapq : int
        Minimum mapping quality.
    maxlen : int
        Maximum fragment length.

    Returns
    -------
    scipy.sparse.coo_matrix
        Sparse regions by maxlen matrix containing the fragment counts.
    """
    chroms = []
    starts = []
    ends = []

    afile = AlignmentFile(bamfile, "rb")
    for aln in afile.fetch():
        if aln.mapping_quality < mapq:
            continue
        if aln.is_proper_pair and aln.is_read1:
            start = min(aln.reference_start, aln.next_reference_start)
            end = abs(aln.tlen)
            chroms.append(aln.reference_name)
            starts.append(start)
            ends.append(end)
    df = pd.DataFrame({'chrom': chroms, 'start': starts, 'end': ends})
    fragments = BedTool.from_dataframe(df)

    return fragmentlength_from_bed(fragments, regions, maxlen)
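
A usage sketch; `fragmentlength_from_bed` is the module helper invoked above, and the inputs are placeholders. The result is a sparse regions-by-maxlen count matrix:

mat = fragmentlength_from_bam("sample.bam", "regions.bed", mapq=10, maxlen=1000)
print(mat.shape)  # (number of regions, maxlen)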