Пример #1
0
def preprocess_bam_to_bed(bam, output):
    """Convert a local BAM file into a filtered, de-duplicated BED file.

    Steps, in order:
      1. Convert the BAM alignments to BED intervals.
      2. Keep only rows for which ``is_valid_interval(start, end)`` is truthy.
      3. Overwrite ``end`` with ``define_interval(start)`` (per the original
         comments this yields a 101bp window -- TODO confirm against
         ``define_interval``) and blank the ``name`` column to ``"-"``.
      4. Drop duplicate rows.
      5. Keep only chr1..chr22, chrX and chrY.
      6. Write the result to ``output`` and clean up pybedtools temp files.

    Args:
        bam: Path to the local BAM file.
        output: Destination path of the resulting BED file.
    """
    # convert bam to bed
    vprint("Converting bam to bed...")
    # Keep the BedTool under its own name so ``bam`` stays the original path.
    bam_tool = BedTool(bam)
    bed = bam_tool.bam_to_bed()

    # filter intervals
    vprint("Filter reads by size...")
    bed_chunk_iter = bed.to_dataframe(chunksize=10000000)  # chunk large file
    chunks = []
    for chunk in bed_chunk_iter:
        keep = (
            chunk[["start", "end"]]
            # The progress bar description must be text, not a BedTool object.
            .swifter.progress_bar(enable=True, desc=str(bam))
            .apply(lambda row: is_valid_interval(row["start"], row["end"]), axis=1)
        )
        chunks.append(chunk[keep])

    bed_df = pd.concat(chunks)

    # 101bp interval for input
    vprint("Define 101bp intervals...")
    bed_df["end"] = (
        bed_df["start"].swifter.progress_bar(enable=True).apply(define_interval)
    )
    bed_df["name"] = "-"

    # remove duplicates
    vprint("Drop duplicate intervals...")
    bed_df.drop_duplicates(inplace=True)

    # TODO extraneous chromosomes?
    vprint("Remove extra chromosomes...")
    chromosomes = [f'chr{c}' for c in [*range(1, 23), 'X', 'Y']]
    bed_df = bed_df.loc[bed_df['chrom'].isin(chromosomes)]

    # Save result
    vprint(f"Saving {bed_df.shape[0]} intervals...")
    BedTool.from_dataframe(bed_df).moveto(output)

    # cleanup tmp files
    # NOTE(review): remove_all=True is documented to delete every pybedtools
    # temp file in the temp dir, not only this call's -- confirm this is safe
    # when several jobs share one temp directory.
    pybedtools.cleanup(remove_all=True)

    vprint("Done.")
Пример #2
0
# Convert FASTQ to dict: read name -> (sequence, quality).
# Records are read as groups of four lines (name, sequence, separator,
# quality); zip() stops at the shortest slice, so a trailing partial record
# is ignored, exactly as a record is only stored once its quality line is seen.
fq_dict = {}
for read_name, dna, qual in zip(fq[0::4], fq[1::4], fq[3::4]):
    # Record one read into dict
    fq_dict[read_name] = (dna, qual)
read_names = list(fq_dict)
# Convert miniBAM to BED
U1U11 = BedTool(bed_path)
bam = BedTool(bam_tmp)
bed = bam.bam_to_bed().intersect(U1U11, wa=True, wb=True).to_dataframe()
bed['SELF_NAME'] = '@' + bed['name']
# Split "<QNAME>/<1|2>" once and reuse both halves.
name_parts = [self_name.split('/') for self_name in bed['SELF_NAME']]
bed['QNAME'] = [parts[0] for parts in name_parts]  # QNAME as in BAM
bed['ORDER'] = [parts[1] for parts in name_parts]  # First or second in pair
# Add read names for mates (rows whose ORDER is neither '1' nor '2' stay NaN)
is_first = bed['ORDER'] == '1'
is_second = bed['ORDER'] == '2'
bed.loc[is_first, 'MATE_NAME'] = bed.loc[is_first, 'QNAME'] + '/2'
bed.loc[is_second, 'MATE_NAME'] = bed.loc[is_second, 'QNAME'] + '/1'
# Add some True/False indicators
bed['SELF'] = bed['SELF_NAME'].isin(read_names)  # is self in FASTQ?
bed['MATE'] = bed['MATE_NAME'].isin(read_names)  # is mate in FASTQ?
# Whether the pair can be found in the FASTQ
bed['PAIRED'] = np.logical_and(bed['SELF'], bed['MATE'])
# Remove reads that are not in FASTQ
Пример #3
0
# Input: exactly four positional arguments are required.
if len(sys.argv) != 5:
    sys.stderr.write(
        'Incorrect arguments. Usage: postprocess_realign.py realign_BAM original_BAM out_prefix U1_bed'
    )
    sys.exit(1)
realnBAM = sys.argv[1]  # bowtie2 re-aligned bam (rbam for short)
oriBAM = sys.argv[2]  # original miniBAM (obam)
out_prefix = sys.argv[3]
U1U11 = sys.argv[4]  # U1U11 w/ pseudo genes
####
# Remove non-optimal multiple alignments
####
rbam_BT = BedTool(realnBAM)  # rbam in BedTool
rbed_DF = rbam_BT.bam_to_bed(tag='AS').to_dataframe()
# Bowtie2 end-to-end returns negative scores so bedtools add 256 to the AS for negative values.
# Therefore, 0 score should actually be fixed to 256
rbed_DF.loc[rbed_DF.score == 0, 'score'] = 256
# Only keep multi-alignments with max AS per read.
# The string alias 'max' is used instead of the builtin ``max``: pandas
# deprecates passing builtins as aggregation functions, and the alias keeps
# the current (optimised groupby max) behaviour.
max_score = rbed_DF.pivot_table(index='name', values='score',
                                aggfunc='max').reset_index()
# Index by (name, score) so the (name, best score) pairs select the rows to keep.
rbed_DF = rbed_DF.set_index(['name', 'score']).sort_index()
keep = max_score.set_index(['name', 'score']).index.values.tolist()
rbed_DF = rbed_DF.loc[keep, ].reset_index()
# Restore the 6-column BED order.
rbed_DF = rbed_DF.loc[:, ['chrom', 'start', 'end', 'name', 'score', 'strand']]
# number of times that each read aligns
multimap_ct = rbed_DF.name.value_counts()
rbed_BT = BedTool.from_dataframe(rbed_DF)  # create filtered version of rbed
# Intersect with core U1U11
rbed_BT = rbed_BT.intersect(U1U11, wa=True, wb=True)
Пример #4
0
def determine_coverage(bedfile,bamfile,target_site_mutations):
    """Summarise, per chromosome, the state of every target site.

    Builds ``site_summary[chrom][name] = [start, state]`` where ``state`` is
    ``'no_coverage'`` for sites (or parts of sites) not overlapped by any read,
    ``'no_mutation'`` for covered sites, and is later replaced by a formatted
    description for sites listed in the target-site-mutation table.

    Args:
        bedfile: Path to the target-site BED file; each record must have
            exactly 5 columns (chrom, start, stop, name, strand) and site
            names must be unique per chromosome.
        bamfile: Path to the BAM file whose reads define coverage.
        target_site_mutations: Path to a tab-separated file with one header
            line followed by 7-column rows
            (sample, aa_sub, chrom, pos, codon_pos, ref, alt).

    Raises:
        AssertionError: on a repeated site name, or when a mutation row refers
            to a chromosome or site missing from the BED file.
    """
    #bams and bed through bedtools
    target_sites = BedTool(bedfile)
    bambam = BedTool(bamfile)
    bambed = bambam.bam_to_bed().merge()
    # NOTE(review): a site only partly covered by reads may be reported by
    # both intersect and subtract, which would trip the uniqueness assertion
    # below -- confirm whether whole-feature options are wanted here.
    covered_ts = target_sites.intersect(bambed)
    non_covered_ts = target_sites.subtract(bambed)

    site_summary = {}

    # Get states for sites associated with each chrom.
    for bed,state in zip([non_covered_ts,covered_ts],['no_coverage','no_mutation']):
        for line in bed:

            # break out parts of bed.
            # bed must have 5 columns.

            chrom, start,stop,name,strand = line
            chrom_sites = site_summary.setdefault(chrom, {})
            assert name not in chrom_sites,textwrap.dedent("""\
            
            {n} is a repeated site name on {c}.
            All site names must be unique.
            Please modify the name in your
            target site bed file. Sites should
            be named
    
            <gene><3 letter AA symbol><ref. AA position>
    
            Examples:
            alsGly121
            epspsLys201
            
    
            """.format(n=name,c=chrom))

            # store the start position and the coverage state of the site.
            chrom_sites[name]=[int(start),state]

    #read in scored target_site mutations.
    # The context manager closes the file as soon as it has been read.
    with open(target_site_mutations) as f:
        tsm = f.read().rstrip().split('\n')
    # Skip the header line; a header-only file yields no iterations.
    for line in tsm[1:]:
        sample, aa_sub, chrom,pos,codon_pos,ref,alt = line.split('\t')
        # Site name = substitution string minus its last 3 characters.
        key_name = aa_sub[:-3]
        assert chrom in site_summary,textwrap.dedent("""\
            
            {c} appears in target site mutations file:
            {tsm}
            but not in the supplied bed file:
            {b}
            
            """.format(c=chrom,tsm=target_site_mutations, b=bedfile))

        assert key_name in site_summary[chrom],textwrap.dedent("""\
            
            {c} appears to be referenced in target site mutations file:
            {tsm}
            but not in the supplied bed file:
            {b}
            
            It may be that amino acid symbol is incorrectly formatted. 
            this program requires a 3 letter symbol, e.g. Pro for proline.
            The error was produced by slicing the last 3 items off this
            string: {s}
            
            
            """.format(c=key_name,tsm=target_site_mutations, b=bedfile,s=aa_sub))
        # Replace the coverage state with a description of the mutation.
        site_summary[chrom][key_name][-1] =textwrap.dedent("""\
               {aa}
               DNA:{ref}\u2192{alt}
               codon_pos:{c}
               
               """.format(aa=aa_sub[-9:], ref=ref,alt=alt,c=codon_pos)
           )
        print(site_summary[chrom][key_name][-1])
    """    for chrom in site_summary:
Пример #5
0
""" Make read x gene matrix for weight calculation
"""
#!/usr/bin/env python
# coding: utf-8
import pysam, sys, os
import numpy as np
import pandas as pd
from pybedtools import BedTool

# Input: exactly three positional arguments are required.
if len(sys.argv) != 4:
    sys.stderr.write('Incorrect arguments. Usage: postprocess_realign.py realign_BAM out_prefix U1_bed')
    sys.exit(1)
bam = sys.argv[1]  # bowtie2 re-aligned & filtered bam (rbam for short)
out_prefix = sys.argv[2]
U1U11 = sys.argv[3]  # U1U11 w/ pseudo genes
rbam_BT = BedTool(bam)  # rbam in BedTool
rbed_BT = rbam_BT.bam_to_bed(ed=True)
rbed_DF = rbed_BT.to_dataframe()
# Intersect with U1U11
rbed_U1U11 = rbed_BT.intersect(U1U11, wa=True, wb=True).to_dataframe(
    names=('chrom', 'start', 'end', 'name', 'score', 'strand',
           'chrom2', 'start2', 'end2', 'gene', 'score2', 'strand2'))
# Make read x gene matrix
multimap_ct = rbed_DF.name.value_counts()  # number of times that each read aligns
ct_mat = rbed_U1U11.pivot_table(index='name', columns='gene', values='start', aggfunc=len)
# ``np.int`` was removed from NumPy (1.24); the builtin ``int`` is the
# documented replacement and gives the same dtype.
ct_mat = ct_mat.fillna(0).astype(int)
ct_mat['num_align'] = multimap_ct[ct_mat.index]
# Genes whose alignments count as U1/U11 hits.
U1U11_GENES = ['RNU1-1', 'RNU1-2', 'RNU1-27P', 'RNU1-28P',
               'RNU1-3', 'RNU1-4', 'RNVU1-18', 'RNU11']
# reindex() treats a gene with no aligned reads (absent column) as 0 instead
# of raising KeyError; the columns written to the CSV are unchanged.
ct_mat['U1U11_map'] = ct_mat.reindex(columns=U1U11_GENES, fill_value=0).sum(axis=1)
# Flag reads that also align somewhere outside the listed genes.
ct_mat['Other'] = np.where(ct_mat.num_align > ct_mat.U1U11_map, 1, 0)
ct_mat.to_csv(out_prefix + '.mstat.csv')
Пример #6
0
    realnBAM = sys.argv[1]  # bowtie2 re-aligned bam (rbam for short)
    out_prefix = sys.argv[2]
else:
    sys.stderr.write(
        'Incorrect arguments. Usage: postprocess_realign.py realign_BAM out_prefix'
    )
    sys.exit(1)
####
# Remove non-optimal multiple alignments
####
rbam_BT = BedTool(realnBAM)  # rbam in BedTool
# Column names given to the BEDPE output read below.
cnames = [
    'chrom1', 'start1', 'end1', 'chrom2', 'start2', 'end2', 'name', 'ed',
    'strand1', 'strand2'
]
rbed_DF = pd.read_table(rbam_BT.bam_to_bed(ed=True, mate1=True, bedpe=True).fn,
                        header=None,
                        names=cnames)
# Drop pairs where either mate has the '.' placeholder chromosome.
rbed_DF = rbed_DF.loc[(rbed_DF['chrom1'] != '.') & (rbed_DF['chrom2'] != '.')]
# Only keep multi-alignments with min ED per pair.
# The string alias 'min' is used instead of the builtin ``min``: pandas
# deprecates passing builtins as aggregation functions, and the alias keeps
# the current behaviour.
min_ed = rbed_DF.pivot_table(index='name', values='ed',
                             aggfunc='min').reset_index()
# Index by (name, ed) so the (name, lowest ed) pairs select the rows to keep.
rbed_DF = rbed_DF.set_index(['name', 'ed']).sort_index()
keep = min_ed.set_index(['name', 'ed']).index.values.tolist()
rbed_DF = rbed_DF.loc[keep, ].reset_index()
# bedpe to BED: one 3-column frame per mate, names suffixed with /1 and /2
mate1 = rbed_DF.loc[:, ['chrom1', 'start1', 'end1']]
mate1['name'] = rbed_DF['name'] + '/1'
mate2 = rbed_DF.loc[:, ['chrom2', 'start2', 'end2']]
mate2['name'] = rbed_DF['name'] + '/2'
    fn_sorted = tmpdir + "/sorted.bam"
    fn_fixedmates = tmpdir + "/fixedmates.bam"

    # sort by id
    logging.debug("calling samtools sort")
    pysam.sort(args.infile, "-n", "-o{}".format(fn_sorted), "-T sortprefix")

    # fix mate information
    # also removes secondary and unmapped reads
    logging.debug("calling samtools fixmates")
    pysam.fixmate("-r", fn_sorted, fn_fixedmates)

    # bedtools bam2bed
    alns = BedTool(fn_fixedmates)
    alns_bedpe = alns.bam_to_bed(bedpe=True, mate1=True, ed=True)

    # determine alignment ends and write to file
    with (open(args.outfile, "w") if args.outfile is not None else stdout) as out:
        for i in alns_bedpe:
            chrom = i.fields[0]
            fmstart = i.fields[1]
            fmend = i.fields[2]
            smstart = i.fields[4]
            smend = i.fields[5]
            readid = i.fields[6]
            score = i.fields[7]
            fmstrand = i.fields[8]
            if fmstrand == "+":
                start = fmstart
                end = smend