示例#1
0
def _parse_single_depth(depth_output):
    """Return the depth column of the first ``samtools depth`` record.

    Accepts either a single string or a list of lines, because callers of
    ``pysam.depth`` see both shapes. Empty output (no read covers the
    queried position) and malformed records are reported as depth 0.
    """
    if not depth_output:
        return 0
    if hasattr(depth_output, 'split'):
        first_record = depth_output.split('\n', 1)[0]
    else:
        first_record = depth_output[0]
    fields = first_record.rstrip('\n').split('\t')
    return int(fields[2]) if len(fields) >= 3 else 0


def filter_by_allele_freq(input_file, output_file, tumor_bam,
                          matched_control_bam, tumor_AF_thres,
                          control_AF_thres, max_fisher_pvalue):
    """Filter candidate sites by tumor/control allele frequency.

    Each tab-separated line of ``input_file`` supplies chromosome (col 0),
    position (col 1), two pass-through columns (2, 3), the number of
    variant reads in the tumor (col 4) and in the control (col 5). The
    total depth at the position is looked up with ``pysam.depth``.

    A site is written to ``output_file`` when

    * the tumor allele frequency is at least ``tumor_AF_thres``;
    * if a matched control is given, the control allele frequency is at
      most ``control_AF_thres`` and the one-sided ('less') Fisher exact
      test p-value does not exceed ``max_fisher_pvalue``.

    :param input_file: path of the tab-separated candidate list
    :param output_file: path of the filtered table (with header line)
    :param tumor_bam: tumor BAM path
    :param matched_control_bam: control BAM path, or ``""`` for none
    :param tumor_AF_thres: minimum tumor allele frequency
    :param control_AF_thres: maximum control allele frequency
    :param max_fisher_pvalue: maximum accepted Fisher p-value
    """
    has_control = matched_control_bam != ""
    header = [
        "Chr", "Pos", "Dir", "Junc_Seq", "Num_Tumor_Total_Read",
        "Num_Tumor_Var_Read", "Num_Control_Total_Read", "Num_Control_Var_Read",
        "Minus_Log_Fisher_P_value"
    ]

    with open(output_file, 'w') as hout:
        hout.write('\t'.join(header) + '\n')

        with open(input_file, 'r') as hin:
            for line in hin:
                F = line.rstrip('\n').split('\t')
                tumor_num = int(F[4])
                control_num = int(F[5])
                region = F[0] + ':' + F[1] + '-' + F[1]

                depth_tumor = _parse_single_depth(
                    pysam.depth(tumor_bam, "-r", region))
                # Without any tumor coverage the allele frequency is
                # undefined, so the site cannot pass the tumor threshold.
                if depth_tumor <= 0:
                    continue
                if float(tumor_num) / depth_tumor < tumor_AF_thres:
                    continue

                # Placeholders reported when there is no matched control.
                depth_control = "---"
                lpvalue = "---"

                if has_control:
                    depth_control = _parse_single_depth(
                        pysam.depth(matched_control_bam, "-r", region))
                    # No control coverage is treated as AF 1.0 so the site
                    # is rejected by any threshold below 1.
                    if depth_control > 0:
                        control_AF = float(control_num) / depth_control
                    else:
                        control_AF = 1.0
                    if control_AF > control_AF_thres:
                        continue

                    # The Fisher test is only meaningful (and only
                    # computable) when control counts are numeric.
                    _, pvalue = stats.fisher_exact(
                        [[depth_tumor - tumor_num, tumor_num],
                         [depth_control - control_num, control_num]],
                        alternative='less')
                    # Clamp to keep the logarithm finite.
                    if pvalue < 1e-100:
                        pvalue = 1e-100
                    lpvalue = (-math.log(pvalue, 10) if pvalue < 1 else 0)
                    lpvalue = round(lpvalue, 4)

                    if 10**(-lpvalue) > float(max_fisher_pvalue):
                        continue

                hout.write('\t'.join(F[:4] + [
                    str(depth_tumor), str(tumor_num), str(depth_control),
                    str(control_num), str(lpvalue)
                ]) + '\n')
示例#2
0
    def calculate_coverage_score(self, bam_file, threshold):
        """Calculate coverage score using pysam from a .bam file.

        The coverage score is the percentage of positions reported by
        ``samtools depth`` whose depth is at least **threshold**.

        :param bam_file: Path to a .bam file
        :type bam_file: str
        :param threshold: Minimum number of reads per residue to be considered
        valid
        :type threshold: int
        :return: Percentage of reported residues whose depth is greater than
        or equal to the given threshold; ``0.0`` when no residue is reported
        :return type: float
        """
        raw_output = pysam.depth(bam_file)

        covered, length = 0, 0
        for record in raw_output.split('\n'):
            # Skip blank lines (notably the one after the final newline)
            # instead of blindly dropping the last element, which would
            # lose a real record when the output has no trailing newline.
            if not record:
                continue
            length += 1
            if int(record.split('\t')[2]) >= threshold:
                covered += 1

        # No reported position at all: avoid dividing by zero.
        if length == 0:
            return 0.0

        # float() keeps true division on interpreters with integer division.
        return float(covered) / length * 100
示例#3
0
def unicov(bed_file,
           bam_file: List[str],
           bam_dir: Optional[str] = None,
           bam_list: Optional[str] = None,
           **kwargs):
    """Summarise uniformity of coverage over the regions of a BED file.

    Runs ``samtools depth -a -b bed_file`` on every file in
    ``kwargs["input_files"]`` and reports, for each sample and each
    cut-off in ``COVERAGES``, the percentage of positions whose depth is
    at least that cut-off.

    Args:
        bed_file: BED file passed to ``samtools depth -b``.
        bam_file: Not read directly here; presumably resolved into
            ``kwargs["input_files"]`` by the caller -- TODO confirm.
        bam_dir: Not read directly here (see ``bam_file``).
        bam_list: Not read directly here (see ``bam_file``).
        **kwargs: Must contain ``input_files``, the BAM paths to analyse.

    Returns:
        str: The summary table rendered with ``DataFrame.to_string`` plus
        a trailing newline.
    """
    input_files = kwargs["input_files"]
    names = [sm_tag(x) for x in input_files]
    s = pysam.depth("-a", "-b", bed_file, *input_files)
    # samtools depth emits no header line; without header=None the first
    # position would be consumed as column names and lost.
    df = pd.read_table(StringIO(s), header=None)
    df.columns = ["chr", "pos"] + names
    size = df.shape[0]

    dat = {"coverage": COVERAGES}

    for name in names:
        dat[name] = [(df[name] >= coverage).sum() / size * 100
                     for coverage in COVERAGES]

    return pd.DataFrame(dat).to_string() + "\n"
示例#4
0
def samdepth(samfile, sitesfile):
    """Return the depth at every site listed in ``sitesfile``.

    Runs ``samtools depth -aa -b sitesfile`` on ``samfile`` and returns the
    third (depth) column of the output as a one-dimensional array.
    """
    depth = pysam.depth(samfile, '-aa', '-b', sitesfile)
    depth = pd.read_csv(StringIO(depth),
                        sep='\t',
                        header=None,
                        usecols=[2])
    # Select the single column explicitly: the ``squeeze`` keyword of
    # read_csv was deprecated and later removed from pandas.
    return depth.iloc[:, 0].values
def get_one_coverage(bam_fn, region, region_pad=500):
    """Return raw ``samtools depth -a`` output for a padded region.

    ``region`` is a ``(chrom, start, end)`` triple. The window is widened
    by ``region_pad`` on both sides; the start is clamped at 0 while the
    end is allowed to run past the end of the reference.
    """
    chrom, start, end = region
    padded_start = max(start - region_pad, 0)
    padded_end = end + region_pad
    region_spec = "%s:%d-%d" % (chrom, padded_start, padded_end)
    return pysam.depth(bam_fn, "-a", "-r", region_spec)
示例#6
0
def coverage_over_region(input_bam,
                         region,
                         reference,
                         min_mapq=40,
                         min_baseq=15,
                         min_anchor=11):
    """Compute coverage over 1-indexed, closed region.

    Returns a ``(mean_coverage, total_coverage, region_length)`` tuple.
    When ``samtools depth`` reports nothing, or the region is empty, the
    first two entries are ``0.``.
    """
    depth_args = [
        "-Q", str(min_mapq),
        "-q", str(min_baseq),
        "-l", str(min_anchor),
        "-r", region,
    ]
    # The reference is only passed along when one was supplied.
    if reference:
        depth_args.extend(["--reference", reference])
    depth_args.append(input_bam)
    depth_result = pysam.depth(*depth_args)  # pylint: disable=no-member

    # start, end are 0-indexed half-open coordinates
    _, start, end = pysam.libcutils.parse_region(region=region)
    region_length = end - start

    if len(depth_result) == 0 or region_length <= 0:
        return (0., 0., region_length)

    per_base = np.loadtxt(io.StringIO(depth_result), dtype=int, usecols=2)
    total_coverage = np.sum(per_base)
    return (total_coverage / region_length, total_coverage, region_length)
示例#7
0
 def getBedCovFile(this, bamFile, bedFile, outFile, bp="20", mpQ="20"):
     """Write ``samtools depth -aa`` output for a BED file to ``outFile``.

     ``bp`` is passed as ``-q`` and ``mpQ`` as ``-Q``. Returns ``True`` on
     success; on any exception an error line is printed and ``False`` is
     returned.
     """
     depth_args = ("-aa", "-q", bp, "-Q", mpQ, "-b", bedFile, bamFile)
     try:
         # The output file is opened before samtools runs.
         with open(outFile, 'w') as fileHandle:
             depth_text = pysam.depth(*depth_args)
             fileHandle.write(str(depth_text))
     except Exception as e:
         print("error: %s %s" % (bamFile, e))
         return False
     else:
         return True
def getCoverage(row, bam):
    """Map each position of the row's region to its read depth.

    ``row`` must expose ``chr``, ``start`` and ``end`` through ``.loc``.
    Depth is obtained with ``samtools depth -aa -d 0``; the result is a
    dict of ``{position: depth}`` with integer keys and values.
    """
    region = ':'.join([
        str(row.loc['chr']),
        '-'.join([str(row.loc['start']), str(row.loc['end'])]),
    ])
    output = pysam.depth('-aa', '-d', '0', '-r', region, bam)
    # The final element after the last newline is dropped.
    records = (record.split('\t') for record in output.split('\n')[:-1])
    return {int(fields[1]): int(fields[2]) for fields in records}
示例#9
0
def stats(out, loop, core, taxonomyDict, refDict, delSam):
    """Build a sorted BAM from a SAM file and write per-reference stats.

    The SAM file ``out + str(loop) + ".sam"`` is converted, sorted and
    indexed with samtools. ``samtools depth -a`` is then summarised per
    reference sequence into ``out + str(loop) + "_stats"``: read count,
    length, mean and median per-base coverage and the percentage of
    positions with non-zero depth.

    :param out: path prefix of the SAM input and of the stats output
    :param loop: iteration label appended to ``out`` and used to name the
        intermediate BAM files
    :param core: number of samtools threads
    :param taxonomyDict: maps a species name to ``[phylum, subphylum]``
    :param refDict: maps a reference ID to a record whose element ``[1]``
        is a comma-separated string starting with the species name
    :param delSam: ``"y"`` to delete the SAM and BAM files afterwards
    """
    prefix = str(loop)
    samPath = out + prefix + ".sam"
    tmpBam = prefix + "tmp.bam"
    sortedBam = prefix + "sorted.bam"
    threads = str(core)

    # Same samtools invocations as before, but passed as argument lists so
    # that paths containing spaces or shell metacharacters are safe.
    with open(tmpBam, "wb") as handle:
        subprocess.call(["samtools", "view", "-bS", "-@", threads, samPath],
                        stdout=handle)
    with open(sortedBam, "wb") as handle:
        subprocess.call(["samtools", "sort", "-@", threads, tmpBam],
                        stdout=handle)
    subprocess.call(["samtools", "index", "-@", threads, sortedBam])

    def writeRow(handle, refID, depths, zeroCount):
        """Write the summary line of one reference sequence."""
        species = refDict[refID][1].split(",")[0]
        tax = taxonomyDict[species] if species in taxonomyDict else ["NA", "NA"]
        length = len(depths)
        readCount = pysam.view("-c", sortedBam, refID).strip("\n")
        coveredPercent = (float(length) - float(zeroCount)) / float(length) * float(100)
        handle.write("\t".join([
            refID, species, tax[0], tax[1], readCount, str(length),
            str(numpy.mean(depths)), str(numpy.median(depths)),
            str(coveredPercent)
        ]) + "\n")

    depthLines = pysam.depth("-a", sortedBam).split("\n")

    with open(out + prefix + "_stats", "w") as statOut:
        statOut.write("Species_ID\tSpecies\tPhylum\tSubphylum\tDNA.read_count\tgenome_length\tmean_coverage_per_base\tmedian_coverage_per_base\tcoverage.percent\n")

        currentID = None
        cov = []
        zeroCounter = 0  # positions with zero reads in the current reference

        for line in depthLines:
            if not line:
                continue
            fields = line.split("\t")
            if fields[0] != currentID:
                # A new reference starts: flush the finished one and reset
                # every per-reference accumulator.
                if currentID is not None:
                    writeRow(statOut, currentID, cov, zeroCounter)
                currentID = fields[0]
                cov = []
                zeroCounter = 0
            depthHere = int(fields[2])
            if depthHere == 0:
                zeroCounter += 1
            cov.append(depthHere)

        # The last reference has no successor to trigger its flush.
        if currentID is not None:
            writeRow(statOut, currentID, cov, zeroCounter)

    if delSam == "y":
        os.remove(sortedBam)
        os.remove(tmpBam)
        os.remove(sortedBam + ".bai")
        os.remove(samPath)
示例#10
0
def get_depth(ref_name, ref_pos, bam_file):
    """Return the depth reported last for ``ref_pos - 1 .. ref_pos``.

    The region ``ref_name:(ref_pos-1)-ref_pos`` is queried with
    ``samtools depth``. Empty output yields 0; otherwise the text after
    the final tab, minus the last character of the output, is parsed as
    an integer.
    """
    pos_str = ref_name + ':' + str(int(ref_pos) - 1) + '-' + str(ref_pos)
    res = pysam.depth("-r", pos_str, bam_file)
    if res == '':
        return 0
    # rfind gives -1 when there is no tab, so the slice then starts at 0.
    value_start = res.rfind('\t') + 1
    value_end = len(res) - 1
    return int(res[value_start:value_end])
def get_coverage_df(bam_file):
    """Return a single-column ("depth") data frame of per-position depth.

    Rows are indexed by ``"<chrom>-<position>"``; the depth values are kept
    as the strings emitted by ``samtools depth``.
    """
    depth_records = (
        record.split('\t')
        for record in pysam.depth(bam_file, split_lines=True)
    )
    # Each record must consist of exactly three tab-separated fields.
    depth_by_site = {
        "%s-%s" % (chrom, position): depth
        for chrom, position, depth in depth_records
    }
    return pandas.DataFrame.from_dict(depth_by_site,
                                      orient='index',
                                      columns=["depth"])
示例#12
0
 def getBedCoverage(this, bamFile, bedFile, bp="20", mpQ="20"):
     """Collect per-site depth for the regions of a BED file.

     Runs ``samtools depth -aa`` with ``bp`` as ``-q`` and ``mpQ`` as
     ``-Q``. Returns the container produced by ``createDict()`` filled as
     ``container[chrom][position] = depth``, or ``False`` after printing
     an error line if anything raises.
     """
     allSiteCover = createDict()
     depth_args = ("-aa", "-q", bp, "-Q", mpQ, "-b", bedFile, bamFile)
     try:
         for record in pysam.depth(*depth_args).split("\n"):
             # Only lines that are not blank carry a depth record.
             if not re.search(r"^\s*$", record):
                 fields = record.split("\t")
                 chrom, position, depth = fields[0], int(fields[1]), int(fields[2])
                 allSiteCover[chrom][position] = depth
     except Exception as e:
         print("error: %s" % e)
         return False
     return allSiteCover
示例#13
0
文件: Bam.py 项目: xlwuHIT/svf
	def coverage(self,sv,length=0):
		"""Return ``median(depths,1)`` of read depth for every entry of ``sv._svdict``.

		Each entry holds targets indexed as (chrom, start, end). Depth is
		queried with ``samtools depth -a -Q10 -l <length>`` on ``self._fn``,
		prefixing the chromosome with ``self._chr_prefix``. Depths accumulate
		over all targets of an entry and the stored value is refreshed after
		each target; entries without targets are absent from the result.
		"""
		result={}
		for sv_id in sv._svdict:
			depths=[]
			for target in sv._svdict[sv_id]:
				region='{}:{}-{}'.format(self._chr_prefix+target[0],target[1],target[2])
				raw=pysam.depth("-a", "-Q10", "-r", region, "-l", str(length), self._fn)
				rows=(row.strip().split('\t') for row in raw.strip().split('\n'))
				# Records with fewer than three columns are ignored.
				depths.extend(int(fields[2]) for fields in rows if len(fields)>=3)
				result[sv_id]=median(depths,1)
		return result
def calc_depth_coeff(args, params):
    """Fill the module-level ``coeff`` dict with depth-normalisation factors.

    Depth comes from ``samtools depth -a`` on ``args.b`` or, when that is
    ``None``, on ``args.c`` with ``--reference args.fa``. Only records of
    ``args.refseq_id`` are used, keyed by 0-based position. For every
    position in ``[refseq_start, refseq_end - k]`` that is a multiple of
    ``params.slide_bin``, ``coeff[pos]`` is the mean depth of the whole
    region divided by the mean depth of the ``params.k`` positions
    starting at ``pos``.
    """
    global coeff
    if args.b is not None:
        raw_depth = pysam.depth(args.b, '-a')
    else:
        raw_depth = pysam.depth(args.c, '-a', '--reference', args.fa)
    records = raw_depth.strip().split('\n')
    coeff = {}

    depth_at = {}
    for record in records:
        fields = record.split()
        if fields[0] != args.refseq_id:
            continue
        depth_at[int(fields[1]) - 1] = int(fields[2])  # 1-based to 0-based

    region_depths = [
        depth_at[pos] for pos in range(args.refseq_start, args.refseq_end)
    ]
    mean_depth = np.mean(region_depths)

    last_window_start = args.refseq_end - params.k + 1
    for pos in range(args.refseq_start, last_window_start):
        if pos % params.slide_bin != 0:
            continue
        window = [depth_at[pos + offset] for offset in range(params.k)]
        coeff[pos] = mean_depth / np.mean(window)
示例#15
0
def getSam():
    """Summarise per-reference coverage of ``normalRun.sam``.

    The SAM file is converted to BAM, sorted and indexed with samtools
    (20 threads). ``samtools depth -a`` on the sorted BAM is then reduced
    to one line per reference sequence in the file ``normalRun``: read
    count, sequence length, mean and median per-base coverage and the
    percentage of positions with non-zero depth.
    """
    import subprocess
    import numpy
    import pysam

    commands = (
        "samtools view -bS -@ 20 normalRun.sam > normalRun.bam",
        "samtools sort -@ 20 normalRun.bam > normalRunsorted.bam",
        "samtools index -@ 20 normalRunsorted.bam",
    )
    for command in commands:
        process = subprocess.Popen([command], shell=True,
                                   stdout=subprocess.PIPE)
        process.communicate()

    def write_row(handle, ref_id, depths, zero_count):
        """Write the summary line of one reference sequence."""
        length = len(depths)
        handle.write(
            ref_id + "\t" +
            pysam.view("-c", "normalRunsorted.bam", ref_id).strip("\n") +
            "\t" + str(length) + "\t" + str(numpy.mean(depths)) + "\t" +
            str(numpy.median(depths)) + "\t" +
            str((float(length) - float(zero_count)) / float(length) *
                float(100)) + "\n")

    depth_lines = pysam.depth("-a", "normalRunsorted.bam").split("\n")

    with open("normalRun", "w") as out:
        out.write(
            "reference\tDNA.read_count\tsequence_length\tmeanCov\tmedianCov\tcovPer\n"
        )

        current_id = None
        cov = []
        zero_counter = 0  # positions with zero reads in the current reference

        for line in depth_lines:
            if not line:
                continue
            fields = line.split("\t")
            if fields[0] != current_id:
                # A new reference starts: flush the finished one and reset
                # every per-reference accumulator.
                if current_id is not None:
                    write_row(out, current_id, cov, zero_counter)
                current_id = fields[0]
                cov = []
                zero_counter = 0
            depth_here = int(fields[2])
            if depth_here == 0:
                zero_counter += 1
            cov.append(depth_here)

        # The last reference has no successor to trigger its flush.
        if current_id is not None:
            write_row(out, current_id, cov, zero_counter)
示例#16
0
def bamdepthtobed(bamfile, outbed='mindepth.bed', mindepth=1, minlength=1):
    """Write regions whose depth is at least ``mindepth`` to a BED-like file.

    For every chromosome reported by ``Baminfo.Baminfo(bamfile).getchrlen()``
    the positions with depth >= ``mindepth`` are collected from
    ``samtools depth`` and grouped by ``continueregion(points, minlength)``.
    Each resulting region is written to ``outbed`` as
    ``chrom<TAB>start_site<TAB>end_site``.
    """
    baminfor = Baminfo.Baminfo(bamfile)

    # The context manager closes the output even if samtools or parsing fails.
    with open(outbed, 'w') as outio:
        for chrom in baminfor.getchrlen():

            print(chrom)

            depthstr = pysam.depth('-r', chrom, bamfile)

            points = []

            for reg in depthstr.split('\n'):
                # Blank lines (e.g. after the final newline) are not records.
                if not reg:
                    continue
                try:
                    # The chromosome column is ignored so the loop variable
                    # is not overwritten by parsed text.
                    _, site, depthnow = reg.split('\t')
                    site = int(site)
                    depthnow = int(depthnow)
                except ValueError:
                    print("warnning:", reg)
                    continue

                if depthnow >= mindepth:
                    points.append(site)

            for nowregion in continueregion(points, minlength):
                print(chrom,
                      nowregion['start_site'],
                      nowregion['end_site'],
                      sep='\t',
                      file=outio)


# if __name__ == '__main__':
#
#     bamdepthtobed(bamfile='/mnt/e/Data/Solanum/Solanum_etuberosum/Solanum_etuberosum_map_to_DM.bam',
#                   outbed='/mnt/e/Data/Solanum/Solanum_etuberosum/Solanum_etuberosum_map_to_DM_min_depth.bed',
#                   mindepth=3,
#                   minlength=150)
示例#17
0
def realdepth(bamfh, region, cut):
    """Return per-position depths (as floats) for ``region``.

    Depth is taken from ``samtools depth -a -Q40``. When the result is a
    single string it is split into lines and only lines with exactly three
    tab-separated fields are used; any other iterable is consumed line by
    line without that check. ``cut`` is not used.
    """
    depth_result = pysam.depth("-a", "-Q40", "-r", region, bamfh)

    if isinstance(depth_result, str):
        rows = (entry.rstrip('\n').split('\t')
                for entry in depth_result.split('\n'))
        return [float(fields[2]) for fields in rows if len(fields) == 3]

    return [float(entry.rstrip('\n').split('\t')[2]) for entry in depth_result]
示例#18
0
 def create_depth_file(self, bamfile_name, core_num):
     """
     Sort a BAM file by coordinate and return its ``samtools depth`` output.

     bamfile_name: name of the BAM file to compute depth for
     core_num: number of sorting threads; a falsy value means 2
     Description: The sorted copy is written next to the input, named by
                  replacing everything from the first '.bam' onwards with
                  '.sortedByCoord.bam'. The depth output is returned as one
                  single string rather than saved to a file.
     """
     threads = int(core_num) if core_num else 2
     stem = bamfile_name.split('.bam')[0]
     sorted_bamfile = stem + '.sortedByCoord.bam'
     sort_args = ["--threads", str(threads), "-m", "2G", "-o",
                  sorted_bamfile, bamfile_name]
     pysam.sort(*sort_args)
     return pysam.depth(sorted_bamfile)
示例#19
0
def getDepths(in_aln,
              region,
              min_base_qual=13,
              excluded_flags="UNMAP,SECONDARY,QCFAIL,DUP"):
    """
    Return the depth at every position of a region.

    :param in_aln: Path to the alignments file.
    :type in_aln: str
    :param region: The evaluated region (``reference.name``, ``start`` and ``end`` are read).
    :type region: anacore.region.Region
    :param min_base_qual: Minimum base quality for a read base to count towards depth.
    :type min_base_qual: int
    :param excluded_flags: Comma-separated flags; reads carrying any of them are discarded.
    :type excluded_flags: str
    :return: Depths on region, one integer per line of ``samtools depth`` output.
    :rtype: list
    """
    target = "{}:{}-{}".format(region.reference.name, region.start,
                               region.end)
    out_str = pysam.depth("-a", "-G", excluded_flags, "-J", "-q",
                          str(min_base_qual), "-r", target, in_aln)
    depths = []
    for record in out_str.strip().split("\n"):
        # Third whitespace-separated column holds the depth.
        depths.append(int(record.split()[2]))
    return depths
示例#20
0
    def coverage(self, region):
        """Return ``(depth, breadth)`` ratios for the sequence ``region[0]``.

        ``samtools depth`` is run on ``self.data_file`` for positions 1 to
        ``region[1][0]``. Both the summed depth and the number of reported
        positions are divided by ``int(region[1][0])``.
        """
        print(
            "-->Determining read coverage and depth for \033[1;35m{0}\033[m.".
            format(region[0]))

        region_spec = "-r{0}:1-{1}".format(region[0], region[1][0])
        depth_lines = pysam.depth(region_spec,
                                  self.data_file,
                                  split_lines=True)
        per_position = [int(line.split("\t")[2]) for line in depth_lines]
        depth_counts = sum(per_position)
        breadth_counts = len(per_position)

        print(
            "   -->Read coverage and depth analysis complete for \033[1;35m{0}\033[m."
            .format(region[0]))

        return depth_counts / int(region[1][0]), breadth_counts / int(
            region[1][0])
示例#21
0
def calculate_read_depth(target_gene,
                         control_gene,
                         bam_path,
                         output_file,
                         genome_build="hg19"):
    """Compute per-position read depth from BAM files and write a GDF
    (GATK DepthOfCoverage Format) file for Stargazer.

    Parameters
    ----------
    target_gene : str
        Name of the target gene. Choices: {'abcb1', 'cacna1s',
        'cftr', 'cyp1a1', 'cyp1a2', 'cyp1b1', 'cyp2a6',
        'cyp2a13', 'cyp2b6', 'cyp2c8', 'cyp2c9', 'cyp2c19',
        'cyp2d6', 'cyp2e1', 'cyp2f1', 'cyp2j2', 'cyp2r1',
        'cyp2s1', 'cyp2w1', 'cyp3a4', 'cyp3a5', 'cyp3a7',
        'cyp3a43', 'cyp4a11', 'cyp4a22', 'cyp4b1', 'cyp4f2',
        'cyp17a1', 'cyp19a1', 'cyp26a1', 'dpyd', 'g6pd',
        'gstm1', 'gstp1', 'gstt1', 'ifnl3', 'nat1', 'nat2',
        'nudt15', 'por', 'ptgis', 'ryr1', 'slc15a2',
        'slc22a2', 'slco1b1', 'slco1b3', 'slco2b1', 'sult1a1',
        'tbxas1', 'tpmt', 'ugt1a1', 'ugt1a4', 'ugt2b7',
        'ugt2b15', 'ugt2b17', 'vkorc1', 'xpc'}.
    control_gene : str
        Name of a preselected control gene, used by Stargazer for
        intrasample normalization during copy number analysis.
        Choices: {'egfr', 'ryr1', 'vdr'}. A custom genomic region in
        'chr:start-end' format (e.g. chr12:48232319-48301814) is also
        accepted.
    bam_path : str
        Either a single BAM file (name ending in '.bam') or a text file
        listing one BAM path per line.
    output_file : str
        Path to the output file.
    genome_build : str, default: 'hg19'
        Build of the reference genome assembly. Choices:
        {'hg19', 'hg38'}.

    Raises
    ------
    ValueError
        If a BAM file has no SM tag or more than one.
    """
    if bam_path.endswith('.bam'):
        bam_files = [bam_path]
    else:
        with open(bam_path) as f:
            bam_files = [line.strip() for line in f]

    sn_tags = []
    sm_tags = []

    for bam_file in bam_files:
        sn_tags += get_sn_tags(bam_file)
        _sm_tags = get_sm_tags(bam_file)
        if not _sm_tags:
            raise ValueError(f"SM tags not found: {bam_file}")
        if len(_sm_tags) > 1:
            raise ValueError(f"Multiple SM tags ({_sm_tags}) "
                             f"found: {bam_file}")
        sm_tags.append(list(_sm_tags)[0])

    # Contig names decide whether regions need the "chr" prefix.
    uses_chr = any(["chr" in x for x in set(sn_tags)])
    chr_prefix = "chr" if uses_chr else ""

    loci = sorted(
        [
            Locus.from_input(target_gene, genome_build),
            Locus.from_input(control_gene, genome_build)
        ],
        key=lambda x: (int(x.chrom), x.region_start))

    depth_chunks = []
    for locus in loci:
        depth_chunks.append(
            pysam.depth("-a", "-Q", '1', "-r",
                        f"{chr_prefix}{locus.region}", *bam_files))
    depth_data = "".join(depth_chunks)

    df = pd.read_csv(StringIO(depth_data), sep="\t", header=None)

    df.columns = ["chrom", "pos"] + ["Depth_for_" + x for x in sm_tags]

    locus_labels = df["chrom"].astype(str) + ":" + df["pos"].astype(str)
    df.insert(0, "Locus", locus_labels)

    df.drop(columns=["chrom", "pos"], inplace=True)

    # At each step the slice covers exactly the per-sample depth columns.
    df.insert(1, "Total_Depth", df.iloc[:, 1:].sum(axis=1))
    df.insert(2, "Average_Depth_sample", df.iloc[:, 2:].mean(axis=1))

    df.to_csv(output_file, sep='\t', index=False)
def get_coverage_and_snp_count(task_queue, reference, output_metrics,
                               output_vcf, timeout):
    """Worker loop: compute coverage/SNP metrics for queued BAM/VCF pairs.

    Repeatedly takes a ``(bam_file, vcf_file)`` tuple from ``task_queue``
    and stops once the queue stays empty for ``timeout`` seconds. For each
    pair it:

    * derives average depth and the fraction of reference positions with
      non-zero depth from ``pysam.depth`` and the FASTA ``reference``;
    * counts "good" SNPs in the VCF (single-base REF and ALT, QUAL > 150);
    * writes an output VCF -- the input VCF with one extra record per
      zero-coverage position, or a plain copy when there is none;
    * writes a three-line tab-separated metrics file.

    :param task_queue: queue of ``(bam_file, vcf_file)`` tuples
    :param reference: path of the reference FASTA file
    :param output_metrics: metrics file path, or ``None`` to use
        ``OUTPUT_METRICS_DIR/<base name>.tabular``
    :param output_vcf: output VCF path, or ``None`` to use
        ``OUTPUT_VCF_DIR/<base name>.vcf``
    :param timeout: seconds to wait for a new task before returning
    """
    while True:
        try:
            tup = task_queue.get(block=True, timeout=timeout)
        except queue.Empty:
            # No task arrived within the timeout: the worker is done.
            break
        bam_file, vcf_file = tup
        # Create a coverage dictionary.
        coverage_dict = {}
        coverage_list = pysam.depth(bam_file, split_lines=True)
        for line in coverage_list:
            chrom, position, depth = line.split('\t')
            # Keys have the form "<chrom>-<position>"; depth stays a string.
            coverage_dict["%s-%s" % (chrom, position)] = depth
        # Convert it to a data frame.
        coverage_df = pandas.DataFrame.from_dict(coverage_dict,
                                                 orient='index',
                                                 columns=["depth"])
        # Create a zero coverage dictionary.
        zero_dict = {}
        for record in SeqIO.parse(reference, "fasta"):
            chrom = record.id
            total_len = len(record.seq)
            # One entry per 1-based position of every reference sequence.
            for pos in list(range(1, total_len + 1)):
                zero_dict["%s-%s" % (str(chrom), str(pos))] = 0
        # Convert it to a data frame; after the merge below its "depth"
        # column becomes depth_x and the observed one becomes depth_y.
        zero_df = pandas.DataFrame.from_dict(zero_dict,
                                             orient='index',
                                             columns=["depth"])
        coverage_df = zero_df.merge(coverage_df,
                                    left_index=True,
                                    right_index=True,
                                    how='outer')
        # depth_x "0" column no longer needed.
        coverage_df = coverage_df.drop(columns=['depth_x'])
        coverage_df = coverage_df.rename(columns={'depth_y': 'depth'})
        # Convert the NaN to 0 coverage and get some metrics.
        coverage_df = coverage_df.fillna(0)
        coverage_df['depth'] = coverage_df['depth'].apply(int)
        total_length = len(coverage_df)
        average_coverage = coverage_df['depth'].mean()
        # From here on zero_df holds only the positions without coverage.
        zero_df = coverage_df[coverage_df['depth'] == 0]
        total_zero_coverage = len(zero_df)
        total_coverage = total_length - total_zero_coverage
        genome_coverage = "{:.2%}".format(total_coverage / total_length)
        # Process the associated VCF input.
        column_names = [
            "CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO",
            "FORMAT", "Sample"
        ]
        vcf_df = pandas.read_csv(vcf_file,
                                 sep='\t',
                                 header=None,
                                 names=column_names,
                                 comment='#')
        # "Good" SNPs: single-base REF and ALT with QUAL above 150.
        good_snp_count = len(
            vcf_df[(vcf_df['ALT'].str.len() == 1)
                   & (vcf_df['REF'].str.len() == 1) & (vcf_df['QUAL'] > 150)])
        base_file_name = get_base_file_name(vcf_file)
        if total_zero_coverage > 0:
            # Copy the '#' header lines of the input VCF to a side file.
            header_file = "%s_header.csv" % base_file_name
            with open(header_file, 'w') as outfile:
                with open(vcf_file) as infile:
                    for line in infile:
                        if re.search('^#', line):
                            outfile.write("%s" % line)
            # Keep only single-base REF/ALT records, indexed like zero_df.
            vcf_df_snp = vcf_df[vcf_df['REF'].str.len() == 1]
            vcf_df_snp = vcf_df_snp[vcf_df_snp['ALT'].str.len() == 1]
            vcf_df_snp['ABS_VALUE'] = vcf_df_snp['CHROM'].map(
                str) + "-" + vcf_df_snp['POS'].map(str)
            vcf_df_snp = vcf_df_snp.set_index('ABS_VALUE')
            # Column-wise concat aligns on the "<chrom>-<pos>" index, so
            # zero-coverage positions absent from the VCF become NaN rows.
            cat_df = pandas.concat([vcf_df_snp, zero_df], axis=1, sort=False)
            cat_df = cat_df.drop(columns=['CHROM', 'POS', 'depth'])
            # Fill the added rows with placeholder VCF values.
            cat_df[['ID', 'ALT', 'QUAL', 'FILTER',
                    'INFO']] = cat_df[['ID', 'ALT', 'QUAL', 'FILTER',
                                       'INFO']].fillna('.')
            cat_df['REF'] = cat_df['REF'].fillna('N')
            cat_df['FORMAT'] = cat_df['FORMAT'].fillna('GT')
            cat_df['Sample'] = cat_df['Sample'].fillna('./.')
            # Rebuild CHROM and POS from the index; splitting at the last
            # '-' allows chromosome names that contain '-' themselves.
            cat_df['temp'] = cat_df.index.str.rsplit('-', n=1)
            cat_df[['CHROM',
                    'POS']] = pandas.DataFrame(cat_df.temp.values.tolist(),
                                               index=cat_df.index)
            cat_df = cat_df[[
                'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
                'FORMAT', 'Sample'
            ]]
            cat_df['POS'] = cat_df['POS'].astype(int)
            cat_df = cat_df.sort_values(['CHROM', 'POS'])
            body_file = "%s_body.csv" % base_file_name
            cat_df.to_csv(body_file, sep='\t', header=False, index=False)
            if output_vcf is None:
                output_vcf_file = os.path.join(OUTPUT_VCF_DIR,
                                               "%s.vcf" % base_file_name)
            else:
                output_vcf_file = output_vcf
            # The output VCF is the header file followed by the body file.
            with open(output_vcf_file, "w") as outfile:
                for cf in [header_file, body_file]:
                    with open(cf, "r") as infile:
                        for line in infile:
                            outfile.write("%s" % line)
        else:
            # Every position is covered: the input VCF is copied unchanged.
            if output_vcf is None:
                output_vcf_file = os.path.join(OUTPUT_VCF_DIR,
                                               "%s.vcf" % base_file_name)
            else:
                output_vcf_file = output_vcf
            shutil.copyfile(vcf_file, output_vcf_file)
        # NOTE(review): "%4f" is a minimum field width of 4 with the default
        # six decimals; "%.4f" may have been intended -- confirm.
        bam_metrics = [
            base_file_name, "",
            "%4f" % average_coverage, genome_coverage
        ]
        vcf_metrics = [base_file_name, str(good_snp_count), "", ""]
        if output_metrics is None:
            output_metrics_file = os.path.join(OUTPUT_METRICS_DIR,
                                               "%s.tabular" % base_file_name)
        else:
            output_metrics_file = output_metrics
        metrics_columns = [
            "File", "Number of Good SNPs", "Average Coverage",
            "Genome Coverage"
        ]
        # One commented header line, then the BAM row and the VCF row.
        with open(output_metrics_file, "w") as fh:
            fh.write("# %s\n" % "\t".join(metrics_columns))
            fh.write("%s\n" % "\t".join(bam_metrics))
            fh.write("%s\n" % "\t".join(vcf_metrics))
        task_queue.task_done()
示例#23
0
def bam2sdf(genome_build: str, target_gene: str, control_gene: str,
            bam_file: List[str], **kwargs) -> str:
    """
    Build SDF text (per-base depth) for a target and a control region.

    The target region is looked up in the gene table. The control is either
    looked up in the gene table as well, or, when ``control_gene`` contains
    "chr" or ":", taken verbatim as a region string. Depth is then computed
    for each region with ``samtools depth -a -Q 1`` over all input BAM files
    and the raw outputs are concatenated.

    Args:
        genome_build (str): Genome build (hg19, hg38).
        target_gene (str): Target gene.
        control_gene (str): Control gene or region.
        bam_file (list[str]): BAM file(s).
        **kwargs: Accepted but not used by this function.

    Returns:
        str: SDF file content (concatenated ``samtools depth`` output).

    Raises:
        ValueError: If ``target_gene`` is not a target gene, or if
            ``control_gene`` is given by name and is not a control gene.
    """
    gene_table = get_gene_table()

    targets = [
        name for name, info in gene_table.items() if info["type"] == "target"
    ]
    if target_gene not in targets:
        raise ValueError(
            f"'{target_gene}' is not among target genes: {targets}")

    region_key = f"{genome_build}_region"
    target_region = gene_table[target_gene][region_key].replace("chr", "")

    # A control containing "chr" or ":" is treated as an explicit region
    # rather than a gene name.
    is_explicit_region = "chr" in control_gene or ":" in control_gene
    if not is_explicit_region:
        controls = [
            name for name, info in gene_table.items()
            if info["control"] == "yes"
        ]
        if control_gene not in controls:
            raise ValueError(
                f"'{control_gene}' is not among control genes: {controls}")
        control_region = gene_table[control_gene][region_key].replace(
            "chr", "")
    else:
        control_region = control_gene.replace("chr", "")

    regions = sort_regions([target_region, control_region])

    # Collect sample names and the (unique, order-preserving) contig names
    # declared in the @SQ lines of each BAM header.
    sample_ids = []
    contigs = []
    for path in bam_file:
        sample_ids.append(sm_tag(path))

        header_lines = pysam.view("-H", path).strip().split("\n")
        for header_line in header_lines:
            columns = header_line.split("\t")
            if columns[0] != "@SQ":
                continue
            for column in columns:
                if "SN:" not in column:
                    continue
                contig = column.replace("SN:", "")
                if contig not in contigs:
                    contigs.append(contig)

    logger.info(f"Sample IDs: {sample_ids}")
    logger.info(f"Contigs: {contigs}")

    # Regions were stripped of "chr" above; add it back only when the BAM
    # headers use it.
    prefix = "chr" if any("chr" in contig for contig in contigs) else ""

    depth_chunks = [
        pysam.depth("-a", "-Q", "1", "-r", f"{prefix}{region}", *bam_file)
        for region in regions
    ]

    return "".join(depth_chunks)
示例#24
0
 def add_zero_coverage(self, sample_name, sample_reference, nodupbam,
                       hapall, zero_coverage_vcf):
     """Write a VCF that also lists zero-coverage positions; return metrics.

     Per-position depth is read from ``samtools depth`` on ``nodupbam`` and
     merged against every position of every record in the reference FASTA,
     so positions missing from the depth output count as depth 0.

     When at least one position has zero depth, the output VCF consists of
     the header lines of ``hapall`` followed by its single-base REF/ALT
     records plus one placeholder record (REF ``N``, ALT ``.``, genotype
     ``./.``) per zero-depth position, sorted by CHROM and POS. Records of
     ``hapall`` whose REF or ALT is longer than one character are not
     carried over in that case. Otherwise ``hapall`` is copied unchanged.

     Args:
         sample_name: Not used by this method.
         sample_reference: Path to the reference FASTA.
         nodupbam: Path to the BAM file passed to ``samtools depth``.
         hapall: Path to the input VCF.
         zero_coverage_vcf: Path of the VCF to write.

     Returns:
         tuple: ``(zero_coverage_vcf, good_snp_count, ave_coverage,
         genome_coverage)`` where ``good_snp_count`` is the number of
         single-base records with QUAL > 150, ``ave_coverage`` is the mean
         depth over all reference positions and ``genome_coverage`` is the
         fraction of positions with non-zero depth formatted as a
         percentage string.
     """
     coverage_dict = {}
     for line in pysam.depth(nodupbam, split_lines=True):
         chrom, position, depth = line.split('\t')
         coverage_dict[chrom + "-" + position] = depth
     coverage_df = pd.DataFrame.from_dict(coverage_dict,
                                          orient='index',
                                          columns=["depth"])
     # One "<chrom>-<pos>" key per reference position, all with depth 0.
     zero_dict = {}
     for record in SeqIO.parse(sample_reference, "fasta"):
         chrom = record.id
         for pos in range(1, len(record.seq) + 1):
             zero_dict[str(chrom) + "-" + str(pos)] = 0
     zero_df = pd.DataFrame.from_dict(zero_dict,
                                      orient='index',
                                      columns=["depth"])
     # The merge yields depth_x (all zeros) and depth_y (observed depth,
     # NaN where the position was absent from the depth output).
     coverage_df = zero_df.merge(coverage_df,
                                 left_index=True,
                                 right_index=True,
                                 how='outer')
     # The depth_x "0" column is no longer needed.
     coverage_df = coverage_df.drop(columns=['depth_x'])
     coverage_df = coverage_df.rename(columns={'depth_y': 'depth'})
     # Convert the NaN entries to 0 coverage.
     coverage_df = coverage_df.fillna(0)
     coverage_df['depth'] = coverage_df['depth'].apply(int)
     total_length = len(coverage_df)
     ave_coverage = coverage_df['depth'].mean()
     zero_df = coverage_df[coverage_df['depth'] == 0]
     total_zero_coverage = len(zero_df)
     print("\tPositions with no coverage: {:,}".format(total_zero_coverage))
     total_coverage = total_length - total_zero_coverage
     genome_coverage = "{:.2%}".format(total_coverage / total_length)
     vcf_df = pd.read_csv(hapall,
                          sep='\t',
                          header=None,
                          names=[
                              "CHROM", "POS", "ID", "REF", "ALT", "QUAL",
                              "FILTER", "INFO", "FORMAT", "Sample"
                          ],
                          comment='#')
     good_snp_count = len(
         vcf_df[(vcf_df['ALT'].str.len() == 1)
                & (vcf_df['REF'].str.len() == 1) & (vcf_df['QUAL'] > 150)])
     if total_zero_coverage > 0:
         header_path = 'v_header.csv'
         body_path = 'v_annotated_body.csv'
         try:
             # Context managers guarantee both handles are closed even if
             # reading or writing fails part-way.
             with open(hapall) as vcf_in, \
                     open(header_path, 'w') as header_out:
                 for line in vcf_in:
                     if line.startswith('#'):
                         print(line.strip(), file=header_out)
             vcf_df_snp = vcf_df[vcf_df['REF'].str.len() == 1]
             # Explicit copy: a column is added below, and assigning into a
             # filtered slice triggers pandas' SettingWithCopyWarning.
             vcf_df_snp = vcf_df_snp[vcf_df_snp['ALT'].str.len() ==
                                     1].copy()
             vcf_df_snp['ABS_VALUE'] = vcf_df_snp['CHROM'].map(
                 str) + '-' + vcf_df_snp['POS'].map(str)
             vcf_df_snp = vcf_df_snp.set_index('ABS_VALUE')
             # Align SNP records and zero-depth positions on the shared
             # "<chrom>-<pos>" index; rows only present in zero_df get NaN
             # in the VCF columns and are filled with placeholders below.
             cat_df = pd.concat([vcf_df_snp, zero_df], axis=1, sort=False)
             cat_df = cat_df.drop(columns=['CHROM', 'POS', 'depth'])
             cat_df[['ID', 'ALT', 'QUAL', 'FILTER',
                     'INFO']] = cat_df[['ID', 'ALT', 'QUAL', 'FILTER',
                                        'INFO']].fillna('.')
             cat_df['REF'] = cat_df['REF'].fillna('N')
             cat_df['FORMAT'] = cat_df['FORMAT'].fillna('GT')
             cat_df['Sample'] = cat_df['Sample'].fillna('./.')
             # Recover CHROM and POS from the index; split on the last '-'
             # only, since chromosome names may themselves contain '-'.
             cat_df['temp'] = cat_df.index.str.rsplit('-', n=1)
             cat_df[['CHROM',
                     'POS']] = pd.DataFrame(cat_df.temp.values.tolist(),
                                            index=cat_df.index)
             cat_df = cat_df[[
                 'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER',
                 'INFO', 'FORMAT', 'Sample'
             ]]
             cat_df['POS'] = cat_df['POS'].astype(int)
             cat_df = cat_df.sort_values(['CHROM', 'POS'])
             cat_df.to_csv(body_path,
                           sep='\t',
                           header=False,
                           index=False)
             with open(zero_coverage_vcf, "wb") as outfile:
                 for cf in (header_path, body_path):
                     with open(cf, "rb") as infile:
                         outfile.write(infile.read())
         finally:
             # Always clean up the intermediate files, including when a
             # step above raised before both of them were created.
             for tmp_path in (header_path, body_path):
                 if os.path.exists(tmp_path):
                     os.remove(tmp_path)
     else:
         shutil.copyfile(hapall, zero_coverage_vcf)
     return (zero_coverage_vcf, good_snp_count, ave_coverage,
             genome_coverage)
示例#25
0
    parser.add_argument('-g', '--gff', required=True, help='gene annotation in gff format')
    parser.add_argument('-mf', '--min_frac', required=False, type=float, default=0.3, help='minimum fraction of gene covered by <min_cov> reads (0.3)')
    parser.add_argument('-mc', '--min_cov', required=True, type=int, default=2, help='minimum coverage per base to be considered covered (2)')
    parser.add_argument('-o', '--out', required=False, default='genes_pav.tsv', help='output table')
    args=parser.parse_args()
    gff = GffIO.GffIO(args.gff)
    samfile = pysam.AlignmentFile(args.inbam, "rb")
    genes = {}
    for gene in gff.nextGene():
        ID = gene.ID
        coords = gene.extractCoords("exon")
        contig = gene.seqid
        genes[ID] = {"tot":0, "pass":0}
        for i in range(len(coords["starts"])):
            region=contig+":"+coords["starts"][i]+"-"+coords["ends"][i]
            for pos in pysam.depth('-aa', '-r', region, args.inbam, split_lines=True):
                genes[ID]["tot"] += 1
                depth = pos.split("\t")[2]
                if int(depth) >= args.min_cov:
                    genes[ID]["pass"] += 1
#            for column in samfile.pileup(contig, int(coords["starts"][i]), int(coords["ends"][i]), truncate=True):
#                genes[ID]["tot"] += 1
#                if column.nsegments >= args.min_cov:
#                    genes[ID]["pass"] += 1
    log="Analyzed "+str(len(genes))+" genes in file "+args.inbam+"\n"
    print(log)
    samfile.close()
    outfile = open(args.out, "w")
    header = "Gene\tPresence\tPassed\tTotal\n"
    outfile.write(header)
    for gene in genes.keys():
示例#26
0
文件: pycov.py 项目: sbslee/fuc
    def from_bam(cls,
                 bams,
                 regions=None,
                 zero=False,
                 map_qual=None,
                 names=None):
        """
        Construct CovFrame from BAM files.

        Under the hood, the method computes read depth using the
        :command:`samtools depth` command.

        Parameters
        ----------
        bams : str or list
            One or more input BAM files. Alternatively, you can provide a
            text file (.txt, .tsv, .csv, or .list) containing one BAM file
            per line.
        regions : str, list, or pybed.BedFrame, optional
            By default (``regions=None``), the method counts all reads in BAM
            files, which can be excruciatingly slow for large files (e.g.
            whole genome sequencing). Therefore, use this argument to only
            output positions in given regions. Each region must have the
            format chrom:start-end and be a half-open interval with (start,
            end]. This means, for example, chr1:100-103 will extract
            positions 101, 102, and 103. Alternatively, you can provide a BED
            file (compressed or uncompressed) or a :class:`pybed.BedFrame`
            object to specify regions. Note that the 'chr' prefix in contig
            names (e.g. 'chr1' vs. '1') will be automatically added or
            removed as necessary to match the input BAM's contig names.
        zero : bool, default: False
            If True, output all positions including those with zero depth.
        map_qual: int, optional
            Only count reads with mapping quality greater than or equal to
            this number.
        names : list, optional
            By default (``names=None``), sample name is extracted using SM
            tag in BAM files. If the tag is missing, the method will set the
            filename as sample name. Use this argument to manually provide
            sample names.

        Returns
        -------
        CovFrame
            CovFrame object.

        Raises
        ------
        TypeError
            If ``regions`` is not a str, list, or pybed.BedFrame.
        ValueError
            If ``names`` is not given and a BAM file carries more than one
            sample name.

        See Also
        --------
        CovFrame
            CovFrame object creation using constructor.
        CovFrame.from_dict
            Construct CovFrame from dict of array-like or dicts.
        CovFrame.from_file
            Construct CovFrame from a text file containing read depth data.

        Examples
        --------

        >>> from fuc import pycov
        >>> cf = pycov.CovFrame.from_bam(bam)
        >>> cf = pycov.CovFrame.from_bam([bam1, bam2])
        >>> cf = pycov.CovFrame.from_bam(bam, regions='19:41497204-41524301')
        """
        # Parse input BAM files.
        bams = common.parse_list_or_file(bams)

        # Use the 'chr' prefix only when every input BAM uses it.
        if all(pybam.has_chr_prefix(x) for x in bams):
            chr_prefix = 'chr'
        else:
            chr_prefix = ''

        # Assemble the optional 'samtools depth' flags.
        args = []
        if zero:
            args += ['-a']
        if map_qual is not None:
            args += ['-Q', str(map_qual)]

        # Run the 'samtools depth' command, once overall or once per region.
        if regions is None:
            results = pysam.depth(*(bams + args))
        else:
            if isinstance(regions, str):
                regions = [regions]
            elif isinstance(regions, pybed.BedFrame):
                # Convert the BedFrame that was passed in. The previous code
                # referenced an unassigned local here and always failed.
                regions = regions.to_regions()
            elif not isinstance(regions, list):
                raise TypeError("Incorrect type of argument 'regions'")
            if '.bed' in regions[0]:
                # The first element names a BED file: load regions from it.
                bf = pybed.BedFrame.from_file(regions[0])
                regions = bf.to_regions()
            else:
                regions = common.sort_regions(regions)
            chunks = []
            for region in regions:
                # Normalize the prefix to whatever the BAM files use.
                region = chr_prefix + region.replace('chr', '')
                chunks.append(pysam.depth(*(bams + args + ['-r', region])))
            results = ''.join(chunks)

        headers = ['Chromosome', 'Position']
        dtype = {'Chromosome': str, 'Position': int}

        # One depth column per BAM, named by 'names' or by the SM tag.
        for i, bam in enumerate(bams):
            if names:
                name = names[i]
            else:
                samples = pybam.tag_sm(bam)
                if not samples:
                    basename = Path(bam).stem
                    message = (f'SM tags were not found for {bam}, will use '
                               f'file name as sample name ({basename})')
                    samples = [basename]
                    warnings.warn(message)
                if len(samples) > 1:
                    m = f'multiple sample names detected: {bam}'
                    raise ValueError(m)
                name = samples[0]
            headers.append(name)
            dtype[name] = int

        df = pd.read_csv(StringIO(results),
                         sep='\t',
                         header=None,
                         names=headers,
                         dtype=dtype)

        return cls(df)
def get_genome_metrics(bam_file, ref_length = 29903, platform = 'ILLUMINA', read_length=148, verbose=False, temp=None):
    """Compute coverage metrics for a BAM file.

    For ``platform == 'ILLUMINA'`` the mapped reads are first filtered into a
    temporary BAM (reads whose summed CIGAR-derived lengths reach
    ``read_length``), and the metrics are computed on that file; for any
    other platform they are computed on ``bam_file`` directly.

    Args:
        bam_file: Path to the input BAM file.
        ref_length: Expected number of reference positions; a warning is
            logged when the number of positions reported by
            ``samtools depth`` differs.
        platform: Sequencing platform; only ``'ILLUMINA'`` triggers the
            pre-filtering step.
        read_length: Minimum summed length a read needs to be kept by the
            pre-filter.
        verbose: If true, set the module logger to DEBUG.
        temp: Optional directory for the temporary filtered BAM.

    Returns:
        tuple: ``(pct_10, pct_20, mean_depth, reads)`` where ``pct_10`` is
        the percentage of positions with depth >= 10, ``pct_20`` the
        percentage with depth > 20, ``mean_depth`` the average depth over
        all positions, and ``reads`` the idxstats mapped-read count for
        reference ``MN908947.3``. ``(0, 0, 0, 0)`` is returned when samtools
        fails, an OSError occurs, or no positions are reported.
    """
    if verbose:
        log.setLevel(logging.DEBUG)
    recovery_20 = 0
    recovery_10 = 0
    total_bases = 0
    reads = 0
    coverage = []
    unmapped_reads = 0
    # Bound up front so the ``finally`` clause is safe even when creating
    # the temporary file itself fails.
    fd = None
    try:
        # If illumina create temp bam of full mapped reads
        if platform == 'ILLUMINA':
            log.debug('Creating temp file and filtering for ' + os.path.basename(bam_file))
            fd = tempfile.NamedTemporaryFile(prefix='ronaldo', dir=temp or None)
            bam = pysam.AlignmentFile(bam_file, "rb")
            if not bam.has_index():
                # Release the un-indexed handle before reopening.
                bam.close()
                pysam.index(bam_file)
                bam = pysam.AlignmentFile(bam_file, "rb")
            # Pre filter full mapped reads.
            temp_bam = pysam.AlignmentFile(fd, 'wb', template=bam)
            for read in bam.fetch():
                if not read.is_unmapped:
                    # NOTE(review): pysam documents cigar entries as
                    # (operation, length) tuples, so this subtracts the
                    # operation code from the length - confirm that is the
                    # intended measure of aligned length.
                    match_lengths = [match[1] - match[0] for match in read.cigar]
                    if sum(match_lengths) >= read_length:
                        temp_bam.write(read)
            bam.close()
            temp_bam.close()
            clean_bam_file = fd.name
        else:
            clean_bam_file = bam_file
        log.debug('Fetching read stats for ' + os.path.basename(bam_file))
        log.debug('Path: ' + clean_bam_file)
        pysam.index(clean_bam_file)
        for stat_line in pysam.idxstats(clean_bam_file).split('\n'):
            fields = stat_line.split('\t')
            if stat_line.startswith('*'):
                # Parsed for completeness; not part of the return value.
                unmapped_reads = int(fields[2]) + int(fields[3])
            elif stat_line.startswith('MN908947.3'):
                reads = int(fields[2])
            elif len(stat_line) > 1:
                log.warning(f'Other reference file detected, {fields[0]}.')
        log.debug('Calculating depth for ' + os.path.basename(bam_file))
        for coord_line in pysam.depth(clean_bam_file, '-a', '-d', '0').split('\n'):
            coord = coord_line.split('\t')
            if len(coord) > 2:
                depth = int(coord[2])
                total_bases += 1
                coverage.append(float(depth))
                if depth >= 10:
                    recovery_10 += 1
                # NOTE(review): strict '>' here versus '>=' for the 10x
                # threshold above - confirm the asymmetry is intended.
                if depth > 20:
                    recovery_20 += 1
    except (pysam.utils.SamtoolsError, OSError) as ex:
        log.error(f'Error opening BAM file {bam_file}')
        log.error(ex)
        return 0,0,0,0
    finally:
        if fd is not None:
            # Closing the NamedTemporaryFile deletes the filtered BAM; the
            # index written next to it must be removed separately.
            fd.close()
            index_path = fd.name + '.bai'
            if os.path.exists(index_path):
                os.remove(index_path)
    if total_bases != ref_length and total_bases > 0:
        log.warning(f'SARSCOV reference genome not expected size, found {total_bases}')
    if total_bases == 0 or not coverage:
        log.warning(f'No reads mapped at all in  {bam_file}')
        return 0, 0, 0, 0
    return round(float(recovery_10) / total_bases * 100, 2) , round(float(recovery_20) / total_bases * 100, 2), round(float(sum(coverage)) / len(coverage),3), reads