Exemplo n.º 1
0
def bamparse(args):
    """Parse a sorted bam file with pysamstats and store per-position base quality in ``parsedict``.

    The function still takes one positional argument holding three items, so callers that pass a packed
    ``(strain, target, bamfile)`` tuple keep working.

    :param args: iterable of exactly three items: strain, target, bamfile. ``target`` is passed to pysamstats
        as the reference fasta file and ``bamfile`` as the alignment file
    :return: the module-level ``parsedict`` after it has been updated
    :raises ValueError: if ``args`` does not contain exactly three items
    """
    global parsedict
    # Tuple parameters in the signature were removed in Python 3 (PEP 3113), so unpack explicitly instead
    strain, target, bamfile = args
    # Use the stat_baseq_ext (extended base quality statistics) function of pysamstats to iterate over the
    # records of the bam file
    for rec in pysamstats.stat_baseq_ext(alignmentfile=bamfile, fafile=target):
        # Simple filter: keep a position only if matches outnumber mismatches and the total depth is 5 or more
        if rec['matches'] > rec['mismatches'] and rec['reads_all'] > 4:
            # Keyed by strain -> target -> reference name -> position (as a float) -> depth; the stored value is
            # the RMS base quality. NOTE(review): parsedict presumably auto-creates nested levels - confirm
            parsedict[strain][target][rec['chrom']][float(rec['pos'])][rec['reads_all']] = rec['rms_baseq']
    # dotter() is defined elsewhere in the file - presumably a progress indicator; confirm
    dotter()
    return parsedict
def bamparse(args):
    """Parse a sorted bam file with pysamstats and store per-position base quality in ``parsedict``.

    The function still takes one positional argument holding three items, so callers that pass a packed
    ``(strain, target, bamfile)`` tuple keep working.

    :param args: iterable of exactly three items: strain, target, bamfile. ``target`` is passed to pysamstats
        as the reference fasta file and ``bamfile`` as the alignment file
    :return: the module-level ``parsedict`` after it has been updated
    :raises ValueError: if ``args`` does not contain exactly three items
    """
    global parsedict
    # Tuple parameters in the signature were removed in Python 3 (PEP 3113), so unpack explicitly instead
    strain, target, bamfile = args
    # Use the stat_baseq_ext (extended base quality statistics) function of pysamstats to iterate over the
    # records of the bam file
    for rec in pysamstats.stat_baseq_ext(alignmentfile=bamfile, fafile=target):
        # Simple filter: keep a position only if matches outnumber mismatches and the total depth is 5 or more
        if rec['matches'] > rec['mismatches'] and rec['reads_all'] > 4:
            # Keyed by strain -> target -> reference name -> position (as a float) -> depth; the stored value is
            # the RMS base quality. NOTE(review): parsedict presumably auto-creates nested levels - confirm
            parsedict[strain][target][rec['chrom']][float(rec['pos'])][rec['reads_all']] = rec['rms_baseq']
    # dotter() is defined elsewhere in the file - presumably a progress indicator; confirm
    dotter()
    return parsedict
Exemplo n.º 3
0
 def _bowtie(self, raw, db):
     """Map reads against a reference with bowtie2 and summarise pysamstats records per reference sequence.

     :param raw: iterable of read file paths. Exactly two paths are passed to Bowtie2CommandLine as ``m1`` and
         ``m2``; any other number is comma-joined and passed as ``U``
     :param db: path to the reference. Its absolute path without the extension is used as the ``bt2`` argument,
         and the path itself is given to pysamstats as the fasta file
     :return: dict mapping each reference name reported by pysamstats to a dict with the keys ``identity``
         (number of records seen), ``depth`` (sum of ``reads_all``) and ``quality`` (sum of ``rms_baseq``)
     """
     # communicate() waits for samtools to exit and closes the pipe, which reading .stdout directly never did
     output = Popen(['samtools', '--version'], stdout=PIPE, stderr=STDOUT).communicate()[0]
     if not isinstance(output, str):
         # Pipes yield bytes on Python 3
         output = output.decode()
     version = output.split('\n')[0].split()[1]
     # SAMtools sort v1.3 has different run parameters. Compare the version numerically: as plain strings,
     # "1.10" sorts before "1.3" and would wrongly select the old parameters
     numeric = []
     for piece in version.split('.'):
         digits = ''
         for char in piece:
             if not char.isdigit():
                 break
             digits += char
         if not digits:
             break
         numeric.append(int(digits))
     # A version token without any leading digits is not considered legacy
     legacy_sort = bool(numeric) and tuple(numeric) < (1, 3)
     # Build a real list so that len() and indexing work on Python 3 as well
     raw = [os.path.abspath(fastq) for fastq in raw]
     if len(raw) == 2:
         name = lcs(*raw)
         indict = dict(("m" + str(x), fastq) for x, fastq in enumerate(raw, 1))
     else:
         # dict(("U", value)) treats each element as a key/value pair and raises ValueError; use a literal
         indict = {"U": ",".join(raw)}
         # splitext needs a single path rather than the whole list, so derive the name from the first file
         name = os.path.splitext(raw[0])[0]
     workingdir = name + "tmp"
     make_path(workingdir)
     name += ".sorted.bam"
     if legacy_sort:
         samsort = SamtoolsSortCommandline(input_bam="-", out_prefix=name)
     else:
         samsort = SamtoolsSortCommandline(input_bam=name, o=True, out_prefix="-")
     indict.update(dict(samtools=[SamtoolsViewCommandline(b=True, S=True, input_file="-"), samsort]))
     # Skip the mapping and indexing steps when their outputs already exist
     if not os.path.isfile(name):
         Bowtie2CommandLine(bt2=os.path.splitext(os.path.abspath(db))[0],
                            threads=self.threads,
                            very_sensitive_local=True,
                            a=True,
                            **indict)(cwd=workingdir)
     if not os.path.isfile(name + ".bai"):
         SamtoolsIndexCommandline(input_bam=name)(cwd=workingdir)
     # os.rmdir only removes an empty directory and raises OSError otherwise
     os.rmdir(workingdir)
     genes = dict()
     for rec in pysamstats.stat_baseq_ext(alignmentfile=name, fafile=db):
         if rec['chrom'] not in genes:
             genes[rec['chrom']] = dict(identity=0.0, depth=0.0, quality=0.0)
         # Accumulate every record, including the first one for a reference, so that the quality total covers
         # the same records as the identity and depth totals
         totals = genes[rec['chrom']]
         totals['identity'] += 1.0
         totals['depth'] += float(rec['reads_all'])
         totals['quality'] += float(rec['rms_baseq'])
     return genes
Exemplo n.º 4
0
def run_pysamstats_baseq(bamFile, refFile, baseq, mapq, sampleName, record):
    """Return the first extended base-quality record pysamstats yields for a variant record.

    The bam file is queried on ``record.CHROM`` starting at ``record.POS``. Only the first record yielded by
    ``pysamstats.stat_baseq_ext`` is used. It is annotated with the alternate allele, the queried position and
    the sample name, and its keys are arranged in a fixed column order.

    :param bamFile: path of the alignment file, opened with ``pysam.AlignmentFile``
    :param refFile: reference passed as the second positional argument to ``pysamstats.stat_baseq_ext``
    :param baseq: value passed as ``min_baseq``
    :param mapq: value passed as ``min_mapq``
    :param sampleName: value stored under the ``Tumor_Sample_Barcode`` key
    :param record: object providing ``CHROM``, ``POS`` and ``ALT`` (only ``ALT[0]`` is used)
    :return: ``collections.OrderedDict`` holding the annotated record, or ``None`` if no record is yielded
    :raises ValueError: if the record contains a key that is not listed in the column order
    """
    keyorder = [
        'Tumor_Sample_Barcode', 'chrom', 'pos', 'ref', 'alt', 'reads_all',
        'reads_fwd', 'reads_rev', 'reads_pp', 'reads_pp_fwd', 'reads_pp_rev',
        'matches', 'matches_fwd', 'matches_rev', 'matches_pp',
        'matches_pp_fwd', 'matches_pp_rev', 'mismatches', 'mismatches_fwd',
        'mismatches_rev', 'mismatches_pp', 'mismatches_pp_fwd',
        'mismatches_pp_rev', 'rms_baseq', 'rms_baseq_fwd', 'rms_baseq_rev',
        'rms_baseq_pp', 'rms_baseq_pp_fwd', 'rms_baseq_pp_rev',
        'rms_baseq_matches', 'rms_baseq_matches_fwd', 'rms_baseq_matches_rev',
        'rms_baseq_matches_pp', 'rms_baseq_matches_pp_fwd',
        'rms_baseq_matches_pp_rev', 'rms_baseq_mismatches',
        'rms_baseq_mismatches_fwd', 'rms_baseq_mismatches_rev',
        'rms_baseq_mismatches_pp', 'rms_baseq_mismatches_pp_fwd',
        'rms_baseq_mismatches_pp_rev'
    ]
    chromosome = record.CHROM
    position = record.POS
    alt = record.ALT[0]
    bam_to_process = pysam.AlignmentFile(bamFile)
    # Close the alignment file on every exit path; it was previously left open
    try:
        for rec in pysamstats.stat_baseq_ext(bam_to_process,
                                             refFile,
                                             chrom=chromosome,
                                             start=position,
                                             end=None,
                                             min_mapq=mapq,
                                             min_baseq=baseq,
                                             no_del=False,
                                             no_dup=True,
                                             one_based=True,
                                             truncate=True):
            rec['alt'] = alt
            # NOTE(review): this overwrites the position reported by pysamstats with the queried position;
            # confirm that the first record always corresponds to record.POS
            rec['pos'] = position
            rec['Tumor_Sample_Barcode'] = sampleName
            # Only the first record is wanted, so return from inside the loop
            return collections.OrderedDict(
                sorted(rec.items(), key=lambda item: keyorder.index(item[0])))
        # No record was yielded for this query
        return None
    finally:
        bam_to_process.close()