def callbase(bamfile, snpsites, out):
    """Tally the read bases observed at every SNP site and write a table.

    Each non-comment line of ``snpsites`` is parsed with
    ``ParseSNPsitesLine``; the reads piled up on that position in the BAM
    file are counted per base and one tab-separated row is written to
    ``out``.

    Args:
        bamfile: path of the BAM file, opened with ``Samfile(bamfile, 'rb')``.
        snpsites: path of the SNP-site file; lines starting with ``#`` are
            skipped.
        out: path of the result file (opened in ``'w'`` mode, so any
            existing content is replaced).

    Output columns: ref_name, pos (1-based, as in the site file), Rbase,
    Abase, then the counts for A, T, C, G, N and "others" (any other
    character found in the read sequence).
    """
    BF = Samfile(bamfile, 'rb')  # open your bam file
    try:
        # ``with`` guarantees both text files are flushed and closed, even
        # when parsing or the pileup raises.
        with open(snpsites, 'r') as SF, open(out, 'w') as RF:
            RF.write('ref_name\tpos\tRbase\tAbase\tA\tT\tC\tG\tN\tothers\n')
            for record in SF:
                if record.startswith('#'):
                    continue
                line = ParseSNPsitesLine(record)
                vcf_pos = line.pos - 1  # change 1-based to 0-based
                vcf_refname = line.chrom
                # Read the alleles from the site line up front: a site that
                # no pileup column covers must still report its own alleles
                # (with zero counts) instead of failing or reusing the
                # alleles of the previous site.
                vcf_Rbase = line.Rbase
                vcf_Abase = line.Abase
                # Single parenthesised argument: prints the same text on
                # Python 2 and Python 3.
                print('processing: %s %s...' % (vcf_refname, str(vcf_pos)))
                counts = {'A': 0, 'T': 0, 'C': 0, 'G': 0, 'N': 0}
                othert = 0
                for column in BF.pileup(vcf_refname, vcf_pos, vcf_pos + 1):
                    # pileup() may yield neighbouring columns as well; only
                    # the requested position is counted.
                    if column.pos != vcf_pos:
                        continue
                    for read in column.pileups:
                        yourbase = read.alignment.seq[read.qpos]
                        if yourbase in counts:
                            counts[yourbase] += 1
                        else:
                            othert += 1
                RF.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                    vcf_refname, str(vcf_pos + 1), vcf_Rbase, vcf_Abase,
                    str(counts['A']), str(counts['T']), str(counts['C']),
                    str(counts['G']), str(counts['N']), str(othert)))
    finally:
        BF.close()
示例#2
0
    def removeEdgeMismatches(self, bamFile, minDistance, minBaseQual):
        """Drop variants that are only supported by bases close to a read edge.

        For every variant in ``self.variantDict`` the pileup column at the
        variant position is inspected. The variant is kept when at least one
        read (that is neither a deletion nor a reference skip at this
        column) shows ``variant.alt`` with a base quality of at least
        ``minBaseQual`` at a distance of at least ``minDistance``. The
        distance is ``query_position`` for forward reads and
        ``abs(alen - query_position)`` for reverse reads. All other variants
        are deleted from ``self.variantDict`` in place.

        Args:
            bamFile: path of the BAM file (opened with ``Samfile(..., "rb")``).
            minDistance: minimum distance; converted with ``int()``.
            minBaseQual: minimum base quality of the supporting base.
        """
        startTime = Helper.getTime()
        minDistance = int(minDistance)
        counter = 0
        j = 0  # number of deleted variants
        num_lines = len(self.variantDict)
        Helper.info(
            " [%s] remove Missmatches from the first %s bp from read edges" %
            (startTime.strftime("%c"), str(minDistance)), self.logFile,
            self.textField)

        bamFile = Samfile(bamFile, "rb")
        try:
            # Iterate over a snapshot of the keys: entries are deleted inside
            # the loop, and on Python 3 mutating a dict while iterating its
            # key view raises RuntimeError.
            for varKey in list(self.variantDict.keys()):
                variant = self.variantDict[varKey]

                counter += 1
                if counter % 10000 == 0:
                    Helper.status('%s mm parsed ' % counter, self.logFile,
                                  self.textField, "grey")

                keepSNP = False
                varPos = variant.position - 1
                columns = bamFile.pileup(variant.chromosome,
                                         variant.position - 1,
                                         variant.position)
                # walk over the columns of the region overlapping the position
                for x in columns:
                    if x.pos != varPos:
                        continue
                    for pileupread in x.pileups:  # walk through the single reads
                        if pileupread.is_del or pileupread.is_refskip:
                            continue
                        read = pileupread.alignment
                        qpos = pileupread.query_position
                        if read.is_reverse:
                            distance = abs(read.alen - qpos)
                        else:
                            distance = qpos
                        if distance < minDistance:
                            continue
                        # check read base and base quality
                        if (read.query_sequence[qpos] == variant.alt and
                                read.query_qualities[qpos] >= minBaseQual):
                            keepSNP = True

                if not keepSNP:
                    j += 1
                    del self.variantDict[varKey]
        finally:
            # release the BAM handle even if the pileup raises
            bamFile.close()

        Helper.status('%s of %svariants were deleted' % (j, num_lines),
                      self.logFile, self.textField, "black")
        Helper.printTimeDiff(startTime, self.logFile, self.textField)
示例#3
0
 def removeEdgeMismatches(self,bamFile,minDistance, minBaseQual):
     """Remove variants whose alternative base is only seen near read edges.

     A variant in ``self.variantDict`` survives when at least one read at
     its pileup column (ignoring deletions and reference skips) carries
     ``variant.alt`` with base quality >= ``minBaseQual`` at a distance
     >= ``minDistance``. The distance is ``query_position`` on forward
     reads and ``abs(alen - query_position)`` on reverse reads. Variants
     without such a read are deleted from ``self.variantDict`` in place.

     Args:
         bamFile: path of the BAM file (opened with ``Samfile(..., "rb")``).
         minDistance: minimum distance; converted with ``int()``.
         minBaseQual: minimum base quality of the supporting base.
     """
     startTime=Helper.getTime()
     minDistance=int(minDistance)
     counter=0
     j=0  # number of deleted variants
     num_lines = len(self.variantDict)
     Helper.info(" [%s] remove Missmatches from the first %s bp from read edges" % (startTime.strftime("%c"),str(minDistance)),self.logFile,self.textField)

     bamFile = Samfile(bamFile, "rb")
     try:
         # Take a snapshot of the keys first: the loop deletes entries, and
         # Python 3 raises RuntimeError when a dict changes size while its
         # key view is being iterated.
         for varKey in list(self.variantDict.keys()):
             variant = self.variantDict[varKey]

             counter+=1
             if counter%10000==0:
                 Helper.status('%s mm parsed ' % counter ,self.logFile, self.textField,"grey")

             keepSNP=False
             varPos=variant.position-1
             columns = bamFile.pileup(variant.chromosome, variant.position-1, variant.position)
             # walk over the columns of the region overlapping this position
             for x in columns:
                 if x.pos != varPos:
                     continue
                 for pileupread in x.pileups: # walk through the single reads
                     if pileupread.is_del or pileupread.is_refskip:
                         continue
                     read = pileupread.alignment
                     qpos = pileupread.query_position
                     distance = abs(read.alen-qpos) if read.is_reverse else qpos
                     if distance < minDistance:
                         continue
                     # check read base and base quality
                     if read.query_sequence[qpos] == variant.alt and read.query_qualities[qpos]>=minBaseQual:
                         keepSNP=True

             if not keepSNP:
                 j+=1
                 del self.variantDict[varKey]
     finally:
         # always release the BAM handle, also when the pileup fails
         bamFile.close()

     Helper.status('%s of %svariants were deleted' % (j,num_lines), self.logFile, self.textField,"black")
     Helper.printTimeDiff(startTime, self.logFile, self.textField)
示例#4
0
    def blatSearch(self, variants, outFile, minBaseQual, minMissmatch):
        """Re-align variant-supporting reads with blat and drop ambiguous variants.

        The method runs in three stages, each skipped when its output is
        already present:

        1. Write ``outFile + "_tmp.fa"`` with every read that shows
           ``variant.alt`` (base quality >= ``minBaseQual``) at a variant
           position, for variants with at least ``minMissmatch`` such reads.
        2. Run blat on that FASTA file, producing ``outFile + ".psl"``.
        3. If ``outFile`` itself does not exist, parse the psl file, decide
           per variant whether the blat hits pass, and delete failing
           variants from ``variants.variantDict`` in place.

        Args:
            variants: object whose ``variantDict`` maps keys to variants
                with ``chromosome``, ``position``, ``ref`` and ``alt``.
                NOTE(review): the keys are looked up against
                ``(chromosome, position, ref, alt)`` tuples built from the
                psl query names -- confirm they have that shape.
            outFile: path prefix for the temporary FASTA/psl files; the
                existence of ``outFile`` itself skips stage 3.
            minBaseQual: minimum base quality of a supporting base.
            minMissmatch: minimum number of supporting reads.
        """
        startTime = Helper.getTime()
        Helper.info(
            " [%s] Search non uniquely mapped reads" %
            (startTime.strftime("%c")), self.rnaEdit.logFile,
            self.rnaEdit.textField)

        bamFile = Samfile(self.bamFile, "rb")
        # create Fasta file for blat to remap the variant overlapping reads
        tempFasta = outFile + "_tmp.fa"
        try:
            # Only (re)create the FASTA file when it is missing or empty.
            if not os.path.isfile(tempFasta) or not os.path.getsize(
                    tempFasta) > 0:
                with open(tempFasta, "w+") as tempFastaFile:
                    mmNumberTotal = len(variants.variantDict)

                    #############################################
                    #########    CREATE FASTA FILE        #######
                    #############################################
                    Helper.info(
                        " [%s] Create fasta file for blat " %
                        (startTime.strftime("%c")), self.rnaEdit.logFile,
                        self.rnaEdit.textField)
                    counter = 1

                    if len(variants.variantDict) == 0:
                        Helper.error("No Variants left", self.rnaEdit.logFile,
                                     self.rnaEdit.textField)

                    for varKey in variants.variantDict.keys():
                        variant = variants.variantDict[varKey]
                        varPos = variant.position - 1
                        columns = bamFile.pileup(variant.chromosome,
                                                 variant.position - 1,
                                                 variant.position)
                        alignements = []
                        for x in columns:
                            if x.pos != varPos:
                                continue
                            # loop over reads of that position
                            for pileupread in x.pileups:
                                if pileupread.is_del or pileupread.is_refskip:
                                    continue
                                read = pileupread.alignment
                                qpos = pileupread.query_position
                                if (read.query_sequence[qpos] == variant.alt
                                        and read.query_qualities[qpos] >=
                                        minBaseQual):
                                    alignements.append(read.seq)

                        if len(alignements) >= minMissmatch:
                            # The record name encodes the variant plus a
                            # running read index; it is split on "-" again
                            # when the psl output is parsed below.
                            for missmatchReadCount, sequence in enumerate(
                                    alignements):
                                tempFastaFile.write(
                                    "> " + variant.chromosome + "-" +
                                    str(variant.position) + "-" +
                                    variant.ref + "-" + variant.alt + "-" +
                                    str(missmatchReadCount) + "\n" +
                                    sequence + "\n")

                        counter += 1
                        if counter % 1000 == 0:
                            sys.stdout.write("\r" + str(counter) + " of " +
                                             str(mmNumberTotal) +
                                             " variants done")
                            Helper.info(
                                str(counter) + " of " + str(mmNumberTotal) +
                                " variants done", self.rnaEdit.logFile,
                                self.rnaEdit.textField)
                            sys.stdout.flush()

                Helper.info("\n created fasta file " + tempFasta,
                            self.rnaEdit.logFile, self.rnaEdit.textField)
                Helper.printTimeDiff(startTime, self.rnaEdit.logFile,
                                     self.rnaEdit.textField)
        finally:
            # The BAM file is only needed for the FASTA stage; release it
            # on every path.
            bamFile.close()

        #############################
        #####   do blat search  #####
        #############################
        pslFile = outFile + ".psl"
        if not os.path.isfile(pslFile) or not os.path.getsize(pslFile) > 0:
            cmd = [
                self.rnaEdit.params.sourceDir + "blat", "-stepSize=5",
                "-repMatch=2253", "-minScore=20", "-minIdentity=0", "-noHead",
                self.rnaEdit.params.refGenome, tempFasta, pslFile
            ]
            # print cmd
            Helper.proceedCommand("do blat search for unique reads", cmd,
                                  tempFasta, "None", self.rnaEdit)
        Helper.info(" [%s] Blat finished" % (startTime.strftime("%c")),
                    self.rnaEdit.logFile, self.rnaEdit.textField)
        Helper.info(
            " [%s] Parse Blat output to look for non uniquely mapped reads" %
            (startTime.strftime("%c")), self.rnaEdit.logFile,
            self.rnaEdit.textField)

        if not os.path.isfile(outFile):
            blatDict = {}

            # summarize the blat hits per variant
            with open(pslFile, "r") as pslHandle:
                for line in pslHandle:
                    pslFields = line.split()
                    try:
                        # Parsing the query name sits inside the try so that
                        # a short or empty line is skipped with a warning
                        # instead of aborting the run.
                        chrom, pos, ref, alt, _readIndex = pslFields[
                            9].split("-")
                        varTuple = (chrom, int(pos), ref, alt)
                        blatScore = [
                            pslFields[0], pslFields[13], pslFields[17],
                            pslFields[18], pslFields[20]
                        ]  # #of Matches, targetName, blockCount, blockSize, targetStarts
                    except IndexError:
                        Helper.warning(
                            "Not enough Values in '%s' (Skip)" % line,
                            self.rnaEdit.logFile, self.rnaEdit.textField)
                        continue
                    blatDict.setdefault(varTuple, []).append(blatScore)

            siteDict = {}
            discardDict = {}
            Helper.info(
                " [%s] Analyse Blat hits (Slow)" % (startTime.strftime("%c")),
                self.rnaEdit.logFile, self.rnaEdit.textField)

            # Loop over all blat hits of a variant to judge the alignments.
            # NOTE(review): hits are grouped per variant, not per read, so
            # each variant is visited once and the counters below never
            # exceed 1 -- confirm this is the intended granularity.
            for varTuple, pslLine in blatDict.items():
                keepSNP = False
                chrom, pos, ref, alt = varTuple
                largestScore = 0
                largestScoreLine = pslLine[0]
                scoreArray = []

                # look for largest blatScore and save the largest line too
                for blatHit in pslLine:
                    lineScore = int(blatHit[0])
                    scoreArray.append(lineScore)
                    if lineScore > largestScore:
                        largestScore = lineScore
                        largestScoreLine = blatHit

                scoreArray.sort(reverse=True)
                if len(scoreArray) < 2:  # only one blat hit: pad runner-up
                    scoreArray.append(0)
                # best hit must be on the variant's chromosome and the
                # second best hit must score below 95 percent of the best
                if chrom == largestScoreLine[1] and scoreArray[
                        1] < scoreArray[0] * 0.95:
                    blockCount = int(largestScoreLine[2])
                    # psl lists end with a trailing comma, hence [:-1]
                    blockSizes = largestScoreLine[3].split(",")[:-1]
                    blockStarts = largestScoreLine[4].split(",")[:-1]
                    for i in range(blockCount):
                        startPos = int(blockStarts[i]) + 1
                        endPos = startPos + int(blockSizes[i])
                        if startPos <= pos < endPos:  # alignment overlaps mismatch
                            keepSNP = True

                if keepSNP:
                    siteDict[varTuple] = siteDict.get(varTuple, 0) + 1
                else:  # hits do not pass the blat criteria
                    discardDict[varTuple] = discardDict.get(varTuple, 0) + 1

            ##############################################################################
            #####        loop through variants and delete invalid variants          ######
            ##############################################################################
            Helper.info(
                " [%s] Deleting invalid variants" % (startTime.strftime("%c")),
                self.rnaEdit.logFile, self.rnaEdit.textField)

            mmNumberTotal = 0
            mmNumberTooSmall = 0
            mmReadsSmallerDiscardReads = 0
            # Snapshot of the keys: entries are deleted inside the loop, and
            # on Python 3 mutating a dict while iterating its key view
            # raises RuntimeError.
            for key in list(variants.variantDict.keys()):
                numberBlatReads = siteDict.get(key, 0)
                numberDiscardReads = discardDict.get(key, 0)

                if numberBlatReads <= minMissmatch and numberBlatReads <= numberDiscardReads:
                    del variants.variantDict[key]

                # count statistics
                if numberBlatReads < minMissmatch:
                    mmNumberTooSmall += 1
                elif numberBlatReads < numberDiscardReads:  # check if more reads fit the blat criteria than not
                    mmReadsSmallerDiscardReads += 1
                mmNumberTotal += 1

            if not self.rnaEdit.params.keepTemp:
                os.remove(tempFasta)
                os.remove(pslFile)

            # output statistics
            mmPassedNumber = mmNumberTotal - (mmNumberTooSmall +
                                              mmReadsSmallerDiscardReads)

            Helper.info(
                "\t\t %d out of %d passed blat criteria" %
                (mmPassedNumber, mmNumberTotal), self.rnaEdit.logFile,
                self.rnaEdit.textField)
            Helper.info(
                "\t\t %d Missmatches had fewer than %d missmatching-Reads." %
                (mmNumberTooSmall, minMissmatch), self.rnaEdit.logFile,
                self.rnaEdit.textField)
            Helper.info(
                "\t\t %d Missmatches had more missaligned reads than correct ones."
                % mmReadsSmallerDiscardReads, self.rnaEdit.logFile,
                self.rnaEdit.textField)

        Helper.printTimeDiff(startTime, self.rnaEdit.logFile,
                             self.rnaEdit.textField)
示例#5
0
def bam_depth_with_nm(args):
    """Print read depth of a BAM file stratified by the ``NM`` tag.

    Reads are filtered before counting: unmapped reads and reads whose
    ``sam_info.get_read_info(rec).overhang`` is positive are discarded.
    Every remaining read is put into bucket ``NM<k>`` for an ``NM`` tag of
    ``k <= args.max_nm`` and into ``NM_more`` otherwise.

    Original author's notes (NOTE(review): not verifiable from this
    function, they depend on how ``SamInfo`` computes ``overhang``):
        * both clipped is discarded
        * end clipped is included
        * multimap is included

    Output modes (tab-separated, written to stdout):
        default (``args.summary`` and ``args.read_count`` both false):
            one row per covered position; ``pos`` is 1-based.
        ``args.summary``:
            one row per contig with the number of covered positions.
        ``args.read_count``:
            one row per contig with the per-position counts summed up.

    Args:
        args: namespace providing ``bam``, ``region``, ``max_depth``,
            ``max_nm``, ``summary`` and ``read_count``.
    """
    sam = Samfile(args.bam)
    if args.region:
        c, s, e = parse_region(args.region)
        # The contig returned by parse_region is ``c``; the original passed
        # an undefined name here, which raised NameError for every region.
        it = sam.pileup(reference=c, start=s, end=e, max_depth=args.max_depth)
    else:
        it = sam.pileup(max_depth=args.max_depth)
    sam_info = SamInfo(sam)

    def cond(prec):
        """Return True when the pileup read should be counted."""
        rec = prec.alignment
        if rec.is_unmapped:
            return False
        read = sam_info.get_read_info(rec)
        return not read.overhang > 0

    max_key = 'NM_more'
    nm_keys = ['NM' + str(nm) for nm in range(args.max_nm + 1)] + [max_key]

    def get_key(prec):
        """Return the bucket name for the read's NM tag."""
        nm = prec.alignment.get_tag('NM')
        # ``<=`` so that the last explicit bucket, NM<max_nm>, is actually
        # used; with ``<`` that column was always zero and its reads were
        # reported as NM_more.
        if nm <= args.max_nm:
            return 'NM' + str(nm)
        return max_key

    header = ['contig', 'pos'] + nm_keys

    def iter_table(it):
        """Yield one record per pileup column with the count of each bucket."""
        Record = namedtuple('Record', header)
        for pcol in it:
            counts = Counter(get_key(p) for p in pcol.pileups if cond(p))
            yield Record(pcol.reference_name, pcol.pos + 1,
                         *(counts[k] for k in nm_keys))

    summary_header = ['contig', 'length', 'covered'] + nm_keys

    def iter_summary(it):
        """Yield one record per contig.

        Column NMx is the number of positions whose lowest non-empty bucket
        is NMx; ``covered`` is the sum over all buckets.
        """
        Record = namedtuple('Record', summary_header)

        def get_min_nm(row):
            # First bucket (in nm_keys order) with a count; None when every
            # read of the column was filtered out.
            for k in nm_keys:
                if getattr(row, k) > 0:
                    return k

        for contig, rows in groupby(iter_table(it), lambda row: row.contig):
            length = sam_info.get_length(contig)
            counts = Counter(get_min_nm(row) for row in rows)
            nm_counts = [counts[k] for k in nm_keys]
            yield Record(contig, length, sum(nm_counts), *nm_counts)

    read_count_header = ['contig', 'length', 'total'] + nm_keys

    def iter_read_counts(it):
        """Yield one record per contig.

        Column NMx is the sum of the per-position NMx counts over the
        contig; ``total`` is the sum over all buckets.
        """
        Record = namedtuple('Record', read_count_header)

        for contig, rows in groupby(iter_table(it), lambda row: row.contig):
            length = sam_info.get_length(contig)
            rows = list(rows)
            nm_counts = [sum(getattr(row, k) for row in rows)
                         for k in nm_keys]
            yield Record(contig, length, sum(nm_counts), *nm_counts)

    try:
        if args.summary:
            logging.info('Emit coverage summary')
            print(*summary_header, sep='\t')
            for row in iter_summary(it):
                print(*row, sep='\t')
        elif args.read_count:
            logging.info('Emit read counts')
            print(*read_count_header, sep='\t')
            for row in iter_read_counts(it):
                print(*row, sep='\t')
        else:
            print(*header, sep='\t')  # header
            for row in iter_table(it):
                print(*row, sep='\t')
    finally:
        # the pileup iterator is lazy, so the file must stay open until all
        # rows have been printed
        sam.close()
示例#6
0
    def from_bam(pysam_samfile, loci, normalized_contig_names=True):
        '''
        Create a PileupCollection for a set of loci from a BAM file.

        Parameters
        ----------
        pysam_samfile : `pysam.Samfile` instance, or filename string
            to a BAM file. The BAM file must be indexed. A file opened here
            from a filename is closed again before returning; an instance
            passed in by the caller is left open.

        loci : list of Locus instances
            Loci to collect pileups for. Each item is passed through
            `to_locus` first.

        normalized_contig_names : whether the contig names have been normalized
            (e.g. pyensembl removes the 'chr' prefix). Set to true to
            de-normalize the names when querying the BAM file.

        Returns
        -------
        PileupCollection instance containing pileups for the specified loci.
        All alignments in the BAM file are included (e.g. duplicate reads,
        secondary alignments, etc.). See `PileupCollection.filter` if these
        need to be removed. Loci without any aligned read, or on a contig
        missing from the BAM file, map to an empty Pileup.
        '''

        loci = [to_locus(obj) for obj in loci]

        close_on_completion = False
        if typechecks.is_string(pysam_samfile):
            pysam_samfile = Samfile(pysam_samfile)
            close_on_completion = True

        try:
            # Map from pyensembl normalized chromosome names used in Variant to
            # the names used in the BAM file.
            if normalized_contig_names:
                chromosome_name_map = {}
                for name in pysam_samfile.references:
                    normalized = pyensembl.locus.normalize_chromosome(name)
                    chromosome_name_map[normalized] = name
                    chromosome_name_map[name] = name
            else:
                chromosome_name_map = None

            result = PileupCollection({})

            # Optimization: we sort variants so our BAM reads are localized.
            locus_iterator = itertools.chain.from_iterable(
                (Locus.from_interbase_coordinates(locus_interval.contig, pos)
                 for pos in locus_interval.positions)
                for locus_interval in sorted(loci))
            for locus in locus_iterator:
                # Register an empty pileup first so every requested locus is
                # present in the result, even when it is skipped below.
                result.pileups[locus] = Pileup(locus, [])
                if normalized_contig_names:
                    try:
                        chromosome = chromosome_name_map[locus.contig]
                    except KeyError:
                        # logging.warning: ``logging.warn`` is a deprecated
                        # alias that newer Python versions no longer provide.
                        logging.warning("No such contig in bam: %s",
                                        locus.contig)
                        continue
                else:
                    chromosome = locus.contig
                columns = pysam_samfile.pileup(
                    chromosome,
                    locus.position,
                    locus.position + 1,  # exclusive, 0-indexed
                    truncate=True,
                    stepper="nofilter")
                try:
                    column = next(columns)
                except StopIteration:
                    # No reads align to this locus.
                    continue

                # Note that storing the pileups here is necessary, since the
                # subsequent assertion will invalidate our column.
                pileups = column.pileups
                assert list(columns) == []  # column is invalid after this.
                for pileup_read in pileups:
                    if not pileup_read.is_refskip:
                        element = PileupElement.from_pysam_alignment(
                            locus, pileup_read)
                        result.pileups[locus].append(element)
            return result
        finally:
            if close_on_completion:
                pysam_samfile.close()
示例#7
0
文件: core.py 项目: dnil/chanjo
def BamFile(bam_path):
  """Open an indexed BAM-file and return a function that reads its depths.

  .. code-block:: python

    >>> from chanjo.depth_reader import BamFile
    >>> read_depths = BamFile('./alignment.bam')

  Args:
    bam_path (path): path to alignment BAM-file

  Returns:
    function: ``reader(contig, start, end)`` bound to the opened BAM-file

  Raises:
    OSError: (``errno.ENOENT``) when ``bam_path`` doesn't exist, or when
      the test call to ``pileup()`` fails with a ``ValueError``, which is
      taken to mean that the index is missing.
  """
  # fail early on a missing file
  path_is_missing = not os.path.exists(bam_path)
  if path_is_missing:
    raise OSError(errno.ENOENT, bam_path)

  bam = Samfile(bam_path)

  try:
    # probing call: a BAM-file without index (".bai" file) raises ValueError
    bam.pileup()
  except ValueError:
    file_name = os.path.basename(bam_path)
    raise OSError(errno.ENOENT, "BAM-file (%s) must be indexed." % file_name)

  def reader(contig, start, end):
    """Return the read depth of every position in the interval (start, end).

    Positions without any read keep the value the preallocated array
    starts with.

    This function depends on `Pysam` >=0.7.5 since the ``truncate``
    option wasn't available in previous versions.

    .. code-block:: python

      >>> read_depths = BamFile('./alignment.bam')
      >>> read_depths('17', 1, 5)
      array([3., 4., 4., 5., 4.])

    .. note::

      Positions are 1:1-based: ``start=1, end=9`` returns the read depths
      of base pair positions 1-9.

    Args:
      contig (str): contig/chromosome id (str) of interest
      start (int): first position of the interval (1-based)
      end (int): last position of the interval (1-based)

    Returns:
      list or numpy.array: one read depth per position of the interval

    Raises:
      ValueError: when ``start`` is smaller than 1, or when the pileup
        itself raises ``ValueError`` (e.g. unknown contig).
    """
    # pysam works with 0-based start positions
    first = start - 1

    # pysam expects contig as bytes in Python 2
    contig_id = str(contig)

    if first < 0:
      raise ValueError("Start position must be > 0, not %d" % start)

    # one slot per position; pysam doesn't report positions without reads,
    # so those slots are never overwritten
    depths = prealloc_func(end - first)

    try:
      # ``truncate`` limits the columns to exactly the requested interval;
      # ``column.pos`` is 0-based, just like ``first``
      columns = bam.pileup(contig_id, first, end, truncate=True)
      for column in columns:
        offset = column.pos - first
        depths[offset] = column.n

    except ValueError as ve:
      # e.g. the contig doesn't exist in the BAM-file
      message = "Must use contig that exist in the Bam-file. Error: %s" % ve
      raise ValueError(message)

    return depths

  return reader