from pysam import Samfile


# Script-level snippet: `ParseSNPsitesLine` is a project helper that parses one
# SNP-site line (see the hedged sketch after this function).
def callbase(bamfile, snpsites, out):
    BF = Samfile(bamfile, 'rb')  # open the BAM file
    SF = open(snpsites, 'r')  # file containing the SNP site info
    RF = open(out, 'w')  # result file
    RF.write('ref_name\tpos\tRbase\tAbase\tA\tT\tC\tG\tN\tothers\n')
    for i in SF:
        if i.startswith('#'):
            continue
        line = ParseSNPsitesLine(i)
        vcf_pos = line.pos - 1  # convert 1-based to 0-based
        vcf_refname = line.chrom
        print('processing: %s %s...' % (vcf_refname, str(vcf_pos)))
        At, Tt, Ct, Gt, Nt, othert = 0, 0, 0, 0, 0, 0
        for col in BF.pileup(vcf_refname, vcf_pos, vcf_pos + 1):
            if col.pos != vcf_pos:
                continue
            vcf_Rbase = line.Rbase
            vcf_Abase = line.Abase
            for j in col.pileups:
                if j.query_position is None:
                    # deletion or reference skip: the read has no base here
                    othert += 1
                    continue
                yourbase = j.alignment.query_sequence[j.query_position]
                if yourbase == 'A':
                    At += 1
                elif yourbase == 'T':
                    Tt += 1
                elif yourbase == 'C':
                    Ct += 1
                elif yourbase == 'G':
                    Gt += 1
                elif yourbase == 'N':
                    Nt += 1
                else:
                    othert += 1
            RF.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                vcf_refname, str(vcf_pos + 1), vcf_Rbase, vcf_Abase,
                str(At), str(Tt), str(Ct), str(Gt), str(Nt), str(othert)))
    SF.close()
    RF.close()
    BF.close()
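# `ParseSNPsitesLine` is not shown in the snippet above. A minimal sketch of
# what it plausibly does, assuming VCF-like columns (CHROM, POS, ID, REF, ALT)
# with a 1-based POS; the real parser may differ.
from collections import namedtuple

SNPSite = namedtuple('SNPSite', ['chrom', 'pos', 'Rbase', 'Abase'])


def ParseSNPsitesLine(raw_line):
    fields = raw_line.rstrip('\n').split('\t')
    # VCF column order: CHROM, POS, ID, REF, ALT
    return SNPSite(chrom=fields[0], pos=int(fields[1]),
                   Rbase=fields[3], Abase=fields[4])


# Example call (hypothetical file names): count per-base support at every
# listed SNP site and write a TSV of A/T/C/G/N/other counts.
# callbase('sample.bam', 'snp_sites.vcf', 'base_counts.tsv')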
# Method snippet: pysam's Samfile and the project-level Helper class are
# assumed to be imported by the enclosing module.
def removeEdgeMismatches(self, bamFile, minDistance, minBaseQual):
    startTime = Helper.getTime()
    minDistance = int(minDistance)
    counter = 0
    j = 0
    num_lines = len(self.variantDict)
    Helper.info(" [%s] remove mismatches within the first %s bp of read edges"
                % (startTime.strftime("%c"), str(minDistance)),
                self.logFile, self.textField)

    bamFile = Samfile(bamFile, "rb")

    # iterate over a copy of the keys because variants are deleted below
    for varKey in list(self.variantDict.keys()):
        variant = self.variantDict[varKey]
        counter += 1
        if counter % 10000 == 0:
            Helper.status('%s mm parsed' % counter, self.logFile,
                          self.textField, "grey")

        keepSNP = False
        varPos = variant.position - 1
        # walk over the pileup columns which overlap this position
        pileupIter = bamFile.pileup(variant.chromosome, variant.position - 1,
                                    variant.position)
        for x in pileupIter:
            if x.pos != varPos:
                continue
            for pileupread in x.pileups:  # walk through the single reads
                if pileupread.is_del or pileupread.is_refskip:
                    continue
                # distance of the variant position from the nearest read end
                if pileupread.alignment.is_reverse:
                    distance = abs(pileupread.alignment.alen -
                                   pileupread.query_position)
                else:
                    distance = pileupread.query_position
                if distance >= minDistance:
                    # check read base and base quality
                    readBase = pileupread.alignment.query_sequence[
                        pileupread.query_position]
                    baseQual = pileupread.alignment.query_qualities[
                        pileupread.query_position]
                    if readBase == variant.alt and baseQual >= minBaseQual:
                        keepSNP = True
        if not keepSNP:
            j += 1
            del self.variantDict[varKey]

    Helper.status('%s of %s variants were deleted' % (j, num_lines),
                  self.logFile, self.textField, "black")
    Helper.printTimeDiff(startTime, self.logFile, self.textField)
    bamFile.close()
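# Hypothetical usage sketch: `variantSet` stands for whatever object this
# method is bound to (it must provide variantDict, logFile and textField);
# the BAM path and thresholds are made up. Variants supported only by read
# bases within 6 bp of a read end, or below base quality 25, are removed.
# variantSet.removeEdgeMismatches("sample.bam", minDistance=6, minBaseQual=25)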
# Method snippet: os, sys, pysam's Samfile and the project-level Helper class
# are assumed to be imported by the enclosing module.
def blatSearch(self, variants, outFile, minBaseQual, minMissmatch):
    startTime = Helper.getTime()
    Helper.info(" [%s] Search non uniquely mapped reads"
                % (startTime.strftime("%c")),
                self.rnaEdit.logFile, self.rnaEdit.textField)
    bamFile = Samfile(self.bamFile, "rb")

    # create a FASTA file of the variant-overlapping reads for blat to remap;
    # it is only (re)created if it does not exist or is empty
    tempFasta = outFile + "_tmp.fa"
    if not os.path.isfile(tempFasta) or not os.path.getsize(tempFasta) > 0:
        tempFastaFile = open(tempFasta, "w+")
        mmNumberTotal = len(variants.variantDict)

        #############################################
        #########     CREATE FASTA FILE      #######
        #############################################
        Helper.info(" [%s] Create fasta file for blat "
                    % (startTime.strftime("%c")),
                    self.rnaEdit.logFile, self.rnaEdit.textField)
        counter = 1
        if len(variants.variantDict.keys()) == 0:
            Helper.error("No Variants left", self.rnaEdit.logFile,
                         self.rnaEdit.textField)
        for varKey in variants.variantDict.keys():
            variant = variants.variantDict[varKey]
            varPos = variant.position - 1
            pileupIter = bamFile.pileup(variant.chromosome,
                                        variant.position - 1,
                                        variant.position)
            alignments = []
            for x in pileupIter:
                if x.pos != varPos:
                    continue
                # loop over the reads of that position
                for pileupread in x.pileups:
                    if pileupread.is_del or pileupread.is_refskip:
                        continue
                    readBase = pileupread.alignment.query_sequence[
                        pileupread.query_position]
                    baseQual = pileupread.alignment.query_qualities[
                        pileupread.query_position]
                    if readBase == variant.alt and baseQual >= minBaseQual:
                        alignments.append(pileupread.alignment.query_sequence)
            if len(alignments) >= minMissmatch:
                missmatchReadCount = 0
                for sequence in alignments:
                    tempFastaFile.write(
                        "> " + variant.chromosome + "-" +
                        str(variant.position) + "-" + variant.ref + "-" +
                        variant.alt + "-" + str(missmatchReadCount) +
                        "\n" + sequence + "\n")
                    missmatchReadCount += 1
            counter += 1
            if counter % 1000 == 0:
                sys.stdout.write("\r" + str(counter) + " of " +
                                 str(mmNumberTotal) + " variants done")
                Helper.info(str(counter) + " of " + str(mmNumberTotal) +
                            " variants done",
                            self.rnaEdit.logFile, self.rnaEdit.textField)
                sys.stdout.flush()
        Helper.info("\n created fasta file " + tempFasta,
                    self.rnaEdit.logFile, self.rnaEdit.textField)
        Helper.printTimeDiff(startTime, self.rnaEdit.logFile,
                             self.rnaEdit.textField)
        tempFastaFile.close()

    #############################
    #####   do blat search  #####
    #############################
    pslFile = outFile + ".psl"
    if not os.path.isfile(pslFile) or not os.path.getsize(pslFile) > 0:
        cmd = [self.rnaEdit.params.sourceDir + "blat",
               "-stepSize=5", "-repMatch=2253", "-minScore=20",
               "-minIdentity=0", "-noHead",
               self.rnaEdit.params.refGenome, tempFasta, pslFile]
        Helper.proceedCommand("do blat search for unique reads", cmd,
                              tempFasta, "None", self.rnaEdit)

    Helper.info(" [%s] Blat finished" % (startTime.strftime("%c")),
                self.rnaEdit.logFile, self.rnaEdit.textField)
    Helper.info(" [%s] Parse blat output to look for non uniquely mapped reads"
                % (startTime.strftime("%c")),
                self.rnaEdit.logFile, self.rnaEdit.textField)

    if not os.path.isfile(outFile):
        # open the psl file and summarize the blat hits per variant
        pslFile = open(pslFile, "r")
        blatDict = {}
        for line in pslFile:
            pslFields = line.split()
            chr, pos, ref, alt, mmReadCount = pslFields[9].split("-")
            varTuple = (chr, int(pos), ref, alt)
            try:
                # number of matches, target name, block count, block sizes,
                # target starts
                blatScore = [pslFields[0], pslFields[13], pslFields[17],
                             pslFields[18], pslFields[20]]
            except IndexError:
                Helper.warning("Not enough values in '%s' (skip)" % line,
                               self.rnaEdit.logFile, self.rnaEdit.textField)
                continue
            if varTuple in blatDict:
                blatDict[varTuple] = blatDict[varTuple] + [blatScore]
            else:
                blatDict[varTuple] = [blatScore]

        siteDict = {}
        discardDict = {}
        Helper.info(" [%s] Analyse blat hits (slow)"
                    % (startTime.strftime("%c")),
                    self.rnaEdit.logFile, self.rnaEdit.textField)

        # loop over all blat hits of the mismatch reads to observe the
        # number of alignments per read
        for varTuple in blatDict.keys():
            keepSNP = False
            chr, pos, ref, alt = varTuple
            pslLine = blatDict[varTuple]
            largestScore = 0
            largestScoreLine = pslLine[0]
            scoreArray = []
            # look for the largest blat score and remember that hit
            for blatHit in pslLine:
                lineScore = int(blatHit[0])
                scoreArray.append(lineScore)
                if lineScore > largestScore:
                    largestScore = lineScore
                    largestScoreLine = blatHit
            scoreArray.sort(reverse=True)
            if len(scoreArray) < 2:
                # only one blat hit exists; pad with a zero score
                scoreArray.append(0)
            # keep the read if the best hit is on the same chromosome and the
            # second-best hit scores below 95 percent of the best hit
            if chr == largestScoreLine[1] and scoreArray[1] < scoreArray[0] * 0.95:
                blockCount = int(largestScoreLine[2])
                blockSizes = largestScoreLine[3].split(",")[:-1]
                blockStarts = largestScoreLine[4].split(",")[:-1]
                for i in range(blockCount):
                    startPos = int(blockStarts[i]) + 1
                    endPos = startPos + int(blockSizes[i])
                    # check whether the alignment block overlaps the mismatch
                    if pos >= startPos and pos < endPos:
                        keepSNP = True
            if keepSNP:
                siteDict[varTuple] = siteDict.get(varTuple, 0) + 1
            else:
                # the read does not pass the blat criteria
                discardDict[varTuple] = discardDict.get(varTuple, 0) + 1
        pslFile.close()

        ######################################################################
        #####   loop through variants and delete invalid variants       #####
        ######################################################################
        Helper.info(" [%s] Deleting invalid variants"
                    % (startTime.strftime("%c")),
                    self.rnaEdit.logFile, self.rnaEdit.textField)
        mmNumberTotal = 0
        mmNumberTooSmall = 0
        mmReadsSmallerDiscardReads = 0
        # iterate over a copy of the keys because variants are deleted below
        for key in list(variants.variantDict.keys()):
            numberBlatReads = 0
            numberDiscardReads = 0
            if key in siteDict:
                numberBlatReads = siteDict[key]
            if key in discardDict:
                numberDiscardReads = discardDict[key]
            if numberBlatReads <= minMissmatch and numberBlatReads <= numberDiscardReads:
                del variants.variantDict[key]
            # count statistics
            if numberBlatReads < minMissmatch:
                mmNumberTooSmall += 1
            elif numberBlatReads < numberDiscardReads:
                # more reads failed the blat criteria than passed it
                mmReadsSmallerDiscardReads += 1
            mmNumberTotal += 1

        if not self.rnaEdit.params.keepTemp:
            os.remove(tempFasta)
            os.remove(pslFile.name)

        # output statistics
        mmPassedNumber = mmNumberTotal - (mmNumberTooSmall +
                                          mmReadsSmallerDiscardReads)
        Helper.info("\t\t %d out of %d passed blat criteria"
                    % (mmPassedNumber, mmNumberTotal),
                    self.rnaEdit.logFile, self.rnaEdit.textField)
        Helper.info("\t\t %d mismatches had fewer than %d mismatching reads."
                    % (mmNumberTooSmall, minMissmatch),
                    self.rnaEdit.logFile, self.rnaEdit.textField)
        Helper.info("\t\t %d mismatches had more misaligned reads than correct ones."
                    % mmReadsSmallerDiscardReads,
                    self.rnaEdit.logFile, self.rnaEdit.textField)
        Helper.printTimeDiff(startTime, self.rnaEdit.logFile,
                             self.rnaEdit.textField)
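# Hypothetical usage sketch: `readFilter` stands for whatever object this
# method is bound to (it must provide bamFile and rnaEdit); the output prefix
# and thresholds are made up. Reads supporting each variant are remapped with
# blat, and variants whose supporting reads align better elsewhere are removed
# from variants.variantDict.
# readFilter.blatSearch(variants, "sample_blat", minBaseQual=25, minMissmatch=2)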
import logging
from collections import Counter, namedtuple
from itertools import groupby

from pysam import Samfile

# `parse_region` and `SamInfo` are project-level helpers assumed to be
# importable from the enclosing package.


def bam_depth_with_nm(args):
    """
    * unmapped reads are discarded
    * reads clipped on both ends are discarded
    * reads clipped at one end are included
    * multimapped reads are included
    * counts are stratified by NM (edit distance to the reference)

    default mode: pos is 1-based
    summary mode: per-contig covered-position counts
    """
    sam = Samfile(args.bam)
    if args.region:
        c, s, e = parse_region(args.region)
        it = sam.pileup(reference=c, start=s, end=e,
                        max_depth=args.max_depth)
    else:
        it = sam.pileup(max_depth=args.max_depth)
    sam_info = SamInfo(sam)

    def cond(prec):
        rec = prec.alignment
        if rec.is_unmapped:
            return False
        read = sam_info.get_read_info(rec)
        if read.overhang > 0:
            return False
        return True

    max_key = 'NM_more'
    nm_keys = ['NM' + str(nm) for nm in range(args.max_nm + 1)] + [max_key]

    def get_key(prec):
        rec = prec.alignment
        nm = rec.get_tag('NM')
        # `<=` so that nm == max_nm lands in its own column; with the original
        # `<` the NM{max_nm} column was always zero
        if nm <= args.max_nm:
            return 'NM' + str(nm)
        return max_key

    header = ['contig', 'pos'] + nm_keys

    def iter_table(it):
        Record = namedtuple('Record', header)
        for pcol in it:
            ps = filter(cond, pcol.pileups)
            counts = Counter(map(get_key, ps))
            yield Record(pcol.reference_name, pcol.pos + 1,
                         *(counts[k] for k in nm_keys))

    summary_header = ['contig', 'length', 'covered'] + nm_keys

    def iter_summary(it):
        """
        NMx is the number of covered positions whose minimum read edit
        distance to the reference is x.
        """
        Record = namedtuple('Record', summary_header)

        def get_min_nm(row):
            for k in nm_keys:
                if getattr(row, k) > 0:
                    return k

        it1 = iter_table(it)
        for contig, rows in groupby(it1, lambda row: row.contig):
            length = sam_info.get_length(contig)
            counts = Counter([get_min_nm(row) for row in rows])
            nm_counts = [counts[k] for k in nm_keys]
            covered = sum(nm_counts)
            yield Record(contig, length, covered, *nm_counts)

    read_count_header = ['contig', 'length', 'total'] + nm_keys

    def iter_read_counts(it):
        """
        NMx is the number of reads whose edit distance to the reference
        is x, summed over covered positions.
        """
        Record = namedtuple('Record', read_count_header)
        it1 = iter_table(it)
        for contig, rows in groupby(it1, lambda row: row.contig):
            length = sam_info.get_length(contig)
            rows = list(rows)
            counts = {}
            for k in nm_keys:
                counts[k] = sum(getattr(row, k) for row in rows)
            nm_counts = [counts[k] for k in nm_keys]
            total = sum(nm_counts)
            yield Record(contig, length, total, *nm_counts)

    if args.summary:
        logging.info('Emit coverage summary')
        print(*summary_header, sep='\t')
        for row in iter_summary(it):
            print(*row, sep='\t')
    elif args.read_count:
        logging.info('Emit read counts')
        print(*read_count_header, sep='\t')
        for row in iter_read_counts(it):
            print(*row, sep='\t')
    else:
        print(*header, sep='\t')  # header
        for row in iter_table(it):
            print(*row, sep='\t')
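# Hypothetical command-line driver for bam_depth_with_nm(), assuming the
# argparse attributes implied by the function body (bam, region, max_depth,
# max_nm, summary, read_count); the original tool's CLI wiring may differ.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('bam')
    parser.add_argument('--region', help='e.g. chr1:1000-2000')
    parser.add_argument('--max-depth', type=int, default=8000)
    parser.add_argument('--max-nm', type=int, default=5)
    parser.add_argument('--summary', action='store_true')
    parser.add_argument('--read-count', action='store_true')
    bam_depth_with_nm(parser.parse_args())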
# Module snippet: itertools, logging, pyensembl, typechecks, pysam's Samfile,
# and the local Locus/Pileup/PileupCollection/PileupElement/to_locus helpers
# are assumed to be imported at module level.
def from_bam(pysam_samfile, loci, normalized_contig_names=True):
    '''
    Create a PileupCollection for a set of loci from a BAM file.

    Parameters
    ----------
    pysam_samfile : `pysam.Samfile` instance, or filename string to a BAM
        file. The BAM file must be indexed.

    loci : list of Locus instances
        Loci to collect pileups for.

    normalized_contig_names : whether the contig names have been normalized
        (e.g. pyensembl removes the 'chr' prefix). Set to true to
        de-normalize the names when querying the BAM file.

    Returns
    ----------
    PileupCollection instance containing pileups for the specified loci.
    All alignments in the BAM file are included (e.g. duplicate reads,
    secondary alignments, etc.). See `PileupCollection.filter` if these
    need to be removed.
    '''
    loci = [to_locus(obj) for obj in loci]

    close_on_completion = False
    if typechecks.is_string(pysam_samfile):
        pysam_samfile = Samfile(pysam_samfile)
        close_on_completion = True

    try:
        # Map from pyensembl normalized chromosome names used in Variant to
        # the names used in the BAM file.
        if normalized_contig_names:
            chromosome_name_map = {}
            for name in pysam_samfile.references:
                normalized = pyensembl.locus.normalize_chromosome(name)
                chromosome_name_map[normalized] = name
                chromosome_name_map[name] = name
        else:
            chromosome_name_map = None

        result = PileupCollection({})

        # Optimization: we sort variants so our BAM reads are localized.
        locus_iterator = itertools.chain.from_iterable(
            (Locus.from_interbase_coordinates(locus_interval.contig, pos)
             for pos in locus_interval.positions)
            for locus_interval in sorted(loci))
        for locus in locus_iterator:
            result.pileups[locus] = Pileup(locus, [])
            if normalized_contig_names:
                try:
                    chromosome = chromosome_name_map[locus.contig]
                except KeyError:
                    logging.warning("No such contig in bam: %s" % locus.contig)
                    continue
            else:
                chromosome = locus.contig
            columns = pysam_samfile.pileup(
                chromosome,
                locus.position,
                locus.position + 1,  # exclusive, 0-indexed
                truncate=True,
                stepper="nofilter")
            try:
                column = next(columns)
            except StopIteration:
                # No reads align to this locus.
                continue

            # Note that storing the pileups here is necessary, since the
            # subsequent assertion will invalidate our column.
            pileups = column.pileups
            assert list(columns) == []  # column is invalid after this.
            for pileup_read in pileups:
                if not pileup_read.is_refskip:
                    element = PileupElement.from_pysam_alignment(
                        locus, pileup_read)
                    result.pileups[locus].append(element)
        return result
    finally:
        if close_on_completion:
            pysam_samfile.close()
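# Hypothetical usage sketch for from_bam(): the BAM path and coordinates are
# made up, and the two-argument Locus.from_interbase_coordinates call mirrors
# the one inside from_bam itself (interbase, i.e. 0-based half-open).
loci = [Locus.from_interbase_coordinates('20', 9999995)]
collection = from_bam('sample.bam', loci)
for locus, pileup in collection.pileups.items():
    print(locus, pileup)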
import errno
import os

from pysam import Samfile

# `prealloc_func` is a module-level helper (see the hedged sketch after this
# function).


def BamFile(bam_path):
    """Return an enclosed function to read read depths from ``bam_path``.

    .. code-block:: python

        >>> from chanjo.depth_reader import BamFile
        >>> read_depths = BamFile('./alignment.bam')

    Args:
        bam_path (path): path to alignment BAM-file

    Returns:
        function: function to read from the BAM-file
    """
    # raise an error if the file doesn't exist
    if not os.path.exists(bam_path):
        raise OSError(errno.ENOENT, bam_path)

    bam = Samfile(bam_path)

    try:
        bam.pileup()
    except ValueError:
        # catch error when BAM-file isn't indexed (+ ".bai" file)
        raise OSError(
            errno.ENOENT,
            "BAM-file (%s) must be indexed." % os.path.basename(bam_path)
        )

    def reader(contig, start, end):
        """Generate a list of read depths for each position (start, end).

        The `numpy` array is used to optimize performance when building
        and slicing the list.

        This function depends on `Pysam` >=0.7.5 since the ``truncate``
        option wasn't available in previous versions.

        .. code-block:: python

            >>> read_depths = BamFile('./alignment.bam')
            >>> read_depths('17', 1, 5)
            array([3., 4., 4., 5., 4.])

        .. note::
            Positions are expected to be 1-based at both ends. In other
            words; if start=1, end=9 you should expect read depths for
            base pair positions 1-9 to be returned.

        Args:
            contig (str): contig/chromosome id (str) of interest
            start (int): first position of the interval (1-based)
            end (int): last position of the interval (1-based)

        Returns:
            list or numpy.array: array of read depths for *each* position
                in the interval
        """
        # convert start to 0-based since this is what pysam expects!
        pysam_start = start - 1

        # pysam expects contig as bytes in Python 2
        pysam_contig = str(contig)

        # check that we don't have a negative start position
        if pysam_start < 0:
            raise ValueError("Start position must be > 0, not %d" % start)

        # preallocate an array of 0 read depth for each position;
        # pysam excludes positions with 0 read depth
        read_depths = prealloc_func(end - pysam_start)

        try:
            # overwrite read-covered positions (>0 read depth)
            # ``truncate`` ensures it starts and ends on the given positions
            # note: ``col.pos`` is 0-based, as is ``pysam_start``
            for col in bam.pileup(pysam_contig, pysam_start, end,
                                  truncate=True):
                read_depths[col.pos - pysam_start] = col.n
        except ValueError as ve:
            # catch errors where the contig doesn't exist in the BAM-file
            raise ValueError(
                "Must use contig that exists in the BAM-file. Error: %s" % ve)

        return read_depths

    return reader
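# `prealloc_func` is referenced but not shown in this snippet. A minimal
# sketch of what it plausibly is (a zero-filled numpy preallocation, matching
# the docstring's numpy output); the real chanjo implementation may differ.
import numpy as np


def prealloc_func(size):
    # one zeroed read-depth slot per position in the requested interval
    return np.zeros(size)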