Example #1
import os

import numpy as np
from pysam import AlignmentFile
# ChunkList and _between are helpers from the surrounding project.

def get_counts(args):
    """Count the fragments overlapping each region in a BED file.

    Writes one count per region to <out>.counts.txt.gz.
    """
    if args.out is None:
        args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])  
    chunks = ChunkList.read(args.bed)
    mat = np.zeros(len(chunks), dtype=int)
    bamHandle = AlignmentFile(args.bam)
    j = 0
    for chunk in chunks:
        for read in bamHandle.fetch(chunk.chrom, max(0, chunk.start - args.upper), chunk.end + args.upper):
            if read.is_proper_pair and not read.is_reverse:
                if args.atac:
                    #get left position
                    l_pos = read.pos + 4
                    #get insert size
                    #correct by 8 base pairs to be insertion-to-insertion
                    ilen = abs(read.template_length) - 8
                else:
                    l_pos = read.pos
                    ilen = abs(read.template_length)
                r_pos = l_pos + ilen - 1
                if _between(ilen, args.lower, args.upper) and (_between(l_pos, chunk.start, chunk.end) or _between(r_pos, chunk.start, chunk.end)):
                    mat[j] += 1
        j += 1
    bamHandle.close()
    np.savetxt(args.out + ".counts.txt.gz", mat, delimiter="\n", fmt='%i')
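A minimal driver for the function above, as a hedged sketch: types.SimpleNamespace stands in for the argparse namespace, the file names are placeholders, and ChunkList/_between are assumed importable from the surrounding project.

from types import SimpleNamespace

# Hypothetical inputs: an indexed BAM plus a BED file of regions.
args = SimpleNamespace(bed="regions.bed", bam="sample.bam", out=None,
                       lower=0, upper=2000, atac=True)
get_counts(args)  # writes regions.counts.txt.gz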
Example #2
def fetch_count_read(alignment_file, seq_name, start, end):
    """
    Count the number of reads that at least partly overlap a specified
    chromosomal region
    @param alignment_file Path to a SAM or BAM file
    @param seq_name Name of the reference sequence reads are aligned to
    @param start Start genomic coordinate of the region of alignment
    @param end End genomic coordinate of the region of alignment
    """
    # Specific imports
    from pysam import AlignmentFile
    
    # Init a generator on the sam or bam file with pysam
    if alignment_file[-3:].lower() == "bam":
        al = AlignmentFile(alignment_file, "rb")
        
    elif alignment_file[-3:].lower() == "sam":
        al = AlignmentFile(alignment_file, "r")
    
    else:
        raise Exception("Wrong file format (sam or bam)") 
    
    # Count reads aligned at least partly within the specified region
    n = 0
    for i in al.fetch(seq_name, start, end):
        n += 1
        
    al.close()
    
    return n
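A hedged usage sketch: fetch() on a BAM requires a .bai index next to the file, and the coordinates below are illustrative.

n = fetch_count_read("sample.bam", "chr1", 10000, 20000)
print("{} reads overlap chr1:10,000-20,000".format(n))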
Example #3
File: sam.py Project: acorg/dark-matter
from contextlib import contextmanager

@contextmanager
def samfile(filename):
    """
    A context manager to open and close a SAM/BAM file.

    @param filename: A C{str} file name to open.
    """
    f = AlignmentFile(filename)
    try:
        yield f
    finally:
        f.close()
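With the @contextmanager decorator in place, the function can be used in a with block; the file name below is a placeholder.

with samfile("sample.bam") as f:
    print("%d reference sequences" % f.nreferences)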
Example #4
from pybedtools import BedTool
from pysam import AlignmentFile

def gather_sv_data(options, collection):
    # Read regions of interest BED file
    regions = BedTool(options.region_file)

    # Read BAM file
    bamfile = AlignmentFile(options.bam_file, "rb")

    # Intersect regions
    for reg in regions:
        for read in bamfile.fetch(reg.chrom, reg.start, reg.end):
            # print(read)
            if read.query_name.endswith("2d"):
                collection[read.query_name] = []
            if read.query_name.startswith("ctg"):
                collection[read.query_name] = []
                # print(read.reference_id, read.reference_start, read.reference_end)
                # print(read.query_name, read.query_alignment_start, read.query_alignment_end)

    bamfile.close()
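A hedged driver sketch: SimpleNamespace stands in for the options object, the paths are placeholders, and the BAM must be indexed for fetch() to work.

from types import SimpleNamespace

options = SimpleNamespace(region_file="roi.bed", bam_file="sample.bam")
collection = {}
gather_sv_data(options, collection)
print("%d 2d/ctg read names collected" % len(collection))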
Example #5
File: sam.py Project: bamueh/dark-matter
class PaddedSAM(object):
    """
    Obtain aligned (padded) queries from a SAM/BAM file.

    @param filename: The C{str} name of the SAM/BAM file.
    """
    def __init__(self, filename):
        self.samfile = AlignmentFile(filename)
        # self.referenceInsertions will be keyed by offset into the reference
        # sequence. The inserted bases would need to begin at this offset. The
        # value will be a Counter whose keys are the nucleotides proposed for
        # insertion, with a value indicating how many times the nucleotide was
        # proposed for insertion at that offset.
        self.referenceInsertions = defaultdict(Counter)

    def close(self):
        """
        Close the opened SAM/BAM file.
        """
        self.samfile.close()

    def referencesToStr(self, indent=0):
        """
        List the reference names and their lengths.

        @param indent: An C{int} number of spaces to indent each line.
        @return: A C{str} describing known reference names and their lengths.
        """
        samfile = self.samfile
        result = []
        indent = ' ' * indent
        for i in range(samfile.nreferences):
            result.append('%s%s (length %d)' % (
                indent, samfile.get_reference_name(i), samfile.lengths[i]))
        return '\n'.join(result)

    def queries(self, referenceName=None, minLength=0, rcSuffix='',
                dropSecondary=False, dropSupplementary=False,
                dropDuplicates=False, allowDuplicateIds=False,
                keepQCFailures=False, rcNeeded=False, padChar='-',
                queryInsertionChar='N'):
        """
        Produce padded (with gaps) queries according to the CIGAR string and
        reference sequence length for each matching query sequence.

        @param referenceName: The C{str} name of the reference sequence to
            print alignments for. This is only needed if the SAM/BAM alignment
            was made against multiple references *and* they are not all of the
            same length. If there is only one reference sequence or if all
            reference sequences are of the same length, there is no need to
            provide a reference name (i.e., pass C{None}).
        @param minLength: Ignore queries shorter than this C{int} value. Note
            that this refers to the length of the query sequence once it has
            been aligned to the reference. The alignment may introduce
            C{queryInsertionChar} characters into the read, and these are
            counted towards its length because the alignment is assuming the
            query is missing a base at those locations.
        @param rcSuffix: A C{str} to add to the end of query names that are
            reverse complemented. This is added before the /1, /2, etc., that
            are added for duplicated ids (if there are duplicates and
            C{allowDuplicateIds} is C{False}).
        @param dropSecondary: If C{True}, secondary matches will not be
            yielded.
        @param dropSupplementary: If C{True}, supplementary matches will not be
            yielded.
        @param dropDuplicates: If C{True}, matches flagged as optical or PCR
            duplicates will not be yielded.
        @param allowDuplicateIds: If C{True}, repeated query ids (due to
            secondary or supplemental matches) will not have /1, /2, etc.
            appended to their ids. So repeated ids may appear in the yielded
            FASTA.
        @param keepQCFailures: If C{True}, reads that are marked as quality
            control failures will be included in the output.
        @param rcNeeded: If C{True}, queries that are flagged as matching when
            reverse complemented will be reverse complemented when preparing
            the output sequences. This must be used if the program that
            created the SAM/BAM input flags reversed matches but does not
            also store the reverse complemented query.
        @param padChar: A C{str} of length one to use to pad queries with to
            make them the same length as the reference sequence.
        @param queryInsertionChar:  A C{str} of length one to use to insert
            into queries when the CIGAR string indicates that the alignment
            of a query would cause a deletion in the reference. This character
            is inserted as a 'missing' query character (i.e., a base that can
            be assumed to have been lost due to an error) whose existence is
            necessary for the match to continue.
        @raises UnequalReferenceLengthError: If C{referenceName} is C{None}
            and the reference sequence lengths in the SAM/BAM file are not all
            identical.
        @raises UnknownReference: If C{referenceName} does not exist.
        @return: A generator that yields C{Read} instances that are padded
            with gap characters to align them to the length of the reference
            sequence.
        """
        samfile = self.samfile

        if referenceName:
            referenceId = samfile.get_tid(referenceName)
            if referenceId == -1:
                raise UnknownReference(
                    'Reference %r is not present in the SAM/BAM file.'
                    % referenceName)
            referenceLength = samfile.lengths[referenceId]
        else:
            # No reference given. All references must have the same length.
            if len(set(samfile.lengths)) != 1:
                raise UnequalReferenceLengthError(
                    'Your SAM/BAM file has %d reference sequences, and their '
                    'lengths (%s) are not all identical.' % (
                        samfile.nreferences,
                        ', '.join(map(str, sorted(samfile.lengths)))))
            referenceId = None
            referenceLength = samfile.lengths[0]

        # Hold the count for each id so we can add /1, /2 etc to duplicate
        # ids (unless --allowDuplicateIds was given).
        idCount = Counter()

        MATCH_OPERATIONS = {CMATCH, CEQUAL, CDIFF}

        for read in samfile.fetch():
            query = read.query_sequence
            if (read.is_unmapped or
                    (read.is_secondary and dropSecondary) or
                    (read.is_supplementary and dropSupplementary) or
                    (read.is_duplicate and dropDuplicates) or
                    (read.is_qcfail and not keepQCFailures) or
                    (referenceId is not None and
                     read.reference_id != referenceId)):
                continue

            if read.is_reverse:
                if rcNeeded:
                    query = DNARead('id', query).reverseComplement().sequence
                if rcSuffix:
                    read.query_name += rcSuffix

            referenceStart = read.reference_start
            atStart = True
            queryIndex = 0
            referenceIndex = referenceStart
            alignedSequence = ''

            for operation, length in read.cigartuples:

                # The operations are tested in the order they appear in
                # https://samtools.github.io/hts-specs/SAMv1.pdf. It would be
                # more efficient to test them in order of frequency of
                # occurrence.
                if operation in MATCH_OPERATIONS:
                    atStart = False
                    alignedSequence += query[queryIndex:queryIndex + length]
                elif operation == CINS:
                    # Insertion to the reference. This consumes query bases but
                    # we don't output them because the reference cannot be
                    # changed.  I.e., these bases in the query would need to be
                    # inserted into the reference.  Remove these bases from the
                    # query but record what would have been inserted into the
                    # reference.
                    atStart = False
                    for i in range(length):
                        self.referenceInsertions[referenceIndex + i][
                            query[queryIndex + i]] += 1
                elif operation == CDEL:
                    # Delete from the reference. Some bases from the reference
                    # would need to be deleted to continue the match. So we put
                    # an insertion into the query to compensate.
                    atStart = False
                    alignedSequence += queryInsertionChar * length
                elif operation == CREF_SKIP:
                    # Skipped reference. Opens a gap in the query. For
                    # mRNA-to-genome alignment, an N operation represents an
                    # intron.  For other types of alignments, the
                    # interpretation of N is not defined. So this is unlikely
                    # to occur.
                    atStart = False
                    alignedSequence += queryInsertionChar * length
                elif operation == CSOFT_CLIP:
                    # Bases in the query that are not part of the match. We
                    # remove these from the query if they protrude before the
                    # start or after the end of the reference. According to the
                    # SAM docs, 'S' operations may only have 'H' operations
                    # between them and the ends of the CIGAR string.
                    if atStart:
                        # Don't set atStart=False, in case there's another 'S'
                        # operation.
                        unwantedLeft = length - referenceStart
                        if unwantedLeft > 0:
                            # The query protrudes left. Copy its right part.
                            alignedSequence += query[queryIndex + unwantedLeft:
                                                     queryIndex + length]
                            referenceStart = 0
                        else:
                            referenceStart -= length
                            alignedSequence += query[
                                queryIndex:queryIndex + length]
                    else:
                        unwantedRight = (
                            (referenceStart + len(alignedSequence) + length) -
                            referenceLength)

                        if unwantedRight > 0:
                            # The query protrudes right. Copy its left part.
                            alignedSequence += query[
                                queryIndex:queryIndex + length - unwantedRight]
                        else:
                            alignedSequence += query[
                                queryIndex:queryIndex + length]
                elif operation == CHARD_CLIP:
                    # Some bases have been completely removed from the query.
                    # This (H) can only be present as the first and/or last
                    # operation. There is nothing to do as the bases are simply
                    # not present in the query string in the SAM/BAM file.
                    pass
                elif operation == CPAD:
                    # This is "silent deletion from the padded reference",
                    # which consumes neither query nor reference.
                    atStart = False
                else:
                    raise ValueError('Unknown CIGAR operation: %r' % operation)

                if operation in _CONSUMES_QUERY:
                    queryIndex += length

                if operation in _CONSUMES_REFERENCE:
                    referenceIndex += length

            # Sanity check that we consumed the entire query.
            assert queryIndex == len(query)

            # We cannot check that we consumed the entire reference. The
            # CIGAR string applies to (i.e., exhausts) the query and is silent
            # about the part of the reference to the right of the aligned query.

            # Check the length restriction now that we have (possibly) added
            # queryInsertionChar characters to pad the query out to the length
            # it requires to match the reference.
            if len(alignedSequence) < minLength:
                continue

            # Put gap characters before and after the aligned sequence so that
            # it is offset properly and matches the length of the reference.
            paddedSequence = (
                (padChar * referenceStart) +
                alignedSequence +
                padChar * (referenceLength -
                           (referenceStart + len(alignedSequence))))

            if allowDuplicateIds:
                suffix = ''
            else:
                count = idCount[read.query_name]
                idCount[read.query_name] += 1
                suffix = '' if count == 0 else '/%d' % count

            yield Read('%s%s' % (read.query_name, suffix), paddedSequence)
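A hedged usage sketch for the class above; dark-matter's Read instances expose id and sequence attributes, and the file name is a placeholder.

ps = PaddedSAM("alignments.bam")
for read in ps.queries(rcNeeded=True, dropSecondary=True):
    print(">%s\n%s" % (read.id, read.sequence))
ps.close()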
Example #6
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception('ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception('ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception('ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' % (len(fas - bam)))
            if len(fas - bam) <= 50:
                print('\n'.join([('  - ' + c) for c in (fas - bam)]))
        if bam - fas:
            txt = ('\n'.join([('  - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs)==1 else None)
        # resize chomosomes
        for c in refs:
            if c not in mappability:
                mappability[c] = [float('nan')] * (len(refs) // opts.reso + 1)
            if len(mappability[c]) < len(refs) // opts.reso + 1:
                mappability[c] += [float('nan')] * (
                    (len(refs) // opts.reso + 1) - len(mappability[c]))
        # concatenates
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites  = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in range(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos-200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

        ## CHECK TO BE REMOVED
        # out = open('tmp_mappability.txt', 'w')
        # i = 0
        # for crm in refs:
        #     for pos in xrange(len(genome[crm]) / opts.reso + 1):
        #         out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso, pos * opts.reso + opts.reso, mappability[i]))
        #         i += 1
        # out.close()
        # compute GC content ~30 sec
        # TODO: read from DB
    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        min_perc=opts.min_perc, max_perc=opts.max_perc, seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path)

    bad_col_image = path.join(outdir, 'filtered_bins_%s_%s.png' % (
        nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords.png_%s_%s.png' % (
                                    opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only,
            savefig=inter_vs_gcoord)

        print ('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    out = open(bias_file, 'wb')

    dump({'biases'    : biases,
          'decay'     : decay,
          'badcol'    : badcol,
          'resolution': opts.reso}, out, HIGHEST_PROTOCOL)
    out.close()

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image,
                   len(badcol), len(biases), raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:
        # release lock anyway
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
Example #7

if __name__ == "__main__":
    import argparse

    from pysam import AlignmentFile
    # sam_to_bigGenePred comes from the surrounding project.

    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--bamfile",
                        help="the sorted and indexed bam file")
    parser.add_argument("-o", "--out", default="bigg.bed",
                        help="the output file name")

    args = parser.parse_args()

    # make a file using the functions
    samfile = AlignmentFile(args.bamfile)

    fw = open(args.out, "w")

    for n, record in enumerate(samfile):
        try:
            bigg = sam_to_bigGenePred(record)
            fw.write(bigg.to_str())
            fw.write("\n")
        except ValueError:
            pass
        # if n > 100:
        #     break

    fw.close()
    samfile.close()
Example #8
File: umis.py Project: roryk/umis
def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence,
             cb_histogram, cb_cutoff, no_scale_evidence, subsample):
    ''' Count up evidence for tagged molecules
    '''
    from pysam import AlignmentFile

    from io import StringIO
    import collections, itertools, os, re, sys, time

    import numpy as np
    import pandas as pd

    from utils import weigh_evidence

    logger.info('Reading optional files')

    gene_map = None
    if genemap:
        with open(genemap) as fh:
            try:
                gene_map = dict(p.strip().split() for p in fh)
            except ValueError:
                logger.error('Incorrectly formatted gene_map, need to be tsv.')
                sys.exit()

    if positional:
        tuple_template = '{0},{1},{2},{3}'
    else:
        tuple_template = '{0},{1},{3}'

    if not cb_cutoff:
        cb_cutoff = 0

    if cb_histogram and cb_cutoff == "auto":
        cb_cutoff = guess_depth_cutoff(cb_histogram)

    cb_cutoff = int(cb_cutoff)

    cb_hist = None
    filter_cb = False
    if cb_histogram:
        cb_hist = pd.read_table(cb_histogram, index_col=0, header=None, squeeze=True)
        total_num_cbs = cb_hist.shape[0]
        cb_hist = cb_hist[cb_hist > cb_cutoff]
        logger.info('Keeping {} out of {} cellular barcodes.'.format(cb_hist.shape[0], total_num_cbs))
        filter_cb = True

    parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')

    if subsample:
        logger.info('Creating reservoir of subsampled reads ({} per cell)'.format(subsample))
        start_sampling  = time.time()

        reservoir = collections.defaultdict(list)
        cb_hist_sampled = 0 * cb_hist
        cb_obs = 0 * cb_hist

        sam_mode = 'r' if sam.endswith(".sam") else 'rb'
        sam_file = AlignmentFile(sam, mode=sam_mode)
        track = sam_file.fetch(until_eof=True)
        current_read = 'none_observed_yet'
        for i, aln in enumerate(track):
            if aln.qname == current_read:
                continue

            current_read = aln.qname
            match = parser_re.match(aln.qname)
            CB = match.group('CB')

            if CB not in cb_hist.index:
                continue

            cb_obs[CB] += 1
            if len(reservoir[CB]) < subsample:
                reservoir[CB].append(i)
                cb_hist_sampled[CB] += 1
            else:
                s = np.random.randint(0, cb_obs[CB])
                if s < subsample:
                    reservoir[CB][s] = i

        index_filter = set(itertools.chain.from_iterable(reservoir.values()))
        sam_file.close()
        sampling_time = time.time() - start_sampling
        logger.info('Sampling done - {:.3}s'.format(sampling_time))

    evidence = collections.defaultdict(int)

    logger.info('Tallying evidence')
    start_tally = time.time()

    sam_mode = 'r' if sam.endswith(".sam") else 'rb'
    sam_file = AlignmentFile(sam, mode=sam_mode)
    track = sam_file.fetch(until_eof=True)
    count = 0
    unmapped = 0
    kept = 0
    nomatchcb = 0
    current_read = 'none_observed_yet'
    count_this_read = True
    for i, aln in enumerate(track):
        count += 1
        if not count % 100000:
            logger.info("Processed %d alignments, kept %d." % (count, kept))
            logger.info("%d were filtered for being unmapped." % unmapped)
            if filter_cb:
                logger.info("%d were filtered for not matching known barcodes."
                            % nomatchcb)

        if aln.is_unmapped:
            unmapped += 1
            continue

        if aln.qname != current_read:
            current_read = aln.qname
            if subsample and i not in index_filter:
                count_this_read = False
                continue
            else:
                count_this_read = True
        else:
            if not count_this_read:
                continue

        match = parser_re.match(aln.qname)
        CB = match.group('CB')
        if filter_cb:
            if CB not in cb_hist.index:
                nomatchcb += 1
                continue

        MB = match.group('MB')

        txid = sam_file.getrname(aln.reference_id)
        if gene_map:
            target_name = gene_map[txid]

        else:
            target_name = txid

        e_tuple = tuple_template.format(CB, target_name, aln.pos, MB)

        # Scale evidence by number of hits
        if no_scale_evidence:
            evidence[e_tuple] += 1.0
        else:
            evidence[e_tuple] += weigh_evidence(aln.tags)
        kept += 1

    tally_time = time.time() - start_tally
    logger.info('Tally done - {:.3}s, {:,} alns/min'.format(tally_time, int(60. * count / tally_time)))
    logger.info('Collapsing evidence')

    buf = StringIO()
    for key in evidence:
        line = '{},{}\n'.format(key, evidence[key])
        buf.write(line)

    buf.seek(0)
    evidence_table = pd.read_csv(buf)
    evidence_query = 'evidence >= %f' % minevidence
    if positional:
        evidence_table.columns=['cell', 'gene', 'umi', 'pos', 'evidence']
        collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])[['umi', 'pos']].size()

    else:
        evidence_table.columns=['cell', 'gene', 'umi', 'evidence']
        collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi'].size()

    expanded = collapsed.unstack().T

    if gene_map:
        # This Series is just for sorting the index
        genes = pd.Series(index=set(gene_map.values()))
        genes = genes.sort_index()
        # Now genes is assigned to a DataFrame
        genes = expanded.reindex(genes.index)

    else:
        genes = expanded

    genes.replace(np.nan, 0, inplace=True)

    logger.info('Output results')

    if subsample:
        cb_hist_sampled.to_csv('ss_{}_'.format(subsample) + os.path.basename(cb_histogram), sep='\t')

    if output_evidence_table:
        import shutil
        buf.seek(0)
        with open(output_evidence_table, 'w') as etab_fh:
            shutil.copyfileobj(buf, etab_fh)

    genes.to_csv(out)
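The subsampling step above is per-barcode reservoir sampling (algorithm R, with one reservoir per cell). A self-contained sketch of the same technique, detached from the umis code and using illustrative names:

import collections
import random

def reservoir_by_key(stream, k):
    """Keep at most k uniformly sampled values per key from a (key, value) stream."""
    reservoir = collections.defaultdict(list)
    seen = collections.Counter()
    for key, value in stream:
        seen[key] += 1
        if len(reservoir[key]) < k:
            # Reservoir not full yet: always keep.
            reservoir[key].append(value)
        else:
            # Replace a kept value with probability k / seen[key].
            s = random.randrange(seen[key])
            if s < k:
                reservoir[key][s] = value
    return reservoir

stream = [("AAA", 1), ("AAA", 2), ("BBB", 3), ("AAA", 4), ("AAA", 5)]
print(reservoir_by_key(stream, 2))  # at most 2 values per barcode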
Example #9
def load_hic_data_from_bam(fnam,
                           resolution,
                           biases=None,
                           tmpdir='.',
                           ncpus=8,
                           filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                           region=None,
                           nchunks=100,
                           verbose=True,
                           clean=True):
    """
    :param fnam: TADbit-generated BAM file with read-ends1 and read-ends2
    :param resolution: the resolution of the experiment (size of a bin in
       bases)
    :param None biases: path to pickle file where are stored the biases. Keys
       in this file should be: 'biases', 'badcol', 'decay' and 'resolution'
    :param '.' tmpdir: path to folder where to create temporary files
    :param 8 ncpus:
    :param (1, 2, 3, 4, 6, 7, 8, 9, 10) filter_exclude: filters to define the
       set of valid pairs of reads.
    :param None region: chromosome name, if None, all genome will be loaded
    :param 100 nchunks: maximum number of chunks into which to cut the BAM
    :param True verbose: speak
    :param True clean: remove temps

    :returns: HiC_data object
    """
    bam = AlignmentFile(fnam)
    genome_seq = OrderedDict((c, l) for c, l in zip(
        bam.references, [x // resolution + 1 for x in bam.lengths]))
    bam.close()

    sections = []
    if region:
        size = genome_seq[region]
        sections.extend([(region, i) for i in range(size)])
    else:
        for crm in genome_seq:
            len_crm = genome_seq[crm]
            sections.extend([(crm, i) for i in range(len_crm)])

        size = sum(genome_seq.values())

    chromosomes = {region: genome_seq[region]} if region else genome_seq
    dict_sec = dict([(j, i) for i, j in enumerate(sections)])
    imx = HiC_data((),
                   size,
                   chromosomes=chromosomes,
                   dict_sec=dict_sec,
                   resolution=resolution)

    if biases:
        if isinstance(biases, str):
            biases = load(open(biases, 'rb'))
        if biases['resolution'] != resolution:
            raise Exception('ERROR: resolution of biases does not match the '
                            'one wanted (%d vs %d)' %
                            (biases['resolution'], resolution))
        if region:
            chrom_start = 0
            chrom_end = 0
            for crm in genome_seq:
                if crm == region:
                    chrom_end = chrom_start + genome_seq[crm]
                    break
                len_crm = genome_seq[crm]
                chrom_start += len_crm
            imx.bias = dict((k - chrom_start, v)
                            for k, v in biases.get('biases', {}).items()
                            if chrom_start <= k < chrom_end)
            imx.bads = dict((k - chrom_start, v)
                            for k, v in biases.get('badcol', {}).items()
                            if chrom_start <= k < chrom_end)
        else:
            imx.bads = biases['badcol']
            imx.bias = biases['biases']
        imx.expected = biases['decay']

    get_matrix(fnam,
               resolution,
               biases=None,
               filter_exclude=filter_exclude,
               normalization='raw',
               tmpdir=tmpdir,
               clean=clean,
               ncpus=ncpus,
               nchunks=nchunks,
               dico=imx,
               region1=region,
               verbose=verbose)
    imx._symmetricize()
    imx.symmetricized = True

    return imx
Example #10
def load_hic_data_from_bam(fnam, resolution, biases=None, tmpdir='.', ncpus=8,
                           filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                           region=None, verbose=True, clean=True):
    """
    :param fnam: TADbit-generated BAM file with read-ends1 and read-ends2
    :param resolution: the resolution of the experiment (size of a bin in
       bases)
    :param None biases: path to pickle file where are stored the biases. Keys
       in this file should be: 'biases', 'badcol', 'decay' and 'resolution'
    :param '.' tmpdir: path to folder where to create temporary files
    :param 8 ncpus:
    :param (1, 2, 3, 4, 6, 7, 8, 9, 10) filter_exclude: filters to define the
       set of valid pairs of reads.
    :param None region: chromosome name, if None, all genome will be loaded

    :returns: HiC_data object
    """
    bam = AlignmentFile(fnam)
    genome_seq = OrderedDict((c, l) for c, l in
                             zip(bam.references,
                                 [x // resolution + 1 for x in bam.lengths]))
    bam.close()

    sections = []
    for crm in genome_seq:
        len_crm = genome_seq[crm]
        sections.extend([(crm, i) for i in range(len_crm)])

    size = sum(genome_seq.values())

    chromosomes = {region: genome_seq[region]} if region else genome_seq
    dict_sec = dict([(j, i) for i, j in enumerate(sections)])
    imx = HiC_data((), size, chromosomes=chromosomes, dict_sec=dict_sec,
                   resolution=resolution)

    if biases:
        if isinstance(biases, str):
            biases = load(open(biases, 'rb'))
        if biases['resolution'] != resolution:
            raise Exception('ERROR: resolution of biases does not match the '
                            'one wanted (%d vs %d)' % (
                                biases['resolution'], resolution))
        if region:
            chrom_start = 0
            for crm in genome_seq:
                if crm == region:
                    break
                len_crm = genome_seq[crm]
                chrom_start += len_crm
            imx.bads     = dict((b - chrom_start, biases['badcol'][b]) for b in biases['badcol'])
            imx.bias     = dict((b - chrom_start, biases['biases'][b]) for b in biases['biases'])
        else:
            imx.bads     = biases['badcol']
            imx.bias     = biases['biases']
        imx.expected = biases['decay']

    get_matrix(fnam, resolution, biases=None, filter_exclude=filter_exclude,
               normalization='raw', tmpdir=tmpdir, clean=clean,
               ncpus=ncpus, dico=imx, region1=region, verbose=verbose)
    imx._symmetricize()
    imx.symmetricized = True

    return imx
Example #11
class NameSortedBamSource(DataSource):
    name = "name_sorted_bam"
    version = "0.1.0"
    container = "dataframe"
    partition_access = False
    description = "Readname-sorted BAM of poreC alignments"

    def __init__(self, urlpath, include_unmapped=True, metadata=None):
        self._urlpath = urlpath
        self._include_unmapped = include_unmapped
        self._af = None
        self._dtype = None
        super(NameSortedBamSource, self).__init__(metadata=metadata)

    def _open_dataset(self):
        # TODO check that bam file has namesorted in header
        self._af = AlignmentFile(self._urlpath)

    def _get_schema(self):
        if self._af is None:
            self._open_dataset()
        chrom_names = list(self._af.references)
        assert "NULL" not in chrom_names
        dtype = BamEntryDf.DTYPE.copy()
        dtype["chrom"] = pd.CategoricalDtype(chrom_names + ["NULL"],
                                             ordered=True)
        self._dtype = dtype
        return Schema(datashape=None,
                      dtype=dtype,
                      shape=(None, len(dtype)),
                      npartitions=None,
                      extra_metadata={})

    def get_chrom_dtype(self):
        return self._schema.dtype["chrom"]

    @staticmethod
    def _group_by_read(
        align_iter: Iterator[AlignedSegment]
    ) -> Iterator[List[Tuple[int, int, AlignedSegment]]]:
        """Iterate over alignments in name-sorted bam file grouping by read"""
        current_read_name = None
        read_idx = 0
        aligns = []
        for align_idx, align in enumerate(align_iter):
            if current_read_name is None:
                current_read_name = align.query_name
                aligns.append((read_idx, align_idx, align))
            elif current_read_name == align.query_name:
                aligns.append((read_idx, align_idx, align))
            else:
                yield aligns
                read_idx += 1
                current_read_name = align.query_name
                aligns = [(read_idx, align_idx, align)]
        yield aligns

    def _align_to_tuple(self, align_data):
        read_idx, align_idx, align = align_data
        if align.is_unmapped:
            align_cat = "unmapped"
            chrom, start, end, align_score = "NULL", 0, 0, 0
            read_length = align.query_length
        else:
            chrom, start, end = (align.reference_name, align.reference_start,
                                 align.reference_end)
            read_length = align.infer_read_length()
            align_score = align.get_tag("AS")
            if align.is_secondary:
                align_cat = "secondary"
            elif align.is_supplementary:
                align_cat = "supplementary"
            else:
                align_cat = "primary"
        return (
            read_idx,
            align_idx,
            align_cat,
            chrom,
            start,
            end,
            not align.is_reverse,
            align.query_name,
            read_length,
            align.query_alignment_start,
            align.query_alignment_end,
            align.mapq,
            align_score,
        )

    def read_chunked(self,
                     chunksize=10000,
                     yield_aligns=False,
                     max_chunks=None):
        """Read the bam into a dataframe in chunks, yield those chunks
        """
        self._load_metadata()
        from toolz import partition_all

        BamEntryDf.set_dtype("chrom", self.get_chrom_dtype())
        align_iter = self._af.fetch(until_eof=self._include_unmapped)
        columns = list(self._schema.dtype.keys())
        for chunk_idx, chunk in enumerate(
                partition_all(chunksize, self._group_by_read(align_iter))):
            aligns = [a for read_aligns in chunk for a in read_aligns]
            df = (pd.DataFrame([self._align_to_tuple(a) for a in aligns],
                               columns=columns).astype(
                                   BamEntryDf.DTYPE).bamdf.cast(fillna=True,
                                                                subset=True))
            if yield_aligns:
                yield (aligns, df)
            else:
                yield (df)
            if max_chunks and chunk_idx == max_chunks - 1:
                break

    def _close(self):
        if self._af is not None:
            self._af.close()
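A hedged usage sketch for the intake-style source above: intake's DataSource base class supplies close() (which calls _close), the BAM must be name-sorted, and the file name is a placeholder.

src = NameSortedBamSource("namesorted.bam")
for df in src.read_chunked(chunksize=5000, max_chunks=2):
    print(df.shape)
src.close()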
Example #12
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception(
                'ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception(
                'ERROR: missing restriction enzyme name for oneD normalization'
            )
        if not opts.mappability:
            raise Exception(
                'ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' %
                  (len(fas - bam)))
            if len(fas - bam) <= 50:
                print('\n'.join([('  - ' + c) for c in (fas - bam)]))
        if bam - fas:
            txt = ('\n'.join([('  - ' + c)
                              for c in (bam -
                                        fas)]) if len(bam - fas) <= 50 else '')
            raise Exception(
                'ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' %
                (len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability,
            opts.reso,
            wanted_chrom=refs[0] if len(refs) == 1 else None)
        # resize chomosomes
        for c in refs:
            if c not in mappability:
                mappability[c] = [float('nan')] * (len(refs) // opts.reso + 1)
            if len(mappability[c]) < len(refs) // opts.reso + 1:
                mappability[c] += [float('nan')] * (
                    (len(refs) // opts.reso + 1) - len(mappability[c]))
        # concatenates
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome,
                                    opts.reso,
                                    chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in range(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

        ## CHECK TO BE REMOVED
        # out = open('tmp_mappability.txt', 'w')
        # i = 0
        # for crm in refs:
        #     for pos in xrange(len(genome[crm]) / opts.reso + 1):
        #         out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso, pos * opts.reso + opts.reso, mappability[i]))
        #         i += 1
        # out.close()
        # compute GC content ~30 sec
        # TODO: read from DB
    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads,
        filter_exclude,
        opts.reso,
        min_count=opts.min_count,
        sigma=2,
        factor=1,
        outdir=outdir,
        extra_out=param_hash,
        ncpus=opts.cpus,
        normalization=opts.normalization,
        mappability=mappability,
        p_fit=opts.p_fit,
        cg_content=gc_content,
        n_rsites=n_rsites,
        min_perc=opts.min_perc,
        max_perc=opts.max_perc,
        seed=opts.seed,
        normalize_only=opts.normalize_only,
        max_njobs=opts.max_njobs,
        extra_bads=opts.badcols,
        biases_path=opts.biases_path)

    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords.png_%s_%s.png' %
        (opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay,
            max_diff=10000,
            resolution=opts.reso,
            normalized=not opts.filter_only,
            savefig=inter_vs_gcoord)

        print('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases
    bias_file = path.join(
        outdir, 'biases_%s_%s.pickle' %
        (nicer(opts.reso).replace(' ', ''), param_hash))
    out = open(bias_file, 'wb')

    dump(
        {
            'biases': biases,
            'decay': decay,
            'badcol': badcol,
            'resolution': opts.reso
        }, out, HIGHEST_PROTOCOL)
    out.close()

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, len(badcol), len(biases),
                   raw_cisprc, norm_cisprc, inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:
        # release lock anyway
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
Example #13
def main():
    """
    main function
    """
    opts = get_options()
    filter_exclude = filters_to_bin(opts.filter)
    tadbit_bam = opts.tadbit_bam
    hicup_bam = opts.hicup_bam
    map_folder = opts.map_folder
    nreads = opts.nreads * 1_000_000

    tag_dict = {
        (1, 1): (67, 131),
        (0, 0): (115, 179),
        (1, 0): (99, 147),
        (0, 1): (83, 163),
    }

    out = open(hicup_bam, 'w')
    for seqs in get_mapped_chunk(map_folder, nreads):
        bamfile = AlignmentFile(tadbit_bam, 'rb')
        refs = bamfile.references
        printime(f' - processing BAM (for {len(seqs) / 1_000_000}M reads)')
        for r in bamfile.fetch(multiple_iterators=False):
            if r.flag & filter_exclude:
                continue
            rid = r.qname
            ridname = rid.split('#')[0]
            pos1 = r.reference_start + 1
            which, len1 = r.cigar[0]
            tags = dict(r.tags)
            if which == 6:  # first read-end
                s1, s2 = tags['S1'], tags['S2']
            else:
                s2, s1 = tags['S1'], tags['S2']
            if s1 == 0:
                pos1 = pos1 - len1 + 1
            try:
                seq, qal = seqs[ridname, pos1]
            except KeyError:
                continue
            crm1 = r.reference_name
            crm2 = refs[r.next_reference_id]
            pos2 = r.next_reference_start + 1
            len2 = r.template_length

            dist = 0 if crm1 != crm2 else abs(pos2 - pos1)

            if s2 == 0:
                pos2 = pos2 - len2 + 1

            flag = tag_dict[s1, s2][0]

            out.write((f'{r.qname}\t{flag}\t{crm1}\t{pos1}\t{len1}\t'
                       f'{len(seq)}M\t{crm2}\t{pos2}\t{dist}\t{seq}\t'
                       f'{qal}\tMD:Z:{len1}\tPG:Z:MarkDuplicates\tNM:i:0\t'
                       f'AS:i:{len1}\tXS:i:1\n'))
        bamfile.close()
        seqs.clear()
    out.close()
Example #14
import os

import pandas as pd
from pysam import AlignmentFile

def augment_alignment_by_barcode_from_name(inbam, outbam, reftable):
    """
    This function takes a bam-file and outputs
    a bam-file with RG-tag representing the barcodes.

    The barcodes are encoded by the read name,
    where the read name is assumed to have the form:
    @<barcodename>:<number> ...

    The barcode prefix from the read is then used to look up
    the corresponding barcode name in a reference table with a
    header row naming a Readprefix and a Name column:

    > cat barcodesheet.tsv
    Readprefix    Name
    readpre1    cell1
    readpre2    cell2

    Therefore, for the read @readpre1:12312 the corresponding
    RGID would be cell1.

    Importantly, the read prefixes that encode the barcode in the
    read names must all be the same length!
    """

    # load mapping between read name prefix and barcode
    refs = pd.read_csv(reftable, sep='\t', header=[0])
    # obtain the prefix length (assumed to be the same for all entries)
    reflen = len(refs['Readprefix'][0])
    rmap = {}
    for row in refs.iterrows():
        rmap[row[1].Readprefix] = row[1].Name

    treatment_reader = AlignmentFile(inbam, 'rb')
    bam_writer = AlignmentFile(outbam + '.tmp',
                               'wb',
                               template=treatment_reader)

    barcodes = set()
    for aln in treatment_reader.fetch(until_eof=True):
        # extract barcode between @ and the first :
        refprefix = aln.query_name[:reflen]
        barcode = rmap[refprefix]
        barcodes.add(barcode)

        aln.set_tag('RG', barcode)
        bam_writer.write(aln)

    treatment_reader.close()
    bam_writer.close()

    # update the header with the available barcodes
    f = AlignmentFile(outbam + '.tmp', 'rb')
    header = f.header.to_dict().copy()
    header['RG'] = [{'ID': bc, 'SM': bc} for bc in barcodes]
    bam_writer = AlignmentFile(outbam, 'wb', header=header)
    for aln in f.fetch(until_eof=True):
        bam_writer.write(aln)

    f.close()
    os.remove(outbam + '.tmp')
    bam_writer.close()
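A hedged usage sketch; the paths are placeholders and, per the code above, the reference table needs a header row with Readprefix and Name columns.

augment_alignment_by_barcode_from_name("reads.bam", "tagged.bam",
                                       "barcodesheet.tsv")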
Example #15
import os

from pysam import AlignmentFile

def split_reads_by_barcode(barcode_bams,
                           treatment_bam,
                           output_bam,
                           min_mapq=None,
                           max_mismatches=1,
                           log=None):
    """Splits the reads by barcodes.

    This function takes a set of barcode alignments (in bam format)
    and reads (in bam format). It produces an output bam file
    containing the RG tag that hold the barcode.

    Note: The method assumes that the respective BAM files contain reads
    with corresponding read ids. Also the BAM files need to be sorted
    by read name. Furthermore, the treatment_reads must be a subset of
    the barcode reads.

    barcode_bams : list(str)
        List of bam-file names pointing to the barcode alignments
        towards the pseudo genomes.
    treatment_bam : str
        BAM file containing the read alignments.
    output_bam : str
        Output BAM file.
    min_mapq : int
        Filter for mapping quality of the barcode reads. Only reads with
        mapq >= min_mapq are considered. Default: None means no filter is
        applied.
    max_mismatches : int
        Maximum number of mismatches. Default: 1.
    log : str
        Path of a tab-separated log file reporting read counts per category.
    """

    aligned_cnt = 0
    unaligned_cnt = 0
    if not min_mapq:
        min_mapq = 0

    treatment_reader = AlignmentFile(treatment_bam, 'rb')

    barcode_readers = [
        AlignmentFile(bamfile, 'rb') for bamfile in barcode_bams
    ]

    # start at the beginning of the bam files
    barcode_it = [reader.fetch(until_eof=True) for reader in barcode_readers]
    bnames = [next(br) for br in barcode_it]
    bam_writer = AlignmentFile(output_bam + '.tmp',
                               'wb',
                               template=treatment_reader)

    barcodes = set()
    log_content = {}
    log_content['with_barcode'] = 0
    log_content['no_barcode'] = 0
    log_content['unmapped'] = 0

    for aln in treatment_reader.fetch(until_eof=True):
        # only retain the aligned reads
        if aln.is_unmapped:
            log_content['unmapped'] += 1
            continue

        # extract the corresponding barcodes
        for i, br_it in enumerate(barcode_it):
            while not bnames[i].query_name == aln.query_name:
                # increment barcode names until they
                # match the alignment read name.
                bnames[i] = next(br_it)

        if any([x.is_unmapped or x.is_reverse for x in bnames]):
            # one or more barcodes are absent.
            # Barcodes must map to the forward strand only;
            # reverse strand matches are definitely due to sequencing errors.
            log_content['no_barcode'] += 1
            continue

        if any([x.mapq < min_mapq for x in bnames]):
            # remove poor mapping quality
            log_content['no_barcode'] += 1
            continue

        if any([x.get_tag('XM') > max_mismatches for x in bnames]):
            # maximum number of mismatches exceeded
            log_content['no_barcode'] += 1
            continue

        comb_id = '_'.join([baln.reference_name for baln in bnames])
        barcodes.add(comb_id)

        aln.set_tag('RG', comb_id)
        bam_writer.write(aln)
        log_content['with_barcode'] += 1

    print("End batch ...")

    treatment_reader.close()
    bam_writer.close()
    for reader in barcode_readers:
        reader.close()

    f = AlignmentFile(output_bam + '.tmp', 'rb')
    header = f.header.to_dict().copy()
    header['RG'] = [{'ID': combid, 'SM': combid} for combid in barcodes]
    bam_writer = AlignmentFile(output_bam, 'wb', header=header)
    for aln in f.fetch(until_eof=True):
        bam_writer.write(aln)

    f.close()
    os.remove(output_bam + '.tmp')
    bam_writer.close()

    #write log file
    with open(log, 'w') as f:
        f.write('Readgroup\tcounts\n')
        for icnt in log_content:
            f.write('{}\t{}\n'.format(icnt, log_content[icnt]))
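A hedged usage sketch: all inputs must be name-sorted BAMs with matching read ids, and the paths below are placeholders.

split_reads_by_barcode(["barcode1.bam", "barcode2.bam"],
                       "treatment.bam", "tagged.bam",
                       min_mapq=10, max_mismatches=1,
                       log="split_counts.tsv")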
Example #16
parser.add_argument("-o",
                    "--out",
                    default="bigg.bed",
                    help="the output file name")
parser.add_argument("-s",
                    "--score",
                    default=21,
                    help="The min mapq score used to keep a read")

args = parser.parse_args()

# make a file using the functions
samfile = AlignmentFile(args.bamfile)

fw = open(args.out, "w")

for n, record in enumerate(samfile):
    # add mapq filter to rm the secondary and supplementary mapping
    if record.mapq >= args.score:
        try:
            bigg = convert.sam_to_bigGenePred(record, samfile)
            fw.write(bigg.to_str())
            fw.write("\n")
        except ValueError:
            pass
    #if n>100:
    #break

fw.close()
samfile.close()