Example #1
def parse_barcode(bamfile):
	"""parses a sorted and index bam file, removes all cases where rna hits more than one spot in genome
	and writes to a file, create file for mutant and wildtype based on barcodes"""
	samfile = Samfile(bamfile, "rb")
	multi_hit_file = Samfile("MultiHit.bam","wb",template=samfile)
	mutant_one = Samfile("MutantOne.bam","wb",template=samfile)
	wildtype_one = Samfile("WildtypeOne.bam","wb",template=samfile)
	mutant_two = Samfile("MutantTwo.bam","wb",template=samfile)
	wildtype_two = Samfile("WildtypeTwo.bam","wb",template=samfile)
	for line in samfile.fetch():
		# if line.is_secondary:  # read hits more than one spot in the genome
		# 	multi_hit_file.write(line)
		if "#GAGT" in line.qname:
		## write to mutant file
			mutant_one.write(line)
		elif "#TTAG" in line.qname:
			mutant_two.write(line)
		elif "#ACCC" in line.qname:
		### write to wildtype file
			wildtype_one.write(line)
		elif "#CGTA" in line.qname:
		### write to wildtype file
			wildtype_two.write(line)

	multi_hit_file.close()
	mutant_one.close()
	wildtype_one.close()
	mutant_two.close()
	wildtype_two.close()
	samfile.close()
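These snippets assume "from pysam import Samfile" (in current pysam, Samfile is a backwards-compatible alias of AlignmentFile). A minimal usage sketch for the function above; the file name is hypothetical, and fetch() without a region still requires a coordinate-sorted, indexed BAM:

import pysam

pysam.index("reads.bam")   # writes reads.bam.bai so that fetch() works
parse_barcode("reads.bam")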
Example #2
def get_raw_signal(arguments):
    (mpbs_region, reads_file, organism, window_size, forward_shift,
     reverse_shift) = arguments

    bam = Samfile(reads_file, "rb")
    signal = np.zeros(window_size)

    for region in mpbs_region:
        mid = (region.final + region.initial) // 2
        p1 = mid - window_size // 2
        p2 = mid + window_size // 2

        if p1 <= 0:
            continue
        # Fetch raw signal
        for read in bam.fetch(region.chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    signal[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    signal[cut_site - p1] += 1.0

    return signal
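A minimal usage sketch for get_raw_signal above. The Region namedtuple is a hypothetical stand-in (any objects with chrom, initial and final attributes work), and the file name and shift values are illustrative:

from collections import namedtuple

Region = namedtuple("Region", ["chrom", "initial", "final"])
regions = [Region("chr1", 1000000, 1000200)]
# argument order: (mpbs_region, reads_file, organism, window_size, forward_shift, reverse_shift)
signal = get_raw_signal((regions, "atac.bam", "hg38", 400, 5, -4))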
Example #3
def bam_fill_seq(args):
    """ Fill empty sequence with known seqs
    """
    if not args.source_bam:
        source_bam = args.bam
    else:
        source_bam = args.source_bam
    logging.info('Loading samfile: %s', source_bam)
    src_seqs = {1: {}, 2: {}}

    src = pysam.Samfile(source_bam)
    with src:
        for rec in src:
            if rec.is_supplementary:  # skip supplementary alignment
                continue
            if rec.is_secondary:  # skip secondary alignment
                continue
            if rec.query_sequence is None:  # empty
                continue
            if rec.is_read2:
                src_seqs[2][rec.qname] = (rec.query_sequence,
                                          rec.query_qualities, rec.is_reverse)
            else:
                src_seqs[1][rec.qname] = (rec.query_sequence,
                                          rec.query_qualities, rec.is_reverse)

    logging.info('Loaded read1 : %s', len(src_seqs[1]))
    logging.info('Loaded read2 : %s', len(src_seqs[2]))

    sam = Samfile(args.bam)
    if args.output.endswith('.bam'):
        mode = 'wb'
    else:
        mode = 'wh'
    out = pysam.Samfile(args.output, mode=mode, template=sam)

    if args.region:
        it = sam.fetch(region=args.region)
    else:
        it = sam

    for rec in it:
        if rec.query_sequence is None:  # only fill when empty
            ret = src_seqs[2 if rec.is_read2 else 1].get(rec.qname)
            if ret is not None:
                seq, qual, is_rev = ret
                if is_rev != rec.is_reverse:
                    seq = dna_revcomp(seq)
                    if qual is not None:
                        qual = list(reversed(qual))
                cigar = Cigar(rec.cigartuples)
                seq = cigar.hard_clip_seq(seq)
                if qual is not None:
                    qual = cigar.hard_clip_seq(qual)
                rec.query_sequence = seq  # refill
                rec.query_qualities = qual

        out.write(rec)
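Cigar and dna_revcomp are helpers from the same project. Since BAM stores SEQ in reference orientation, a source read mapped to the opposite strand must be flipped before re-filling; a plausible minimal dna_revcomp equivalent, shown only to make the snippet self-contained:

def dna_revcomp(seq):
    # reverse complement (sketch; the project's own helper may differ)
    complement = str.maketrans("ACGTNacgtn", "TGCANtgcan")
    return seq.translate(complement)[::-1]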
Example #4
File: bam_aln.py  Project: m1m0r1/galgo
def bam_variant_aln(args):
    samfile = Samfile(args.bam)
    reader = pyvcf.Reader(open(args.vcf))
    positions = [(rec.CHROM, rec.POS) for rec in reader]

    # Fetch reads overlapping each variant position (VCF coordinates are 1-based)
    for chrom, pos in positions:
        for rec in samfile.fetch(chrom, pos - 1, pos):
            print(samfile.getrname(rec.tid), rec.pos, rec.qname)
Example #5
def main(args):
    m260b.debug.debug.DEBUG = args.debug
    ref_header, ref_sequence = read_basic_fasta(args.reference_file)
    if args.input_bam:
        reads = Samfile(args.input_bam)
        if args.start and args.stop:
            reads = reads.fetch(ref_header[1:].strip(), args.start, args.stop)
    else:
        reads = get_sorted_aligned_reads(args, ref_header, ref_sequence)
    chr = ref_header[1:].strip()
    fail_reasons = Counter()
    haplo_out = None
    if args.haplotype_out:
        haplo_out = Samfile(args.haplotype_out,
                            'wb',
                            header=SAM_HEADER(ref_header, ref_sequence))
    vcf_stream = VCFWriter(open(args.out_vcf, 'wb'),
                           make_vcf_header(args)) if args.out_vcf else None
    for region, reads in active_regions(reads,
                                        ref_sequence,
                                        chr,
                                        start_offset=0,
                                        flank=30,
                                        dfrac=1.0):
        #print('Calling region {}-{}'.format(region.start, region.stop))
        haplotype = build_haplotype(region.reference,
                                    reads,
                                    k=11,
                                    min_kmer_count=2)
        if haplotype.fail_reason:
            print('Failure {} at window\n{}'.format(haplotype.fail_reason,
                                                    region))
            continue
        # align the haplotype to the reference sequence
        offset, cigar, score, mismatch = banded_sw(region.reference,
                                                   haplotype.seq)
        haplotype_start = region.start + offset
        _info = AlignmentInfo(haplotype_start, cigar, False, mismatch)
        haplo_seq = SeqRecord(Seq(haplotype.seq, DNA),
                              id='Haplotype{}'.format(region.start))
        dict.__setitem__(haplo_seq._per_letter_annotations, 'phred_quality',
                         [40] * len(haplotype.seq))
        haplo_read = alignment_info_to_sam(haplo_seq, _info, 'nomate', None,
                                           'hw2_rg', False)
        if haplo_out:
            haplo_out.write(haplo_read)
        #print(haplotype)
        for variant in vcf_from_haplotype(region, haplotype, SAMPLE_NAME, chr):
            if vcf_stream:
                vcf_stream.write_record(variant)
            print(vcf2m260(variant))
    if vcf_stream:
        vcf_stream.flush()
        vcf_stream.close()
Example #6
def get_raw_tracks(args):
    # Initializing Error Handler
    err = ErrorHandler()

    if len(args.input_files) != 2:
        err.throw_error("ME_FEW_ARG",
                        add_msg="You must specify reads and regions file.")

    output_fname = os.path.join(args.output_location,
                                "{}.wig".format(args.output_prefix))

    bam = Samfile(args.input_files[0], "rb")
    regions = GenomicRegionSet("Interested regions")
    regions.read(args.input_files[1])
    regions.merge()
    reads_file = GenomicSignal()

    with open(output_fname, "a") as output_f:
        for region in regions:
            # Raw counts
            signal = [0.0] * (region.final - region.initial)
            for read in bam.fetch(region.chrom, region.initial, region.final):
                if not read.is_reverse:
                    cut_site = read.pos + args.forward_shift
                    if region.initial <= cut_site < region.final:
                        signal[cut_site - region.initial] += 1.0
                else:
                    cut_site = read.aend + args.reverse_shift - 1
                    if region.initial <= cut_site < region.final:
                        signal[cut_site - region.initial] += 1.0

            if args.norm:
                signal = reads_file.boyle_norm(signal)
                perc = scoreatpercentile(signal, 98)
                std = np.std(signal)
                signal = reads_file.hon_norm_atac(signal, perc, std)

            output_f.write("fixedStep chrom=" + region.chrom + " start=" +
                           str(region.initial + 1) + " step=1\n" +
                           "\n".join([str(e)
                                      for e in np.nan_to_num(signal)]) + "\n")

    if args.bigWig:
        genome_data = GenomeData(args.organism)
        chrom_sizes_file = genome_data.get_chromosome_sizes()
        bw_filename = os.path.join(args.output_location,
                                   "{}.bw".format(args.output_prefix))
        os.system(" ".join([
            "wigToBigWig", output_fname, chrom_sizes_file, bw_filename,
            "-verbose=0"
        ]))
        os.remove(output_fname)
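The trailing shell-out works, but subprocess.run avoids shell parsing entirely; a sketch of the equivalent call with the same arguments:

import subprocess

subprocess.run(["wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0"],
               check=True)  # raises CalledProcessError on a non-zero exit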
Example #7
def main(args):
    option = "r" if args.samformat else "rb"
    samfile = Samfile(args.bamfile, option)

    #Iterates over each read instead of each contig
    outputs = defaultdict(list)
    #import ipdb; ipdb.set_trace()
    for aln in samfile.fetch(until_eof=True):
        ref = samfile.getrname(aln.tid)
        outputs[ref].append(aln)

    for ref, alns in outputs.items():
        print_reads(alns, ref, samfile.header)
Example #8
class GenomicSignal:
    def __init__(self, file_name):
        self.bam = Samfile(file_name, "rb")

    def get_signal(self, ref, start, end, ext, initial_clip=1000, ext_both_directions=False):
        pileup_region = PileupRegion(start, end, ext)
        reads = self.bam.fetch(reference=ref, start=start, end=end)
        if not ext_both_directions:
            for alignment in reads:
                pileup_region.__call__(alignment)
        else:
            for alignment in reads:
                pileup_region.__call2__(alignment)
        raw_signal = array([min(e, initial_clip) for e in pileup_region.vector])

        # Std-based clipping at mean + 10 standard deviations
        mean = raw_signal.mean()
        std = raw_signal.std()
        clip_signal = [min(e, mean + (10 * std)) for e in raw_signal]

        return clip_signal
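A usage sketch for the class above; the file name and coordinates are hypothetical, and PileupRegion comes from the same project:

gs = GenomicSignal("dnase.bam")
signal = gs.get_signal("chr1", 1000000, 1001000, ext=5)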
Example #9
def get_raw_tracks(args):
    # Initializing Error Handler
    err = ErrorHandler()

    if len(args.input_files) != 2:
        err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.")

    output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix))

    bam = Samfile(args.input_files[0], "rb")
    regions = GenomicRegionSet("Interested regions")
    regions.read(args.input_files[1])
    regions.merge()
    reads_file = GenomicSignal()

    with open(output_fname, "a") as output_f:
        for region in regions:
            # Raw counts
            signal = [0.0] * (region.final - region.initial)
            for read in bam.fetch(region.chrom, region.initial, region.final):
                if not read.is_reverse:
                    cut_site = read.pos + args.forward_shift
                    if region.initial <= cut_site < region.final:
                        signal[cut_site - region.initial] += 1.0
                else:
                    cut_site = read.aend + args.reverse_shift - 1
                    if region.initial <= cut_site < region.final:
                        signal[cut_site - region.initial] += 1.0

            if args.norm:
                signal = reads_file.boyle_norm(signal)
                perc = scoreatpercentile(signal, 98)
                std = np.std(signal)
                signal = reads_file.hon_norm_atac(signal, perc, std)

            output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" +
                           "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n")

    if args.bigWig:
        genome_data = GenomeData(args.organism)
        chrom_sizes_file = genome_data.get_chromosome_sizes()
        bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix))
        os.system(" ".join(["wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0"]))
        os.remove(output_fname)
Example #10
def main(args):
    option = "r" if args.samformat else "rb"
    samfile = Samfile(args.bamfile, option)
    ref_ids = [samfile.gettid(r) for r in samfile.references]
    #Iterates over each read instead of each contig
    reads_to_print = []
    for aln in samfile.fetch(until_eof=True):
        if pair_is_aligned(aln, ref_ids):
            if args.read_pair == 1 and aln.is_read1:
                reads_to_print.append(aln)
            elif args.read_pair == 2 and aln.is_read2:
                reads_to_print.append(aln)
            elif args.read_pair == 0:
                reads_to_print.append(aln)
        if len(reads_to_print) >= 10000:
            # Flush the reads collected
            print_reads(reads_to_print)
            reads_to_print = []

    print_reads(reads_to_print)
Example #11
def main(args):
    option = "r" if args.samformat else "rb"
    samfile = Samfile(args.bamfile, option)
    ref_ids = [samfile.gettid(r) for r in samfile.references]
    #Iterates over each read instead of each contig
    reads_to_print_1 = []
    reads_to_print_2 = []
    reads_to_print_u = []
    for aln in samfile.fetch(until_eof=True):
        if aln.tid in ref_ids: # This read is aligned
            if aln.rnext in ref_ids: # The mate is also aligned
                if aln.is_read1:
                    reads_to_print_1.append(aln)
                    reads_to_print_1 = flush_reads(reads_to_print_1, args.R1)
                elif aln.is_read2:
                    reads_to_print_2.append(aln)
                    reads_to_print_2 = flush_reads(reads_to_print_2, args.R2)
            else:
                reads_to_print_u.append(aln)
                reads_to_print_u = flush_reads(reads_to_print_u, args.u)

    print_reads(reads_to_print_1, args.R1)
    print_reads(reads_to_print_2, args.R2)
    print_reads(reads_to_print_u, args.u)
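flush_reads and print_reads are project helpers that are not shown. A plausible minimal flush_reads, matching how it is used above (write and reset the buffer once it is large enough, otherwise return it unchanged):

def flush_reads(reads, out_handle, limit=10000):
    # hypothetical sketch for illustration only
    if len(reads) >= limit:
        print_reads(reads, out_handle)
        return []
    return reads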
Example #12
    spr = 0.0
    counter = 0.0
    for line in intFile:

        # Fetching signal
        ll = line.strip().split("\t")
        mLen = int(ll[2]) - int(ll[1])
        mid = (int(ll[1]) + int(ll[2])) / 2
        p1 = max(mid - halfWindow, 0)
        p2 = mid + halfWindow

        # Fetch raw signal
        pileup_region = PileupRegion(p1, p2, 1)
        if (ps_version == "0.7.5"):
            bam.fetch(reference=ll[0],
                      start=p1,
                      end=p2,
                      callback=pileup_region)
        else:
            reads = bam.fetch(reference=ll[0], start=p1, end=p2)
            for alignment in reads:
                pileup_region.__call__(alignment)
        raw_signal = array(
            [min(e, initial_clip) for e in pileup_region.vector])

        # Std-based clipping
        mean = raw_signal.mean()
        std = raw_signal.std()
        clip_signal = [min(e, mean + (10 * std)) for e in raw_signal]

        # Bias Correction
        correctedSignal = bias_correction(bam, clip_signal, biasTableF,
Example #13
tf_list = list()
gene_list = list()
tc_list = list()

for i, r in enumerate(gr_tfs):
	tf = r.name.split(".")[-1]
	gene = gr_genes[i].name
	if gene == "." or "+" in gene or "-" in gene or ":" in gene:
		continue

	mid = (r.initial + r.final) // 2
	p1 = max(mid - 100, 0)
	p2 = min(mid + 100, chrom_sizes_dict[r.chrom])

	reads = bam.fetch(reference=r.chrom, start=p1, end=p2)
	tc = 0
	for alignment in reads:
		tc += 1

	tf_list.append(tf)
	gene_list.append(gene)
	tc_list.append(tc)


df = pd.DataFrame([tf_list, gene_list, tc_list])
df = df.transpose()
df.rename(columns={0: 'TF', 1: 'Gene', 2: 'TC'}, inplace=True)
df = df.groupby(['TF', 'Gene']).sum().reset_index()
df.to_csv(output_file, header=False, index=False, sep='\t')
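Since the read loop in this example only counts overlapping alignments, pysam's count() does the same in a single call:

tc = bam.count(r.chrom, p1, p2)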
Example #14
  # Evaluating Overall TC
  regionTagCount = 0  # default so the exception branches below have a value to write
  try: regionTagCount = tag_count(chrName, p1, p2, dnaseBam, tcHalfWindow)
  except Exception:
    print("Exception TC raised in " + line)
    writeOutput(ll, regionTagCount, resVec, outFile)
    continue

  # Fetching sequence
  try: sequence = str(genomeFile.fetch(chrName, p1, p2))
  except Exception:
    print("Exception SEQUENCE raised in " + line)
    writeOutput(ll, regionTagCount, resVec, outFile)
    continue

  # Fetching footprints
  try: footprints = fpBam.fetch(reference=chrName, start=p1, end=p2)
  except Exception:
    print("Exception FOOTPRINTS raised in " + line)
    writeOutput(ll, regionTagCount, resVec, outFile)
    continue

  # Best mpbs
  maxPos = -99999
  maxValue = globalMin
  maxMotifLen = -1

  # Performing motif matching and footprint overlapping
  for res in search(sequence, [e.pssm_list for e in motifList], [e.min for e in motifList], absolute_threshold=True, both_strands=True):

    for (position, score) in res:
Example #15
    def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb, shift):
        """ 
        Estimates bias based on HS regions, DNase-seq signal and genomic sequences.

        Keyword arguments:
        regions -- DNase-seq HS regions.
        dnase_file_name -- DNase-seq file name.
        genome_file_name -- Genome to fetch genomic sequences from.
        
        Return:
        bias_table_F, bias_table_R -- Bias tables.
        """

        # Parameters
        maxDuplicates = 100
        pseudocount = 1.0

        # Initializing bam and fasta
        if(dnase_file_name.split(".")[-1].upper() != "BAM"): return None # TODO ERROR
        bamFile = Samfile(dnase_file_name, "rb")
        fastaFile = Fastafile(genome_file_name)

        # Initializing dictionaries
        obsDictF = dict(); obsDictR = dict()
        expDictF = dict(); expDictR = dict()

        ct_reads_r=0
        ct_reads_f=0
        ct_kmers=0

        # Iterating on HS regions
        for region in regions:

            # Initialization
            prevPos = -1
            trueCounter = 0

            # Evaluating observed frequencies ####################################

            # Fetching reads
            for r in bamFile.fetch(region.chrom, region.initial, region.final):

                # Calculating positions
                if(not r.is_reverse): p1 = r.pos - (k_nb/2) - 1 + shift
                else: p1 = r.aend - (k_nb/2) + 1 - shift
                p2 = p1 + k_nb

                # Verifying PCR artifacts
                if(p1 == prevPos): trueCounter += 1
                else:
                    prevPos = p1
                    trueCounter = 0
                if(trueCounter > maxDuplicates): continue

                # Fetching k-mer
                try: currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
                except Exception: continue
                if(r.is_reverse): currStr = AuxiliaryFunctions.revcomp(currStr)

                # Counting k-mer in dictionary
                if(not r.is_reverse):
                    ct_reads_f+=1
                    try: obsDictF[currStr] += 1
                    except Exception: obsDictF[currStr] = 1
                else:
                    ct_reads_r+=1
                    try: obsDictR[currStr] += 1
                    except Exception: obsDictR[currStr] = 1


            # Evaluating expected frequencies ####################################

            # Fetching whole sequence
            try: currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception: continue
            currRevComp = AuxiliaryFunctions.revcomp(currStr)

            # Iterating on each sequence position
            for i in range(0,len(currStr)-k_nb):
                ct_kmers+=1
                # Counting k-mer in dictionary
                s = currStr[i:i+k_nb]
                try: expDictF[s] += 1
                except Exception: expDictF[s] = 1

                # Counting k-mer in dictionary for reverse complement
                s = currRevComp[i:i+k_nb]
                try: expDictR[s] += 1
                except Exception: expDictR[s] = 1

        # Closing files
        bamFile.close()
        fastaFile.close()

        # Creating bias dictionary
        alphabet = ["A","C","G","T"]
        kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
        bias_table_F = dict([(e,0.0) for e in kmerComb]) 
        bias_table_R = dict([(e,0.0) for e in kmerComb]) 
        for kmer in kmerComb:
            try: obsF = obsDictF[kmer] + pseudocount
            except Exception: obsF = pseudocount
            try: expF = expDictF[kmer] + pseudocount
            except Exception: expF = pseudocount
            bias_table_F[kmer] = round(float(obsF/ct_reads_f)/float(expF/ct_kmers),6)
            try: obsR = obsDictR[kmer] + pseudocount
            except Exception: obsR = pseudocount
            try: expR = expDictR[kmer] + pseudocount
            except Exception: expR = pseudocount
            bias_table_R[kmer] = round(float(obsR/ct_reads_r)/float(expR/ct_kmers),6)

        # Return
        return [bias_table_F, bias_table_R]
Example #16
def get_raw_signal(arguments):
    (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
     window_size, forward_shift, reverse_shift) = arguments

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()

    bam1 = Samfile(reads_file1, "rb")
    bam2 = Samfile(reads_file2, "rb")

    genome_data = GenomeData(organism)
    fasta = Fastafile(genome_data.get_genome())

    signal_1 = np.zeros(window_size)
    signal_2 = np.zeros(window_size)
    motif_len = None
    pwm = dict([("A", [0.0] * window_size), ("C", [0.0] * window_size),
                ("G", [0.0] * window_size), ("T", [0.0] * window_size),
                ("N", [0.0] * window_size)])

    mpbs_regions = mpbs.by_names([mpbs_name])
    num_motif = len(mpbs_regions)

    for region in mpbs_regions:
        if motif_len is None:
            motif_len = region.final - region.initial

        mid = (region.final + region.initial) // 2
        p1 = mid - window_size // 2
        p2 = mid + window_size // 2

        if p1 <= 0:
            continue

        # Fetch raw signal
        for read in bam1.fetch(region.chrom, p1, p2):
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    signal_1[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    signal_1[cut_site - p1] += 1.0

        for read in bam2.fetch(region.chrom, p1, p2):
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    signal_2[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    signal_2[cut_site - p1] += 1.0
        update_pwm(pwm, fasta, region, p1, p2)

    return signal_1, signal_2, motif_len, pwm, num_motif
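update_pwm is not shown. A plausible sketch, assuming it accumulates per-position base counts for the window around each motif (the real helper may also handle strand orientation):

def update_pwm(pwm, fasta, region, p1, p2):
    # hypothetical sketch: count each base at each window position
    seq = str(fasta.fetch(region.chrom, p1, p2)).upper()
    for i, base in enumerate(seq):
        pwm[base][i] += 1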
Example #17
def estimate_bias_kmer(args):
    # Parameters
    maxDuplicates = 100
    pseudocount = 1.0

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    # Initializing dictionaries
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()

    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    # Iterating on HS regions
    for region in regions:

        # Initialization
        prevPos = -1
        trueCounter = 0

        # Evaluating observed frequencies ####################################
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):

            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prevPos:
                trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if trueCounter > maxDuplicates: continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                ct_reads_f += 1
                try:
                    obsDictF[currStr] += 1
                except Exception:
                    obsDictF[currStr] = 1
            else:
                ct_reads_r += 1
                try:
                    obsDictR[currStr] += 1
                except Exception:
                    obsDictR[currStr] = 1

        # Evaluating expected frequencies ####################################
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)

        # Iterating on each sequence position
        for i in range(0, len(currStr) - args.k_nb):
            ct_kmers += 1
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            try:
                expDictF[s] += 1
            except Exception:
                expDictF[s] = 1

            # Counting k-mer in dictionary for reverse complement
            s = currRevComp[i:i + args.k_nb]
            try:
                expDictR[s] += 1
            except Exception:
                expDictR[s] = 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    kmerComb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in kmerComb])
    bias_table_R = dict([(e, 0.0) for e in kmerComb])
    for kmer in kmerComb:
        try:
            obsF = obsDictF[kmer] + pseudocount
        except Exception:
            obsF = pseudocount
        try:
            expF = expDictF[kmer] + pseudocount
        except Exception:
            expF = pseudocount
        if ct_reads_f == 0:
            bias_table_F[kmer] = 1
        else:
            bias_table_F[kmer] = round(float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)
        try:
            obsR = obsDictR[kmer] + pseudocount
        except Exception:
            obsR = pseudocount
        try:
            expR = expDictR[kmer] + pseudocount
        except Exception:
            expR = pseudocount
        if ct_reads_r == 0:
            bias_table_R[kmer] = 1
        else:
            bias_table_R[kmer] = round(float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
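write_table is a project helper that is not shown; a plausible minimal equivalent that writes one k-mer with its forward and reverse bias per line (the file naming is an assumption):

def write_table(output_location, output_prefix, tables):
    # hypothetical sketch: tab-separated "kmer  forward_bias  reverse_bias"
    bias_table_F, bias_table_R = tables
    out_fname = os.path.join(output_location, "{}.txt".format(output_prefix))
    with open(out_fname, "w") as f:
        for kmer in sorted(bias_table_F):
            f.write("{}\t{}\t{}\n".format(kmer, bias_table_F[kmer], bias_table_R[kmer]))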
Example #18
def create_signal(args, regions):
    def revcomp(s):
        rev_dict = dict([("A", "T"), ("T", "A"), ("C", "G"), ("G", "C"), ("N", "N")])
        return "".join([rev_dict[e] for e in s[::-1]])

    alphabet = ["A", "C", "G", "T"]
    kmer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    f_obs_dict = dict([(e, 0.0) for e in kmer_comb])
    r_obs_dict = dict([(e, 0.0) for e in kmer_comb])
    f_exp_dict = dict([(e, 0.0) for e in kmer_comb])
    r_exp_dict = dict([(e, 0.0) for e in kmer_comb])

    bam_file = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fasta_file = Fastafile(genome_data.get_genome())

    for region in regions:
        # Fetching observed reads
        reads = bam_file.fetch(reference=region.chrom, start=region.initial, end=region.final)
        for read in reads:
            if not read.is_reverse:
                p1 = read.pos - int(floor(args.k_nb / 2)) + args.forward_shift - 1
            else:
                p1 = read.aend - int(floor(args.k_nb / 2)) + args.reverse_shift + 1
            p2 = p1 + args.k_nb
            try:
                dna_sequence_obs = str(fasta_file.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if 'N' not in dna_sequence_obs:
                if read.is_reverse:
                    dna_sequence_obs = revcomp(dna_sequence_obs)
                    r_obs_dict[dna_sequence_obs] += 1
                else:
                    f_obs_dict[dna_sequence_obs] += 1

        # Fetching whole sequence
        try:
            dna_sequence_exp = str(fasta_file.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        dna_sequence_exp_rev = revcomp(dna_sequence_exp)
        for i in range(0, len(dna_sequence_exp) - args.k_nb):
            s = dna_sequence_exp[i:i + args.k_nb]
            if "N" not in s:
                f_exp_dict[s] += 1
            s = dna_sequence_exp_rev[i:i + args.k_nb]
            if "N" not in s:
                r_exp_dict[s] += 1

    output_fname_f_obs = os.path.join(args.output_location, "{}_f_obs.fa".format(str(args.k_nb)))
    output_fname_f_exp = os.path.join(args.output_location, "{}_f_exp.fa".format(str(args.k_nb)))
    output_fname_r_obs = os.path.join(args.output_location, "{}_r_obs.fa".format(str(args.k_nb)))
    output_fname_r_exp = os.path.join(args.output_location, "{}_r_exp.fa".format(str(args.k_nb)))

    output_file_f_obs = open(output_fname_f_obs, "w")
    output_file_f_exp = open(output_fname_f_exp, "w")
    output_file_r_obs = open(output_fname_r_obs, "w")
    output_file_r_exp = open(output_fname_r_exp, "w")

    for kmer in kmer_comb:
        if f_obs_dict[kmer] > 0:
            output_file_f_obs.write(kmer + "\t" + str(f_obs_dict[kmer]) + "\n")
        if f_exp_dict[kmer] > 0:
            output_file_f_exp.write(kmer + "\t" + str(f_exp_dict[kmer]) + "\n")
        if r_obs_dict[kmer] > 0:
            output_file_r_obs.write(kmer + "\t" + str(r_obs_dict[kmer]) + "\n")
        if r_exp_dict[kmer] > 0:
            output_file_r_exp.write(kmer + "\t" + str(r_exp_dict[kmer]) + "\n")

    output_file_f_obs.close()
    output_file_f_exp.close()
    output_file_r_obs.close()
    output_file_r_exp.close()
Example #19
def estimate_bias_pwm(args):
    # Parameters
    max_duplicates = 100

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    obs_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    obs_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])

    # Iterating on HS regions
    for region in regions:
        # Initialization
        prev_pos = -1
        true_counter = 0

        # Evaluating observed frequencies
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prev_pos:
                true_counter += 1
            else:
                prev_pos = p1
                true_counter = 0
            if true_counter > max_duplicates: continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                for i in range(0, len(currStr)):
                    obs_f_pwm_dict[currStr[i]][i] += 1
            else:
                for i in range(0, len(currStr)):
                    obs_r_pwm_dict[currStr[i]][i] += 1

        # Evaluating expected frequencies
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue

        # Iterating on each sequence position
        for i in range(0, len(currStr) - args.k_nb):
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            for j in range(0, len(s)):
                exp_f_pwm_dict[s[j]][j] += 1

            # Counting k-mer in dictionary for reverse complement
            s = AuxiliaryFunctions.revcomp(s)
            for j in range(0, len(s)):
                exp_r_pwm_dict[s[j]][j] += 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Output pwms
    os.system("mkdir -p " + os.path.join(args.output_location, "pfm"))
    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]
    pwm_file_list = []
    pwm_obs_f = os.path.join(args.output_location, "pfm", "obs_{}_f.pfm".format(str(args.k_nb)))
    pwm_obs_r = os.path.join(args.output_location, "pfm", "obs_{}_r.pfm".format(str(args.k_nb)))
    pwm_exp_f = os.path.join(args.output_location, "pfm", "exp_{}_f.pfm".format(str(args.k_nb)))
    pwm_exp_r = os.path.join(args.output_location, "pfm", "exp_{}_r.pfm".format(str(args.k_nb)))

    pwm_file_list.append(pwm_obs_f)
    pwm_file_list.append(pwm_obs_r)
    pwm_file_list.append(pwm_exp_f)
    pwm_file_list.append(pwm_exp_r)

    for i in range(len(pwm_dict_list)):
        with open(pwm_file_list[i], "w") as pwm_file:
            for e in ["A", "C", "G", "T"]:
                pwm_file.write(" ".join([str(int(f)) for f in pwm_dict_list[i][e]]) + "\n")

    motif_obs_f = motifs.read(open(pwm_obs_f), "pfm")
    motif_obs_r = motifs.read(open(pwm_obs_r), "pfm")
    motif_exp_f = motifs.read(open(pwm_exp_f), "pfm")
    motif_exp_r = motifs.read(open(pwm_exp_r), "pfm")

    # Output logos
    os.system("mkdir -p " + os.path.join(args.output_location, "logo"))
    logo_obs_f = os.path.join(args.output_location, "logo", "obs_{}_f.pdf".format(str(args.k_nb)))
    logo_obs_r = os.path.join(args.output_location, "logo", "obs_{}_r.pdf".format(str(args.k_nb)))
    logo_exp_f = os.path.join(args.output_location, "logo", "exp_{}_f.pdf".format(str(args.k_nb)))
    logo_exp_r = os.path.join(args.output_location, "logo", "exp_{}_r.pdf".format(str(args.k_nb)))

    motif_obs_f.weblogo(logo_obs_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_obs_r.weblogo(logo_obs_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_exp_f.weblogo(logo_exp_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)
    motif_exp_r.weblogo(logo_exp_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
    bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
    for k_mer in k_mer_comb:
        obs_f = get_ppm_score(k_mer, motif_obs_f.pwm, args.k_nb)
        exp_f = get_ppm_score(k_mer, motif_exp_f.pwm, args.k_nb)
        bias_table_F[k_mer] = round(obs_f / exp_f, 6)
        obs_r = get_ppm_score(k_mer, motif_obs_r.pwm, args.k_nb)
        exp_r = get_ppm_score(k_mer, motif_exp_r.pwm, args.k_nb)
        bias_table_R[k_mer] = round(obs_r / exp_r, 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
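get_ppm_score is not shown. A plausible equivalent scores a k-mer as the product of its per-position probabilities in Biopython's motif matrices (pwm[base][position]):

def get_ppm_score(k_mer, pwm, k_nb):
    # hypothetical sketch
    score = 1.0
    for i in range(k_nb):
        score *= pwm[k_mer[i]][i]
    return score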
Example #20
def get_raw_signal(arguments):
    (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
     window_size, forward_shift, reverse_shift) = arguments

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()

    bam1 = Samfile(reads_file1, "rb")
    bam2 = Samfile(reads_file2, "rb")

    genome_data = GenomeData(organism)
    fasta = Fastafile(genome_data.get_genome())

    signal_1 = np.zeros(window_size)
    signal_2 = np.zeros(window_size)
    motif_len = None
    pwm = dict([("A", [0.0] * window_size), ("C", [0.0] * window_size),
                ("G", [0.0] * window_size), ("T", [0.0] * window_size),
                ("N", [0.0] * window_size)])

    mpbs_regions = mpbs.by_names([mpbs_name])
    num_motif = len(mpbs_regions)

    for region in mpbs_regions:
        if motif_len is None:
            motif_len = region.final - region.initial

        mid = (region.final + region.initial) // 2
        p1 = mid - window_size // 2
        p2 = mid + window_size // 2

        if p1 <= 0:
            continue

        # Fetch raw signal
        for read in bam1.fetch(region.chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    signal_1[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    signal_1[cut_site - p1] += 1.0

        for read in bam2.fetch(region.chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    signal_2[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    signal_2[cut_site - p1] += 1.0

        update_pwm(pwm, fasta, region, p1, p2)

    return signal_1, signal_2, motif_len, pwm, num_motif
Example #21
    try:

        # Initialization
        ll = line.strip().split("\t")
        chrName = ll[0]
        p1 = int(ll[1])
        p2 = int(ll[2])
        p1_w = p1 - (window / 2)
        p2_w = p2 + (window / 2)
        p1_wk = p1_w - (k_nb / 2)
        p2_wk = p2_w + (k_nb / 2)

        # Raw counts
        nf = [0.0] * (p2_w - p1_w)
        nr = [0.0] * (p2_w - p1_w)
        for r in bamFile.fetch(chrName, p1_w, p2_w):
            if ((not r.is_reverse) and (r.pos > p1_w)): nf[r.pos - p1_w] += 1.0
            if ((r.is_reverse) and ((r.aend - 1) < p2_w)):
                nr[r.aend - 1 - p1_w] += 1.0

        #for i in range(p1_w, p2_w):
        #  print i+1, nf[i-p1_w], nr[i-p1_w]

        # Smoothed counts
        Nf = []
        Nr = []
        fSum = sum(nf[:window])
        rSum = sum(nr[:window])
        fLast = nf[0]
        rLast = nr[0]
        for i in range((window / 2), len(nf) - (window / 2)):
Example #22
# Opening files
bamFile = Samfile(bamFileName, "rb")
hsFile = open(hsFileName, "r")

# Resulting statistics
O_Plus = 0.0
O_Minus = 0.0
R = 0.0

# Iterating on HS regions
for line in hsFile:

    # Fetching signal
    ll = line.strip().split("\t")
    pileup_region = PileupRegion(int(ll[1]), int(ll[2]), ext)
    reads = bamFile.fetch(reference=ll[0], start=int(ll[1]), end=int(ll[2]))
    for alignment in reads:
        pileup_region.__call__(alignment)
    raw_signalF = [min(e, initial_clip) for e in pileup_region.vectorF]
    raw_signalR = [min(e, initial_clip) for e in pileup_region.vectorR]

    # Updating statistics
    O_Plus += sum(raw_signalF)
    O_Minus += sum(raw_signalR)
    R += int(ll[2]) - int(ll[1])

# Writing results
outputFile = open(outputFileName, "w")
outputFile.write("\t".join(["O+", "O-", "R", "NormFactor+", "NormFactor-"]) +
                 "\n")
outputFile.write("\t".join(
Example #23
    def estimate_table_pwm(self, regions, dnase_file_name, genome_file_name,
                           k_nb, forward_shift, reverse_shift):
        """
        Estimates bias based on HS regions, DNase-seq signal and genomic sequences.

        Keyword arguments:
        regions -- DNase-seq HS regions.
        dnase_file_name -- DNase-seq file name.
        genome_file_name -- Genome to fetch genomic sequences from.

        Return:
        bias_table_F, bias_table_R -- Bias tables.
        """

        # Initializing bam and fasta
        if (dnase_file_name.split(".")[-1].upper() != "BAM"):
            return None  # TODO ERROR
        bamFile = Samfile(dnase_file_name, "rb")
        fastaFile = Fastafile(genome_file_name)

        obsSeqsF = []
        obsSeqsR = []
        expSeqsF = []
        expSeqsR = []

        # Iterating on HS regions
        for region in regions:
            # Evaluating observed frequencies
            # Fetching reads
            for r in bamFile.fetch(region.chrom, region.initial, region.final):
                # Calculating positions
                # if(not r.is_reverse): p1 = r.pos - (k_nb/2) - 1 + shift
                # else: p1 = r.aend - (k_nb/2) + 1 - shift
                if (not r.is_reverse):
                    cut_site = r.pos + forward_shift - 1
                    p1 = cut_site - int(floor(k_nb / 2))
                else:
                    cut_site = r.aend + reverse_shift + 1
                    p1 = cut_site - int(floor(k_nb / 2))
                p2 = p1 + k_nb

                # Fetching k-mer
                try:
                    currStr = str(fastaFile.fetch(region.chrom, p1,
                                                  p2)).upper()
                except Exception:
                    continue
                if (r.is_reverse):
                    currStr = AuxiliaryFunctions.revcomp(currStr)

                # Counting k-mer in dictionary
                if 'N' not in currStr:
                    if (not r.is_reverse):
                        obsSeqsF.append(Seq(currStr))
                    else:
                        obsSeqsR.append(Seq(currStr))

            # Evaluating expected frequencies
            # Fetching whole sequence
            try:
                currStr = str(
                    fastaFile.fetch(region.chrom, region.initial,
                                    region.final)).upper()
            except Exception:
                continue
            currRevComp = AuxiliaryFunctions.revcomp(currStr)

            # Iterating on each sequence position
            for i in range(0, len(currStr) - k_nb):
                # Counting k-mer in dictionary
                s = currStr[i:i + k_nb]
                if 'N' not in s:
                    expSeqsF.append(Seq(s))

                # Counting k-mer in dictionary for reverse complement
                s = currRevComp[i:i + k_nb]
                if 'N' not in s:
                    expSeqsR.append(Seq(s))

        # Closing files
        bamFile.close()
        fastaFile.close()

        obsMotifsF = motifs.create(obsSeqsF)
        obsMotifsR = motifs.create(obsSeqsR)
        expMotifsF = motifs.create(expSeqsF)
        expMotifsR = motifs.create(expSeqsR)

        obsPwmF = obsMotifsF.pwm
        obsPwmR = obsMotifsR.pwm
        expPwmF = expMotifsF.pwm
        expPwmR = expMotifsR.pwm

        # Output logos
        logo_obs_f = os.path.join(
            self.output_loc, "Bias", "logo",
            "obs_{}_{}_f.pdf".format(str(k_nb), str(forward_shift)))
        logo_obs_r = os.path.join(
            self.output_loc, "Bias", "logo",
            "obs_{}_{}_r.pdf".format(str(k_nb), str(forward_shift)))
        logo_exp_f = os.path.join(
            self.output_loc, "Bias", "logo",
            "exp_{}_{}_f.pdf".format(str(k_nb), str(forward_shift)))
        logo_exp_r = os.path.join(
            self.output_loc, "Bias", "logo",
            "exp_{}_{}_r.pdf".format(str(k_nb), str(forward_shift)))
        obsMotifsF.weblogo(logo_obs_f,
                           format="pdf",
                           stack_width="large",
                           color_scheme="color_classic",
                           yaxis_scale=0.2,
                           yaxis_tic_interval=0.1)
        obsMotifsR.weblogo(logo_obs_r,
                           format="pdf",
                           stack_width="large",
                           color_scheme="color_classic",
                           yaxis_scale=0.2,
                           yaxis_tic_interval=0.1)
        expMotifsF.weblogo(logo_exp_f,
                           format="pdf",
                           stack_width="large",
                           color_scheme="color_classic",
                           yaxis_scale=0.02,
                           yaxis_tic_interval=0.01)
        expMotifsR.weblogo(logo_exp_r,
                           format="pdf",
                           stack_width="large",
                           color_scheme="color_classic",
                           yaxis_scale=0.02,
                           yaxis_tic_interval=0.01)

        # Output pwms
        pwm_data_list = [obsPwmF, obsPwmR, expPwmF, expPwmR]
        pwm_file_list = []
        pwm_obs_f = os.path.join(
            self.output_loc, "Bias", "pwm",
            "obs_{}_{}_f.pwm".format(str(k_nb), str(forward_shift)))
        pwm_obs_r = os.path.join(
            self.output_loc, "Bias", "pwm",
            "obs_{}_{}_r.pwm".format(str(k_nb), str(forward_shift)))
        pwm_exp_f = os.path.join(
            self.output_loc, "Bias", "pwm",
            "exp_{}_{}_f.pwm".format(str(k_nb), str(forward_shift)))
        pwm_exp_r = os.path.join(
            self.output_loc, "Bias", "pwm",
            "exp_{}_{}_r.pwm".format(str(k_nb), str(forward_shift)))

        pwm_file_list.append(pwm_obs_f)
        pwm_file_list.append(pwm_obs_r)
        pwm_file_list.append(pwm_exp_f)
        pwm_file_list.append(pwm_exp_r)

        for i in range(len(pwm_data_list)):
            with open(pwm_file_list[i], "w") as f:
                f.write(str(pwm_data_list[i]))

        # Creating bias dictionary
        alphabet = ["A", "C", "G", "T"]
        k_mer_comb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
        bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
        bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
        for k_mer in k_mer_comb:
            obsF = self.get_pwm_score(k_mer, obsPwmF, k_nb)
            expF = self.get_pwm_score(k_mer, expPwmF, k_nb)
            bias_table_F[k_mer] = round(obsF / expF, 6)
            obsR = self.get_pwm_score(k_mer, obsPwmR, k_nb)
            expR = self.get_pwm_score(k_mer, expPwmR, k_nb)
            bias_table_R[k_mer] = round(obsR / expR, 6)

        # Return
        return [bias_table_F, bias_table_R]
Example #24
        higherStrand = "NA"
        for k in ctcfIndexList:
            motifFile = motifFileList[k]
            motifFetch = motifFile.fetch(chrom, start, end)
            for read in motifFetch:
                rr = read.qname.split(":")
                motifScore = float(rr[1])
                if (motifScore > higherScore):
                    higherScore = motifScore
                    higherStrand = "+"
                    if (read.is_reverse): higherStrand = "-"
        vectorTable1.append(higherStrand)

        # Genomic region
        regionVec = []
        regionFetch = regionsFile.fetch(chrom, start, end)
        for read in regionFetch:
            qName = "NA"
            if (read.qname and read.qname != "."): qName = read.qname
            try:
                geneSymbol = aliasDict[read.qname.split(":")[1]]
            except Exception:
                geneSymbol = "NA"
            if (qName == "INTERGENIC:."): qName = "INTERGENIC:NA"
            startx = "NA"
            if (read.pos and read.pos != "."): startx = str(read.pos)
            endx = "NA"
            if (read.aend and read.aend != "."): endx = str(read.aend)
            strand = "+"
            if (read.is_reverse): strand = "-"
            if (qName == "INTERGENIC:NA"): strand = "NA"
Example #25
    def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb,
                       forward_shift, reverse_shift):
        """ 
        Estimates bias based on HS regions, DNase-seq signal and genomic sequences.

        Keyword arguments:
        regions -- DNase-seq HS regions.
        dnase_file_name -- DNase-seq file name.
        genome_file_name -- Genome to fetch genomic sequences from.
        
        Return:
        bias_table_F, bias_table_R -- Bias tables.
        """

        # Parameters
        maxDuplicates = 100
        pseudocount = 1.0

        # Initializing bam and fasta
        if (dnase_file_name.split(".")[-1].upper() != "BAM"):
            return None  # TODO ERROR
        bamFile = Samfile(dnase_file_name, "rb")
        fastaFile = Fastafile(genome_file_name)

        # Initializing dictionaries
        obsDictF = dict()
        obsDictR = dict()
        expDictF = dict()
        expDictR = dict()

        ct_reads_r = 0
        ct_reads_f = 0
        ct_kmers = 0

        # Iterating on HS regions
        for region in regions:

            # Initialization
            prevPos = -1
            trueCounter = 0

            # Evaluating observed frequencies ####################################
            # Fetching reads
            for r in bamFile.fetch(region.chrom, region.initial, region.final):

                # Calculating positions
                if (not r.is_reverse):
                    cut_site = r.pos + forward_shift - 1
                    p1 = cut_site - int(floor(k_nb / 2))
                else:
                    cut_site = r.aend + reverse_shift + 1
                    p1 = cut_site - int(floor(k_nb / 2))
                p2 = p1 + k_nb

                # Verifying PCR artifacts
                if (p1 == prevPos):
                    trueCounter += 1
                else:
                    prevPos = p1
                    trueCounter = 0
                if (trueCounter > maxDuplicates): continue

                # Fetching k-mer
                try:
                    currStr = str(fastaFile.fetch(region.chrom, p1,
                                                  p2)).upper()
                except Exception:
                    continue
                if (r.is_reverse):
                    currStr = AuxiliaryFunctions.revcomp(currStr)

                # Counting k-mer in dictionary
                if (not r.is_reverse):
                    ct_reads_f += 1
                    try:
                        obsDictF[currStr] += 1
                    except Exception:
                        obsDictF[currStr] = 1
                else:
                    ct_reads_r += 1
                    try:
                        obsDictR[currStr] += 1
                    except Exception:
                        obsDictR[currStr] = 1

            # Evaluating expected frequencies ####################################
            # Fetching whole sequence
            try:
                currStr = str(
                    fastaFile.fetch(region.chrom, region.initial,
                                    region.final)).upper()
            except Exception:
                continue
            currRevComp = AuxiliaryFunctions.revcomp(currStr)

            # Iterating on each sequence position
            for i in range(0, len(currStr) - k_nb):
                ct_kmers += 1
                # Counting k-mer in dictionary
                s = currStr[i:i + k_nb]
                try:
                    expDictF[s] += 1
                except Exception:
                    expDictF[s] = 1

                # Counting k-mer in dictionary for reverse complement
                s = currRevComp[i:i + k_nb]
                try:
                    expDictR[s] += 1
                except Exception:
                    expDictR[s] = 1

        # Closing files
        bamFile.close()
        fastaFile.close()

        # Creating bias dictionary
        alphabet = ["A", "C", "G", "T"]
        kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
        bias_table_F = dict([(e, 0.0) for e in kmerComb])
        bias_table_R = dict([(e, 0.0) for e in kmerComb])
        for kmer in kmerComb:
            try:
                obsF = obsDictF[kmer] + pseudocount
            except Exception:
                obsF = pseudocount
            try:
                expF = expDictF[kmer] + pseudocount
            except Exception:
                expF = pseudocount
            if ct_reads_f == 0:
                bias_table_F[kmer] = 1
            else:
                bias_table_F[kmer] = round(
                    float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)
            try:
                obsR = obsDictR[kmer] + pseudocount
            except Exception:
                obsR = pseudocount
            try:
                expR = expDictR[kmer] + pseudocount
            except Exception:
                expR = pseudocount
            if ct_reads_r == 0:
                bias_table_R[kmer] = 1
            else:
                bias_table_R[kmer] = round(
                    float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

        # Return
        return [bias_table_F, bias_table_R]
Example #26
chromList = sorted(csDict.keys())

#################################################
# Fetching observed frequencies
#################################################

if (allTagsFg == "Y"):

    # Iterating on chromosomes
    for chrom in chromList:

        prevPos = -1
        trueCounter = 0

        # Iterating on chromosome reads
        for r in bamFile.fetch(chrom, (k_nb / 2), csDict[chrom] - (k_nb / 2)):

            # Calculating positions
            if (not r.is_reverse):
                p1 = r.pos - (k_nb / 2) - 1  # The -1 is because He is wrong
            else:
                p1 = r.aend - (k_nb / 2) + 1  # The +1 is because He is wrong
            p2 = p1 + k_nb

            # Verifying PCR artifacts
            if (p1 == prevPos): trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if (trueCounter > maxDuplicates): continue
Example #27
chromList = ["chr" + str(e) for e in range(1, 23) + ["X"]]
command = "mkdir -p " + tempLoc
os.system(command)

# Iterating throught the regions
chipRegionFile = open(chipRegionFileName, "rU")
motifBamFile = Samfile(motifBamFileName, "rb")
tempBedFileName = tempLoc + "tempBedFileName"
tempBedFile = open(tempBedFileName, "w")
for line in chipRegionFile:
    ll = line.strip().split("\t")
    chrom = ll[0]
    start = int(ll[1])
    end = int(ll[2])
    if (chrom not in chromList): continue
    mfetch = motifBamFile.fetch(chrom, start, end)
    bestScore = -9999
    bestMotif = None
    for read in mfetch:
        reference_start = str(read.reference_start)
        reference_end = str(read.reference_end)
        rr = read.query_name.split(":")
        name = rr[0]
        score = float(rr[1])
        strand = "+"
        if (read.is_reverse): strand = "-"
        if (score > bestScore):
            bestScore = score
            bestMotif = [
                chrom, reference_start, reference_end, name,
                str(score), strand
            ]
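A self-contained sketch of the best-scoring-motif scan above, with hypothetical (name, score, strand) tuples standing in for the parsed BAM reads:

def best_motif(candidates):
    # candidates: (name, score, strand) tuples, as parsed from
    # read.query_name ("name:score") in the loop above.
    best_score, best = float("-inf"), None
    for name, score, strand in candidates:
        if score > best_score:
            best_score, best = score, (name, score, strand)
    return best

print(best_motif([("motifA", 8.5, "+"), ("motifB", 12.1, "-")]))  # -> ('motifB', 12.1, '-')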
Example #28
class GenomicSignal:
    """
    Represents a genomic signal. It should be used to fetch normalized and slope
    signals from a bam file.
    Usage:
    1. Initialize class.
    2. Call load_sg_coefs once.
    3. Call get_signal as many times as needed.

    Authors: Eduardo G. Gusmao.
    """
    def __init__(self, file_name):
        """ 
        Initializes GenomicSignal.
        """
        self.file_name = file_name
        self.sg_coefs = None
        self.bam = Samfile(file_name, "rb")

    def load_sg_coefs(self, slope_window_size):
        """ 
        Loads Savitzky-Golay coefficients into self.sg_coefs based on a slope_window_size.

        Keyword arguments:
        slope_window_size -- Window size of Savitzky-Golay coefficients.

        Return:
        None -- It updates self.sg_coefs.
        """
        self.sg_coefs = self.savitzky_golay_coefficients(
            slope_window_size, 2, 1)

    def get_tag_count(self,
                      ref,
                      start,
                      end,
                      downstream_ext,
                      upstream_ext,
                      forward_shift,
                      reverse_shift,
                      initial_clip=1000):
        """
        Gets the tag count associated with self.bam based on start, end and ext.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        downstream_ext -- Number of bps to extend towards the downstream region (right for forward strand and left for reverse strand).
        upstream_ext -- Number of bps to extend towards the upstream region (left for forward strand and right for reverse strand).
        forward_shift -- Number of bps to shift the reads aligned to the forward strand. Can be a positive number for a shift towards the downstream region (towards the inside of the aligned read) and a negative number for a shift towards the upstream region.
        reverse_shift -- Number of bps to shift the reads aligned to the reverse strand. Can be a positive number for a shift towards the upstream region and a negative number for a shift towards the downstream region (towards the inside of the aligned read).
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.

        Return:
        tag_count -- Total signal.
        """

        # Fetch raw signal
        pileup_region = PileupRegion(start, end, downstream_ext, upstream_ext,
                                     forward_shift, reverse_shift)
        if (ps_version == "0.7.5"):
            self.bam.fetch(reference=ref,
                           start=start,
                           end=end,
                           callback=pileup_region)
        else:
            iter = self.bam.fetch(reference=ref, start=start, end=end)
            for alignment in iter:
                pileup_region.__call__(alignment)
        raw_signal = array(
            [min(e, initial_clip) for e in pileup_region.vector])

        # Std-based clipping
        mean = raw_signal.mean()
        std = raw_signal.std()
        clip_signal = [min(e, mean + (10 * std)) for e in raw_signal]

        # Tag count
        try:
            tag_count = sum(clip_signal)
        except Exception:
            tag_count = 0

        return tag_count

    def get_signal(self,
                   ref,
                   start,
                   end,
                   downstream_ext,
                   upstream_ext,
                   forward_shift,
                   reverse_shift,
                   initial_clip=1000,
                   per_norm=98,
                   per_slope=98,
                   bias_table=None,
                   genome_file_name=None,
                   print_raw_signal=False,
                   print_bc_signal=False,
                   print_norm_signal=False,
                   print_slope_signal=False,
                   strands_specific=False):
        """
        Gets the signal associated with self.bam based on start, end and ext.
        initial_clip, per_norm and per_slope are used as normalization factors during the normalization
        and slope evaluation procedures.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.
        per_norm -- Percentile value for 'hon_norm' function of the normalized signal.
        per_slope -- Percentile value for 'hon_norm' function of the slope signal.
        bias_table -- Bias table to perform bias correction.
        genome_file_name -- Genome to perform bias correction.
        downstream_ext -- Number of bps to extend towards the downstream region
        (right for forward strand and left for reverse strand).
        upstream_ext -- Number of bps to extend towards the upstream region
        (left for forward strand and right for reverse strand).
        forward_shift -- Number of bps to shift the reads aligned to the forward strand.
        Can be a positive number for a shift towards the downstream region
        (towards the inside of the aligned read) and a negative number for a shift towards the upstream region.
        reverse_shift -- Number of bps to shift the reads aligned to the reverse strand.
        Can be a positive number for a shift towards the upstream region and a negative number
        for a shift towards the downstream region (towards the inside of the aligned read).

        Return:
        hon_signal -- Normalized signal.
        slopehon_signal -- Slope signal.
        """

        # Fetch raw signal
        pileup_region = PileupRegion(start, end, downstream_ext, upstream_ext,
                                     forward_shift, reverse_shift)
        if (ps_version == "0.7.5"):
            self.bam.fetch(reference=ref,
                           start=start,
                           end=end,
                           callback=pileup_region)
        else:
            iter = self.bam.fetch(reference=ref, start=start, end=end)
            for alignment in iter:
                pileup_region.__call__(alignment)
        raw_signal = array(
            [min(e, initial_clip) for e in pileup_region.vector])

        # Std-based clipping
        mean = raw_signal.mean()
        std = raw_signal.std()
        clip_signal = [min(e, mean + (10 * std)) for e in raw_signal]

        # Cleavage bias correction
        bias_corrected_signal = self.bias_correction(clip_signal, bias_table,
                                                     genome_file_name, ref,
                                                     start, end, forward_shift,
                                                     reverse_shift,
                                                     strands_specific)

        # Boyle normalization (within-dataset normalization)
        boyle_signal = array(self.boyle_norm(bias_corrected_signal))

        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(boyle_signal, per_norm)
        std = boyle_signal.std()
        hon_signal = self.hon_norm(boyle_signal, perc, std)

        # Slope signal
        slope_signal = self.slope(hon_signal, self.sg_coefs)

        # Hon normalization on slope signal (between-dataset slope smoothing)
        abs_seq = array([abs(e) for e in slope_signal])
        perc = scoreatpercentile(abs_seq, per_slope)
        std = abs_seq.std()
        slopehon_signal = self.hon_norm(slope_signal, perc, std)

        # Writing signal
        if (print_raw_signal):
            signal_file = open(print_raw_signal, "a")
            signal_file.write(
                "fixedStep chrom=" + ref + " start=" + str(start + 1) +
                " step=1\n" +
                "\n".join([str(e) for e in nan_to_num(raw_signal)]) + "\n")
            signal_file.close()
        if (print_bc_signal):
            signal_file = open(print_bc_signal, "a")
            signal_file.write(
                "fixedStep chrom=" + ref + " start=" + str(start + 1) +
                " step=1\n" +
                "\n".join([str(e)
                           for e in nan_to_num(bias_corrected_signal)]) + "\n")
            signal_file.close()
        if (print_norm_signal):
            signal_file = open(print_norm_signal, "a")
            signal_file.write(
                "fixedStep chrom=" + ref + " start=" + str(start + 1) +
                " step=1\n" +
                "\n".join([str(e) for e in nan_to_num(hon_signal)]) + "\n")
            signal_file.close()
        if (print_slope_signal):
            signal_file = open(print_slope_signal, "a")
            signal_file.write(
                "fixedStep chrom=" + ref + " start=" + str(start + 1) +
                " step=1\n" +
                "\n".join([str(e) for e in nan_to_num(slope_signal)]) + "\n")
            signal_file.close()

        # Returning normalized and slope sequences
        return hon_signal, slopehon_signal

    def bias_correction(self, signal, bias_table, genome_file_name, chrName,
                        start, end, forward_shift, reverse_shift,
                        strands_specific):
        """
        Performs bias correction.

        Keyword arguments:
        signal -- Input signal.
        bias_table -- Bias table.

        Return:
        bias_corrected_signal -- Bias-corrected sequence.
        """

        if (not bias_table): return signal

        # Parameters
        window = 50
        defaultKmerValue = 1.0

        # Initialization
        fastaFile = Fastafile(genome_file_name)
        fBiasDict = bias_table[0]
        rBiasDict = bias_table[1]
        k_nb = len(fBiasDict.keys()[0])
        p1 = start
        p2 = end
        p1_w = p1 - (window / 2)
        p2_w = p2 + (window / 2)
        p1_wk = p1_w - int(floor(k_nb / 2.))
        p2_wk = p2_w + int(ceil(k_nb / 2.))
        if (p1 <= 0 or p1_w <= 0 or p1_wk <= 0): return signal

        # Raw counts
        nf = [0.0] * (p2_w - p1_w)
        nr = [0.0] * (p2_w - p1_w)
        for read in self.bam.fetch(chrName, p1_w, p2_w):
            if (not read.is_reverse):
                cut_site = read.pos + forward_shift
                if p1_w <= cut_site < p2_w:
                    nf[cut_site - p1_w] += 1.0
                    # for i in range(max(read.pos + forward_shift, start), min(read.pos + forward_shift + 1, end - 1)):
                    #    nf[i - start] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1_w <= cut_site < p2_w:
                    nr[cut_site - p1_w] += 1.0
                    # for i in range(max(read.aend + reverse_shift - 1, start), min(read.aend + reverse_shift, end - 1)):
                    #    nr[i - start] += 1.0

                    # if ((not read.is_reverse) and (read.pos > p1_w)): nf[read.pos - p1_w] += 1.0
                    # if ((read.is_reverse) and ((read.aend - 1) < p2_w)): nr[read.aend - 1 - p1_w] += 1.0

        # Smoothed counts
        Nf = []
        Nr = []
        fSum = sum(nf[:window])
        rSum = sum(nr[:window])
        fLast = nf[0]
        rLast = nr[0]
        for i in range((window / 2), len(nf) - (window / 2)):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast
            fSum += nf[i + (window / 2)]
            fLast = nf[i - (window / 2) + 1]
            rSum -= rLast
            rSum += nr[i + (window / 2)]
            rLast = nr[i - (window / 2) + 1]

        # Fetching sequence
        currStr = str(fastaFile.fetch(chrName, p1_wk - 1, p2_wk - 2)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(
            str(fastaFile.fetch(chrName, p1_wk + 2, p2_wk + 1)).upper())
        #currStr = str(fastaFile.fetch(chrName, p1_wk, p2_wk - 1)).upper()
        #currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName, p1_wk + 1,
        #                                                            p2_wk)).upper())

        # Iterating on sequence to create signal
        af = []
        ar = []
        for i in range(int(ceil(k_nb / 2.)),
                       len(currStr) - int(floor(k_nb / 2)) + 1):
            fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
            rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) -
                               i:len(currStr) + int(floor(k_nb / 2.)) - i]
            try:
                af.append(fBiasDict[fseq])
            except Exception:
                af.append(defaultKmerValue)
            try:
                ar.append(rBiasDict[rseq])
            except Exception:
                ar.append(defaultKmerValue)

        # Calculating bias and writing to wig file
        fSum = sum(af[:window])
        rSum = sum(ar[:window])
        fLast = af[0]
        rLast = ar[0]
        bias_corrected_signal = []
        bias_corrected_signal_forward = []
        bias_corrected_signal_reverse = []
        for i in range((window / 2), len(af) - (window / 2)):
            nhatf = Nf[i - (window / 2)] * (af[i] / fSum)
            nhatr = Nr[i - (window / 2)] * (ar[i] / rSum)
            zf = log(nf[i] + 1) - log(nhatf + 1)
            zr = log(nr[i] + 1) - log(nhatr + 1)
            bias_corrected_signal_forward.append(zf)
            bias_corrected_signal_reverse.append(zr)
            bias_corrected_signal.append(zf + zr)
            fSum -= fLast
            fSum += af[i + (window / 2)]
            fLast = af[i - (window / 2) + 1]
            rSum -= rLast
            rSum += ar[i + (window / 2)]
            rLast = ar[i - (window / 2) + 1]

        # Shifting each signal upward so that no values are negative
        min_value = abs(min(bias_corrected_signal_forward))
        bias_fixed_signal_forward = [
            e + min_value for e in bias_corrected_signal_forward
        ]

        min_value = abs(min(bias_corrected_signal_reverse))
        bias_fixed_signal_reverse = [
            e + min_value for e in bias_corrected_signal_reverse
        ]

        min_value = abs(min(bias_corrected_signal))
        bias_fixed_signal = [e + min_value for e in bias_corrected_signal]

        # Termination
        fastaFile.close()
        if not strands_specific:
            return bias_corrected_signal
        else:
            return bias_fixed_signal_forward, bias_fixed_signal_reverse

    def hon_norm(self, sequence, mean, std):
        """
        Normalizes a sequence according to hon's criterion using mean and std.
        This represents a between-dataset normalization.

        Keyword arguments:
        sequence -- Input sequence.
        mean -- Global mean.
        std -- Global std.

        Return:
        norm_seq -- Normalized sequence.
        """

        #if std != 0:
        #    norm_seq = []
        #    for e in sequence:
        #        norm_seq.append(1.0 / (1.0 + (exp(-(e - mean) / std))))
        #    return norm_seq
        #else:
        #    return sequence
        norm_seq = []
        for e in sequence:
            if (e == 0.0): norm_seq.append(0.0)
            elif (e > 0.0):
                norm_seq.append(1.0 / (1.0 + (exp(-(e - mean) / std))))
            else:
                norm_seq.append(-1.0 / (1.0 + (exp(-(-e - mean) / std))))
        return norm_seq
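
    # Added note (not in the original source): a quick sanity check of the
    # formula above. With mean=0.5 and std=0.25, an entry e=0.5 maps to
    # 1.0 / (1.0 + exp(0)) = 0.5; zeros stay zero, and negative entries are
    # mapped symmetrically into (-1, 0).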

    def boyle_norm(self, sequence):
        """
        Normalizes a sequence according to Boyle's criterion.
        This represents a within-dataset normalization.

        Keyword arguments:
        sequence -- Input sequence.

        Return:
        norm_seq -- Normalized sequence.
        """
        mean = array([e for e in sequence if e > 0]).mean()
        if isnan(mean):
            return sequence
        else:
            norm_seq = [(float(e) / mean) for e in sequence]
            return norm_seq

    def savitzky_golay_coefficients(self, window_size, order, deriv):
        """
        Evaluate the Savitzky-Golay coefficients in order to evaluate the slope of the signal.
        It uses a window_size (of the interpolation), order (of the polynomial), deriv (derivative needed).

        Keyword arguments:
        window_size -- Size of the window for function interpolation.
        order -- Order of polynomial.
        deriv -- Derivative.

        Return:
        m[::-1] -- The Savitzky-Golay coefficients.
        """

        # Get statistics
        # try: # TODO ERRORS
        window_size = abs(int(window_size))
        order = abs(int(order))
        # except ValueError, msg:
        #    raise ValueError("windowSize and order have to be of type int")
        # if windowSize % 2 != 1 or windowSize < 1:
        #    raise TypeError("windowSize size must be a positive odd number")
        # if windowSize < order + 2:
        #    raise TypeError("windowSize is too small for the polynomials order")
        order_range = range(order + 1)
        half_window = (window_size - 1) // 2

        # Precompute Coefficients
        b = mat([[k**i for i in order_range]
                 for k in range(-half_window, half_window + 1)])
        m = linalg.pinv(b).A[deriv]
        return m[::-1]
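
    # Added note (not in the original source): for window_size=5, order=2,
    # deriv=1 this yields the classic least-squares first-derivative filter
    # [-0.2, -0.1, 0.0, 0.1, 0.2], returned reversed as
    # [0.2, 0.1, 0.0, -0.1, -0.2] because numpy.convolve flips its kernel.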

    def slope(self, sequence, sg_coefs):
        """
        Evaluates the slope of sequence given the sg_coefs loaded.

        Keyword arguments:
        sequence -- Input sequence.
        sg_coefs -- Savitzky-Golay coefficients.

        Return:
        slope_seq -- Slope sequence.
        """
        slope_seq = convolve(sequence, sg_coefs)
        slope_seq = [
            e for e in slope_seq[(len(sg_coefs) / 2):(len(slope_seq) -
                                                      (len(sg_coefs) / 2))]
        ]

        return slope_seq

    def get_signal_per_strand(self,
                              ref,
                              start,
                              end,
                              downstream_ext,
                              upstream_ext,
                              forward_shift,
                              reverse_shift,
                              initial_clip=1000,
                              per_norm=98,
                              per_slope=98,
                              bias_table=None,
                              genome_file_name=None,
                              print_raw_signal=False,
                              print_bc_signal=False,
                              print_norm_signal=False,
                              print_slope_signal=False,
                              strands_specific=True):
        """

        :param ref: Chromosome name.
        :param start: Initial genomic coordinate of signal.
        :param end: Final genomic coordinate of signal.
        :param downstream_ext: Number of bps to extend towards the downstream region
        :param upstream_ext: Number of bps to extend towards the upstream region
        :param forward_shift: Number of bps to shift the reads aligned to the forward strand.
        :param reverse_shift: Number of bps to shift the reads aligned to the reverse strand.
        :param initial_clip: Signal will be initially clipped at this level to avoid outliers.
        :param per_norm: Percentile value for 'hon_norm' function of the normalized signal.
        :param per_slope: Percentile value for 'hon_norm' function of the slope signal.
        :param bias_table: Bias table to perform bias correction.
        :param genome_file_name: Genome to perform bias correction.
        :param print_raw_signal:
        :param print_bc_signal:
        :param print_norm_signal:
        :param print_slope_signal:
        :return: normalized and slope signal for each strand.
        """

        raw_signal_forward = [0.0] * (end - start)
        raw_signal_reverse = [0.0] * (end - start)

        reads = self.bam.fetch(reference=ref, start=start, end=end)
        for read in reads:
            if (not read.is_reverse):
                cut_site = read.pos + forward_shift
                if cut_site >= start and cut_site < end:
                    raw_signal_forward[cut_site - start] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if cut_site >= start and cut_site < end:
                    raw_signal_reverse[cut_site - start] += 1.0

        raw_signal_forward = array(
            [min(e, initial_clip) for e in raw_signal_forward])
        raw_signal_reverse = array(
            [min(e, initial_clip) for e in raw_signal_reverse])

        # Std-based clipping
        mean = raw_signal_forward.mean()
        std = raw_signal_forward.std()
        clip_signal_forward = [
            min(e, mean + (10 * std)) for e in raw_signal_forward
        ]
        mean = raw_signal_reverse.mean()
        std = raw_signal_reverse.std()
        clip_signal_reverse = [
            min(e, mean + (10 * std)) for e in raw_signal_reverse
        ]

        # Cleavage bias correction
        bc_signal_forward = None
        bc_signal_reverse = None
        if bias_table:
            bc_signal_forward, bc_signal_reverse = self.bias_correction(
                raw_signal_forward, bias_table, genome_file_name, ref, start,
                end, forward_shift, reverse_shift, strands_specific)
        else:
            bc_signal_forward = clip_signal_forward
            bc_signal_reverse = clip_signal_reverse

        # Boyle normalization (within-dataset normalization)
        boyle_signal_forward = array(self.boyle_norm(bc_signal_forward))
        boyle_signal_reverse = array(self.boyle_norm(bc_signal_reverse))

        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(boyle_signal_forward, per_norm)
        std = boyle_signal_forward.std()
        hon_signal_forward = self.hon_norm(boyle_signal_forward, perc, std)

        perc = scoreatpercentile(boyle_signal_reverse, per_norm)
        std = boyle_signal_reverse.std()
        hon_signal_reverse = self.hon_norm(boyle_signal_reverse, perc, std)

        # Slope signal
        slope_signal_forward = self.slope(hon_signal_forward, self.sg_coefs)
        slope_signal_reverse = self.slope(hon_signal_reverse, self.sg_coefs)

        # Returning normalized and slope sequences
        return hon_signal_forward, slope_signal_forward, hon_signal_reverse, slope_signal_reverse
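
A minimal usage sketch for the class above, following the three steps in its docstring. The BAM path, chromosome, and coordinates are placeholders, and a coordinate-sorted BAM with a matching .bai index is assumed:

signal = GenomicSignal("reads.bam")  # hypothetical sorted, indexed BAM
signal.load_sg_coefs(slope_window_size=9)
norm, slope = signal.get_signal(ref="chr1", start=1000000, end=1001000,
                                downstream_ext=1, upstream_ext=0,
                                forward_shift=0, reverse_shift=0)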
Example #29
if not os.path.exists(args.outdir):
    os.makedirs(args.outdir)
# ________________________________________________________________________________________________________________
# Get error dict
# ________________________________________________________________________________________________________________
errors = defaultdict(set)
with open(args.errors, 'r') as f:
    for l in f:
        a = l.strip().split("\t")
        if not args.etype or ",".join(a[:2]) in args.etype:
            errors[tuple(a[:2])].add(a[2])

errors2segments = defaultdict(lambda: defaultdict(list))
samfile = Samfile(args.path)
for segment in samfile.fetch(until_eof=True):
    num = segment.query_name.split("|")[0]
    for etype, eset in errors.items():
        if num in eset:
            errors2segments[etype][num].append(segment)
            break

additional = defaultdict(list)
for fname in args.additional:
    tsamfile = Samfile(fname)
    for segment in tsamfile.fetch(until_eof=True):
        num = segment.query_name.split("|")[0]
        additional[num].append(ArWrapper(segment, tsamfile.getrname(segment.tid)))
    tsamfile.close()
		
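A self-contained sketch of the grouping pattern above, with hypothetical in-memory error records standing in for the tab-separated error file:

from collections import defaultdict

records = [("mismatch", "sub", "read1"), ("indel", "del", "read2")]  # hypothetical
errors = defaultdict(set)
for etype1, etype2, num in records:
    errors[(etype1, etype2)].add(num)

# A read whose name prefix (before "|") appears in an error set is grouped
# under that error type, mirroring the fetch loop above.
print("read1" in errors[("mismatch", "sub")])  # -> True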
Example #30
  intFile = open(intFileName,"r")
  spr = 0.0
  counter = 0.0
  for line in intFile:

    # Fetching signal
    ll = line.strip().split("\t")
    mLen = int(ll[2]) - int(ll[1])
    mid = (int(ll[1])+int(ll[2]))/2
    p1 = max(mid - halfWindow,0)
    p2 = mid + halfWindow

    # Fetch raw signal
    pileup_region = PileupRegion(p1,p2,1)
    if(ps_version == "0.7.5"):
      bam.fetch(reference=ll[0], start=p1, end=p2, callback = pileup_region)
    else:
      iter = bam.fetch(reference=ll[0], start=p1, end=p2)
      for alignment in iter: pileup_region.__call__(alignment)
    raw_signal = array([min(e,initial_clip) for e in pileup_region.vector])
    
    # Std-based clipping
    mean = raw_signal.mean()
    std = raw_signal.std()
    clip_signal = [min(e, mean + (10 * std)) for e in raw_signal]

    # Bias Correction
    correctedSignal = bias_correction(bam, clip_signal, biasTableF, biasTableR, genomeFileName, ll[0], p1, p2)

    # Summing min value to signal
    stdzSignal = [e+minValue for e in correctedSignal]
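
A small sketch of how the fetch window is derived from each BED interval above; the interval is hypothetical:

def fetch_window(bed_line, half_window=100):
    # Center a fixed-size window on the interval midpoint, clamping the
    # left edge at zero as the loop above does with max(mid - halfWindow, 0).
    chrom, start, end = bed_line.strip().split("\t")[:3]
    mid = (int(start) + int(end)) // 2
    return chrom, max(mid - half_window, 0), mid + half_window

print(fetch_window("chr1\t980\t1020"))  # -> ('chr1', 900, 1100)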
Example #31
class GenomicSignal:
    """
    Represents a genomic signal. It should be used to fetch normalized and slope
    signals from a bam file.
    Usage:
    1. Initialize class.
    2. Call load_sg_coefs once.
    3. Call get_signal as many times as needed.

    Authors: Eduardo G. Gusmao.
    """

    def __init__(self, file_name=None):
        """ 
        Initializes GenomicSignal.
        """
        self.file_name = file_name
        self.sg_coefs = None
        if file_name is not None:
            self.bam = Samfile(file_name, "rb")

    def load_sg_coefs(self, slope_window_size):
        """ 
        Loads Savitzky-Golay coefficients into self.sg_coefs based on a slope_window_size.

        Keyword arguments:
        slope_window_size -- Window size of Savitzky-Golay coefficients.

        Return:
        None -- It updates self.sg_coefs.
        """
        self.sg_coefs = self.savitzky_golay_coefficients(slope_window_size, 2, 1)

    def get_tag_count(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                      initial_clip=1000):
        """
        Gets the tag count associated with self.bam based on start, end and ext.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        downstream_ext -- Number of bps to extend towards the downstream region (right for forward strand and left for reverse strand).
        upstream_ext -- Number of bps to extend towards the upstream region (left for forward strand and right for reverse strand).
        forward_shift -- Number of bps to shift the reads aligned to the forward strand. Can be a positive number for a shift towards the downstream region (towards the inside of the aligned read) and a negative number for a shift towards the upstream region.
        reverse_shift -- Number of bps to shift the reads aligned to the reverse strand. Can be a positive number for a shift towards the upstream region and a negative number for a shift towards the downstream region (towards the inside of the aligned read).
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.

        Return:
        tag_count -- Total signal.
        """

        # Fetch raw signal
        pileup_region = PileupRegion(start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift)
        if ps_version == "0.7.5":
            self.bam.fetch(reference=ref, start=start, end=end, callback=pileup_region)
        else:
            iter = self.bam.fetch(reference=ref, start=start, end=end)
            for alignment in iter: pileup_region.__call__(alignment)
        raw_signal = array([min(e, initial_clip) for e in pileup_region.vector])

        # Tag count
        try:
            tag_count = sum(raw_signal)
        except Exception:
            tag_count = 0

        return tag_count

    def get_signal(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                   initial_clip=1000, per_norm=98, per_slope=98,
                   bias_table=None, genome_file_name=None, print_raw_signal=False):
        """
        Gets the signal associated with self.bam based on start, end and ext.
        initial_clip, per_norm and per_slope are used as normalization factors during the normalization
        and slope evaluation procedures.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.
        per_norm -- Percentile value for 'hon_norm' function of the normalized signal.
        per_slope -- Percentile value for 'hon_norm' function of the slope signal.
        bias_table -- Bias table to perform bias correction.
        genome_file_name -- Genome to perform bias correction.
        downstream_ext -- Number of bps to extend towards the downstream region
        (right for forward strand and left for reverse strand).
        upstream_ext -- Number of bps to extend towards the upstream region
        (left for forward strand and right for reverse strand).
        forward_shift -- Number of bps to shift the reads aligned to the forward strand.
        Can be a positive number for a shift towards the downstream region
        (towards the inside of the aligned read) and a negative number for a shift towards the upstream region.
        reverse_shift -- Number of bps to shift the reads aligned to the reverse strand.
        Can be a positive number for a shift towards the upstream region and a negative number
        for a shift towards the downstream region (towards the inside of the aligned read).

        Return:
        hon_signal -- Normalized signal.
        slopehon_signal -- Slope signal.
        """
        # Fetch raw signal
        pileup_region = PileupRegion(start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift)
        if ps_version == "0.7.5":
            self.bam.fetch(reference=ref, start=start, end=end, callback=pileup_region)
        else:
            iter = self.bam.fetch(reference=ref, start=start, end=end)
            for alignment in iter:
                pileup_region.__call__(alignment)
        raw_signal = array([min(e, initial_clip) for e in pileup_region.vector])

        # Std-based clipping
        mean = raw_signal.mean()
        std = raw_signal.std()
        clip_signal = [min(e, mean + (10 * std)) for e in raw_signal]

        # Cleavage bias correction
        bc_signal = self.bias_correction_dnase(clip_signal, bias_table, genome_file_name, ref, start, end,
                                               forward_shift, reverse_shift)

        # Boyle normalization (within-dataset normalization)
        boyle_signal = array(self.boyle_norm(bc_signal))

        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(boyle_signal, per_norm)
        std = boyle_signal.std()
        hon_signal = self.hon_norm_dnase(boyle_signal, perc, std)

        # Slope signal
        slope_signal = self.slope(hon_signal, self.sg_coefs)

        # Returning normalized and slope sequences
        return hon_signal, slope_signal

    def get_signal_atac(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                        initial_clip=50, per_norm=98, per_slope=98,
                        bias_table=None, genome_file_name=None):

        # Cleavage bias correction
        bc_signal_forward, bc_signal_reverse = self.bias_correction_atac(bias_table, genome_file_name,
                                                                         ref, start, end, forward_shift, reverse_shift)

        # Boyle normalization (within-dataset normalization)
        boyle_signal_forward = array(self.boyle_norm(bc_signal_forward))
        boyle_signal_reverse = array(self.boyle_norm(bc_signal_reverse))

        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(boyle_signal_forward, per_norm)
        std = boyle_signal_forward.std()
        hon_signal_forward = self.hon_norm_atac(boyle_signal_forward, perc, std)

        perc = scoreatpercentile(boyle_signal_reverse, per_norm)
        std = boyle_signal_reverse.std()
        hon_signal_reverse = self.hon_norm_atac(boyle_signal_reverse, perc, std)

        # Slope signal
        slope_signal_forward = self.slope(hon_signal_forward, self.sg_coefs)
        slope_signal_reverse = self.slope(hon_signal_reverse, self.sg_coefs)

        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(slope_signal_forward, per_norm)
        std = np.std(slope_signal_forward)
        slope_signal_forward = self.hon_norm_atac(slope_signal_forward, perc, std)

        perc = scoreatpercentile(slope_signal_reverse, per_norm)
        std = np.std(slope_signal_reverse)
        slope_signal_reverse = self.hon_norm_atac(slope_signal_reverse, perc, std)

        # Returning normalized and slope sequences
        return hon_signal_forward, slope_signal_forward, hon_signal_reverse, slope_signal_reverse

    def get_signal_atac2(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                         initial_clip=50, per_norm=98, per_slope=98,
                         bias_table=None, genome_file_name=None):

        # Cleavage bias correction
        bc_signal = self.bias_correction_atac2(bias_table, genome_file_name,
                                               ref, start, end, forward_shift, reverse_shift)

        # Boyle normalization (within-dataset normalization)
        boyle_signal = array(self.boyle_norm(bc_signal))

        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(boyle_signal, per_norm)
        std = boyle_signal.std()
        hon_signal = self.hon_norm_atac(boyle_signal, perc, std)

        # Slope signal
        slope_signal = self.slope(hon_signal, self.sg_coefs)

        # Hon normalization (between-dataset normalization)
        slope_signal = self.boyle_norm(slope_signal)

        perc = scoreatpercentile(slope_signal, per_slope)
        std = np.std(slope_signal)
        slope_signal = self.hon_norm_atac(slope_signal, perc, std)

        # Returning normalized and slope sequences
        return hon_signal, slope_signal

    def bias_correction_dnase(self, signal, bias_table, genome_file_name, chrName, start, end,
                              forward_shift, reverse_shift):

        if not bias_table: return signal
        # Parameters
        window = 50
        defaultKmerValue = 1.0

        # Initialization
        fastaFile = Fastafile(genome_file_name)
        fBiasDict = bias_table[0]
        rBiasDict = bias_table[1]
        k_nb = len(fBiasDict.keys()[0])
        p1 = start
        p2 = end
        p1_w = p1 - (window / 2)
        p2_w = p2 + (window / 2)
        p1_wk = p1_w - int(floor(k_nb / 2.))
        p2_wk = p2_w + int(ceil(k_nb / 2.))
        if p1 <= 0 or p1_w <= 0 or p1_wk <= 0: return signal

        # Raw counts
        nf = [0.0] * (p2_w - p1_w)
        nr = [0.0] * (p2_w - p1_w)
        for read in self.bam.fetch(chrName, p1_w, p2_w):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1_w <= cut_site < p2_w:
                    nf[cut_site - p1_w] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1_w <= cut_site < p2_w:
                    nr[cut_site - p1_w] += 1.0

        # Smoothed counts
        Nf = []
        Nr = []
        fSum = sum(nf[:window])
        rSum = sum(nr[:window])
        fLast = nf[0]
        rLast = nr[0]
        for i in range((window / 2), len(nf) - (window / 2)):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast
            fSum += nf[i + (window / 2)]
            fLast = nf[i - (window / 2) + 1]
            rSum -= rLast
            rSum += nr[i + (window / 2)]
            rLast = nr[i - (window / 2) + 1]

        # Fetching sequence
        currStr = str(fastaFile.fetch(chrName, p1_wk, p2_wk - 1)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName, p1_wk + 1,
                                                                     p2_wk)).upper())

        # Iterating on sequence to create signal
        af = []
        ar = []
        for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2)) + 1):
            fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
            rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(floor(k_nb / 2.)) - i]
            try:
                af.append(fBiasDict[fseq])
            except Exception:
                af.append(defaultKmerValue)
            try:
                ar.append(rBiasDict[rseq])
            except Exception:
                ar.append(defaultKmerValue)

        # Calculating bias and writing to wig file
        fSum = sum(af[:window])
        rSum = sum(ar[:window])
        fLast = af[0]
        rLast = ar[0]
        bias_corrected_signal = []
        for i in range((window / 2), len(af) - (window / 2)):
            nhatf = Nf[i - (window / 2)] * (af[i] / fSum)
            nhatr = Nr[i - (window / 2)] * (ar[i] / rSum)
            zf = log(nf[i] + 1) - log(nhatf + 1)
            zr = log(nr[i] + 1) - log(nhatr + 1)
            bias_corrected_signal.append(zf + zr)
            fSum -= fLast
            fSum += af[i + (window / 2)]
            fLast = af[i - (window / 2) + 1]
            rSum -= rLast
            rSum += ar[i + (window / 2)]
            rLast = ar[i - (window / 2) + 1]

        # Termination
        fastaFile.close()
        return bias_corrected_signal
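
    # Added note (not in the original source): with af/ar the k-mer bias
    # weights and Nf/Nr the 50-bp smoothed read counts, the expected cut
    # count at position i is nhat = N * a[i] / sum(a over the window), and
    # the corrected value is the shrunken log-ratio
    # log(n + 1) - log(nhat + 1): positions cut more often than their
    # sequence bias predicts score positive.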

    def bias_correction_atac(self, bias_table, genome_file_name, chrName, start, end,
                             forward_shift, reverse_shift):

        # Parameters
        window = 50
        defaultKmerValue = 1.0

        # Initialization
        fastaFile = Fastafile(genome_file_name)
        fBiasDict = bias_table[0]
        rBiasDict = bias_table[1]
        k_nb = len(fBiasDict.keys()[0])
        p1 = start
        p2 = end
        p1_w = p1 - (window / 2)
        p2_w = p2 + (window / 2)
        p1_wk = p1_w - int(floor(k_nb / 2.))
        p2_wk = p2_w + int(ceil(k_nb / 2.))

        if (p1 <= 0 or p1_w <= 0 or p1_wk <= 0):
            # Return raw counts
            nf = [0.0] * (p2 - p1)
            nr = [0.0] * (p2 - p1)
            for read in self.bam.fetch(chrName, p1, p2):
                # check if the read is unmapped, according to issue #112
                if read.is_unmapped:
                    continue

                if not read.is_reverse:
                    cut_site = read.pos + forward_shift
                    if p1 <= cut_site < p2:
                        nf[cut_site - p1] += 1.0
                else:
                    cut_site = read.aend + reverse_shift - 1
                    if p1 <= cut_site < p2:
                        nr[cut_site - p1] += 1.0

            return nf, nr

        # Raw counts
        nf = [0.0] * (p2_w - p1_w)
        nr = [0.0] * (p2_w - p1_w)
        for read in self.bam.fetch(chrName, p1_w, p2_w):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1_w <= cut_site < p2_w:
                    nf[cut_site - p1_w] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1_w <= cut_site < p2_w:
                    nr[cut_site - p1_w] += 1.0

        # Smoothed counts
        Nf = []
        Nr = []
        fSum = sum(nf[:window])
        rSum = sum(nr[:window])
        fLast = nf[0]
        rLast = nr[0]
        for i in range((window / 2), len(nf) - (window / 2)):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast
            fSum += nf[i + (window / 2)]
            fLast = nf[i - (window / 2) + 1]
            rSum -= rLast
            rSum += nr[i + (window / 2)]
            rLast = nr[i - (window / 2) + 1]

        # Fetching sequence
        currStr = str(fastaFile.fetch(chrName, p1_wk, p2_wk - 1)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName, p1_wk + 1,
                                                                     p2_wk)).upper())

        # Iterating on sequence to create signal
        af = []
        ar = []
        for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2)) + 1):
            fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
            rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(floor(k_nb / 2.)) - i]
            try:
                af.append(fBiasDict[fseq])
            except Exception:
                af.append(defaultKmerValue)
            try:
                ar.append(rBiasDict[rseq])
            except Exception:
                ar.append(defaultKmerValue)

        # Calculating bias and writing to wig file
        fSum = sum(af[:window])
        rSum = sum(ar[:window])
        fLast = af[0]
        rLast = ar[0]
        bias_corrected_signal_forward = []
        bias_corrected_signal_reverse = []
        for i in range((window / 2), len(af) - (window / 2)):
            nhatf = Nf[i - (window / 2)] * (af[i] / fSum)
            nhatr = Nr[i - (window / 2)] * (ar[i] / rSum)
            bias_corrected_signal_forward.append(nhatf)
            bias_corrected_signal_reverse.append(nhatr)
            fSum -= fLast
            fSum += af[i + (window / 2)]
            fLast = af[i - (window / 2) + 1]
            rSum -= rLast
            rSum += ar[i + (window / 2)]
            rLast = ar[i - (window / 2) + 1]

        # Termination
        fastaFile.close()
        return bias_corrected_signal_forward, bias_corrected_signal_reverse

    def bias_correction_atac2(self, bias_table, genome_file_name, chrName, start, end,
                              forward_shift, reverse_shift):

        # Parameters
        window = 50
        defaultKmerValue = 1.0

        # Initialization
        fastaFile = Fastafile(genome_file_name)
        fBiasDict = bias_table[0]
        rBiasDict = bias_table[1]
        k_nb = len(fBiasDict.keys()[0])
        p1 = start
        p2 = end
        p1_w = p1 - (window / 2)
        p2_w = p2 + (window / 2)
        p1_wk = p1_w - int(floor(k_nb / 2.))
        p2_wk = p2_w + int(ceil(k_nb / 2.))
        if (p1 <= 0 or p1_w <= 0 or p1_wk <= 0):
            # Return raw counts
            signal = [0.0] * (p2 - p1)
            for read in self.bam.fetch(chrName, p1, p2):
                # check if the read is unmapped, according to issue #112
                if read.is_unmapped:
                    continue

                if not read.is_reverse:
                    cut_site = read.pos + forward_shift
                    if p1 <= cut_site < p2:
                        signal[cut_site - p1] += 1.0
                else:
                    cut_site = read.aend + reverse_shift - 1
                    if p1 <= cut_site < p2:
                        signal[cut_site - p1] += 1.0

            return signal

        # Raw counts
        nf = [0.0] * (p2_w - p1_w)
        nr = [0.0] * (p2_w - p1_w)
        for read in self.bam.fetch(chrName, p1_w, p2_w):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1_w <= cut_site < p2_w:
                    nf[cut_site - p1_w] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1_w <= cut_site < p2_w:
                    nr[cut_site - p1_w] += 1.0

        # Smoothed counts
        Nf = []
        Nr = []
        fSum = sum(nf[:window])
        rSum = sum(nr[:window])
        fLast = nf[0]
        rLast = nr[0]
        for i in range((window / 2), len(nf) - (window / 2)):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast
            fSum += nf[i + (window / 2)]
            fLast = nf[i - (window / 2) + 1]
            rSum -= rLast
            rSum += nr[i + (window / 2)]
            rLast = nr[i - (window / 2) + 1]

        # Fetching sequence
        currStr = str(fastaFile.fetch(chrName, p1_wk, p2_wk - 1)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName, p1_wk + 1,
                                                                     p2_wk)).upper())

        # Iterating on sequence to create signal
        af = []
        ar = []
        for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2)) + 1):
            fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
            rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(floor(k_nb / 2.)) - i]
            try:
                af.append(fBiasDict[fseq])
            except Exception:
                af.append(defaultKmerValue)
            try:
                ar.append(rBiasDict[rseq])
            except Exception:
                ar.append(defaultKmerValue)

        # Calculating bias and writing to wig file
        fSum = sum(af[:window])
        rSum = sum(ar[:window])
        fLast = af[0]
        rLast = ar[0]
        bc_signal = []
        for i in range((window / 2), len(af) - (window / 2)):
            nhatf = Nf[i - (window / 2)] * (af[i] / fSum)
            nhatr = Nr[i - (window / 2)] * (ar[i] / rSum)
            bc_signal.append(nhatf + nhatr)
            fSum -= fLast
            fSum += af[i + (window / 2)]
            fLast = af[i - (window / 2) + 1]
            rSum -= rLast
            rSum += ar[i + (window / 2)]
            rLast = ar[i - (window / 2) + 1]

        # Termination
        fastaFile.close()
        return bc_signal
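
    # Added note (not in the original source): unlike bias_correction_dnase,
    # the ATAC variants return the bias-expected counts themselves (per
    # strand in bias_correction_atac, summed as nhatf + nhatr here) rather
    # than a log-ratio of observed to expected counts.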

    def hon_norm_atac(self, sequence, mean, std):
        """
        Normalizes a sequence according to hon's criterion using mean and std.
        This represents a between-dataset normalization.

        Keyword arguments:
        sequence -- Input sequence.
        mean -- Global mean.
        std -- Global std.

        Return:
        norm_seq -- Normalized sequence.
        """
        if std != 0:
            norm_seq = []
            for e in sequence:
                if e == 0:
                    norm_seq.append(e)
                else:
                    norm_seq.append(1.0 / (1.0 + (exp(-(e - mean) / std))))
            return norm_seq
        else:
            return sequence

    def hon_norm_dnase(self, sequence, mean, std):
        """
        Normalizes a sequence according to hon's criterion using mean and std.
        This represents a between-dataset normalization.
        Keyword arguments:
        sequence -- Input sequence.
        mean -- Global mean.
        std -- Global std.
        Return:
        norm_seq -- Normalized sequence.
        """

        # if std != 0:
        #    norm_seq = []
        #    for e in sequence:
        #        norm_seq.append(1.0 / (1.0 + (exp(-(e - mean) / std))))
        #    return norm_seq
        # else:
        #    return sequence
        norm_seq = []
        for e in sequence:
            if e == 0.0:
                norm_seq.append(0.0)
            elif e > 0.0:
                norm_seq.append(1.0 / (1.0 + (exp(-(e - mean) / std))))
            else:
                norm_seq.append(-1.0 / (1.0 + (exp(-(-e - mean) / std))))
        return norm_seq

    def boyle_norm(self, sequence):
        """
        Normalizes a sequence according to Boyle's criterion.
        This represents a within-dataset normalization.

        Keyword arguments:
        sequence -- Input sequence.

        Return:
        norm_seq -- Normalized sequence.
        """
        mean = array([e for e in sequence if e > 0]).mean()
        if isnan(mean):
            return sequence
        else:
            norm_seq = [(float(e) / mean) for e in sequence]
            return norm_seq

    def savitzky_golay_coefficients(self, window_size, order, deriv):
        """
        Evaluate the Savitzky-Golay coefficients in order to evaluate the slope of the signal.
        It uses a window_size (of the interpolation), order (of the polynomial), deriv (derivative needed).

        Keyword arguments:
        window_size -- Size of the window for function interpolation.
        order -- Order of polynomial.
        deriv -- Derivative.

        Return:
        m[::-1] -- The Savitzky-Golay coefficients.
        """

        # Get statistics
        # try: # TODO ERRORS
        window_size = abs(int(window_size))
        order = abs(int(order))
        # except ValueError, msg:
        #    raise ValueError("windowSize and order have to be of type int")
        # if windowSize % 2 != 1 or windowSize < 1:
        #    raise TypeError("windowSize size must be a positive odd number")
        # if windowSize < order + 2:
        #    raise TypeError("windowSize is too small for the polynomials order")
        order_range = range(order + 1)
        half_window = (window_size - 1) // 2

        # Precompute Coefficients
        b = mat([[k ** i for i in order_range] for k in range(-half_window, half_window + 1)])
        m = linalg.pinv(b).A[deriv]
        return m[::-1]

    def slope(self, sequence, sg_coefs):
        """
        Evaluates the slope of sequence given the sg_coefs loaded.

        Keyword arguments:
        sequence -- Input sequence.
        sg_coefs -- Savitzky-Golay coefficients.

        Return:
        slope_seq -- Slope sequence.
        """
        slope_seq = convolve(sequence, sg_coefs)
        slope_seq = [e for e in slope_seq[(len(sg_coefs) / 2):(len(slope_seq) - (len(sg_coefs) / 2))]]

        return slope_seq

    def print_signal(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                     initial_clip=1000, per_norm=98, per_slope=98, bias_table=None, genome_file_name=None,
                     raw_signal_file=None, bc_signal_file=None, norm_signal_file=None, strand_specific=False):

        if raw_signal_file:
            pileup_region = PileupRegion(start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift)
            if ps_version == "0.7.5":
                self.bam.fetch(reference=ref, start=start, end=end, callback=pileup_region)
            else:
                iter = self.bam.fetch(reference=ref, start=start, end=end)
                for alignment in iter:
                    pileup_region.__call__(alignment)
            raw_signal = array([min(e, initial_clip) for e in pileup_region.vector])

            f = open(raw_signal_file, "a")
            f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                [str(e) for e in nan_to_num(raw_signal)]) + "\n")
            f.close()

        if bc_signal_file or norm_signal_file:
            # Parameters
            window = 50
            defaultKmerValue = 1.0

            # Initialization
            fasta = Fastafile(genome_file_name)
            fBiasDict = bias_table[0]
            rBiasDict = bias_table[1]
            k_nb = len(fBiasDict.keys()[0])
            p1 = start
            p2 = end
            p1_w = p1 - (window / 2)
            p2_w = p2 + (window / 2)
            p1_wk = p1_w - int(k_nb / 2.)
            p2_wk = p2_w + int(k_nb / 2.)

            currStr = str(fasta.fetch(ref, p1_wk, p2_wk - 1)).upper()
            currRevComp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + 1, p2_wk)).upper())

            # Iterating on sequence to create the bias signal
            signal_bias_f = []
            signal_bias_r = []
            for i in range(int(k_nb / 2.), len(currStr) - int(k_nb / 2) + 1):
                fseq = currStr[i - int(k_nb / 2.):i + int(k_nb / 2.)]
                rseq = currRevComp[len(currStr) - int(k_nb / 2.) - i:len(currStr) + int(k_nb / 2.) - i]
                try:
                    signal_bias_f.append(fBiasDict[fseq])
                except Exception:
                    signal_bias_f.append(defaultKmerValue)
                try:
                    signal_bias_r.append(rBiasDict[rseq])
                except Exception:
                    signal_bias_r.append(defaultKmerValue)

            # Raw counts
            signal_raw_f = [0.0] * (p2_w - p1_w)
            signal_raw_r = [0.0] * (p2_w - p1_w)
            for read in self.bam.fetch(ref, p1_w, p2_w):
                if not read.is_reverse:
                    cut_site = read.pos + forward_shift
                    if p1_w <= cut_site < p2_w:
                        signal_raw_f[cut_site - p1_w] += 1.0
                else:
                    cut_site = read.aend + reverse_shift - 1
                    if p1_w <= cut_site < p2_w:
                        signal_raw_r[cut_site - p1_w] += 1.0

            # Smoothed counts
            Nf = []
            Nr = []
            fSum = sum(signal_raw_f[:window])
            rSum = sum(signal_raw_r[:window])
            fLast = signal_raw_f[0]
            rLast = signal_raw_r[0]
            for i in range((window / 2), len(signal_raw_f) - (window / 2)):
                Nf.append(fSum)
                Nr.append(rSum)
                fSum -= fLast
                fSum += signal_raw_f[i + (window / 2)]
                fLast = signal_raw_f[i - (window / 2) + 1]
                rSum -= rLast
                rSum += signal_raw_r[i + (window / 2)]
                rLast = signal_raw_r[i - (window / 2) + 1]

            # Calculating bias and writing to wig file
            fSum = sum(signal_bias_f[:window])
            rSum = sum(signal_bias_r[:window])
            fLast = signal_bias_f[0]
            rLast = signal_bias_r[0]
            signal_bc = []
            signal_bc_f = []
            signal_bc_r = []
            for i in range((window / 2), len(signal_bias_f) - (window / 2)):
                nhatf = Nf[i - (window / 2)] * (signal_bias_f[i] / fSum)
                nhatr = Nr[i - (window / 2)] * (signal_bias_r[i] / rSum)
                signal_bc.append(nhatf + nhatr)
                signal_bc_f.append(nhatf)
                signal_bc_r.append(nhatr)
                fSum -= fLast
                fSum += signal_bias_f[i + (window / 2)]
                fLast = signal_bias_f[i - (window / 2) + 1]
                rSum -= rLast
                rSum += signal_bias_r[i + (window / 2)]
                rLast = signal_bias_r[i - (window / 2) + 1]

            if bc_signal_file:
                f = open(bc_signal_file, "a")
                f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                    [str(e) for e in nan_to_num(signal_bc)]) + "\n")
                f.close()

                if strand_specific:
                    prefix = bc_signal_file.split(".")[0]
                    bc_signal_file_f = prefix + "_Forward" + ".bc.wig"
                    bc_signal_file_r = prefix + "_Reverse" + ".bc.wig"
                    f = open(bc_signal_file_f, "a")
                    f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                        [str(e) for e in nan_to_num(signal_bc_f)]) + "\n")
                    f.close()
                    f = open(bc_signal_file_r, "a")
                    f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                        [str(e) for e in nan_to_num(signal_bc_r)]) + "\n")
                    f.close()

            if norm_signal_file:
                norm_signal_bc = self.boyle_norm(signal_bc)
                perc = scoreatpercentile(norm_signal_bc, 98)
                std = np.std(norm_signal_bc)
                norm_signal_bc = self.hon_norm_atac(norm_signal_bc, perc, std)
                f = open(norm_signal_file, "a")
                f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                    [str(e) for e in nan_to_num(norm_signal_bc)]) + "\n")
                f.close()

                if strand_specific:
                    prefix = norm_signal_file.split(".")[0]
                    norm_signal_file_f = prefix + "_Forward" + ".norm.wig"
                    norm_signal_file_r = prefix + "_Reverse" + ".norm.wig"

                    signal_norm_f = self.boyle_norm(signal_bc_f)
                    perc = scoreatpercentile(signal_norm_f, 98)
                    std = np.std(signal_norm_f)
                    signal_norm_f = self.hon_norm_atac(signal_norm_f, perc, std)

                    signal_norm_r = self.boyle_norm(signal_bc_r)
                    perc = scoreatpercentile(signal_norm_r, 98)
                    std = np.std(signal_norm_r)
                    signal_norm_r = self.hon_norm_atac(signal_norm_r, perc, std)

                    f = open(norm_signal_file_f, "a")
                    f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                        [str(e) for e in nan_to_num(signal_norm_f)]) + "\n")
                    f.close()
                    f = open(norm_signal_file_r, "a")
                    f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                        [str(e) for e in nan_to_num(signal_norm_r)]) + "\n")
                    f.close()

    def get_raw_signal_by_fragment_length(self, ref, start, end, bam,
                                          forward_shift, reverse_shift, min_length=None, max_length=None,
                                          strand=True):

        p1 = start
        p2 = end
        raw_f = [0.0] * (p2 - p1)
        raw_r = [0.0] * (p2 - p1)

        if min_length is None and max_length is None:
            for read in bam.fetch(ref, p1, p2):
                # check if the read is unmapped, according to issue #112
                if read.is_unmapped:
                    continue

                if not read.is_reverse:
                    cut_site = read.pos + forward_shift
                    if p1 <= cut_site < p2:
                        raw_f[cut_site - p1] += 1.0
                else:
                    cut_site = read.aend + reverse_shift - 1
                    if p1 <= cut_site < p2:
                        raw_r[cut_site - p1] += 1.0
        elif min_length is None and max_length is not None:
            for read in bam.fetch(ref, p1, p2):
                # check if the read is unmapped, according to issue #112
                if read.is_unmapped:
                    continue

                if abs(read.template_length) <= max_length:
                    if not read.is_reverse:
                        cut_site = read.pos + forward_shift
                        if p1 <= cut_site < p2:
                            raw_f[cut_site - p1] += 1.0
                    else:
                        cut_site = read.aend + reverse_shift - 1
                        if p1 <= cut_site < p2:
                            raw_r[cut_site - p1] += 1.0
        elif min_length is not None and max_length is None:
            for read in bam.fetch(ref, p1, p2):
                # check if the read is unmapped, according to issue #112
                if read.is_unmapped:
                    continue

                if abs(read.template_length) > min_length:
                    if not read.is_reverse:
                        cut_site = read.pos + forward_shift
                        if p1 <= cut_site < p2:
                            raw_f[cut_site - p1] += 1.0
                    else:
                        cut_site = read.aend + reverse_shift - 1
                        if p1 <= cut_site < p2:
                            raw_r[cut_site - p1] += 1.0
        elif min_length is not None and max_length is not None:
            for read in bam.fetch(ref, p1, p2):
                # check if the read is unmapped, according to issue #112
                if read.is_unmapped:
                    continue

                if min_length <= abs(read.template_length) <= max_length:
                    if not read.is_reverse:
                        cut_site = read.pos + forward_shift
                        if p1 <= cut_site < p2:
                            raw_f[cut_site - p1] += 1.0
                    else:
                        cut_site = read.aend + reverse_shift - 1
                        if p1 <= cut_site < p2:
                            raw_r[cut_site - p1] += 1.0
        if strand:
            return np.array(raw_f), np.array(raw_r)
        else:
            return np.add(np.array(raw_f), np.array(raw_r))

    def get_bc_signal_by_fragment_length(self, ref, start, end, bam, fasta, bias_table,
                                         forward_shift, reverse_shift, min_length=None, max_length=None,
                                         strand=True):
        # Parameters
        window = 50
        defaultKmerValue = 1.0

        # Initialization
        fBiasDict = bias_table[0]
        rBiasDict = bias_table[1]
        k_nb = len(fBiasDict.keys()[0])
        p1 = start
        p2 = end
        p1_w = p1 - (window / 2)
        p2_w = p2 + (window / 2)
        p1_wk = p1_w - int(k_nb / 2.)
        p2_wk = p2_w + int(k_nb / 2.)

        if p1 <= 0 or p1_w <= 0 or p1_wk <= 0:
            # Return raw counts (the region is too close to the chromosome start
            # for the windowed bias correction; p1_wk is the left-most coordinate used)
            signal = [0.0] * (p2 - p1)
            for read in bam.fetch(ref, p1, p2):
                # check if the read is unmapped, according to issue #112
                if read.is_unmapped:
                    continue

                if not read.is_reverse:
                    cut_site = read.pos + forward_shift
                    if p1 <= cut_site < p2:
                        signal[cut_site - p1] += 1.0
                else:
                    cut_site = read.aend + reverse_shift - 1
                    if p1 <= cut_site < p2:
                        signal[cut_site - p1] += 1.0

            return signal

        currStr = str(fasta.fetch(ref, p1_wk, p2_wk - 1)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + 1, p2_wk)).upper())

        # Iterating on sequence to create the bias signal
        signal_bias_f = []
        signal_bias_r = []
        for i in range(int(k_nb / 2.), len(currStr) - int(k_nb / 2) + 1):
            fseq = currStr[i - int(k_nb / 2.):i + int(k_nb / 2.)]
            rseq = currRevComp[len(currStr) - int(k_nb / 2.) - i:len(currStr) + int(k_nb / 2.) - i]
            try:
                signal_bias_f.append(fBiasDict[fseq])
            except Exception:
                signal_bias_f.append(defaultKmerValue)
            try:
                signal_bias_r.append(rBiasDict[rseq])
            except Exception:
                signal_bias_r.append(defaultKmerValue)

        # Raw counts
        raw_f = [0.0] * (p2_w - p1_w)
        raw_r = [0.0] * (p2_w - p1_w)

        if min_length is None and max_length is None:
            for read in bam.fetch(ref, p1_w, p2_w):
                # check if the read is unmapped, according to issue #112
                if read.is_unmapped:
                    continue

                if not read.is_reverse:
                    cut_site = read.pos + forward_shift
                    if p1_w <= cut_site < p2_w:
                        raw_f[cut_site - p1_w] += 1.0
                else:
                    cut_site = read.aend + reverse_shift - 1
                    if p1_w <= cut_site < p2_w:
                        raw_r[cut_site - p1_w] += 1.0
        elif min_length is None and max_length is not None:
            for read in bam.fetch(ref, p1_w, p2_w):
                # check if the read is unmapped, according to issue #112
                if read.is_unmapped:
                    continue

                if abs(read.template_length) <= max_length:
                    if not read.is_reverse:
                        cut_site = read.pos + forward_shift
                        if p1_w <= cut_site < p2_w:
                            raw_f[cut_site - p1_w] += 1.0
                    else:
                        cut_site = read.aend + reverse_shift - 1
                        if p1_w <= cut_site < p2_w:
                            raw_r[cut_site - p1_w] += 1.0
        elif min_length is not None and max_length is None:
            for read in bam.fetch(ref, p1_w, p2_w):
                # check if the read is unmapped, according to issue #112
                if read.is_unmapped:
                    continue

                if abs(read.template_length) > min_length:
                    if not read.is_reverse:
                        cut_site = read.pos + forward_shift
                        if p1_w <= cut_site < p2_w:
                            raw_f[cut_site - p1_w] += 1.0
                    else:
                        cut_site = read.aend + reverse_shift - 1
                        if p1_w <= cut_site < p2_w:
                            raw_r[cut_site - p1_w] += 1.0
        elif min_length is not None and max_length is not None:
            for read in bam.fetch(ref, p1_w, p2_w):
                # check if the read is unmapped, according to issue #112
                if read.is_unmapped:
                    continue

                if min_length < abs(read.template_length) <= max_length:
                    if not read.is_reverse:
                        cut_site = read.pos + forward_shift
                        if p1_w <= cut_site < p2_w:
                            raw_f[cut_site - p1_w] += 1.0
                    else:
                        cut_site = read.aend + reverse_shift - 1
                        if p1_w <= cut_site < p2_w:
                            raw_r[cut_site - p1_w] += 1.0

        # Smoothed counts
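        # (rolling sums of length `window`: each step drops the left-most raw count
        # and adds the next one, so each position costs O(1))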
        Nf = []
        Nr = []
        fSum = sum(raw_f[:window])
        rSum = sum(raw_r[:window])
        fLast = raw_f[0]
        rLast = raw_r[0]
        for i in range((window / 2), len(raw_f) - (window / 2)):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast
            fSum += raw_f[i + (window / 2)]
            fLast = raw_f[i - (window / 2) + 1]
            rSum -= rLast
            rSum += raw_r[i + (window / 2)]
            rLast = raw_r[i - (window / 2) + 1]

        # Calculating the bias-corrected signal
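        # nhat = (raw reads in the surrounding window) * (bias weight at this
        # position / total bias weight in the window): the cleavage count expected
        # from sequence bias alone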
        fSum = sum(signal_bias_f[:window])
        rSum = sum(signal_bias_r[:window])
        fLast = signal_bias_f[0]
        rLast = signal_bias_r[0]
        bc_f = []
        bc_r = []
        for i in range((window / 2), len(signal_bias_f) - (window / 2)):
            nhatf = Nf[i - (window / 2)] * (signal_bias_f[i] / fSum)
            nhatr = Nr[i - (window / 2)] * (signal_bias_r[i] / rSum)
            bc_f.append(nhatf)
            bc_r.append(nhatr)
            fSum -= fLast
            fSum += signal_bias_f[i + (window / 2)]
            fLast = signal_bias_f[i - (window / 2) + 1]
            rSum -= rLast
            rSum += signal_bias_r[i + (window / 2)]
            rLast = signal_bias_r[i - (window / 2) + 1]

        if strand:
            return np.array(bc_f), np.array(bc_r)
        else:
            return np.add(np.array(bc_f), np.array(bc_r))

    def get_bias_raw_bc_signal(self, ref, start, end, bam, fasta, bias_table, forward_shift, reverse_shift,
                               strand=False):
        # Parameters
        window = 50
        defaultKmerValue = 1.0

        # Initialization
        fBiasDict = bias_table[0]
        rBiasDict = bias_table[1]
        k_nb = len(fBiasDict.keys()[0])
        p1 = start
        p2 = end
        p1_w = p1 - (window / 2)
        p2_w = p2 + (window / 2)
        p1_wk = p1_w - int(k_nb / 2.)
        p2_wk = p2_w + int(k_nb / 2.)

        if p1 <= 0 or p1_w <= 0 or p1_wk <= 0:
            # Return raw counts (the region is too close to the chromosome start)
            signal = [0.0] * (p2 - p1)
            for read in bam.fetch(ref, p1, p2):
                # check if the read is unmapped, according to issue #112
                if read.is_unmapped:
                    continue

                if not read.is_reverse:
                    cut_site = read.pos + forward_shift
                    if p1 <= cut_site < p2:
                        signal[cut_site - p1] += 1.0
                else:
                    cut_site = read.aend + reverse_shift - 1
                    if p1 <= cut_site < p2:
                        signal[cut_site - p1] += 1.0

            return signal

        currStr = str(fasta.fetch(ref, p1_wk - 1 + forward_shift, p2_wk - 2 + forward_shift)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + reverse_shift + 2,
                                                                 p2_wk + reverse_shift + 1)).upper())

        # Iterating on sequence to create the bias signal
        signal_bias_f = []
        signal_bias_r = []
        for i in range(int(k_nb / 2.), len(currStr) - int(k_nb / 2) + 1):
            fseq = currStr[i - int(k_nb / 2.):i + int(k_nb / 2.)]
            rseq = currRevComp[len(currStr) - int(k_nb / 2.) - i:len(currStr) + int(k_nb / 2.) - i]
            try:
                signal_bias_f.append(fBiasDict[fseq])
            except Exception:
                signal_bias_f.append(defaultKmerValue)
            try:
                signal_bias_r.append(rBiasDict[rseq])
            except Exception:
                signal_bias_r.append(defaultKmerValue)

        # Raw counts
        signal_raw_f = [0.0] * (p2_w - p1_w)
        signal_raw_r = [0.0] * (p2_w - p1_w)
        for read in bam.fetch(ref, p1_w, p2_w):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1_w <= cut_site < p2_w:
                    signal_raw_f[cut_site - p1_w] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1_w <= cut_site < p2_w:
                    signal_raw_r[cut_site - p1_w] += 1.0

        # Smoothed counts
        Nf = []
        Nr = []
        fSum = sum(signal_raw_f[:window])
        rSum = sum(signal_raw_r[:window])
        fLast = signal_raw_f[0]
        rLast = signal_raw_r[0]
        for i in range((window / 2), len(signal_raw_f) - (window / 2)):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast
            fSum += signal_raw_f[i + (window / 2)]
            fLast = signal_raw_f[i - (window / 2) + 1]
            rSum -= rLast
            rSum += signal_raw_r[i + (window / 2)]
            rLast = signal_raw_r[i - (window / 2) + 1]

        # Calculating the bias-corrected signal
        fSum = sum(signal_bias_f[:window])
        rSum = sum(signal_bias_r[:window])
        fLast = signal_bias_f[0]
        rLast = signal_bias_r[0]
        bias_f = []
        bias_r = []
        raw = []
        raw_f = []
        raw_r = []
        bc = []
        bc_f = []
        bc_r = []
        for i in range((window / 2), len(signal_bias_f) - (window / 2)):
            nhatf = Nf[i - (window / 2)] * (signal_bias_f[i] / fSum)
            nhatr = Nr[i - (window / 2)] * (signal_bias_r[i] / rSum)
            bias_f.append(signal_bias_f[i])
            bias_r.append(signal_bias_r[i])
            raw.append(signal_raw_f[i] + signal_raw_r[i])
            raw_f.append(signal_raw_f[i])
            raw_r.append(signal_raw_r[i])
            # zf = (signal_raw_f[i]) / (signal_bias_f[i])
            # zr = (signal_raw_r[i]) / (signal_bias_r[i])
            bc.append(nhatf + nhatr)
            bc_f.append(nhatf)
            bc_r.append(nhatr)
            fSum -= fLast
            fSum += signal_bias_f[i + (window / 2)]
            fLast = signal_bias_f[i - (window / 2) + 1]
            rSum -= rLast
            rSum += signal_bias_r[i + (window / 2)]
            rLast = signal_bias_r[i - (window / 2) + 1]

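        # Recompute the bias track from the unshifted sequence (the fetch above
        # applied the read shifts); presumably this unshifted bias is what is returned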
        currStr = str(fasta.fetch(ref, p1_wk, p2_wk - 1)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + 1, p2_wk)).upper())

        # Iterating on sequence to create the bias signal
        signal_bias_f = []
        signal_bias_r = []
        for i in range(int(k_nb / 2.), len(currStr) - int(k_nb / 2) + 1):
            fseq = currStr[i - int(k_nb / 2.):i + int(k_nb / 2.)]
            rseq = currRevComp[len(currStr) - int(k_nb / 2.) - i:len(currStr) + int(k_nb / 2.) - i]
            try:
                signal_bias_f.append(fBiasDict[fseq])
            except Exception:
                signal_bias_f.append(defaultKmerValue)
            try:
                signal_bias_r.append(rBiasDict[rseq])
            except Exception:
                signal_bias_r.append(defaultKmerValue)

        bias_f = []
        bias_r = []
        for i in range((window / 2), len(signal_bias_f) - (window / 2)):
            bias_f.append(signal_bias_f[i])
            bias_r.append(signal_bias_r[i])

        if strand:
            return bias_f, bias_r, raw, raw_f, raw_r, bc, bc_f, bc_r
        else:
            return bias_f, bias_r, raw, bc
Example #32
    # Input
    inputStag1FileName = tempLocation + "inputStag1FileName.txt"
    inputStag2FileName = tempLocation + "inputStag2FileName.txt"
    outputRegionFileName = ol + "region_fold_change.pdf"
    outputSignalFileName = ol + "signal_fold_change.pdf"

    # Iterating on STAG1
    stag1File = open(stag1FileName, "rU")
    inputStag1File = open(inputStag1FileName, "w")
    stag1File.readline()
    for line in stag1File:
        ll = line.strip().split("\t")
        sp = ll[7].split(":")
        if (sp[0] == "ENHANCER"):
            regionsFetch = regionsFile.fetch(ll[0], int(ll[5]), int(ll[6]))
            seFlag = False
            for read in regionsFetch:
                if (read.qname.split(":")[0] == "SUPERENHANCER"):
                    seFlag = True
                    break
            if (seFlag): continue
        if (sp[2] == "."): name = sp[0]
        else: name = sp[2] + "_" + sp[0]
        score = ll[3]
        ctcfScore = ll[4]
        inputStag1File.write("\t".join([name, score]) + "\n")
    stag1File.close()
    inputStag1File.close()

    # Iterating on STAG2
Example #33
        start = ll[4]
        end = ll[5]
        tss1 = str(max(int(ll[5]) - 0, 0))
        tss2 = str(int(ll[5]) + ext)
        tes1 = str(max(int(ll[4]) - ext, 0))
        tes2 = str(int(ll[4]) + 0)
        p1 = start
        p2 = tss2
    try:
        geneSymbol = aliasDict[ensg]
    except Exception:
        geneSymbol = ensg

    # Active status
    activeStatus = "INACTIVE"
    h3k4me3Fetch = h3k4me3File.fetch(chrom, int(p1), int(p2))
    mSum = sum(1 for _ in h3k4me3Fetch)
    h3k27acFetch = h3k27acFile.fetch(chrom, int(p1), int(p2))
    aSum = sum(1 for _ in h3k27acFetch)
    if (mSum > 0 or aSum > 0): activeStatus = "ACTIVE"

    geneDict[ensg] = [
        chrom, start, end, "GENE:" + geneSymbol + ":" + activeStatus,
        str(int(float(score))), strand
    ]
    tssDict[ensg] = [
        chrom, tss1, tss2, "PROMOTER:" + geneSymbol + ":" + activeStatus,
        str(int(float(score))), strand
    ]
    ttsDict[ensg] = [
        chrom, tes1, tes2, "TTS:" + geneSymbol + ":" + activeStatus,
Example #34
def parse_gem_3c(f_name, out_file, genome_lengths, frags, verbose=False,
                 tmp_format=False, **kwargs):
    """
    Parse a GEM 3C SAM file using pysam tools.

    :param f_name: path to the SAM file corresponding to the mapping of reads
    :param out_file: path to the output file (tab-separated) containing paired-read information
    :param genome_lengths: a dictionary containing the length of the genomic sequence
                           per chromosome
    :param frags: per-chromosome, chunk-indexed lists of sorted restriction-site positions
    :param False verbose: if True, print the progress of the merge-sort step
    :param False tmp_format: if True, leave the file prepared to be merged with other map files
    """

    frag_chunk = kwargs.get('frag_chunk', 100000)
    try:
        fhandler = Samfile(f_name)
    except IOError:
        raise Exception('ERROR: file "%s" not found' % f_name)

    # max number of reads in buffer
    max_size = 1000000

    # map reference ids to chromosome names (via getrname)
    i = 0
    crm_dict = {}
    while True:
        try:
            crm_dict[i] = fhandler.getrname(i)
            i += 1
        except ValueError:
            break
    # iteration over reads
    sub_count = 0
    nfile = 0
    tmp_files = []
    reads = []
    cur_name = ''
    write_pairs = False
    read1 = None
    read2 = []
    samiter = fhandler.fetch(until_eof=True)
    r = None
    try:
        r = next(samiter)
    except StopIteration:
        # empty SAM file
        return None
    while r:
        if not r.is_paired or r.is_unmapped or r.mapq < 4:
            try:
                r = next(samiter)
            except StopIteration:
                break
            continue

        if r.is_read1 and cur_name != r.qname:
            if read1 is None:
                read1 = r
                cur_name = r.qname
                try:
                    r = next(samiter)
                except StopIteration:
                    break
                continue
            else:
                write_pairs = True

        if not write_pairs:
            if r.is_read2 or r.is_supplementary:
                read2.append(r)
                try:
                    r = next(samiter)
                except StopIteration:
                    break
                continue
        else:
            if not read2:
                write_pairs = False
                read1 = None
                try:
                    r = next(samiter)
                except StopIteration:
                    break
                continue
            reads_grp = []
            read_id = read1.query_name
            for read in [read1]+read2:
                if read.query_name != read_id:
                    continue
                positive = not read.is_reverse
                crm      = crm_dict[read.tid]
                len_seq  = read.reference_end-read.pos
                if positive:
                    pos = read.pos + 1
                else:
                    pos = read.pos + len_seq
                try:
                    frag_piece = frags[crm][pos // frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    read_multi = []
                    break
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos // frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re    = frag_piece[idx]
                prev_re    = frag_piece[idx - 1 if idx else 0]
                reads_grp.append([read.tid, crm, pos, positive,
                                  len_seq, prev_re, next_re])
            if len(reads_grp) > 2:
                _merge_multis(reads_grp)
            elif len(reads_grp) < 2:
                reads_grp = []
            reads_multi = []
            for paired_reads in combinations(reads_grp, 2):
                read_multi = [item for sublist in sorted(paired_reads,key = lambda x: (x[0], x[2]))
                              for item in sublist]
                if read_multi:
                    reads_multi.append(read_multi)
                sub_count += 1

            paired_total = len(reads_multi)
            paired_nbr = 0
            for pair_read in reads_multi:
                read_name_id = read_id
                paired_nbr += 1
                if paired_total > 1:
                    read_name_id += '#%d/%d' % (paired_nbr,paired_total)
                reads.append([read_name_id]+pair_read)

            if sub_count >= max_size:
                sub_count = 0
                nfile += 1
                reads = sorted(reads, key = lambda x: (x[1], x[3], x[8], x[10]))
                read_lines = ['%s\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n'
                              % tuple(read) for read in reads]
                write_paired_reads_to_file(read_lines, out_file, tmp_files, nfile)
                #map_out.write('\n'.join(reads)+'\n')
                del reads[:]
            write_pairs = False
            read1 = None
            del read2[:]
    if reads:
        nfile += 1
        reads = sorted(reads, key = lambda x: (x[1], x[3], x[8], x[10]))
        read_lines = ['%s\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n'
                      % tuple(read) for read in reads]
        write_paired_reads_to_file(read_lines, out_file, tmp_files, nfile)
        #map_out.write('\n'.join(reads))

    #map_out.close()
    # we now have sorted temporary files
    # we do a merge sort for each pair
    if verbose:
        stdout.write('Merge sort')
        stdout.flush()
    while len(tmp_files) > 1:
        file1 = tmp_files.pop(0)
        try:
            file2 = tmp_files.pop(0)
        except IndexError:
            break
        if verbose:
            stdout.write('.')
            stdout.flush()
        nfile += 1
        tmp_files.append(merge_sort(file1, file2, out_file, nfile, paired=True))
    if verbose:
        stdout.write('\n')

    if tmp_format:
        os.rename(tmp_files[0], out_file)
    else:
        map_out   = open(out_file, 'w')
        tmp_reads_fh = open(tmp_files[0],'rb')
        for crm in genome_lengths:
            map_out.write('# CRM %s\t%d\n' % (crm, genome_lengths[crm]))
        for read_line in tmp_reads_fh:
            read = read_line.split('\t')
            map_out.write('\t'.join([read[0]]+read[2:8]+read[9:]))
        map_out.close()
        os.remove(tmp_files[0])

    return out_file
Example #35
def estimate_bias_pwm(args):
    # Parameters
    max_duplicates = 100

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    obs_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    obs_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])

    # Iterating on HS regions
    for region in regions:
        # Initialization
        prev_pos = -1
        true_counter = 0

        # Evaluating observed frequencies
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prev_pos:
                true_counter += 1
            else:
                prev_pos = p1
                true_counter = 0
            if true_counter > max_duplicates: continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                for i in range(0, len(currStr)):
                    obs_f_pwm_dict[currStr[i]][i] += 1
            else:
                for i in range(0, len(currStr)):
                    obs_r_pwm_dict[currStr[i]][i] += 1

        # Evaluating expected frequencies
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue

        # Iterating on each sequence position
        for i in range(0, len(currStr) - args.k_nb):
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            for j in range(0, len(s)):
                exp_f_pwm_dict[s[j]][j] += 1

            # Counting k-mer in dictionary for reverse complement
            s = AuxiliaryFunctions.revcomp(s)
            for j in range(0, len(s)):
                exp_r_pwm_dict[s[j]][j] += 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Output pwms
    os.system("mkdir -p " + os.path.join(args.output_location, "pfm"))
    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]
    pwm_file_list = []
    pwm_obs_f = os.path.join(args.output_location, "pfm", "obs_{}_f.pfm".format(str(args.k_nb)))
    pwm_obs_r = os.path.join(args.output_location, "pfm", "obs_{}_r.pfm".format(str(args.k_nb)))
    pwm_exp_f = os.path.join(args.output_location, "pfm", "exp_{}_f.pfm".format(str(args.k_nb)))
    pwm_exp_r = os.path.join(args.output_location, "pfm", "exp_{}_r.pfm".format(str(args.k_nb)))

    pwm_file_list.append(pwm_obs_f)
    pwm_file_list.append(pwm_obs_r)
    pwm_file_list.append(pwm_exp_f)
    pwm_file_list.append(pwm_exp_r)

    for i in range(len(pwm_dict_list)):
        with open(pwm_file_list[i], "w") as pwm_file:
            for e in ["A", "C", "G", "T"]:
                pwm_file.write(" ".join([str(int(f)) for f in pwm_dict_list[i][e]]) + "\n")

    motif_obs_f = motifs.read(open(pwm_obs_f), "pfm")
    motif_obs_r = motifs.read(open(pwm_obs_r), "pfm")
    motif_exp_f = motifs.read(open(pwm_exp_f), "pfm")
    motif_exp_r = motifs.read(open(pwm_exp_r), "pfm")

    # Output logos
    os.system("mkdir -p " + os.path.join(args.output_location, "logo"))
    logo_obs_f = os.path.join(args.output_location, "logo", "obs_{}_f.pdf".format(str(args.k_nb)))
    logo_obs_r = os.path.join(args.output_location, "logo", "obs_{}_r.pdf".format(str(args.k_nb)))
    logo_exp_f = os.path.join(args.output_location, "logo", "exp_{}_f.pdf".format(str(args.k_nb)))
    logo_exp_r = os.path.join(args.output_location, "logo", "exp_{}_r.pdf".format(str(args.k_nb)))

    motif_obs_f.weblogo(logo_obs_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_obs_r.weblogo(logo_obs_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_exp_f.weblogo(logo_exp_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)
    motif_exp_r.weblogo(logo_exp_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)

    # Creating bias dictionary
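    # The bias of each k-mer is the ratio between its probability under the
    # observed (cleavage-centred) PPM and under the expected (background) PPM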
    alphabet = ["A", "C", "G", "T"]
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
    bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
    for k_mer in k_mer_comb:
        obs_f = get_ppm_score(k_mer, motif_obs_f.pwm, args.k_nb)
        exp_f = get_ppm_score(k_mer, motif_exp_f.pwm, args.k_nb)
        bias_table_F[k_mer] = round(obs_f / exp_f, 6)
        obs_r = get_ppm_score(k_mer, motif_obs_r.pwm, args.k_nb)
        exp_r = get_ppm_score(k_mer, motif_exp_r.pwm, args.k_nb)
        bias_table_R[k_mer] = round(obs_r / exp_r, 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
Example #36
    # Iterating on AB locations
    for tline in abTreatFile:

        # Initialization
        cline = abControlFile.readline()
        tt = tline.strip().split("\t")
        cc = cline.strip().split("\t")
        chrom = tt[0]
        p1 = int(tt[1])
        p2 = int(tt[2])
        tcomp = tt[3]
        ccomp = cc[3]

        # Fetching TADs
        tfetch = treatTadFile.fetch(chrom, p1, p2)
        ttadList = []
        for read in tfetch:
            ttadList.append([read.reference_start, read.reference_end])
        cfetch = controlTadFile.fetch(chrom, p1, p2)  # assumed control-TAD file handle
        ctadList = []
        for read in cfetch:
            ctadList.append([read.reference_start, read.reference_end])

        # Iterating on TADs treatment
        treatIntraCount = 0.0
        treatInterCount = 0.0
        flagTtad = True
        try:
            prevTad = ttadList[0]
            prevSum, prevAvg = summ_avg_tad_interaction(
Example #37
class GenomicSignal:
    """
    Represents a genomic signal. It should be used to fetch normalized and slope
    signals from a bam file.
    Usage:
    1. Initialize class.
    2. Call load_sg_coefs once.
    3. Call get_signal as many times as needed.

    Authors: Eduardo G. Gusmao.
    """

    def __init__(self, file_name):
        """ 
        Initializes GenomicSignal.
        """
        self.file_name = file_name
        self.sg_coefs = None
        self.bam = Samfile(file_name, "rb")

    def load_sg_coefs(self, slope_window_size):
        """ 
        Loads Savitzky-Golay coefficients into self.sg_coefs based on a slope_window_size.

        Keyword arguments:
        slope_window_size -- Window size of Savitzky-Golay coefficients.

        Return:
        None -- It updates self.sg_coefs.
        """
        self.sg_coefs = self.savitzky_golay_coefficients(slope_window_size, 2, 1)

    def get_tag_count(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                      initial_clip=1000):
        """
        Gets the tag count associated with self.bam based on start, end and ext.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        downstream_ext -- Number of bps to extend towards the downstream region (right for forward strand and left for reverse strand).
        upstream_ext -- Number of bps to extend towards the upstream region (left for forward strand and right for reverse strand).
        forward_shift -- Number of bps to shift the reads aligned to the forward strand. Can be a positive number for a shift towards the downstream region (towards the inside of the aligned read) and a negative number for a shift towards the upstream region.
        reverse_shift -- Number of bps to shift the reads aligned to the reverse strand. Can be a positive number for a shift towards the upstream region and a negative number for a shift towards the downstream region (towards the inside of the aligned read).
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.

        Return:
        tag_count -- Total signal.
        """

        # Fetch raw signal
        pileup_region = PileupRegion(start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift)
        if ps_version == "0.7.5":
            self.bam.fetch(reference=ref, start=start, end=end, callback=pileup_region)
        else:
            # avoid shadowing the built-in `iter`; the region object is callable
            for alignment in self.bam.fetch(reference=ref, start=start, end=end):
                pileup_region(alignment)
        raw_signal = array([min(e, initial_clip) for e in pileup_region.vector])

        # Std-based clipping
        mean = raw_signal.mean()
        std = raw_signal.std()
        clip_signal = [min(e, mean + (10 * std)) for e in raw_signal]

        # Tag count
        try:
            tag_count = sum(clip_signal)
        except Exception:
            tag_count = 0

        return tag_count

    def get_signal(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                   initial_clip=1000, per_norm=98, per_slope=98,
                   bias_table=None, genome_file_name=None, print_raw_signal=False,
                   print_bc_signal=False, print_norm_signal=False, print_slope_signal=False,
                   strands_specific=False):
        """
        Gets the signal associated with self.bam based on start, end and ext.
        initial_clip, per_norm and per_slope are used as normalization factors during the normalization
        and slope evaluation procedures.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.
        per_norm -- Percentile value for 'hon_norm' function of the normalized signal.
        per_slope -- Percentile value for 'hon_norm' function of the slope signal.
        bias_table -- Bias table to perform bias correction.
        genome_file_name -- Genome to perform bias correction.
        downstream_ext -- Number of bps to extend towards the downstream region
        (right for forward strand and left for reverse strand).
        upstream_ext -- Number of bps to extend towards the upstream region
        (left for forward strand and right for reverse strand).
        forward_shift -- Number of bps to shift the reads aligned to the forward strand.
        Can be a positive number for a shift towards the downstream region
        (towards the inside of the aligned read) and a negative number for a shift towards the upstream region.
        reverse_shift -- Number of bps to shift the reads aligned to the reverse strand.
        Can be a positive number for a shift towards the upstream region and a negative number
        for a shift towards the downstream region (towards the inside of the aligned read).

        Return:
        hon_signal -- Normalized signal.
        slopehon_signal -- Slope signal.
        """

        # Fetch raw signal
        pileup_region = PileupRegion(start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift)
        if ps_version == "0.7.5":
            self.bam.fetch(reference=ref, start=start, end=end, callback=pileup_region)
        else:
            for alignment in self.bam.fetch(reference=ref, start=start, end=end):
                pileup_region(alignment)
        raw_signal = array([min(e, initial_clip) for e in pileup_region.vector])

        # Std-based clipping
        mean = raw_signal.mean()
        std = raw_signal.std()
        clip_signal = [min(e, mean + (10 * std)) for e in raw_signal]

        # Cleavage bias correction
        bias_corrected_signal = self.bias_correction(clip_signal, bias_table, genome_file_name,
                                                     ref, start, end, forward_shift, reverse_shift, strands_specific)

        # Boyle normalization (within-dataset normalization)
        boyle_signal = array(self.boyle_norm(bias_corrected_signal))

        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(boyle_signal, per_norm)
        std = boyle_signal.std()
        hon_signal = self.hon_norm(boyle_signal, perc, std)

        # Slope signal
        slope_signal = self.slope(hon_signal, self.sg_coefs)

        # Hon normalization on slope signal (between-dataset slope smoothing)
        abs_seq = array([abs(e) for e in slope_signal])
        perc = scoreatpercentile(abs_seq, per_slope)
        std = abs_seq.std()
        slopehon_signal = self.hon_norm(slope_signal, perc, std)

        # Writing signal
        if (print_raw_signal):
            signal_file = open(print_raw_signal, "a")
            signal_file.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                [str(e) for e in nan_to_num(raw_signal)]) + "\n")
            signal_file.close()
        if (print_bc_signal):
            signal_file = open(print_bc_signal, "a")
            signal_file.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                [str(e) for e in nan_to_num(bias_corrected_signal)]) + "\n")
            signal_file.close()
        if (print_norm_signal):
            signal_file = open(print_norm_signal, "a")
            signal_file.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                [str(e) for e in nan_to_num(hon_signal)]) + "\n")
            signal_file.close()
        if (print_slope_signal):
            signal_file = open(print_slope_signal, "a")
            signal_file.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                [str(e) for e in nan_to_num(slope_signal)]) + "\n")
            signal_file.close()

        # Returning normalized and slope sequences
        return hon_signal, slopehon_signal

    def bias_correction(self, signal, bias_table, genome_file_name, chrName, start, end,
                        forward_shift, reverse_shift, strands_specific):
        """
        Performs bias correction.

        Keyword arguments:
        signal -- Input signal.
        bias_table -- Bias table.

        Return:
        bias_corrected_signal -- Bias-corrected sequence.
        """

        if (not bias_table): return signal

        # Parameters
        window = 50
        defaultKmerValue = 1.0

        # Initialization
        fastaFile = Fastafile(genome_file_name)
        fBiasDict = bias_table[0]
        rBiasDict = bias_table[1]
        k_nb = len(fBiasDict.keys()[0])
        p1 = start
        p2 = end
        p1_w = p1 - (window / 2)
        p2_w = p2 + (window / 2)
        p1_wk = p1_w - int(floor(k_nb / 2.))
        p2_wk = p2_w + int(ceil(k_nb / 2.))
        if (p1 <= 0 or p1_w <= 0 or p1_wk <= 0): return signal

        # Raw counts
        nf = [0.0] * (p2_w - p1_w)
        nr = [0.0] * (p2_w - p1_w)
        for read in self.bam.fetch(chrName, p1_w, p2_w):
            if (not read.is_reverse):
                cut_site = read.pos + forward_shift
                if cut_site >= start and cut_site < end:
                    nf[cut_site - p1_w] += 1.0
                    # for i in range(max(read.pos + forward_shift, start), min(read.pos + forward_shift + 1, end - 1)):
                    #    nf[i - start] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if cut_site >= start and cut_site < end:
                    nr[cut_site - p1_w] += 1.0
                    # for i in range(max(read.aend + reverse_shift - 1, start), min(read.aend + reverse_shift, end - 1)):
                    #    nr[i - start] += 1.0


        # Smoothed counts
        Nf = []
        Nr = []
        fSum = sum(nf[:window])
        rSum = sum(nr[:window])
        fLast = nf[0]
        rLast = nr[0]
        for i in range((window / 2), len(nf) - (window / 2)):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast
            fSum += nf[i + (window / 2)]
            fLast = nf[i - (window / 2) + 1]
            rSum -= rLast
            rSum += nr[i + (window / 2)]
            rLast = nr[i - (window / 2) + 1]

        # Fetching sequence
        currStr = str(fastaFile.fetch(chrName, p1_wk - 1, p2_wk - 2)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName, p1_wk + 2, p2_wk + 1)).upper())

        # Iterating on sequence to create signal
        af = []
        ar = []
        for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2)) + 1):
            fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
            rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(floor(k_nb / 2.)) - i]
            try:
                af.append(fBiasDict[fseq])
            except Exception:
                af.append(defaultKmerValue)
            try:
                ar.append(rBiasDict[rseq])
            except Exception:
                ar.append(defaultKmerValue)

        # Calculating the bias-corrected signal
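        # z = log(observed cuts + 1) - log(expected cuts + 1): the log fold-change
        # between the raw counts and the bias-predicted counts nhat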
        fSum = sum(af[:window])
        rSum = sum(ar[:window])
        fLast = af[0]
        rLast = ar[0]
        bias_corrected_signal = []
        bias_corrected_signal_forward = []
        bias_corrected_signal_reverse = []
        for i in range((window / 2), len(af) - (window / 2)):
            nhatf = Nf[i - (window / 2)] * (af[i] / fSum)
            nhatr = Nr[i - (window / 2)] * (ar[i] / rSum)
            zf = log(nf[i] + 1) - log(nhatf + 1)
            zr = log(nr[i] + 1) - log(nhatr + 1)
            bias_corrected_signal_forward.append(zf)
            bias_corrected_signal_reverse.append(zr)
            bias_corrected_signal.append(zf + zr)
            fSum -= fLast
            fSum += af[i + (window / 2)]
            fLast = af[i - (window / 2) + 1]
            rSum -= rLast
            rSum += ar[i + (window / 2)]
            rLast = ar[i - (window / 2) + 1]

        # Shifting each bias-corrected signal so that its minimum value is zero
        min_value = abs(min(bias_corrected_signal_forward))
        bias_fixed_signal_forward = [e + min_value for e in bias_corrected_signal_forward]

        min_value = abs(min(bias_corrected_signal_reverse))
        bias_fixed_signal_reverse = [e + min_value for e in bias_corrected_signal_reverse]

        min_value = abs(min(bias_corrected_signal))
        bias_fixed_signal = [e + min_value for e in bias_corrected_signal]

        # Termination
        fastaFile.close()
        if not strands_specific:
            return bias_corrected_signal
        else:
            return bias_fixed_signal_forward, bias_fixed_signal_reverse

    def hon_norm(self, sequence, mean, std):
        """
        Normalizes a sequence according to hon's criterion using mean and std.
        This represents a between-dataset normalization.

        Keyword arguments:
        sequence -- Input sequence.
        mean -- Global mean.
        std -- Global std.

        Return:
        norm_seq -- Normalized sequence.
        """

        if std == 0:
            # degenerate case: nothing to scale
            return list(sequence)
        norm_seq = []
        for e in sequence:
            if e == 0.0:
                norm_seq.append(0.0)
            elif e > 0.0:
                norm_seq.append(1.0 / (1.0 + exp(-(e - mean) / std)))
            else:
                norm_seq.append(-1.0 / (1.0 + exp(-(-e - mean) / std)))
        return norm_seq

    def boyle_norm(self, sequence):
        """
        Normalizes a sequence according to Boyle's criterion.
        This represents a within-dataset normalization.

        Keyword arguments:
        sequence -- Input sequence.

        Return:
        norm_seq -- Normalized sequence.
        """
        mean = array([e for e in sequence if e > 0]).mean()
        if isnan(mean):
            return sequence
        else:
            norm_seq = [(float(e) / mean) for e in sequence]
            return norm_seq

    def savitzky_golay_coefficients(self, window_size, order, deriv):
        """
        Evaluate the Savitzky-Golay coefficients in order to evaluate the slope of the signal.
        It uses a window_size (of the interpolation), order (of the polynomial), deriv (derivative needed).

        Keyword arguments:
        window_size -- Size of the window for function interpolation.
        order -- Order of polynomial.
        deriv -- Derivative.

        Return:
        m[::-1] -- The Savitzky-Golay coefficients.
        """

        # Get statistics
        window_size = abs(int(window_size))
        order = abs(int(order))
        if window_size % 2 != 1 or window_size < 1:
            raise TypeError("window_size must be a positive odd number")
        if window_size < order + 2:
            raise TypeError("window_size is too small for the polynomial order")
        order_range = range(order + 1)
        half_window = (window_size - 1) // 2

        # Precompute Coefficients
        b = mat([[k ** i for i in order_range] for k in range(-half_window, half_window + 1)])
        m = linalg.pinv(b).A[deriv]
        return m[::-1]

    def slope(self, sequence, sg_coefs):
        """
        Evaluates the slope of sequence given the sg_coefs loaded.

        Keyword arguments:
        sequence -- Input sequence.
        sg_coefs -- Savitzky-Golay coefficients.

        Return:
        slope_seq -- Slope sequence.
        """
        slope_seq = convolve(sequence, sg_coefs)
        slope_seq = [e for e in slope_seq[(len(sg_coefs) / 2):(len(slope_seq) - (len(sg_coefs) / 2))]]

        return slope_seq

    def get_signal_per_strand(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                              initial_clip=1000, per_norm=98, per_slope=98,
                              bias_table=None, genome_file_name=None, print_raw_signal=False,
                              print_bc_signal=False, print_norm_signal=False, print_slope_signal=False,
                              strands_specific=True):
        """

        :param ref: Chromosome name.
        :param start: Initial genomic coordinate of signal.
        :param end: Final genomic coordinate of signal.
        :param downstream_ext: Number of bps to extend towards the downstream region
        :param upstream_ext: Number of bps to extend towards the upstream region
        :param forward_shift: Number of bps to shift the reads aligned to the forward strand.
        :param reverse_shift: Number of bps to shift the reads aligned to the reverse strand.
        :param initial_clip: Signal will be initially clipped at this level to avoid outliers.
        :param per_norm: Percentile value for 'hon_norm' function of the normalized signal.
        :param per_slope: Percentile value for 'hon_norm' function of the slope signal.
        :param bias_table: Bias table to perform bias correction.
        :param genome_file_name: Genome to perform bias correction.
        :param print_raw_signal:
        :param print_bc_signal:
        :param print_norm_signal:
        :param print_slope_signal:
        :return: normalized and slope signal for each strand.
        """

        raw_signal_forward = [0.0] * (end - start)
        raw_signal_reverse = [0.0] * (end - start)

        reads = self.bam.fetch(reference=ref, start=start, end=end)
        for read in reads:
            if (not read.is_reverse):
                cut_site = read.pos + forward_shift
                if cut_site >= start and cut_site < end:
                    raw_signal_forward[cut_site - start] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if cut_site >= start and cut_site < end:
                    raw_signal_reverse[cut_site - start] += 1.0

        raw_signal_forward = array([min(e, initial_clip) for e in raw_signal_forward])
        raw_signal_reverse = array([min(e, initial_clip) for e in raw_signal_reverse])

        # Std-based clipping
        mean = raw_signal_forward.mean()
        std = raw_signal_forward.std()
        clip_signal_forward = [min(e, mean + (10 * std)) for e in raw_signal_forward]
        mean = raw_signal_reverse.mean()
        std = raw_signal_reverse.std()
        clip_signal_reverse = [min(e, mean + (10 * std)) for e in raw_signal_reverse]

        # Cleavage bias correction
        bc_signal_forward = None
        bc_signal_reverse = None
        if bias_table:
            bc_signal_forward, bc_signal_reverse = self.bias_correction(raw_signal_forward, bias_table,
                                                                        genome_file_name,
                                                                        ref, start, end, forward_shift, reverse_shift,
                                                                        strands_specific)
        else:
            bc_signal_forward = clip_signal_forward
            bc_signal_reverse = clip_signal_reverse

        # Boyle normalization (within-dataset normalization)
        boyle_signal_forward = array(self.boyle_norm(bc_signal_forward))
        boyle_signal_reverse = array(self.boyle_norm(bc_signal_reverse))

        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(boyle_signal_forward, per_norm)
        std = boyle_signal_forward.std()
        hon_signal_forward = self.hon_norm(boyle_signal_forward, perc, std)

        perc = scoreatpercentile(boyle_signal_reverse, per_norm)
        std = boyle_signal_reverse.std()
        hon_signal_reverse = self.hon_norm(boyle_signal_reverse, perc, std)

        # Slope signal
        slope_signal_forward = self.slope(hon_signal_forward, self.sg_coefs)
        slope_signal_reverse = self.slope(hon_signal_reverse, self.sg_coefs)

        # Returning normalized and slope sequences
        return hon_signal_forward, slope_signal_forward, hon_signal_reverse, slope_signal_reverse
Example #38
File: sam.py  Project: arvin580/jcvi
def ace(args):
    """
    %prog ace bamfile fastafile

    Convert BAM format to ACE format. This often allows the remapping to be
    assessed as a de novo assembly. The BAM file needs to be indexed. Also
    creates a .mates file to be used in amos/bambus, and a .astat file to mark
    whether each contig is unique or repetitive based on the A-statistics of the
    Celera assembler.
    """
    p = OptionParser(ace.__doc__)
    p.add_option("--splitdir", dest="splitdir", default="outRoot",
            help="split the ace per contig to dir [default: %default]")
    p.add_option("--unpaired", dest="unpaired", default=False,
            help="remove read pairs on the same contig [default: %default]")
    p.add_option("--minreadno", dest="minreadno", default=3, type="int",
            help="minimum read numbers per contig [default: %default]")
    p.add_option("--minctgsize", dest="minctgsize", default=100, type="int",
            help="minimum contig size per contig [default: %default]")
    p.add_option("--astat", default=False, action="store_true",
            help="create .astat to list repetitiveness [default: %default]")
    p.add_option("--readids", default=False, action="store_true",
            help="create file of mapped and unmapped ids [default: %default]")

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    astat = opts.astat
    readids = opts.readids

    f = Fasta(fastafile)
    prefix = bamfile.split(".")[0]
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))
    s = Samfile(bamfile, "rb")

    ncontigs = s.nreferences
    genomesize = sum(x for a, x in f.itersizes())
    logging.debug("Total {0} contigs with size {1} base".format(ncontigs,
        genomesize))
    qual = "20"  # default qual

    totalreads = sum(s.count(x) for x in s.references)
    logging.debug("Total {0} reads mapped".format(totalreads))

    fw = open(acefile, "w")
    if astat:
        astatfw = open(astatfile, "w")
    if readids:
        readsfw = open(readsfile, "w")

    print >> fw, "AS {0} {1}".format(ncontigs, totalreads)
    print >> fw

    for i, contig in enumerate(s.references):
        cseq = f[contig]
        nbases = len(cseq)

        mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
        nreads = len(mapped_reads)

        nsegments = 0
        print >> fw, "CO {0} {1} {2} {3} U".format(contig, nbases, nreads,
                nsegments)
        print >> fw, fill(str(cseq.seq))
        print >> fw

        if astat:
            astat = Astat(nbases, nreads, genomesize, totalreads)
            print >> astatfw, "{0}\t{1:.1f}".format(contig, astat)

        text = fill([qual] * nbases, delimiter=" ", width=30)
        print >> fw, "BQ\n{0}".format(text)
        print >> fw

        rnames = []
        for a in mapped_reads:
            readname = a.qname
            rname = readname

            if readids:
                print >> readsfw, readname
            rnames.append(rname)

            strand = "C" if a.is_reverse else "U"
            paddedstart = a.pos + 1  # 0-based to 1-based
            af = "AF {0} {1} {2}".format(rname, strand, paddedstart)
            print >> fw, af

        print >> fw

        for a, rname in zip(mapped_reads, rnames):
            aseq, npadded = cigar_to_seq(a)
            if aseq is None:
                continue

            ninfos = 0
            ntags = 0
            alen = len(aseq)
            rd = "RD {0} {1} {2} {3}\n{4}".format(rname, alen, ninfos, ntags,
                    fill(aseq))
            qs = "QA 1 {0} 1 {0}".format(alen)

            print >> fw, rd
            print >> fw
            print >> fw, qs
            print >> fw
Example #39
0
def estimate_bias_kmer(args):
    # Parameters
    maxDuplicates = 100
    pseudocount = 1.0

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    # Initializing dictionaries
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()

    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    # Iterating on HS regions
    for region in regions:

        # Initialization
        prevPos = -1
        trueCounter = 0

        # Evaluating observed frequencies ####################################
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):

            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prevPos:
                trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if trueCounter > maxDuplicates: continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                ct_reads_f += 1
                obsDictF[currStr] = obsDictF.get(currStr, 0) + 1
            else:
                ct_reads_r += 1
                obsDictR[currStr] = obsDictR.get(currStr, 0) + 1

        # Evaluating expected frequencies ####################################
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)

        # Iterating on each sequence position
        for i in range(0, len(currStr) - args.k_nb):
            ct_kmers += 1
            # Counting k-mers on the forward strand
            s = currStr[i:i + args.k_nb]
            expDictF[s] = expDictF.get(s, 0) + 1

            # Counting k-mers on the reverse complement
            s = currRevComp[i:i + args.k_nb]
            expDictR[s] = expDictR.get(s, 0) + 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    kmerComb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in kmerComb])
    bias_table_R = dict([(e, 0.0) for e in kmerComb])
    for kmer in kmerComb:
        obsF = obsDictF.get(kmer, 0) + pseudocount
        expF = expDictF.get(kmer, 0) + pseudocount
        if ct_reads_f == 0:
            bias_table_F[kmer] = 1
        else:
            bias_table_F[kmer] = round(float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)
        obsR = obsDictR.get(kmer, 0) + pseudocount
        expR = expDictR.get(kmer, 0) + pseudocount
        if ct_reads_r == 0:
            bias_table_R[kmer] = 1
        else:
            bias_table_R[kmer] = round(float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
Example #40
0
def ace(args):
    """
    %prog ace bamfile fastafile

    Convert BAM format to ACE format, which allows the remapping to be
    assessed as a de novo assembly. The BAM file must be indexed. Also
    creates a .mates file for use in AMOS/Bambus, and a .astat file marking
    whether each contig is unique or repetitive based on the A-statistics
    from the Celera assembler.
    """
    p = OptionParser(ace.__doc__)
    p.add_option(
        "--splitdir",
        dest="splitdir",
        default="outRoot",
        help="split the ace per contig to dir",
    )
    p.add_option(
        "--unpaired",
        dest="unpaired",
        default=False,
        help="remove read pairs on the same contig",
    )
    p.add_option(
        "--minreadno",
        dest="minreadno",
        default=3,
        type="int",
        help="minimum read numbers per contig",
    )
    p.add_option(
        "--minctgsize",
        dest="minctgsize",
        default=100,
        type="int",
        help="minimum contig size per contig",
    )
    p.add_option(
        "--astat",
        default=False,
        action="store_true",
        help="create .astat to list repetitiveness",
    )
    p.add_option(
        "--readids",
        default=False,
        action="store_true",
        help="create file of mapped and unmapped ids",
    )

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    astat = opts.astat
    readids = opts.readids

    f = Fasta(fastafile)
    prefix = bamfile.split(".")[0]
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))
    s = Samfile(bamfile, "rb")

    ncontigs = s.nreferences
    genomesize = sum(x for a, x in f.itersizes())
    logging.debug("Total {0} contigs with size {1} base".format(ncontigs, genomesize))
    qual = "20"  # default qual

    totalreads = sum(s.count(x) for x in s.references)
    logging.debug("Total {0} reads mapped".format(totalreads))

    fw = open(acefile, "w")
    if astat:
        astatfw = open(astatfile, "w")
    if readids:
        readsfw = open(readsfile, "w")

    print("AS {0} {1}".format(ncontigs, totalreads), file=fw)
    print(file=fw)

    for i, contig in enumerate(s.references):
        cseq = f[contig]
        nbases = len(cseq)

        mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
        nreads = len(mapped_reads)

        nsegments = 0
        print("CO {0} {1} {2} {3} U".format(contig, nbases, nreads, nsegments), file=fw)
        print(fill(str(cseq.seq)), file=fw)
        print(file=fw)

        if astat:
            # use a separate name so the astat flag is not shadowed
            astat_value = Astat(nbases, nreads, genomesize, totalreads)
            print("{0}\t{1:.1f}".format(contig, astat_value), file=astatfw)

        text = fill([qual] * nbases, delimiter=" ", width=30)
        print("BQ\n{0}".format(text), file=fw)
        print(file=fw)

        rnames = []
        for a in mapped_reads:
            readname = a.qname
            rname = readname

            if readids:
                print(readname, file=readsfw)
            rnames.append(rname)

            strand = "C" if a.is_reverse else "U"
            paddedstart = a.pos + 1  # 0-based to 1-based
            af = "AF {0} {1} {2}".format(rname, strand, paddedstart)
            print(af, file=fw)

        print(file=fw)

        for a, rname in zip(mapped_reads, rnames):
            aseq, npadded = cigar_to_seq(a)
            if aseq is None:
                continue

            ninfos = 0
            ntags = 0
            alen = len(aseq)
            rd = "RD {0} {1} {2} {3}\n{4}".format(
                rname, alen, ninfos, ntags, fill(aseq)
            )
            qs = "QA 1 {0} 1 {0}".format(alen)

            print(rd, file=fw)
            print(file=fw)
            print(qs, file=fw)
            print(file=fw)
Example #41
0
class GenomicSignal:
    """
    Represents a genomic signal. It should be used to fetch normalized and slope
    signals from a bam or bw file.
    Usage:
    1. Initialize class.
    2. Call load_sg_coefs once.
    3. Call get_signal as many times as needed.

    Authors: Eduardo G. Gusmao.

    Methods:

    load_sg_coefs(self, slope_window_size):
    Loads Savitzky-Golay coefficients into self.sg_coefs based on a slope_window_size.

    get_signal(self, ref, start, end, ext, initial_clip = 1000, per_norm = 98, per_slope = 98)
    Gets the signal associated with self.bam or self.bw based on start, end and ext.
    initial_clip, per_norm and per_slope are used as normalization factors during the normalization
    and slope evaluation procedures.

    hon_norm(self, sequence, mean, std):
    Normalizes a sequence according to hon's criterion using mean and std.
    This represents a between-dataset normalization.

    boyle_norm(self, sequence):
    Normalizes a sequence according to Boyle's criterion.
    This represents a within-dataset normalization.

    savitzky_golay_coefficients(self, window_size, order, deriv):
    Computes the Savitzky-Golay coefficients used to evaluate the slope of the signal.
    It uses a window_size (of the interpolation), order (of the polynomial), deriv (derivative needed).

    slope(self, sequence, sg_coefs):
    Evaluates the slope of sequence given the sg_coefs loaded.
    """

    def __init__(self, file_name):
        """ 
        Initializes GenomicSignal.
        """
        self.file_name = file_name
        self.bam = None
        self.bw = None
        self.sg_coefs = None
        self.is_bam = False
        self.is_bw = False
        file_extension = self.file_name.split(".")[-1].upper()
        if file_extension == "BAM":
            self.is_bam = True
            self.bam = Samfile(file_name, "rb")
        elif file_extension in ("BW", "BIGWIG"):
            self.is_bw = True
            self.bw = BigWigFile(file_name)
        else:
            raise ValueError("Unsupported signal file extension: " + file_extension)

    def load_sg_coefs(self, slope_window_size):
        """ 
        Loads Savitzky-Golay coefficients into self.sg_coefs based on a slope_window_size.

        Keyword arguments:
        slope_window_size -- Window size of Savitzky-Golay coefficients.
        
        Return:
        None -- It updates self.sg_coefs.
        """
        self.sg_coefs = self.savitzky_golay_coefficients(slope_window_size, 2, 1)

    def get_tag_count(self, ref, start, end, ext, initial_clip = 1000, ext_both_directions=False):
        """ 
        Gets the tag count associated with self.bam based on start, end and ext.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        ext -- Fragment extension, e.g. 1 for DNase and 200 for histone modifications.
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.
        
        Return:
        tag_count -- Total signal.
        """

        # Fetch raw signal
        pileup_region = PileupRegion(start,end,ext)
        if(self.is_bam):
            if(ps_version == "0.7.5"):
                self.bam.fetch(reference=ref, start=start, end=end, callback = pileup_region)
            else:
                alignments = self.bam.fetch(reference=ref, start=start, end=end)
                if(not ext_both_directions):
                    for alignment in alignments:
                        pileup_region.__call__(alignment)
                else:
                    for alignment in alignments:
                        pileup_region.__call2__(alignment)
            raw_signal = array([min(e,initial_clip) for e in pileup_region.vector])
        elif(self.is_bw):
            signal = self.bw.pileup(ref, start, end)
            raw_signal = array([min(e,initial_clip) for e in signal])

        # Std-based clipping
        mean = raw_signal.mean()
        std = raw_signal.std()
        clip_signal = [min(e, mean + (10 * std)) for e in raw_signal]

        # Tag count
        try: tag_count = sum(clip_signal)
        except Exception: tag_count = 0

        return tag_count

    def get_signal(self, ref, start, end, ext, initial_clip = 1000, per_norm = 99.5, per_slope = 98, 
                   bias_table = None, genome_file_name = None, ext_both_directions=False, print_wig = None):
        """ 
        Gets the signal associated with self.bam based on start, end and ext.
        initial_clip, per_norm and per_slope are used as normalization factors during the normalization
        and slope evaluation procedures.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        ext -- Fragment extension, e.g. 1 for DNase and 200 for histone modifications.
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.
        per_norm -- Percentile value for 'hon_norm' function of the normalized signal.
        per_slope -- Percentile value for 'hon_norm' function of the slope signal.
        bias_table -- Bias table to perform bias correction.
        
        Return:
        hon_signal -- Normalized signal.
        slopehon_signal -- Slope signal.
        """

        # Fetch raw signal
        pileup_region = PileupRegion(start,end,ext)
        if(self.is_bam):
            if(ps_version == "0.7.5"):
                self.bam.fetch(reference=ref, start=start, end=end, callback = pileup_region)
            else:
                alignments = self.bam.fetch(reference=ref, start=start, end=end)
                if(not ext_both_directions):
                    for alignment in alignments:
                        pileup_region.__call__(alignment)
                else:
                    for alignment in alignments:
                        pileup_region.__call2__(alignment)
            raw_signal = array([min(e,initial_clip) for e in pileup_region.vector])
        elif(self.is_bw):
            signal = self.bw.pileup(ref, start, end)
            raw_signal = array([min(e,initial_clip) for e in signal])

        # Std-based clipping
        mean = raw_signal.mean()
        std = raw_signal.std()
        clip_signal = [min(e, mean + (10 * std)) for e in raw_signal]

        # Bias correction
        bias_corrected_signal = self.bias_correction(clip_signal, bias_table, genome_file_name, ref, start, end)

        # Boyle normalization (within-dataset normalization)
        boyle_signal = array(self.boyle_norm(bias_corrected_signal))

        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(boyle_signal, per_norm)
        std = boyle_signal.std()
        hon_signal = self.hon_norm(boyle_signal, perc, std)
        
        # Slope signal
        slope_signal = self.slope(hon_signal, self.sg_coefs)

        # Hon normalization on slope signal (between-dataset slope smoothing)
        abs_seq = array([abs(e) for e in slope_signal])
        perc = scoreatpercentile(abs_seq, per_slope)
        std = abs_seq.std()
        slopehon_signal = self.hon_norm(slope_signal, perc, std)

        # Writing signal
        if(print_wig):
            signal_file = open(print_wig + "signal.wig", "a")
            norm_file = open(print_wig + "norm.wig", "a")
            slope_file = open(print_wig + "slope.wig", "a")
            header = "fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n"
            signal_file.write(header + "\n".join([str(e) for e in clip_signal]) + "\n")
            norm_file.write(header + "\n".join([str(e) for e in hon_signal]) + "\n")
            slope_file.write(header + "\n".join([str(e) for e in slopehon_signal]) + "\n")
            signal_file.close()
            norm_file.close()
            slope_file.close()

        # Returning normalized and slope sequences
        return hon_signal, slopehon_signal

    def bias_correction(self, signal, bias_table, genome_file_name, chrName, start, end):
        """ 
        Performs bias correction.

        Keyword arguments:
        signal -- Input signal.
        bias_table -- Bias table.
        
        Return:
        bias_corrected_signal -- Bias-corrected sequence.
        """

        if(not bias_table): return signal

        # Parameters
        window = 50
        defaultKmerValue = 1.0

        # Initialization
        fastaFile = Fastafile(genome_file_name)
        fBiasDict = bias_table.table[0]
        rBiasDict = bias_table.table[1]
        k_nb = len(next(iter(fBiasDict)))  # k-mer length, taken from any table key
        p1 = start
        p2 = end
        p1_w = p1 - (window // 2)
        p2_w = p2 + (window // 2)
        p1_wk = p1_w - (k_nb // 2)
        p2_wk = p2_w + (k_nb // 2)

        # Raw counts
        nf = [0.0] * (p2_w - p1_w)
        nr = [0.0] * (p2_w - p1_w)
        for r in self.bam.fetch(chrName, p1_w, p2_w):
            if (not r.is_reverse) and (r.pos > p1_w):
                nf[r.pos - p1_w] += 1.0
            if r.is_reverse and ((r.aend - 1) < p2_w):
                nr[r.aend - 1 - p1_w] += 1.0

        # Smoothed counts: rolling window sums over the raw counts
        Nf = []
        Nr = []
        fSum = sum(nf[:window])
        rSum = sum(nr[:window])
        fLast = nf[0]
        rLast = nr[0]
        for i in range(window // 2, len(nf) - (window // 2)):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum += nf[i + (window // 2)] - fLast
            rSum += nr[i + (window // 2)] - rLast
            fLast = nf[i - (window // 2) + 1]
            rLast = nr[i - (window // 2) + 1]

        # Fetching sequence
        currStr = str(fastaFile.fetch(chrName, p1_wk-1, p2_wk-2)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName,p1_wk+2, p2_wk+1)).upper())

        # Iterating on sequence to create signal
        af = []
        ar = []
        for i in range(k_nb // 2, len(currStr) - (k_nb // 2) + 1):
            fseq = currStr[i - (k_nb // 2):i + (k_nb // 2)]
            rseq = currRevComp[len(currStr) - (k_nb // 2) - i:len(currStr) + (k_nb // 2) - i]
            af.append(fBiasDict.get(fseq, defaultKmerValue))
            ar.append(rBiasDict.get(rseq, defaultKmerValue))

        # Calculating the bias-corrected signal
        fSum = sum(af[:window])
        rSum = sum(ar[:window])
        fLast = af[0]
        rLast = ar[0]
        bias_corrected_signal = []
        for i in range(window // 2, len(af) - (window // 2)):
            nhatf = Nf[i - (window // 2)] * (af[i] / fSum)
            nhatr = Nr[i - (window // 2)] * (ar[i] / rSum)
            zf = log(nf[i] + 1) - log(nhatf + 1)
            zr = log(nr[i] + 1) - log(nhatr + 1)
            bias_corrected_signal.append(zf + zr)
            fSum += af[i + (window // 2)] - fLast
            rSum += ar[i + (window // 2)] - rLast
            fLast = af[i - (window // 2) + 1]
            rLast = ar[i - (window // 2) + 1]

        # Termination
        fastaFile.close()
        return bias_corrected_signal

    def hon_norm(self, sequence, mean, std):
        """ 
        Normalizes a sequence according to hon's criterion using mean and std.
        This represents a between-dataset normalization.

        Keyword arguments:
        sequence -- Input sequence.
        mean -- Global mean.
        std -- Global std.
        
        Return:
        norm_seq -- Normalized sequence.
        """

        norm_seq = []
        for e in sequence:
            if(e == 0.0): norm_seq.append(0.0)
            elif(e > 0.0): norm_seq.append(1.0/(1.0+(exp(-(e-mean)/std))))
            else: norm_seq.append(-1.0/(1.0+(exp(-(-e-mean)/std))))
        return norm_seq

    def boyle_norm(self, sequence):
        """ 
        Normalizes a sequence according to Boyle's criterion.
        This represents a within-dataset normalization.

        Keyword arguments:
        sequence -- Input sequence.
        
        Return:
        norm_seq -- Normalized sequence.
        """

        mean = array([e for e in sequence if e>0]).mean()
        norm_seq = [(float(e)/mean) for e in sequence]
        return norm_seq

    def savitzky_golay_coefficients(self, window_size, order, deriv):
        """ 
        Computes the Savitzky-Golay coefficients used to evaluate the slope of the signal.
        It uses a window_size (of the interpolation), order (of the polynomial), deriv (derivative needed).

        Keyword arguments:
        window_size -- Size of the window for function interpolation.
        order -- Order of polynomial.
        deriv -- Derivative.
        
        Return:
        m[::-1] -- The Savitzky-Golay coefficients.
        """

        # Validate arguments
        window_size = abs(int(window_size))
        order = abs(int(order))
        if window_size % 2 != 1 or window_size < 1:
            raise TypeError("window_size must be a positive odd number")
        if window_size < order + 2:
            raise TypeError("window_size is too small for the polynomial order")
        order_range = range(order+1)
        half_window = (window_size -1) // 2

        # Precompute Coefficients
        b = mat([[k**i for i in order_range] for k in range(-half_window, half_window+1)])
        m = linalg.pinv(b).A[deriv]
        return m[::-1]

    def slope(self, sequence, sg_coefs):
        """ 
        Evaluates the slope of sequence given the sg_coefs loaded.

        Keyword arguments:
        sequence -- Input sequence.
        sg_coefs -- Savitzky-Golay coefficients.
        
        Return:
        slope_seq -- Slope sequence.
        """
        slope_seq = convolve(sequence, sg_coefs)
        half = len(sg_coefs) // 2  # integer division so the slice indices stay ints
        slope_seq = [e for e in slope_seq[half:len(slope_seq) - half]]
        return slope_seq
Example #42
0
# Iterating on coordinates
coordFile = open(coordFileName,"r")
for line in coordFile:

  try:

    # Initialization
    ll = line.strip().split("\t")
    chrName = ll[0]
    p1 = int(ll[1])
    p2 = int(ll[2])
    p1_w = p1 - (window // 2)
    p2_w = p2 + (window // 2)
    p1_wk = p1_w - (k_nb // 2)
    p2_wk = p2_w + (k_nb // 2)

    # Raw counts
    pileup_region = PileupRegion(p1_w, p2_w)
    for alignment in bamFile.fetch(reference=chrName, start=p1_w, end=p2_w):
        pileup_region.__call__(alignment)
    nf = pileup_region.vectorF
    nr = pileup_region.vectorR

    outputFileRaw.write("fixedStep chrom=" + chrName + " start=" + str(p1 + 1) + " step=1\n")
    for i in range(0, len(nf)):
        outputFileRaw.write(str(nf[i] + nr[i]) + "\n")

    #print "RAW reads"
    #for i in range(p1_w, p2_w):
    #  print i+1, nf[i-p1_w], nr[i-p1_w]

    # Smoothed counts
    Nf = []
    Nr = []
    fSum = sum(nf[:window])
    rSum = sum(nr[:window])
    fLast = nf[0]
    rLast = nr[0]
Example #43
0
def bam_surject_msa(args):
    """
    Caveats:
    - flags are remained as original statuses
    - remaining original values for MD, NM, and AS tags
    - mate are given as unmapped
    - same records are emited
    """
    skip_flag = args.skip_flag
    sam = Samfile(args.bam)
    fasta = Fasta(open(args.msa_fasta))
    mapped_ref_set = set(sam.references)

    # setup output
    if args.refnames is None:
        refnames = [
            'consensus{0}'.format(i) for i in range(len(args.msa_fastas))
        ]
    else:
        refnames = args.refnames
    assert len(refnames) == len(
        args.msa_fastas
    ), 'The number of refnames should be the same as that of msa_fastas.'

    logging.info('Loading MSA fastas')
    logging.info('Skip flag: %s', args.skip_flag)
    fastas = []
    ref_lens = []
    target_ref_set = set()
    for fn in args.msa_fastas:
        with open(fn) as fp:
            fasta = Fasta(fp)
            fastas.append(fasta)
            if len(fasta.contigs) == 0:
                logging.error('Fasta file %s has no contigs', fn)
                raise Exception('No contigs')
            ref_lens.append(len(fasta.contigs[0]))
            target_ref_set.update(fasta.names)

    rest_refs = [r for r in sam.references if r not in target_ref_set]
    logging.info('%s are included in surjection targets.', len(target_ref_set))
    logging.info('%s are not included in surjection targets.', len(rest_refs))
    if args.keep_rest:
        logging.info('Rest of reference will be kept in surjected BAM file')
        org_ref_len_map = dict(zip(sam.references, sam.lengths))
        refnames.extend([r for r in rest_refs])
        ref_lens.extend([org_ref_len_map[r] for r in rest_refs])
        fastas.extend([None for r in rest_refs])

    logging.info('Setting up output BAMs')
    if args.output.endswith('.bam'):
        mode = 'wb'
    else:
        mode = 'wh'
    out = pysam.Samfile(args.output,
                        mode=mode,
                        reference_names=refnames,
                        reference_lengths=ref_lens)

    # iteration
    for refname, fasta in zip(refnames, fastas):
        out_tid = out.gettid(refname)
        if fasta is None:
            logging.info('Transfering %s', refname)
            src_tid = sam.gettid(refname)
            for rec in sam.fetch(reference=refname):
                if rec.flag & skip_flag:
                    continue
                a = rec.__copy__()
                a.reference_id = out_tid
                if a.next_reference_id != src_tid:  # pair on the same refs
                    a.next_reference_id = out_tid
                else:
                    a.next_reference_id = -1  # unpair
                    a.next_reference_start = -1
                out.write(a)
            continue
        logging.info('Surjecting to %s', refname)
        query_refs = fasta.names
        cc = _CigarChecker() if args.check else None
        for qref in query_refs:
            if qref not in mapped_ref_set:
                logging.warning('%s is not found in original BAM file', qref)
                continue
            # NOTE: the original excerpt omits the setup of `mc` (the MSA
            # coordinate converter) and `q_aln` (this query's aligned MSA row);
            # the per-record fetch loop below is reconstructed from their use,
            # with the same skip_flag filter as the transfer branch above.
            for rec in sam.fetch(reference=qref):
                if rec.flag & skip_flag:
                    continue
                a = rec.__copy__()
                if not rec.is_unmapped:
                    org_cigar = Cigar(rec.cigartuples)
                    pos, cigar = mc.convert(rec.pos, org_cigar)
                    if org_cigar.query_length != cigar.query_length:
                        logging.error('Invalid cigar conversion for %s', rec.qname)
                        logging.error('org %s %s %s', rec.pos, org_cigar,
                                      org_cigar.query_length)
                        logging.error('new %s %s %s', pos, cigar,
                                      cigar.query_length)
                        s1 = pos
                        e1 = mc.get_pos(rec.pos + cigar.ref_length)
                        logging.error('ref %s-%s %s', s1, e1,
                                      mc.get_ref_cigar(s1, e1))
                        logging.error('read %s', rec.seq)
                        logging.error('qref %s', q_aln.seq[s1:e1])
                        raise Exception('Incompatible Cigar')
                    cc and cc.check(rec, pos, cigar, org_cigar, mc, q_aln)
                    a.cigar = cigar.values
                    a.reference_start = pos
                a.reference_id = out_tid
                a.next_reference_id = -1   # mates are reported as unmapped
                a.next_reference_start = -1
                out.write(a)
Example #44
0
def create_signal(args, regions):
    def revcomp(s):
        rev_dict = dict([("A", "T"), ("T", "A"), ("C", "G"), ("G", "C"), ("N", "N")])
        return "".join([rev_dict[e] for e in s[::-1]])

    alphabet = ["A", "C", "G", "T"]
    kmer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    f_obs_dict = dict([(e, 0.0) for e in kmer_comb])
    r_obs_dict = dict([(e, 0.0) for e in kmer_comb])
    f_exp_dict = dict([(e, 0.0) for e in kmer_comb])
    r_exp_dict = dict([(e, 0.0) for e in kmer_comb])

    bam_file = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fasta_file = Fastafile(genome_data.get_genome())

    for region in regions:
        # Fetching observed reads
        reads = bam_file.fetch(reference=region.chrom, start=region.initial, end=region.final)
        for read in reads:
            if not read.is_reverse:
                p1 = read.pos - int(floor(args.k_nb / 2)) + args.forward_shift - 1
            else:
                p1 = read.aend - int(floor(args.k_nb / 2)) + args.reverse_shift + 1
            p2 = p1 + args.k_nb
            try:
                dna_sequence_obs = str(fasta_file.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if 'N' not in dna_sequence_obs:
                if read.is_reverse:
                    dna_sequence_obs = revcomp(dna_sequence_obs)
                    r_obs_dict[dna_sequence_obs] += 1
                else:
                    f_obs_dict[dna_sequence_obs] += 1

        # Fetching whole sequence
        try:
            dna_sequence_exp = str(fasta_file.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        dna_sequence_exp_rev = revcomp(dna_sequence_exp)
        for i in range(0, len(dna_sequence_exp) - args.k_nb):
            s = dna_sequence_exp[i:i + args.k_nb]
            if "N" not in s:
                f_exp_dict[s] += 1
            s = dna_sequence_exp_rev[i:i + args.k_nb]
            if "N" not in s:
                r_exp_dict[s] += 1

    output_fname_f_obs = os.path.join(args.output_location, "{}_f_obs.fa".format(str(args.k_nb)))
    output_fname_f_exp = os.path.join(args.output_location, "{}_f_exp.fa".format(str(args.k_nb)))
    output_fname_r_obs = os.path.join(args.output_location, "{}_r_obs.fa".format(str(args.k_nb)))
    output_fname_r_exp = os.path.join(args.output_location, "{}_r_exp.fa".format(str(args.k_nb)))

    output_file_f_obs = open(output_fname_f_obs, "w")
    output_file_f_exp = open(output_fname_f_exp, "w")
    output_file_r_obs = open(output_fname_r_obs, "w")
    output_file_r_exp = open(output_fname_r_exp, "w")

    # All four dicts share the k-mer key set, so iterate each one directly.
    for kmer in f_obs_dict:
        if f_obs_dict[kmer] > 0:
            output_file_f_obs.write(kmer + "\t" + str(f_obs_dict[kmer]) + "\n")
    for kmer in f_exp_dict:
        if f_exp_dict[kmer] > 0:
            output_file_f_exp.write(kmer + "\t" + str(f_exp_dict[kmer]) + "\n")
    for kmer in r_obs_dict:
        if r_obs_dict[kmer] > 0:
            output_file_r_obs.write(kmer + "\t" + str(r_obs_dict[kmer]) + "\n")
    for kmer in r_exp_dict:
        if r_exp_dict[kmer] > 0:
            output_file_r_exp.write(kmer + "\t" + str(r_exp_dict[kmer]) + "\n")

    output_file_f_obs.close()
    output_file_f_exp.close()
    output_file_r_obs.close()
    output_file_r_exp.close()
Example #45
0
class BamFile:
    """
    Represents a bam file. It should be used to fetch normalized and slope
    signals from a bam file.
    Usage:
    1. Initialize class.
    2. Call load_sg_coefs once.
    3. Call get_signal as many times as needed.

    Authors: Eduardo G. Gusmao.

    Methods:

    load_sg_coefs(self, slope_window_size):
    Loads Savitzky-Golay coefficients into self.sg_coefs based on a slope_window_size.

    get_signal(self, ref, start, end, ext, initial_clip = 1000, per_norm = 98, per_slope = 98)
    Gets the signal associated with self.bam based on start, end and ext.
    initial_clip, per_norm and per_slope are used as normalization factors during the normalization
    and slope evaluation procedures.

    hon_norm(self, sequence, mean, std):
    Normalizes a sequence according to hon's criterion using mean and std.
    This represents a between-dataset normalization.

    boyle_norm(self, sequence):
    Normalizes a sequence according to Boyle's criterion.
    This represents a within-dataset normalization.

    savitzky_golay_coefficients(self, window_size, order, deriv):
    Computes the Savitzky-Golay coefficients used to evaluate the slope of the signal.
    It uses a window_size (of the interpolation), order (of the polynomial), deriv (derivative needed).

    slope(self, sequence, sg_coefs):
    Evaluates the slope of sequence given the sg_coefs loaded.
    """
    def __init__(self, file_name):
        """ 
        Initializes BamFile.

        Variables:
        bam -- Pysam's bam representation.
        sg_coefs -- Savitzky-Golay coefficients (list). Should be loaded after class initialization.
        """
        self.file_name = file_name
        self.bam = Samfile(file_name, "rb")
        self.sg_coefs = None

    def load_sg_coefs(self, slope_window_size):
        """ 
        Loads Savitzky-Golay coefficients into self.sg_coefs based on a slope_window_size.

        Keyword arguments:
        slope_window_size -- Window size of Savitzky-Golay coefficients.
        
        Return:
        None -- It updates self.sg_coefs.
        """
        self.sg_coefs = self.savitzky_golay_coefficients(
            slope_window_size, 2, 1)

    def get_signal(self,
                   ref,
                   start,
                   end,
                   ext,
                   initial_clip=1000,
                   per_norm=98,
                   per_slope=98):
        """ 
        Gets the signal associated with self.bam based on start, end and ext.
        initial_clip, per_norm and per_slope are used as normalization factors during the normalization
        and slope evaluation procedures.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        ext -- Fragment extension, e.g. 1 for DNase and 200 for histone modifications.
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.
        per_norm -- Percentile value for 'hon_norm' function of the normalized signal.
        per_slope -- Percentile value for 'hon_norm' function of the slope signal.
        
        Return:
        hon_signal -- Normalized signal.
        slopehon_signal -- Slope signal.
        """

        # Fetch raw signal
        pileup_region = PileupRegion(start, end, ext)
        if (ps_version == "0.7.5"):
            self.bam.fetch(reference=ref,
                           start=start,
                           end=end,
                           callback=pileup_region)
        else:
            for alignment in self.bam.fetch(reference=ref, start=start, end=end):
                pileup_region.__call__(alignment)
        raw_signal = array(
            [min(e, initial_clip) for e in pileup_region.vector])

        # Std-based clipping
        mean = raw_signal.mean()
        std = raw_signal.std()
        clip_signal = [min(e, mean + (10 * std)) for e in raw_signal]

        # Boyle normalization (within-dataset normalization)
        boyle_signal = array(self.boyle_norm(clip_signal))

        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(boyle_signal, per_norm)
        std = boyle_signal.std()
        hon_signal = self.hon_norm(boyle_signal, perc, std)

        # Slope signal
        slope_signal = self.slope(hon_signal, self.sg_coefs)

        # Hon normalization on slope signal (between-dataset slope smoothing)
        abs_seq = array([abs(e) for e in slope_signal])
        perc = scoreatpercentile(abs_seq, per_slope)
        std = abs_seq.std()
        slopehon_signal = self.hon_norm(slope_signal, perc, std)

        # Returning normalized and slope sequences
        return hon_signal, slopehon_signal

    def hon_norm(self, sequence, mean, std):
        """ 
        Normalizes a sequence according to hon's criterion using mean and std.
        This represents a between-dataset normalization.

        Keyword arguments:
        sequence -- Input sequence.
        mean -- Global mean.
        std -- Global std.
        
        Return:
        norm_seq -- Normalized sequence.
        """

        norm_seq = []
        for e in sequence:
            if (e == 0.0): norm_seq.append(0.0)
            elif (e > 0.0):
                norm_seq.append(1.0 / (1.0 + (exp(-(e - mean) / std))))
            else:
                norm_seq.append(-1.0 / (1.0 + (exp(-(-e - mean) / std))))
        return norm_seq

    def boyle_norm(self, sequence):
        """ 
        Normalizes a sequence according to Boyle's criterion.
        This represents a within-dataset normalization.

        Keyword arguments:
        sequence -- Input sequence.
        
        Return:
        norm_seq -- Normalized sequence.
        """

        mean = array([e for e in sequence if e > 0]).mean()
        norm_seq = [(float(e) / mean) for e in sequence]
        return norm_seq

    def savitzky_golay_coefficients(self, window_size, order, deriv):
        """ 
        Computes the Savitzky-Golay coefficients used to evaluate the slope of the signal.
        It uses a window_size (of the interpolation), order (of the polynomial), deriv (derivative needed).

        Keyword arguments:
        window_size -- Size of the window for function interpolation.
        order -- Order of polynomial.
        deriv -- Derivative.
        
        Return:
        m[::-1] -- The Savitzky-Golay coefficients.
        """

        # Validate arguments
        window_size = abs(int(window_size))
        order = abs(int(order))
        if window_size % 2 != 1 or window_size < 1:
            raise TypeError("window_size must be a positive odd number")
        if window_size < order + 2:
            raise TypeError("window_size is too small for the polynomial order")
        order_range = range(order + 1)
        half_window = (window_size - 1) // 2

        # Precompute Coefficients
        b = mat([[k**i for i in order_range]
                 for k in range(-half_window, half_window + 1)])
        m = linalg.pinv(b).A[deriv]
        return m[::-1]

    def slope(self, sequence, sg_coefs):
        """ 
        Evaluates the slope of sequence given the sg_coefs loaded.

        Keyword arguments:
        sequence -- Input sequence.
        sg_coefs -- Savitzky-Golay coefficients.
        
        Return:
        slope_seq -- Slope sequence.
        """
        slope_seq = convolve(sequence, sg_coefs)
        half = len(sg_coefs) // 2  # integer division so the slice indices stay ints
        slope_seq = [e for e in slope_seq[half:len(slope_seq) - half]]
        return slope_seq