示例#1
0
def bias_correction(bam, signal, fBiasDict, rBiasDict, genome_file_name, chrName, start, end):
    """
    Compute a log-ratio, cleavage-bias-corrected signal for a genomic region.

    Args:
        bam: pysam alignment file; used to count 5' cut sites per strand.
        signal: unused by the visible code (kept for interface compatibility).
        fBiasDict, rBiasDict: k-mer -> bias score tables for the forward and
            reverse strand; all keys are assumed to share one k-mer length.
        genome_file_name: path to an indexed FASTA genome.
        chrName, start, end: genomic region to correct.

    Returns:
        list of float: corrected signal values across the region.
    """
    # Parameters
    window = 50
    half = window // 2
    defaultKmerValue = 1.0  # fallback bias for k-mers absent from the tables (e.g. containing N)

    # Initialization
    fastaFile = Fastafile(genome_file_name)
    # k-mer length taken from any table key; next(iter(...)) works on both
    # Python 2 and 3 (the original dict.keys()[0] is Python-2 only).
    k_nb = len(next(iter(fBiasDict)))
    p1 = start
    p2 = end
    # Expand the region so the smoothing window and k-mer context exist at the
    # edges.  Floor division keeps all index arithmetic integral on Python 3.
    p1_w = p1 - half
    p2_w = p2 + half
    p1_wk = p1_w - (k_nb // 2)
    p2_wk = p2_w + (k_nb // 2)

    # Raw counts of 5' cut sites per strand
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for r in bam.fetch(chrName, p1_w, p2_w):
        if (not r.is_reverse) and (r.pos > p1_w):
            nf[r.pos - p1_w] += 1.0
        if r.is_reverse and ((r.aend - 1) < p2_w):
            nr[r.aend - 1 - p1_w] += 1.0

    # Smoothed counts: sliding sum over `window` positions
    Nf = []
    Nr = []
    fSum = sum(nf[:window])
    rSum = sum(nr[:window])
    fLast = nf[0]
    rLast = nr[0]
    for i in range(half, len(nf) - half):
        Nf.append(fSum)
        Nr.append(rSum)
        fSum += nf[i + half] - fLast
        fLast = nf[i - half + 1]
        rSum += nr[i + half] - rLast
        rLast = nr[i - half + 1]

    # Fetching sequence (the +/- offsets reproduce the original strand-specific
    # anchoring of k-mers around each position -- TODO confirm intent)
    currStr = str(fastaFile.fetch(chrName, p1_wk - 1, p2_wk - 2)).upper()
    currRevComp = revcomp(str(fastaFile.fetch(chrName, p1_wk + 2, p2_wk + 1)).upper())

    # Per-position bias from the k-mer tables; unknown k-mers fall back to 1.0
    # (dict.get replaces the original per-item try/except for the same effect)
    af = []
    ar = []
    for i in range(k_nb // 2, len(currStr) - (k_nb // 2) + 1):
        fseq = currStr[i - (k_nb // 2):i + (k_nb // 2)]
        rseq = currRevComp[len(currStr) - (k_nb // 2) - i:len(currStr) + (k_nb // 2) - i]
        af.append(fBiasDict.get(fseq, defaultKmerValue))
        ar.append(rBiasDict.get(rseq, defaultKmerValue))

    # Expected counts and log-ratio correction
    fSum = sum(af[:window])
    rSum = sum(ar[:window])
    fLast = af[0]
    rLast = ar[0]
    bias_corrected_signal = []
    for i in range(half, len(af) - half):
        nhatf = Nf[i - half] * (af[i] / fSum)
        nhatr = Nr[i - half] * (ar[i] / rSum)
        zf = log(nf[i] + 1) - log(nhatf + 1)
        zr = log(nr[i] + 1) - log(nhatr + 1)
        bias_corrected_signal.append(zf + zr)
        fSum += af[i + half] - fLast
        fLast = af[i - half + 1]
        rSum += ar[i + half] - rLast
        rLast = ar[i - half + 1]

    # Termination
    fastaFile.close()
    return bias_corrected_signal
示例#2
0
    def test_should_return_capitalised_sequence_from_ref_file(self):
        """Lower-case reference bases must come back upper-cased via fetch()."""
        written = self.__build_fasta_file({
            'chr20':
            "tagcattattattattattattatta",
        })

        reader = Fastafile(written.filename)
        fetched = reader.fetch('chr20', 10, 20).upper()
        self.assertEqual(fetched, "ATTATTATTA")
示例#3
0
    def test_should_be_able_to_fetch_section_of_genome(self):
        """A 10bp slice of the stored contig should round-trip through fetch()."""
        source = self.__build_fasta_file({
            'chr20':
            "TAGCATTATTATTATTATTATTATTA",
        })

        genome = Fastafile(source.filename)
        section = genome.fetch('chr20', 10, 20).upper()
        self.assertEqual(section, "ATTATTATTA")
示例#4
0
class SeqDataset(Dataset):
    """
    Dataset yielding the upper-cased genome sequence for each BED interval.

    Args:
        intervals_file: bed3 file containing intervals (each at most 1000bp)
        fasta_file: file path; Genome sequence
        use_linecache: if True, read intervals through BedToolLinecache
            instead of BedTool
    """
    def __init__(self, intervals_file, fasta_file, use_linecache=True):

        # intervals
        if use_linecache:
            self.bt = BedToolLinecache(intervals_file)
        else:
            self.bt = BedTool(intervals_file)
        self.fasta_file = fasta_file
        # FASTA handle is opened lazily in __getitem__ -- presumably so the
        # dataset can be constructed/forked before any file handle exists
        # (TODO confirm that is the intent).
        self.fasta = None

    def __len__(self):
        # Number of intervals in the BED file.
        return len(self.bt)

    def __getitem__(self, idx):
        # Lazily open the FASTA file on first access.
        if self.fasta is None:
            self.fasta = Fastafile(self.fasta_file)

        interval = self.bt[idx]

        # Intervals can't be bigger than 1000bp
        if (interval.stop - interval.start) > 1000:
            raise Exception("Input sequences should be at maximum 1000bp.")

        # Fetch the sequence for the interval, upper-cased
        seq = self.fasta.fetch(str(interval.chrom), interval.start,
                               interval.stop).upper()

        # Reverse-complement the input string if the interval is on '-'
        if interval.strand == "-":
            seq = rc_str(seq)
        """
        # generate an id
        id = str(interval.chrom) + ":" + str(interval.start) + "-" + str(interval.stop)
        if interval.name not in ["", ".", "*"]:
            id = interval.name
        """

        # Sequence plus genomic-range metadata for downstream consumers.
        return {
            "inputs": seq,
            "metadata": {
                "ranges": GenomicRanges.from_interval(interval)
            }
        }
def main():
    # setup a reverse_complement translation
    rev_table=string.maketrans('ACGTacgt', 'TGCAtgca')
    def revcomp(seq, rev_table):
        return seq.translate(rev_table)
        
    # open your fasta file
    fasta  = Fastafile("bedtools/tests/data/chr21.fa")
    # open your bed file
    bed    = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed")
    
    # for each bed, grab the the DNA in that interval 
    for b in bed:
        # grab the seq, rev. comp if necessary
        seq = fasta.fetch(b.chrom, b.start, b.end)  
        if b.strand == "-":
            seq = revcomp(seq, rev_table)
        # print the interval and the seq
        print b.chrom, b.start, b.end, b.strand, seq
示例#6
0
    def removeHomopolymers(self, variants, outFile, distance):
        """
        Delete variants that sit inside homopolymer runs of the reference base.

        Args:
            variants: variant container; its ``variantDict`` maps
                (chrom, position, ref, alt) keys to variants and is modified
                in place (homopolymer hits are removed).
            outFile (str): base path used to derive temporary file names.
            distance (int): number of reference bases fetched on each side of
                the variant; also the length of the homopolymer run tested.
        """
        startTime = Helper.getTime()
        Helper.info(
            " [%s] remove Missmatches from homopolymers " %
            (startTime.strftime("%c")), self.rnaEdit.logFile,
            self.rnaEdit.textField)

        # NOTE(review): this handle is created but never written to or closed
        # by the visible code -- kept so any caller that expects the temp file
        # to exist keeps working.
        tempBedFile = open(outFile + "_tmp.bed", "w+")
        tempSeqFile = outFile + "_tmp.tsv"

        fastaFile = Fastafile(self.rnaEdit.params.refGenome)
        mmNumberTotal = len(variants.variantDict)

        numberPassed = 0
        # Iterate over a snapshot of the keys: entries are deleted from
        # variantDict inside the loop (required on Python 3, harmless on 2).
        for key in list(variants.variantDict.keys()):
            chrom, position, ref, alt = key
            # Clamp the fetch window at the chromosome origin.
            startPos = position - distance if position >= distance else 0
            endpos = position + distance
            sequence = fastaFile.fetch(chrom, startPos, endpos)
            # A run of `distance` copies of the reference base anywhere in the
            # window marks the variant as a homopolymer artefact.
            pattern = ref * distance
            if pattern in sequence:
                try:
                    del variants.variantDict[key]
                except KeyError:
                    pass
            else:
                numberPassed += 1

        # output statistics
        Helper.info(
            "\t\t %d out of %d passed the Homopolymer-Filter" %
            (numberPassed, mmNumberTotal), self.rnaEdit.logFile,
            self.rnaEdit.textField)
        Helper.printTimeDiff(startTime, self.rnaEdit.logFile,
                             self.rnaEdit.textField)
示例#7
0
def main():
    # setup a reverse_complement translation
    rev_table = string.maketrans('ACGTacgt', 'TGCAtgca')

    def revcomp(seq, rev_table):
        return seq.translate(rev_table)

    # open your fasta file
    fasta = Fastafile("bedtools/tests/data/chr21.fa")
    # open your bed file
    bed = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed")

    # for each bed, grab the the DNA in that interval
    for b in bed:
        # grab the seq, rev. comp if necessary
        seq = fasta.fetch(b.chrom, b.start, b.end)
        if b.strand == "-":
            seq = revcomp(seq, rev_table)
        # print the interval and the seq
        print b.chrom, b.start, b.end, b.strand, seq
示例#8
0
    # Evaluating Overall TC (tag counts), one value per half-window size; the
    # vector keeps its placeholder "0" strings for entries where counting fails.
    regionTagCountVec = ["0", "0", "0"]
    try:
        for i in range(0, len(tcHalfWindowVec)):
            tcHalfWindow = tcHalfWindowVec[i]
            regionTagCountVec[i] = tag_count(chrName, p1, p2, dnaseBam,
                                             tcHalfWindow)
    except Exception:
        # best-effort: report the failure, write whatever was computed, and
        # move on to the next input line of the enclosing loop
        print "Exception TC raised in " + line
        writeOutput(ll, regionTagCountVec, resVec, outFile)
        continue

    # Fetching the region's sequence from the reference genome
    try:
        sequence = str(genomeFile.fetch(chrName, p1, p2))
    except Exception:
        print "Exception SEQUENCE raised in " + line
        writeOutput(ll, regionTagCountVec, resVec, outFile)
        continue

    # Performing motif matching: for each motif keep the best score observed
    # anywhere in the region (both strands, absolute threshold)
    for i in range(0, len(motifList)):

        m = motifList[i]
        for res in search(sequence, [m.pssm_list], [m.min],
                          absolute_threshold=True,
                          both_strands=True):
            for (position, score) in res:
                if (score > resVec[i][0]):
                    resVec[i][0] = score
示例#9
0
def bias_correction(bam, signal, fBiasDict, rBiasDict, genome_file_name,
                    chrName, start, end):
    """
    Compute a log-ratio, cleavage-bias-corrected signal for a genomic region.

    Args:
        bam: pysam alignment file; used to count 5' cut sites per strand.
        signal: unused by the visible code (kept for interface compatibility).
        fBiasDict, rBiasDict: k-mer -> bias score tables (forward / reverse);
            all keys are assumed to share one k-mer length.
        genome_file_name: path to an indexed FASTA genome.
        chrName, start, end: genomic region to correct.

    Returns:
        list of float: corrected signal values across the region.
    """
    # Parameters
    window = 50
    half = window // 2
    defaultKmerValue = 1.0  # fallback for k-mers absent from the tables

    # Initialization
    fastaFile = Fastafile(genome_file_name)
    # k-mer length from any table key; next(iter(...)) is Python-2/3 safe
    # (the original dict.keys()[0] only works on Python 2).
    k_nb = len(next(iter(fBiasDict)))
    p1 = start
    p2 = end
    # Expand the region so smoothing and k-mer context exist at the edges;
    # floor division keeps indices integral on Python 3.
    p1_w = p1 - half
    p2_w = p2 + half
    p1_wk = p1_w - (k_nb // 2)
    p2_wk = p2_w + (k_nb // 2)

    # Raw counts of 5' cut sites per strand
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for r in bam.fetch(chrName, p1_w, p2_w):
        if (not r.is_reverse) and (r.pos > p1_w):
            nf[r.pos - p1_w] += 1.0
        if r.is_reverse and ((r.aend - 1) < p2_w):
            nr[r.aend - 1 - p1_w] += 1.0

    # Smoothed counts: sliding sum over `window` positions
    Nf = []
    Nr = []
    fSum = sum(nf[:window])
    rSum = sum(nr[:window])
    fLast = nf[0]
    rLast = nr[0]
    for i in range(half, len(nf) - half):
        Nf.append(fSum)
        Nr.append(rSum)
        fSum += nf[i + half] - fLast
        fLast = nf[i - half + 1]
        rSum += nr[i + half] - rLast
        rLast = nr[i - half + 1]

    # Fetching sequence (offsets follow the original strand-specific anchoring
    # of k-mers around each position -- TODO confirm intent)
    currStr = str(fastaFile.fetch(chrName, p1_wk - 1, p2_wk - 2)).upper()
    currRevComp = revcomp(
        str(fastaFile.fetch(chrName, p1_wk + 2, p2_wk + 1)).upper())

    # Per-position bias; unknown k-mers (e.g. containing N) fall back to 1.0
    # (dict.get replaces the original per-item try/except for the same effect)
    af = []
    ar = []
    for i in range(k_nb // 2, len(currStr) - (k_nb // 2) + 1):
        fseq = currStr[i - (k_nb // 2):i + (k_nb // 2)]
        rseq = currRevComp[len(currStr) - (k_nb // 2) - i:
                           len(currStr) + (k_nb // 2) - i]
        af.append(fBiasDict.get(fseq, defaultKmerValue))
        ar.append(rBiasDict.get(rseq, defaultKmerValue))

    # Expected counts and log-ratio correction
    fSum = sum(af[:window])
    rSum = sum(ar[:window])
    fLast = af[0]
    rLast = ar[0]
    bias_corrected_signal = []
    for i in range(half, len(af) - half):
        nhatf = Nf[i - half] * (af[i] / fSum)
        nhatr = Nr[i - half] * (ar[i] / rSum)
        zf = log(nf[i] + 1) - log(nhatf + 1)
        zr = log(nr[i] + 1) - log(nhatr + 1)
        bias_corrected_signal.append(zf + zr)
        fSum += af[i + half] - fLast
        fLast = af[i - half + 1]
        rSum += ar[i + half] - rLast
        rLast = ar[i - half + 1]

    # Termination
    fastaFile.close()
    return bias_corrected_signal
示例#10
0
def bias_correction(chrom, start, end, bam, bias_table, genome_file_name, forward_shift, reverse_shift):
    """
    Compute a k-mer-bias-weighted cut-site signal for [start, end) on chrom.

    Args:
        chrom, start, end: genomic region to correct.
        bam: pysam alignment file; used to count shifted 5' cut sites.
        bias_table: pair (forward dict, reverse dict) of k-mer -> bias score.
        genome_file_name: path to an indexed FASTA genome.
        forward_shift, reverse_shift: offsets applied to forward/reverse cut sites.

    Returns:
        list of float: bias-weighted expected counts per base, or the raw cut
        counts when the expanded window would cross the chromosome start.
    """
    # Parameters
    window = 50
    half = window // 2
    defaultKmerValue = 1.0  # fallback for k-mers missing from the tables

    # Initialization
    fastaFile = Fastafile(genome_file_name)
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    # k-mer length from any table key; next(iter(...)) is Python-2/3 safe
    # (the original dict.keys()[0] only works on Python 2).
    k_nb = len(next(iter(fBiasDict)))
    p1 = start
    p2 = end
    p1_w = p1 - half
    p2_w = p2 + half
    # floor(k/2) == k // 2 and ceil(k/2) == (k + 1) // 2 for non-negative k,
    # so the math.floor/ceil round-trips through float are unnecessary.
    p1_wk = p1_w - (k_nb // 2)
    p2_wk = p2_w + ((k_nb + 1) // 2)

    if p1 <= 0 or p1_w <= 0 or p2_wk <= 0:
        # The expanded window would cross the chromosome start: fall back to
        # returning the raw (uncorrected) cut counts for the region.
        bc_signal = [0.0] * (p2 - p1)
        for read in bam.fetch(chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
            else:
                cut_site = read.aend + reverse_shift - 1
            if p1 <= cut_site < p2:
                bc_signal[cut_site - p1] += 1.0
        return bc_signal

    # Raw cut-site counts per strand over the expanded window
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for read in bam.fetch(chrom, p1_w, p2_w):
        # check if the read is unmapped, according to issue #112
        if read.is_unmapped:
            continue
        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            if p1_w <= cut_site < p2_w:
                nf[cut_site - p1_w] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if p1_w <= cut_site < p2_w:
                nr[cut_site - p1_w] += 1.0

    # Smoothed counts: sliding sum over `window` positions
    Nf = []
    Nr = []
    f_sum = sum(nf[:window])
    r_sum = sum(nr[:window])
    f_last = nf[0]
    r_last = nr[0]
    for i in range(half, len(nf) - half):
        Nf.append(f_sum)
        Nr.append(r_sum)
        f_sum += nf[i + half] - f_last
        f_last = nf[i - half + 1]
        r_sum += nr[i + half] - r_last
        r_last = nr[i - half + 1]

    # Fetching sequence (reverse strand is offset by one base, as in the original)
    currStr = str(fastaFile.fetch(chrom, p1_wk, p2_wk - 1)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrom, p1_wk + 1, p2_wk)).upper())

    # Per-position bias scores; unknown k-mers (e.g. containing N) get 1.0
    # (dict.get replaces the original per-item try/except for the same effect)
    af = []
    ar = []
    for i in range((k_nb + 1) // 2, len(currStr) - (k_nb // 2) + 1):
        fseq = currStr[i - (k_nb // 2):i + ((k_nb + 1) // 2)]
        rseq = currRevComp[len(currStr) - ((k_nb + 1) // 2) - i:len(currStr) + (k_nb // 2) - i]
        af.append(fBiasDict.get(fseq, defaultKmerValue))
        ar.append(rBiasDict.get(rseq, defaultKmerValue))

    # Expected (bias-weighted) counts over the sliding window
    f_sum = sum(af[:window])
    r_sum = sum(ar[:window])
    f_last = af[0]
    r_last = ar[0]
    bc_signal = []
    for i in range(half, len(af) - half):
        nhatf = Nf[i - half] * (af[i] / f_sum)
        nhatr = Nr[i - half] * (ar[i] / r_sum)
        bc_signal.append(nhatf + nhatr)
        f_sum += af[i + half] - f_last
        f_last = af[i - half + 1]
        r_sum += ar[i + half] - r_last
        r_last = ar[i - half + 1]

    # Termination
    fastaFile.close()
    return bc_signal
示例#11
0
                # For plus-strand features use the larger of the two
                # coordinates (presumably the 3' end -- TODO confirm).
                if strand == '+':
                    pos = max(int(c[1]), int(c[2]))

                # primer-pair names for the inner and outer PCR products
                iname   = 'c' + chrom + 'p' + str(pos) + 'IN'
                oname   = 'c' + chrom + 'p' + str(pos) + 'OUT'

                # nested windows around pos: +/-600bp (outer), +/-450bp (inner)
                outerstart = pos - 600
                outerend   = pos + 600
                innerstart = pos - 450
                innerend   = pos + 450

                # clamp window starts at the chromosome origin
                if outerstart < 0: outerstart = 0
                if innerstart < 0: innerstart = 0

                outerseq = ref.fetch(chrom, outerstart, outerend)
                innerseq = ref.fetch(chrom, innerstart, innerend)

                # NOTE(review): the earlier test compares against '+' while
                # this one uses 'plus' -- confirm which strand encoding the
                # input carries; one of these branches may never trigger.
                if strand == 'plus':
                    outerseq = rc(outerseq)
                    innerseq = rc(innerseq)

                # design primer pairs targeting the given product-size ranges
                outerprimers = makeprimers(oname, outerseq, 1000, 1200)
                innerprimers = makeprimers(iname, innerseq, 700, 900)

                # accumulators filled later (outside this view)
                leftouterprimer  = ''
                rightouterprimer = ''
                leftinnerprimer  = ''
                rightinnerprimer = ''

                primertext = ''
示例#12
0
class ReferenceGenome(object):
    """
    Class to read sequence data from a reference genome

    Attributes:
        genome_fasta_file (string): Path to reference genome file
        _fasta_file (pysam.Fastafile): Fastafile object for reference genome
    """

    def __init__(self, genome_fasta_file, logger):
        """
        Create new ReferenceGenome

        Args:
            genome_fasta_file (string): Path to whole genome FASTA file
            logger (logging.Logger): Logger for reporting warnings/errors

        Raises:
            ValueError: If the FASTA index (.fai) file is missing
            IOError: If the genome file cannot be opened

        Returns:
            ReferenceGenome
        """
        self._logger = logger
        self.genome_fasta_file = genome_fasta_file
        self._validate_reference_file()

        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
        # no longer swallowed and re-reported as an I/O failure.
        try:
            self._fasta_file = Fastafile(self.genome_fasta_file)
        except Exception:
            raise IOError("Could not read genome file: " +
                          self.genome_fasta_file)

    def _validate_reference_file(self):
        """
        Check that the reference file has a FASTA index (.fai) next to it

        Raises:
            ValueError: If no FASTA index file exists

        Returns:
            None
        """
        if not os.path.isfile(self.genome_fasta_file + ".fai"):
            raise ValueError(
                "Supplied genome FASTA file does not have FASTA index")

    def get_contig_lengths(self):
        """
        Get the names and lengths of all contigs in the reference

        Returns:
            zip of (name, length) pairs for all contigs in the reference
        """
        return zip(self._fasta_file.references, self._fasta_file.lengths)

    def get_reference_bases(self, chrom, start, end):
        """
        Get the reference bases from start to end

        Args:
            chrom (string): Chromosome to query
            start (int): Start position to query
            end (int): End position (not inclusive)

        Returns:
            string: The genome sequence

        Raises:
            ValueError: Invalid coordinates or unknown chromosome
        """
        if start >= end:
            raise ValueError("Start/stop coordinates incorrect for: " +
                             str(chrom) + ":" + str(start) + "-" + str(end))

        if chrom not in self._fasta_file:
            raise ValueError(
                "FASTA reference is missing entry for chromosome " + str(chrom))

        return self._fasta_file.fetch(str(chrom), start, end)
示例#13
0
def estimate_bias_pwm(args):
    """
    Estimate strand-specific cleavage-bias tables from observed vs. expected
    k-mer frequencies around read cut sites.

    Writes position-frequency matrices (pfm/), sequence logos (logo/) and the
    final forward/reverse bias tables via write_table().

    Args:
        args: parsed CLI namespace; uses reads_file, organism, regions_file,
            k_nb, forward_shift, reverse_shift, output_location and
            output_prefix.
    """
    # Parameters
    max_duplicates = 100  # max reads tolerated at one cut position (PCR artifacts)

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    def _zero_pwm():
        # One zero row per nucleotide (N included); fresh matrix per call so
        # the four tables never share rows.
        return dict([(n, [0.0] * args.k_nb) for n in ("A", "C", "G", "T", "N")])

    obs_f_pwm_dict = _zero_pwm()
    exp_f_pwm_dict = _zero_pwm()
    obs_r_pwm_dict = _zero_pwm()
    exp_r_pwm_dict = _zero_pwm()

    # Iterating on HS regions
    for region in regions:
        # Initialization
        prev_pos = -1
        true_counter = 0

        # Observed frequencies: k-mer window centred on each shifted cut site
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions (k_nb // 2 == floor(k_nb / 2) for ints)
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
            else:
                cut_site = r.aend + args.reverse_shift + 1
            p1 = cut_site - (args.k_nb // 2)
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts: skip pileups beyond max_duplicates
            if p1 == prev_pos:
                true_counter += 1
            else:
                prev_pos = p1
                true_counter = 0
            if true_counter > max_duplicates:
                continue

            # Fetching k-mer; skip cut sites falling off the contig
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse:
                currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting the k-mer in the strand-matched observed matrix
            obs_pwm = obs_r_pwm_dict if r.is_reverse else obs_f_pwm_dict
            for pos in range(len(currStr)):
                obs_pwm[currStr[pos]][pos] += 1

        # Expected frequencies: every k-mer of the whole region sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue

        # Distinct loop variables fix the original's reuse of `i` for both
        # the outer offset and the inner position.
        for offset in range(len(currStr) - args.k_nb):
            kmer = currStr[offset:offset + args.k_nb]
            for pos in range(len(kmer)):
                exp_f_pwm_dict[kmer[pos]][pos] += 1

            # The reverse complement counts toward the reverse-strand matrix
            kmer = AuxiliaryFunctions.revcomp(kmer)
            for pos in range(len(kmer)):
                exp_r_pwm_dict[kmer[pos]][pos] += 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Output pwms (os.makedirs instead of shelling out to `mkdir -p`)
    pfm_dir = os.path.join(args.output_location, "pfm")
    if not os.path.isdir(pfm_dir):
        os.makedirs(pfm_dir)
    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]
    pwm_obs_f = os.path.join(pfm_dir, "obs_{}_f.pfm".format(str(args.k_nb)))
    pwm_obs_r = os.path.join(pfm_dir, "obs_{}_r.pfm".format(str(args.k_nb)))
    pwm_exp_f = os.path.join(pfm_dir, "exp_{}_f.pfm".format(str(args.k_nb)))
    pwm_exp_r = os.path.join(pfm_dir, "exp_{}_r.pfm".format(str(args.k_nb)))
    pwm_file_list = [pwm_obs_f, pwm_obs_r, pwm_exp_f, pwm_exp_r]

    # One PFM file per matrix: rows A/C/G/T of integer counts
    for pwm_dict, pwm_path in zip(pwm_dict_list, pwm_file_list):
        with open(pwm_path, "w") as pwm_file:
            for e in ["A", "C", "G", "T"]:
                pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[e]]) + "\n")

    # Parse the PFMs back as motifs; `with` closes the handles the original leaked
    with open(pwm_obs_f) as handle:
        motif_obs_f = motifs.read(handle, "pfm")
    with open(pwm_obs_r) as handle:
        motif_obs_r = motifs.read(handle, "pfm")
    with open(pwm_exp_f) as handle:
        motif_exp_f = motifs.read(handle, "pfm")
    with open(pwm_exp_r) as handle:
        motif_exp_r = motifs.read(handle, "pfm")

    # Output logos
    logo_dir = os.path.join(args.output_location, "logo")
    if not os.path.isdir(logo_dir):
        os.makedirs(logo_dir)
    logo_obs_f = os.path.join(logo_dir, "obs_{}_f.pdf".format(str(args.k_nb)))
    logo_obs_r = os.path.join(logo_dir, "obs_{}_r.pdf".format(str(args.k_nb)))
    logo_exp_f = os.path.join(logo_dir, "exp_{}_f.pdf".format(str(args.k_nb)))
    logo_exp_r = os.path.join(logo_dir, "exp_{}_r.pdf".format(str(args.k_nb)))

    motif_obs_f.weblogo(logo_obs_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_obs_r.weblogo(logo_obs_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_exp_f.weblogo(logo_exp_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)
    motif_exp_r.weblogo(logo_exp_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)

    # Creating bias dictionary: per k-mer, observed/expected PPM score ratio
    alphabet = ["A", "C", "G", "T"]
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
    bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
    for k_mer in k_mer_comb:
        obs_f = get_ppm_score(k_mer, motif_obs_f.pwm, args.k_nb)
        exp_f = get_ppm_score(k_mer, motif_exp_f.pwm, args.k_nb)
        bias_table_F[k_mer] = round(obs_f / exp_f, 6)
        obs_r = get_ppm_score(k_mer, motif_obs_r.pwm, args.k_nb)
        exp_r = get_ppm_score(k_mer, motif_exp_r.pwm, args.k_nb)
        bias_table_R[k_mer] = round(obs_r / exp_r, 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
示例#14
0
File: Match.py  Project: CostaLab/reg-gen
def main(args):
    """
    Performs motif matching.
    """

    ###################################################################################################
    # Processing Input Arguments
    ###################################################################################################

    # Initializing Error Handler
    err = ErrorHandler()

    # Additional Parameters
    matching_folder_name = "match"
    random_region_name = "random_regions"

    filter_values = parse_filter(args.filter)

    ###################################################################################################
    # Initializations
    ###################################################################################################

    # Output folder
    if args.output_location:
        output_location = args.output_location
    else:
        output_location = npath(matching_folder_name)
    print(">> output location:", output_location)

    # Default genomic data
    genome_data = GenomeData(args.organism)

    print(">> genome:", genome_data.organism)
    print(">> pseudocounts:", args.pseudocounts)
    print(">> fpr threshold:", args.fpr)

    ###################################################################################################
    # Reading Input Regions
    ###################################################################################################

    genomic_regions_dict = {}

    # get experimental matrix, if available
    if args.input_matrix:
        try:
            exp_matrix = ExperimentalMatrix()
            exp_matrix.read(args.input_matrix)

            # if the matrix is present, the (empty) dictionary is overwritten
            genomic_regions_dict = exp_matrix.objectsDict

            print(">>> experimental matrix loaded")

        except Exception:
            err.throw_error("MM_WRONG_EXPMAT")
    elif args.input_files:
        # get input files, if available
        for input_filename in args.input_files:
            name, _ = os.path.splitext(os.path.basename(input_filename))

            regions = GenomicRegionSet(name)
            regions.read(npath(input_filename))

            genomic_regions_dict[name] = regions

            print(">>> input file", name, "loaded:", len(regions), "regions")

    # we put this here because we don't want to create the output directory unless we
    # are sure the initialisation (including loading input files) worked
    try:
        if not os.path.isdir(output_location):
            os.makedirs(output_location)
    except Exception:
        err.throw_error("MM_OUT_FOLDER_CREATION")

    annotation = None
    target_genes = None
    # get promoter regions from list of genes (both target and background)
    # TODO: should be more clever, allow precomputed regions etc
    if args.target_genes_filename:
        annotation = AnnotationSet(args.organism, alias_source=args.organism,
                                   protein_coding=True, known_only=True)

        target_genes = GeneSet("target_genes")
        target_genes.read(args.target_genes_filename)

        # TODO: what do we do with unmapped genes? maybe just print them out
        target_regions = annotation.get_promoters(gene_set=target_genes, promoter_length=args.promoter_length)
        target_regions.name = "target_regions"
        target_regions.sort()
        output_file_name = npath(os.path.join(output_location, target_regions.name + ".bed"))
        target_regions.write(output_file_name)

        genomic_regions_dict[target_regions.name] = target_regions

        print(">>> target promoter file created:", len(target_regions), "regions")

    # we make a background in case it's requested, but also in case a list of target genes has not been
    # provided
    if args.promoter_make_background or (args.promoters_only and not args.target_genes_filename):
        if not annotation:
            annotation = AnnotationSet(args.organism, alias_source=args.organism,
                                       protein_coding=True, known_only=True)

        # background is made of all known genes minus the target genes (if any)
        background_genes = GeneSet("background_genes")
        background_genes.get_all_genes(organism=args.organism)

        if target_genes:
            background_genes.subtract(target_genes)

        background_regions = annotation.get_promoters(gene_set=background_genes,
                                                      promoter_length=args.promoter_length)
        background_regions.name = "background_regions"
        background_regions.sort()
        output_file_name = npath(os.path.join(output_location, background_regions.name + ".bed"))
        background_regions.write(output_file_name)

        genomic_regions_dict[background_regions.name] = background_regions

        print(">>> background promoter file created:", len(background_regions), "regions")

    if not genomic_regions_dict:
        err.throw_error("DEFAULT_ERROR", add_msg="You must either specify an experimental matrix, or at least a "
                                                 "valid input file, or one of the 'promoter test' options.")

    max_region_len = 0
    max_region = None
    regions_to_match = []

    # Iterating on experimental matrix objects
    for k in genomic_regions_dict.keys():

        curr_genomic_region = genomic_regions_dict[k]

        # If the object is a GenomicRegionSet
        if isinstance(curr_genomic_region, GenomicRegionSet):

            if args.rmdup:
                # remove duplicates and sort regions
                curr_genomic_region.remove_duplicates(sort=True)
            else:
                # sort regions
                curr_genomic_region.sort()

            # Append label and GenomicRegionSet
            regions_to_match.append(curr_genomic_region)

            # Verifying max_region_len for random region generation
            curr_len = len(curr_genomic_region)
            if curr_len > max_region_len:
                max_region_len = curr_len
                max_region = curr_genomic_region

    print(">> all files loaded")

    ###################################################################################################
    # Creating random regions
    ###################################################################################################

    # if a random proportion is set, create random regions
    if args.rand_proportion:

        # Create random coordinates and name it random_regions
        rand_region = max_region.random_regions(args.organism, multiply_factor=args.rand_proportion, chrom_X=True)
        rand_region.sort()
        rand_region.name = random_region_name

        # Add random regions to the list of regions to perform matching on
        regions_to_match.append(rand_region)

        # Writing random regions
        output_file_name = npath(os.path.join(output_location, random_region_name))
        rand_bed_file_name = output_file_name + ".bed"
        rand_region.write(rand_bed_file_name)

        # Verifying condition to write bb
        if args.bigbed:

            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            try:
                # Converting to big bed
                bed_to_bb(rand_bed_file_name, chrom_sizes_file)

                # removing previously-created BED file
                os.remove(rand_bed_file_name)
            except Exception:
                err.throw_warning("DEFAULT_WARNING")  # FIXME: maybe error instead?

        print(">> random regions file created:", len(rand_region), "regions")

    ###################################################################################################
    # Creating PWMs
    ###################################################################################################

    if args.motif_dbs:
        ms = MotifSet(preload_motifs=args.motif_dbs, motif_dbs=True)
        # filter for dbs only if --motif_dbs is not set
        if 'database' in filter_values:
            del filter_values['database']
    else:
        if 'database' in filter_values:
            ms = MotifSet(preload_motifs=filter_values['database'])
        else:
            ms = MotifSet(preload_motifs="default")

    print(">> used database(s):", ",".join([str(db) for db in ms.motif_data.repositories_list]))

    # applying filtering pattern, taking a subset of the motif set
    if args.filter:
        ms = ms.filter(filter_values, search=args.filter_type)

    motif_list = ms.get_motif_list(args.pseudocounts, args.fpr)

    print(">> motifs loaded:", len(motif_list))

    # Performing normalized threshold strategy if requested
    if args.norm_threshold:
        threshold_list = [motif.threshold / motif.len for motif in motif_list]
        unique_threshold = sum(threshold_list) / len(threshold_list)
    else:
        unique_threshold = None

    scanner = scan.Scanner(7)
    pssm_list = []
    thresholds = []
    for motif in motif_list:
        if unique_threshold:
            thresholds.append(0.0)
            thresholds.append(0.0)
        else:
            thresholds.append(motif.threshold)
            thresholds.append(motif.threshold)

        pssm_list.append(motif.pssm)
        pssm_list.append(motif.pssm_rc)

    # Performing motif matching
    # TODO: we can expand this to use bg from sequence, for example,
    # or from organism.
    bg = tools.flat_bg(4)
    scanner.set_motifs(pssm_list, bg, thresholds)

    ###################################################################################################
    # Motif Matching
    ###################################################################################################

    # Creating genome file
    genome_file = Fastafile(genome_data.get_genome())

    print()

    # Iterating on list of genomic region sets
    for grs in regions_to_match:

        start = time.time()
        print(">> matching [", grs.name, "], ", len(grs), " regions... ", sep="", end='')
        sys.stdout.flush()

        # Initializing output bed file
        output_bed_file = os.path.join(output_location, grs.name + "_mpbs.bed")

        # must remove it because we append the MPBS
        if os.path.isfile(output_bed_file):
            os.remove(output_bed_file)

        # Iterating on genomic region set
        for genomic_region in grs:

            # Reading sequence associated to genomic_region
            sequence = str(genome_file.fetch(genomic_region.chrom, genomic_region.initial, genomic_region.final))

            grs_tmp = match_multiple(scanner, motif_list, sequence, genomic_region)

            # post-processing: if required, remove duplicate regions on opposing strands (keep highest score)
            if len(grs_tmp) > 1 and args.remove_strand_duplicates:
                grs_tmp.sort()
                seqs = grs_tmp.sequences
                seqs_new = []
                cur_pos = 0
                end_pos = len(seqs) - 1
                while cur_pos < end_pos:
                    gr = seqs[cur_pos]

                    new_pos = cur_pos + 1
                    while new_pos < end_pos:
                        gr2 = seqs[new_pos]

                        # if this sequence is unrelated, we move on
                        if gr.name != gr2.name or gr.chrom != gr2.chrom or gr.initial != gr2.initial or gr.final != gr2.final or gr.orientation == gr2.orientation:
                            break

                        if float(gr.data) < float(gr2.data):
                            gr = gr2

                        new_pos = new_pos + 1

                    # adding the currently-selected genomic region
                    seqs_new.append(gr)

                    # at the next loop, we start from the next right-handed sequences
                    cur_pos = new_pos

                # edge case: the last element was not considered
                # (when it is, cur_pos == end_pos+1)
                if cur_pos == end_pos:
                    seqs_new.append(seqs[cur_pos])

                grs_tmp.sequences = seqs_new

            grs_tmp.write(output_bed_file, mode="a")

        del grs.sequences[:]

        # Verifying condition to write bb
        if args.bigbed and args.normalize_bitscore:
            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            # Converting to big bed
            bed_to_bb(output_bed_file, chrom_sizes_file)

            # removing BED file
            os.remove(output_bed_file)

        secs = time.time() - start
        print("[", "%02.3f" % secs, " seconds]", sep="")
示例#15
0
    def estimate_table_pwm(self, regions, dnase_file_name, genome_file_name,
                           k_nb, forward_shift, reverse_shift):
        """
        Estimates bias based on HS regions, DNase-seq signal and genomic
        sequences, by building PWMs from observed (cut-site-centered) and
        expected (background) k-mers.

        Keyword arguments:
        regions -- DNase-seq HS regions.
        dnase_file_name -- DNase-seq BAM file name.
        genome_file_name -- Genome to fetch genomic sequences from.
        k_nb -- K-mer length used for the bias estimate.
        forward_shift -- Shift applied to forward-strand cut sites.
        reverse_shift -- Shift applied to reverse-strand cut sites.

        Return:
        [bias_table_F, bias_table_R] -- Bias tables (k-mer -> bias ratio),
        or None if the input file is not a BAM file.
        """

        # Initializing bam and fasta; only BAM input is supported.
        if (dnase_file_name.split(".")[-1].upper() != "BAM"):
            return None  # TODO ERROR
        bamFile = Samfile(dnase_file_name, "rb")
        fastaFile = Fastafile(genome_file_name)

        # Observed (per-read) and expected (background) k-mer collections.
        obsSeqsF = []
        obsSeqsR = []
        expSeqsF = []
        expSeqsR = []

        # Iterating on HS regions
        for region in regions:
            # Evaluating observed frequencies
            # Fetching reads; the k-mer window is centered on the
            # strand-specific (shifted) cut site.
            for r in bamFile.fetch(region.chrom, region.initial, region.final):
                # Calculating positions
                if (not r.is_reverse):
                    cut_site = r.pos + forward_shift - 1
                    p1 = cut_site - int(floor(k_nb / 2))
                else:
                    cut_site = r.aend + reverse_shift + 1
                    p1 = cut_site - int(floor(k_nb / 2))
                p2 = p1 + k_nb

                # Fetching k-mer; skip reads whose window cannot be fetched
                # (e.g. it falls outside the chromosome bounds).
                try:
                    currStr = str(fastaFile.fetch(region.chrom, p1,
                                                  p2)).upper()
                except Exception:
                    continue
                if (r.is_reverse):
                    currStr = AuxiliaryFunctions.revcomp(currStr)

                # Collect the k-mer unless it contains an ambiguous base.
                if 'N' not in currStr:
                    if (not r.is_reverse):
                        obsSeqsF.append(Seq(currStr))
                    else:
                        obsSeqsR.append(Seq(currStr))

            # Evaluating expected frequencies
            # Fetching whole sequence
            try:
                currStr = str(
                    fastaFile.fetch(region.chrom, region.initial,
                                    region.final)).upper()
            except Exception:
                continue
            currRevComp = AuxiliaryFunctions.revcomp(currStr)

            # Iterating on each sequence position. Each k-mer is screened for
            # 'N' individually; the previous code tested the whole region
            # sequence, which discarded every k-mer of a region as soon as a
            # single 'N' occurred anywhere in it.
            for i in range(0, len(currStr) - k_nb):
                # Counting k-mer in dictionary
                s = currStr[i:i + k_nb]
                if 'N' not in s:
                    expSeqsF.append(Seq(s))

                # Counting k-mer in dictionary for reverse complement
                s = currRevComp[i:i + k_nb]
                if 'N' not in s:
                    expSeqsR.append(Seq(s))

        # Closing files
        bamFile.close()
        fastaFile.close()

        # Building position weight matrices from the four k-mer collections.
        obsMotifsF = motifs.create(obsSeqsF)
        obsMotifsR = motifs.create(obsSeqsR)
        expMotifsF = motifs.create(expSeqsF)
        expMotifsR = motifs.create(expSeqsR)

        obsPwmF = obsMotifsF.pwm
        obsPwmR = obsMotifsR.pwm
        expPwmF = expMotifsF.pwm
        expPwmR = expMotifsR.pwm

        # Output logos.
        # NOTE(review): all four file names embed forward_shift, including
        # the reverse-strand outputs -- confirm the naming is intended.
        logo_obs_f = os.path.join(
            self.output_loc, "Bias", "logo",
            "obs_{}_{}_f.pdf".format(str(k_nb), str(forward_shift)))
        logo_obs_r = os.path.join(
            self.output_loc, "Bias", "logo",
            "obs_{}_{}_r.pdf".format(str(k_nb), str(forward_shift)))
        logo_exp_f = os.path.join(
            self.output_loc, "Bias", "logo",
            "exp_{}_{}_f.pdf".format(str(k_nb), str(forward_shift)))
        logo_exp_r = os.path.join(
            self.output_loc, "Bias", "logo",
            "exp_{}_{}_r.pdf".format(str(k_nb), str(forward_shift)))
        obsMotifsF.weblogo(logo_obs_f,
                           format="pdf",
                           stack_width="large",
                           color_scheme="color_classic",
                           yaxis_scale=0.2,
                           yaxis_tic_interval=0.1)
        obsMotifsR.weblogo(logo_obs_r,
                           format="pdf",
                           stack_width="large",
                           color_scheme="color_classic",
                           yaxis_scale=0.2,
                           yaxis_tic_interval=0.1)
        expMotifsF.weblogo(logo_exp_f,
                           format="pdf",
                           stack_width="large",
                           color_scheme="color_classic",
                           yaxis_scale=0.02,
                           yaxis_tic_interval=0.01)
        expMotifsR.weblogo(logo_exp_r,
                           format="pdf",
                           stack_width="large",
                           color_scheme="color_classic",
                           yaxis_scale=0.02,
                           yaxis_tic_interval=0.01)

        # Output pwms (one text file per matrix).
        pwm_data_list = [obsPwmF, obsPwmR, expPwmF, expPwmR]
        pwm_obs_f = os.path.join(
            self.output_loc, "Bias", "pwm",
            "obs_{}_{}_f.pwm".format(str(k_nb), str(forward_shift)))
        pwm_obs_r = os.path.join(
            self.output_loc, "Bias", "pwm",
            "obs_{}_{}_r.pwm".format(str(k_nb), str(forward_shift)))
        pwm_exp_f = os.path.join(
            self.output_loc, "Bias", "pwm",
            "exp_{}_{}_f.pwm".format(str(k_nb), str(forward_shift)))
        pwm_exp_r = os.path.join(
            self.output_loc, "Bias", "pwm",
            "exp_{}_{}_r.pwm".format(str(k_nb), str(forward_shift)))
        pwm_file_list = [pwm_obs_f, pwm_obs_r, pwm_exp_f, pwm_exp_r]

        # Context managers guarantee the files are closed even on error.
        for pwm_data, pwm_file in zip(pwm_data_list, pwm_file_list):
            with open(pwm_file, "w") as f:
                f.write(str(pwm_data))

        # Creating bias dictionary: bias = PWM score of the k-mer under the
        # observed model divided by its score under the expected model.
        alphabet = ["A", "C", "G", "T"]
        k_mer_comb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
        bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
        bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
        for k_mer in k_mer_comb:
            obsF = self.get_pwm_score(k_mer, obsPwmF, k_nb)
            expF = self.get_pwm_score(k_mer, expPwmF, k_nb)
            bias_table_F[k_mer] = round(obsF / expF, 6)
            obsR = self.get_pwm_score(k_mer, obsPwmR, k_nb)
            expR = self.get_pwm_score(k_mer, expPwmR, k_nb)
            bias_table_R[k_mer] = round(obsR / expR, 6)

        # Return
        return [bias_table_F, bias_table_R]
示例#16
0
    def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb,
                       forward_shift, reverse_shift):
        """
        Estimates bias based on HS regions, DNase-seq signal and genomic sequences.

        Keyword arguments:
        regions -- DNase-seq HS regions.
        dnase_file_name -- DNase-seq BAM file name.
        genome_file_name -- Genome to fetch genomic sequences from.
        k_nb -- K-mer length used for the bias estimate.
        forward_shift -- Shift applied to forward-strand cut sites.
        reverse_shift -- Shift applied to reverse-strand cut sites.

        Return:
        [bias_table_F, bias_table_R] -- Bias tables (k-mer -> bias ratio),
        or None if the input file is not a BAM file.
        """

        # Parameters
        maxDuplicates = 100  # max tolerated reads sharing one start (PCR artifacts)
        pseudocount = 1.0    # Laplace smoothing for the k-mer counts

        # Initializing bam and fasta; only BAM input is supported.
        if (dnase_file_name.split(".")[-1].upper() != "BAM"):
            return None  # TODO ERROR
        bamFile = Samfile(dnase_file_name, "rb")
        fastaFile = Fastafile(genome_file_name)

        # Initializing observed/expected k-mer count dictionaries
        obsDictF = dict()
        obsDictR = dict()
        expDictF = dict()
        expDictR = dict()

        ct_reads_r = 0
        ct_reads_f = 0
        ct_kmers = 0

        # Iterating on HS regions
        for region in regions:

            # Initialization of the PCR-artifact tracker
            prevPos = -1
            trueCounter = 0

            # Evaluating observed frequencies ####################################
            # Fetching reads
            for r in bamFile.fetch(region.chrom, region.initial, region.final):

                # Calculating positions: the k-mer window is centered on the
                # strand-specific (shifted) cut site.
                if (not r.is_reverse):
                    cut_site = r.pos + forward_shift - 1
                    p1 = cut_site - int(floor(k_nb / 2))
                else:
                    cut_site = r.aend + reverse_shift + 1
                    p1 = cut_site - int(floor(k_nb / 2))
                p2 = p1 + k_nb

                # Verifying PCR artifacts: skip reads once too many share
                # the same start position.
                if (p1 == prevPos):
                    trueCounter += 1
                else:
                    prevPos = p1
                    trueCounter = 0
                if (trueCounter > maxDuplicates): continue

                # Fetching k-mer; skip windows that cannot be fetched
                # (e.g. outside the chromosome bounds).
                try:
                    currStr = str(fastaFile.fetch(region.chrom, p1,
                                                  p2)).upper()
                except Exception:
                    continue
                if (r.is_reverse):
                    currStr = AuxiliaryFunctions.revcomp(currStr)

                # Counting k-mer in dictionary (dict.get replaces the former
                # broad try/except around a plain missing-key case).
                if (not r.is_reverse):
                    ct_reads_f += 1
                    obsDictF[currStr] = obsDictF.get(currStr, 0) + 1
                else:
                    ct_reads_r += 1
                    obsDictR[currStr] = obsDictR.get(currStr, 0) + 1

            # Evaluating expected frequencies ####################################
            # Fetching whole sequence
            try:
                currStr = str(
                    fastaFile.fetch(region.chrom, region.initial,
                                    region.final)).upper()
            except Exception:
                continue
            currRevComp = AuxiliaryFunctions.revcomp(currStr)

            # Iterating on each sequence position
            for i in range(0, len(currStr) - k_nb):
                ct_kmers += 1
                # Counting k-mer in dictionary
                s = currStr[i:i + k_nb]
                expDictF[s] = expDictF.get(s, 0) + 1

                # Counting k-mer in dictionary for reverse complement
                s = currRevComp[i:i + k_nb]
                expDictR[s] = expDictR.get(s, 0) + 1

        # Closing files
        bamFile.close()
        fastaFile.close()

        # Creating bias dictionary: bias = (obs frequency) / (exp frequency),
        # both pseudocount-smoothed; 1 (no bias) when no reads were counted.
        alphabet = ["A", "C", "G", "T"]
        kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
        bias_table_F = dict([(e, 0.0) for e in kmerComb])
        bias_table_R = dict([(e, 0.0) for e in kmerComb])
        for kmer in kmerComb:
            obsF = obsDictF.get(kmer, 0) + pseudocount
            expF = expDictF.get(kmer, 0) + pseudocount
            if ct_reads_f == 0:
                bias_table_F[kmer] = 1
            else:
                bias_table_F[kmer] = round(
                    float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)
            obsR = obsDictR.get(kmer, 0) + pseudocount
            expR = expDictR.get(kmer, 0) + pseudocount
            if ct_reads_r == 0:
                bias_table_R[kmer] = 1
            else:
                bias_table_R[kmer] = round(
                    float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

        # Return
        return [bias_table_F, bias_table_R]
示例#17
0
    def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb, shift):
        """
        Estimates bias based on HS regions, DNase-seq signal and genomic sequences.

        Keyword arguments:
        regions -- DNase-seq HS regions.
        dnase_file_name -- DNase-seq BAM file name.
        genome_file_name -- Genome to fetch genomic sequences from.
        k_nb -- K-mer length used for the bias estimate.
        shift -- Shift applied to cut sites (added on the forward strand,
                 subtracted on the reverse strand).

        Return:
        [bias_table_F, bias_table_R] -- Bias tables (k-mer -> bias ratio),
        or None if the input file is not a BAM file.
        """

        # Parameters
        maxDuplicates = 100  # max tolerated reads sharing one start (PCR artifacts)
        pseudocount = 1.0    # Laplace smoothing for the k-mer counts

        # Initializing bam and fasta; only BAM input is supported.
        if(dnase_file_name.split(".")[-1].upper() != "BAM"): return None # TODO ERROR
        bamFile = Samfile(dnase_file_name, "rb")
        fastaFile = Fastafile(genome_file_name)

        # Initializing observed/expected k-mer count dictionaries
        obsDictF = dict(); obsDictR = dict()
        expDictF = dict(); expDictR = dict()

        ct_reads_r = 0
        ct_reads_f = 0
        ct_kmers = 0

        # Iterating on HS regions
        for region in regions:

            # Initialization of the PCR-artifact tracker
            prevPos = -1
            trueCounter = 0

            # Evaluating observed frequencies ####################################

            # Fetching reads
            for r in bamFile.fetch(region.chrom, region.initial, region.final):

                # Calculating positions. Floor division keeps the fetch
                # coordinates integral under Python 3 (k_nb/2 would be float).
                if(not r.is_reverse): p1 = r.pos - (k_nb // 2) - 1 + shift
                else: p1 = r.aend - (k_nb // 2) + 1 - shift
                p2 = p1 + k_nb

                # Verifying PCR artifacts: skip reads once too many share
                # the same start position.
                if(p1 == prevPos): trueCounter += 1
                else:
                    prevPos = p1
                    trueCounter = 0
                if(trueCounter > maxDuplicates): continue

                # Fetching k-mer; skip windows that cannot be fetched.
                try: currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
                except Exception: continue
                if(r.is_reverse): currStr = AuxiliaryFunctions.revcomp(currStr)

                # Counting k-mer in dictionary.
                # BUGFIX: the strand counters were swapped (forward reads
                # incremented ct_reads_r and reverse reads ct_reads_f).
                if(not r.is_reverse):
                    ct_reads_f += 1
                    obsDictF[currStr] = obsDictF.get(currStr, 0) + 1
                else:
                    ct_reads_r += 1
                    obsDictR[currStr] = obsDictR.get(currStr, 0) + 1

            # Evaluating expected frequencies ####################################

            # Fetching whole sequence
            try: currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception: continue
            currRevComp = AuxiliaryFunctions.revcomp(currStr)

            # Iterating on each sequence position
            for i in range(0,len(currStr)-k_nb):
                ct_kmers += 1
                # Counting k-mer in dictionary
                s = currStr[i:i+k_nb]
                expDictF[s] = expDictF.get(s, 0) + 1

                # Counting k-mer in dictionary for reverse complement
                s = currRevComp[i:i+k_nb]
                expDictR[s] = expDictR.get(s, 0) + 1

        # Closing files
        bamFile.close()
        fastaFile.close()

        # Creating bias dictionary: bias = (obs frequency) / (exp frequency),
        # both pseudocount-smoothed; 1 (no bias) when no reads were counted
        # (guards the previous unconditional division from ZeroDivisionError).
        alphabet = ["A","C","G","T"]
        kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
        bias_table_F = dict([(e,0.0) for e in kmerComb])
        bias_table_R = dict([(e,0.0) for e in kmerComb])
        for kmer in kmerComb:
            obsF = obsDictF.get(kmer, 0) + pseudocount
            expF = expDictF.get(kmer, 0) + pseudocount
            if ct_reads_f == 0:
                bias_table_F[kmer] = 1
            else:
                bias_table_F[kmer] = round(float(obsF/ct_reads_f)/float(expF/ct_kmers),6)
            obsR = obsDictR.get(kmer, 0) + pseudocount
            expR = expDictR.get(kmer, 0) + pseudocount
            if ct_reads_r == 0:
                bias_table_R[kmer] = 1
            else:
                bias_table_R[kmer] = round(float(obsR/ct_reads_r)/float(expR/ct_kmers),6)

        # Return
        return [bias_table_F, bias_table_R]
示例#18
0
        for i in range((window / 2), len(nf) - (window / 2)):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast
            fSum += nf[i + (window / 2)]
            fLast = nf[i - (window / 2) + 1]
            rSum -= rLast
            rSum += nr[i + (window / 2)]
            rLast = nr[i - (window / 2) + 1]

        #for i in range(p1, p2):
        #  print i+1, Nf[i-p1], Nr[i-p1]

        # Fetching sequence
        # -1 and +2 corrections because He is wrong
        currStr = str(fastaFile.fetch(chrName, p1_wk - 1, p2_wk - 2)).upper()
        currRevComp = str(
            Seq(str(fastaFile.fetch(chrName, p1_wk + 2,
                                    p2_wk + 1)).upper()).reverse_complement())

        #print currStr
        #print currRevComp

        # Iterating on sequence to create signal
        af = []
        ar = []
        for i in range((k_nb / 2), len(currStr) - (k_nb / 2) + 1):
            fseq = currStr[i - (k_nb / 2):i + (k_nb / 2)]
            rseq = currRevComp[len(currStr) - (k_nb / 2) - i:len(currStr) +
                               (k_nb / 2) - i]
            #print fseq, rseq
示例#19
0
    with open(sys.argv[2], 'r') as ref:
        for line in ref:

            (bin, name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd,
             exonCount, exonStarts, exonEnds, score, name2, cdsStartStat,
             cdsEndStat, exonFrames) = line.strip().split()

            exonStarts = map(int, exonStarts.split(',')[:-1])
            exonEnds = map(int, exonEnds.split(',')[:-1])

            assert len(exonEnds) == len(exonStarts) == int(exonCount)

            seq = ''

            for start, end in zip(exonStarts, exonEnds):
                if chrom in fa.references:
                    seq += fa.fetch(chrom, start, end)

            if strand == '-':
                seq = rc(seq)

            if seq:
                genes[name2].append(seq)

    for name in genes:
        for i, tx in enumerate(genes[name]):
            print ">%s.%d\n%s" % (name, i, tx)

else:
    sys.exit(usage())
示例#20
0
def estimate_bias_kmer(args):
    """
    Estimates k-mer cleavage bias from a BAM file over the given HS regions
    and writes the resulting forward/reverse bias tables to disk.

    Keyword arguments:
    args -- Namespace providing reads_file, organism, regions_file, k_nb,
            forward_shift, reverse_shift, output_location and output_prefix.

    Return:
    None -- the two bias tables are written via write_table.
    """

    # Parameters
    maxDuplicates = 100  # max tolerated reads sharing one start (PCR artifacts)
    pseudocount = 1.0    # Laplace smoothing for the k-mer counts

    # Initializing bam, fasta and regions
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    # Initializing observed/expected k-mer count dictionaries
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()

    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    # Iterating on HS regions
    for region in regions:

        # Initialization of the PCR-artifact tracker
        prevPos = -1
        trueCounter = 0

        # Evaluating observed frequencies ####################################
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):

            # Calculating positions: the k-mer window is centered on the
            # strand-specific (shifted) cut site.
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts: skip reads once too many share
            # the same start position.
            if p1 == prevPos:
                trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if trueCounter > maxDuplicates: continue

            # Fetching k-mer; skip windows that cannot be fetched
            # (e.g. outside the chromosome bounds).
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary (dict.get replaces the former
            # broad try/except around a plain missing-key case).
            if not r.is_reverse:
                ct_reads_f += 1
                obsDictF[currStr] = obsDictF.get(currStr, 0) + 1
            else:
                ct_reads_r += 1
                obsDictR[currStr] = obsDictR.get(currStr, 0) + 1

        # Evaluating expected frequencies ####################################
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)

        # Iterating on each sequence position
        for i in range(0, len(currStr) - args.k_nb):
            ct_kmers += 1
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            expDictF[s] = expDictF.get(s, 0) + 1

            # Counting k-mer in dictionary for reverse complement
            s = currRevComp[i:i + args.k_nb]
            expDictR[s] = expDictR.get(s, 0) + 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Creating bias dictionary: bias = (obs frequency) / (exp frequency),
    # both pseudocount-smoothed; 1 (no bias) when no reads were counted.
    alphabet = ["A", "C", "G", "T"]
    kmerComb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in kmerComb])
    bias_table_R = dict([(e, 0.0) for e in kmerComb])
    for kmer in kmerComb:
        obsF = obsDictF.get(kmer, 0) + pseudocount
        expF = expDictF.get(kmer, 0) + pseudocount
        if ct_reads_f == 0:
            bias_table_F[kmer] = 1
        else:
            bias_table_F[kmer] = round(float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)
        obsR = obsDictR.get(kmer, 0) + pseudocount
        expR = expDictR.get(kmer, 0) + pseudocount
        if ct_reads_r == 0:
            bias_table_R[kmer] = 1
        else:
            bias_table_R[kmer] = round(float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
示例#21
0
def create_signal(args, regions):
    """
    Counts observed (cut-site-centered) and expected (background) k-mers over
    the given regions and writes the four count tables to
    "<k_nb>_{f,r}_{obs,exp}.fa" in args.output_location.

    Keyword arguments:
    args -- Namespace providing reads_file, organism, k_nb, forward_shift,
            reverse_shift and output_location.
    regions -- Genomic regions to process.

    Return:
    None -- the four tables are written to disk.
    """

    def revcomp(s):
        # Reverse complement of a DNA string ('N' maps to 'N').
        rev_dict = dict([("A", "T"), ("T", "A"), ("C", "G"), ("G", "C"), ("N", "N")])
        return "".join([rev_dict[e] for e in s[::-1]])

    def _write_counts(fname, counts):
        # Write "kmer<TAB>count" for every k-mer with a nonzero count.
        # The context manager guarantees the file is closed even on error
        # (the previous code left four handles open on any exception).
        with open(fname, "w") as output_file:
            for kmer in counts.keys():
                if counts[kmer] > 0:
                    output_file.write(kmer + "\t" + str(counts[kmer]) + "\n")

    alphabet = ["A", "C", "G", "T"]
    kmer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    # Observed/expected k-mer counts, forward and reverse strand. All four
    # dictionaries share the same kmer_comb key set (and insertion order).
    f_obs_dict = dict([(e, 0.0) for e in kmer_comb])
    r_obs_dict = dict([(e, 0.0) for e in kmer_comb])
    f_exp_dict = dict([(e, 0.0) for e in kmer_comb])
    r_exp_dict = dict([(e, 0.0) for e in kmer_comb])

    bam_file = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fasta_file = Fastafile(genome_data.get_genome())

    for region in regions:
        # Fetching observed reads; the k-mer window is centered on the
        # strand-specific (shifted) cut site.
        reads = bam_file.fetch(reference=region.chrom, start=region.initial, end=region.final)
        for read in reads:
            if not read.is_reverse:
                p1 = read.pos - int(floor(args.k_nb / 2)) + args.forward_shift - 1
            else:
                p1 = read.aend - int(floor(args.k_nb / 2)) + args.reverse_shift + 1
            p2 = p1 + args.k_nb
            try:
                dna_sequence_obs = str(fasta_file.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                # Window could not be fetched (e.g. outside chromosome bounds).
                continue
            # Count the k-mer unless it contains an ambiguous base.
            if 'N' not in dna_sequence_obs:
                if read.is_reverse:
                    dna_sequence_obs = revcomp(dna_sequence_obs)
                    r_obs_dict[dna_sequence_obs] += 1
                else:
                    f_obs_dict[dna_sequence_obs] += 1

        # Fetching whole sequence for the expected (background) counts.
        try:
            dna_sequence_exp = str(fasta_file.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        dna_sequence_exp_rev = revcomp(dna_sequence_exp)
        for i in range(0, len(dna_sequence_exp) - args.k_nb):
            s = dna_sequence_exp[i:i + args.k_nb]
            if "N" not in s:
                f_exp_dict[s] += 1
            s = dna_sequence_exp_rev[i:i + args.k_nb]
            if "N" not in s:
                r_exp_dict[s] += 1

    # Close input handles (the previous code leaked both).
    bam_file.close()
    fasta_file.close()

    # Each table iterates its own dictionary. (The previous code looped over
    # r_obs_dict's keys for all four files -- equivalent only because the
    # four dicts share the kmer_comb key set, but misleading.)
    _write_counts(os.path.join(args.output_location, "{}_f_obs.fa".format(str(args.k_nb))), f_obs_dict)
    _write_counts(os.path.join(args.output_location, "{}_f_exp.fa".format(str(args.k_nb))), f_exp_dict)
    _write_counts(os.path.join(args.output_location, "{}_r_obs.fa".format(str(args.k_nb))), r_obs_dict)
    _write_counts(os.path.join(args.output_location, "{}_r_exp.fa".format(str(args.k_nb))), r_exp_dict)
示例#22
0
    # Smoothed counts
    Nf = []; Nr = [];
    fSum = sum(nf[:window]); rSum = sum(nr[:window]);
    fLast = nf[0]; rLast = nr[0]
    for i in range((window/2),len(nf)-(window/2)):
      Nf.append(fSum)
      Nr.append(rSum)
      fSum -= fLast; fSum += nf[i+(window/2)]; fLast = nf[i-(window/2)+1]
      rSum -= rLast; rSum += nr[i+(window/2)]; rLast = nr[i-(window/2)+1]

    #for i in range(p1, p2):
    #  print i+1, Nf[i-p1], Nr[i-p1]

    # Fetching sequence
    currStr = str(fastaFile.fetch(chrName, p1_wk+4, p2_wk+3)).upper()
    currRevComp = "".join([revDict[e] for e in currStr])

    #print currStr
    #print currRevComp

    # Iterating on sequence to create signal
    af = []; ar = []
    for i in range((k_nb/2),len(currStr)-(k_nb/2)+1):
      fseq = currStr[i-(k_nb/2):i+(k_nb/2)]
      rseq = currRevComp[i-(k_nb/2):i+(k_nb/2)][::-1]
      #rseq = currRevComp[len(currStr)-(k_nb/2)-i:len(currStr)+(k_nb/2)-i]
      #print fseq, rseq
      try: af.append(fBiasDict[fseq])
      except Exception: af.append(defaultKmerValue)
      try: ar.append(rBiasDict[rseq])
示例#23
0
def snp_workflow(ex, job, assembly, minsnp=40., mincov=5, path_to_ref=None, via='local',
                 logfile=sys.stdout, debugfile=sys.stderr):
    """Main function of the workflow.

    Pipeline: (1) per group, merge the group's BAMs and launch a
    non-blocking ``pileup`` job per chromosome to produce VCFs; (2) tar the
    VCFs and register the archive; (3) merge SNP info across groups into an
    "all SNPs" table and an annotated "exonic SNPs" table; (4) build a
    simple alignment file (sequence-count / length header) from the SNP
    columns; (5) create UCSC tracks, and optionally (job option
    ``make_bigwigs``) per-group coverage / heterozygosity / quality bigWigs.

    ex -- execution context; outputs are registered with ``ex.add``.
    job -- job description (``job.files``: gid -> runs, ``job.groups``:
        gid -> metadata such as 'name', ``job.options``).
    assembly -- genome assembly (``fasta_by_chrom``, ``chrmeta``, ``name``).
    minsnp -- minimum SNP threshold forwarded to ``all_snps``.
    mincov -- minimum coverage forwarded to ``all_snps``.
    path_to_ref -- not used in this body; kept for interface compatibility.
    via -- submission mode for the non-blocking pileup jobs.
    logfile, debugfile -- progress / debug streams.

    Returns 0 on completion.
    """
    ref_genome = assembly.fasta_by_chrom
    sample_names = [job.groups[gid]['name'] for gid in sorted(job.files.keys())]

    logfile.write("\n* Generate vcfs for each chrom/group\n"); logfile.flush()
    vcfs = dict((chrom,{}) for chrom in ref_genome.keys()) # {chr: {}}
    bams = {}
    # Launch the jobs
    for gid in sorted(job.files.keys()):
        # Merge all bams belonging to the same group
        runs = [r['bam'] for r in job.files[gid].itervalues()]
        bam = Samfile(runs[0])
        header = bam.header
        headerfile = unique_filename_in()
        # Rewrite sequence names in the header to the assembly's accession ids
        for h in header["SQ"]:
            if h["SN"] in assembly.chrmeta:
                h["SN"] = assembly.chrmeta[h["SN"]]["ac"]
        head = Samfile( headerfile, "wh", header=header )  # "wh": write header-only SAM
        head.close()
        if len(runs) > 1:
            _b = merge_bam(ex,runs)
            index_bam(ex,_b)
            bams[gid] = _b
        else:
            bams[gid] = runs[0]
        # Samtools mpileup + bcftools + vcfutils.pl
        for chrom,ref in ref_genome.iteritems():
            vcf = unique_filename_in()
            # Keep (output filename, pending future) until the job finishes
            vcfs[chrom][gid] = (vcf,
                                pileup.nonblocking(ex, bams[gid], ref, header=headerfile,
                                                   via=via, stdout=vcf))
        logfile.write("  ...Group %s running.\n" %job.groups[gid]['name']); logfile.flush()
    # Wait for vcfs to finish and store them in *vcfs[chrom][gid]*
    for gid in sorted(job.files.keys()):
        for chrom,ref in ref_genome.iteritems():
            vcfs[chrom][gid][1].wait()
            vcfs[chrom][gid] = vcfs[chrom][gid][0]
        logfile.write("  ...Group %s done.\n" %job.groups[gid]['name']); logfile.flush()
    # Targz the pileup files (vcf)
    tarname = unique_filename_in()
    tarfh = tarfile.open(tarname, "w:gz")
    for chrom,v in vcfs.iteritems():
        for gid,vcf in v.iteritems():
            tarfh.add(vcf, arcname="%s_%s.vcf" % (job.groups[gid]['name'],chrom))
    tarfh.close()
    ex.add( tarname, description=set_file_descr("vcfs_files.tar.gz",step="pileup",type="tar",view='admin') )

    logfile.write("\n* Merge info from vcf files\n"); logfile.flush()
    outall = unique_filename_in()
    outexons = unique_filename_in()
    # Write the header rows of the two tab-separated output tables
    with open(outall,"w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+ \
                                 ['gene','location_type','distance'])+'\n')
    with open(outexons,"w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+['exon','strand','ref_aa'] \
                                  + ['new_aa_'+s for s in sample_names])+'\n')
    # One growing string of SNP symbols per sample (plus the reference assembly)
    msa_table = dict((s,'') for s in [assembly.name]+sample_names)
    for chrom,v in vcfs.iteritems():
        logfile.write("  > Chromosome '%s'\n" % chrom); logfile.flush()
    # Put together info from all vcf files
        logfile.write("  - All SNPs\n"); logfile.flush()
        allsnps = all_snps(ex,chrom,vcfs[chrom],bams,outall,assembly,
                           sample_names,mincov,float(minsnp),logfile,debugfile)
    # Annotate SNPs and check synonymy
        logfile.write("  - Exonic SNPs\n"); logfile.flush()
        exon_snps(chrom,outexons,allsnps,assembly,sample_names,ref_genome,logfile,debugfile)
        for snprow in allsnps:
            # Column 3+n of each SNP row holds sample n's symbol; take its 1st char
            for n,k in enumerate([assembly.name]+sample_names):
                msa_table[k] += snprow[3+n][0]
    description = set_file_descr("allSNP.txt",step="SNPs",type="txt")
    ex.add(outall,description=description)
    description = set_file_descr("exonsSNP.txt",step="SNPs",type="txt")
    ex.add(outexons,description=description)
    msafile = unique_filename_in()
    # Alignment file: "<n_sequences> <alignment_length>" header, then name\tseq rows
    with open(msafile,"w") as msa:
        msa.write(" %i %i\n"%(len(msa_table),len(msa_table.values()[0])))
        for name,seq in msa_table.iteritems():
            msa.write("%s\t%s\n" %(name,seq))
    msa_table = {}
    description = set_file_descr("SNPalignment.txt",step="SNPs",type="txt")
    ex.add(msafile,description=description)
    # Create UCSC bed tracks
    logfile.write("\n* Create tracks\n"); logfile.flush()
    create_tracks(ex,outall,sample_names,assembly)
    # Create quantitative tracks
    logfile.write("\n* Create heteroz. and quality tracks\n"); logfile.flush()

    def _process_pileup(pileups, seq, startpos, endpos):
        """Scan pileup columns over [startpos, endpos) and return three
        (start, end, score) vectors: coverage, heterozygosity info, and
        mean base quality (Phred+33 decoded) per covered position."""
        atoi = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        vectors = ([],[],[])
        for pileupcolumn in pileups:
            position = pileupcolumn.pos
            if position < startpos: continue
            if position >= endpos: break
            coverage = pileupcolumn.n
            ref_symbol = seq[position-startpos]
            ref = atoi.get(ref_symbol, 4)  # 4 = anything that is not A/C/G/T
            symbols = [0,0,0,0,0]
            quality = 0
            for pileupread in pileupcolumn.pileups:
                symbols[atoi.get(pileupread.alignment.seq[pileupread.qpos], 4)] += 1
                quality += ord(pileupread.alignment.qual[pileupread.qpos])-33
            quality = float(quality)/coverage
            info = heterozygosity(ref, symbols[0:4])
            if coverage > 0: vectors[0].append((position, position+1, coverage))
            if info > 0: vectors[1].append((position, position+1, info))
            if quality > 0: vectors[2].append((position, position+1, quality))
#            yield (position, position+1, coverage, info, quality)
        return vectors

    if job.options.get('make_bigwigs',False):
        _descr = {'groupId':0,'step':"tracks",'type':"bigWig",'ucsc':'1'}
        for gid,bamfile in bams.iteritems():
            _descr['groupId'] = gid
            bamtr = track(bamfile,format="bam")
            covname = unique_filename_in()+".bw"
            out_cov = track(covname, chrmeta=assembly.chrmeta)
            hetname = unique_filename_in()+".bw"
            out_het = track(hetname, chrmeta=assembly.chrmeta)
            qualname = unique_filename_in()+".bw"
            out_qual = track(qualname, chrmeta=assembly.chrmeta)
            for chrom, cinfo in assembly.chrmeta.iteritems():
                fasta = Fastafile(ref_genome[chrom])
                #process fasta and bam by 10Mb chunks
                for chunk in range(0,cinfo["length"],10**7):
                    fastaseq = fasta.fetch(cinfo['ac'], chunk, chunk+10**7)
                    vecs = _process_pileup(bamtr.pileup(chrom, chunk, chunk+10**7), fastaseq, chunk, chunk+10**7)
                    out_cov.write(vecs[0], fields=['start','end','score'], chrom=chrom)
                    out_het.write(vecs[1], fields=['start','end','score'], chrom=chrom)
                    out_qual.write(vecs[2], fields=['start','end','score'], chrom=chrom)
            out_cov.close()
            out_het.close()
            out_qual.close()
            description = set_file_descr(job.groups[gid]['name']+"_coverage.bw",**_descr)
            ex.add(covname,description=description)
            description = set_file_descr(job.groups[gid]['name']+"_heterozygosity.bw",**_descr)
            ex.add(hetname,description=description)
            description = set_file_descr(job.groups[gid]['name']+"_quality.bw",**_descr)
            ex.add(qualname,description=description)

    return 0
示例#24
0
def bias_correction(chrom, start, end, bam, bias_table, genome_file_name,
                    forward_shift, reverse_shift):
    """Compute a k-mer-bias "expected" cut-site signal for a region.

    Cut sites are obtained from reads fetched over [start, end) padded by
    ``window/2`` bp on each side; forward reads are shifted by
    *forward_shift* (from ``read.pos``) and reverse reads by *reverse_shift*
    (from ``read.aend - 1``).  The expected count at each position is the
    50 bp running-sum of observed counts scaled by the relative k-mer bias
    looked up in *bias_table*.

    chrom, start, end -- region coordinates (0-based, half-open).
    bam -- open pysam alignment file.
    bias_table -- pair (forward k-mer dict, reverse k-mer dict); k-mer
        length is taken from the first key.
    genome_file_name -- path to the indexed genome fasta.
    forward_shift, reverse_shift -- cut-site shifts per strand.

    Returns the list of expected (bias-corrected) counts; falls back to the
    raw cut counts when the padded window would cross the chromosome start.
    """
    # Parameters
    window = 50
    defaultKmerValue = 1.0  # neutral bias for k-mers absent from the table

    # Initialization
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    k_nb = len(list(fBiasDict.keys())[0])
    p1 = start
    p2 = end
    hw = window // 2            # half the smoothing window
    k_lo = k_nb // 2            # floor(k/2)
    k_hi = (k_nb + 1) // 2      # ceil(k/2)
    p1_w = p1 - hw
    p2_w = p2 + hw
    p1_wk = p1_w - k_lo
    p2_wk = p2_w + k_hi
    if p1 <= 0 or p1_w <= 0 or p1_wk <= 0 or p2_wk <= 0:
        # Too close to the chromosome start to pad window/k-mer context:
        # return raw counts
        bc_signal = [0.0] * (p2 - p1)
        for read in bam.fetch(chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    bc_signal[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    bc_signal[cut_site - p1] += 1.0

        return bc_signal

    # Open the genome only after the early-return check above, so the
    # handle is always closed before returning (it previously leaked on
    # the early-return path, which never called fastaFile.close()).
    fastaFile = Fastafile(genome_file_name)

    # Raw counts over the padded window
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for read in bam.fetch(chrom, p1_w, p2_w):
        # check if the read is unmapped, according to issue #112
        if read.is_unmapped:
            continue

        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            if p1_w <= cut_site < p2_w:
                nf[cut_site - p1_w] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if p1_w <= cut_site < p2_w:
                nr[cut_site - p1_w] += 1.0

    # Smoothed counts: running sum of the surrounding `window` positions
    Nf = []
    Nr = []
    f_sum = sum(nf[:window])
    r_sum = sum(nr[:window])
    f_last = nf[0]
    r_last = nr[0]
    for i in range(hw, len(nf) - hw):
        Nf.append(f_sum)
        Nr.append(r_sum)
        f_sum -= f_last
        f_sum += nf[i + hw]
        f_last = nf[i - hw + 1]
        r_sum -= r_last
        r_sum += nr[i + hw]
        r_last = nr[i - hw + 1]

    # Fetching sequence (reverse strand fetched with a +1 offset and
    # reverse-complemented, so rseq covers the k-mer around a reverse cut)
    currStr = str(fastaFile.fetch(chrom, p1_wk, p2_wk - 1)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(
        str(fastaFile.fetch(chrom, p1_wk + 1, p2_wk)).upper())

    # Iterating on sequence to look up per-position k-mer bias
    af = []
    ar = []
    for i in range(k_hi, len(currStr) - k_lo + 1):
        fseq = currStr[i - k_lo:i + k_hi]
        rseq = currRevComp[len(currStr) - k_hi - i:len(currStr) + k_lo - i]
        # Unknown k-mers (e.g. containing N) get the neutral default value
        try:
            af.append(fBiasDict[fseq])
        except Exception:
            af.append(defaultKmerValue)
        try:
            ar.append(rBiasDict[rseq])
        except Exception:
            ar.append(defaultKmerValue)

    # Expected counts: smoothed observed count times local relative bias
    f_sum = sum(af[:window])
    r_sum = sum(ar[:window])
    f_last = af[0]
    r_last = ar[0]
    bc_signal = []
    for i in range(hw, len(af) - hw):
        nhatf = Nf[i - hw] * (af[i] / f_sum)
        nhatr = Nr[i - hw] * (ar[i] / r_sum)
        bc_signal.append(nhatf + nhatr)
        f_sum -= f_last
        f_sum += af[i + hw]
        f_last = af[i - hw + 1]
        r_sum -= r_last
        r_sum += ar[i + hw]
        r_last = ar[i - hw + 1]

    # Termination
    fastaFile.close()
    return bc_signal
示例#25
0
    def print_signal(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                     initial_clip=1000, per_norm=98, per_slope=98, bias_table=None, genome_file_name=None,
                     raw_signal_file=None, bc_signal_file=None, norm_signal_file=None, strand_specific=False):
        """Append wig 'fixedStep' tracks for the region [start, end) of *ref*.

        Depending on which output filenames are given, appends the raw
        cut-site signal (*raw_signal_file*), the bias-corrected signal
        (*bc_signal_file*) and/or its Boyle+hon normalized version
        (*norm_signal_file*); with *strand_specific*, additional
        "_Forward"/"_Reverse" wig files are written next to them.
        *bias_table* (pair of k-mer -> bias dicts) and *genome_file_name*
        are required whenever a bias-corrected output is requested.

        NOTE(review): *per_norm* and *per_slope* are accepted but never
        read in this body (the percentile 98 is hard-coded below) —
        confirm whether they should be wired in.
        NOTE(review): relies on Python-2 semantics (``fBiasDict.keys()[0]``
        indexing and integer ``window / 2``) — confirm before running
        under Python 3.
        """

        if raw_signal_file:
            # Accumulate the raw signal via the PileupRegion callback;
            # pysam 0.7.5 supported a callback= fetch, newer versions iterate
            pileup_region = PileupRegion(start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift)
            if ps_version == "0.7.5":
                self.bam.fetch(reference=ref, start=start, end=end, callback=pileup_region)
            else:
                iter = self.bam.fetch(reference=ref, start=start, end=end)
                for alignment in iter:
                    pileup_region.__call__(alignment)
            # Clip extreme per-position counts at initial_clip
            raw_signal = array([min(e, initial_clip) for e in pileup_region.vector])

            f = open(raw_signal_file, "a")
            f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                [str(e) for e in nan_to_num(raw_signal)]) + "\n")
            f.close()

        if bc_signal_file or norm_signal_file:
            # Parameters
            window = 50
            defaultKmerValue = 1.0  # neutral bias for k-mers missing from the table

            # Initialization
            fasta = Fastafile(genome_file_name)
            fBiasDict = bias_table[0]
            rBiasDict = bias_table[1]
            k_nb = len(fBiasDict.keys()[0])  # k-mer length from the first key
            p1 = start
            p2 = end
            # Pad the region by half a window, then by half a k-mer
            p1_w = p1 - (window / 2)
            p2_w = p2 + (window / 2)
            p1_wk = p1_w - int(k_nb / 2.)
            p2_wk = p2_w + int(k_nb / 2.)

            currStr = str(fasta.fetch(ref, p1_wk, p2_wk - 1)).upper()
            currRevComp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + 1, p2_wk)).upper())

            # Iterating on sequence to create the bias signal
            signal_bias_f = []
            signal_bias_r = []
            for i in range(int(k_nb / 2.), len(currStr) - int(k_nb / 2) + 1):
                fseq = currStr[i - int(k_nb / 2.):i + int(k_nb / 2.)]
                rseq = currRevComp[len(currStr) - int(k_nb / 2.) - i:len(currStr) + int(k_nb / 2.) - i]
                try:
                    signal_bias_f.append(fBiasDict[fseq])
                except Exception:
                    signal_bias_f.append(defaultKmerValue)
                try:
                    signal_bias_r.append(rBiasDict[rseq])
                except Exception:
                    signal_bias_r.append(defaultKmerValue)

            # Raw counts: shifted 5'/3' read ends landing in the padded window
            signal_raw_f = [0.0] * (p2_w - p1_w)
            signal_raw_r = [0.0] * (p2_w - p1_w)
            for read in self.bam.fetch(ref, p1_w, p2_w):
                if not read.is_reverse:
                    cut_site = read.pos + forward_shift
                    if p1_w <= cut_site < p2_w:
                        signal_raw_f[cut_site - p1_w] += 1.0
                else:
                    cut_site = read.aend + reverse_shift - 1
                    if p1_w <= cut_site < p2_w:
                        signal_raw_r[cut_site - p1_w] += 1.0

            # Smoothed counts: 50 bp running sums of the raw counts
            Nf = []
            Nr = []
            fSum = sum(signal_raw_f[:window])
            rSum = sum(signal_raw_r[:window])
            fLast = signal_raw_f[0]
            rLast = signal_raw_r[0]
            for i in range((window / 2), len(signal_raw_f) - (window / 2)):
                Nf.append(fSum)
                Nr.append(rSum)
                fSum -= fLast
                fSum += signal_raw_f[i + (window / 2)]
                fLast = signal_raw_f[i - (window / 2) + 1]
                rSum -= rLast
                rSum += signal_raw_r[i + (window / 2)]
                rLast = signal_raw_r[i - (window / 2) + 1]

            # Calculating bias and writing to wig file
            # (expected count = smoothed count * relative local k-mer bias)
            fSum = sum(signal_bias_f[:window])
            rSum = sum(signal_bias_r[:window])
            fLast = signal_bias_f[0]
            rLast = signal_bias_r[0]
            signal_bc = []
            signal_bc_f = []
            signal_bc_r = []
            for i in range((window / 2), len(signal_bias_f) - (window / 2)):
                nhatf = Nf[i - (window / 2)] * (signal_bias_f[i] / fSum)
                nhatr = Nr[i - (window / 2)] * (signal_bias_r[i] / rSum)
                signal_bc.append(nhatf + nhatr)
                signal_bc_f.append(nhatf)
                signal_bc_r.append(nhatr)
                fSum -= fLast
                fSum += signal_bias_f[i + (window / 2)]
                fLast = signal_bias_f[i - (window / 2) + 1]
                rSum -= rLast
                rSum += signal_bias_r[i + (window / 2)]
                rLast = signal_bias_r[i - (window / 2) + 1]

            if bc_signal_file:
                f = open(bc_signal_file, "a")
                f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                    [str(e) for e in nan_to_num(signal_bc)]) + "\n")
                f.close()

                if strand_specific:
                    prefix = bc_signal_file.split(".")[0]
                    bc_signal_file_f = prefix + "_Forward" + ".bc.wig"
                    bc_signal_file_r = prefix + "_Reverse" + ".bc.wig"
                    f = open(bc_signal_file_f, "a")
                    f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                        [str(e) for e in nan_to_num(signal_bc_f)]) + "\n")
                    f.close()
                    f = open(bc_signal_file_r, "a")
                    f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                        [str(e) for e in nan_to_num(signal_bc_r)]) + "\n")
                    f.close()

            if norm_signal_file:
                norm_signal_bc = self.boyle_norm(signal_bc)
                perc = scoreatpercentile(norm_signal_bc, 98)
                std = np.std(norm_signal_bc)
                norm_signal_bc = self.hon_norm_atac(norm_signal_bc, perc, std)
                f = open(norm_signal_file, "a")
                f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                    [str(e) for e in nan_to_num(norm_signal_bc)]) + "\n")
                f.close()

                if strand_specific:
                    # NOTE(review): prefix is derived from bc_signal_file even
                    # for the norm outputs — this raises if only
                    # norm_signal_file was given; confirm intended.
                    prefix = bc_signal_file.split(".")[0]
                    norm_signal_file_f = prefix + "_Forward" + ".norm.wig"
                    norm_signal_file_r = prefix + "_Reverse" + ".norm.wig"

                    signal_norm_f = self.boyle_norm(signal_bc_f)
                    perc = scoreatpercentile(signal_norm_f, 98)
                    std = np.std(signal_norm_f)
                    signal_norm_f = self.hon_norm_atac(signal_norm_f, perc, std)

                    signal_norm_r = self.boyle_norm(signal_bc_r)
                    perc = scoreatpercentile(signal_norm_r, 98)
                    std = np.std(signal_norm_r)
                    signal_norm_r = self.hon_norm_atac(signal_norm_r, perc, std)

                    f = open(norm_signal_file_f, "a")
                    f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                        [str(e) for e in nan_to_num(signal_norm_f)]) + "\n")
                    f.close()
                    f = open(norm_signal_file_r, "a")
                    f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                        [str(e) for e in nan_to_num(signal_norm_r)]) + "\n")
                    f.close()
示例#26
0
    def bias_correction(self, signal, bias_table, genome_file_name, chrName,
                        start, end, forward_shift, reverse_shift,
                        strands_specific):
        """
        Performs bias correction.

        Keyword arguments:
        signal -- Input signal; returned unchanged when no bias table is
            given or when the padded region would cross the chromosome start.
        bias_table -- Pair (forward k-mer bias dict, reverse k-mer bias dict).
        genome_file_name -- Path of the indexed genome fasta.
        chrName -- Chromosome name.
        start, end -- Region limits.
        forward_shift, reverse_shift -- Shifts applied to the 5'/3' read
            ends to obtain cut sites.
        strands_specific -- If True, return the per-strand signals shifted
            to be non-negative instead of the combined signal.

        Return:
        bias_corrected_signal -- Bias-corrected sequence
            (log(observed+1) - log(expected+1) per position), or the pair
            (forward, reverse) of non-negative-shifted signals when
            strands_specific is True.
        """

        if (not bias_table): return signal

        # Parameters
        window = 50
        defaultKmerValue = 1.0  # neutral bias for k-mers missing from the table

        # Initialization
        fastaFile = Fastafile(genome_file_name)
        fBiasDict = bias_table[0]
        rBiasDict = bias_table[1]
        k_nb = len(fBiasDict.keys()[0])  # k-mer length from the first key (Python 2)
        p1 = start
        p2 = end
        # Pad the region by half a smoothing window, then by half a k-mer
        p1_w = p1 - (window / 2)
        p2_w = p2 + (window / 2)
        p1_wk = p1_w - int(floor(k_nb / 2.))
        p2_wk = p2_w + int(ceil(k_nb / 2.))
        # NOTE(review): this early return leaves fastaFile open — confirm
        # whether the handle should be closed here as on the normal path.
        if (p1 <= 0 or p1_w <= 0 or p1_wk <= 0): return signal

        # Raw counts: shifted 5'/3' read ends landing in [start, end)
        nf = [0.0] * (p2_w - p1_w)
        nr = [0.0] * (p2_w - p1_w)
        for read in self.bam.fetch(chrName, p1_w, p2_w):
            if (not read.is_reverse):
                cut_site = read.pos + forward_shift
                if cut_site >= start and cut_site < end:
                    nf[cut_site - p1_w] += 1.0
                    # for i in range(max(read.pos + forward_shift, start), min(read.pos + forward_shift + 1, end - 1)):
                    #    nf[i - start] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if cut_site >= start and cut_site < end:
                    nr[cut_site - p1_w] += 1.0
                    # for i in range(max(read.aend + reverse_shift - 1, start), min(read.aend + reverse_shift, end - 1)):
                    #    nr[i - start] += 1.0

                    # if ((not read.is_reverse) and (read.pos > p1_w)): nf[read.pos - p1_w] += 1.0
                    # if ((read.is_reverse) and ((read.aend - 1) < p2_w)): nr[read.aend - 1 - p1_w] += 1.0

        # Smoothed counts: 50 bp running sums of the raw counts
        Nf = []
        Nr = []
        fSum = sum(nf[:window])
        rSum = sum(nr[:window])
        fLast = nf[0]
        rLast = nr[0]
        for i in range((window / 2), len(nf) - (window / 2)):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast
            fSum += nf[i + (window / 2)]
            fLast = nf[i - (window / 2) + 1]
            rSum -= rLast
            rSum += nr[i + (window / 2)]
            rLast = nr[i - (window / 2) + 1]

        # Fetching sequence
        currStr = str(fastaFile.fetch(chrName, p1_wk - 1, p2_wk - 2)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(
            str(fastaFile.fetch(chrName, p1_wk + 2, p2_wk + 1)).upper())
        #currStr = str(fastaFile.fetch(chrName, p1_wk, p2_wk - 1)).upper()
        #currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName, p1_wk + 1,
        #                                                            p2_wk)).upper())

        # Iterating on sequence to create signal (per-position k-mer bias)
        af = []
        ar = []
        for i in range(int(ceil(k_nb / 2.)),
                       len(currStr) - int(floor(k_nb / 2)) + 1):
            fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
            rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) -
                               i:len(currStr) + int(floor(k_nb / 2.)) - i]
            # Unknown k-mers (e.g. containing N) fall back to the default value
            try:
                af.append(fBiasDict[fseq])
            except Exception:
                af.append(defaultKmerValue)
            try:
                ar.append(rBiasDict[rseq])
            except Exception:
                ar.append(defaultKmerValue)

        # Calculating bias and writing to wig file
        # (expected count = smoothed count * relative local k-mer bias;
        #  corrected value = log(observed+1) - log(expected+1))
        fSum = sum(af[:window])
        rSum = sum(ar[:window])
        fLast = af[0]
        rLast = ar[0]
        bias_corrected_signal = []
        bias_corrected_signal_forward = []
        bias_corrected_signal_reverse = []
        for i in range((window / 2), len(af) - (window / 2)):
            nhatf = Nf[i - (window / 2)] * (af[i] / fSum)
            nhatr = Nr[i - (window / 2)] * (ar[i] / rSum)
            zf = log(nf[i] + 1) - log(nhatf + 1)
            zr = log(nr[i] + 1) - log(nhatr + 1)
            bias_corrected_signal_forward.append(zf)
            bias_corrected_signal_reverse.append(zr)
            bias_corrected_signal.append(zf + zr)
            fSum -= fLast
            fSum += af[i + (window / 2)]
            fLast = af[i - (window / 2) + 1]
            rSum -= rLast
            rSum += ar[i + (window / 2)]
            rLast = ar[i - (window / 2) + 1]

        # Fixing the negative number in bias corrected signal
        # (shift each signal up by the magnitude of its minimum)
        min_value = abs(min(bias_corrected_signal_forward))
        bias_fixed_signal_forward = [
            e + min_value for e in bias_corrected_signal_forward
        ]

        min_value = abs(min(bias_corrected_signal_reverse))
        bias_fixed_signal_reverse = [
            e + min_value for e in bias_corrected_signal_reverse
        ]

        # NOTE(review): bias_fixed_signal is computed but never returned —
        # the non-strand-specific path below returns the unshifted
        # bias_corrected_signal; confirm which was intended.
        min_value = abs(min(bias_corrected_signal))
        bias_fixed_signal = [e + min_value for e in bias_corrected_signal]

        # Termination
        fastaFile.close()
        if not strands_specific:
            return bias_corrected_signal
        else:
            return bias_fixed_signal_forward, bias_fixed_signal_reverse
示例#27
0
File: plot.py  Project: eggduzao/reg-gen
    def line(self):
        signal = GenomicSignal(self.bam_file)
        signal.load_sg_coefs(slope_window_size=9)
        bias_table = BiasTable()
        bias_table_list = self.bias_table.split(",")
        table = bias_table.load_table(table_file_name_F=bias_table_list[0],
                                      table_file_name_R=bias_table_list[1])
        genome_data = GenomeData(self.organism)
        fasta = Fastafile(genome_data.get_genome())
        pwm_dict = dict([("A", [0.0] * self.window_size), ("C", [0.0] * self.window_size),
                        ("G", [0.0] * self.window_size), ("T", [0.0] * self.window_size),
                        ("N", [0.0] * self.window_size)])


        mean_raw_signal = np.zeros(self.window_size)
        mean_bc_signal = np.zeros(self.window_size)
        mean_raw_signal_f = np.zeros(self.window_size)
        mean_bc_signal_f = np.zeros(self.window_size)
        mean_raw_signal_r = np.zeros(self.window_size)
        mean_bc_signal_r = np.zeros(self.window_size)

        mean_bias_signal_f = np.zeros(self.window_size)
        mean_bias_signal_r = np.zeros(self.window_size)
        num_sites = 0

        mpbs_regions = GenomicRegionSet("Motif Predicted Binding Sites")
        mpbs_regions.read_bed(self.motif_file)

        total_nc_signal = 0
        total_nl_signal = 0
        total_nr_signal = 0

        for region in mpbs_regions:
            if str(region.name).split(":")[-1] == "Y":
                num_sites += 1
                # Extend by 50 bp
                mid = (region.initial + region.final) / 2
                p1 = mid - (self.window_size / 2)
                p2 = mid + (self.window_size / 2)

                if not self.strands_specific:
                    # Fetch raw signal
                    raw_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                  downstream_ext=self.atac_downstream_ext,
                                                  upstream_ext=self.atac_upstream_ext,
                                                  forward_shift=self.atac_forward_shift,
                                                  reverse_shift=self.atac_reverse_shift,
                                                  genome_file_name=genome_data.get_genome())

                    mean_raw_signal = np.add(mean_raw_signal, raw_signal)

                    # Fetch bias correction signal
                    bc_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                     bias_table=table,
                                                     downstream_ext=self.atac_downstream_ext,
                                                     upstream_ext=self.atac_upstream_ext,
                                                     forward_shift=self.atac_forward_shift,
                                                     reverse_shift=self.atac_reverse_shift,
                                                     genome_file_name=genome_data.get_genome())

                    mean_bc_signal = np.add(mean_bc_signal, bc_signal)
                else:
                    raw_signal_f, _, raw_signal_r, _ =  signal.get_signal_per_strand(ref=region.chrom, start=p1, end=p2,
                                                                        downstream_ext=self.atac_downstream_ext,
                                                                        upstream_ext=self.atac_upstream_ext,
                                                                        forward_shift=self.atac_forward_shift,
                                                                        reverse_shift=self.atac_reverse_shift,
                                                                        genome_file_name=genome_data.get_genome())
                    mean_raw_signal_f = np.add(mean_raw_signal_f, raw_signal_f)
                    mean_raw_signal_r = np.add(mean_raw_signal_r, raw_signal_r)

                    bc_signal_f, _, bc_signal_r, _ = signal.get_signal_per_strand(ref=region.chrom, start=p1, end=p2,
                                                                                  bias_table=table,
                                                                                  downstream_ext=self.atac_downstream_ext,
                                                                                  upstream_ext=self.atac_upstream_ext,
                                                                                  forward_shift=self.atac_forward_shift,
                                                                                  reverse_shift=self.atac_reverse_shift,
                                                                                  genome_file_name=genome_data.get_genome())
                    mean_bc_signal_f = np.add(mean_bc_signal_f, bc_signal_f)
                    mean_bc_signal_r = np.add(mean_bc_signal_r, bc_signal_r)

                # Update pwm
                aux_plus = 1
                dna_seq = str(fasta.fetch(region.chrom, p1, p2)).upper()
                if (region.final - region.initial) % 2 == 0:
                    aux_plus = 0
                dna_seq_rev = AuxiliaryFunctions.revcomp(str(fasta.fetch(region.chrom,
                                                                         p1 + aux_plus, p2 + aux_plus)).upper())
                if region.orientation == "+":
                    for i in range(0, len(dna_seq)):
                        pwm_dict[dna_seq[i]][i] += 1
                elif region.orientation == "-":
                    for i in range(0, len(dna_seq_rev)):
                        pwm_dict[dna_seq_rev[i]][i] += 1

                # Create bias signal
                bias_table_f = table[0]
                bias_table_r = table[1]
                self.k_nb = len(bias_table_f.keys()[0])
                bias_signal_f = []
                bias_signal_r = []
                p1_wk = p1 - int(self.k_nb / 2)
                p2_wk = p2 + int(self.k_nb / 2)
                dna_seq = str(fasta.fetch(region.chrom, p1_wk, p2_wk - 1)).upper()
                dna_seq_rev = AuxiliaryFunctions.revcomp(str(fasta.fetch(region.chrom, p1_wk, p2_wk + 1)).upper())
                for i in range(int(self.k_nb / 2), len(dna_seq) - int(self.k_nb / 2) + 1):
                    fseq = dna_seq[i - int(self.k_nb / 2):i + int(self.k_nb / 2)]
                    rseq = dna_seq_rev[len(dna_seq) - int(self.k_nb / 2) - i:len(dna_seq) + int(self.k_nb / 2) - i]
                    try:
                        bias_signal_f.append(bias_table_f[fseq])
                    except Exception:
                        bias_signal_f.append(1)
                    try:
                        bias_signal_r.append(bias_table_r[rseq])
                    except Exception:
                        bias_signal_r.append(1)

                mean_bias_signal_f = np.add(mean_bias_signal_f, np.array(bias_signal_f))
                mean_bias_signal_r = np.add(mean_bias_signal_r, np.array(bias_signal_r))

                if self.protection_score:
                    # signal in the center of the MPBS
                    p1 = region.initial
                    p2 = region.final
                    nc_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                     bias_table=table,
                                                     downstream_ext=self.atac_downstream_ext,
                                                     upstream_ext=self.atac_upstream_ext,
                                                     forward_shift=self.atac_forward_shift,
                                                     reverse_shift=self.atac_reverse_shift,
                                                     genome_file_name=genome_data.get_genome())
                    total_nc_signal += sum(nc_signal)
                    p1 = region.final
                    p2 = 2 * region.final - region.initial
                    nr_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                     bias_table=table,
                                                     downstream_ext=self.atac_downstream_ext,
                                                     upstream_ext=self.atac_upstream_ext,
                                                     forward_shift=self.atac_forward_shift,
                                                     reverse_shift=self.atac_reverse_shift,
                                                     genome_file_name=genome_data.get_genome())
                    total_nr_signal += sum(nr_signal)
                    p1 = 2 * region.initial - region.final
                    p2 = region.final
                    nl_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                     bias_table=table,
                                                     downstream_ext=self.atac_downstream_ext,
                                                     upstream_ext=self.atac_upstream_ext,
                                                     forward_shift=self.atac_forward_shift,
                                                     reverse_shift=self.atac_reverse_shift,
                                                     genome_file_name=genome_data.get_genome())
                    total_nl_signal += sum(nl_signal)


        mean_raw_signal = mean_raw_signal / num_sites
        mean_bc_signal = mean_bc_signal / num_sites

        mean_raw_signal_f = mean_raw_signal_f / num_sites
        mean_raw_signal_r = mean_raw_signal_r / num_sites
        mean_bc_signal_f = mean_bc_signal_f / num_sites
        mean_bc_signal_r = mean_bc_signal_r / num_sites

        mean_bias_signal_f = mean_bias_signal_f / num_sites
        mean_bias_signal_r = mean_bias_signal_r / num_sites

        protection_score = (total_nl_signal + total_nr_signal - 2 * total_nc_signal) / (2 * num_sites)

        # Output PWM and create logo
        pwm_fname = os.path.join(self.output_loc, "{}.pwm".format(self.motif_name))
        pwm_file = open(pwm_fname,"w")
        for e in ["A","C","G","T"]:
            pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[e]])+"\n")
        pwm_file.close()

        logo_fname = os.path.join(self.output_loc, "{}.logo.eps".format(self.motif_name))
        pwm = motifs.read(open(pwm_fname), "pfm")
        pwm.weblogo(logo_fname, format="eps", stack_width="large", stacks_per_line="100",
                    color_scheme="color_classic", unit_name="", show_errorbars=False, logo_title="",
                    show_xaxis=False, xaxis_label="", show_yaxis=False, yaxis_label="",
                    show_fineprint=False, show_ends=False)

        # Output the raw, bias corrected signal and protection score
        output_fname = os.path.join(self.output_loc, "{}.txt".format(self.motif_name))
        output_file = open(output_fname, "w")
        if not self.strands_specific:
            output_file.write("raw signal: \n" + np.array_str(mean_raw_signal) + "\n")
            output_file.write("bias corrected signal: \n" + np.array_str(mean_bc_signal) + "\n")
        else:
            output_file.write("raw forward signal: \n" + np.array_str(mean_raw_signal_f) + "\n")
            output_file.write("bias corrected forward signal: \n" + np.array_str(mean_bc_signal_f) + "\n")
            output_file.write("raw reverse signal: \n" + np.array_str(mean_raw_signal_r) + "\n")
            output_file.write("bias reverse corrected signal: \n" + np.array_str(mean_bc_signal_r) + "\n")
        output_file.write("forward bias signal: \n" + np.array_str(mean_bias_signal_f) + "\n")
        output_file.write("reverse bias signal: \n" + np.array_str(mean_bias_signal_r) + "\n")
        if self.protection_score:
            output_file.write("protection score: \n" + str(protection_score) + "\n")
        output_file.close()

        if self.strands_specific:
            fig, (ax1, ax2, ax3) = plt.subplots(3, figsize=(12.0, 10.0))
        else:
            fig, (ax1, ax2) = plt.subplots(2)
        x = np.linspace(-50, 49, num=self.window_size)

        ax1.plot(x, mean_bias_signal_f, color='red', label='Forward')
        ax1.plot(x, mean_bias_signal_r, color='blue', label='Reverse')

        ax1.xaxis.set_ticks_position('bottom')
        ax1.yaxis.set_ticks_position('left')
        ax1.spines['top'].set_visible(False)
        ax1.spines['right'].set_visible(False)
        ax1.spines['left'].set_position(('outward', 15))
        ax1.spines['bottom'].set_position(('outward', 5))
        ax1.tick_params(direction='out')

        ax1.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
        ax1.set_xticklabels(['-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40', '49'])
        min_bias_signal = min(min(mean_bias_signal_f), min(mean_bias_signal_r))
        max_bias_signal = max(max(mean_bias_signal_f), max(mean_bias_signal_r))
        ax1.set_yticks([min_bias_signal, max_bias_signal])
        ax1.set_yticklabels([str(round(min_bias_signal,2)), str(round(max_bias_signal,2))], rotation=90)

        ax1.text(-48, max_bias_signal, '# Sites = {}'.format(str(num_sites)), fontweight='bold')
        ax1.set_title(self.motif_name, fontweight='bold')
        ax1.set_xlim(-50, 49)
        ax1.set_ylim([min_bias_signal, max_bias_signal])
        ax1.legend(loc="upper right", frameon=False)
        ax1.set_ylabel("Average Bias \nSignal", rotation=90, fontweight='bold')

        if not self.strands_specific:
            mean_raw_signal = self.standardize(mean_raw_signal)
            mean_bc_signal = self.standardize(mean_bc_signal)
            ax2.plot(x, mean_raw_signal, color='red', label='Uncorrected')
            ax2.plot(x, mean_bc_signal, color='green', label='Corrected')
        else:
            mean_raw_signal_f = self.standardize(mean_raw_signal_f)
            mean_raw_signal_r = self.standardize(mean_raw_signal_r)
            mean_bc_signal_f = self.standardize(mean_bc_signal_f)
            mean_bc_signal_r = self.standardize(mean_bc_signal_r)
            ax2.plot(x, mean_raw_signal_f, color='red', label='Forward')
            ax2.plot(x, mean_raw_signal_r, color='green', label='Reverse')
            ax3.plot(x, mean_bc_signal_f, color='red', label='Forward')
            ax3.plot(x, mean_bc_signal_r, color='green', label='Reverse')

        ax2.xaxis.set_ticks_position('bottom')
        ax2.yaxis.set_ticks_position('left')
        ax2.spines['top'].set_visible(False)
        ax2.spines['right'].set_visible(False)
        ax2.spines['left'].set_position(('outward', 15))
        ax2.tick_params(direction='out')
        ax2.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
        ax2.set_xticklabels(['-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40', '49'])
        ax2.set_yticks([0, 1])
        ax2.set_yticklabels([str(0), str(1)], rotation=90)
        ax2.set_xlim(-50, 49)
        ax2.set_ylim([0, 1])

        if not self.strands_specific:
            ax2.spines['bottom'].set_position(('outward', 40))
            ax2.set_xlabel("Coordinates from Motif Center", fontweight='bold')
            ax2.set_ylabel("Average ATAC-seq \nSignal", rotation=90, fontweight='bold')
            ax2.legend(loc="center", frameon=False, bbox_to_anchor=(0.85, 0.06))
        else:
            ax2.spines['bottom'].set_position(('outward', 5))
            ax2.set_ylabel("Average ATAC-seq \n Uncorrected Signal", rotation=90, fontweight='bold')
            ax2.legend(loc="lower right", frameon=False)

            ax3.xaxis.set_ticks_position('bottom')
            ax3.yaxis.set_ticks_position('left')
            ax3.spines['top'].set_visible(False)
            ax3.spines['right'].set_visible(False)
            ax3.spines['left'].set_position(('outward', 15))
            ax3.tick_params(direction='out')
            ax3.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
            ax3.set_xticklabels(['-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40', '49'])
            ax3.set_yticks([0, 1])
            ax3.set_yticklabels([str(0), str(1)], rotation=90)
            ax3.set_xlim(-50, 49)
            ax3.set_ylim([0, 1])
            ax3.legend(loc="lower right", frameon=False)
            ax3.spines['bottom'].set_position(('outward', 40))
            ax3.set_xlabel("Coordinates from Motif Center", fontweight='bold')
            ax3.set_ylabel("Average ATAC-seq \n Corrected Signal", rotation=90, fontweight='bold')
            ax3.text(-48, 0.05, '# K-mer = {}\n# Forward Shift = {}'.format(str(self.k_nb), str(self.atac_forward_shift)),
                     fontweight='bold')

        figure_name = os.path.join(self.output_loc, "{}.line.eps".format(self.motif_name))
        fig.subplots_adjust(bottom=.2, hspace=.5)
        fig.tight_layout()
        fig.savefig(figure_name, format="eps", dpi=300)

        # Creating canvas and printing eps / pdf with merged results
        output_fname = os.path.join(self.output_loc, "{}.eps".format(self.motif_name))
        c = pyx.canvas.canvas()
        c.insert(pyx.epsfile.epsfile(0, 0, figure_name, scale=1.0))
        if self.strands_specific:
            c.insert(pyx.epsfile.epsfile(2.76, 1.58, logo_fname, width=27.2, height=2.45))
        else:
            c.insert(pyx.epsfile.epsfile(2.5, 1.54, logo_fname, width=16, height=1.75))
        c.writeEPSfile(output_fname)
        os.system("epstopdf " + figure_name)
        os.system("epstopdf " + logo_fname)
        os.system("epstopdf " + output_fname)
# Example 28
# 0
def main(args):
    """
    Performs motif matching.

    Loads genomic regions (from an experimental matrix, explicit input
    files, or promoter regions derived from gene lists), optionally adds
    random background regions, builds the motif set requested by the user,
    and scans every region for motif-predicted binding sites (MPBS),
    writing one BED (optionally BigBed) file per region set.

    Keyword arguments:
    args -- Parsed command-line arguments (argparse-style namespace).
    """

    ###################################################################################################
    # Processing Input Arguments
    ###################################################################################################

    # Initializing Error Handler
    err = ErrorHandler()

    # Additional Parameters
    matching_folder_name = "match"
    random_region_name = "random_regions"

    filter_values = parse_filter(args.filter)

    ###################################################################################################
    # Initializations
    ###################################################################################################

    # Output folder
    if args.output_location:
        output_location = args.output_location
    else:
        output_location = npath(matching_folder_name)
    print(">> output location:", output_location)

    # Default genomic data
    genome_data = GenomeData(args.organism)

    print(">> genome:", genome_data.organism)
    print(">> pseudocounts:", args.pseudocounts)
    print(">> fpr threshold:", args.fpr)

    ###################################################################################################
    # Reading Input Regions
    ###################################################################################################

    genomic_regions_dict = {}

    # get experimental matrix, if available
    if args.input_matrix:
        try:
            exp_matrix = ExperimentalMatrix()
            exp_matrix.read(args.input_matrix)

            # if the matrix is present, the (empty) dictionary is overwritten
            genomic_regions_dict = exp_matrix.objectsDict

            print(">>> experimental matrix loaded")

        except Exception:
            err.throw_error("MM_WRONG_EXPMAT")
    elif args.input_files:
        # get input files, if available
        for input_filename in args.input_files:
            # region-set name is the file's base name without its extension
            name, _ = os.path.splitext(os.path.basename(input_filename))

            regions = GenomicRegionSet(name)
            regions.read(npath(input_filename))

            genomic_regions_dict[name] = regions

            print(">>> input file", name, "loaded:", len(regions), "regions")

    # we put this here because we don't want to create the output directory unless we
    # are sure the initialisation (including loading input files) worked
    try:
        if not os.path.isdir(output_location):
            os.makedirs(output_location)
    except Exception:
        err.throw_error("MM_OUT_FOLDER_CREATION")

    annotation = None
    target_genes = None
    # get promoter regions from list of genes (both target and background)
    # TODO: should be more clever, allow precomputed regions etc
    if args.target_genes_filename:
        annotation = AnnotationSet(args.organism,
                                   alias_source=args.organism,
                                   protein_coding=True,
                                   known_only=True)

        target_genes = GeneSet("target_genes")
        target_genes.read(args.target_genes_filename)

        # TODO: what do we do with unmapped genes? maybe just print them out
        target_regions = annotation.get_promoters(
            gene_set=target_genes, promoter_length=args.promoter_length)
        target_regions.name = "target_regions"
        target_regions.sort()
        output_file_name = npath(
            os.path.join(output_location, target_regions.name + ".bed"))
        target_regions.write(output_file_name)

        genomic_regions_dict[target_regions.name] = target_regions

        print(">>> target promoter file created:", len(target_regions),
              "regions")

    # we make a background in case it's requested, but also in case a list of target genes has not been
    # provided
    if args.promoter_make_background or (args.promoters_only
                                         and not args.target_genes_filename):
        if not annotation:
            annotation = AnnotationSet(args.organism,
                                       alias_source=args.organism,
                                       protein_coding=True,
                                       known_only=True)

        # background is made of all known genes minus the target genes (if any)
        background_genes = GeneSet("background_genes")
        background_genes.get_all_genes(organism=args.organism)

        if target_genes:
            background_genes.subtract(target_genes)

        background_regions = annotation.get_promoters(
            gene_set=background_genes, promoter_length=args.promoter_length)
        background_regions.name = "background_regions"
        background_regions.sort()
        output_file_name = npath(
            os.path.join(output_location, background_regions.name + ".bed"))
        background_regions.write(output_file_name)

        genomic_regions_dict[background_regions.name] = background_regions

        print(">>> background promoter file created:", len(background_regions),
              "regions")

    if not genomic_regions_dict:
        err.throw_error(
            "DEFAULT_ERROR",
            add_msg=
            "You must either specify an experimental matrix, or at least a "
            "valid input file, or one of the 'promoter test' options.")

    max_region_len = 0
    max_region = None
    regions_to_match = []

    # Iterating on experimental matrix objects
    for k in genomic_regions_dict.keys():

        curr_genomic_region = genomic_regions_dict[k]

        # If the object is a GenomicRegionSet
        if isinstance(curr_genomic_region, GenomicRegionSet):

            if args.rmdup:
                # remove duplicates and sort regions
                curr_genomic_region.remove_duplicates(sort=True)
            else:
                # sort regions
                curr_genomic_region.sort()

            # Append label and GenomicRegionSet
            regions_to_match.append(curr_genomic_region)

            # Verifying max_region_len for random region generation
            # (the largest set becomes the template for random regions below)
            curr_len = len(curr_genomic_region)
            if curr_len > max_region_len:
                max_region_len = curr_len
                max_region = curr_genomic_region

    print(">> all files loaded")

    ###################################################################################################
    # Creating random regions
    ###################################################################################################

    # if a random proportion is set, create random regions
    if args.rand_proportion:

        # Create random coordinates and name it random_regions
        rand_region = max_region.random_regions(
            args.organism, multiply_factor=args.rand_proportion, chrom_X=True)
        rand_region.sort()
        rand_region.name = random_region_name

        # Add random regions to the list of regions to perform matching on
        regions_to_match.append(rand_region)

        # Writing random regions
        output_file_name = npath(
            os.path.join(output_location, random_region_name))
        rand_bed_file_name = output_file_name + ".bed"
        rand_region.write(rand_bed_file_name)

        # Verifying condition to write bb
        if args.bigbed:

            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            try:
                # Converting to big bed
                bed_to_bb(rand_bed_file_name, chrom_sizes_file)

                # removing previously-created BED file
                os.remove(rand_bed_file_name)
            except Exception:
                err.throw_warning(
                    "DEFAULT_WARNING")  # FIXME: maybe error instead?

        print(">> random regions file created:", len(rand_region), "regions")

    ###################################################################################################
    # Creating PWMs
    ###################################################################################################

    if args.motif_dbs:
        ms = MotifSet(preload_motifs=args.motif_dbs, motif_dbs=True)
        # filter for dbs only if --motif_dbs is not set
        if 'database' in filter_values:
            del filter_values['database']
    else:
        if 'database' in filter_values:
            ms = MotifSet(preload_motifs=filter_values['database'])
        else:
            ms = MotifSet(preload_motifs="default")

    print(">> used database(s):",
          ",".join([str(db) for db in ms.motif_data.repositories_list]))

    # applying filtering pattern, taking a subset of the motif set
    if args.filter:
        ms = ms.filter(filter_values, search=args.filter_type)

    motif_list = ms.get_motif_list(args.pseudocounts, args.fpr)

    print(">> motifs loaded:", len(motif_list))

    # Performing normalized threshold strategy if requested
    if args.norm_threshold:
        threshold_list = [motif.threshold / motif.len for motif in motif_list]
        unique_threshold = sum(threshold_list) / len(threshold_list)
    else:
        unique_threshold = None

    scanner = scan.Scanner(7)
    pssm_list = []
    thresholds = []
    # Each motif contributes two PSSMs (forward and reverse-complement),
    # so the per-motif threshold is appended twice to stay aligned.
    for motif in motif_list:
        if unique_threshold:
            thresholds.append(0.0)
            thresholds.append(0.0)
        else:
            thresholds.append(motif.threshold)
            thresholds.append(motif.threshold)

        pssm_list.append(motif.pssm)
        pssm_list.append(motif.pssm_rc)

    # Performing motif matching
    # TODO: we can expand this to use bg from sequence, for example,
    # or from organism.
    bg = tools.flat_bg(4)
    scanner.set_motifs(pssm_list, bg, thresholds)

    ###################################################################################################
    # Motif Matching
    ###################################################################################################

    # Creating genome file
    genome_file = Fastafile(genome_data.get_genome())

    print()

    # Iterating on list of genomic region sets
    for grs in regions_to_match:

        start = time.time()
        print(">> matching [",
              grs.name,
              "], ",
              len(grs),
              " regions... ",
              sep="",
              end='')
        sys.stdout.flush()

        # Initializing output bed file
        output_bed_file = os.path.join(output_location, grs.name + "_mpbs.bed")

        # must remove it because we append the MPBS
        if os.path.isfile(output_bed_file):
            os.remove(output_bed_file)

        # Iterating on genomic region set
        for genomic_region in grs:

            # Reading sequence associated to genomic_region
            sequence = str(
                genome_file.fetch(genomic_region.chrom, genomic_region.initial,
                                  genomic_region.final))

            grs_tmp = match_multiple(scanner, motif_list, sequence,
                                     genomic_region)

            # post-processing: if required, remove duplicate regions on opposing strands (keep highest score)
            if len(grs_tmp) > 1 and args.remove_strand_duplicates:
                grs_tmp.sort()
                seqs = grs_tmp.sequences
                seqs_new = []
                cur_pos = 0
                end_pos = len(seqs) - 1
                while cur_pos < end_pos:
                    gr = seqs[cur_pos]

                    new_pos = cur_pos + 1
                    # NOTE(review): this inner scan stops at end_pos - 1, so
                    # the very last element is never examined as a duplicate
                    # candidate (it is only appended by the edge case below)
                    # — confirm this is intended.
                    while new_pos < end_pos:
                        gr2 = seqs[new_pos]

                        # if this sequence is unrelated, we move on
                        if gr.name != gr2.name or gr.chrom != gr2.chrom or gr.initial != gr2.initial or gr.final != gr2.final or gr.orientation == gr2.orientation:
                            break

                        # keep the duplicate with the higher score (stored in .data)
                        if float(gr.data) < float(gr2.data):
                            gr = gr2

                        new_pos = new_pos + 1

                    # adding the currently-selected genomic region
                    seqs_new.append(gr)

                    # at the next loop, we start from the next right-handed sequences
                    cur_pos = new_pos

                # edge case: the last element was not considered
                # (when it is, cur_pos == end_pos+1)
                if cur_pos == end_pos:
                    seqs_new.append(seqs[cur_pos])

                grs_tmp.sequences = seqs_new

            grs_tmp.write(output_bed_file, mode="a")

        # drop the region list in place to keep memory bounded across sets
        del grs.sequences[:]

        # Verifying condition to write bb
        if args.bigbed and args.normalize_bitscore:
            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            # Converting to big bed
            bed_to_bb(output_bed_file, chrom_sizes_file)

            # removing BED file
            os.remove(output_bed_file)

        secs = time.time() - start
        print("[", "%02.3f" % secs, " seconds]", sep="")
# Example 29
# 0
    def bias_correction_atac(self, bias_table, genome_file_name, chrName, start, end,
                             forward_shift, reverse_shift):
        """
        Corrects cleavage bias in ATAC-seq cut counts over [start, end).

        Smooths the raw 5' cut counts with a sliding window, estimates the
        expected counts from the k-mer bias tables, and returns the
        bias-corrected forward and reverse signals.

        Keyword arguments:
        bias_table -- Pair (forward_dict, reverse_dict) mapping k-mers to
                      bias values.
        genome_file_name -- Path of the genome FASTA file.
        chrName -- Chromosome name.
        start, end -- Region coordinates.
        forward_shift, reverse_shift -- Cut-site shifts per strand.

        Return:
        (forward_signal, reverse_signal) -- Lists of corrected counts; the
        raw counts are returned instead when the extended window would
        reach past the chromosome start.
        """

        # Parameters
        window = 50
        defaultKmerValue = 1.0

        # Initialization
        fastaFile = Fastafile(genome_file_name)
        fBiasDict = bias_table[0]
        rBiasDict = bias_table[1]
        # next(iter(...)) grabs an arbitrary key on both Python 2 and 3;
        # dict.keys()[0] raises TypeError on Python 3 (views are not indexable).
        k_nb = len(next(iter(fBiasDict)))
        p1 = start
        p2 = end
        # Floor division keeps the coordinates integral under Python 3,
        # where `/` yields floats (invalid as list indices).
        p1_w = p1 - (window // 2)
        p2_w = p2 + (window // 2)
        p1_wk = p1_w - int(floor(k_nb / 2.))
        p2_wk = p2_w + int(ceil(k_nb / 2.))

        if p1 <= 0 or p1_w <= 0 or p2_wk <= 0:
            # Window extends beyond the chromosome start: fall back to
            # returning the raw (uncorrected) cut counts for [p1, p2).
            nf = [0.0] * (p2 - p1)
            nr = [0.0] * (p2 - p1)
            for read in self.bam.fetch(chrName, p1, p2):
                if not read.is_reverse:
                    cut_site = read.pos + forward_shift
                    if p1 <= cut_site < p2:
                        nf[cut_site - p1] += 1.0
                else:
                    cut_site = read.aend + reverse_shift - 1
                    if p1 <= cut_site < p2:
                        nr[cut_site - p1] += 1.0

            return nf, nr

        # Raw counts over the widened window
        nf = [0.0] * (p2_w - p1_w)
        nr = [0.0] * (p2_w - p1_w)
        for read in self.bam.fetch(chrName, p1_w, p2_w):
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1_w <= cut_site < p2_w:
                    nf[cut_site - p1_w] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1_w <= cut_site < p2_w:
                    nr[cut_site - p1_w] += 1.0

        # Smoothed counts: running sums over a `window`-wide sliding window
        Nf = []
        Nr = []
        fSum = sum(nf[:window])
        rSum = sum(nr[:window])
        fLast = nf[0]
        rLast = nr[0]
        for i in range(window // 2, len(nf) - (window // 2)):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast
            fSum += nf[i + (window // 2)]
            fLast = nf[i - (window // 2) + 1]
            rSum -= rLast
            rSum += nr[i + (window // 2)]
            rLast = nr[i - (window // 2) + 1]

        # Fetching sequence (forward strand and reverse complement)
        currStr = str(fastaFile.fetch(chrName, p1_wk, p2_wk - 1)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName, p1_wk + 1,
                                                                     p2_wk)).upper())

        # Iterating on sequence to create the bias signal
        af = []
        ar = []
        for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2.)) + 1):
            fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
            rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(floor(k_nb / 2.)) - i]
            # Unknown k-mers (e.g. those containing N) get a neutral bias.
            try:
                af.append(fBiasDict[fseq])
            except KeyError:
                af.append(defaultKmerValue)
            try:
                ar.append(rBiasDict[rseq])
            except KeyError:
                ar.append(defaultKmerValue)

        # Correcting: expected counts are the smoothed counts scaled by the
        # k-mer bias relative to the local sliding-window bias sum.
        fSum = sum(af[:window])
        rSum = sum(ar[:window])
        fLast = af[0]
        rLast = ar[0]
        bias_corrected_signal_forward = []
        bias_corrected_signal_reverse = []
        for i in range(window // 2, len(af) - (window // 2)):
            nhatf = Nf[i - (window // 2)] * (af[i] / fSum)
            nhatr = Nr[i - (window // 2)] * (ar[i] / rSum)
            bias_corrected_signal_forward.append(nhatf)
            bias_corrected_signal_reverse.append(nhatr)
            fSum -= fLast
            fSum += af[i + (window // 2)]
            fLast = af[i - (window // 2) + 1]
            rSum -= rLast
            rSum += ar[i + (window // 2)]
            rLast = ar[i - (window // 2) + 1]

        # Termination
        fastaFile.close()
        return bias_corrected_signal_forward, bias_corrected_signal_reverse
# Example 30
# 0
    def bias_correction(self, signal, bias_table, genome_file_name, chrName, start, end,
                        forward_shift, reverse_shift, strands_specific):
        """
        Performs bias correction.

        Keyword arguments:
        signal -- Input signal (returned unchanged when correction is not
                  possible).
        bias_table -- Pair (forward_dict, reverse_dict) mapping k-mers to
                      bias values; falsy values disable correction.
        genome_file_name -- Path of the genome FASTA file.
        chrName -- Chromosome name.
        start, end -- Region coordinates.
        forward_shift, reverse_shift -- Cut-site shifts per strand.
        strands_specific -- When True, return the per-strand signals
                            (shifted to be non-negative); otherwise a single
                            combined log-ratio signal.

        Return:
        bias_corrected_signal -- Bias-corrected sequence, or the pair
        (forward, reverse) when strands_specific is True.
        """

        if not bias_table:
            return signal

        # Parameters
        window = 50
        defaultKmerValue = 1.0

        # Initialization
        fastaFile = Fastafile(genome_file_name)
        fBiasDict = bias_table[0]
        rBiasDict = bias_table[1]
        # next(iter(...)) works on both Python 2 and 3; dict.keys()[0]
        # raises TypeError on Python 3 (views are not indexable).
        k_nb = len(next(iter(fBiasDict)))
        p1 = start
        p2 = end
        # Floor division keeps coordinates integral under Python 3.
        p1_w = p1 - (window // 2)
        p2_w = p2 + (window // 2)
        p1_wk = p1_w - int(floor(k_nb / 2.))
        p2_wk = p2_w + int(ceil(k_nb / 2.))
        if p1 <= 0 or p1_w <= 0 or p1_wk <= 0:
            # Window extends past the chromosome start: skip correction.
            return signal

        # Raw counts over the widened window; only cut sites inside the
        # original [start, end) interval are tallied.
        nf = [0.0] * (p2_w - p1_w)
        nr = [0.0] * (p2_w - p1_w)
        for read in self.bam.fetch(chrName, p1_w, p2_w):
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if start <= cut_site < end:
                    nf[cut_site - p1_w] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if start <= cut_site < end:
                    nr[cut_site - p1_w] += 1.0

        # Smoothed counts: running sums over a `window`-wide sliding window
        Nf = []
        Nr = []
        fSum = sum(nf[:window])
        rSum = sum(nr[:window])
        fLast = nf[0]
        rLast = nr[0]
        for i in range(window // 2, len(nf) - (window // 2)):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast
            fSum += nf[i + (window // 2)]
            fLast = nf[i - (window // 2) + 1]
            rSum -= rLast
            rSum += nr[i + (window // 2)]
            rLast = nr[i - (window // 2) + 1]

        # Fetching sequence
        # NOTE(review): these fetch offsets (p1_wk-1/p2_wk-2 and
        # p1_wk+2/p2_wk+1) differ from the symmetric variant used by
        # bias_correction_atac (p1_wk/p2_wk-1) — confirm they are intended.
        currStr = str(fastaFile.fetch(chrName, p1_wk - 1, p2_wk - 2)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName, p1_wk + 2, p2_wk + 1)).upper())

        # Iterating on sequence to create the bias signal
        af = []
        ar = []
        for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2.)) + 1):
            fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
            rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(floor(k_nb / 2.)) - i]
            # Unknown k-mers (e.g. those containing N) get a neutral bias.
            try:
                af.append(fBiasDict[fseq])
            except KeyError:
                af.append(defaultKmerValue)
            try:
                ar.append(rBiasDict[rseq])
            except KeyError:
                ar.append(defaultKmerValue)

        # Correcting: log-ratio of observed vs. bias-expected counts
        fSum = sum(af[:window])
        rSum = sum(ar[:window])
        fLast = af[0]
        rLast = ar[0]
        bias_corrected_signal = []
        bias_corrected_signal_forward = []
        bias_corrected_signal_reverse = []
        for i in range(window // 2, len(af) - (window // 2)):
            nhatf = Nf[i - (window // 2)] * (af[i] / fSum)
            nhatr = Nr[i - (window // 2)] * (ar[i] / rSum)
            zf = log(nf[i] + 1) - log(nhatf + 1)
            zr = log(nr[i] + 1) - log(nhatr + 1)
            bias_corrected_signal_forward.append(zf)
            bias_corrected_signal_reverse.append(zr)
            bias_corrected_signal.append(zf + zr)
            fSum -= fLast
            fSum += af[i + (window // 2)]
            fLast = af[i - (window // 2) + 1]
            rSum -= rLast
            rSum += ar[i + (window // 2)]
            rLast = ar[i - (window // 2) + 1]

        # Shift each strand-specific signal so negative values are removed.
        # NOTE(review): abs(min(...)) also shifts upward when the minimum is
        # already positive — confirm this matches the intended normalization.
        min_value = abs(min(bias_corrected_signal_forward))
        bias_fixed_signal_forward = [e + min_value for e in bias_corrected_signal_forward]

        min_value = abs(min(bias_corrected_signal_reverse))
        bias_fixed_signal_reverse = [e + min_value for e in bias_corrected_signal_reverse]

        # Termination
        fastaFile.close()
        if not strands_specific:
            return bias_corrected_signal
        return bias_fixed_signal_forward, bias_fixed_signal_reverse
示例#31
0
    def print_signal(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                     initial_clip=1000, per_norm=98, per_slope=98, bias_table=None, genome_file_name=None,
                     raw_signal_file=None, bc_signal_file=None, norm_signal_file=None, strand_specific=False):
        """Append wiggle (fixedStep) signal tracks for ``ref``:[start, end).

        Writes to whichever of ``raw_signal_file``, ``bc_signal_file`` and
        ``norm_signal_file`` is given; does nothing when all three are None.
        ``bias_table`` (pair of forward/reverse k-mer -> bias dicts) and
        ``genome_file_name`` are required for the corrected/normalized outputs.
        ``strand_specific`` additionally writes per-strand ``*_Forward``/
        ``*_Reverse`` wig files next to the combined ones.
        """

        def _append_wig(file_name, values):
            # One fixedStep block per call; wig coordinates are 1-based.
            with open(file_name, "a") as f:
                f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                    [str(e) for e in nan_to_num(values)]) + "\n")

        if raw_signal_file:
            pileup_region = PileupRegion(start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift)
            if ps_version == "0.7.5":
                self.bam.fetch(reference=ref, start=start, end=end, callback=pileup_region)
            else:
                # Newer pysam versions dropped the fetch(callback=...) API;
                # feed each alignment to the accumulator manually.
                for alignment in self.bam.fetch(reference=ref, start=start, end=end):
                    pileup_region.__call__(alignment)
            raw_signal = array([min(e, initial_clip) for e in pileup_region.vector])
            _append_wig(raw_signal_file, raw_signal)

        if bc_signal_file or norm_signal_file:
            # Parameters
            window = 50
            defaultKmerValue = 1.0
            # Integer half-window: plain "/" yields a float on Python 3 and
            # would break range(), list sizing and indexing below.
            half_win = window // 2

            # Initialization
            fasta = Fastafile(genome_file_name)
            fBiasDict = bias_table[0]
            rBiasDict = bias_table[1]
            # list(...) is required on Python 3, where dict.keys() is a view.
            k_nb = len(list(fBiasDict.keys())[0])
            p1 = start
            p2 = end
            p1_w = p1 - half_win
            p2_w = p2 + half_win
            p1_wk = p1_w - int(k_nb / 2.)
            p2_wk = p2_w + int(k_nb / 2.)

            currStr = str(fasta.fetch(ref, p1_wk, p2_wk - 1)).upper()
            currRevComp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + 1, p2_wk)).upper())

            # Iterating on sequence to create the bias signal
            signal_bias_f = []
            signal_bias_r = []
            for i in range(int(k_nb / 2.), len(currStr) - int(k_nb / 2) + 1):
                fseq = currStr[i - int(k_nb / 2.):i + int(k_nb / 2.)]
                rseq = currRevComp[len(currStr) - int(k_nb / 2.) - i:len(currStr) + int(k_nb / 2.) - i]
                # Unknown k-mers (e.g. containing N) fall back to a neutral bias.
                signal_bias_f.append(fBiasDict.get(fseq, defaultKmerValue))
                signal_bias_r.append(rBiasDict.get(rseq, defaultKmerValue))

            # Raw counts: 5' cut sites per strand over the widened window
            signal_raw_f = [0.0] * (p2_w - p1_w)
            signal_raw_r = [0.0] * (p2_w - p1_w)
            for read in self.bam.fetch(ref, p1_w, p2_w):
                if not read.is_reverse:
                    cut_site = read.pos + forward_shift
                    if p1_w <= cut_site < p2_w:
                        signal_raw_f[cut_site - p1_w] += 1.0
                else:
                    cut_site = read.aend + reverse_shift - 1
                    if p1_w <= cut_site < p2_w:
                        signal_raw_r[cut_site - p1_w] += 1.0

            # Smoothed counts: running sum over a `window`-wide sliding window
            Nf = []
            Nr = []
            fSum = sum(signal_raw_f[:window])
            rSum = sum(signal_raw_r[:window])
            fLast = signal_raw_f[0]
            rLast = signal_raw_r[0]
            for i in range(half_win, len(signal_raw_f) - half_win):
                Nf.append(fSum)
                Nr.append(rSum)
                fSum -= fLast
                fSum += signal_raw_f[i + half_win]
                fLast = signal_raw_f[i - half_win + 1]
                rSum -= rLast
                rSum += signal_raw_r[i + half_win]
                rLast = signal_raw_r[i - half_win + 1]

            # Bias-corrected signal: expected cuts = smoothed count scaled by
            # this position's share of the local (window-wide) bias mass.
            fSum = sum(signal_bias_f[:window])
            rSum = sum(signal_bias_r[:window])
            fLast = signal_bias_f[0]
            rLast = signal_bias_r[0]
            signal_bc = []
            signal_bc_f = []
            signal_bc_r = []
            for i in range(half_win, len(signal_bias_f) - half_win):
                nhatf = Nf[i - half_win] * (signal_bias_f[i] / fSum)
                nhatr = Nr[i - half_win] * (signal_bias_r[i] / rSum)
                signal_bc.append(nhatf + nhatr)
                signal_bc_f.append(nhatf)
                signal_bc_r.append(nhatr)
                fSum -= fLast
                fSum += signal_bias_f[i + half_win]
                fLast = signal_bias_f[i - half_win + 1]
                rSum -= rLast
                rSum += signal_bias_r[i + half_win]
                rLast = signal_bias_r[i - half_win + 1]

            if bc_signal_file:
                _append_wig(bc_signal_file, signal_bc)
                if strand_specific:
                    prefix = bc_signal_file.split(".")[0]
                    _append_wig(prefix + "_Forward" + ".bc.wig", signal_bc_f)
                    _append_wig(prefix + "_Reverse" + ".bc.wig", signal_bc_r)

            if norm_signal_file:
                norm_signal_bc = self.boyle_norm(signal_bc)
                # BUG FIX: honor the per_norm parameter (was hard-coded to 98).
                perc = scoreatpercentile(norm_signal_bc, per_norm)
                std = np.std(norm_signal_bc)
                norm_signal_bc = self.hon_norm_atac(norm_signal_bc, perc, std)
                _append_wig(norm_signal_file, norm_signal_bc)

                if strand_specific:
                    # BUG FIX: derive the prefix from norm_signal_file; the
                    # original read bc_signal_file, which may be None here.
                    prefix = norm_signal_file.split(".")[0]
                    signal_norm_f = self.boyle_norm(signal_bc_f)
                    perc = scoreatpercentile(signal_norm_f, per_norm)
                    std = np.std(signal_norm_f)
                    signal_norm_f = self.hon_norm_atac(signal_norm_f, perc, std)

                    signal_norm_r = self.boyle_norm(signal_bc_r)
                    perc = scoreatpercentile(signal_norm_r, per_norm)
                    std = np.std(signal_norm_r)
                    signal_norm_r = self.hon_norm_atac(signal_norm_r, perc, std)

                    _append_wig(prefix + "_Forward" + ".norm.wig", signal_norm_f)
                    _append_wig(prefix + "_Reverse" + ".norm.wig", signal_norm_r)

            fasta.close()
示例#32
0
def bias_correction_dnase(signal_class, signal, chrName, start, end,
                          forward_shift, reverse_shift):
    """Return the single-hit bias-corrected DNase signal for chrName:[start, end).

    Loads the bundled single-hit forward/reverse k-mer bias tables, counts
    shifted 5' cut sites per strand, smooths them with a 50 bp sliding window,
    and returns log-ratio corrected values. Falls back to returning ``signal``
    unchanged when the tables cannot be loaded or the widened window would
    reach before the chromosome start.
    """
    table_file_name_F = os.path.join(os.path.dirname(__file__), '../data/single_hit_bias_table_F.txt')
    table_file_name_R = os.path.join(os.path.dirname(__file__), '../data/single_hit_bias_table_R.txt')
    bias_table = load_table(table_file_name_F, table_file_name_R)
    if not bias_table:
        return signal

    # Parameters
    window = 50
    half_win = int(window / 2)
    neutral_bias = 1.0

    # Initialization
    fasta = Fastafile(signal_class.fastaFile)
    fwd_table, rev_table = bias_table[0], bias_table[1]
    kmer_size = len(list(fwd_table.keys())[0])
    # Half-kmer offsets; for odd k the "up" side takes the extra base.
    k_up = int(ceil(kmer_size / 2.))
    k_down = int(floor(kmer_size / 2.))
    left = int(start - half_win)
    right = int(end + half_win)
    seq_left = left - k_down
    seq_right = right + k_up
    if start <= 0 or left <= 0 or seq_left <= 0:
        return signal

    # Raw counts of shifted cut sites per strand
    n_fwd = [0.0] * int(right - left)
    n_rev = [0.0] * int(right - left)
    for read in signal_class.bam.fetch(chrName, left, right):
        # check if the read is unmapped, according to issue #112
        if read.is_unmapped:
            continue
        if read.is_reverse:
            cut = read.aend + reverse_shift - 1
            if left <= cut < right:
                n_rev[cut - left] += 1.0
        else:
            cut = read.pos + forward_shift
            if left <= cut < right:
                n_fwd[cut - left] += 1.0

    # Smoothed counts: running window-sum per strand
    smooth_fwd, smooth_rev = [], []
    run_f, run_r = sum(n_fwd[:window]), sum(n_rev[:window])
    drop_f, drop_r = n_fwd[0], n_rev[0]
    for i in range(half_win, int(len(n_fwd) - half_win)):
        smooth_fwd.append(run_f)
        smooth_rev.append(run_r)
        run_f -= drop_f
        run_f += n_fwd[i + half_win]
        drop_f = n_fwd[i - half_win + 1]
        run_r -= drop_r
        run_r += n_rev[i + half_win]
        drop_r = n_rev[i - half_win + 1]

    # Fetching sequence (reverse complement shifted by one for the minus strand)
    fwd_seq = str(fasta.fetch(chrName, seq_left, seq_right - 1)).upper()
    rev_seq = revcomp(str(fasta.fetch(chrName, seq_left + 1, seq_right)).upper())

    # Per-position k-mer bias; unknown k-mers (e.g. with N) get a neutral value
    bias_fwd, bias_rev = [], []
    seq_len = len(fwd_seq)
    for i in range(k_up, seq_len - k_down + 1):
        bias_fwd.append(fwd_table.get(fwd_seq[i - k_down:i + k_up], neutral_bias))
        bias_rev.append(rev_table.get(rev_seq[seq_len - k_up - i:seq_len + k_down - i], neutral_bias))

    # Bias correction: log-ratio of observed vs expected cuts per strand
    run_f, run_r = sum(bias_fwd[:window]), sum(bias_rev[:window])
    drop_f, drop_r = bias_fwd[0], bias_rev[0]
    corrected = []
    for i in range(half_win, int(len(bias_fwd) - half_win)):
        expected_f = smooth_fwd[i - half_win] * (bias_fwd[i] / run_f)
        expected_r = smooth_rev[i - half_win] * (bias_rev[i] / run_r)
        zf = log(n_fwd[i] + 1) - log(expected_f + 1)
        zr = log(n_rev[i] + 1) - log(expected_r + 1)
        corrected.append(zf + zr)
        run_f -= drop_f
        run_f += bias_fwd[i + half_win]
        drop_f = bias_fwd[i - half_win + 1]
        run_r -= drop_r
        run_r += bias_rev[i + half_win]
        drop_r = bias_rev[i - half_win + 1]

    # Termination
    fasta.close()
    return corrected
示例#33
0
文件: Main.py 项目: Marvin84/reg-gen
def main_matching():
    """Perform motif matching over the region sets of an experimental matrix.

    Reads an experimental-matrix file naming input GenomicRegionSets, optionally
    creates a proportional random background region set, builds a PWM for every
    motif repository in the default motif data, and writes one BED (optionally
    bigBed) file of motif-predicted binding sites per region set.

    Side effects: creates a "Match" folder under --output-location, writes
    BED/bigBed files there, and shells out to `sort` and `bedToBigBed`.

    Authors: Eduardo G. Gusmao.
    """

    ###################################################################################################
    # Processing Input Arguments
    ###################################################################################################

    # Initializing Error Handler
    main_error_handler = ErrorHandler()

    # Parameters
    usage_message = "%prog --matching [options] <experiment_matrix>"

    # Initializing Option Parser
    parser = PassThroughOptionParser(usage = usage_message)

    # Parameters Options
    parser.add_option("--organism", dest = "organism", type = "string", metavar="STRING", default = "hg19",
                      help = ("Organism considered on the analysis. Check our full documentation for all available "
                              "options. All default files such as genomes will be based on the chosen organism "
                              "and the data.config file."))
    parser.add_option("--fpr", dest = "fpr", type = "float", metavar="FLOAT", default = 0.0001,
                      help = ("False positive rate cutoff for motif matching."))
    parser.add_option("--precision", dest = "precision", type = "int", metavar="INT", default = 10000,
                      help = ("Score distribution precision for determining false positive rate cutoff."))
    parser.add_option("--pseudocounts", dest = "pseudocounts", type = "float", metavar="FLOAT", default = 0.1,
                      help = ("Pseudocounts to be added to raw counts of each PFM."))
    parser.add_option("--rand-proportion", dest = "rand_proportion", type = "float", metavar="FLOAT", default = 10.0,
                      help = ("If random coordinates need to be created (for further motif enrichment),"
                              "then it will be created a number of coordinates that equals this"
                              "parameter x the number of input regions (in case of multiple regions, the"
                              "larger is considered). If zero (0) is passed, then no random coordinates are created."))
    parser.add_option("--norm-threshold", dest = "norm_threshold", action = "store_true", default = False,
                      help = ("If this option is used, the thresholds for all PWMs will be normalized by their length."
                              "In this scheme, the threshold cutoff is evaluated in the regular way by the given fpr."
                              "Then, all thresholds are divided by the lenght of the motif. The final threshold consists"
                              "of the average between all normalized motif thresholds. This single threshold will be"
                              "applied to all motifs."))

    # Output Options
    parser.add_option("--output-location", dest = "output_location", type = "string", metavar="PATH", default = os.getcwd(),
                      help = ("Path where the output files will be written."))
    parser.add_option("--bigbed", dest = "bigbed", action = "store_true", default = False,
                      help = ("If this option is used, all bed files will be written as bigbed."))
    parser.add_option("--normalize-bitscore", dest = "normalize_bitscore", action = "store_false", default = True,
                      help = ("In order to print bigbed files the scores need to be normalized between 0 and 1000."
                              "This option should be used if real bitscores should be printed in the resulting bed file."
                              "In this case, a bigbed file will not be created."))

    # Processing Options
    options, arguments = parser.parse_args()

    # Additional Parameters
    # Output subfolder name and the label given to the generated background set.
    matching_folder_name = "Match"
    random_region_name = "random_regions"

    ###################################################################################################
    # Initializations
    ###################################################################################################

    # Output folder (created on demand; any failure is fatal)
    matching_output_location = os.path.join(options.output_location,matching_folder_name)
    try:
        if(not os.path.isdir(matching_output_location)): os.makedirs(matching_output_location)
    except Exception: main_error_handler.throw_error("MM_OUT_FOLDER_CREATION")

    # Default genomic data
    genome_data = GenomeData(options.organism)

    # Default motif data
    motif_data = MotifData()

    ###################################################################################################
    # Reading Input Matrix
    ###################################################################################################

    # Reading arguments: exactly one positional argument (the experiment matrix)
    # is expected; extras only produce a warning.
    try:
        input_matrix = arguments[0]
        if(len(arguments) > 1): main_error_handler.throw_warning("MM_MANY_ARG")
    except Exception: main_error_handler.throw_error("MM_NO_ARGUMENT")

    # Create experimental matrix
    try:
        exp_matrix = ExperimentalMatrix()
        exp_matrix.read(input_matrix)
    except Exception: main_error_handler.throw_error("MM_WRONG_EXPMAT")

    ###################################################################################################
    # Reading Regions
    ###################################################################################################

    # Initialization
    max_region_len = 0
    max_region = None
    input_regions = []

    try:
        exp_matrix_objects_dict = exp_matrix.objectsDict
    except Exception: main_error_handler.throw_error("MM_WRONG_EXPMAT")

    # Iterating on experimental matrix objects, keeping only GenomicRegionSets
    for k in exp_matrix_objects_dict.keys():

        curr_genomic_region = exp_matrix_objects_dict[k]

        # If the object is a GenomicRegionSet
        if(isinstance(curr_genomic_region,GenomicRegionSet)):

            # Sorting input region
            curr_genomic_region.sort()

            # Append label and GenomicRegionSet
            input_regions.append(curr_genomic_region)

            # Verifying max_region_len for random region generation
            # (the largest set is used as the template for random regions)
            curr_len = len(curr_genomic_region)
            if(curr_len > max_region_len):
                max_region_len = curr_len
                max_region = exp_matrix_objects_dict[k]

    ###################################################################################################
    # Creating random region
    ###################################################################################################

    # Create random coordinates
    # NOTE(review): if the matrix contained no GenomicRegionSet, max_region is
    # still None here and random_regions below would fail — verify upstream.
    rand_region = None
    if(options.rand_proportion > 0):

        # Create random coordinates and name it random_regions
        rand_region = max_region.random_regions(options.organism, multiply_factor=options.rand_proportion, chrom_X=True)
        rand_region.sort()
        rand_region.name = random_region_name

        # Put random regions in the end of the input regions
        input_regions.append(rand_region)

        # Writing random regions
        output_file_name = os.path.join(matching_output_location, random_region_name)
        rand_bed_file_name = output_file_name+".bed"
        rand_region.write_bed(rand_bed_file_name)

        # Verifying condition to write bb
        if(options.bigbed):

            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            # Converting to big bed; on success the intermediate BED is removed.
            rand_bb_file_name = output_file_name+".bb"
            try:
                os.system(" ".join(["bedToBigBed", rand_bed_file_name, chrom_sizes_file, rand_bb_file_name, "-verbose=0"]))
                os.remove(rand_bed_file_name)
            except Exception: pass # WARNING

    else: main_error_handler.throw_error("MM_WRONG_RANDPROP")

    ###################################################################################################
    # Creating PWMs
    ###################################################################################################

    # Initialization
    motif_list = []

    # Creating thresholds object
    thresholds = Thresholds(motif_data)

    # Fetching list with all motif file names (every *.pwm in every repository)
    motif_file_names = []
    for motif_repository in motif_data.get_pwm_list():
        for motif_file_name in glob(os.path.join(motif_repository,"*.pwm")):
            motif_file_names.append(motif_file_name)

    # Iterating on grouped file name list
    for motif_file_name in motif_file_names:

        # Append motif motif_list
        motif_list.append(Motif(motif_file_name, options.pseudocounts, options.precision, options.fpr, thresholds))

    # Performing normalized threshold strategy if requested:
    # one shared cutoff = mean of (per-motif threshold / motif length).
    if(options.norm_threshold):
        threshold_list = [motif.threshold/motif.len for motif in motif_list]
        unique_threshold = sum(threshold_list)/len(threshold_list)
    else: unique_threshold = None
        

    ###################################################################################################
    # Motif Matching
    ###################################################################################################

    # Creating genome file
    genome_file = Fastafile(genome_data.get_genome())

    # Iterating on list of genomic regions
    for genomic_region_set in input_regions:

        # Initializing output bed file
        output_file_name = os.path.join(matching_output_location, genomic_region_set.name+"_mpbs")
        bed_file_name = output_file_name+".bed"
        output_file = open(bed_file_name,"w")
        
        # Iterating on genomic regions
        for genomic_region in genomic_region_set.sequences:

            # Reading sequence associated to genomic_region
            sequence = str(genome_file.fetch(genomic_region.chrom, genomic_region.initial, genomic_region.final))

            # Splitting the sequence in smaller sequences to remove the "N" regions
            # (filter(None, ...) drops the empty strings the split produces)
            sequence_list = filter(None,sequence.split("N"))

            # Perform motif matching for each motif in each sequence
            for seq in sequence_list:
                for motif in motif_list: match_single(motif, seq, genomic_region, output_file, unique_threshold, options.normalize_bitscore)

        # Closing file
        output_file.close()

        # Verifying condition to write bb
        if(options.bigbed and options.normalize_bitscore):

            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            # Converting to big bed: bedToBigBed requires coordinate-sorted
            # input, so sort first; both intermediates are removed afterwards.
            sort_file_name = output_file_name+"_sort.bed"
            bb_file_name = output_file_name+".bb"
            os.system("sort -k1,1 -k2,2n "+bed_file_name+" > "+sort_file_name)
            os.system(" ".join(["bedToBigBed", sort_file_name, chrom_sizes_file, bb_file_name, "-verbose=0"]))
            os.remove(bed_file_name); os.remove(sort_file_name)
示例#34
0
    def line(self):
        signal = GenomicSignal(self.bam_file)
        signal.load_sg_coefs(slope_window_size=9)
        bias_table = BiasTable()
        bias_table_list = self.bias_table.split(",")
        table = bias_table.load_table(table_file_name_F=bias_table_list[0],
                                      table_file_name_R=bias_table_list[1])
        genome_data = GenomeData(self.organism)
        fasta = Fastafile(genome_data.get_genome())
        pwm_dict = dict([("A", [0.0] * self.window_size),
                         ("C", [0.0] * self.window_size),
                         ("G", [0.0] * self.window_size),
                         ("T", [0.0] * self.window_size),
                         ("N", [0.0] * self.window_size)])

        mean_raw_signal = np.zeros(self.window_size)
        mean_bc_signal = np.zeros(self.window_size)
        mean_raw_signal_f = np.zeros(self.window_size)
        mean_bc_signal_f = np.zeros(self.window_size)
        mean_raw_signal_r = np.zeros(self.window_size)
        mean_bc_signal_r = np.zeros(self.window_size)

        mean_bias_signal_f = np.zeros(self.window_size)
        mean_bias_signal_r = np.zeros(self.window_size)
        num_sites = 0

        mpbs_regions = GenomicRegionSet("Motif Predicted Binding Sites")
        mpbs_regions.read_bed(self.motif_file)

        total_nc_signal = 0
        total_nl_signal = 0
        total_nr_signal = 0

        for region in mpbs_regions:
            if str(region.name).split(":")[-1] == "Y":
                num_sites += 1
                # Extend by 50 bp
                mid = (region.initial + region.final) / 2
                p1 = mid - (self.window_size / 2)
                p2 = mid + (self.window_size / 2)

                if not self.strands_specific:
                    # Fetch raw signal
                    raw_signal, _ = signal.get_signal(
                        ref=region.chrom,
                        start=p1,
                        end=p2,
                        downstream_ext=self.atac_downstream_ext,
                        upstream_ext=self.atac_upstream_ext,
                        forward_shift=self.atac_forward_shift,
                        reverse_shift=self.atac_reverse_shift,
                        genome_file_name=genome_data.get_genome())

                    mean_raw_signal = np.add(mean_raw_signal, raw_signal)

                    # Fetch bias correction signal
                    bc_signal, _ = signal.get_signal(
                        ref=region.chrom,
                        start=p1,
                        end=p2,
                        bias_table=table,
                        downstream_ext=self.atac_downstream_ext,
                        upstream_ext=self.atac_upstream_ext,
                        forward_shift=self.atac_forward_shift,
                        reverse_shift=self.atac_reverse_shift,
                        genome_file_name=genome_data.get_genome())

                    mean_bc_signal = np.add(mean_bc_signal, bc_signal)
                else:
                    raw_signal_f, _, raw_signal_r, _ = signal.get_signal_per_strand(
                        ref=region.chrom,
                        start=p1,
                        end=p2,
                        downstream_ext=self.atac_downstream_ext,
                        upstream_ext=self.atac_upstream_ext,
                        forward_shift=self.atac_forward_shift,
                        reverse_shift=self.atac_reverse_shift,
                        genome_file_name=genome_data.get_genome())
                    mean_raw_signal_f = np.add(mean_raw_signal_f, raw_signal_f)
                    mean_raw_signal_r = np.add(mean_raw_signal_r, raw_signal_r)

                    bc_signal_f, _, bc_signal_r, _ = signal.get_signal_per_strand(
                        ref=region.chrom,
                        start=p1,
                        end=p2,
                        bias_table=table,
                        downstream_ext=self.atac_downstream_ext,
                        upstream_ext=self.atac_upstream_ext,
                        forward_shift=self.atac_forward_shift,
                        reverse_shift=self.atac_reverse_shift,
                        genome_file_name=genome_data.get_genome())
                    mean_bc_signal_f = np.add(mean_bc_signal_f, bc_signal_f)
                    mean_bc_signal_r = np.add(mean_bc_signal_r, bc_signal_r)

                # Update pwm
                aux_plus = 1
                dna_seq = str(fasta.fetch(region.chrom, p1, p2)).upper()
                if (region.final - region.initial) % 2 == 0:
                    aux_plus = 0
                dna_seq_rev = AuxiliaryFunctions.revcomp(
                    str(fasta.fetch(region.chrom, p1 + aux_plus,
                                    p2 + aux_plus)).upper())
                if region.orientation == "+":
                    for i in range(0, len(dna_seq)):
                        pwm_dict[dna_seq[i]][i] += 1
                elif region.orientation == "-":
                    for i in range(0, len(dna_seq_rev)):
                        pwm_dict[dna_seq_rev[i]][i] += 1

                # Create bias signal
                bias_table_f = table[0]
                bias_table_r = table[1]
                self.k_nb = len(bias_table_f.keys()[0])
                bias_signal_f = []
                bias_signal_r = []
                p1_wk = p1 - int(self.k_nb / 2)
                p2_wk = p2 + int(self.k_nb / 2)
                dna_seq = str(fasta.fetch(region.chrom, p1_wk,
                                          p2_wk - 1)).upper()
                dna_seq_rev = AuxiliaryFunctions.revcomp(
                    str(fasta.fetch(region.chrom, p1_wk, p2_wk + 1)).upper())
                for i in range(int(self.k_nb / 2),
                               len(dna_seq) - int(self.k_nb / 2) + 1):
                    fseq = dna_seq[i - int(self.k_nb / 2):i +
                                   int(self.k_nb / 2)]
                    rseq = dna_seq_rev[len(dna_seq) - int(self.k_nb / 2) -
                                       i:len(dna_seq) + int(self.k_nb / 2) - i]
                    try:
                        bias_signal_f.append(bias_table_f[fseq])
                    except Exception:
                        bias_signal_f.append(1)
                    try:
                        bias_signal_r.append(bias_table_r[rseq])
                    except Exception:
                        bias_signal_r.append(1)

                mean_bias_signal_f = np.add(mean_bias_signal_f,
                                            np.array(bias_signal_f))
                mean_bias_signal_r = np.add(mean_bias_signal_r,
                                            np.array(bias_signal_r))

                if self.protection_score:
                    # signal in the center of the MPBS
                    p1 = region.initial
                    p2 = region.final
                    nc_signal, _ = signal.get_signal(
                        ref=region.chrom,
                        start=p1,
                        end=p2,
                        bias_table=table,
                        downstream_ext=self.atac_downstream_ext,
                        upstream_ext=self.atac_upstream_ext,
                        forward_shift=self.atac_forward_shift,
                        reverse_shift=self.atac_reverse_shift,
                        genome_file_name=genome_data.get_genome())
                    total_nc_signal += sum(nc_signal)
                    p1 = region.final
                    p2 = 2 * region.final - region.initial
                    nr_signal, _ = signal.get_signal(
                        ref=region.chrom,
                        start=p1,
                        end=p2,
                        bias_table=table,
                        downstream_ext=self.atac_downstream_ext,
                        upstream_ext=self.atac_upstream_ext,
                        forward_shift=self.atac_forward_shift,
                        reverse_shift=self.atac_reverse_shift,
                        genome_file_name=genome_data.get_genome())
                    total_nr_signal += sum(nr_signal)
                    p1 = 2 * region.initial - region.final
                    p2 = region.final
                    nl_signal, _ = signal.get_signal(
                        ref=region.chrom,
                        start=p1,
                        end=p2,
                        bias_table=table,
                        downstream_ext=self.atac_downstream_ext,
                        upstream_ext=self.atac_upstream_ext,
                        forward_shift=self.atac_forward_shift,
                        reverse_shift=self.atac_reverse_shift,
                        genome_file_name=genome_data.get_genome())
                    total_nl_signal += sum(nl_signal)

        mean_raw_signal = mean_raw_signal / num_sites
        mean_bc_signal = mean_bc_signal / num_sites

        mean_raw_signal_f = mean_raw_signal_f / num_sites
        mean_raw_signal_r = mean_raw_signal_r / num_sites
        mean_bc_signal_f = mean_bc_signal_f / num_sites
        mean_bc_signal_r = mean_bc_signal_r / num_sites

        mean_bias_signal_f = mean_bias_signal_f / num_sites
        mean_bias_signal_r = mean_bias_signal_r / num_sites

        protection_score = (total_nl_signal + total_nr_signal -
                            2 * total_nc_signal) / (2 * num_sites)

        # Output PWM and create logo
        pwm_fname = os.path.join(self.output_loc,
                                 "{}.pwm".format(self.motif_name))
        pwm_file = open(pwm_fname, "w")
        for e in ["A", "C", "G", "T"]:
            pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[e]]) + "\n")
        pwm_file.close()

        logo_fname = os.path.join(self.output_loc,
                                  "{}.logo.eps".format(self.motif_name))
        pwm = motifs.read(open(pwm_fname), "pfm")
        pwm.weblogo(logo_fname,
                    format="eps",
                    stack_width="large",
                    stacks_per_line="100",
                    color_scheme="color_classic",
                    unit_name="",
                    show_errorbars=False,
                    logo_title="",
                    show_xaxis=False,
                    xaxis_label="",
                    show_yaxis=False,
                    yaxis_label="",
                    show_fineprint=False,
                    show_ends=False)

        # Output the raw, bias corrected signal and protection score
        output_fname = os.path.join(self.output_loc,
                                    "{}.txt".format(self.motif_name))
        output_file = open(output_fname, "w")
        if not self.strands_specific:
            output_file.write("raw signal: \n" +
                              np.array_str(mean_raw_signal) + "\n")
            output_file.write("bias corrected signal: \n" +
                              np.array_str(mean_bc_signal) + "\n")
        else:
            output_file.write("raw forward signal: \n" +
                              np.array_str(mean_raw_signal_f) + "\n")
            output_file.write("bias corrected forward signal: \n" +
                              np.array_str(mean_bc_signal_f) + "\n")
            output_file.write("raw reverse signal: \n" +
                              np.array_str(mean_raw_signal_r) + "\n")
            output_file.write("bias reverse corrected signal: \n" +
                              np.array_str(mean_bc_signal_r) + "\n")
        output_file.write("forward bias signal: \n" +
                          np.array_str(mean_bias_signal_f) + "\n")
        output_file.write("reverse bias signal: \n" +
                          np.array_str(mean_bias_signal_r) + "\n")
        if self.protection_score:
            output_file.write("protection score: \n" + str(protection_score) +
                              "\n")
        output_file.close()

        if self.strands_specific:
            fig, (ax1, ax2, ax3) = plt.subplots(3, figsize=(12.0, 10.0))
        else:
            fig, (ax1, ax2) = plt.subplots(2)
        x = np.linspace(-50, 49, num=self.window_size)

        ax1.plot(x, mean_bias_signal_f, color='red', label='Forward')
        ax1.plot(x, mean_bias_signal_r, color='blue', label='Reverse')

        ax1.xaxis.set_ticks_position('bottom')
        ax1.yaxis.set_ticks_position('left')
        ax1.spines['top'].set_visible(False)
        ax1.spines['right'].set_visible(False)
        ax1.spines['left'].set_position(('outward', 15))
        ax1.spines['bottom'].set_position(('outward', 5))
        ax1.tick_params(direction='out')

        ax1.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
        ax1.set_xticklabels([
            '-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40',
            '49'
        ])
        min_bias_signal = min(min(mean_bias_signal_f), min(mean_bias_signal_r))
        max_bias_signal = max(max(mean_bias_signal_f), max(mean_bias_signal_r))
        ax1.set_yticks([min_bias_signal, max_bias_signal])
        ax1.set_yticklabels(
            [str(round(min_bias_signal, 2)),
             str(round(max_bias_signal, 2))],
            rotation=90)

        ax1.text(-48,
                 max_bias_signal,
                 '# Sites = {}'.format(str(num_sites)),
                 fontweight='bold')
        ax1.set_title(self.motif_name, fontweight='bold')
        ax1.set_xlim(-50, 49)
        ax1.set_ylim([min_bias_signal, max_bias_signal])
        ax1.legend(loc="upper right", frameon=False)
        ax1.set_ylabel("Average Bias \nSignal", rotation=90, fontweight='bold')

        if not self.strands_specific:
            mean_raw_signal = self.standardize(mean_raw_signal)
            mean_bc_signal = self.standardize(mean_bc_signal)
            ax2.plot(x, mean_raw_signal, color='red', label='Uncorrected')
            ax2.plot(x, mean_bc_signal, color='green', label='Corrected')
        else:
            mean_raw_signal_f = self.standardize(mean_raw_signal_f)
            mean_raw_signal_r = self.standardize(mean_raw_signal_r)
            mean_bc_signal_f = self.standardize(mean_bc_signal_f)
            mean_bc_signal_r = self.standardize(mean_bc_signal_r)
            ax2.plot(x, mean_raw_signal_f, color='red', label='Forward')
            ax2.plot(x, mean_raw_signal_r, color='green', label='Reverse')
            ax3.plot(x, mean_bc_signal_f, color='red', label='Forward')
            ax3.plot(x, mean_bc_signal_r, color='green', label='Reverse')

        ax2.xaxis.set_ticks_position('bottom')
        ax2.yaxis.set_ticks_position('left')
        ax2.spines['top'].set_visible(False)
        ax2.spines['right'].set_visible(False)
        ax2.spines['left'].set_position(('outward', 15))
        ax2.tick_params(direction='out')
        ax2.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
        ax2.set_xticklabels([
            '-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40',
            '49'
        ])
        ax2.set_yticks([0, 1])
        ax2.set_yticklabels([str(0), str(1)], rotation=90)
        ax2.set_xlim(-50, 49)
        ax2.set_ylim([0, 1])

        if not self.strands_specific:
            ax2.spines['bottom'].set_position(('outward', 40))
            ax2.set_xlabel("Coordinates from Motif Center", fontweight='bold')
            ax2.set_ylabel("Average ATAC-seq \nSignal",
                           rotation=90,
                           fontweight='bold')
            ax2.legend(loc="center",
                       frameon=False,
                       bbox_to_anchor=(0.85, 0.06))
        else:
            ax2.spines['bottom'].set_position(('outward', 5))
            ax2.set_ylabel("Average ATAC-seq \n Uncorrected Signal",
                           rotation=90,
                           fontweight='bold')
            ax2.legend(loc="lower right", frameon=False)

            ax3.xaxis.set_ticks_position('bottom')
            ax3.yaxis.set_ticks_position('left')
            ax3.spines['top'].set_visible(False)
            ax3.spines['right'].set_visible(False)
            ax3.spines['left'].set_position(('outward', 15))
            ax3.tick_params(direction='out')
            ax3.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
            ax3.set_xticklabels([
                '-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40',
                '49'
            ])
            ax3.set_yticks([0, 1])
            ax3.set_yticklabels([str(0), str(1)], rotation=90)
            ax3.set_xlim(-50, 49)
            ax3.set_ylim([0, 1])
            ax3.legend(loc="lower right", frameon=False)
            ax3.spines['bottom'].set_position(('outward', 40))
            ax3.set_xlabel("Coordinates from Motif Center", fontweight='bold')
            ax3.set_ylabel("Average ATAC-seq \n Corrected Signal",
                           rotation=90,
                           fontweight='bold')
            ax3.text(-48,
                     0.05,
                     '# K-mer = {}\n# Forward Shift = {}'.format(
                         str(self.k_nb), str(self.atac_forward_shift)),
                     fontweight='bold')

        figure_name = os.path.join(self.output_loc,
                                   "{}.line.eps".format(self.motif_name))
        fig.subplots_adjust(bottom=.2, hspace=.5)
        fig.tight_layout()
        fig.savefig(figure_name, format="eps", dpi=300)

        # Creating canvas and printing eps / pdf with merged results
        output_fname = os.path.join(self.output_loc,
                                    "{}.eps".format(self.motif_name))
        c = pyx.canvas.canvas()
        c.insert(pyx.epsfile.epsfile(0, 0, figure_name, scale=1.0))
        if self.strands_specific:
            c.insert(
                pyx.epsfile.epsfile(2.76,
                                    1.58,
                                    logo_fname,
                                    width=27.2,
                                    height=2.45))
        else:
            c.insert(
                pyx.epsfile.epsfile(2.5,
                                    1.54,
                                    logo_fname,
                                    width=16,
                                    height=1.75))
        c.writeEPSfile(output_fname)
        os.system("epstopdf " + figure_name)
        os.system("epstopdf " + logo_fname)
        os.system("epstopdf " + output_fname)
示例#35
0
def create_signal(args, regions):
    """Count observed and expected k-mer cut-site frequencies over *regions*.

    For every read in ``args.reads_file`` the k-mer of length ``args.k_nb``
    centred on the shift-corrected cut site is tallied as "observed";
    every k-mer of the underlying genomic sequence is tallied as
    "expected".  Forward- and reverse-strand reads are counted separately
    (reverse-strand k-mers are reverse-complemented first).  Four files
    named "<k>_{f,r}_{obs,exp}.fa" containing tab-separated
    "<kmer>\\t<count>" lines (non-zero counts only) are written to
    ``args.output_location``.
    """
    def revcomp(s):
        # Reverse complement; 'N' maps to itself.
        rev_dict = dict([("A", "T"), ("T", "A"), ("C", "G"), ("G", "C"), ("N", "N")])
        return "".join([rev_dict[e] for e in s[::-1]])

    alphabet = ["A", "C", "G", "T"]
    kmer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    f_obs_dict = dict([(e, 0.0) for e in kmer_comb])
    r_obs_dict = dict([(e, 0.0) for e in kmer_comb])
    f_exp_dict = dict([(e, 0.0) for e in kmer_comb])
    r_exp_dict = dict([(e, 0.0) for e in kmer_comb])

    bam_file = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fasta_file = Fastafile(genome_data.get_genome())

    for region in regions:
        # Observed counts: one k-mer per read, centred on the cut site.
        reads = bam_file.fetch(reference=region.chrom, start=region.initial, end=region.final)
        for read in reads:
            if not read.is_reverse:
                p1 = read.pos - int(floor(args.k_nb / 2)) + args.forward_shift - 1
            else:
                p1 = read.aend - int(floor(args.k_nb / 2)) + args.reverse_shift + 1
            p2 = p1 + args.k_nb
            try:
                dna_sequence_obs = str(fasta_file.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                # Out-of-bounds fetches (reads near chromosome ends) are skipped.
                continue
            if 'N' not in dna_sequence_obs:
                if read.is_reverse:
                    dna_sequence_obs = revcomp(dna_sequence_obs)
                    r_obs_dict[dna_sequence_obs] += 1
                else:
                    f_obs_dict[dna_sequence_obs] += 1

        # Expected (background) counts: every k-mer of the region sequence.
        try:
            dna_sequence_exp = str(fasta_file.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        dna_sequence_exp_rev = revcomp(dna_sequence_exp)
        for i in range(0, len(dna_sequence_exp) - args.k_nb):
            s = dna_sequence_exp[i:i + args.k_nb]
            if "N" not in s:
                f_exp_dict[s] += 1
            s = dna_sequence_exp_rev[i:i + args.k_nb]
            if "N" not in s:
                r_exp_dict[s] += 1

    def _write_counts(suffix, counts):
        # Write non-zero counts in canonical kmer_comb order.  `with`
        # guarantees the handle is closed even on error (the original
        # left four handles unclosed if a write raised), and iterating
        # kmer_comb fixes the original's use of r_obs_dict's keys for
        # every table (harmless only because all tables share one key
        # set).
        fname = os.path.join(args.output_location,
                             "{}_{}.fa".format(str(args.k_nb), suffix))
        with open(fname, "w") as output_file:
            for kmer in kmer_comb:
                if counts[kmer] > 0:
                    output_file.write(kmer + "\t" + str(counts[kmer]) + "\n")

    _write_counts("f_obs", f_obs_dict)
    _write_counts("f_exp", f_exp_dict)
    _write_counts("r_obs", r_obs_dict)
    _write_counts("r_exp", r_exp_dict)
示例#36
0
            score,
            name2,
            cdsStartStat,
            cdsEndStat,
            exonFrames) = line.strip().split()

            exonStarts = map(int, exonStarts.split(',')[:-1])
            exonEnds   = map(int, exonEnds.split(',')[:-1])

            assert len(exonEnds) == len(exonStarts) == int(exonCount)

            seq = ''

            for start, end in zip(exonStarts, exonEnds):
                if chrom in fa.references:
                    seq += fa.fetch(chrom, start, end)

            if strand == '-':
                seq = rc(seq)

            #seq = seq + 'A'*100 # polyadenylate

            if seq:
                genes[name2].append(seq)


    for name in genes:
        for i, tx in enumerate(genes[name]):
            print ">%s.%d\n%s" % (name, i, tx)

else:
示例#37
0
def estimate_bias_kmer(args):
    """Estimate per-k-mer cleavage bias from reads over hypersensitive regions.

    For each region in ``args.regions_file``, observed k-mer frequencies are
    counted at shift-corrected read cut sites (forward and reverse strands
    separately) and expected frequencies are counted over the whole region
    sequence.  The bias for a k-mer is
    ``(obs / total_reads) / (exp / total_kmers)`` with a pseudocount of 1.0,
    rounded to 6 decimals.  The two bias tables (forward, reverse) are
    written via ``write_table``.
    """
    # Parameters
    maxDuplicates = 100  # max reads tolerated at one cut position (PCR-duplicate guard)
    pseudocount = 1.0

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    # k-mer -> count dictionaries (observed and expected, per strand)
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()

    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    # Iterating on HS regions
    for region in regions:

        # Initialization
        prevPos = -1
        trueCounter = 0

        # Evaluating observed frequencies ####################################
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):

            # k-mer window centred on the shift-corrected cut site.
            # (The p1 formula was duplicated in both branches of the
            # original; only cut_site differs by strand.)
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
            else:
                cut_site = r.aend + args.reverse_shift + 1
            p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prevPos:
                trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if trueCounter > maxDuplicates: continue

            # Fetching k-mer (out-of-bounds fetches are skipped)
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer; dict.get replaces the original's broad
            # `except Exception` around a plain missing-key increment.
            if not r.is_reverse:
                ct_reads_f += 1
                obsDictF[currStr] = obsDictF.get(currStr, 0) + 1
            else:
                ct_reads_r += 1
                obsDictR[currStr] = obsDictR.get(currStr, 0) + 1

        # Evaluating expected frequencies ####################################
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)

        # Iterating on each sequence position
        for i in range(0, len(currStr) - args.k_nb):
            ct_kmers += 1
            # Forward-strand k-mer
            s = currStr[i:i + args.k_nb]
            expDictF[s] = expDictF.get(s, 0) + 1
            # Reverse-complement k-mer
            s = currRevComp[i:i + args.k_nb]
            expDictR[s] = expDictR.get(s, 0) + 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Creating bias dictionary: bias = (obs/total_reads) / (exp/total_kmers)
    alphabet = ["A", "C", "G", "T"]
    kmerComb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in kmerComb])
    bias_table_R = dict([(e, 0.0) for e in kmerComb])
    for kmer in kmerComb:
        obsF = obsDictF.get(kmer, 0) + pseudocount
        expF = expDictF.get(kmer, 0) + pseudocount
        if ct_reads_f == 0:
            # No forward reads at all: neutral bias.
            bias_table_F[kmer] = 1
        else:
            bias_table_F[kmer] = round(float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)
        obsR = obsDictR.get(kmer, 0) + pseudocount
        expR = expDictR.get(kmer, 0) + pseudocount
        if ct_reads_r == 0:
            bias_table_R[kmer] = 1
        else:
            bias_table_R[kmer] = round(float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
示例#38
0
  chrName = ll[0]; p1 = int(ll[1]); p2 = int(ll[2])

  # Starting result structures
  regionTagCount = 0
  resVec = [globalMin,0,0,0,0,0] # BIT-SCORE, MOTIF_P1, MOTIF_P2, FP_OVERLAP, FP_P1, FP_P2
  counter = 0

  # Evaluating Overall TC
  try: regionTagCount = tag_count(chrName, p1, p2, dnaseBam, tcHalfWindow)
  except Exception: 
    print "Exception TC raised in "+line
    writeOutput(ll,regionTagCount,resVec,outFile)
    continue

  # Fetching sequence
  try: sequence = str(genomeFile.fetch(chrName, p1, p2))
  except Exception:
    print "Exception SEQUENCE raised in "+line
    writeOutput(ll,regionTagCount,resVec,outFile)
    continue

  # Fetching footprints
  try: footprints = fpBam.fetch(reference=chrName, start=p1, end=p2)
  except Exception:
    print "Exception FOOTPRINTS raised in "+line
    writeOutput(ll,regionTagCount,resVec,outFile)
    continue

  # Best mpbs
  maxPos = -99999
  maxValue = globalMin
示例#39
0
            if (not r.is_reverse):
                p1 = r.pos - (k_nb / 2) - 1  # The -1 is because He is wrong
            else:
                p1 = r.aend - (k_nb / 2) + 1  # The +1 is because He is wrong
            p2 = p1 + k_nb

            # Verifying PCR artifacts
            if (p1 == prevPos): trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if (trueCounter > maxDuplicates): continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(chrom, p1, p2)).upper()
            except Exception:
                continue
            if (r.is_reverse): currStr = str(Seq(currStr).reverse_complement())

            # Counting k-mer in dictionary
            try:
                obsDict[currStr] += 1
            except Exception:
                obsDict[currStr] = 1
            if (not r.is_reverse):
                try:
                    obsDictF[currStr] += 1
                except Exception:
                    obsDictF[currStr] = 1
            else:
示例#40
0
        vectorTable3.append(chrom)
        start = int(coordVec[1])
        vectorTable1.append(str(start))
        vectorTable2.append(str(start))
        vectorTable3.append(str(start))
        end = int(coordVec[2])
        vectorTable1.append(str(end))
        vectorTable2.append(str(end))
        vectorTable3.append(str(end))
        regionLength = end - start
        vectorTable1.append(str(regionLength))
        vectorTable2.append(str(regionLength))
        vectorTable3.append(str(regionLength))

        # %CG content
        sequence = str(genomeFile.fetch(chrom, start, end)).upper()
        cgFreq = 0
        for character in sequence:
            if (character == "C" or character == "G"): cgFreq += 1
        vectorTable1.append(str(round(float(cgFreq) / regionLength, 2)))

        # CTCF status
        higherScore = -999.
        higherStrand = "NA"
        for k in ctcfIndexList:
            motifFile = motifFileList[k]
            motifFetch = motifFile.fetch(chrom, start, end)
            for read in motifFetch:
                rr = read.qname.split(":")
                motifScore = float(rr[1])
                if (motifScore > higherScore):
示例#41
0
class VariantAnnotator(object):
  """Annotate genomic variants against a transcript database and reference FASTA.

  Python 2 code.  On construction it loads cytogenetic bands and transcripts
  from the gene database into per-chromosome interval trees; `annotate` then
  classifies a variant (chrom, ref_start, ref_end, variant sequence) into
  gene-relative evidence records (CDS / UTR / intronic / up-/downstream /
  intergenic, synonymous vs non-synonymous, splice proximity, frameshift).
  """
  def __init__(self, gene_db, reference_fasta):
    """Load reference, cytobands, and transcripts into interval trees."""
    self.reference  = Fastafile(reference_fasta)
    self.con        = open_genedb(gene_db)
    # LRU cache of decoded gene parts, keyed by gene.id (see decode_gene).
    self.gene_cache = OrderedDict()

    # chrom -> IntervalTree of cytoband records.
    self.band_map = band_map = defaultdict(IntervalTree)
    for band in get_cytobands(self.con):
      band_map[band.chrom].insert(band.start,band.end,band)
      # Alias the 'chr'-less name ('1' for 'chr1') to the same tree so
      # both chromosome naming conventions resolve identically.
      if band.chrom.startswith('chr') and band.chrom[3:] not in band_map:
        band_map[band.chrom[3:]] = band_map[band.chrom]

    trans = get_transcripts(self.con)
    trans = progress_loop(trans, label='Loading transcripts: ', units='transcripts')

    # chrom -> IntervalTree of transcripts keyed on [txStart, txEnd).
    self.feature_map = feature_map = defaultdict(IntervalTree)
    for gene in trans:
      feature_map[gene.chrom].insert(gene.txStart,gene.txEnd,gene)

      if 0: # DEBUG
        parts = self.decode_gene(gene)
        for part in parts:
          if part.type not in ('intron','UTR5','UTR3','UTR') and '_' not in part.chrom:
            print '\t'.join(map(str,[part.chrom,part.start,part.end,gene.symbol]))

    sys.stderr.write('Loading complete.\n')


  def decode_gene(self,gene):
    """Return an IntervalTree of the gene's parts (CDS/UTR/intron), LRU-cached.

    A cache hit pops the entry and re-inserts it at the end (LRU touch);
    a miss decodes the gene and evicts the oldest entry once the cache
    holds 300 genes.
    """
    gene_cache = self.gene_cache
    key = gene.id

    try:
      parts = gene_cache.pop(key)
    except KeyError:
      partlist = list(decode_gene(gene))

      parts = IntervalTree()
      for part in partlist:
        parts.insert(part.start,part.end,part)

      # popitem(0) -> popitem(last=False): evict the least-recently-used entry.
      if len(gene_cache)>=300:
        gene_cache.popitem(0)

    # Add result to end of LRU
    gene_cache[key] = parts

    return parts


  def annotate(self, chrom, ref_start, ref_end, variant, nsonly=False):
    """Classify a variant; return a list of GeneEvidence records.

    `variant` is the alternate sequence ('-' characters are stripped, so
    deletions may be spelled with dashes).  When no overlapping transcript
    is found, genes within 2 kb are reported as UP-/DOWNSTREAM_GENE,
    otherwise the variant is labelled 'intergenic'.  NOTE(review): the
    `nsonly` flag is currently a no-op — the filtering code is commented out.
    """
    variant = variant.replace('-','')

    ref_nuc = self.reference.fetch(chrom,ref_start,ref_end).upper()
    var_nuc = variant.upper()

    evidence = []
    for feature in self.feature_map[chrom].find(ref_start, ref_end):
      evidence.extend( self.classify_feature(feature.value, ref_start, ref_end, ref_nuc, var_nuc) )

    #ns = any('NON-SYNONYMOUS' in e[3] for e in evidence)
    #if nsonly and not ns:
    #  return []

    # If not in a gene, check to see if there are any genes nearby
    if not evidence:
      five_prime  = set()
      three_prime = set()

      for feature in self.feature_map[chrom].find(ref_start-2000, ref_end+2000):
        gene = feature.value
        # XOR flips the upstream/downstream call for minus-strand genes.
        if (0<ref_end-gene.txStart<=2000) ^ (gene.strand=='-'):
          five_prime.add(gene)
        else:
          three_prime.add(gene)

      for gene in five_prime:
        evidence.append( ['UPSTREAM_GENE',gene,'',False,'','',ref_nuc,var_nuc,'',''] )

      for gene in three_prime:
        evidence.append( ['DOWNSTREAM_GENE',gene,'',False,'','',ref_nuc,var_nuc,'',''] )

    if not evidence:
      evidence.append( ['intergenic','','',False,'','',ref_nuc,var_nuc,'',''] )

    evidence = group_evidence(evidence)
    cytoband = cytoband_name(self.band_map[chrom].find_values(ref_start,ref_end))
    context  = [ chrom,cytoband,ref_start,ref_end ]

    if 0: # evidence:
      print
      for e in evidence:
        values = context+e
        for f,v in zip(GeneEvidence.__slots__,values):
          print '%15s = %s' % (f,v)
        print

    evidence = [ GeneEvidence._make(context+e) for e in evidence ]

    return evidence


  def classify_feature(self, gene, ref_start, ref_end, ref_nuc, var_nuc):
    """Classify a variant against one transcript's parts.

    Returns a list with one evidence row.  A variant wholly inside a single
    CDS exon is delegated to classify_exonic_variant; any CDS overlap is
    otherwise conservatively NON-SYNONYMOUS; proximity (within 5 bp) to a
    CDS/UTR boundary adds POSSIBLE_INTRONIC_SPLICE_VARIANT.
    """
    gene_parts = self.decode_gene(gene)

    intersect = defaultdict(list)
    for part in gene_parts.find_values(ref_start, ref_end):
      intersect[part.type].append(part)

    evidence = []

    parts    = set(intersect)
    mut_type = set()

    # Look 5 bp beyond the variant for nearby exon boundaries.
    for splice in gene_parts.find_values(ref_start-5,ref_end+5):
      if splice.type=='CDS' or 'UTR' in splice.type:
        if (0<splice.start-ref_end<=5) or (0<ref_start-splice.end<=5):
          mut_type.add('POSSIBLE_INTRONIC_SPLICE_VARIANT')

    parts    = ','.join(sorted(parts))
    mut_type = ','.join(sorted(mut_type))

    if len(intersect)==1 and len(intersect['CDS'])==1:
      e = self.classify_exonic_variant(gene, gene_parts, intersect['CDS'][0],
                                       ref_start, ref_end, ref_nuc, var_nuc)
      evidence.append(e)
    elif len(intersect['CDS']):
      evidence.append([parts,gene,'',True,'NON-SYNONYMOUS',mut_type,ref_nuc,var_nuc,'',''])
    elif mut_type:
      evidence.append([parts,gene,'',True,'PREDICTED-DISRUPT-TRANSCRIPT',mut_type,ref_nuc,var_nuc,'',''])
    elif len(intersect['UTR5'])+len(intersect['UTR3']):
      evidence.append([parts,gene,'',False,'UNKNOWN-UTR',mut_type,ref_nuc,var_nuc,'',''])
    elif len(intersect['intron']):
      evidence.append([parts,gene,'',False,'UNKNOWN-INTRONIC',mut_type,ref_nuc,var_nuc,'',''])
    else:
      evidence.append([parts,gene,'',False,'UNKNOWN-INTERGENIC',mut_type,ref_nuc,var_nuc,'',''])

    return evidence


  def classify_exonic_variant(self, gene, gene_parts, cds, ref_start, ref_end, ref_nuc, var_nuc):
    """Classify a variant contained in one CDS exon.

    Rebuilds the full coding sequence from the reference, splices in the
    variant, translates both, and reports SYNONYMOUS / NON-SYNONYMOUS with
    mutation-type flags (SUBSTITUTION/INSERTION/DELETION, FRAMESHIFT,
    splice proximity, PREMATURE_STOP, LOSS_OF_STOP).
    """
    result = ['CDS',gene,'mRNA=%s:protein=%s:exon=%d:strand=%s' % \
                         (gene.mRNA,gene.protein,cds.exon_num,gene.strand)]

    # Variant coordinates relative to this CDS exon's start.
    exon_start = ref_start - cds.start
    exon_end   = ref_end   - cds.start

    # FIXME: Report ref and var nuc relative to gene strand

    var_nuc = var_nuc.upper()

    #print gene.chrom,ref_start,ref_end,ref_nuc,var_nuc
    #assert len(ref_nuc)==(ref_end-ref_start)

    if ref_nuc==var_nuc:
      result += [False,'SYNONYMOUS','REFERENCE',ref_nuc,var_nuc,'','']
      return result

    ref_frame  = len(ref_nuc)%3
    var_frame  = len(var_nuc)%3
    frameshift = (len(ref_nuc)-len(var_nuc))%3

    if 0:
      print '  REF_FRAME: %d' % ref_frame
      print '  VAR_FRAME: %d' % var_frame

    mut_type = []

    if len(ref_nuc)==len(var_nuc):
      mut_type.append('SUBSTITUTION')
    elif len(ref_nuc)>len(var_nuc):
      mut_type.append('DELETION')
    else:
      mut_type.append('INSERTION')

    # Within 5 bp of either exon edge: flag potential splice disruption.
    if exon_start<5:
      mut_type.append('POSSIBLE-SPLICE5')
    if cds.end-exon_end<5:
      mut_type.append('POSSIBLE-SPLICE3')

    # Length change that alters the reading frame: no translation attempted.
    if ref_frame!=var_frame:
      mut_type.append('FRAMESHIFT')
      mut_type = ','.join(sorted(mut_type))
      result += [True,'NON-SYNONYMOUS',mut_type,ref_nuc,var_nuc,'','']
      return result

    # FIXME: Request 100 bases beyond end of transcription
    # Concatenate all CDS exons from the reference; ref_var_start becomes
    # the variant's offset within the assembled coding sequence.
    ref_var_start = 0
    ref_cds_seq   = []
    for part in gene_parts:
      if part.type=='CDS':
        seq = Seq(self.reference.fetch(part.chrom,part.start,part.end))
        #assert len(seq)==(end-start)
        ref_cds_seq.append(seq)
        if part.cds_index<cds.cds_index:
          ref_var_start += len(seq)
        elif part.cds_index==cds.cds_index:
          ref_var_start += exon_start

    #assert ref_nuc==str(ref_cds_seq[cds.cds_index][exon_start:exon_end]).upper()

    if 0:
      print '  CDS  : %d-%d' % (cds.start,cds.end)
      print '  VAR  : %d-%d' % (ref_start,ref_end)
      print '  LOCAL: %d-%d (size=%d)' % (exon_start,exon_end,len(ref_cds_seq[cds.cds_index]))

    # Splice the variant sequence into a copy of the affected exon.
    var_cds_seq = ref_cds_seq[:]

    v = list(var_cds_seq[cds.cds_index])
    v[exon_start:exon_end] = list(var_nuc)
    var_cds_seq[cds.cds_index] = ''.join(v)

    ref_cds = Seq(''.join(str(s) for s in ref_cds_seq))
    var_cds = Seq(''.join(str(s) for s in var_cds_seq))

    # Minus-strand genes: work on the reverse complement of everything.
    if gene.strand=='-':
      ref_var_start = len(ref_cds)-ref_var_start-1
      ref_cds       = ref_cds.reverse_complement()
      var_cds       = var_cds.reverse_complement()
      ref_cds_nuc   = str(Seq(ref_nuc).reverse_complement())
      var_cds_nuc   = str(Seq(var_nuc).reverse_complement())
    else:
      ref_cds_nuc   = ref_nuc
      var_cds_nuc   = var_nuc

    try:
      ref_cds_aa = ref_cds.translate()
      var_cds_aa = var_cds.translate()
    except TranslationError:
      mut_type.append('INVALID_TRANSLATION')
      mut_type = ','.join(sorted(mut_type))
      result += [True,'PRESUMED_NON-SYNONYMOUS',mut_type,ref_cds_nuc,var_cds_nuc,'','']
      return result

    # Strip the common prefix/suffix to isolate the changed residues.
    ref_aa,var_aa,aa_position = reduce_match(str(ref_cds_aa),str(var_cds_aa))

    if not ref_aa and not var_aa:
      # Protein unchanged: report the codon(s) spanning the variant.
      mut_type = ','.join(sorted(mut_type))

      codon_start  = ref_var_start-ref_var_start%3
      codon_end    = ref_var_start+len(ref_nuc)
      if codon_end%3:
        codon_end += 3-codon_end%3

      aa_position = codon_start//3
      ref_frame   = ref_cds[codon_start:codon_end]
      ref_aa      = ref_frame.translate()

      #assert len(ref_aa)

      result[-1] += ':aa=%d' % (aa_position+1)
      result += [False,'SYNONYMOUS',mut_type,ref_cds_nuc,var_cds_nuc,str(ref_aa),str(ref_aa)]
      return result

    # Classify non-synonymous change by comparing AA sequences

    # Make sure ref protein doesn't appear to have spurious stops

    r = ref_cds_aa.rstrip('*')
    v = var_cds_aa.rstrip('*')

    ref_stop = r.find('*')
    var_stop = v.find('*')

    if ref_stop==-1:
      if var_stop!=-1 and not v.startswith(r):
        mut_type.append('PREMATURE_STOP')
      elif ref_cds_aa[-1]=='*' and var_cds_aa[-1]!='*':
        mut_type.append('LOSS_OF_STOP')

    if 0:
      print '  REF_NUC:',ref_cds_nuc
      print '  VAR_NUC:',var_cds_nuc
      print '   REF_AA:',ref_aa
      print '   VAR_AA:',var_aa
      #print '  NUC_DIFF:',levenshtein_sequence(str(ref_cds),str(var_cds))
      #print '  AA_DIFF: ',levenshtein_sequence(str(ref_aa), str(var_aa) )

      ref_size = ref_end-ref_start
      cds_size = len(ref_cds)
      print '  CDS_SIZE=%d (%.1f codons)' % (cds_size,cds_size/3.0)
      print '  CDS SEQ=%s' % ref_cds

      assert not ref_cds or str(ref_cds[:3])=='ATG'

    mut_type = ','.join(sorted(mut_type))
    result[-1] += ':aa=%d' % (aa_position+1)
    result     += [True,'NON-SYNONYMOUS',mut_type,ref_cds_nuc,var_cds_nuc,str(ref_aa),str(var_aa)]

    return result
示例#42
0
def estimate_bias_pwm(args):
    """Estimate strand-specific k-mer cleavage-bias tables from a BAM file.

    For every region in ``args.regions_file`` this function:
      * counts, per strand, the nucleotide frequencies of the k-mer window
        centered on each read's cut site (observed PFMs);
      * counts the nucleotide frequencies of every k-mer of the region's
        sequence and of its reverse complement (expected PFMs).
    The four PFMs are written as ``.pfm`` files and rendered as weblogo
    PDFs, then a per-k-mer bias value obs/exp (ratio of PWM scores) is
    computed for both strands and written out via ``write_table``.

    Fields read from ``args``: reads_file, organism, regions_file, k_nb,
    forward_shift, reverse_shift, output_location, output_prefix.
    """
    # Reads whose k-mer window starts at the same position more than this
    # many consecutive times are treated as PCR duplicates and skipped.
    max_duplicates = 100

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    def _fresh_pwm():
        # One row of k_nb zero counts per nucleotide ("N" catches ambiguous bases).
        return dict((base, [0.0] * args.k_nb) for base in ("A", "C", "G", "T", "N"))

    obs_f_pwm_dict = _fresh_pwm()
    exp_f_pwm_dict = _fresh_pwm()
    obs_r_pwm_dict = _fresh_pwm()
    exp_r_pwm_dict = _fresh_pwm()

    # Iterating on HS regions
    for region in regions:
        prev_pos = -1
        true_counter = 0

        # --- Observed frequencies: one k-mer per read, centered on its cut site
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
            else:
                cut_site = r.aend + args.reverse_shift + 1
            p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Skip likely PCR artifacts: too many reads piling on one position.
            if p1 == prev_pos:
                true_counter += 1
            else:
                prev_pos = p1
                true_counter = 0
            if true_counter > max_duplicates:
                continue

            # Fetch the k-mer; windows falling off the contig are skipped.
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse:
                currStr = AuxiliaryFunctions.revcomp(currStr)

            # Count the k-mer into the strand-matching observed PFM.
            obs_dict = obs_r_pwm_dict if r.is_reverse else obs_f_pwm_dict
            for pos, base in enumerate(currStr):
                obs_dict[base][pos] += 1

        # --- Expected frequencies: every k-mer of the region's sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue

        # Slide a k-mer window over the sequence. (The original reused the
        # loop variable ``i`` for the outer and inner loops; renamed here.)
        for offset in range(0, len(currStr) - args.k_nb):
            kmer = currStr[offset:offset + args.k_nb]
            for pos, base in enumerate(kmer):
                exp_f_pwm_dict[base][pos] += 1
            # The reverse strand sees the reverse complement of the k-mer.
            kmer_rc = AuxiliaryFunctions.revcomp(kmer)
            for pos, base in enumerate(kmer_rc):
                exp_r_pwm_dict[base][pos] += 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # --- Write the four PFMs (A/C/G/T rows of space-separated integer counts)
    pfm_dir = os.path.join(args.output_location, "pfm")
    if not os.path.isdir(pfm_dir):  # replaces fragile os.system("mkdir -p ...")
        os.makedirs(pfm_dir)
    pwm_obs_f = os.path.join(pfm_dir, "obs_{}_f.pfm".format(str(args.k_nb)))
    pwm_obs_r = os.path.join(pfm_dir, "obs_{}_r.pfm".format(str(args.k_nb)))
    pwm_exp_f = os.path.join(pfm_dir, "exp_{}_f.pfm".format(str(args.k_nb)))
    pwm_exp_r = os.path.join(pfm_dir, "exp_{}_r.pfm".format(str(args.k_nb)))

    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]
    pwm_file_list = [pwm_obs_f, pwm_obs_r, pwm_exp_f, pwm_exp_r]

    for pwm_dict, pwm_path in zip(pwm_dict_list, pwm_file_list):
        with open(pwm_path, "w") as pwm_file:
            for base in ["A", "C", "G", "T"]:
                pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[base]]) + "\n")

    # Re-read the PFMs as Biopython motifs. Handles are closed explicitly
    # here (the original leaked the file objects passed to motifs.read).
    with open(pwm_obs_f) as handle:
        motif_obs_f = motifs.read(handle, "pfm")
    with open(pwm_obs_r) as handle:
        motif_obs_r = motifs.read(handle, "pfm")
    with open(pwm_exp_f) as handle:
        motif_exp_f = motifs.read(handle, "pfm")
    with open(pwm_exp_r) as handle:
        motif_exp_r = motifs.read(handle, "pfm")

    # --- Render sequence logos
    logo_dir = os.path.join(args.output_location, "logo")
    if not os.path.isdir(logo_dir):
        os.makedirs(logo_dir)
    logo_obs_f = os.path.join(logo_dir, "obs_{}_f.pdf".format(str(args.k_nb)))
    logo_obs_r = os.path.join(logo_dir, "obs_{}_r.pdf".format(str(args.k_nb)))
    logo_exp_f = os.path.join(logo_dir, "exp_{}_f.pdf".format(str(args.k_nb)))
    logo_exp_r = os.path.join(logo_dir, "exp_{}_r.pdf".format(str(args.k_nb)))

    motif_obs_f.weblogo(logo_obs_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_obs_r.weblogo(logo_obs_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_exp_f.weblogo(logo_exp_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)
    motif_exp_r.weblogo(logo_exp_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)

    # --- Bias dictionary: per-k-mer ratio of observed vs. expected PWM score
    alphabet = ["A", "C", "G", "T"]
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
    bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
    for k_mer in k_mer_comb:
        obs_f = get_ppm_score(k_mer, motif_obs_f.pwm, args.k_nb)
        exp_f = get_ppm_score(k_mer, motif_exp_f.pwm, args.k_nb)
        bias_table_F[k_mer] = round(obs_f / exp_f, 6)
        obs_r = get_ppm_score(k_mer, motif_obs_r.pwm, args.k_nb)
        exp_r = get_ppm_score(k_mer, motif_exp_r.pwm, args.k_nb)
        bias_table_R[k_mer] = round(obs_r / exp_r, 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
示例#43
0
def snp_workflow(ex,
                 job,
                 assembly,
                 minsnp=40.,
                 mincov=5,
                 path_to_ref=None,
                 via='local',
                 logfile=sys.stdout,
                 debugfile=sys.stderr):
    """Main function of the workflow: call and annotate SNPs per sample group.

    Pipeline stages:
      1. Per group and per chromosome, launch a non-blocking pileup job
         producing a vcf; archive all vcfs as a tar.gz.
      2. Merge SNP info across samples (``all_snps``) and annotate exonic
         SNPs (``exon_snps``), writing two tab-separated summary files.
      3. Build a per-sample SNP alignment table and export it.
      4. Create UCSC bed tracks and, when ``job.options['make_bigwigs']``
         is set, per-group coverage / heterozygosity / quality bigWigs.

    :param ex: execution context; outputs are registered via ``ex.add()``.
    :param job: job description holding the sample groups and BAM files.
    :param assembly: genome assembly (``fasta_by_chrom``, ``chrmeta``, ``name``).
    :param minsnp: minimum SNP threshold forwarded (as float) to ``all_snps``.
    :param mincov: minimum coverage forwarded to ``all_snps``.
    :param path_to_ref: NOTE(review): accepted but never used in this body.
    :param via: submission mode for the non-blocking pileup jobs.
    :param logfile: stream for progress messages (flushed after each write).
    :param debugfile: stream handed to the SNP helpers for debug output.
    :returns: 0 on completion.
    """
    ref_genome = assembly.fasta_by_chrom
    # Group display names, ordered by group id.
    sample_names = [
        job.groups[gid]['name'] for gid in sorted(job.files.keys())
    ]

    logfile.write("\n* Generate vcfs for each chrom/group\n")
    logfile.flush()
    vcfs = dict((chrom, {}) for chrom in ref_genome.keys())  # {chr: {}}
    bams = {}
    # Launch the jobs
    # Use an arbitrary input BAM as template to build a header whose sequence
    # names are translated to the assembly's "ac" identifiers (presumably
    # accession names — TODO confirm against assembly.chrmeta contents).
    bam = Samfile(job.files.values()[0].values()[0]['bam'])
    header = bam.header
    headerfile = unique_filename_in()
    for h in header["SQ"]:
        if h["SN"] in assembly.chrmeta:
            h["SN"] = assembly.chrmeta[h["SN"]]["ac"]
    # "wh" mode: write the (translated) header out as text SAM.
    head = Samfile(headerfile, "wh", header=header)
    head.close()
    for gid in job.files.keys():
        # Merge all bams belonging to the same group
        runs = [r['bam'] for r in job.files[gid].itervalues()]
        if len(runs) > 1:
            _b = merge_bam(ex, runs)
            index_bam(ex, _b)
            bams[gid] = _b
        else:
            index_bam(ex, runs[0])
            bams[gid] = runs[0]
        # Samtools mpileup + bcftools + vcfutils.pl
        for chrom, ref in ref_genome.iteritems():
            vcf = unique_filename_in()
            # Store (vcf filename, pending job); the job is waited on below.
            vcfs[chrom][gid] = (vcf,
                                pileup.nonblocking(ex,
                                                   bams[gid],
                                                   ref,
                                                   header=headerfile,
                                                   via=via,
                                                   stdout=vcf))
        logfile.write("  ...Group %s running.\n" % job.groups[gid]['name'])
        logfile.flush()
    # Wait for vcfs to finish and store them in *vcfs[chrom][gid]*
    for gid in job.files.keys():
        for chrom, ref in ref_genome.iteritems():
            vcfs[chrom][gid][1].wait()
            # Replace the (filename, job) pair by the filename alone.
            vcfs[chrom][gid] = vcfs[chrom][gid][0]
        logfile.write("  ...Group %s done.\n" % job.groups[gid]['name'])
        logfile.flush()
    # Targz the pileup files (vcf)
    tarname = unique_filename_in()
    tarfh = tarfile.open(tarname, "w:gz")
    for chrom, v in vcfs.iteritems():
        for gid, vcf in v.iteritems():
            tarfh.add(vcf,
                      arcname="%s_%s.vcf" % (job.groups[gid]['name'], chrom))
    tarfh.close()
    ex.add(tarname,
           description=set_file_descr("vcf_files.tar.gz",
                                      step="pileup",
                                      type="tar",
                                      view='admin'))

    logfile.write("\n* Merge info from vcf files\n")
    logfile.flush()
    # Two outputs: all SNPs, and exonic SNPs with amino-acid consequences.
    outall = unique_filename_in()
    outexons = unique_filename_in()
    with open(outall, "w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+ \
                                 ['gene','location_type','distance'])+'\n')
    with open(outexons, "w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+['exon','strand','ref_aa'] \
                                  + ['new_aa_'+s for s in sample_names])+'\n')
    # One growing base string per sample (plus the reference assembly).
    msa_table = dict((s, '') for s in [assembly.name] + sample_names)
    for chrom, v in vcfs.iteritems():
        logfile.write("  > Chromosome '%s'\n" % chrom)
        logfile.flush()
        # Put together info from all vcf files
        logfile.write("  - All SNPs\n")
        logfile.flush()
        allsnps = all_snps(ex, chrom, vcfs[chrom], bams, outall,
                           assembly, headerfile, sample_names, mincov,
                           float(minsnp), logfile, debugfile, via)
        # Annotate SNPs and check synonymy
        logfile.write("  - Exonic SNPs\n")
        logfile.flush()
        exon_snps(chrom, outexons, allsnps, assembly, sample_names, ref_genome,
                  logfile, debugfile)
        # Append one base per SNP row to each sample's alignment string;
        # "-" falls back to the reference base, anything non-ACGT becomes N.
        for snprow in allsnps:
            for n, k in enumerate([assembly.name] + sample_names):
                base = snprow[3 + n][0]
                if base == "-": base = snprow[3][0]
                if base not in 'ACGTacgt': base = "N"
                msa_table[k] += base
    description = set_file_descr("allSNP.txt", step="SNPs", type="txt")
    ex.add(outall, description=description)
    description = set_file_descr("exonsSNP.txt", step="SNPs", type="txt")
    ex.add(outexons, description=description)
    msafile = unique_filename_in()
    with open(msafile, "w") as msa:
        # Header line: number of sequences and alignment length
        # (phylip-like layout, apparently — sequences follow one per line).
        msa.write(" %i %i\n" % (len(msa_table), len(msa_table.values()[0])))
        for name, seq in msa_table.iteritems():
            msa.write("%s\t%s\n" % (name, seq))
    msa_table = {}
    description = set_file_descr("SNPalignment.txt", step="SNPs", type="txt")
    ex.add(msafile, description=description)
    # Create UCSC bed tracks
    logfile.write("\n* Create tracks\n")
    logfile.flush()
    create_tracks(ex, outall, sample_names, assembly)
    # Create quantitative tracks
    logfile.write("\n* Create heteroz. and quality tracks\n")
    logfile.flush()

    def _process_pileup(pileups, seq, startpos, endpos):
        # Collect per-position (coverage, heterozygosity, mean-quality)
        # triples from a pileup iterator, restricted to [startpos, endpos).
        # Returns three lists of (start, start+1, score) tuples.
        atoi = {'A': 0, 'C': 1, 'G': 2, 'T': 3}  # index 4 = unknown base
        vectors = ([], [], [])
        for pileupcolumn in pileups:
            position = pileupcolumn.pos
            if position < startpos: continue
            if position >= endpos: break
            coverage = pileupcolumn.n
            ref_symbol = seq[position - startpos]
            ref = atoi.get(ref_symbol, 4)
            symbols = [0, 0, 0, 0, 0]  # A/C/G/T/other counts at this column
            quality = 0
            for pileupread in pileupcolumn.pileups:
                # Reads whose query position falls past the stored sequence
                # are dropped from the coverage count.
                if pileupread.qpos >= len(pileupread.alignment.seq):
                    coverage -= 1
                else:
                    symbols[atoi.get(pileupread.alignment.seq[pileupread.qpos],
                                     4)] += 1
                    # Phred quality: ASCII value minus 33 offset.
                    quality += ord(
                        pileupread.alignment.qual[pileupread.qpos]) - 33
            # NOTE(review): if every read is discarded above, coverage can
            # reach 0 here and this division raises — confirm upstream
            # guarantees at least one valid read per column.
            quality = float(quality) / coverage
            info = heterozygosity(ref, symbols[0:4])
            if coverage > 0:
                vectors[0].append((position, position + 1, coverage))
            if info > 0: vectors[1].append((position, position + 1, info))
            if quality > 0:
                vectors[2].append((position, position + 1, quality))
#            yield (position, position+1, coverage, info, quality)
        return vectors

    if job.options.get('make_bigwigs', False):
        _descr = {
            'groupId': 0,
            'step': "tracks",
            'type': "bigWig",
            'ucsc': '1'
        }
        for gid, bamfile in bams.iteritems():
            _descr['groupId'] = gid
            bamtr = track(bamfile, format="bam")
            covname = unique_filename_in() + ".bw"
            out_cov = track(covname, chrmeta=assembly.chrmeta)
            hetname = unique_filename_in() + ".bw"
            out_het = track(hetname, chrmeta=assembly.chrmeta)
            qualname = unique_filename_in() + ".bw"
            out_qual = track(qualname, chrmeta=assembly.chrmeta)
            for chrom, cinfo in assembly.chrmeta.iteritems():
                fasta = Fastafile(ref_genome[chrom])
                #process fasta and bam by 10Mb chunks
                for chunk in range(0, cinfo["length"], 10**7):
                    fastaseq = fasta.fetch(cinfo['ac'], chunk, chunk + 10**7)
                    vecs = _process_pileup(
                        bamtr.pileup(chrom, chunk, chunk + 10**7), fastaseq,
                        chunk, chunk + 10**7)
                    out_cov.write(vecs[0],
                                  fields=['start', 'end', 'score'],
                                  chrom=chrom)
                    out_het.write(vecs[1],
                                  fields=['start', 'end', 'score'],
                                  chrom=chrom)
                    out_qual.write(vecs[2],
                                   fields=['start', 'end', 'score'],
                                   chrom=chrom)
            out_cov.close()
            out_het.close()
            out_qual.close()
            description = set_file_descr(
                job.groups[gid]['name'] + "_coverage.bw", **_descr)
            ex.add(covname, description=description)
            description = set_file_descr(
                job.groups[gid]['name'] + "_heterozygosity.bw", **_descr)
            ex.add(hetname, description=description)
            description = set_file_descr(
                job.groups[gid]['name'] + "_quality.bw", **_descr)
            ex.add(qualname, description=description)

    return 0