예제 #1
0
def get_pwm(fasta, regions, window_size):
    """Count nucleotides per position in a window centred on each region.

    For every region a window of ``2 * (window_size // 2)`` bases around the
    region midpoint is fetched from ``fasta``. Regions on the "-" strand use
    the reverse complement of the window (shifted by one base when
    ``window_size`` is odd). Regions whose window start is <= 0 and regions
    whose orientation is neither "+" nor "-" are skipped.

    Keyword arguments:
    fasta -- Object providing ``fetch(chrom, start, end)``.
    regions -- Iterable of objects with ``chrom``, ``initial``, ``final`` and
               ``orientation`` attributes.
    window_size -- Number of positions in each count vector.

    Return:
    Dictionary mapping "A", "C", "G", "T" and "N" to lists of counts.
    """
    pwm = {base: [0.0] * window_size for base in "ACGTN"}
    half_window = window_size // 2
    # The reverse-strand window is shifted by one base only for odd sizes.
    reverse_shift = window_size % 2

    for region in regions:
        middle = (region.initial + region.final) // 2
        p1 = middle - half_window
        p2 = middle + half_window

        if p1 <= 0:
            continue

        # Fetch only the strand that is actually counted for this region.
        if region.orientation == "+":
            dna_seq = str(fasta.fetch(region.chrom, p1, p2)).upper()
        elif region.orientation == "-":
            dna_seq = AuxiliaryFunctions.revcomp(
                str(fasta.fetch(region.chrom, p1 + reverse_shift,
                                p2 + reverse_shift)).upper())
        else:
            continue

        for i, base in enumerate(dna_seq):
            pwm[base][i] += 1

    return pwm
예제 #2
0
def update_pwm(pwm, fasta, region, p1, p2):
    """Add the nucleotides of one region's window to ``pwm`` in place.

    Keyword arguments:
    pwm -- Dictionary mapping nucleotide letters to per-position count lists.
    fasta -- Object providing ``fetch(chrom, start, end)``.
    region -- Object with ``chrom``, ``initial``, ``final`` and ``orientation``.
    p1, p2 -- Window boundaries passed to ``fasta.fetch``.

    Return: void. Regions whose orientation is neither "+" nor "-" leave
    ``pwm`` untouched.
    """
    # Only the strand selected by the orientation is fetched.
    if region.orientation == "+":
        dna_seq = str(fasta.fetch(region.chrom, p1, p2)).upper()
    elif region.orientation == "-":
        # Reverse-strand window is shifted by one base for odd-length regions.
        aux_plus = (region.final - region.initial) % 2
        dna_seq = AuxiliaryFunctions.revcomp(
            str(fasta.fetch(region.chrom, p1 + aux_plus, p2 + aux_plus)).upper())
    else:
        return

    for i, base in enumerate(dna_seq):
        pwm[base][i] += 1
예제 #3
0
def update_pwm(pwm, fasta, region, p1, p2):
    """Update the position count matrix ``pwm`` with one region's sequence.

    The window ``[p1, p2)`` is fetched from ``fasta`` for "+" regions. For
    "-" regions the reverse complement of the window is used, shifted by one
    base when ``region.final - region.initial`` is odd. Any other orientation
    is ignored.

    Keyword arguments:
    pwm -- Dictionary mapping nucleotide letters to per-position count lists
           (modified in place).
    fasta -- Object providing ``fetch(chrom, start, end)``.
    region -- Object with ``chrom``, ``initial``, ``final`` and ``orientation``.
    p1, p2 -- Window boundaries passed to ``fasta.fetch``.

    Return: void.
    """
    orientation = region.orientation
    if orientation not in ("+", "-"):
        return

    if orientation == "+":
        sequence = str(fasta.fetch(region.chrom, p1, p2)).upper()
    else:
        # Fetch the reverse-strand window only when it is really needed.
        shift = 0 if (region.final - region.initial) % 2 == 0 else 1
        sequence = AuxiliaryFunctions.revcomp(
            str(fasta.fetch(region.chrom, p1 + shift, p2 + shift)).upper())

    # Update pwm
    for position, nucleotide in enumerate(sequence):
        pwm[nucleotide][position] += 1
예제 #4
0
    def get_bc_signal_by_fragment_length(self, ref, start, end, bam, fasta, bias_table,
                                         forward_shift, reverse_shift, min_length=None, max_length=None,
                                         strand=True):
        """Return the bias-corrected cut-site signal for ``ref:start-end``.

        Reads from ``bam`` are converted to per-strand cut-site counts,
        optionally restricted by ``abs(read.template_length)``, and rescaled
        with the k-mer bias tables over a sliding 50-position window.

        Keyword arguments:
        ref -- Reference name passed to ``bam.fetch`` and ``fasta.fetch``.
        start, end -- Boundaries of the region of interest.
        bam -- Alignment object providing ``fetch(ref, start, end)``.
        fasta -- Sequence object providing ``fetch(ref, start, end)``.
        bias_table -- Pair ``(forward_table, reverse_table)`` mapping k-mers to
                      bias values; k is the length of a forward-table key.
        forward_shift, reverse_shift -- Offsets added to the read start / end
                                        to obtain the cut site.
        min_length -- If not None, only reads with fragment length strictly
                      greater than this value are counted.
        max_length -- If not None, only reads with fragment length lower than
                      or equal to this value are counted.
        strand -- If true, forward and reverse signals are returned separately.

        Return:
        ``(forward, reverse)`` numpy arrays when ``strand`` is true, otherwise
        their element-wise sum. When ``start``, the window start or the
        extended window end is <= 0, a plain list with the raw (unfiltered,
        strand-merged) cut counts is returned instead, regardless of
        ``strand``.
        NOTE(review): callers unpacking two values should confirm they handle
        that fallback shape.
        """
        # Parameters
        window = 50
        # Floor division keeps every offset an int on Python 2 and Python 3.
        half_window = window // 2
        defaultKmerValue = 1.0

        # Initialization
        fBiasDict = bias_table[0]
        rBiasDict = bias_table[1]
        k_nb = len(next(iter(fBiasDict)))
        half_k = k_nb // 2
        p1 = start
        p2 = end
        p1_w = p1 - half_window
        p2_w = p2 + half_window
        p1_wk = p1_w - half_k
        p2_wk = p2_w + half_k

        if p1 <= 0 or p1_w <= 0 or p2_wk <= 0:
            # Return raw counts (read from the same ``bam`` as the main path)
            signal = [0.0] * (p2 - p1)
            for read in bam.fetch(ref, p1, p2):
                if not read.is_reverse:
                    cut_site = read.pos + forward_shift
                else:
                    cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    signal[cut_site - p1] += 1.0

            return signal

        currStr = str(fasta.fetch(ref, p1_wk, p2_wk - 1)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + 1, p2_wk)).upper())

        # Iterating on sequence to create the bias signal
        signal_bias_f = []
        signal_bias_r = []
        seq_len = len(currStr)
        for i in range(half_k, seq_len - half_k + 1):
            fseq = currStr[i - half_k:i + half_k]
            rseq = currRevComp[seq_len - half_k - i:seq_len + half_k - i]
            # K-mers missing from a table get the neutral default bias.
            try:
                signal_bias_f.append(fBiasDict[fseq])
            except KeyError:
                signal_bias_f.append(defaultKmerValue)
            try:
                signal_bias_r.append(rBiasDict[rseq])
            except KeyError:
                signal_bias_r.append(defaultKmerValue)

        # Raw counts
        raw_f = [0.0] * (p2_w - p1_w)
        raw_r = [0.0] * (p2_w - p1_w)
        filter_by_length = min_length is not None or max_length is not None
        for read in bam.fetch(ref, p1_w, p2_w):
            if filter_by_length:
                fragment_length = abs(read.template_length)
                if min_length is not None and fragment_length <= min_length:
                    continue
                if max_length is not None and fragment_length > max_length:
                    continue
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1_w <= cut_site < p2_w:
                    raw_f[cut_site - p1_w] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1_w <= cut_site < p2_w:
                    raw_r[cut_site - p1_w] += 1.0

        # Smoothed counts (running sum over the window)
        Nf = []
        Nr = []
        fSum = sum(raw_f[:window])
        rSum = sum(raw_r[:window])
        fLast = raw_f[0]
        rLast = raw_r[0]
        for i in range(half_window, len(raw_f) - half_window):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast
            fSum += raw_f[i + half_window]
            fLast = raw_f[i - half_window + 1]
            rSum -= rLast
            rSum += raw_r[i + half_window]
            rLast = raw_r[i - half_window + 1]

        # Calculating bias-corrected signal
        fSum = sum(signal_bias_f[:window])
        rSum = sum(signal_bias_r[:window])
        fLast = signal_bias_f[0]
        rLast = signal_bias_r[0]
        bc_f = []
        bc_r = []
        for i in range(half_window, len(signal_bias_f) - half_window):
            nhatf = Nf[i - half_window] * (signal_bias_f[i] / fSum)
            nhatr = Nr[i - half_window] * (signal_bias_r[i] / rSum)
            bc_f.append(nhatf)
            bc_r.append(nhatr)
            fSum -= fLast
            fSum += signal_bias_f[i + half_window]
            fLast = signal_bias_f[i - half_window + 1]
            rSum -= rLast
            rSum += signal_bias_r[i + half_window]
            rLast = signal_bias_r[i - half_window + 1]

        if strand:
            return np.array(bc_f), np.array(bc_r)
        else:
            return np.add(np.array(bc_f), np.array(bc_r))
예제 #5
0
    def print_signal(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                     initial_clip=1000, per_norm=98, per_slope=98, bias_table=None, genome_file_name=None,
                     raw_signal_file=None, bc_signal_file=None, norm_signal_file=None, strand_specific=False):
        """Append raw, bias-corrected and/or normalized signals to wig files.

        Each requested output receives one ``fixedStep`` block for
        ``ref:start-end``. Nothing is computed for outputs whose file name is
        not given.

        Keyword arguments:
        ref -- Reference name of the region.
        start, end -- Region boundaries; the wig block starts at ``start + 1``.
        downstream_ext, upstream_ext -- Extensions forwarded to ``PileupRegion``.
        forward_shift, reverse_shift -- Offsets used to derive the cut site.
        initial_clip -- Upper bound applied to every raw signal value.
        per_norm, per_slope -- Accepted for interface compatibility; not read
                               by this method (the percentile is fixed at 98).
        bias_table -- Pair ``(forward_table, reverse_table)`` of k-mer biases;
                      needed for the bias-corrected and normalized outputs.
        genome_file_name -- Fasta file opened with ``Fastafile``; needed for
                            the bias-corrected and normalized outputs.
        raw_signal_file -- File receiving the clipped raw signal.
        bc_signal_file -- File receiving the bias-corrected signal.
        norm_signal_file -- File receiving the normalized signal.
        strand_specific -- If true, additional ``<prefix>_Forward`` and
                           ``<prefix>_Reverse`` files are written. The prefix
                           is the text before the first "." of
                           ``bc_signal_file`` (or of ``norm_signal_file`` when
                           no bias-corrected file was requested).

        Return: void.
        """

        def append_wig_block(file_name, values):
            # Append one fixedStep block for this region; NaNs are zeroed.
            with open(file_name, "a") as wig_file:
                wig_file.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                    [str(e) for e in nan_to_num(values)]) + "\n")

        def normalize(values):
            # Boyle normalization followed by Hon normalization (98th percentile).
            normalized = self.boyle_norm(values)
            perc = scoreatpercentile(normalized, 98)
            std = np.std(normalized)
            return self.hon_norm_atac(normalized, perc, std)

        if raw_signal_file:
            pileup_region = PileupRegion(start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift)
            if ps_version == "0.7.5":
                self.bam.fetch(reference=ref, start=start, end=end, callback=pileup_region)
            else:
                for alignment in self.bam.fetch(reference=ref, start=start, end=end):
                    pileup_region(alignment)
            raw_signal = array([min(e, initial_clip) for e in pileup_region.vector])
            append_wig_block(raw_signal_file, raw_signal)

        if bc_signal_file or norm_signal_file:
            # Parameters
            window = 50
            # Floor division keeps every offset an int on Python 2 and Python 3.
            half_window = window // 2
            defaultKmerValue = 1.0

            # Initialization
            fBiasDict = bias_table[0]
            rBiasDict = bias_table[1]
            k_nb = len(next(iter(fBiasDict)))
            half_k = k_nb // 2
            p1 = start
            p2 = end
            p1_w = p1 - half_window
            p2_w = p2 + half_window
            p1_wk = p1_w - half_k
            p2_wk = p2_w + half_k

            # Fetching sequence; the fasta handle is released even on failure
            fasta = Fastafile(genome_file_name)
            try:
                currStr = str(fasta.fetch(ref, p1_wk, p2_wk - 1)).upper()
                currRevComp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + 1, p2_wk)).upper())
            finally:
                fasta.close()

            # Iterating on sequence to create the bias signal
            signal_bias_f = []
            signal_bias_r = []
            seq_len = len(currStr)
            for i in range(half_k, seq_len - half_k + 1):
                fseq = currStr[i - half_k:i + half_k]
                rseq = currRevComp[seq_len - half_k - i:seq_len + half_k - i]
                # K-mers missing from a table get the neutral default bias.
                try:
                    signal_bias_f.append(fBiasDict[fseq])
                except KeyError:
                    signal_bias_f.append(defaultKmerValue)
                try:
                    signal_bias_r.append(rBiasDict[rseq])
                except KeyError:
                    signal_bias_r.append(defaultKmerValue)

            # Raw counts
            signal_raw_f = [0.0] * (p2_w - p1_w)
            signal_raw_r = [0.0] * (p2_w - p1_w)
            for read in self.bam.fetch(ref, p1_w, p2_w):
                if not read.is_reverse:
                    cut_site = read.pos + forward_shift
                    if p1_w <= cut_site < p2_w:
                        signal_raw_f[cut_site - p1_w] += 1.0
                else:
                    cut_site = read.aend + reverse_shift - 1
                    if p1_w <= cut_site < p2_w:
                        signal_raw_r[cut_site - p1_w] += 1.0

            # Smoothed counts (running sum over the window)
            Nf = []
            Nr = []
            fSum = sum(signal_raw_f[:window])
            rSum = sum(signal_raw_r[:window])
            fLast = signal_raw_f[0]
            rLast = signal_raw_r[0]
            for i in range(half_window, len(signal_raw_f) - half_window):
                Nf.append(fSum)
                Nr.append(rSum)
                fSum -= fLast
                fSum += signal_raw_f[i + half_window]
                fLast = signal_raw_f[i - half_window + 1]
                rSum -= rLast
                rSum += signal_raw_r[i + half_window]
                rLast = signal_raw_r[i - half_window + 1]

            # Calculating bias-corrected signal
            fSum = sum(signal_bias_f[:window])
            rSum = sum(signal_bias_r[:window])
            fLast = signal_bias_f[0]
            rLast = signal_bias_r[0]
            signal_bc = []
            signal_bc_f = []
            signal_bc_r = []
            for i in range(half_window, len(signal_bias_f) - half_window):
                nhatf = Nf[i - half_window] * (signal_bias_f[i] / fSum)
                nhatr = Nr[i - half_window] * (signal_bias_r[i] / rSum)
                signal_bc.append(nhatf + nhatr)
                signal_bc_f.append(nhatf)
                signal_bc_r.append(nhatr)
                fSum -= fLast
                fSum += signal_bias_f[i + half_window]
                fLast = signal_bias_f[i - half_window + 1]
                rSum -= rLast
                rSum += signal_bias_r[i + half_window]
                rLast = signal_bias_r[i - half_window + 1]

            if bc_signal_file:
                append_wig_block(bc_signal_file, signal_bc)

                if strand_specific:
                    # NOTE(review): the prefix is cut at the first "." of the whole
                    # path, so directories containing dots shorten it; kept as is.
                    prefix = bc_signal_file.split(".")[0]
                    append_wig_block(prefix + "_Forward" + ".bc.wig", signal_bc_f)
                    append_wig_block(prefix + "_Reverse" + ".bc.wig", signal_bc_r)

            if norm_signal_file:
                append_wig_block(norm_signal_file, normalize(signal_bc))

                if strand_specific:
                    # bc_signal_file may be None when only the normalized output
                    # was requested; fall back to the norm file's own prefix.
                    prefix = (bc_signal_file or norm_signal_file).split(".")[0]
                    signal_norm_f = normalize(signal_bc_f)
                    signal_norm_r = normalize(signal_bc_r)
                    append_wig_block(prefix + "_Forward" + ".norm.wig", signal_norm_f)
                    append_wig_block(prefix + "_Reverse" + ".norm.wig", signal_norm_r)
예제 #6
0
    def bias_correction_atac(self, bias_table, genome_file_name, chrName, start, end,
                             forward_shift, reverse_shift):
        """Return strand-specific bias-corrected cut counts for a region.

        Keyword arguments:
        bias_table -- Pair ``(forward_table, reverse_table)`` mapping k-mers to
                      bias values; k is the length of a forward-table key.
        genome_file_name -- Fasta file opened with ``Fastafile``.
        chrName -- Reference name passed to ``self.bam.fetch`` and the fasta.
        start, end -- Boundaries of the region of interest.
        forward_shift, reverse_shift -- Offsets added to the read start / end
                                        to obtain the cut site.

        Return:
        ``(forward, reverse)`` lists of bias-corrected values. When ``start``,
        the window start or the extended window end is <= 0, the raw
        per-strand cut counts of ``[start, end)`` are returned instead and the
        genome file is not opened.
        """

        # Parameters
        window = 50
        # Floor division keeps every offset an int on Python 2 and Python 3.
        half_window = window // 2
        defaultKmerValue = 1.0

        # Initialization
        fBiasDict = bias_table[0]
        rBiasDict = bias_table[1]
        k_nb = len(next(iter(fBiasDict)))
        k_floor = k_nb // 2       # floor(k / 2)
        k_ceil = k_nb - k_floor   # ceil(k / 2)
        p1 = start
        p2 = end
        p1_w = p1 - half_window
        p2_w = p2 + half_window
        p1_wk = p1_w - k_floor
        p2_wk = p2_w + k_ceil

        if p1 <= 0 or p1_w <= 0 or p2_wk <= 0:
            # Return raw counts
            nf = [0.0] * (p2 - p1)
            nr = [0.0] * (p2 - p1)
            for read in self.bam.fetch(chrName, p1, p2):
                if not read.is_reverse:
                    cut_site = read.pos + forward_shift
                    if p1 <= cut_site < p2:
                        nf[cut_site - p1] += 1.0
                else:
                    cut_site = read.aend + reverse_shift - 1
                    if p1 <= cut_site < p2:
                        nr[cut_site - p1] += 1.0

            return nf, nr

        # Raw counts
        nf = [0.0] * (p2_w - p1_w)
        nr = [0.0] * (p2_w - p1_w)
        for read in self.bam.fetch(chrName, p1_w, p2_w):
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1_w <= cut_site < p2_w:
                    nf[cut_site - p1_w] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1_w <= cut_site < p2_w:
                    nr[cut_site - p1_w] += 1.0

        # Smoothed counts (running sum over the window)
        Nf = []
        Nr = []
        fSum = sum(nf[:window])
        rSum = sum(nr[:window])
        fLast = nf[0]
        rLast = nr[0]
        for i in range(half_window, len(nf) - half_window):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast
            fSum += nf[i + half_window]
            fLast = nf[i - half_window + 1]
            rSum -= rLast
            rSum += nr[i + half_window]
            rLast = nr[i - half_window + 1]

        # Fetching sequence; the fasta handle is opened only when needed and
        # released even if fetching fails
        fastaFile = Fastafile(genome_file_name)
        try:
            currStr = str(fastaFile.fetch(chrName, p1_wk, p2_wk - 1)).upper()
            currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName, p1_wk + 1,
                                                                         p2_wk)).upper())
        finally:
            fastaFile.close()

        # Iterating on sequence to create signal
        af = []
        ar = []
        seq_len = len(currStr)
        for i in range(k_ceil, seq_len - k_floor + 1):
            fseq = currStr[i - k_floor:i + k_ceil]
            rseq = currRevComp[seq_len - k_ceil - i:seq_len + k_floor - i]
            # K-mers missing from a table get the neutral default bias.
            try:
                af.append(fBiasDict[fseq])
            except KeyError:
                af.append(defaultKmerValue)
            try:
                ar.append(rBiasDict[rseq])
            except KeyError:
                ar.append(defaultKmerValue)

        # Calculating bias-corrected signal
        fSum = sum(af[:window])
        rSum = sum(ar[:window])
        fLast = af[0]
        rLast = ar[0]
        bias_corrected_signal_forward = []
        bias_corrected_signal_reverse = []
        for i in range(half_window, len(af) - half_window):
            nhatf = Nf[i - half_window] * (af[i] / fSum)
            nhatr = Nr[i - half_window] * (ar[i] / rSum)
            bias_corrected_signal_forward.append(nhatf)
            bias_corrected_signal_reverse.append(nhatr)
            fSum -= fLast
            fSum += af[i + half_window]
            fLast = af[i - half_window + 1]
            rSum -= rLast
            rSum += ar[i + half_window]
            rLast = ar[i - half_window + 1]

        return bias_corrected_signal_forward, bias_corrected_signal_reverse
예제 #7
0
def bias_correction(chrom, start, end, bam, bias_table, genome_file_name,
                    forward_shift, reverse_shift):
    """Return the bias-corrected cut-site signal (both strands summed).

    Keyword arguments:
    chrom -- Reference name passed to ``bam.fetch`` and the fasta file.
    start, end -- Boundaries of the region of interest.
    bam -- Alignment object providing ``fetch(chrom, start, end)``.
    bias_table -- Pair ``(forward_table, reverse_table)`` mapping k-mers to
                  bias values; k is the length of a forward-table key.
    genome_file_name -- Fasta file opened with ``Fastafile``.
    forward_shift, reverse_shift -- Offsets added to the read start / end to
                                    obtain the cut site.

    Return:
    List with one value per position of ``[start, end)``. When any of the
    window boundaries is <= 0 the raw cut counts are returned instead and the
    genome file is not opened.
    """
    # Parameters
    window = 50
    half_window = window // 2
    defaultKmerValue = 1.0

    # Initialization
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    k_nb = len(next(iter(fBiasDict)))
    k_floor = k_nb // 2       # floor(k / 2)
    k_ceil = k_nb - k_floor   # ceil(k / 2)
    p1 = start
    p2 = end
    p1_w = p1 - half_window
    p2_w = p2 + half_window
    p1_wk = p1_w - k_floor
    p2_wk = p2_w + k_ceil
    if p1 <= 0 or p1_w <= 0 or p1_wk <= 0 or p2_wk <= 0:
        # Return raw counts
        bc_signal = [0.0] * (p2 - p1)
        for read in bam.fetch(chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
            else:
                cut_site = read.aend + reverse_shift - 1
            if p1 <= cut_site < p2:
                bc_signal[cut_site - p1] += 1.0

        return bc_signal

    # Raw counts
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for read in bam.fetch(chrom, p1_w, p2_w):
        # check if the read is unmapped, according to issue #112
        if read.is_unmapped:
            continue

        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            if p1_w <= cut_site < p2_w:
                nf[cut_site - p1_w] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if p1_w <= cut_site < p2_w:
                nr[cut_site - p1_w] += 1.0

    # Smoothed counts (running sum over the window)
    Nf = []
    Nr = []
    f_sum = sum(nf[:window])
    r_sum = sum(nr[:window])
    f_last = nf[0]
    r_last = nr[0]
    for i in range(half_window, len(nf) - half_window):
        Nf.append(f_sum)
        Nr.append(r_sum)
        f_sum -= f_last
        f_sum += nf[i + half_window]
        f_last = nf[i - half_window + 1]
        r_sum -= r_last
        r_sum += nr[i + half_window]
        r_last = nr[i - half_window + 1]

    # Fetching sequence; the fasta handle is opened only when needed and
    # released even if fetching fails
    fastaFile = Fastafile(genome_file_name)
    try:
        currStr = str(fastaFile.fetch(chrom, p1_wk, p2_wk - 1)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(
            str(fastaFile.fetch(chrom, p1_wk + 1, p2_wk)).upper())
    finally:
        fastaFile.close()

    # Iterating on sequence to create signal
    af = []
    ar = []
    seq_len = len(currStr)
    for i in range(k_ceil, seq_len - k_floor + 1):
        fseq = currStr[i - k_floor:i + k_ceil]
        rseq = currRevComp[seq_len - k_ceil - i:seq_len + k_floor - i]
        # K-mers missing from a table get the neutral default bias.
        try:
            af.append(fBiasDict[fseq])
        except KeyError:
            af.append(defaultKmerValue)
        try:
            ar.append(rBiasDict[rseq])
        except KeyError:
            ar.append(defaultKmerValue)

    # Calculating bias-corrected signal
    f_sum = sum(af[:window])
    r_sum = sum(ar[:window])
    f_last = af[0]
    r_last = ar[0]
    bc_signal = []
    for i in range(half_window, len(af) - half_window):
        nhatf = Nf[i - half_window] * (af[i] / f_sum)
        nhatr = Nr[i - half_window] * (ar[i] / r_sum)
        bc_signal.append(nhatf + nhatr)
        f_sum -= f_last
        f_sum += af[i + half_window]
        f_last = af[i - half_window + 1]
        r_sum -= r_last
        r_sum += ar[i + half_window]
        r_last = ar[i - half_window + 1]

    return bc_signal
예제 #8
0
    def load_gene_list(self, file_name, filter_havana=True, protein_coding=False, known_only=False):
        """Reads gene annotation in gtf (gencode) format. It populates self.gene_list with such entries.

        Blank lines and lines starting with "#" are ignored. If the file cannot
        be opened, an error message is printed and the program exits with
        status 1.

        *Keyword arguments:*

            - file_name -- The gencode .gtf file name.
            - filter_havana -- If true, entries whose source column is "HAVANA" are skipped.
            - protein_coding -- If true, only entries whose gene_type (and transcript_type,
              when present) is "protein_coding" are kept.
            - known_only -- If true, only entries whose gene_status (and transcript_status,
              when present) is "KNOWN" are kept.
        """
        # Opening GTF file
        try:
            gtf_file = open(file_name, "r")
        except (IOError, OSError):
            print("Error: Cannot find the annotation file: "+file_name)
            print("Please check the path in ~/rgtdata/data.config")
            sys.exit(1)

        addt_keys = ["gene_id", "transcript_id", "gene_type", "gene_status", "gene_name",
                     "transcript_type", "transcript_status", "transcript_name", "level"]

        # Reading GTF file; the handle is closed even if parsing fails
        with gtf_file:
            for line in gtf_file:

                # Processing line
                line = line.strip()
                if not line or line[0] == "#":
                    continue
                line_list = line.split("\t")
                if filter_havana and len(line_list) > 1 and line_list[1] == "HAVANA":
                    continue

                # Processing additional list of options
                addt_dict = dict()
                for addt_element in line_list[8].split(";"):
                    # Real lists are built because filter() is lazy on Python 3.
                    addt_element_list = [token for token in addt_element.split(" ") if token]
                    if len(addt_element_list) < 2:
                        # Empty fragment or key without a value: nothing to store.
                        continue
                    # Removing " symbol from string options
                    addt_dict[addt_element_list[0]] = addt_element_list[1].replace("\"", "")

                # filter non-protein-coding sequences, if required
                if protein_coding:
                    if addt_dict.get("gene_type") != "protein_coding":
                        continue
                    if "transcript_type" in addt_dict and addt_dict["transcript_type"] != "protein_coding":
                        continue

                # filter unknown sequences, if required
                if known_only:
                    if addt_dict.get("gene_status") != "KNOWN":
                        continue
                    if "transcript_status" in addt_dict and addt_dict["transcript_status"] != "KNOWN":
                        continue

                # Removing dot from IDs
                addt_dict["gene_id"] = addt_dict["gene_id"].split(".")[0]
                if "transcript_id" in addt_dict:
                    addt_dict["transcript_id"] = addt_dict["transcript_id"].split(".")[0]

                # Creating final version of additional arguments (None when absent)
                final_addt_list = [addt_dict.get(addt_key) for addt_key in addt_keys]

                # Handling score
                current_score = 0
                if AuxiliaryFunctions.string_is_int(line_list[5]):
                    current_score = AuxiliaryFunctions.correct_standard_bed_score(line_list[5])

                # Creating GenomicRegion (gtf start is 1-based, hence the -1)
                genomic_region = GenomicRegion(chrom=line_list[0],
                                               initial=int(line_list[3])-1,
                                               final=int(line_list[4]),
                                               orientation=line_list[6],
                                               data=current_score)

                # Creating final vector
                extra_index_elements = [[], []]  # One list for each: EXACT_GENE_MATCHES, INEXACT_GENE_MATCHES
                final_vector = [genomic_region, line_list[1], line_list[2], line_list[7]] \
                    + final_addt_list + extra_index_elements
                self.gene_list.append(final_vector)
예제 #9
0
    def load_gene_list(self, file_name, filter_havana=True):
        """
        Reads gene annotation in gtf (gencode) format. It populates self.gene_list with such entries.

        Blank lines and lines starting with "#" are ignored. If the file cannot
        be opened, the error raised by ``open`` propagates to the caller.

        Keyword arguments:
        file_name -- The gencode .gtf file name.
        filter_havana -- If true, entries whose source column is "HAVANA" are skipped.

        Return: void.
        """
        addt_keys = [
            "gene_id", "transcript_id", "gene_type", "gene_status",
            "gene_name", "transcript_type", "transcript_status",
            "transcript_name", "level"
        ]

        # Opening GTF file; the handle is closed even if parsing fails
        with open(file_name, "r") as gtf_file:

            # Reading GTF file
            for line in gtf_file:

                # Processing line
                line = line.strip()
                if not line or line[0] == "#":
                    continue
                line_list = line.split("\t")
                if filter_havana and line_list[1] == "HAVANA":
                    continue

                # Processing additional list of options
                addt_dict = dict()
                for addt_element in line_list[8].split(";"):
                    # Real lists are built because filter() is lazy on Python 3.
                    addt_element_list = [
                        token for token in addt_element.split(" ") if token
                    ]
                    if len(addt_element_list) < 2:
                        # Empty fragment or key without a value: nothing to store.
                        continue
                    # Removing " symbol from string options
                    addt_dict[addt_element_list[0]] = addt_element_list[
                        1].replace("\"", "")

                # Removing dot from IDs
                addt_dict["gene_id"] = addt_dict["gene_id"].split(".")[0]
                if "transcript_id" in addt_dict:
                    addt_dict["transcript_id"] = addt_dict[
                        "transcript_id"].split(".")[0]

                # Creating final version of additional arguments (None when absent)
                final_addt_list = [
                    addt_dict.get(addt_key) for addt_key in addt_keys
                ]

                # Handling score
                current_score = 0
                if AuxiliaryFunctions.string_is_int(line_list[5]):
                    current_score = AuxiliaryFunctions.correct_standard_bed_score(
                        line_list[5])

                # Creating GenomicRegion (gtf start is 1-based, hence the -1)
                genomic_region = GenomicRegion(chrom=line_list[0],
                                               initial=int(line_list[3]) - 1,
                                               final=int(line_list[4]),
                                               orientation=line_list[6],
                                               data=current_score)

                # Creating final vector
                extra_index_elements = [
                    [], []
                ]  # One list for each: EXACT_GENE_MATCHES, INEXACT_GENE_MATCHES
                final_vector = [
                    genomic_region, line_list[1], line_list[2], line_list[7]
                ] + final_addt_list + extra_index_elements
                self.gene_list.append(final_vector)
예제 #10
0
    def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb,
                       forward_shift, reverse_shift):
        """
        Estimates k-mer bias based on HS regions, DNase-seq signal and genomic sequences.

        For every region, the k-mer window around each read's shifted cut
        site is counted per strand ("observed"), and the k-mers of the region
        sequence and of its reverse complement are counted ("expected").
        The bias of a k-mer is its observed frequency divided by its expected
        frequency, both computed with a pseudocount of 1.

        Keyword arguments:
        regions -- DNase-seq HS regions (objects exposing chrom, initial, final).
        dnase_file_name -- DNase-seq file name; its extension must be "bam" (any case).
        genome_file_name -- Genome to fetch genomic sequences from.
        k_nb -- Length of the k-mers.
        forward_shift -- Offset added to r.pos for forward-strand reads.
        reverse_shift -- Offset added to r.aend for reverse-strand reads.

        Return:
        bias_table_F, bias_table_R -- Bias tables (as a two-element list),
        keyed by every A/C/G/T k-mer; None if the reads file is not a BAM file.
        A strand without reads, or an empty expected count, yields a neutral
        bias of 1 for every k-mer.
        """

        # Parameters
        maxDuplicates = 100
        pseudocount = 1.0

        # Only BAM input is supported
        if dnase_file_name.split(".")[-1].upper() != "BAM":
            return None  # TODO ERROR

        # Observed (per strand) and expected k-mer counts
        obsDictF = dict()
        obsDictR = dict()
        expDictF = dict()
        expDictR = dict()

        ct_reads_r = 0
        ct_reads_f = 0
        ct_kmers = 0

        # Number of bases of the window placed before the cut site
        half_k = k_nb // 2

        bamFile = Samfile(dnase_file_name, "rb")
        fastaFile = None
        try:
            fastaFile = Fastafile(genome_file_name)

            # Iterating on HS regions
            for region in regions:

                # Initialization
                prevPos = -1
                trueCounter = 0

                # Evaluating observed frequencies ############################
                for r in bamFile.fetch(region.chrom, region.initial,
                                       region.final):

                    # Calculating the k-mer window around the cut site
                    if r.is_reverse:
                        cut_site = r.aend + reverse_shift + 1
                    else:
                        cut_site = r.pos + forward_shift - 1
                    p1 = cut_site - half_k
                    p2 = p1 + k_nb

                    # Verifying PCR artifacts: consecutive reads sharing the
                    # same window start are dropped after maxDuplicates repeats
                    if p1 == prevPos:
                        trueCounter += 1
                    else:
                        prevPos = p1
                        trueCounter = 0
                    if trueCounter > maxDuplicates:
                        continue

                    # Fetching k-mer; windows the genome cannot serve are
                    # skipped on purpose (best effort)
                    try:
                        currStr = str(fastaFile.fetch(region.chrom, p1,
                                                      p2)).upper()
                    except Exception:
                        continue

                    # Counting k-mer in the dictionary of its strand
                    if r.is_reverse:
                        currStr = AuxiliaryFunctions.revcomp(currStr)
                        ct_reads_r += 1
                        obsDictR[currStr] = obsDictR.get(currStr, 0) + 1
                    else:
                        ct_reads_f += 1
                        obsDictF[currStr] = obsDictF.get(currStr, 0) + 1

                # Evaluating expected frequencies ############################
                # Fetching whole sequence
                try:
                    currStr = str(
                        fastaFile.fetch(region.chrom, region.initial,
                                        region.final)).upper()
                except Exception:
                    continue
                currRevComp = AuxiliaryFunctions.revcomp(currStr)

                # Iterating on each sequence position
                for i in range(0, len(currStr) - k_nb):
                    ct_kmers += 1
                    s = currStr[i:i + k_nb]
                    expDictF[s] = expDictF.get(s, 0) + 1

                    # Same window taken on the reverse complement
                    s = currRevComp[i:i + k_nb]
                    expDictR[s] = expDictR.get(s, 0) + 1
        finally:
            # Release both handles even when fetching or counting fails
            bamFile.close()
            if fastaFile is not None:
                fastaFile.close()

        # Creating bias dictionary
        alphabet = ["A", "C", "G", "T"]
        kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
        bias_table_F = dict([(e, 0.0) for e in kmerComb])
        bias_table_R = dict([(e, 0.0) for e in kmerComb])
        for kmer in kmerComb:
            obsF = obsDictF.get(kmer, 0) + pseudocount
            expF = expDictF.get(kmer, 0) + pseudocount
            # A zero denominator (no reads or no expected k-mers) means the
            # ratio is undefined; fall back to the neutral bias
            if ct_reads_f == 0 or ct_kmers == 0:
                bias_table_F[kmer] = 1
            else:
                bias_table_F[kmer] = round(
                    float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)

            obsR = obsDictR.get(kmer, 0) + pseudocount
            expR = expDictR.get(kmer, 0) + pseudocount
            if ct_reads_r == 0 or ct_kmers == 0:
                bias_table_R[kmer] = 1
            else:
                bias_table_R[kmer] = round(
                    float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

        # Return
        return [bias_table_F, bias_table_R]
예제 #11
0
    def estimate_table_pwm(self, regions, dnase_file_name, genome_file_name,
                           k_nb, forward_shift, reverse_shift):
        """
        Estimates bias from position weight matrices built on HS regions.

        Observed k-mers (windows around each read's shifted cut site) and
        expected k-mers (windows of the region sequence and of its reverse
        complement) are collected per strand, turned into motifs, written out
        as logos and PWM files, and finally used to score every A/C/G/T k-mer.
        K-mers containing 'N' are left out of all four collections.

        Keyword arguments:
        regions -- DNase-seq HS regions (objects exposing chrom, initial, final).
        dnase_file_name -- DNase-seq file name; its extension must be "bam" (any case).
        genome_file_name -- Genome to fetch genomic sequences from.
        k_nb -- Length of the k-mers.
        forward_shift -- Offset added to r.pos for forward-strand reads.
        reverse_shift -- Offset added to r.aend for reverse-strand reads.

        Return:
        bias_table_F, bias_table_R -- Bias tables (as a two-element list),
        or None if the reads file is not a BAM file.

        Side effects:
        Writes four logo PDFs under <output_loc>/Bias/logo and four PWM text
        files under <output_loc>/Bias/pwm.
        """

        # Only BAM input is supported
        if dnase_file_name.split(".")[-1].upper() != "BAM":
            return None  # TODO ERROR

        obsSeqsF = []
        obsSeqsR = []
        expSeqsF = []
        expSeqsR = []

        # Number of bases of the window placed before the cut site
        half_k = k_nb // 2

        bamFile = Samfile(dnase_file_name, "rb")
        fastaFile = None
        try:
            fastaFile = Fastafile(genome_file_name)

            # Iterating on HS regions
            for region in regions:
                # Evaluating observed frequencies
                for r in bamFile.fetch(region.chrom, region.initial,
                                       region.final):
                    # Calculating the k-mer window around the cut site
                    if r.is_reverse:
                        cut_site = r.aend + reverse_shift + 1
                    else:
                        cut_site = r.pos + forward_shift - 1
                    p1 = cut_site - half_k
                    p2 = p1 + k_nb

                    # Fetching k-mer; windows the genome cannot serve are
                    # skipped on purpose (best effort)
                    try:
                        currStr = str(fastaFile.fetch(region.chrom, p1,
                                                      p2)).upper()
                    except Exception:
                        continue
                    if r.is_reverse:
                        currStr = AuxiliaryFunctions.revcomp(currStr)

                    # Collecting the k-mer for its strand
                    if 'N' not in currStr:
                        if r.is_reverse:
                            obsSeqsR.append(Seq(currStr))
                        else:
                            obsSeqsF.append(Seq(currStr))

                # Evaluating expected frequencies
                # Fetching whole sequence
                try:
                    currStr = str(
                        fastaFile.fetch(region.chrom, region.initial,
                                        region.final)).upper()
                except Exception:
                    continue
                currRevComp = AuxiliaryFunctions.revcomp(currStr)

                # Iterating on each sequence position. The 'N' filter is
                # applied to each k-mer (not to the whole region), matching
                # the filter used for the observed k-mers above.
                for i in range(0, len(currStr) - k_nb):
                    s = currStr[i:i + k_nb]
                    if 'N' not in s:
                        expSeqsF.append(Seq(s))

                    # Same window taken on the reverse complement
                    s = currRevComp[i:i + k_nb]
                    if 'N' not in s:
                        expSeqsR.append(Seq(s))
        finally:
            # Release both handles even when fetching or collecting fails
            bamFile.close()
            if fastaFile is not None:
                fastaFile.close()

        obsMotifsF = motifs.create(obsSeqsF)
        obsMotifsR = motifs.create(obsSeqsR)
        expMotifsF = motifs.create(expSeqsF)
        expMotifsR = motifs.create(expSeqsR)

        obsPwmF = obsMotifsF.pwm
        obsPwmR = obsMotifsR.pwm
        expPwmF = expMotifsF.pwm
        expPwmR = expMotifsR.pwm

        # NOTE(review): every output file name, including the reverse-strand
        # ones, is tagged with forward_shift -- confirm this is intended.
        k_tag = str(k_nb)
        shift_tag = str(forward_shift)

        # Output logos (observed and expected motifs use different y scales)
        logo_dir = os.path.join(self.output_loc, "Bias", "logo")
        logo_jobs = [
            (obsMotifsF, "obs_{}_{}_f.pdf", 0.2, 0.1),
            (obsMotifsR, "obs_{}_{}_r.pdf", 0.2, 0.1),
            (expMotifsF, "exp_{}_{}_f.pdf", 0.02, 0.01),
            (expMotifsR, "exp_{}_{}_r.pdf", 0.02, 0.01),
        ]
        for motif, name_pattern, y_scale, y_tic in logo_jobs:
            logo_path = os.path.join(logo_dir,
                                     name_pattern.format(k_tag, shift_tag))
            motif.weblogo(logo_path,
                          format="pdf",
                          stack_width="large",
                          color_scheme="color_classic",
                          yaxis_scale=y_scale,
                          yaxis_tic_interval=y_tic)

        # Output pwms
        pwm_dir = os.path.join(self.output_loc, "Bias", "pwm")
        pwm_jobs = [
            (obsPwmF, "obs_{}_{}_f.pwm"),
            (obsPwmR, "obs_{}_{}_r.pwm"),
            (expPwmF, "exp_{}_{}_f.pwm"),
            (expPwmR, "exp_{}_{}_r.pwm"),
        ]
        for pwm_data, name_pattern in pwm_jobs:
            pwm_path = os.path.join(pwm_dir,
                                    name_pattern.format(k_tag, shift_tag))
            with open(pwm_path, "w") as f:
                f.write(str(pwm_data))

        # Creating bias dictionary: observed score over expected score
        alphabet = ["A", "C", "G", "T"]
        k_mer_comb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
        bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
        bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
        for k_mer in k_mer_comb:
            obsF = self.get_pwm_score(k_mer, obsPwmF, k_nb)
            expF = self.get_pwm_score(k_mer, expPwmF, k_nb)
            bias_table_F[k_mer] = round(obsF / expF, 6)
            obsR = self.get_pwm_score(k_mer, obsPwmR, k_nb)
            expR = self.get_pwm_score(k_mer, expPwmR, k_nb)
            bias_table_R[k_mer] = round(obsR / expR, 6)

        # Return
        return [bias_table_F, bias_table_R]
예제 #12
0
    def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb, shift):
        """ 
        Estimates k-mer bias based on HS regions, DNase-seq signal and genomic sequences.

        For every region, the k-mer window around each read's shifted end is
        counted per strand ("observed"), and the k-mers of the region
        sequence and of its reverse complement are counted ("expected").
        The bias of a k-mer is its observed frequency divided by its expected
        frequency, both computed with a pseudocount of 1.

        Keyword arguments:
        regions -- DNase-seq HS regions (objects exposing chrom, initial, final).
        dnase_file_name -- DNase-seq file name; its extension must be "bam" (any case).
        genome_file_name -- Genome to fetch genomic sequences from.
        k_nb -- Length of the k-mers.
        shift -- Offset added for forward-strand reads and subtracted for
                 reverse-strand reads when placing the k-mer window.
        
        Return:
        bias_table_F, bias_table_R -- Bias tables (as a two-element list),
        keyed by every A/C/G/T k-mer; None if the reads file is not a BAM file.
        A strand without reads, or an empty expected count, yields a neutral
        bias of 1 for every k-mer.
        """

        # Parameters
        maxDuplicates = 100
        pseudocount = 1.0

        # Only BAM input is supported
        if dnase_file_name.split(".")[-1].upper() != "BAM":
            return None  # TODO ERROR

        # Observed (per strand) and expected k-mer counts
        obsDictF = dict()
        obsDictR = dict()
        expDictF = dict()
        expDictR = dict()

        ct_reads_r = 0
        ct_reads_f = 0
        ct_kmers = 0

        # Floor division keeps the window coordinates integral on Python 3
        # as well (plain "/" would produce floats there)
        half_k = k_nb // 2

        bamFile = Samfile(dnase_file_name, "rb")
        fastaFile = None
        try:
            fastaFile = Fastafile(genome_file_name)

            # Iterating on HS regions
            for region in regions:

                # Initialization
                prevPos = -1
                trueCounter = 0

                # Evaluating observed frequencies ############################
                for r in bamFile.fetch(region.chrom, region.initial, region.final):

                    # Calculating positions
                    if r.is_reverse:
                        p1 = r.aend - half_k + 1 - shift
                    else:
                        p1 = r.pos - half_k - 1 + shift
                    p2 = p1 + k_nb

                    # Verifying PCR artifacts: consecutive reads sharing the
                    # same window start are dropped after maxDuplicates repeats
                    if p1 == prevPos:
                        trueCounter += 1
                    else:
                        prevPos = p1
                        trueCounter = 0
                    if trueCounter > maxDuplicates:
                        continue

                    # Fetching k-mer; windows the genome cannot serve are
                    # skipped on purpose (best effort)
                    try:
                        currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
                    except Exception:
                        continue

                    # Counting k-mer in the dictionary of its strand; each
                    # strand keeps its own read counter so that the
                    # normalisation below uses the matching total
                    if r.is_reverse:
                        currStr = AuxiliaryFunctions.revcomp(currStr)
                        ct_reads_r += 1
                        obsDictR[currStr] = obsDictR.get(currStr, 0) + 1
                    else:
                        ct_reads_f += 1
                        obsDictF[currStr] = obsDictF.get(currStr, 0) + 1

                # Evaluating expected frequencies ############################
                # Fetching whole sequence
                try:
                    currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
                except Exception:
                    continue
                currRevComp = AuxiliaryFunctions.revcomp(currStr)

                # Iterating on each sequence position
                for i in range(0, len(currStr) - k_nb):
                    ct_kmers += 1
                    s = currStr[i:i + k_nb]
                    expDictF[s] = expDictF.get(s, 0) + 1

                    # Same window taken on the reverse complement
                    s = currRevComp[i:i + k_nb]
                    expDictR[s] = expDictR.get(s, 0) + 1
        finally:
            # Release both handles even when fetching or counting fails
            bamFile.close()
            if fastaFile is not None:
                fastaFile.close()

        # Creating bias dictionary
        alphabet = ["A", "C", "G", "T"]
        kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
        bias_table_F = dict([(e, 0.0) for e in kmerComb])
        bias_table_R = dict([(e, 0.0) for e in kmerComb])
        for kmer in kmerComb:
            obsF = obsDictF.get(kmer, 0) + pseudocount
            expF = expDictF.get(kmer, 0) + pseudocount
            # A zero denominator (no reads or no expected k-mers) means the
            # ratio is undefined; fall back to the neutral bias
            if ct_reads_f == 0 or ct_kmers == 0:
                bias_table_F[kmer] = 1
            else:
                bias_table_F[kmer] = round(float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)

            obsR = obsDictR.get(kmer, 0) + pseudocount
            expR = expDictR.get(kmer, 0) + pseudocount
            if ct_reads_r == 0 or ct_kmers == 0:
                bias_table_R[kmer] = 1
            else:
                bias_table_R[kmer] = round(float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

        # Return
        return [bias_table_F, bias_table_R]
예제 #13
0
def estimate_bias_kmer(args):
    """
    Estimates per-strand k-mer bias tables and hands them to write_table.

    For every region read from args.regions_file, the k-mer window around
    each read's shifted cut site is counted per strand ("observed"), and the
    k-mers of the region sequence and of its reverse complement are counted
    ("expected"). The bias of a k-mer is its observed frequency divided by
    its expected frequency, both computed with a pseudocount of 1. A strand
    without reads, or an empty expected count, yields a neutral bias of 1.

    Keyword arguments:
    args -- Namespace providing reads_file, organism, regions_file, k_nb,
            forward_shift, reverse_shift, output_location and output_prefix.

    Return:
    None -- the tables [bias_table_F, bias_table_R] are passed to
    write_table together with args.output_location and args.output_prefix.
    """
    # Parameters
    maxDuplicates = 100
    pseudocount = 1.0
    k_nb = args.k_nb
    # Number of bases of the window placed before the cut site
    half_k = k_nb // 2

    # Observed (per strand) and expected k-mer counts
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()

    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    fastaFile = None
    try:
        genome_data = GenomeData(args.organism)
        fastaFile = Fastafile(genome_data.get_genome())
        regions = GenomicRegionSet("regions")
        regions.read(args.regions_file)

        # Iterating on HS regions
        for region in regions:

            # Initialization
            prevPos = -1
            trueCounter = 0

            # Evaluating observed frequencies ################################
            for r in bamFile.fetch(region.chrom, region.initial, region.final):

                # Calculating the k-mer window around the cut site
                if r.is_reverse:
                    cut_site = r.aend + args.reverse_shift + 1
                else:
                    cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - half_k
                p2 = p1 + k_nb

                # Verifying PCR artifacts: consecutive reads sharing the same
                # window start are dropped after maxDuplicates repeats
                if p1 == prevPos:
                    trueCounter += 1
                else:
                    prevPos = p1
                    trueCounter = 0
                if trueCounter > maxDuplicates:
                    continue

                # Fetching k-mer; windows the genome cannot serve are skipped
                # on purpose (best effort)
                try:
                    currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue

                # Counting k-mer in the dictionary of its strand
                if r.is_reverse:
                    currStr = AuxiliaryFunctions.revcomp(currStr)
                    ct_reads_r += 1
                    obsDictR[currStr] = obsDictR.get(currStr, 0) + 1
                else:
                    ct_reads_f += 1
                    obsDictF[currStr] = obsDictF.get(currStr, 0) + 1

            # Evaluating expected frequencies ################################
            # Fetching whole sequence
            try:
                currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception:
                continue
            currRevComp = AuxiliaryFunctions.revcomp(currStr)

            # Iterating on each sequence position
            for i in range(0, len(currStr) - k_nb):
                ct_kmers += 1
                s = currStr[i:i + k_nb]
                expDictF[s] = expDictF.get(s, 0) + 1

                # Same window taken on the reverse complement
                s = currRevComp[i:i + k_nb]
                expDictR[s] = expDictR.get(s, 0) + 1
    finally:
        # Release both handles even when fetching or counting fails
        bamFile.close()
        if fastaFile is not None:
            fastaFile.close()

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    bias_table_F = dict([(e, 0.0) for e in kmerComb])
    bias_table_R = dict([(e, 0.0) for e in kmerComb])
    for kmer in kmerComb:
        obsF = obsDictF.get(kmer, 0) + pseudocount
        expF = expDictF.get(kmer, 0) + pseudocount
        # A zero denominator (no reads or no expected k-mers) means the ratio
        # is undefined; fall back to the neutral bias
        if ct_reads_f == 0 or ct_kmers == 0:
            bias_table_F[kmer] = 1
        else:
            bias_table_F[kmer] = round(float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)

        obsR = obsDictR.get(kmer, 0) + pseudocount
        expR = expDictR.get(kmer, 0) + pseudocount
        if ct_reads_r == 0 or ct_kmers == 0:
            bias_table_R[kmer] = 1
        else:
            bias_table_R[kmer] = round(float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
예제 #14
0
def estimate_bias_pwm(args):
    """
    Estimates per-strand bias tables from position frequency matrices.

    For every region read from args.regions_file, per-position base counts
    are accumulated for the k-mer window around each read's shifted cut site
    ("observed", one matrix per strand) and for every k-mer window of the
    region sequence and its reverse complement ("expected"). The four
    matrices are written as PFM files, read back as motifs, rendered as
    logos, and used to score every A/C/G/T k-mer (observed over expected).

    Keyword arguments:
    args -- Namespace providing reads_file, organism, regions_file, k_nb,
            forward_shift, reverse_shift, output_location and output_prefix.

    Return:
    None -- the tables [bias_table_F, bias_table_R] are passed to
    write_table together with args.output_location and args.output_prefix.

    Side effects:
    Creates <output_location>/pfm and <output_location>/logo if needed and
    writes four files into each.
    """
    # Parameters
    max_duplicates = 100
    k_nb = args.k_nb
    # Number of bases of the window placed before the cut site
    half_k = k_nb // 2

    # One row of per-position counts for each base ("N" is tracked but never
    # written to the PFM files)
    bases = ["A", "C", "G", "T", "N"]
    obs_f_pwm_dict = dict([(base, [0.0] * k_nb) for base in bases])
    exp_f_pwm_dict = dict([(base, [0.0] * k_nb) for base in bases])
    obs_r_pwm_dict = dict([(base, [0.0] * k_nb) for base in bases])
    exp_r_pwm_dict = dict([(base, [0.0] * k_nb) for base in bases])

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    fastaFile = None
    try:
        genome_data = GenomeData(args.organism)
        fastaFile = Fastafile(genome_data.get_genome())
        regions = GenomicRegionSet("regions")
        regions.read(args.regions_file)

        # Iterating on HS regions
        for region in regions:
            # Initialization
            prev_pos = -1
            true_counter = 0

            # Evaluating observed frequencies
            for r in bamFile.fetch(region.chrom, region.initial, region.final):
                # Calculating the k-mer window around the cut site
                if r.is_reverse:
                    cut_site = r.aend + args.reverse_shift + 1
                else:
                    cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - half_k
                p2 = p1 + k_nb

                # Verifying PCR artifacts: consecutive reads sharing the same
                # window start are dropped after max_duplicates repeats
                if p1 == prev_pos:
                    true_counter += 1
                else:
                    prev_pos = p1
                    true_counter = 0
                if true_counter > max_duplicates:
                    continue

                # Fetching k-mer; windows the genome cannot serve are skipped
                # on purpose (best effort)
                try:
                    currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue

                # Counting each base of the k-mer in the matrix of its strand
                if r.is_reverse:
                    currStr = AuxiliaryFunctions.revcomp(currStr)
                    obs_counts = obs_r_pwm_dict
                else:
                    obs_counts = obs_f_pwm_dict
                for pos, base in enumerate(currStr):
                    obs_counts[base][pos] += 1

            # Evaluating expected frequencies
            # Fetching whole sequence
            try:
                currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception:
                continue

            # Iterating on each sequence position
            for start in range(0, len(currStr) - k_nb):
                # Counting k-mer bases on the forward strand
                s = currStr[start:start + k_nb]
                for pos, base in enumerate(s):
                    exp_f_pwm_dict[base][pos] += 1

                # Counting the reverse complement of the same k-mer
                s = AuxiliaryFunctions.revcomp(s)
                for pos, base in enumerate(s):
                    exp_r_pwm_dict[base][pos] += 1
    finally:
        # Release both handles even when fetching or counting fails
        bamFile.close()
        if fastaFile is not None:
            fastaFile.close()

    # Output pwms
    # The directory is created through the os API instead of a shell command,
    # so paths with spaces or shell metacharacters are handled literally
    pfm_dir = os.path.join(args.output_location, "pfm")
    try:
        os.makedirs(pfm_dir)
    except OSError:
        if not os.path.isdir(pfm_dir):
            raise
    pwm_obs_f = os.path.join(pfm_dir, "obs_{}_f.pfm".format(str(k_nb)))
    pwm_obs_r = os.path.join(pfm_dir, "obs_{}_r.pfm".format(str(k_nb)))
    pwm_exp_f = os.path.join(pfm_dir, "exp_{}_f.pfm".format(str(k_nb)))
    pwm_exp_r = os.path.join(pfm_dir, "exp_{}_r.pfm".format(str(k_nb)))

    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]
    pwm_file_list = [pwm_obs_f, pwm_obs_r, pwm_exp_f, pwm_exp_r]

    # One line of space-separated integer counts per base, in A, C, G, T order
    for counts, pwm_path in zip(pwm_dict_list, pwm_file_list):
        with open(pwm_path, "w") as pwm_file:
            for e in ["A", "C", "G", "T"]:
                pwm_file.write(" ".join([str(int(f)) for f in counts[e]]) + "\n")

    # Reading the matrices back as motifs; handles are closed right after
    with open(pwm_obs_f) as handle:
        motif_obs_f = motifs.read(handle, "pfm")
    with open(pwm_obs_r) as handle:
        motif_obs_r = motifs.read(handle, "pfm")
    with open(pwm_exp_f) as handle:
        motif_exp_f = motifs.read(handle, "pfm")
    with open(pwm_exp_r) as handle:
        motif_exp_r = motifs.read(handle, "pfm")

    # Output logos (observed and expected motifs use different y scales)
    logo_dir = os.path.join(args.output_location, "logo")
    try:
        os.makedirs(logo_dir)
    except OSError:
        if not os.path.isdir(logo_dir):
            raise
    logo_jobs = [
        (motif_obs_f, "obs_{}_f.pdf", 0.2, 0.1),
        (motif_obs_r, "obs_{}_r.pdf", 0.2, 0.1),
        (motif_exp_f, "exp_{}_f.pdf", 0.02, 0.01),
        (motif_exp_r, "exp_{}_r.pdf", 0.02, 0.01),
    ]
    for motif, name_pattern, y_scale, y_tic in logo_jobs:
        logo_path = os.path.join(logo_dir, name_pattern.format(str(k_nb)))
        motif.weblogo(logo_path, format="pdf", stack_width="large", color_scheme="color_classic",
                      yaxis_scale=y_scale, yaxis_tic_interval=y_tic)

    # Creating bias dictionary: observed score over expected score
    alphabet = ["A", "C", "G", "T"]
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
    bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
    for k_mer in k_mer_comb:
        obs_f = get_ppm_score(k_mer, motif_obs_f.pwm, k_nb)
        exp_f = get_ppm_score(k_mer, motif_exp_f.pwm, k_nb)
        bias_table_F[k_mer] = round(obs_f / exp_f, 6)
        obs_r = get_ppm_score(k_mer, motif_obs_r.pwm, k_nb)
        exp_r = get_ppm_score(k_mer, motif_exp_r.pwm, k_nb)
        bias_table_R[k_mer] = round(obs_r / exp_r, 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
예제 #15
0
def estimate_bias_pwm(args):
    """
    Estimates per-strand bias tables from position frequency matrices.

    For every region read from args.regions_file, per-position base counts
    are accumulated for the k-mer window around each read's shifted cut site
    ("observed", one matrix per strand) and for every k-mer window of the
    region sequence and its reverse complement ("expected"). The four
    matrices are written as PFM files, read back as motifs, rendered as
    logos, and used to score every A/C/G/T k-mer (observed over expected).

    Keyword arguments:
    args -- Namespace providing reads_file, organism, regions_file, k_nb,
            forward_shift, reverse_shift, output_location and output_prefix.

    Return:
    None -- the tables [bias_table_F, bias_table_R] are passed to
    write_table together with args.output_location and args.output_prefix.

    Side effects:
    Creates <output_location>/pfm and <output_location>/logo if needed and
    writes four files into each.
    """
    # Parameters
    max_duplicates = 100
    k_nb = args.k_nb
    # Number of bases of the window placed before the cut site
    half_k = k_nb // 2

    # One row of per-position counts for each base ("N" is tracked but never
    # written to the PFM files)
    bases = ["A", "C", "G", "T", "N"]
    obs_f_pwm_dict = dict([(base, [0.0] * k_nb) for base in bases])
    exp_f_pwm_dict = dict([(base, [0.0] * k_nb) for base in bases])
    obs_r_pwm_dict = dict([(base, [0.0] * k_nb) for base in bases])
    exp_r_pwm_dict = dict([(base, [0.0] * k_nb) for base in bases])

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    fastaFile = None
    try:
        genome_data = GenomeData(args.organism)
        fastaFile = Fastafile(genome_data.get_genome())
        regions = GenomicRegionSet("regions")
        regions.read(args.regions_file)

        # Iterating on HS regions
        for region in regions:
            # Initialization
            prev_pos = -1
            true_counter = 0

            # Evaluating observed frequencies
            for r in bamFile.fetch(region.chrom, region.initial, region.final):
                # Calculating the k-mer window around the cut site
                if r.is_reverse:
                    cut_site = r.aend + args.reverse_shift + 1
                else:
                    cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - half_k
                p2 = p1 + k_nb

                # Verifying PCR artifacts: consecutive reads sharing the same
                # window start are dropped after max_duplicates repeats
                if p1 == prev_pos:
                    true_counter += 1
                else:
                    prev_pos = p1
                    true_counter = 0
                if true_counter > max_duplicates:
                    continue

                # Fetching k-mer; windows the genome cannot serve are skipped
                # on purpose (best effort)
                try:
                    currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue

                # Counting each base of the k-mer in the matrix of its strand
                if r.is_reverse:
                    currStr = AuxiliaryFunctions.revcomp(currStr)
                    obs_counts = obs_r_pwm_dict
                else:
                    obs_counts = obs_f_pwm_dict
                for pos, base in enumerate(currStr):
                    obs_counts[base][pos] += 1

            # Evaluating expected frequencies
            # Fetching whole sequence
            try:
                currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception:
                continue

            # Iterating on each sequence position
            for start in range(0, len(currStr) - k_nb):
                # Counting k-mer bases on the forward strand
                s = currStr[start:start + k_nb]
                for pos, base in enumerate(s):
                    exp_f_pwm_dict[base][pos] += 1

                # Counting the reverse complement of the same k-mer
                s = AuxiliaryFunctions.revcomp(s)
                for pos, base in enumerate(s):
                    exp_r_pwm_dict[base][pos] += 1
    finally:
        # Release both handles even when fetching or counting fails
        bamFile.close()
        if fastaFile is not None:
            fastaFile.close()

    # Output pwms
    # The directory is created through the os API instead of a shell command,
    # so paths with spaces or shell metacharacters are handled literally
    pfm_dir = os.path.join(args.output_location, "pfm")
    try:
        os.makedirs(pfm_dir)
    except OSError:
        if not os.path.isdir(pfm_dir):
            raise
    pwm_obs_f = os.path.join(pfm_dir, "obs_{}_f.pfm".format(str(k_nb)))
    pwm_obs_r = os.path.join(pfm_dir, "obs_{}_r.pfm".format(str(k_nb)))
    pwm_exp_f = os.path.join(pfm_dir, "exp_{}_f.pfm".format(str(k_nb)))
    pwm_exp_r = os.path.join(pfm_dir, "exp_{}_r.pfm".format(str(k_nb)))

    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]
    pwm_file_list = [pwm_obs_f, pwm_obs_r, pwm_exp_f, pwm_exp_r]

    # One line of space-separated integer counts per base, in A, C, G, T order
    for counts, pwm_path in zip(pwm_dict_list, pwm_file_list):
        with open(pwm_path, "w") as pwm_file:
            for e in ["A", "C", "G", "T"]:
                pwm_file.write(" ".join([str(int(f)) for f in counts[e]]) + "\n")

    # Reading the matrices back as motifs; handles are closed right after
    with open(pwm_obs_f) as handle:
        motif_obs_f = motifs.read(handle, "pfm")
    with open(pwm_obs_r) as handle:
        motif_obs_r = motifs.read(handle, "pfm")
    with open(pwm_exp_f) as handle:
        motif_exp_f = motifs.read(handle, "pfm")
    with open(pwm_exp_r) as handle:
        motif_exp_r = motifs.read(handle, "pfm")

    # Output logos (observed and expected motifs use different y scales)
    logo_dir = os.path.join(args.output_location, "logo")
    try:
        os.makedirs(logo_dir)
    except OSError:
        if not os.path.isdir(logo_dir):
            raise
    logo_jobs = [
        (motif_obs_f, "obs_{}_f.pdf", 0.2, 0.1),
        (motif_obs_r, "obs_{}_r.pdf", 0.2, 0.1),
        (motif_exp_f, "exp_{}_f.pdf", 0.02, 0.01),
        (motif_exp_r, "exp_{}_r.pdf", 0.02, 0.01),
    ]
    for motif, name_pattern, y_scale, y_tic in logo_jobs:
        logo_path = os.path.join(logo_dir, name_pattern.format(str(k_nb)))
        motif.weblogo(logo_path, format="pdf", stack_width="large", color_scheme="color_classic",
                      yaxis_scale=y_scale, yaxis_tic_interval=y_tic)

    # Creating bias dictionary: observed score over expected score
    alphabet = ["A", "C", "G", "T"]
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
    bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
    for k_mer in k_mer_comb:
        obs_f = get_ppm_score(k_mer, motif_obs_f.pwm, k_nb)
        exp_f = get_ppm_score(k_mer, motif_exp_f.pwm, k_nb)
        bias_table_F[k_mer] = round(obs_f / exp_f, 6)
        obs_r = get_ppm_score(k_mer, motif_obs_r.pwm, k_nb)
        exp_r = get_ppm_score(k_mer, motif_exp_r.pwm, k_nb)
        bias_table_R[k_mer] = round(obs_r / exp_r, 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
예제 #16
0
def estimate_bias_kmer(args):
    """Estimate forward/reverse k-mer cleavage-bias tables and write them out.

    For every region in ``args.regions_file``:

    * observed counts -- for each read overlapping the region, the k-mer
      around its shifted cut site is fetched from the genome (reverse
      complemented for reverse-strand reads) and tallied per strand;
    * expected counts -- k-mers of the region sequence are tallied on the
      forward sequence and on its reverse complement.

    The bias of a k-mer is ``(obs / reads_on_strand) / (exp / ct_kmers)``
    with a pseudocount of 1 added to both tallies, rounded to 6 decimals.
    When a strand has no reads, or no expected k-mer was counted at all,
    the neutral value 1 is stored instead of dividing by zero.

    Attributes read from ``args``: ``reads_file``, ``organism``,
    ``regions_file``, ``k_nb``, ``forward_shift``, ``reverse_shift``,
    ``output_location`` and ``output_prefix``.

    Nothing is returned; both tables are passed to ``write_table`` as
    ``[forward_table, reverse_table]``.
    """
    # Parameters
    max_duplicates = 100
    pseudocount = 1.0
    half_k = int(floor(args.k_nb / 2))

    # Per-strand k-mer tallies
    obs_f = dict()
    obs_r = dict()
    exp_f = dict()
    exp_r = dict()

    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    # Initializing bam and fasta
    bam_file = Samfile(args.reads_file, "rb")
    fasta_file = None
    try:
        genome_data = GenomeData(args.organism)
        fasta_file = Fastafile(genome_data.get_genome())
        regions = GenomicRegionSet("regions")
        regions.read(args.regions_file)

        # Iterating on HS regions
        for region in regions:

            # State of the PCR-duplicate filter (reset per region)
            prev_pos = -1
            duplicate_run = 0

            # Evaluating observed frequencies ################################
            for r in bam_file.fetch(region.chrom, region.initial, region.final):

                # Calculating positions
                if not r.is_reverse:
                    cut_site = r.pos + args.forward_shift - 1
                else:
                    cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - half_k
                p2 = p1 + args.k_nb

                # Verifying PCR artifacts: ignore reads once the same k-mer start
                # has been seen more than max_duplicates times in a row
                if p1 == prev_pos:
                    duplicate_run += 1
                else:
                    prev_pos = p1
                    duplicate_run = 0
                if duplicate_run > max_duplicates:
                    continue

                # Fetching k-mer (best effort: reads whose k-mer cannot be fetched are skipped)
                try:
                    kmer = str(fasta_file.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue

                # Counting k-mer in dictionary
                if r.is_reverse:
                    kmer = AuxiliaryFunctions.revcomp(kmer)
                    ct_reads_r += 1
                    obs_r[kmer] = obs_r.get(kmer, 0) + 1
                else:
                    ct_reads_f += 1
                    obs_f[kmer] = obs_f.get(kmer, 0) + 1

            # Evaluating expected frequencies ################################
            # Fetching whole sequence (regions that cannot be fetched are skipped)
            try:
                seq = str(fasta_file.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception:
                continue
            seq_rc = AuxiliaryFunctions.revcomp(seq)

            # NOTE(review): the upper bound excludes the last k-mer of the region
            # (len - k rather than len - k + 1); kept as-is to preserve existing tables.
            for i in range(0, len(seq) - args.k_nb):
                ct_kmers += 1

                s = seq[i:i + args.k_nb]
                exp_f[s] = exp_f.get(s, 0) + 1

                s = seq_rc[i:i + args.k_nb]
                exp_r[s] = exp_r.get(s, 0) + 1
    finally:
        # Closing files, also when something above raised
        bam_file.close()
        if fasta_file is not None:
            fasta_file.close()

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    bias_table_F = dict()
    bias_table_R = dict()
    for kmer in ("".join(e) for e in product(alphabet, repeat=args.k_nb)):
        obs = obs_f.get(kmer, 0) + pseudocount
        exp = exp_f.get(kmer, 0) + pseudocount
        if ct_reads_f == 0 or ct_kmers == 0:
            bias_table_F[kmer] = 1
        else:
            bias_table_F[kmer] = round(float(obs / ct_reads_f) / float(exp / ct_kmers), 6)

        obs = obs_r.get(kmer, 0) + pseudocount
        exp = exp_r.get(kmer, 0) + pseudocount
        if ct_reads_r == 0 or ct_kmers == 0:
            bias_table_R[kmer] = 1
        else:
            bias_table_R[kmer] = round(float(obs / ct_reads_r) / float(exp / ct_kmers), 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
예제 #17
0
    def get_bias_raw_bc_signal(self, ref, start, end, bam, fasta, bias_table, forward_shift, reverse_shift,
                               strand=False):
        """Compute bias, raw and bias-corrected cut-site signals for ``[start, end)`` on ``ref``.

        *Keyword arguments:*

            - ref -- Reference (chromosome) name passed to ``bam.fetch`` / ``fasta.fetch``.
            - start, end -- Interval boundaries.
            - bam -- Alignment file used for the raw counts in the normal path.
            - fasta -- Genome sequence file providing ``fetch(ref, start, end)``.
            - bias_table -- Pair ``[forward_dict, reverse_dict]`` mapping k-mer -> bias value.
            - forward_shift, reverse_shift -- Offsets applied to forward / reverse cut sites.
            - strand -- If True, per-strand raw and corrected signals are returned as well.

        *Return:*

            - ``(bias_f, bias_r, raw, bc)`` or, with ``strand=True``,
              ``(bias_f, bias_r, raw, raw_f, raw_r, bc, bc_f, bc_r)``.
              ``bias_f`` / ``bias_r`` are looked up on the unshifted sequence, whereas the
              corrected signal ``bc`` uses the sequence displaced by the shifts.
            - When the window would reach position <= 0, a single list of raw counts for
              ``[start, end)`` is returned instead.
        """
        # Parameters
        window = 50
        half_window = window // 2  # floor division keeps positions integral on Python 3
        default_kmer_value = 1.0

        # Initialization
        f_bias = bias_table[0]
        r_bias = bias_table[1]
        # dict views are not indexable on Python 3, so take the first key through an iterator
        k_nb = len(next(iter(f_bias)))
        half_k = int(k_nb / 2.)
        p1 = start
        p2 = end
        p1_w = p1 - half_window
        p2_w = p2 + half_window
        p1_wk = p1_w - half_k
        p2_wk = p2_w + half_k

        if p1 <= 0 or p1_w <= 0 or p2_wk <= 0:
            # Return raw counts
            # NOTE(review): this branch reads from self.bam rather than the ``bam`` argument
            # and returns one list instead of the tuple -- confirm callers expect that.
            signal = [0.0] * (p2 - p1)
            for read in self.bam.fetch(ref, p1, p2):
                # check if the read is unmapped, according to issue #112
                if read.is_unmapped:
                    continue

                if not read.is_reverse:
                    cut_site = read.pos + forward_shift
                else:
                    cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    signal[cut_site - p1] += 1.0

            return signal

        def bias_track(fwd_seq, rev_seq):
            # Look up the bias of the k-mer centred on every position, on both strands;
            # k-mers missing from a table get the default value.
            n = len(fwd_seq)
            track_f = []
            track_r = []
            for i in range(half_k, n - half_k + 1):
                track_f.append(f_bias.get(fwd_seq[i - half_k:i + half_k], default_kmer_value))
                track_r.append(r_bias.get(rev_seq[n - half_k - i:n + half_k - i], default_kmer_value))
            return track_f, track_r

        # Bias signal on the shifted sequence (used for the correction)
        shifted_fwd = str(fasta.fetch(ref, p1_wk - 1 + forward_shift, p2_wk - 2 + forward_shift)).upper()
        shifted_rev = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + reverse_shift + 2,
                                                                 p2_wk + reverse_shift + 1)).upper())
        signal_bias_f, signal_bias_r = bias_track(shifted_fwd, shifted_rev)

        # Raw counts
        signal_raw_f = [0.0] * (p2_w - p1_w)
        signal_raw_r = [0.0] * (p2_w - p1_w)
        for read in bam.fetch(ref, p1_w, p2_w):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1_w <= cut_site < p2_w:
                    signal_raw_f[cut_site - p1_w] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1_w <= cut_site < p2_w:
                    signal_raw_r[cut_site - p1_w] += 1.0

        # Smoothed counts: running sum of the raw counts over the window
        Nf = []
        Nr = []
        f_sum = sum(signal_raw_f[:window])
        r_sum = sum(signal_raw_r[:window])
        f_last = signal_raw_f[0]
        r_last = signal_raw_r[0]
        for i in range(half_window, len(signal_raw_f) - half_window):
            Nf.append(f_sum)
            Nr.append(r_sum)
            f_sum -= f_last
            f_sum += signal_raw_f[i + half_window]
            f_last = signal_raw_f[i - half_window + 1]
            r_sum -= r_last
            r_sum += signal_raw_r[i + half_window]
            r_last = signal_raw_r[i - half_window + 1]

        # Bias correction: redistribute the windowed counts proportionally to the local bias
        f_sum = sum(signal_bias_f[:window])
        r_sum = sum(signal_bias_r[:window])
        f_last = signal_bias_f[0]
        r_last = signal_bias_r[0]
        raw = []
        raw_f = []
        raw_r = []
        bc = []
        bc_f = []
        bc_r = []
        for i in range(half_window, len(signal_bias_f) - half_window):
            nhatf = Nf[i - half_window] * (signal_bias_f[i] / f_sum)
            nhatr = Nr[i - half_window] * (signal_bias_r[i] / r_sum)
            raw.append(signal_raw_f[i] + signal_raw_r[i])
            raw_f.append(signal_raw_f[i])
            raw_r.append(signal_raw_r[i])
            bc.append(nhatf + nhatr)
            bc_f.append(nhatf)
            bc_r.append(nhatr)
            f_sum -= f_last
            f_sum += signal_bias_f[i + half_window]
            f_last = signal_bias_f[i - half_window + 1]
            r_sum -= r_last
            r_sum += signal_bias_r[i + half_window]
            r_last = signal_bias_r[i - half_window + 1]

        # Reported bias tracks come from the unshifted sequence, trimmed to [start, end)
        plain_fwd = str(fasta.fetch(ref, p1_wk, p2_wk - 1)).upper()
        plain_rev = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + 1, p2_wk)).upper())
        signal_bias_f, signal_bias_r = bias_track(plain_fwd, plain_rev)

        bias_f = []
        bias_r = []
        for i in range(half_window, len(signal_bias_f) - half_window):
            bias_f.append(signal_bias_f[i])
            bias_r.append(signal_bias_r[i])

        if strand:
            return bias_f, bias_r, raw, raw_f, raw_r, bc, bc_f, bc_r
        else:
            return bias_f, bias_r, raw, bc
예제 #18
0
def bias_correction(chrom, start, end, bam, bias_table, genome_file_name, forward_shift, reverse_shift):
    """Return the bias-corrected cut-site signal for ``[start, end)`` on ``chrom``.

    Parameters:
        chrom -- reference name passed to ``bam.fetch`` and to the FASTA fetch.
        start, end -- interval boundaries.
        bam -- alignment file providing ``fetch(chrom, start, end)``.
        bias_table -- pair ``[forward_dict, reverse_dict]`` mapping k-mer -> bias value.
        genome_file_name -- path handed to ``Fastafile``.
        forward_shift, reverse_shift -- offsets applied to forward / reverse cut sites.

    Returns:
        A list with one value per position of ``[start, end)``. When the 50 bp
        window would reach a position <= 0 the list holds raw cut counts;
        otherwise each strand's windowed count is redistributed according to
        the k-mer bias and both strands are summed.

    The genome file is opened only when the corrected path is taken and is
    always closed again, also if fetching raises.
    """
    # Parameters
    window = 50
    half_window = window // 2  # floor division keeps positions integral on Python 3
    default_kmer_value = 1.0

    # Initialization
    f_bias = bias_table[0]
    r_bias = bias_table[1]
    # dict views are not indexable on Python 3, so take the first key through an iterator
    k_nb = len(next(iter(f_bias)))
    k_floor = k_nb // 2        # floor(k / 2)
    k_ceil = k_nb - k_floor    # ceil(k / 2)
    p1 = start
    p2 = end
    p1_w = p1 - half_window
    p2_w = p2 + half_window
    p1_wk = p1_w - k_floor
    p2_wk = p2_w + k_ceil

    if p1 <= 0 or p1_w <= 0 or p2_wk <= 0:
        # Return raw counts
        bc_signal = [0.0] * (p2 - p1)
        for read in bam.fetch(chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
            else:
                cut_site = read.aend + reverse_shift - 1
            if p1 <= cut_site < p2:
                bc_signal[cut_site - p1] += 1.0

        return bc_signal

    # Raw counts
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for read in bam.fetch(chrom, p1_w, p2_w):
        # check if the read is unmapped, according to issue #112
        if read.is_unmapped:
            continue

        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            if p1_w <= cut_site < p2_w:
                nf[cut_site - p1_w] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if p1_w <= cut_site < p2_w:
                nr[cut_site - p1_w] += 1.0

    # Smoothed counts: running sum of the raw counts over the window
    Nf = []
    Nr = []
    f_sum = sum(nf[:window])
    r_sum = sum(nr[:window])
    f_last = nf[0]
    r_last = nr[0]
    for i in range(half_window, len(nf) - half_window):
        Nf.append(f_sum)
        Nr.append(r_sum)
        f_sum -= f_last
        f_sum += nf[i + half_window]
        f_last = nf[i - half_window + 1]
        r_sum -= r_last
        r_sum += nr[i + half_window]
        r_last = nr[i - half_window + 1]

    # Fetching sequence; the handle is released as soon as both strands are read
    fasta_file = Fastafile(genome_file_name)
    try:
        fwd_seq = str(fasta_file.fetch(chrom, p1_wk, p2_wk - 1)).upper()
        rev_seq = AuxiliaryFunctions.revcomp(str(fasta_file.fetch(chrom, p1_wk + 1, p2_wk)).upper())
    finally:
        fasta_file.close()

    # Iterating on sequence to create signal (unknown k-mers get the default value)
    af = []
    ar = []
    seq_len = len(fwd_seq)
    for i in range(k_ceil, seq_len - k_floor + 1):
        fseq = fwd_seq[i - k_floor:i + k_ceil]
        rseq = rev_seq[seq_len - k_ceil - i:seq_len + k_floor - i]
        af.append(f_bias.get(fseq, default_kmer_value))
        ar.append(r_bias.get(rseq, default_kmer_value))

    # Calculating bias-corrected signal
    f_sum = sum(af[:window])
    r_sum = sum(ar[:window])
    f_last = af[0]
    r_last = ar[0]
    bc_signal = []
    for i in range(half_window, len(af) - half_window):
        nhatf = Nf[i - half_window] * (af[i] / f_sum)
        nhatr = Nr[i - half_window] * (ar[i] / r_sum)
        bc_signal.append(nhatf + nhatr)
        f_sum -= f_last
        f_sum += af[i + half_window]
        f_last = af[i - half_window + 1]
        r_sum -= r_last
        r_sum += ar[i + half_window]
        r_last = ar[i - half_window + 1]

    return bc_signal
예제 #19
0
    def print_signal(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                     initial_clip=1000, per_norm=98, per_slope=98, bias_table=None, genome_file_name=None,
                     raw_signal_file=None, bc_signal_file=None, norm_signal_file=None, strand_specific=False):
        """Append raw, bias-corrected and/or normalized signals of a region to wig files.

        Each requested file receives one ``fixedStep`` block (1-based start, step 1).

        *Keyword arguments:*

            - ref, start, end -- Region to process.
            - downstream_ext, upstream_ext -- Passed to ``PileupRegion`` for the raw signal.
            - forward_shift, reverse_shift -- Offsets applied to forward / reverse cut sites.
            - initial_clip -- Upper bound applied to every raw pileup value.
            - per_norm, per_slope -- Accepted but not used here; the percentile is fixed at 98.
            - bias_table -- Pair ``[forward_dict, reverse_dict]`` mapping k-mer -> bias value.
            - genome_file_name -- Path handed to ``Fastafile`` for the bias correction.
            - raw_signal_file -- If set, the clipped raw pileup is appended to it.
            - bc_signal_file -- If set, the bias-corrected signal is appended to it.
            - norm_signal_file -- If set, the normalized bias-corrected signal is appended to it.
            - strand_specific -- Also write ``<prefix>_Forward`` / ``<prefix>_Reverse`` files, where
              the prefix is the text before the first "." of ``bc_signal_file`` (or of
              ``norm_signal_file`` when no bc file was given).
        """

        def append_track(file_name, values):
            # One fixedStep block per call; "a" mode lets consecutive regions share a file
            with open(file_name, "a") as handle:
                handle.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                    [str(e) for e in nan_to_num(values)]) + "\n")

        def normalize(track):
            # boyle_norm followed by hon_norm_atac on the 98th percentile and the standard deviation
            scaled = self.boyle_norm(track)
            perc = scoreatpercentile(scaled, 98)
            std = np.std(scaled)
            return self.hon_norm_atac(scaled, perc, std)

        if raw_signal_file:
            pileup_region = PileupRegion(start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift)
            if ps_version == "0.7.5":
                self.bam.fetch(reference=ref, start=start, end=end, callback=pileup_region)
            else:
                for alignment in self.bam.fetch(reference=ref, start=start, end=end):
                    pileup_region.__call__(alignment)
            raw_signal = array([min(e, initial_clip) for e in pileup_region.vector])
            append_track(raw_signal_file, raw_signal)

        if bc_signal_file or norm_signal_file:
            # Parameters
            window = 50
            half_window = window // 2  # floor division keeps positions integral on Python 3
            default_kmer_value = 1.0

            # Initialization
            f_bias = bias_table[0]
            r_bias = bias_table[1]
            # dict views are not indexable on Python 3, so take the first key through an iterator
            k_nb = len(next(iter(f_bias)))
            half_k = int(k_nb / 2.)
            p1 = start
            p2 = end
            p1_w = p1 - half_window
            p2_w = p2 + half_window
            p1_wk = p1_w - half_k
            p2_wk = p2_w + half_k

            # Fetching sequence; the handle is released as soon as both strands are read
            fasta = Fastafile(genome_file_name)
            try:
                fwd_seq = str(fasta.fetch(ref, p1_wk, p2_wk - 1)).upper()
                rev_seq = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + 1, p2_wk)).upper())
            finally:
                fasta.close()

            # Iterating on sequence to create the bias signal (unknown k-mers get the default)
            signal_bias_f = []
            signal_bias_r = []
            seq_len = len(fwd_seq)
            for i in range(half_k, seq_len - half_k + 1):
                fseq = fwd_seq[i - half_k:i + half_k]
                rseq = rev_seq[seq_len - half_k - i:seq_len + half_k - i]
                signal_bias_f.append(f_bias.get(fseq, default_kmer_value))
                signal_bias_r.append(r_bias.get(rseq, default_kmer_value))

            # Raw counts
            signal_raw_f = [0.0] * (p2_w - p1_w)
            signal_raw_r = [0.0] * (p2_w - p1_w)
            for read in self.bam.fetch(ref, p1_w, p2_w):
                # skip unmapped reads, as the sibling signal functions do (issue #112)
                if read.is_unmapped:
                    continue

                if not read.is_reverse:
                    cut_site = read.pos + forward_shift
                    if p1_w <= cut_site < p2_w:
                        signal_raw_f[cut_site - p1_w] += 1.0
                else:
                    cut_site = read.aend + reverse_shift - 1
                    if p1_w <= cut_site < p2_w:
                        signal_raw_r[cut_site - p1_w] += 1.0

            # Smoothed counts: running sum of the raw counts over the window
            Nf = []
            Nr = []
            f_sum = sum(signal_raw_f[:window])
            r_sum = sum(signal_raw_r[:window])
            f_last = signal_raw_f[0]
            r_last = signal_raw_r[0]
            for i in range(half_window, len(signal_raw_f) - half_window):
                Nf.append(f_sum)
                Nr.append(r_sum)
                f_sum -= f_last
                f_sum += signal_raw_f[i + half_window]
                f_last = signal_raw_f[i - half_window + 1]
                r_sum -= r_last
                r_sum += signal_raw_r[i + half_window]
                r_last = signal_raw_r[i - half_window + 1]

            # Bias correction: redistribute the windowed counts proportionally to the local bias
            f_sum = sum(signal_bias_f[:window])
            r_sum = sum(signal_bias_r[:window])
            f_last = signal_bias_f[0]
            r_last = signal_bias_r[0]
            signal_bc = []
            signal_bc_f = []
            signal_bc_r = []
            for i in range(half_window, len(signal_bias_f) - half_window):
                nhatf = Nf[i - half_window] * (signal_bias_f[i] / f_sum)
                nhatr = Nr[i - half_window] * (signal_bias_r[i] / r_sum)
                signal_bc.append(nhatf + nhatr)
                signal_bc_f.append(nhatf)
                signal_bc_r.append(nhatr)
                f_sum -= f_last
                f_sum += signal_bias_f[i + half_window]
                f_last = signal_bias_f[i - half_window + 1]
                r_sum -= r_last
                r_sum += signal_bias_r[i + half_window]
                r_last = signal_bias_r[i - half_window + 1]

            if bc_signal_file:
                append_track(bc_signal_file, signal_bc)

                if strand_specific:
                    # NOTE(review): the prefix stops at the first "." anywhere in the path
                    prefix = bc_signal_file.split(".")[0]
                    append_track(prefix + "_Forward" + ".bc.wig", signal_bc_f)
                    append_track(prefix + "_Reverse" + ".bc.wig", signal_bc_r)

            if norm_signal_file:
                append_track(norm_signal_file, normalize(signal_bc))

                if strand_specific:
                    # Historically derived from bc_signal_file; fall back to the norm file
                    # so that requesting only the normalized output does not crash on None.
                    prefix = (bc_signal_file or norm_signal_file).split(".")[0]
                    signal_norm_f = normalize(signal_bc_f)
                    signal_norm_r = normalize(signal_bc_r)
                    append_track(prefix + "_Forward" + ".norm.wig", signal_norm_f)
                    append_track(prefix + "_Reverse" + ".norm.wig", signal_norm_r)
예제 #20
0
    def load_gene_list(self,
                       file_name,
                       filter_havana=True,
                       protein_coding=False,
                       known_only=False):
        """Reads gene annotation in gtf (gencode) format. It populates self.gene_list with such entries.

        Blank lines and lines starting with "#" are ignored. Each remaining line becomes one
        entry of the form ``[GenomicRegion, source, feature, frame, gene_id, transcript_id,
        gene_type, gene_status, gene_name, transcript_type, transcript_status,
        transcript_name, level, [], []]``; attributes absent from the line are stored as None.

        If the file cannot be opened, an error is printed and the process exits with status 1.

        *Keyword arguments:*

            - file_name -- The gencode .gtf file name.
            - filter_havana -- Skip lines whose source column is "HAVANA".
            - protein_coding -- Keep only entries whose gene_type (and transcript_type, when
              present) is "protein_coding".
            - known_only -- Keep only entries whose gene_status (and transcript_status, when
              present) is "KNOWN".
        """
        # Opening GTF file
        try:
            gtf_file = open(file_name, "r")
        except (IOError, OSError):
            print("Error: Cannot find the annotation file: " + file_name)
            print("Please check the path in ~/rgtdata/data.config")
            sys.exit(1)

        addt_keys = [
            "gene_id", "transcript_id", "gene_type", "gene_status",
            "gene_name", "transcript_type", "transcript_status",
            "transcript_name", "level"
        ]

        # Reading GTF file; the with-block closes it even if a line fails to parse
        with gtf_file:
            for line in gtf_file:

                # Processing line (blank lines would otherwise break the "#" test)
                line = line.strip()
                if not line or line[0] == "#":
                    continue
                line_list = line.split("\t")
                if filter_havana and len(line_list) > 1 and line_list[1] == "HAVANA":
                    continue

                # Processing additional list of options: 'key "value"' pairs separated by ";"
                addt_dict = dict()
                for addt_element in line_list[8].split(";"):
                    tokens = [_f for _f in addt_element.split(" ") if _f]
                    if len(tokens) < 2:
                        # empty fragment or a key without a value
                        continue
                    # Removing " symbol from string options
                    addt_dict[tokens[0]] = tokens[1].replace("\"", "")

                # filter non-protein-coding sequences, if required
                if protein_coding:
                    if addt_dict.get("gene_type") != "protein_coding":
                        continue
                    if "transcript_type" in addt_dict and addt_dict["transcript_type"] != "protein_coding":
                        continue

                # filter unknown sequences, if required
                if known_only:
                    if addt_dict.get("gene_status") != "KNOWN":
                        continue
                    if "transcript_status" in addt_dict and addt_dict["transcript_status"] != "KNOWN":
                        continue

                # Removing dot (version suffix) from IDs
                addt_dict["gene_id"] = addt_dict["gene_id"].split(".")[0]
                if "transcript_id" in addt_dict:
                    addt_dict["transcript_id"] = addt_dict["transcript_id"].split(".")[0]

                # Creating final version of additional arguments
                final_addt_list = [addt_dict.get(addt_key) for addt_key in addt_keys]

                # Handling score
                current_score = 0
                if AuxiliaryFunctions.string_is_int(line_list[5]):
                    current_score = AuxiliaryFunctions.correct_standard_bed_score(line_list[5])

                # Creating GenomicRegion (GTF start is 1-based, hence the -1)
                genomic_region = GenomicRegion(chrom=line_list[0],
                                               initial=int(line_list[3]) - 1,
                                               final=int(line_list[4]),
                                               orientation=line_list[6],
                                               data=current_score)

                # Creating final vector
                # One list for each: EXACT_GENE_MATCHES, INEXACT_GENE_MATCHES
                extra_index_elements = [[], []]
                final_vector = [
                    genomic_region, line_list[1], line_list[2], line_list[7]
                ] + final_addt_list + extra_index_elements
                self.gene_list.append(final_vector)
예제 #21
0
    def get_bc_signal_by_fragment_length(self, ref, start, end, bam, fasta, bias_table,
                                         forward_shift, reverse_shift, min_length=None, max_length=None,
                                         strand=True):
        """Return the bias-corrected cut-site signal of ``[start, end)``, optionally restricted by fragment length.

        *Keyword arguments:*

            - ref, start, end -- Region to process.
            - bam -- Alignment file used for the raw counts in the normal path.
            - fasta -- Genome sequence file providing ``fetch(ref, start, end)``.
            - bias_table -- Pair ``[forward_dict, reverse_dict]`` mapping k-mer -> bias value.
            - forward_shift, reverse_shift -- Offsets applied to forward / reverse cut sites.
            - min_length -- If given, only reads with ``abs(template_length) > min_length`` count.
            - max_length -- If given, only reads with ``abs(template_length) <= max_length`` count.
            - strand -- If True return ``(forward, reverse)`` arrays, otherwise their sum.

        *Return:*

            - numpy array(s) with one value per position of ``[start, end)``.
            - When the window would reach position <= 0, a plain list of raw counts for
              ``[start, end)`` is returned instead (no length filter, no strand split).
        """
        # Parameters
        window = 50
        half_window = window // 2  # floor division keeps positions integral on Python 3
        default_kmer_value = 1.0

        # Initialization
        f_bias = bias_table[0]
        r_bias = bias_table[1]
        # dict views are not indexable on Python 3, so take the first key through an iterator
        k_nb = len(next(iter(f_bias)))
        half_k = int(k_nb / 2.)
        p1 = start
        p2 = end
        p1_w = p1 - half_window
        p2_w = p2 + half_window
        p1_wk = p1_w - half_k
        p2_wk = p2_w + half_k

        if p1 <= 0 or p1_w <= 0 or p2_wk <= 0:
            # Return raw counts
            # NOTE(review): this branch reads from self.bam rather than the ``bam`` argument
            signal = [0.0] * (p2 - p1)
            for read in self.bam.fetch(ref, p1, p2):
                # check if the read is unmapped, according to issue #112
                if read.is_unmapped:
                    continue

                if not read.is_reverse:
                    cut_site = read.pos + forward_shift
                else:
                    cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    signal[cut_site - p1] += 1.0

            return signal

        fwd_seq = str(fasta.fetch(ref, p1_wk, p2_wk - 1)).upper()
        rev_seq = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + 1, p2_wk)).upper())

        # Iterating on sequence to create the bias signal (unknown k-mers get the default)
        signal_bias_f = []
        signal_bias_r = []
        seq_len = len(fwd_seq)
        for i in range(half_k, seq_len - half_k + 1):
            fseq = fwd_seq[i - half_k:i + half_k]
            rseq = rev_seq[seq_len - half_k - i:seq_len + half_k - i]
            signal_bias_f.append(f_bias.get(fseq, default_kmer_value))
            signal_bias_r.append(r_bias.get(rseq, default_kmer_value))

        # Raw counts, with the optional fragment-length filter applied in a single pass
        raw_f = [0.0] * (p2_w - p1_w)
        raw_r = [0.0] * (p2_w - p1_w)
        has_length_filter = min_length is not None or max_length is not None
        for read in bam.fetch(ref, p1_w, p2_w):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if has_length_filter:
                fragment_length = abs(read.template_length)
                if min_length is not None and not fragment_length > min_length:
                    continue
                if max_length is not None and fragment_length > max_length:
                    continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1_w <= cut_site < p2_w:
                    raw_f[cut_site - p1_w] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1_w <= cut_site < p2_w:
                    raw_r[cut_site - p1_w] += 1.0

        # Smoothed counts: running sum of the raw counts over the window
        Nf = []
        Nr = []
        f_sum = sum(raw_f[:window])
        r_sum = sum(raw_r[:window])
        f_last = raw_f[0]
        r_last = raw_r[0]
        for i in range(half_window, len(raw_f) - half_window):
            Nf.append(f_sum)
            Nr.append(r_sum)
            f_sum -= f_last
            f_sum += raw_f[i + half_window]
            f_last = raw_f[i - half_window + 1]
            r_sum -= r_last
            r_sum += raw_r[i + half_window]
            r_last = raw_r[i - half_window + 1]

        # Bias correction: redistribute the windowed counts proportionally to the local bias
        f_sum = sum(signal_bias_f[:window])
        r_sum = sum(signal_bias_r[:window])
        f_last = signal_bias_f[0]
        r_last = signal_bias_r[0]
        bc_f = []
        bc_r = []
        for i in range(half_window, len(signal_bias_f) - half_window):
            nhatf = Nf[i - half_window] * (signal_bias_f[i] / f_sum)
            nhatr = Nr[i - half_window] * (signal_bias_r[i] / r_sum)
            bc_f.append(nhatf)
            bc_r.append(nhatr)
            f_sum -= f_last
            f_sum += signal_bias_f[i + half_window]
            f_last = signal_bias_f[i - half_window + 1]
            r_sum -= r_last
            r_sum += signal_bias_r[i + half_window]
            r_last = signal_bias_r[i - half_window + 1]

        if strand:
            return np.array(bc_f), np.array(bc_r)
        else:
            return np.add(np.array(bc_f), np.array(bc_r))
예제 #22
0
    def get_bias_raw_bc_signal(self, ref, start, end, bam, fasta, bias_table, forward_shift, reverse_shift,
                               strand=False):
        """
        Computes the k-mer bias signal, the raw cut counts and the bias-corrected cut counts of an interval.

        Keyword arguments:
        ref -- Reference (chromosome) name forwarded to bam.fetch and fasta.fetch.
        start -- Initial coordinate of the interval.
        end -- Final coordinate of the interval (exclusive).
        bam -- Alignment object exposing fetch(ref, start, end); reads must expose is_reverse, pos and aend.
        fasta -- Sequence object exposing fetch(ref, start, end).
        bias_table -- Pair (forward table, reverse table), each mapping a k-mer string to its bias value.
        forward_shift -- Offset added to read.pos to obtain the cut site of a forward read.
        reverse_shift -- Offset added to read.aend - 1 to obtain the cut site of a reverse read.
        strand -- If True, strand-specific raw and bias-corrected signals are returned as well.

        Return:
        If the interval, extended by half a smoothing window and half a k-mer, reaches coordinate 0 or
        below, no correction is attempted and a single list with the raw cut counts (both strands summed)
        of [start, end) is returned.
        Otherwise returns (bias_f, bias_r, raw, bc) or, when strand is True,
        (bias_f, bias_r, raw, raw_f, raw_r, bc, bc_f, bc_r). bias_f / bias_r are looked up on the unshifted
        sequence, whereas bc / bc_f / bc_r are computed with the bias of the shifted sequence.
        """
        # Parameters
        window = 50
        half_window = window // 2
        default_kmer_value = 1.0

        # Initialization
        f_bias_dict = bias_table[0]
        r_bias_dict = bias_table[1]
        k_nb = len(next(iter(f_bias_dict)))  # All k-mers of the table are assumed to share this length
        half_k = k_nb // 2
        p1 = start
        p2 = end
        p1_w = p1 - half_window
        p2_w = p2 + half_window
        p1_wk = p1_w - half_k
        p2_wk = p2_w + half_k

        def count_cuts(lower, upper):
            # Counts the cut sites of forward and reverse reads falling in [lower, upper)
            forward_counts = [0.0] * (upper - lower)
            reverse_counts = [0.0] * (upper - lower)
            for read in bam.fetch(ref, lower, upper):
                if not read.is_reverse:
                    cut_site = read.pos + forward_shift
                    if lower <= cut_site < upper:
                        forward_counts[cut_site - lower] += 1.0
                else:
                    cut_site = read.aend + reverse_shift - 1
                    if lower <= cut_site < upper:
                        reverse_counts[cut_site - lower] += 1.0
            return forward_counts, reverse_counts

        def kmer_bias(forward_seq, reverse_seq):
            # Looks up the bias of the k-mer centered on every position; unknown k-mers get the default
            forward_bias = []
            reverse_bias = []
            seq_len = len(forward_seq)
            for i in range(half_k, seq_len - half_k + 1):
                fseq = forward_seq[i - half_k:i + half_k]
                # The reverse complement is indexed from its end so both k-mers cover the same position
                rseq = reverse_seq[seq_len - half_k - i:seq_len + half_k - i]
                forward_bias.append(f_bias_dict.get(fseq, default_kmer_value))
                reverse_bias.append(r_bias_dict.get(rseq, default_kmer_value))
            return forward_bias, reverse_bias

        def window_sums(values):
            # Running sum of the `window` values around each position that has a full window available
            sums = []
            running = sum(values[:window])
            oldest = values[0]
            for i in range(half_window, len(values) - half_window):
                sums.append(running)
                running -= oldest
                running += values[i + half_window]
                oldest = values[i - half_window + 1]
            return sums

        if p1 <= 0 or p1_w <= 0 or p2_wk <= 0:
            # Too close to the chromosome start to extend the interval: return raw counts only
            cuts_f, cuts_r = count_cuts(p1, p2)
            return [f_count + r_count for f_count, r_count in zip(cuts_f, cuts_r)]

        # Bias signal on the shifted sequence (used for the correction)
        shifted_str = str(fasta.fetch(ref, p1_wk - 1 + forward_shift, p2_wk - 2 + forward_shift)).upper()
        shifted_rev_comp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + reverse_shift + 2,
                                                                      p2_wk + reverse_shift + 1)).upper())
        signal_bias_f, signal_bias_r = kmer_bias(shifted_str, shifted_rev_comp)

        # Raw counts
        signal_raw_f, signal_raw_r = count_cuts(p1_w, p2_w)

        # Smoothed counts and windowed bias sums
        smoothed_f = window_sums(signal_raw_f)
        smoothed_r = window_sums(signal_raw_r)
        bias_sums_f = window_sums(signal_bias_f)
        bias_sums_r = window_sums(signal_bias_r)

        # Calculating the bias-corrected signal
        raw, raw_f, raw_r = [], [], []
        bc, bc_f, bc_r = [], [], []
        for i in range(half_window, len(signal_bias_f) - half_window):
            j = i - half_window
            nhatf = smoothed_f[j] * (signal_bias_f[i] / bias_sums_f[j])
            nhatr = smoothed_r[j] * (signal_bias_r[i] / bias_sums_r[j])
            raw.append(signal_raw_f[i] + signal_raw_r[i])
            raw_f.append(signal_raw_f[i])
            raw_r.append(signal_raw_r[i])
            bc.append(nhatf + nhatr)
            bc_f.append(nhatf)
            bc_r.append(nhatr)

        # Bias signal on the unshifted sequence (the one that is reported)
        plain_str = str(fasta.fetch(ref, p1_wk, p2_wk - 1)).upper()
        plain_rev_comp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + 1, p2_wk)).upper())
        plain_bias_f, plain_bias_r = kmer_bias(plain_str, plain_rev_comp)
        bias_f = plain_bias_f[half_window:len(plain_bias_f) - half_window]
        bias_r = plain_bias_r[half_window:len(plain_bias_f) - half_window]

        if strand:
            return bias_f, bias_r, raw, raw_f, raw_r, bc, bc_f, bc_r
        else:
            return bias_f, bias_r, raw, bc
예제 #23
0
    def load_gene_list(self, file_name, filter_havana=True, protein_coding=False, known_only=False):
        """
        Reads gene annotation in gtf (gencode) format. It populates self.gene_list with such entries.

        Each appended entry is a list: [GenomicRegion, source, feature, frame, gene_id, transcript_id,
        gene_type, gene_status, gene_name, transcript_type, transcript_status, transcript_name, level,
        [], []]. Attributes missing from a line are stored as None; the two trailing lists are the
        (initially empty) exact and inexact gene match indexes.

        Keyword arguments:
        file_name -- The gencode .gtf file name.
        filter_havana -- If True, lines whose source column is "HAVANA" are skipped.
        protein_coding -- If True, only lines whose 3rd and 6th attributes contain "protein_coding" are kept.
        known_only -- If True, only lines whose 4th and 7th attributes contain "KNOWN" are kept.

        Raises IOError/OSError if the file cannot be opened and KeyError if a kept line lacks gene_id or
        transcript_id.

        Return: void.
        """
        addt_keys = ["gene_id", "transcript_id", "gene_type", "gene_status", "gene_name",
                     "transcript_type", "transcript_status", "transcript_name", "level"]

        # Opening GTF file (closed automatically, even if a line fails to parse)
        with open(file_name, "r") as gtf_file:

            # Reading GTF file
            for line in gtf_file:

                # Processing line
                line = line.strip()
                if not line or line[0] == "#":
                    continue
                line_list = line.split("\t")
                if filter_havana and line_list[1] == "HAVANA":
                    continue

                addt_list = line_list[8].split(";")

                # Positional filters: gencode lists gene_type/gene_status at positions 2/3 and
                # transcript_type/transcript_status at positions 5/6 of the attribute column
                if protein_coding and "protein_coding" not in addt_list[2]:
                    continue
                if known_only and "KNOWN" not in addt_list[3]:
                    continue
                if protein_coding and "protein_coding" not in addt_list[5]:
                    continue
                if known_only and "KNOWN" not in addt_list[6]:
                    continue

                # Processing additional list of options
                addt_dict = dict()
                for addt_element in addt_list:
                    addt_element_list = [token for token in addt_element.split(" ") if token]
                    if len(addt_element_list) < 2:
                        continue  # Empty or value-less attribute (e.g. after a trailing ";")
                    # Removing " symbol from string options
                    addt_dict[addt_element_list[0]] = addt_element_list[1].replace("\"", "")

                # Removing dot (version suffix) from IDs
                addt_dict["gene_id"] = addt_dict["gene_id"].split(".")[0]
                addt_dict["transcript_id"] = addt_dict["transcript_id"].split(".")[0]

                # Creating final version of additional arguments
                final_addt_list = [addt_dict.get(addt_key) for addt_key in addt_keys]

                # Handling score
                current_score = 0
                if AuxiliaryFunctions.string_is_int(line_list[5]):
                    current_score = AuxiliaryFunctions.correct_standard_bed_score(line_list[5])

                # Creating GenomicRegion (GTF start is 1-based, hence the -1)
                genomic_region = GenomicRegion(chrom=line_list[0],
                                               initial=int(line_list[3]) - 1,
                                               final=int(line_list[4]),
                                               orientation=line_list[6],
                                               data=current_score)

                # Creating final vector
                extra_index_elements = [[], []]  # One list for each: EXACT_GENE_MATCHES, INEXACT_GENE_MATCHES
                final_vector = ([genomic_region, line_list[1], line_list[2], line_list[7]]
                                + final_addt_list + extra_index_elements)
                self.gene_list.append(final_vector)