Пример #1
0
def filter_deadzones(bed_deadzones, peak_regions):
    """Return ``peak_regions`` with the deadzones of a BED file subtracted.

    ``bed_deadzones`` is read into a ``GenomicRegionSet`` named 'deadzones';
    the subtraction is delegated to ``peak_regions.subtract`` with
    ``whole_region=True`` and its result is returned unchanged.
    """
    blacklist = GenomicRegionSet('deadzones')
    blacklist.read_bed(bed_deadzones)
    return peak_regions.subtract(blacklist, whole_region=True)
Пример #2
0
def filter_deadzones(bed_deadzones, peak_regions):
    """Return ``peak_regions`` with the deadzones of a BED file subtracted.

    ``bed_deadzones`` is read into a ``GenomicRegionSet`` named 'deadzones';
    the subtraction is delegated to ``peak_regions.subtract`` with
    ``whole_region=True`` and its result is returned unchanged.
    """
    blacklist = GenomicRegionSet('deadzones')
    blacklist.read_bed(bed_deadzones)
    return peak_regions.subtract(blacklist, whole_region=True)
Пример #3
0
    def read_bed(self, bedfile, genome_file_dir):
        """Read the sequences defined by a BED file on the given genome.

        *Keyword arguments:*

            - bedfile -- Path to the BED file which defines the regions.
            - genome_file_dir -- Directory searched for one ``<chrom>.fa`` file per chromosome.

        For every chromosome named in the BED file the matching FASTA file is
        loaded and each region is sliced out of its first sequence and appended
        to ``self.sequences``. Chromosomes whose FASTA file cannot be read are
        skipped.
        """

        # Read BED into GenomicRegionSet
        bed = GenomicRegionSet(os.path.basename(bedfile))
        bed.read_bed(bedfile)

        # File names up to their first dot, collected once for membership tests
        chro_files = set(x.split(".")[0] for x in os.listdir(genome_file_dir))

        # Parse each chromosome and fetch the defined regions in this chromosome
        for ch in set(bed.get_chrom()):
            if ch not in chro_files:
                # Warning only: the read below is still attempted, because a
                # chromosome name containing a dot never matches the stems above.
                print(" *** There is no genome FASTA file for: " + ch)

            # Read genome in FASTA according to the given chromosome
            ch_seq = SequenceSet(name=ch, seq_type=SequenceType.DNA)
            try:
                ch_seq.read_fasta(os.path.join(genome_file_dir, ch + ".fa"))
            except Exception:
                # Best effort: skip unreadable chromosomes, but let
                # KeyboardInterrupt / SystemExit propagate.
                continue

            # Regions in given chromosome
            for s in bed.any_chrom(chrom=ch):
                seq = ch_seq[0].seq[s.initial:s.final]
                # Regions without a ``strand`` attribute default to the plus strand
                strand = getattr(s, "strand", "+")
                self.sequences.append(Sequence(seq=seq, name=s.__repr__(),
                                               strand=strand))
Пример #4
0
def initialize(name, dims, genome_path, regions, stepsize, binsize, bamfiles, exts,
               inputs, exts_inputs, factors_inputs, chrom_sizes, verbose, no_gc_content,
               tracker, debug, norm_regions, scaling_factors_ip, save_wig, housekeeping_genes,
               test, report, chrom_sizes_dict, counter, end, gc_content_cov=None, avg_gc_content=None,
               gc_hist=None, output_bw=True, save_input=False, m_threshold=80, a_threshold=95, rmdup=False):
    """Build and return the MultiCoverageSet for the given regions.

    ``regions.sequences`` is sorted in place, ``norm_regions`` (when truthy) is
    read from BED into a region set, and the extension sizes are recomputed by
    ``_compute_extension_sizes`` before everything is handed to
    ``MultiCoverageSet``. The ``test`` argument is accepted but not used here.
    """
    # Sort the caller's region set in place before it is forwarded
    regions.sequences.sort()

    # Optional normalisation regions; stays None when no BED path was given
    norm_regionset = None
    if norm_regions:
        norm_regionset = GenomicRegionSet('norm_regions')
        norm_regionset.read_bed(norm_regions)

    exts, exts_inputs = _compute_extension_sizes(bamfiles, exts, inputs, exts_inputs, report)

    settings = dict(
        name=name, regions=regions, dims=dims, genome_path=genome_path,
        binsize=binsize, stepsize=stepsize, rmdup=rmdup,
        path_bamfiles=bamfiles, path_inputs=inputs,
        exts=exts, exts_inputs=exts_inputs, factors_inputs=factors_inputs,
        chrom_sizes=chrom_sizes, chrom_sizes_dict=chrom_sizes_dict,
        verbose=verbose, debug=debug, no_gc_content=no_gc_content,
        norm_regionset=norm_regionset, scaling_factors_ip=scaling_factors_ip,
        save_wig=save_wig, strand_cov=True, housekeeping_genes=housekeeping_genes,
        tracker=tracker, gc_content_cov=gc_content_cov,
        avg_gc_content=avg_gc_content, gc_hist=gc_hist,
        end=end, counter=counter, output_bw=output_bw,
        folder_report=FOLDER_REPORT, report=report, save_input=save_input,
        m_threshold=m_threshold, a_threshold=a_threshold,
    )
    return MultiCoverageSet(**settings)
Пример #5
0
    def create_file(self):
        """Label each MPBS by overlap with the extended summits and write a BED file.

        Summit regions are read from ``self.tfbs_summit_fname`` and resized to
        ``self.peak_ext`` around the summit (offset taken from the last
        whitespace-separated token of ``region.data``). MPBS regions read from
        ``self.mpbs_fname`` get the name suffix ":Y" when returned by the
        intersection with those summits and ":N" when returned by the
        subtraction. The result is written to
        ``<self.output_location>/<self.mpbs_name>.bed``.
        """
        # Floor division keeps the coordinates integral; ``/`` would produce
        # float positions under Python 3 true division.
        half_ext = self.peak_ext // 2

        # Expanding summits
        tfbs_summit_regions = GenomicRegionSet("TFBS Summit Regions")
        tfbs_summit_regions.read_bed(self.tfbs_summit_fname)

        for region in tfbs_summit_regions:
            summit = int(region.data.split()[-1]) + region.initial
            # Clamp at 0 so the window never starts before the chromosome
            region.initial = max(summit - half_ext, 0)
            region.final = summit + half_ext

        # Calculating intersections
        mpbs_regions = GenomicRegionSet("MPBS Regions")
        mpbs_regions.read_bed(self.mpbs_fname)

        tfbs_summit_regions.sort()
        mpbs_regions.sort()

        with_overlap_regions = mpbs_regions.intersect(tfbs_summit_regions, mode=OverlapType.ORIGINAL)
        without_overlap_regions = mpbs_regions.subtract(tfbs_summit_regions, whole_region=True)
        tfbs_regions = GenomicRegionSet("TFBS Regions")

        # Replace everything after the first ":" in the name by the overlap flag
        for region in with_overlap_regions:
            region.name = region.name.split(":")[0] + ":Y"
            tfbs_regions.add(region)

        for region in without_overlap_regions:
            region.name = region.name.split(":")[0] + ":N"
            tfbs_regions.add(region)

        tfbs_regions.sort()

        tfbs_fname = os.path.join(self.output_location, "{}.bed".format(self.mpbs_name))
        tfbs_regions.write_bed(tfbs_fname)
Пример #6
0
def main():
    """Run the variant filtering workflow and write one VCF per sample.

    Steps, in order: load and filter the samples, optionally subtract the
    union of the wildtype variants, optionally run the max. density search,
    optionally intersect with a BED file, report the intersections of the
    samples and finally write ``<sample name>-filtered.vcf`` for each sample.
    """
    # NOTE(review): the result is unpacked into two values, so ``input`` is
    # presumably a project function shadowing the builtin -- confirm.
    options, vcf_list = input()

    samples = load_data(vcf_list)
    print("##Filter variants of samples", file=sys.stderr)
    pipeline(samples, options)

    if not options.list_wt:
        print("#Do not filter by wildtype", file=sys.stderr)
    else:
        wildtypes = load_data(options.list_wt)
        print("##Filter variants of wildtypes", file=sys.stderr)
        pipeline(wildtypes, options)

        # Pool the variants of all wildtypes into a single set
        union_wt = GenomicVariantSet(name="union_wt")
        for wildtype in wildtypes:
            union_wt.sequences += wildtype.sequences

        print("#wildtype variants:", file=sys.stderr)
        print("union WT", len(union_wt), file=sys.stderr, sep="\t")

        # Remove the wildtype variants from every sample
        for sample in samples:
            sample.subtract(union_wt)

        print_length(samples, "#variants after subtracting wildtypes")

    if not options.max_density:
        print("#Do not perform max. density search", file=sys.stderr)
    else:
        get_max_density(GenomicVariantSets=samples,
                        lowerBound=options.lower_bound,
                        upperBound=options.upper_bound)

    if not options.list_bed:
        print("#Do not filter by BED file", file=sys.stderr)
    else:
        motif_regions = GenomicRegionSet('tfbs_motifs')
        motif_regions.read_bed(options.list_bed)

        for sample in samples:
            sample.intersect(motif_regions)

        print_length(samples, "#variants after filtering by BED file")

    print("#Compute intersection of sample's subsets (give intersection's name and size)")
    output_intersections(samples)

    print("#Write filtered sample files")
    for sample in samples:
        sample.write_vcf("%s-filtered.vcf" % sample.name)
Пример #7
0
def main():
    """Run the variant filtering workflow and write one VCF per sample.

    Steps, in order: load and filter the samples, optionally subtract the
    union of the wildtype variants, optionally run the max. density search,
    optionally intersect with a BED file, report the intersections of the
    samples and finally write ``<sample name>-filtered.vcf`` for each sample.
    """
    # NOTE(review): the result is unpacked into two values, so ``input`` is
    # presumably a project function shadowing the builtin -- confirm.
    options, vcf_list = input()

    samples = load_data(vcf_list)
    print("##Filter variants of samples", file=sys.stderr)
    pipeline(samples, options)

    if not options.list_wt:
        print("#Do not filter by wildtype", file=sys.stderr)
    else:
        wildtypes = load_data(options.list_wt)
        print("##Filter variants of wildtypes", file=sys.stderr)
        pipeline(wildtypes, options)

        # Pool the variants of all wildtypes into a single set
        union_wt = GenomicVariantSet(name="union_wt")
        for wildtype in wildtypes:
            union_wt.sequences += wildtype.sequences

        print("#wildtype variants:", file=sys.stderr)
        print("union WT", len(union_wt), file=sys.stderr, sep="\t")

        # Remove the wildtype variants from every sample
        for sample in samples:
            sample.subtract(union_wt)

        print_length(samples, "#variants after subtracting wildtypes")

    if not options.max_density:
        print("#Do not perform max. density search", file=sys.stderr)
    else:
        get_max_density(GenomicVariantSets=samples,
                        lowerBound=options.lower_bound,
                        upperBound=options.upper_bound)

    if not options.list_bed:
        print("#Do not filter by BED file", file=sys.stderr)
    else:
        motif_regions = GenomicRegionSet('tfbs_motifs')
        motif_regions.read_bed(options.list_bed)

        for sample in samples:
            sample.intersect(motif_regions)

        print_length(samples, "#variants after filtering by BED file")

    print("#Compute intersection of sample's subsets (give intersection's name and size)")
    output_intersections(samples)

    print("#Write filtered sample files")
    for sample in samples:
        sample.write_vcf("%s-filtered.vcf" % sample.name)
Пример #8
0
def initialize(name, dims, genome_path, regions, stepsize, binsize, bamfiles, exts,
               inputs, exts_inputs, factors_inputs, chrom_sizes, verbose, no_gc_content,
               tracker, debug, norm_regions, scaling_factors_ip, save_wig, housekeeping_genes,
               test, report, chrom_sizes_dict, counter, end, gc_content_cov=None, avg_gc_content=None,
               gc_hist=None, output_bw=True, save_input=False, m_threshold=80, a_threshold=95, rmdup=False):
    """Build and return the MultiCoverageSet for the given regions.

    ``regions.sequences`` is sorted in place, ``norm_regions`` (when truthy) is
    read from BED into a region set, and the extension sizes are recomputed by
    ``_compute_extension_sizes`` before everything is handed to
    ``MultiCoverageSet``. The ``test`` argument is accepted but not used here.
    """
    # Sort the caller's region set in place before it is forwarded
    regions.sequences.sort()

    # Optional normalisation regions; stays None when no BED path was given
    norm_regionset = None
    if norm_regions:
        norm_regionset = GenomicRegionSet('norm_regions')
        norm_regionset.read_bed(norm_regions)

    exts, exts_inputs = _compute_extension_sizes(bamfiles, exts, inputs, exts_inputs, report)

    settings = dict(
        name=name, regions=regions, dims=dims, genome_path=genome_path,
        binsize=binsize, stepsize=stepsize, rmdup=rmdup,
        path_bamfiles=bamfiles, path_inputs=inputs,
        exts=exts, exts_inputs=exts_inputs, factors_inputs=factors_inputs,
        chrom_sizes=chrom_sizes, chrom_sizes_dict=chrom_sizes_dict,
        verbose=verbose, debug=debug, no_gc_content=no_gc_content,
        norm_regionset=norm_regionset, scaling_factors_ip=scaling_factors_ip,
        save_wig=save_wig, strand_cov=True, housekeeping_genes=housekeeping_genes,
        tracker=tracker, gc_content_cov=gc_content_cov,
        avg_gc_content=avg_gc_content, gc_hist=gc_hist,
        end=end, counter=counter, output_bw=output_bw,
        folder_report=FOLDER_REPORT, report=report, save_input=save_input,
        m_threshold=m_threshold, a_threshold=a_threshold,
    )
    return MultiCoverageSet(**settings)
Пример #9
0
def merge_DBD_regions(path):
    """Merge the available DBD regions (BED format) of every sub-directory.

    For each directory ``<path>/<t>`` the files ``<t>/<rna>/DBD_<rna>.bed``
    that exist are read, their region names are prefixed with ``<rna>_``,
    and the pooled regions are written to ``<path>/<t>/DBD_<t>.bed``.
    Entries of ``path`` that are not directories are ignored.
    """
    for t in os.listdir(path):
        group_dir = os.path.join(path, t)
        if not os.path.isdir(group_dir):
            continue

        dbd_pool = GenomicRegionSet(t)
        for rna in os.listdir(group_dir):
            bed_path = os.path.join(group_dir, rna, "DBD_" + rna + ".bed")
            if not os.path.exists(bed_path):
                continue
            dbd = GenomicRegionSet(rna)
            dbd.read_bed(bed_path)
            # Tag every region with the RNA it belongs to
            for region in dbd:
                region.name = rna + "_" + region.name
            dbd_pool.combine(dbd)

        # Written even when no per-RNA file was found for this directory
        dbd_pool.write_bed(os.path.join(group_dir, "DBD_" + t + ".bed"))
Пример #10
0
def initialize(name, dims, genome_path, regions, stepsize, binsize, bamfiles, exts, \
               inputs, exts_inputs, factors_inputs, chrom_sizes, verbose, no_gc_content, \
               tracker, debug, norm_regions, scaling_factors_ip, save_wig, housekeeping_genes):
    """Initialize the MultiCoverageSet.

    The region set is built either from ``regions`` (a tab-separated file with
    chromosome, start and end in its first three columns) or, when ``regions``
    is None, from ``chrom_sizes`` (tab-separated chromosome and size), in which
    case every chromosome is covered from 0 to its size. Blank lines in either
    file are skipped. When ``housekeeping_genes`` is truthy the IP scaling
    factors are recomputed with ``norm_gene_level``. Scaling factors and
    extension sizes are reported through ``tracker.write``.

    Returns the constructed ``MultiCoverageSet``.
    """

    regionset = GenomicRegionSet(name)
    chrom_sizes_dict = {}
    # If the regions option is set, take its values; otherwise use the whole
    # set of chromosomes as regions to search for DPs
    if regions is not None:
        print("Call DPs on specified regions.", file=sys.stderr)
        with open(regions) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank (e.g. trailing) line would otherwise raise IndexError
                    continue
                fields = line.split('\t')
                c, s, e = fields[0], int(fields[1]), int(fields[2])
                regionset.add(GenomicRegion(chrom=c, initial=s, final=e))
                # NOTE(review): every region overwrites the entry of its
                # chromosome, so the end of the last listed region wins --
                # confirm that this is the intended chromosome size.
                chrom_sizes_dict[c] = e
    else:
        print("Call DPs on whole genome.", file=sys.stderr)
        with open(chrom_sizes) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                fields = line.split('\t')
                chrom, end = fields[0], int(fields[1])
                regionset.add(GenomicRegion(chrom=chrom, initial=0, final=end))
                chrom_sizes_dict[chrom] = end

    if norm_regions:
        norm_regionset = GenomicRegionSet('norm_regions')
        norm_regionset.read_bed(norm_regions)
    else:
        norm_regionset = None

    if housekeeping_genes:
        scaling_factors_ip, _ = norm_gene_level(bamfiles, housekeeping_genes, name, verbose=True)

    if scaling_factors_ip:
        # A real list: on Python 3 ``map`` would hand over a one-shot iterator
        tracker.write(text=[str(x) for x in scaling_factors_ip], header="Scaling factors")

    regionset.sequences.sort()
    exts, exts_inputs = _compute_extension_sizes(bamfiles, exts, inputs, exts_inputs, verbose)
    tracker.write(text=str(exts).strip('[]'), header="Extension size (rep1, rep2, input1, input2)")

    multi_cov_set = MultiCoverageSet(name=name, regions=regionset, dims=dims, genome_path=genome_path,
                                     binsize=binsize, stepsize=stepsize, rmdup=True,
                                     path_bamfiles=bamfiles, path_inputs=inputs, exts=exts,
                                     exts_inputs=exts_inputs, factors_inputs=factors_inputs,
                                     chrom_sizes=chrom_sizes, verbose=verbose, no_gc_content=no_gc_content,
                                     chrom_sizes_dict=chrom_sizes_dict, debug=debug,
                                     norm_regionset=norm_regionset, scaling_factors_ip=scaling_factors_ip,
                                     save_wig=save_wig)

    return multi_cov_set
Пример #11
0
    def read_bed(self, bedfile, genome_file_dir):
        """Load the sequences of the regions listed in a BED file.

        *Keyword arguments:*

            - bedfile -- Path of the BED file describing the regions.
            - genome_file_dir -- Directory with the per-chromosome FASTA files; forwarded to ``read_genomic_set``.
        """
        # Imported inside the method, exactly where the region set is needed
        from rgt.GenomicRegionSet import GenomicRegionSet

        # The region set is named after the BED file itself
        set_name = os.path.basename(bedfile)
        region_set = GenomicRegionSet(set_name)
        region_set.read_bed(bedfile)

        # The actual sequence extraction is delegated
        self.read_genomic_set(region_set, genome_file_dir)
Пример #12
0
    def read_bed(self, bedfile, genome_file_dir):
        """Load the sequences of the regions listed in a BED file.

        *Keyword arguments:*

            - bedfile -- Path of the BED file describing the regions.
            - genome_file_dir -- Directory with the per-chromosome FASTA files; forwarded to ``read_genomic_set``.
        """
        # Imported inside the method, exactly where the region set is needed
        from rgt.GenomicRegionSet import GenomicRegionSet

        # The region set is named after the BED file itself
        set_name = os.path.basename(bedfile)
        region_set = GenomicRegionSet(set_name)
        region_set.read_bed(bedfile)

        # The actual sequence extraction is delegated
        self.read_genomic_set(region_set, genome_file_dir)
Пример #13
0
def initialize(name, dims, genome_path, regions, stepsize, binsize, bamfiles, exts, \
               inputs, exts_inputs, factors_inputs, chrom_sizes, verbose, no_gc_content, \
               tracker, debug, norm_regions, scaling_factors_ip, save_wig):
    """Initialize the MultiCoverageSet.

    The region set is built either from ``regions`` (a tab-separated file with
    chromosome, start and end in its first three columns) or, when ``regions``
    is None, from ``chrom_sizes`` (tab-separated chromosome and size), in which
    case every chromosome is covered from 0 to its size. Blank lines in either
    file are skipped. The extension sizes are reported through
    ``tracker.write``.

    Returns the constructed ``MultiCoverageSet``.
    """

    regionset = GenomicRegionSet(name)
    chrom_sizes_dict = {}
    # If the regions option is set, take its values; otherwise use the whole
    # set of chromosomes as regions to search for DPs
    if regions is not None:
        print("Call DPs on specified regions.", file=sys.stderr)
        with open(regions) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank (e.g. trailing) line would otherwise raise IndexError
                    continue
                fields = line.split('\t')
                c, s, e = fields[0], int(fields[1]), int(fields[2])
                regionset.add(GenomicRegion(chrom=c, initial=s, final=e))
                # NOTE(review): every region overwrites the entry of its
                # chromosome, so the end of the last listed region wins --
                # confirm that this is the intended chromosome size.
                chrom_sizes_dict[c] = e
    else:
        print("Call DPs on whole genome.", file=sys.stderr)
        with open(chrom_sizes) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                fields = line.split('\t')
                chrom, end = fields[0], int(fields[1])
                regionset.add(GenomicRegion(chrom=chrom, initial=0, final=end))
                chrom_sizes_dict[chrom] = end

    if norm_regions:
        norm_regionset = GenomicRegionSet('norm_regions')
        norm_regionset.read_bed(norm_regions)
    else:
        norm_regionset = None

    regionset.sequences.sort()
    exts, exts_inputs = _compute_extension_sizes(bamfiles, exts, inputs, exts_inputs, verbose)
    tracker.write(text=str(exts).strip('[]'), header="Extension size (rep1, rep2, input1, input2)")

    multi_cov_set = MultiCoverageSet(name=name, regions=regionset, dims=dims, genome_path=genome_path,
                                     binsize=binsize, stepsize=stepsize, rmdup=True,
                                     path_bamfiles=bamfiles, path_inputs=inputs, exts=exts,
                                     exts_inputs=exts_inputs, factors_inputs=factors_inputs,
                                     chrom_sizes=chrom_sizes, verbose=verbose, no_gc_content=no_gc_content,
                                     chrom_sizes_dict=chrom_sizes_dict, debug=debug,
                                     norm_regionset=norm_regionset, scaling_factors_ip=scaling_factors_ip,
                                     save_wig=save_wig)

    return multi_cov_set
Пример #14
0
def get_experimental_matrix(bams, bed):
    """Assemble an ExperimentalMatrix by hand from BAM paths and one BED file.

    Each BAM is registered in ``files`` under its base name without extension
    and typed as 'reads'; the regions read from ``bed`` are stored under the
    name 'housekeep' with type 'regions'. Returns the filled matrix.
    """
    matrix = ExperimentalMatrix()
    matrix.fields = ['name', 'type', 'file']
    matrix.fieldsDict = {}

    sample_names = []
    for bam_path in bams:
        # File name stripped of directory and extension
        stem = os.path.splitext(os.path.basename(bam_path))[0]
        matrix.files[stem] = bam_path
        sample_names.append(stem)

    # The region entry comes first, followed by one 'reads' entry per BAM
    matrix.names = np.array(['housekeep'] + sample_names)
    matrix.types = np.array(['regions'] + len(sample_names) * ['reads'])

    housekeeping = GenomicRegionSet('RegionSet')
    housekeeping.read_bed(bed)
    matrix.objectsDict['housekeep'] = housekeeping

    return matrix
Пример #15
0
def get_experimental_matrix(bams, bed):
    """Assemble an ExperimentalMatrix by hand from BAM paths and one BED file.

    Each BAM is registered in ``files`` under its base name without extension
    and typed as 'reads'; the regions read from ``bed`` are stored under the
    name 'housekeep' with type 'regions'. Returns the filled matrix.
    """
    matrix = ExperimentalMatrix()
    matrix.fields = ['name', 'type', 'file']
    matrix.fieldsDict = {}

    sample_names = []
    for bam_path in bams:
        # File name stripped of directory and extension
        stem = os.path.splitext(os.path.basename(bam_path))[0]
        matrix.files[stem] = bam_path
        sample_names.append(stem)

    # The region entry comes first, followed by one 'reads' entry per BAM
    matrix.names = np.array(['housekeep'] + sample_names)
    matrix.types = np.array(['regions'] + len(sample_names) * ['reads'])

    housekeeping = GenomicRegionSet('RegionSet')
    housekeeping.read_bed(bed)
    matrix.objectsDict['housekeep'] = housekeeping

    return matrix
Пример #16
0
def get_dbss(input_BED,output_BED,rna_fasta,output_rbss,organism,l,e,c,fr,fm,of,mf,rm,temp):
    """Search triplexes between an RNA and target regions and write the results.

    The target regions are read from ``input_BED``; the triplexes returned by
    ``find_triplex`` are written to ``output_BED`` and to the same path with
    ".bed" replaced by ".txp". Their RNA binding sites are passed to
    ``dbd_regions`` with ``output_rbss`` as output. The ``rm`` argument is
    accepted but not used here.
    """
    target_regions = GenomicRegionSet("Target")
    target_regions.read_bed(input_BED)
    target_regions.gene_association(organism=organism, show_dis=True)

    connect_rna(rna_fasta, temp=temp, rna_name="RNA")
    # The loaded set is not used afterwards; the read itself is kept as-is
    SequenceSet(name="rna", seq_type=SequenceType.RNA).read_fasta(os.path.join(temp, "rna_temp.fa"))
    rna_exons = get_rna_region_str(os.path.join(temp, rna_fasta))

    genome_path = GenomeData(organism).get_genome()
    triplex_options = dict(rna_fasta=rna_fasta, dna_region=target_regions,
                           temp=temp, organism=organism, remove_temp=False,
                           l=l, e=e, c=c, fr=fr, fm=fm, of=of, mf=mf,
                           genome_path=genome_path,
                           prefix="targeted_region", dna_fine_posi=True)
    txp = find_triplex(**triplex_options)

    print("Total binding events:\t",str(len(txp)))
    txp.write_bed(output_BED)
    txp.write_txp(filename=output_BED.replace(".bed",".txp"))

    dbd_regions(exons=rna_exons, sig_region=txp.get_rbs(), rna_name="rna",
                output=output_rbss, out_file=True, temp=temp, fasta=False)
0
    def create_file(self):
        """Label each MPBS by overlap with the extended summits and write a BED file.

        Summit regions are read from ``self.tfbs_summit_fname`` and resized to
        ``self.peak_ext`` around the summit (offset taken from the last
        whitespace-separated token of ``region.data``). MPBS regions read from
        ``self.mpbs_fname`` get the name suffix ":Y" when returned by the
        intersection with those summits and ":N" when returned by the
        subtraction. The result is written to
        ``<self.output_location>/<self.mpbs_name>.bed``.
        """
        # Floor division keeps the coordinates integral; ``/`` would produce
        # float positions under Python 3 true division.
        half_ext = self.peak_ext // 2

        # Expanding summits
        tfbs_summit_regions = GenomicRegionSet("TFBS Summit Regions")
        tfbs_summit_regions.read_bed(self.tfbs_summit_fname)

        for region in tfbs_summit_regions:
            summit = int(region.data.split()[-1]) + region.initial
            # Clamp at 0 so the window never starts before the chromosome
            region.initial = max(summit - half_ext, 0)
            region.final = summit + half_ext

        # Calculating intersections
        mpbs_regions = GenomicRegionSet("MPBS Regions")
        mpbs_regions.read_bed(self.mpbs_fname)

        tfbs_summit_regions.sort()
        mpbs_regions.sort()

        with_overlap_regions = mpbs_regions.intersect(
            tfbs_summit_regions, mode=OverlapType.ORIGINAL)
        without_overlap_regions = mpbs_regions.subtract(tfbs_summit_regions,
                                                        whole_region=True)
        tfbs_regions = GenomicRegionSet("TFBS Regions")

        # Replace everything after the first ":" in the name by the overlap flag
        for region in with_overlap_regions:
            region.name = region.name.split(":")[0] + ":Y"
            tfbs_regions.add(region)

        for region in without_overlap_regions:
            region.name = region.name.split(":")[0] + ":N"
            tfbs_regions.add(region)

        tfbs_regions.sort()

        tfbs_fname = os.path.join(self.output_location,
                                  "{}.bed".format(self.mpbs_name))
        tfbs_regions.write_bed(tfbs_fname)
Пример #18
0
    def chip_evaluate(self):
        """
        Evaluate the footprint predictions against motif-predicted binding
        sites (MPBSs) combined with TF ChIP-seq data.

        For every footprint file a ROC curve and a precision-recall curve are
        computed. "SEG" predictions are scored through the MPBS file: MPBSs
        returned by the intersection with the footprints get their score
        raised by (maximum MPBS score + 1), the others keep theirs. "SC"
        predictions are ranked by their own score. The statistics are written
        to ``<output_location><tf_name>_stats.txt``, the curves are plotted on
        request and the curve points are passed to ``self.output_points``.
        """

        # Per-prediction results, keyed by the index of the footprint file
        fpr, tpr = {}, {}
        roc_auc, roc_auc_1, roc_auc_2 = {}, {}, {}
        recall, precision, prc_auc = {}, {}, {}

        if "SEG" in self.footprint_type:
            mpbs_regions = GenomicRegionSet("TFBS")
            mpbs_regions.read_bed(self.tfbs_file)
            mpbs_regions.sort()

            # One more than the largest score found in the MPBS file
            max_score = -99999999
            for region in mpbs_regions:
                max_score = max(max_score, int(region.data))
            max_score += 1

        for i, footprint_fname in enumerate(self.footprint_file):
            footprints_regions = GenomicRegionSet("Footprints Prediction")
            footprints_regions.read_bed(footprint_fname)

            # Sort footprint prediction bed files
            footprints_regions.sort()

            kind = self.footprint_type[i]
            if kind == "SEG":
                # Raise the score of every MPBS entry returned by the
                # intersection with the predicted footprints.
                ranked_regions = GenomicRegionSet("Increased Regions")
                overlapping = mpbs_regions.intersect(footprints_regions,
                                                     mode=OverlapType.ORIGINAL)
                for region in overlapping:
                    region.data = str(int(region.data) + max_score)
                    ranked_regions.add(region)

                # The remaining MPBS entries keep their score unchanged
                untouched = mpbs_regions.subtract(footprints_regions,
                                                  whole_region=True)
                for region in untouched:
                    ranked_regions.add(region)

                ranked_regions.sort_score()
            elif kind == "SC":
                footprints_regions.sort_score()
                ranked_regions = footprints_regions
            else:
                # Other footprint types produce no statistics for this index
                continue

            (fpr[i], tpr[i], roc_auc[i],
             roc_auc_1[i], roc_auc_2[i]) = self.roc_curve(ranked_regions)
            (recall[i], precision[i],
             prc_auc[i]) = self.precision_recall_curve(ranked_regions)

        # Output the statistics results into text
        stats_fname = self.output_location + self.tf_name + "_stats.txt"
        stats_header = ["METHOD", "AUC_100", "AUC_10", "AUC_1", "AUPR"]
        with open(stats_fname, "w") as stats_file:
            stats_file.write("\t".join(stats_header) + "\n")
            for i, method_name in enumerate(self.footprint_name):
                row = [method_name, str(roc_auc[i]), str(roc_auc_1[i]),
                       str(roc_auc_2[i]), str(prc_auc[i])]
                stats_file.write("\t".join(row) + "\n")

        # Output the curves
        if self.print_roc_curve:
            self.plot_curve(fpr, tpr, roc_auc, "False Positive Rate",
                            "True Positive Rate", self.tf_name, "ROC")
        if self.print_pr_curve:
            self.plot_curve(recall, precision, prc_auc, "Recall", "Precision",
                            self.tf_name, "PRC")

        self.output_points(self.tf_name, fpr, tpr, recall, precision)
Пример #19
0
    def line(self):
        signal = GenomicSignal(self.bam_file)
        signal.load_sg_coefs(slope_window_size=9)
        bias_table = BiasTable()
        bias_table_list = self.bias_table.split(",")
        table = bias_table.load_table(table_file_name_F=bias_table_list[0],
                                      table_file_name_R=bias_table_list[1])
        genome_data = GenomeData(self.organism)
        fasta = Fastafile(genome_data.get_genome())
        pwm_dict = dict([("A", [0.0] * self.window_size),
                         ("C", [0.0] * self.window_size),
                         ("G", [0.0] * self.window_size),
                         ("T", [0.0] * self.window_size),
                         ("N", [0.0] * self.window_size)])

        mean_raw_signal = np.zeros(self.window_size)
        mean_bc_signal = np.zeros(self.window_size)
        mean_raw_signal_f = np.zeros(self.window_size)
        mean_bc_signal_f = np.zeros(self.window_size)
        mean_raw_signal_r = np.zeros(self.window_size)
        mean_bc_signal_r = np.zeros(self.window_size)

        mean_bias_signal_f = np.zeros(self.window_size)
        mean_bias_signal_r = np.zeros(self.window_size)
        num_sites = 0

        mpbs_regions = GenomicRegionSet("Motif Predicted Binding Sites")
        mpbs_regions.read_bed(self.motif_file)

        total_nc_signal = 0
        total_nl_signal = 0
        total_nr_signal = 0

        for region in mpbs_regions:
            if str(region.name).split(":")[-1] == "Y":
                num_sites += 1
                # Extend by 50 bp
                mid = (region.initial + region.final) / 2
                p1 = mid - (self.window_size / 2)
                p2 = mid + (self.window_size / 2)

                if not self.strands_specific:
                    # Fetch raw signal
                    raw_signal, _ = signal.get_signal(
                        ref=region.chrom,
                        start=p1,
                        end=p2,
                        downstream_ext=self.atac_downstream_ext,
                        upstream_ext=self.atac_upstream_ext,
                        forward_shift=self.atac_forward_shift,
                        reverse_shift=self.atac_reverse_shift,
                        genome_file_name=genome_data.get_genome())

                    mean_raw_signal = np.add(mean_raw_signal, raw_signal)

                    # Fetch bias correction signal
                    bc_signal, _ = signal.get_signal(
                        ref=region.chrom,
                        start=p1,
                        end=p2,
                        bias_table=table,
                        downstream_ext=self.atac_downstream_ext,
                        upstream_ext=self.atac_upstream_ext,
                        forward_shift=self.atac_forward_shift,
                        reverse_shift=self.atac_reverse_shift,
                        genome_file_name=genome_data.get_genome())

                    mean_bc_signal = np.add(mean_bc_signal, bc_signal)
                else:
                    raw_signal_f, _, raw_signal_r, _ = signal.get_signal_per_strand(
                        ref=region.chrom,
                        start=p1,
                        end=p2,
                        downstream_ext=self.atac_downstream_ext,
                        upstream_ext=self.atac_upstream_ext,
                        forward_shift=self.atac_forward_shift,
                        reverse_shift=self.atac_reverse_shift,
                        genome_file_name=genome_data.get_genome())
                    mean_raw_signal_f = np.add(mean_raw_signal_f, raw_signal_f)
                    mean_raw_signal_r = np.add(mean_raw_signal_r, raw_signal_r)

                    bc_signal_f, _, bc_signal_r, _ = signal.get_signal_per_strand(
                        ref=region.chrom,
                        start=p1,
                        end=p2,
                        bias_table=table,
                        downstream_ext=self.atac_downstream_ext,
                        upstream_ext=self.atac_upstream_ext,
                        forward_shift=self.atac_forward_shift,
                        reverse_shift=self.atac_reverse_shift,
                        genome_file_name=genome_data.get_genome())
                    mean_bc_signal_f = np.add(mean_bc_signal_f, bc_signal_f)
                    mean_bc_signal_r = np.add(mean_bc_signal_r, bc_signal_r)

                # Update pwm
                aux_plus = 1
                dna_seq = str(fasta.fetch(region.chrom, p1, p2)).upper()
                if (region.final - region.initial) % 2 == 0:
                    aux_plus = 0
                dna_seq_rev = AuxiliaryFunctions.revcomp(
                    str(fasta.fetch(region.chrom, p1 + aux_plus,
                                    p2 + aux_plus)).upper())
                if region.orientation == "+":
                    for i in range(0, len(dna_seq)):
                        pwm_dict[dna_seq[i]][i] += 1
                elif region.orientation == "-":
                    for i in range(0, len(dna_seq_rev)):
                        pwm_dict[dna_seq_rev[i]][i] += 1

                # Create bias signal
                bias_table_f = table[0]
                bias_table_r = table[1]
                self.k_nb = len(bias_table_f.keys()[0])
                bias_signal_f = []
                bias_signal_r = []
                p1_wk = p1 - int(self.k_nb / 2)
                p2_wk = p2 + int(self.k_nb / 2)
                dna_seq = str(fasta.fetch(region.chrom, p1_wk,
                                          p2_wk - 1)).upper()
                dna_seq_rev = AuxiliaryFunctions.revcomp(
                    str(fasta.fetch(region.chrom, p1_wk, p2_wk + 1)).upper())
                for i in range(int(self.k_nb / 2),
                               len(dna_seq) - int(self.k_nb / 2) + 1):
                    fseq = dna_seq[i - int(self.k_nb / 2):i +
                                   int(self.k_nb / 2)]
                    rseq = dna_seq_rev[len(dna_seq) - int(self.k_nb / 2) -
                                       i:len(dna_seq) + int(self.k_nb / 2) - i]
                    try:
                        bias_signal_f.append(bias_table_f[fseq])
                    except Exception:
                        bias_signal_f.append(1)
                    try:
                        bias_signal_r.append(bias_table_r[rseq])
                    except Exception:
                        bias_signal_r.append(1)

                mean_bias_signal_f = np.add(mean_bias_signal_f,
                                            np.array(bias_signal_f))
                mean_bias_signal_r = np.add(mean_bias_signal_r,
                                            np.array(bias_signal_r))

                if self.protection_score:
                    # signal in the center of the MPBS
                    p1 = region.initial
                    p2 = region.final
                    nc_signal, _ = signal.get_signal(
                        ref=region.chrom,
                        start=p1,
                        end=p2,
                        bias_table=table,
                        downstream_ext=self.atac_downstream_ext,
                        upstream_ext=self.atac_upstream_ext,
                        forward_shift=self.atac_forward_shift,
                        reverse_shift=self.atac_reverse_shift,
                        genome_file_name=genome_data.get_genome())
                    total_nc_signal += sum(nc_signal)
                    p1 = region.final
                    p2 = 2 * region.final - region.initial
                    nr_signal, _ = signal.get_signal(
                        ref=region.chrom,
                        start=p1,
                        end=p2,
                        bias_table=table,
                        downstream_ext=self.atac_downstream_ext,
                        upstream_ext=self.atac_upstream_ext,
                        forward_shift=self.atac_forward_shift,
                        reverse_shift=self.atac_reverse_shift,
                        genome_file_name=genome_data.get_genome())
                    total_nr_signal += sum(nr_signal)
                    p1 = 2 * region.initial - region.final
                    p2 = region.final
                    nl_signal, _ = signal.get_signal(
                        ref=region.chrom,
                        start=p1,
                        end=p2,
                        bias_table=table,
                        downstream_ext=self.atac_downstream_ext,
                        upstream_ext=self.atac_upstream_ext,
                        forward_shift=self.atac_forward_shift,
                        reverse_shift=self.atac_reverse_shift,
                        genome_file_name=genome_data.get_genome())
                    total_nl_signal += sum(nl_signal)

        mean_raw_signal = mean_raw_signal / num_sites
        mean_bc_signal = mean_bc_signal / num_sites

        mean_raw_signal_f = mean_raw_signal_f / num_sites
        mean_raw_signal_r = mean_raw_signal_r / num_sites
        mean_bc_signal_f = mean_bc_signal_f / num_sites
        mean_bc_signal_r = mean_bc_signal_r / num_sites

        mean_bias_signal_f = mean_bias_signal_f / num_sites
        mean_bias_signal_r = mean_bias_signal_r / num_sites

        protection_score = (total_nl_signal + total_nr_signal -
                            2 * total_nc_signal) / (2 * num_sites)

        # Output PWM and create logo
        pwm_fname = os.path.join(self.output_loc,
                                 "{}.pwm".format(self.motif_name))
        pwm_file = open(pwm_fname, "w")
        for e in ["A", "C", "G", "T"]:
            pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[e]]) + "\n")
        pwm_file.close()

        logo_fname = os.path.join(self.output_loc,
                                  "{}.logo.eps".format(self.motif_name))
        pwm = motifs.read(open(pwm_fname), "pfm")
        pwm.weblogo(logo_fname,
                    format="eps",
                    stack_width="large",
                    stacks_per_line="100",
                    color_scheme="color_classic",
                    unit_name="",
                    show_errorbars=False,
                    logo_title="",
                    show_xaxis=False,
                    xaxis_label="",
                    show_yaxis=False,
                    yaxis_label="",
                    show_fineprint=False,
                    show_ends=False)

        # Output the raw, bias corrected signal and protection score
        output_fname = os.path.join(self.output_loc,
                                    "{}.txt".format(self.motif_name))
        output_file = open(output_fname, "w")
        if not self.strands_specific:
            output_file.write("raw signal: \n" +
                              np.array_str(mean_raw_signal) + "\n")
            output_file.write("bias corrected signal: \n" +
                              np.array_str(mean_bc_signal) + "\n")
        else:
            output_file.write("raw forward signal: \n" +
                              np.array_str(mean_raw_signal_f) + "\n")
            output_file.write("bias corrected forward signal: \n" +
                              np.array_str(mean_bc_signal_f) + "\n")
            output_file.write("raw reverse signal: \n" +
                              np.array_str(mean_raw_signal_r) + "\n")
            output_file.write("bias reverse corrected signal: \n" +
                              np.array_str(mean_bc_signal_r) + "\n")
        output_file.write("forward bias signal: \n" +
                          np.array_str(mean_bias_signal_f) + "\n")
        output_file.write("reverse bias signal: \n" +
                          np.array_str(mean_bias_signal_r) + "\n")
        if self.protection_score:
            output_file.write("protection score: \n" + str(protection_score) +
                              "\n")
        output_file.close()

        if self.strands_specific:
            fig, (ax1, ax2, ax3) = plt.subplots(3, figsize=(12.0, 10.0))
        else:
            fig, (ax1, ax2) = plt.subplots(2)
        x = np.linspace(-50, 49, num=self.window_size)

        ax1.plot(x, mean_bias_signal_f, color='red', label='Forward')
        ax1.plot(x, mean_bias_signal_r, color='blue', label='Reverse')

        ax1.xaxis.set_ticks_position('bottom')
        ax1.yaxis.set_ticks_position('left')
        ax1.spines['top'].set_visible(False)
        ax1.spines['right'].set_visible(False)
        ax1.spines['left'].set_position(('outward', 15))
        ax1.spines['bottom'].set_position(('outward', 5))
        ax1.tick_params(direction='out')

        ax1.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
        ax1.set_xticklabels([
            '-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40',
            '49'
        ])
        min_bias_signal = min(min(mean_bias_signal_f), min(mean_bias_signal_r))
        max_bias_signal = max(max(mean_bias_signal_f), max(mean_bias_signal_r))
        ax1.set_yticks([min_bias_signal, max_bias_signal])
        ax1.set_yticklabels(
            [str(round(min_bias_signal, 2)),
             str(round(max_bias_signal, 2))],
            rotation=90)

        ax1.text(-48,
                 max_bias_signal,
                 '# Sites = {}'.format(str(num_sites)),
                 fontweight='bold')
        ax1.set_title(self.motif_name, fontweight='bold')
        ax1.set_xlim(-50, 49)
        ax1.set_ylim([min_bias_signal, max_bias_signal])
        ax1.legend(loc="upper right", frameon=False)
        ax1.set_ylabel("Average Bias \nSignal", rotation=90, fontweight='bold')

        if not self.strands_specific:
            mean_raw_signal = self.standardize(mean_raw_signal)
            mean_bc_signal = self.standardize(mean_bc_signal)
            ax2.plot(x, mean_raw_signal, color='red', label='Uncorrected')
            ax2.plot(x, mean_bc_signal, color='green', label='Corrected')
        else:
            mean_raw_signal_f = self.standardize(mean_raw_signal_f)
            mean_raw_signal_r = self.standardize(mean_raw_signal_r)
            mean_bc_signal_f = self.standardize(mean_bc_signal_f)
            mean_bc_signal_r = self.standardize(mean_bc_signal_r)
            ax2.plot(x, mean_raw_signal_f, color='red', label='Forward')
            ax2.plot(x, mean_raw_signal_r, color='green', label='Reverse')
            ax3.plot(x, mean_bc_signal_f, color='red', label='Forward')
            ax3.plot(x, mean_bc_signal_r, color='green', label='Reverse')

        ax2.xaxis.set_ticks_position('bottom')
        ax2.yaxis.set_ticks_position('left')
        ax2.spines['top'].set_visible(False)
        ax2.spines['right'].set_visible(False)
        ax2.spines['left'].set_position(('outward', 15))
        ax2.tick_params(direction='out')
        ax2.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
        ax2.set_xticklabels([
            '-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40',
            '49'
        ])
        ax2.set_yticks([0, 1])
        ax2.set_yticklabels([str(0), str(1)], rotation=90)
        ax2.set_xlim(-50, 49)
        ax2.set_ylim([0, 1])

        if not self.strands_specific:
            ax2.spines['bottom'].set_position(('outward', 40))
            ax2.set_xlabel("Coordinates from Motif Center", fontweight='bold')
            ax2.set_ylabel("Average ATAC-seq \nSignal",
                           rotation=90,
                           fontweight='bold')
            ax2.legend(loc="center",
                       frameon=False,
                       bbox_to_anchor=(0.85, 0.06))
        else:
            ax2.spines['bottom'].set_position(('outward', 5))
            ax2.set_ylabel("Average ATAC-seq \n Uncorrected Signal",
                           rotation=90,
                           fontweight='bold')
            ax2.legend(loc="lower right", frameon=False)

            ax3.xaxis.set_ticks_position('bottom')
            ax3.yaxis.set_ticks_position('left')
            ax3.spines['top'].set_visible(False)
            ax3.spines['right'].set_visible(False)
            ax3.spines['left'].set_position(('outward', 15))
            ax3.tick_params(direction='out')
            ax3.set_xticks([-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49])
            ax3.set_xticklabels([
                '-50', '-40', '-30', '-20', '-10', '0', '10', '20', '30', '40',
                '49'
            ])
            ax3.set_yticks([0, 1])
            ax3.set_yticklabels([str(0), str(1)], rotation=90)
            ax3.set_xlim(-50, 49)
            ax3.set_ylim([0, 1])
            ax3.legend(loc="lower right", frameon=False)
            ax3.spines['bottom'].set_position(('outward', 40))
            ax3.set_xlabel("Coordinates from Motif Center", fontweight='bold')
            ax3.set_ylabel("Average ATAC-seq \n Corrected Signal",
                           rotation=90,
                           fontweight='bold')
            ax3.text(-48,
                     0.05,
                     '# K-mer = {}\n# Forward Shift = {}'.format(
                         str(self.k_nb), str(self.atac_forward_shift)),
                     fontweight='bold')

        figure_name = os.path.join(self.output_loc,
                                   "{}.line.eps".format(self.motif_name))
        fig.subplots_adjust(bottom=.2, hspace=.5)
        fig.tight_layout()
        fig.savefig(figure_name, format="eps", dpi=300)

        # Creating canvas and printing eps / pdf with merged results
        output_fname = os.path.join(self.output_loc,
                                    "{}.eps".format(self.motif_name))
        c = pyx.canvas.canvas()
        c.insert(pyx.epsfile.epsfile(0, 0, figure_name, scale=1.0))
        if self.strands_specific:
            c.insert(
                pyx.epsfile.epsfile(2.76,
                                    1.58,
                                    logo_fname,
                                    width=27.2,
                                    height=2.45))
        else:
            c.insert(
                pyx.epsfile.epsfile(2.5,
                                    1.54,
                                    logo_fname,
                                    width=16,
                                    height=1.75))
        c.writeEPSfile(output_fname)
        os.system("epstopdf " + figure_name)
        os.system("epstopdf " + logo_fname)
        os.system("epstopdf " + output_fname)
Пример #20
0
    def read_states_signals(self):
        """Read the annotated states and compute the signals of the region.

        The annotation file ``self.annotate_fname`` is scanned line by line.
        Lines shorter than two characters, or containing ``#`` or ``=``, are
        ignored. Every other line is stripped and split on single spaces, and
        all of its tokens except the first and the last are appended to the
        state string.

        When ``self.estimate_bias_correction`` is set, a bias table is
        estimated (``"FRE"`` or ``"PWM"`` flavour, chosen by
        ``self.estimate_bias_type``) and written below ``<output>/Bias``.
        When ``self.bias_table`` is set, it is split on ``,`` into the forward
        and reverse table files and the loaded table replaces any estimated
        one.

        Returns:
            tuple: ``(states, norm_signal, slope_signal)`` where the two
            signals are whatever ``GenomicSignal.get_signal`` returns for
            ``self.chrom:self.start-self.end``.
        """
        # Gather the state tokens first and join them once at the end.
        state_tokens = []
        with open(self.annotate_fname) as annotate_file:
            for line in annotate_file:
                if len(line) < 2 or "#" in line or "=" in line:
                    continue
                fields = line.strip().split(" ")
                state_tokens.extend(fields[1:-1])
        states = "".join(state_tokens)

        bias_table = BiasTable(output_loc=self.output_locaiton)
        genome_data = GenomeData(self.organism)
        table = None

        # Optional step 1: estimate the bias table from the original regions.
        if self.estimate_bias_correction:
            regions = GenomicRegionSet("Bias Regions")
            extension = self.original_regions.split(".")[-1]
            if extension == "bed":
                regions.read_bed(self.original_regions)
            elif extension == "fa":
                regions.read_sequence(self.original_regions)

            if self.estimate_bias_type == "FRE":
                estimate = bias_table.estimate_table
            elif self.estimate_bias_type == "PWM":
                estimate = bias_table.estimate_table_pwm
            else:
                # Unknown estimation type: the table stays None.
                estimate = None

            if estimate is not None:
                table = estimate(regions=regions,
                                 dnase_file_name=self.bam_file,
                                 genome_file_name=genome_data.get_genome(),
                                 k_nb=self.k_nb,
                                 forward_shift=self.atac_forward_shift,
                                 reverse_shift=self.atac_reverse_shift)

            table_tag = "{}_{}".format(self.k_nb, self.atac_forward_shift)
            bias_fname = os.path.join(self.output_locaiton, "Bias", table_tag)
            bias_table.write_tables(bias_fname, table)

        # Optional step 2: a user-provided table overrides the estimated one.
        if self.bias_table:
            table_files = self.bias_table.split(",")
            table = bias_table.load_table(table_file_name_F=table_files[0],
                                          table_file_name_R=table_files[1])

        # Normalized and slope signals computed from the raw BAM file.
        raw_signal = GenomicSignal(self.bam_file)
        raw_signal.load_sg_coefs(slope_window_size=9)
        signals = raw_signal.get_signal(
            ref=self.chrom,
            start=self.start,
            end=self.end,
            downstream_ext=self.atac_downstream_ext,
            upstream_ext=self.atac_upstream_ext,
            forward_shift=self.atac_forward_shift,
            reverse_shift=self.atac_reverse_shift,
            initial_clip=self.atac_initial_clip,
            bias_table=table,
            genome_file_name=genome_data.get_genome(),
            print_raw_signal=self.print_raw_signal,
            print_bc_signal=self.print_bc_signal,
            print_norm_signal=self.print_norm_signal,
            print_slope_signal=self.print_slope_signal)
        norm_signal, slope_signal = signals

        if self.print_bed_file:
            self.output_bed_file(states)

        return states, norm_signal, slope_signal
Пример #21
0
class RandomTest:
    def __init__(self,
                 rna_fasta,
                 rna_name,
                 dna_region,
                 organism,
                 showdbs=False):
        """Prepare the RNA, the target regions and the statistics container.

        Args:
            rna_fasta: Path to the FASTA file holding the RNA sequence(s).
            rna_name: Name of the RNA. When falsy, the name of the first
                sequence read from ``rna_fasta`` is used instead.
            dna_region: BED file of the target regions; it is read into a
                ``GenomicRegionSet`` and then passed through
                ``gene_association``.
            organism: Organism identifier handed to ``GenomeData`` and to
                ``gene_association``.
            showdbs: Stored as ``self.showdbs``; ``random_test`` runs the
                DBS-based statistics only when it is true.
        """
        self.organism = organism
        genome = GenomeData(organism)
        self.genome_path = genome.get_genome()
        # RNA: Path to the FASTA file
        self.rna_fasta = rna_fasta
        self.showdbs = showdbs

        rnas = SequenceSet(name="rna", seq_type=SequenceType.RNA)
        rnas.read_fasta(self.rna_fasta)
        if rna_name:
            self.rna_name = rna_name
        else:
            self.rna_name = rnas[0].name

        # DNA: GenomicRegionSet
        self.dna_region = GenomicRegionSet(name="target")
        self.dna_region.read_bed(dna_region)
        self.dna_region = self.dna_region.gene_association(
            organism=self.organism, show_dis=True)

        self.topDBD = []
        # Record the resolved name, not the raw argument: when rna_name is
        # falsy the statistics would otherwise carry None/"" as the RNA name
        # although self.rna_name was taken from the FASTA file.
        self.stat = OrderedDict(name=self.rna_name, genome=organism)
        self.stat["target_regions"] = str(len(self.dna_region))

    def get_rna_region_str(self, rna):
        """Extract the RNA regions from the information header.

        The header is expected to follow the pattern:
                REGION_chr3_51978050_51983935_-_

        The parsed regions are stored in ``self.rna_regions``. When the first
        region has exactly five fields, its last field is stored as a float
        in ``self.rna_expression``; otherwise ``self.rna_expression`` is the
        string ``"n.a."``.
        """
        regions = get_rna_region_str(rna)
        self.rna_regions = regions

        has_expression = bool(regions) and len(regions[0]) == 5
        self.rna_expression = (float(regions[0][-1])
                               if has_expression else "n.a.")

    def connect_rna(self, rna, temp):
        """Run the module-level ``connect_rna`` and record its result.

        The first element of the result is stored (as a string) under
        ``self.stat["exons"]``; the second one is stored (as a string) under
        ``self.stat["seq_length"]`` and unchanged in ``self.rna_len``.
        """
        summary = connect_rna(rna, temp, self.rna_name)

        exon_count = summary[0]
        self.stat["exons"] = str(exon_count)

        sequence_length = summary[1]
        self.stat["seq_length"] = str(sequence_length)
        self.rna_len = sequence_length

    def target_dna(self,
                   temp,
                   remove_temp,
                   cutoff,
                   l,
                   e,
                   c,
                   fr,
                   fm,
                   of,
                   mf,
                   par,
                   obed=False):
        """Count the observed triplexes on the target DNA regions.

        Two triplex searches are run on ``<temp>/rna_temp.fa`` against
        ``self.dna_region``: one with ``dna_fine_posi=False`` (kept as
        ``self.txp``) and one with ``dna_fine_posi=True`` (kept as
        ``self.txpf``). From them the method fills ``self.rbss``,
        ``self.counts_tr``, ``self.counts_dbs``, ``self.region_dbd``,
        ``self.region_dbs``, ``self.region_dbsm``, ``self.region_coverage``
        and a few entries of ``self.stat``.

        Args:
            temp: Working directory containing ``rna_temp.fa``.
            remove_temp: Forwarded to ``find_triplex``.
            cutoff: Forwarded to ``merge_rbs`` of the first search.
            l, e, c, fr, fm, of, mf: Triplexator parameters; stored in
                ``self.triplexator_p`` and forwarded to ``find_triplex``.
            par: Forwarded to ``find_triplex``.
            obed: When truthy, used as filename prefix for the two BED files
                written into ``temp``.
        """
        self.triplexator_p = [l, e, c, fr, fm, of, mf]

        rna_temp_fasta = os.path.join(temp, "rna_temp.fa")
        # Keyword arguments common to both triplex searches.
        shared_search_args = dict(dna_region=self.dna_region,
                                  temp=temp,
                                  organism=self.organism,
                                  remove_temp=remove_temp,
                                  l=l,
                                  e=e,
                                  c=c,
                                  fr=fr,
                                  fm=fm,
                                  of=of,
                                  mf=mf,
                                  par=par,
                                  genome_path=self.genome_path)

        # Search 1: coarse positions, merged per target region.
        region_hits = find_triplex(rna_fasta=rna_temp_fasta,
                                   prefix="targeted_region",
                                   dna_fine_posi=False,
                                   **shared_search_args)
        region_hits.merge_rbs(rm_duplicate=True,
                              region_set=self.dna_region,
                              asgene_organism=self.organism,
                              cutoff=cutoff)
        self.txp = region_hits
        self.stat["DBSs_target_all"] = str(len(self.txp))
        region_hits.remove_duplicates()
        # An empty set of binding domains is tolerated here (no early exit).
        self.rbss = region_hits.merged_dict.keys()

        # Search 2: fine positions, merged onto the domains found above.
        fine_hits = find_triplex(rna_fasta=rna_temp_fasta,
                                 prefix="dbs",
                                 dna_fine_posi=True,
                                 **shared_search_args)
        fine_hits.remove_duplicates()
        fine_hits.merge_rbs(rbss=self.rbss,
                            rm_duplicate=True,
                            asgene_organism=self.organism)
        self.txpf = fine_hits

        # Overwrites the value stored after the first search.
        self.stat["DBSs_target_all"] = str(len(self.txpf))

        self.counts_tr = OrderedDict()
        self.counts_dbs = OrderedDict()

        for rbs in self.rbss:
            hit_regions = len(self.txp.merged_dict[rbs])
            missed_regions = len(self.dna_region) - hit_regions
            self.counts_tr[rbs] = [hit_regions, missed_regions]
            self.counts_dbs[rbs] = len(self.txpf.merged_dict[rbs])

        self.region_dbd = self.txpf.sort_rbs_by_regions(self.dna_region)

        self.region_dbs = self.txpf.sort_rd_by_regions(
            regionset=self.dna_region)
        self.region_dbsm = {}
        self.region_coverage = {}

        # Per region: merged DBSs and the fraction of the region they cover.
        for region in self.dna_region:
            region_key = region.toString()
            merged_dbs = self.region_dbs[region_key].get_dbs().merge(
                w_return=True)
            self.region_dbsm[region_key] = merged_dbs
            covered = float(merged_dbs.total_coverage())
            self.region_coverage[region_key] = covered / len(region)
        self.stat["target_regions"] = str(len(self.dna_region))

        if obed:
            target_bed = os.path.join(temp, obed + "_target_region_dbs.bed")
            self.txp.write_bed(filename=target_bed,
                               dbd_tag=False,
                               remove_duplicates=False,
                               associated=self.organism)
            dbss_bed = os.path.join(temp, obed + "_dbss.bed")
            self.txpf.write_bed(filename=dbss_bed, remove_duplicates=False)

    def random_test(self, repeats, temp, remove_temp, l, e, c, fr, fm, of, mf,
                    rm, par, filter_bed, alpha):
        """Perform the randomization test and derive empirical p-values.

        ``random_each`` is executed ``repeats`` times in a process pool. For
        every DBD in ``self.rbss`` the randomized counts are compared with
        the observed counts computed by ``target_dna`` (``self.counts_tr``
        and, when ``self.showdbs`` is set, ``self.counts_dbs``). The p-value
        of a DBD is the fraction of repeats whose randomized count is
        strictly greater than the observed count.

        Args:
            repeats: Number of randomizations.
            temp: Directory containing ``rna_temp.fa``; also forwarded to
                the workers.
            remove_temp: Accepted for interface compatibility; not used in
                this method.
            l, e, c, fr, fm, of, mf, rm: Forwarded to ``random_each`` as
                strings.
            par: Forwarded unchanged to ``random_each``.
            filter_bed: Forwarded unchanged to ``random_each``.
            alpha: Significance threshold; a DBD is significant when its
                p-value is below ``alpha``.

        Side effects:
            Sets ``self.repeats``, ``self.region_matrix``,
            ``self.dbss_matrix``, ``self.data`` and ``self.topDBD``, and the
            ``"p_value"`` and ``"DBSs_random_ave"`` entries of ``self.stat``.
        """
        self.repeats = repeats
        marks = numpy.round(numpy.linspace(0, repeats - 1, num=41)).tolist()
        print("random_test")
        print(par)

        # self.rbss may be a dict key view, which cannot be pickled for the
        # worker processes; a list works in every case and keeps the order.
        rbss = list(self.rbss)

        # Prepare the input lists for multiprocessing
        rna_temp_fasta = os.path.join(temp, "rna_temp.fa")
        mp_input = [[
            str(i),
            rna_temp_fasta, self.dna_region, temp,
            self.organism, rbss,
            str(marks.count(i)),
            str(l),
            str(e),
            str(c),
            str(fr),
            str(fm),
            str(of),
            str(mf),
            str(rm), filter_bed, self.genome_path, par
        ] for i in range(repeats)]

        # Multiprocessing
        print("\t\t|0%                  |                100%|")
        print("\t\t[", end="")
        # Leave two cores free, but never ask for fewer than one worker:
        # Pool raises ValueError for processes < 1 (machines with <= 2 cores).
        n_processes = max(1, multiprocessing.cpu_count() - 2)
        pool = multiprocessing.Pool(processes=n_processes)
        try:
            mp_output = pool.map(random_each, mp_input)
        finally:
            # Always release the workers, even when a task failed.
            pool.close()
            pool.join()
        print("]")

        # Processing the result
        self.region_matrix = []
        self.dbss_matrix = []
        self.data = {
            "region": {
                "ave": [],
                "sd": [],
                "p": [],
                "sig_region": [],
                "sig_boolean": []
            },
            "dbs": {
                "ave": [],
                "sd": [],
                "p": [],
                "sig_region": [],
                "sig_boolean": []
            }
        }
        region_data = self.data["region"]
        dbs_data = self.data["dbs"]

        # Each worker result holds the per-DBD region counts at index 0 and
        # the per-DBD DBS counts at index 1.
        region_counts = [v[0] for v in mp_output]
        dbss_counts = [v[1] for v in mp_output]

        for i, rbs in enumerate(rbss):
            # Analysis based on target regions
            counts_regions = [v[i] for v in region_counts]

            region_data["ave"].append(numpy.mean(counts_regions))
            region_data["sd"].append(numpy.std(counts_regions))
            observed_regions = self.counts_tr[rbs][0]
            num_sig = sum(1 for h in counts_regions if h > observed_regions)
            p_region = float(num_sig) / repeats
            region_data["p"].append(p_region)
            self.region_matrix.append(counts_regions)

            if p_region < alpha:
                region_data["sig_region"].append(rbs)
                region_data["sig_boolean"].append(True)
            else:
                region_data["sig_boolean"].append(False)

            # Keep the DBD with the smallest p-value seen so far; topDBD is
            # an empty list until the first DBD has been recorded.
            if not self.topDBD or p_region < self.topDBD[1]:
                self.topDBD = [rbs.str_rna(pa=False), p_region]

            # Analysis based on DBSs
            if self.showdbs:
                counts_dbss = [v[i] for v in dbss_counts]

                dbs_data["ave"].append(numpy.mean(counts_dbss))
                dbs_data["sd"].append(numpy.std(counts_dbss))
                observed_dbss = self.counts_dbs[rbs]
                num_sig = sum(1 for h in counts_dbss if h > observed_dbss)
                p_dbs = float(num_sig) / repeats
                dbs_data["p"].append(p_dbs)
                self.dbss_matrix.append(counts_dbss)
                if p_dbs < alpha:
                    dbs_data["sig_region"].append(rbs)
                    dbs_data["sig_boolean"].append(True)
                else:
                    dbs_data["sig_boolean"].append(False)

        self.region_matrix = numpy.array(self.region_matrix)

        if self.showdbs:
            self.dbss_matrix = numpy.array(self.dbss_matrix)

        # Smallest region-based p-value, or "1" when there is no DBD at all.
        # Written before "DBSs_random_ave" so the key order in self.stat is
        # the same as before.
        p_values = region_data["p"]
        self.stat["p_value"] = str(min(p_values)) if p_values else "1"

        # Average randomized DBS count of the last DBD. Without any DBD
        # there is nothing to average (this used to raise NameError).
        if rbss:
            last_index = len(rbss) - 1
            last_counts_dbss = [v[last_index] for v in dbss_counts]
            self.stat["DBSs_random_ave"] = numpy.mean(last_counts_dbss)
        else:
            self.stat["DBSs_random_ave"] = 0.0

    def dbd_regions(self, sig_region, output):
        """Write the significant DBD regions and update the DBD statistics.

        The module-level ``dbd_regions`` produces the BED file of significant
        DBD regions and the FASTA file of their sequences. Afterwards
        ``self.stat`` receives the number of all DBDs, the number of
        significant DBDs, and the number of binding sites of ``self.txp``
        overlapping a significant DBD.
        """
        dbd_regions(exons=self.rna_regions,
                    sig_region=sig_region,
                    rna_name=self.rna_name,
                    output=output)

        self.stat["DBD_all"] = str(len(self.rbss))
        significant_dbds = self.data["region"]["sig_region"]
        self.stat["DBD_sig"] = str(len(significant_dbds))

        # Wrap the significant DBDs in a region set to intersect against.
        sig_dbd_set = GenomicRegionSet("DBD_sig")
        sig_dbd_set.sequences = significant_dbds
        target_rbs = self.txp.get_rbs()
        shared = target_rbs.intersect(y=sig_dbd_set,
                                      mode=OverlapType.ORIGINAL)
        self.stat["DBSs_target_DBD_sig"] = str(len(shared))

    def lineplot(self, txp, dirp, ac, cut_off, log, ylabel, linelabel, showpa,
                 sig_region, filename):
        """Draw the line plot for this RNA.

        Thin wrapper around the module-level ``lineplot``: every argument is
        forwarded by keyword, and the RNA length and name are taken from
        ``self.rna_len`` and ``self.rna_name``.
        """
        plot_options = dict(txp=txp,
                            dirp=dirp,
                            ac=ac,
                            cut_off=cut_off,
                            log=log,
                            ylabel=ylabel,
                            linelabel=linelabel,
                            showpa=showpa,
                            sig_region=sig_region,
                            filename=filename)
        lineplot(rnalen=self.rna_len, rnaname=self.rna_name, **plot_options)

    def boxplot(self, dir, matrix, sig_region, truecounts, sig_boolean, ylabel,
                filename):
        """Draw the box plot of randomized counts against the true counts.

        One box per DBD in ``self.rbss`` shows the randomized counts; the
        observed counts are drawn as dots on top, and significant DBDs get a
        shaded background. The figure is saved as ``<filename>.png`` and,
        with larger fonts, as ``<filename>.pdf`` inside ``dir``.

        Args:
            dir: Output directory. The name shadows the builtin but is kept
                because callers may pass it by keyword.
            matrix: Array with one row of randomized counts per DBD; it is
                transposed before being handed to ``Axes.boxplot``.
            sig_region: Not used in this method.
            truecounts: List of observed counts, one per DBD.
            sig_boolean: Iterable of flags, one per DBD, marking the
                significant ones.
            ylabel: Label of the y axis.
            filename: Base name of the output files, without extension.
        """
        tick_size = 8
        label_size = 9

        f, ax = plt.subplots(1, 1, dpi=300, figsize=(6, 4))
        max_y = int(max([matrix.max()] + truecounts) * 1.1) + 1
        min_y = max(int(matrix.min() * 0.9) - 1, 0)

        # Significant DBD
        # This rectangle is never added to the axes; it only serves as the
        # legend handle when no DBD is significant.
        rect = patches.Rectangle(xy=(1, 0),
                                 width=0.8,
                                 height=max_y,
                                 facecolor=sig_color,
                                 edgecolor="none",
                                 alpha=0.5,
                                 lw=None,
                                 label="Significant DBD")
        for i, r in enumerate(sig_boolean):
            if r:
                # Boxes sit at x = i + 1, so the 0.8-wide band starts at
                # i + 0.6 to be centred on the box.
                rect = patches.Rectangle(xy=(i + 0.6, min_y),
                                         width=0.8,
                                         height=max_y,
                                         facecolor=sig_color,
                                         edgecolor="none",
                                         alpha=0.5,
                                         lw=None,
                                         label="Significant DBD")
                ax.add_patch(rect)

        # Plotting

        bp = ax.boxplot(matrix.transpose(),
                        notch=False,
                        sym='o',
                        vert=True,
                        whis=1.5,
                        positions=None,
                        widths=None,
                        patch_artist=True,
                        bootstrap=None)
        z = 10
        plt.setp(bp['boxes'], color=nontarget_color, alpha=1, edgecolor="none")
        plt.setp(bp['whiskers'],
                 color='black',
                 linestyle='-',
                 linewidth=1,
                 zorder=z,
                 alpha=1)
        plt.setp(bp['fliers'],
                 markerfacecolor='gray',
                 color='white',
                 alpha=0.3,
                 markersize=1.8,
                 zorder=z)
        plt.setp(bp['caps'], color='white', zorder=-1)
        plt.setp(bp['medians'], color='black', linewidth=1.5, zorder=z + 1)

        # Plot target regions
        plt.plot(range(1,
                       len(self.rbss) + 1),
                 truecounts,
                 markerfacecolor=target_color,
                 marker='o',
                 markersize=5,
                 linestyle='None',
                 markeredgecolor="white",
                 zorder=z + 5)

        ax.set_xlabel(self.rna_name + " DNA Binding Domains",
                      fontsize=label_size)
        ax.set_ylabel(ylabel, fontsize=label_size, rotation=90)

        ax.set_ylim([min_y, max_y])
        ax.yaxis.set_major_locator(MaxNLocator(integer=True))

        ax.set_xticklabels([dbd.str_rna(pa=False) for dbd in self.rbss],
                           rotation=35,
                           ha="right",
                           fontsize=tick_size)
        # Tick.label is a deprecated alias of Tick.label1 and is gone from
        # recent matplotlib releases; label1 works on old and new versions.
        for tick in ax.yaxis.get_major_ticks():
            tick.label1.set_fontsize(tick_size)

        for spine in ['top', 'right']:
            ax.spines[spine].set_visible(False)
        # Real booleans are required here: the strings 'on'/'off' are no
        # longer translated by matplotlib, and 'off' is a truthy string, so
        # it would turn the ticks on instead of hiding them.
        ax.tick_params(axis='x',
                       which='both',
                       bottom=False,
                       top=False,
                       labelbottom=True)
        ax.tick_params(axis='y',
                       which='both',
                       left=True,
                       right=False,
                       labelbottom=False)

        # Legend
        # Proxy artists: drawn only to obtain legend handles, hidden below.
        dot_legend, = plt.plot([1, 1],
                               color=target_color,
                               marker='o',
                               markersize=5,
                               markeredgecolor="white",
                               linestyle='None')
        bp_legend, = plt.plot([1, 1],
                              color=nontarget_color,
                              linewidth=6,
                              alpha=1)

        ax.legend([dot_legend, bp_legend, rect],
                  ["Target Regions", "Non-target regions", "Significant DBD"],
                  bbox_to_anchor=(0., 1.02, 1., .102),
                  loc=2,
                  mode="expand",
                  borderaxespad=0.,
                  prop={'size': 9},
                  ncol=3,
                  numpoints=1)
        bp_legend.set_visible(False)
        dot_legend.set_visible(False)

        # f.tight_layout(pad=1.08, h_pad=None, w_pad=None)
        f.savefig(os.path.join(dir, filename + ".png"),
                  facecolor='w',
                  edgecolor='w',
                  bbox_extra_artists=(plt.gci()),
                  bbox_inches='tight',
                  dpi=300)
        # PDF
        # The PDF version uses larger fonts than the PNG.
        for tick in ax.xaxis.get_major_ticks():
            tick.label1.set_fontsize(12)
        for tick in ax.yaxis.get_major_ticks():
            tick.label1.set_fontsize(12)
        ax.xaxis.label.set_size(14)
        ax.yaxis.label.set_size(14)

        pp = PdfPages(os.path.join(dir, filename + '.pdf'))
        pp.savefig(f, bbox_extra_artists=(plt.gci()), bbox_inches='tight')
        pp.close()

    def gen_html(self,
                 directory,
                 parameters,
                 obed,
                 align=50,
                 alpha=0.05,
                 score=False):
        """Generate the HTML report pages of the genomic region test.

        Writes ``index.html``, ``dbd_region.html``, ``target_regions.html``,
        ``starget_regions.html``, ``region_dbs.html`` and ``parameters.html``
        into *directory*, plus the BED file ``<obed>_target_regions.bed``.

        Side effects: ``self.dna_region`` is sorted (and later re-ordered by
        ``sort_score()``), and the ``data`` field of every region in it is
        overwritten with the tab-separated values shown in the target table.

        Args:
            directory: Output directory; its base name is used in the title.
            parameters: Object exposing the command line options as
                attributes (``rn``, ``r``, ``bed``, ``o``, ``organism``, ...).
            obed: File name prefix of the exported target region BED file.
            align: Forwarded unchanged to ``Html.add_zebra_table``.
            alpha: p-values below this threshold are rendered in red.
            score: If true, the first tab-separated field of each region's
                ``data`` is read as a score and added to the target tables.
                When that field cannot be parsed as a number, a message is
                printed and the tables are generated without the score.
        """
        dir_name = os.path.basename(directory)
        html_header = "Genomic Region Test: " + dir_name
        link_ds = OrderedDict()
        link_ds["RNA"] = "index.html"
        link_ds["Sig Target Regions"] = "starget_regions.html"
        link_ds["Target Regions"] = "target_regions.html"
        link_ds["Parameters"] = "parameters.html"

        def new_page():
            # Every page of the report shares the same header and navigation.
            return Html(name=html_header,
                        links_dict=link_ds,
                        fig_rpath="../style",
                        RGT_header=False,
                        other_logo="TDF",
                        homepage="../index.html")

        def ucsc_link(reg, style="text-align:left"):
            # Anchor opening the UCSC genome browser at the given region.
            return ('<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' +
                    self.organism + "&position=" + reg.chrom + "%3A" +
                    str(reg.initial) + "-" + str(reg.final) + '" style="' +
                    style + '">' + reg.toString(space=True) + '</a>')

        ##################################################
        # index.html

        html = new_page()
        # Plots
        html.add_figure("lineplot_region.png",
                        align="left",
                        width="45%",
                        more_images=["boxplot_regions.png"])

        # NOTE(review): in the showdbs layout every data row carries one more
        # cell (the z-score) than these headers declare -- confirm how
        # add_zebra_table is expected to render that.
        if self.showdbs:
            html.add_figure("lineplot_dbs.png",
                            align="left",
                            width="45%",
                            more_images=["boxplot_dbs.png"])
            header_list = [[
                "#", "DBD", "Target Regions", None, "Non-target Regions", None,
                "Statistics", "Target Regions", "Non-target Regions", None,
                "Statistics"
            ],
                           [
                               "", "", "with DBS", "without DBS",
                               "with DBS (average)", "s.d.", "<i>p</i>-value",
                               "NO. DBSs", "NO. DBSs (average)", "s.d.",
                               "<i>p</i>-value"
                           ]]
            header_titles = [
                [
                    "Rank", "DNA Binding Domain",
                    "Given target regions on DNA", None,
                    "Regions from randomization", None,
                    "Statistics based on target regions",
                    "Given target regions on DNA",
                    "Regions from randomization", None,
                    "Statistics based on DNA Binding Sites"
                ],
                [
                    "", "", "Number of target regions with DBS binding",
                    "Number of target regions without DBS binding",
                    "Average number of regions from randomization with DBS binding",
                    "Standard deviation", "P value",
                    "Number of related DNA Binding Sites binding to target regions",
                    "Average number of DNA Binding Sites binding to random regions",
                    "Standard deviation", "P-value"
                ]
            ]
            border_list = [
                " style=\"border-right:1pt solid gray\"",
                " style=\"border-right:1pt solid gray\"", "",
                " style=\"border-right:1pt solid gray\"", "",
                " style=\"border-right:1pt solid gray\"",
                " style=\"border-right:2pt solid gray\"",
                " style=\"border-right:1pt solid gray\"", "",
                " style=\"border-right:1pt solid gray\"",
                " style=\"border-right:1pt solid gray\""
            ]
        else:
            header_list = [[
                "#", "DBD", "Target Regions", None, "Non-target Regions", None,
                "Statistics", None
            ],
                           [
                               "", "", "with DBS", "without DBS",
                               "with DBS (average)", "s.d.", "<i>p</i>-value",
                               "z-score"
                           ]]
            header_titles = [
                [
                    "Rank", "DNA Binding Domain",
                    "Given target regions on DNA", None,
                    "Regions from randomization", None,
                    "Statistics based on target regions", None
                ],
                [
                    "", "", "Number of target regions with DBS binding",
                    "Number of target regions without DBS binding",
                    "Average number of regions from randomization with DBS binding",
                    "Standard deviation", "P value", "Z-score"
                ]
            ]
            border_list = [
                " style=\"border-right:1pt solid gray\"",
                " style=\"border-right:1pt solid gray\"", "",
                " style=\"border-right:1pt solid gray\"", "",
                " style=\"border-right:1pt solid gray\"",
                " style=\"border-right:1pt solid gray\"", ""
            ]

        # Shared by every table of the report.
        type_list = 'ssssssssssssssss'
        col_size_list = [
            50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50
        ]
        data_table = []

        region_stats = self.data["region"]
        for i, rbs in enumerate(self.rbss):
            p_region = value2str(region_stats["p"][i])
            if region_stats["p"][i] < alpha:
                p_region = "<font color=\"red\">" + p_region + "</font>"

            # A zero standard deviation makes the z-score undefined; report
            # "NA" instead of dividing by zero.
            sd = region_stats["sd"][i]
            if sd:
                zs_text = value2str(
                    (self.counts_tr[rbs][0] - region_stats["ave"][i]) / sd)
            else:
                zs_text = "NA"

            new_line = [
                str(i + 1),
                rbs.str_rna(pa=False), '<a href="dbd_region.html#' +
                rbs.str_rna() + '" style="text-align:left">' +
                str(self.counts_tr[rbs][0]) + '</a>',
                str(self.counts_tr[rbs][1]),
                value2str(region_stats["ave"][i]),
                value2str(sd), p_region, zs_text
            ]
            if self.showdbs:
                dbs_stats = self.data["dbs"]
                p_dbs = value2str(dbs_stats["p"][i])
                if dbs_stats["p"][i] < alpha:
                    p_dbs = "<font color=\"red\">" + p_dbs + "</font>"

                new_line += [
                    str(self.counts_dbs[rbs]),
                    value2str(dbs_stats["ave"][i]),
                    value2str(dbs_stats["sd"][i]), p_dbs
                ]
            data_table.append(new_line)

        # NOTE(review): column 6 is the rendered p-value string (possibly
        # wrapped in <font> tags), so this is a textual natural sort, not a
        # numeric one -- confirm this ordering is the intended one.
        data_table = natsort.natsorted(data_table, key=lambda x: x[6])
        html.add_zebra_table(header_list,
                             col_size_list,
                             type_list,
                             data_table,
                             align=align,
                             cell_align="left",
                             auto_width=True,
                             header_titles=header_titles,
                             border_list=border_list,
                             sortable=True)

        html.add_heading("Notes")
        html.add_list([
            "RNA name: " + self.rna_name,
            "Randomization is performed for " + str(self.repeats) + " times.",
            "DBD stands for DNA Binding Domain on RNA.",
            "DBS stands for DNA Binding Site on DNA."
        ])
        html.add_fixed_rank_sortable()
        html.write(os.path.join(directory, "index.html"))

        #############################################################
        # RNA subpage: Profile of targeted regions for each merged DNA Binding Domain
        #############################################################

        header_list = [
            "#", "Target Region", "Associated Gene", "No. of DBSs",
            "DBS coverage"
        ]
        header_titles = [
            "Rank", "Given target regions from BED files",
            "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)",
            "Number of DNA Binding Sites locate within the region",
            "The proportion of the region covered by DBS binding"
        ]

        #########################################################
        # dbd_region.html
        html = new_page()

        for rbsm in self.rbss:
            html.add_heading("DNA Binding Domain: " + rbsm.str_rna(),
                             idtag=rbsm.str_rna())
            data_table = []
            for i, region in enumerate(self.txp.merged_dict[rbsm]):
                data_table.append([
                    str(i + 1),
                    ucsc_link(region),
                    split_gene_name(gene_name=region.name, org=self.organism),
                    str(len(self.region_dbs[region.toString()])),
                    value2str(self.region_coverage[region.toString()])
                ])

            html.add_zebra_table(header_list,
                                 col_size_list,
                                 type_list,
                                 data_table,
                                 align=align,
                                 cell_align="left",
                                 auto_width=True,
                                 header_titles=header_titles,
                                 sortable=True)
        html.add_fixed_rank_sortable()
        html.write(os.path.join(directory, "dbd_region.html"))

        #############################################################
        # Targeted regions centered
        #############################################################

        ##############################################################################################
        # target_regions.html
        html = new_page()
        html.add_heading("Target Regions")
        data_table = []

        if not self.dna_region.sorted:
            self.dna_region.sort()

        # Calculate the ranking
        n_regions = len(self.dna_region)
        rank_count = n_regions - rank_array(
            [len(self.region_dbs[p.toString()]) for p in self.dna_region])
        rank_coverage = n_regions - rank_array(
            [self.region_coverage[p.toString()] for p in self.dna_region])

        # Parse the scores before choosing the table layout so that a BED
        # file without usable scores degrades to the score-less layout
        # instead of failing half-way through the page.
        score_list = []
        if score:
            try:
                score_list = [
                    float(p.data.split("\t")[0]) for p in self.dna_region
                ]
            except (AttributeError, ValueError):
                print(
                    "There is no score in BED file, please don't use '-score' argument."
                )
                score = False

        if score:
            rank_score = n_regions - rank_array([abs(s) for s in score_list])
            rank_sum = [
                x + y + z
                for x, y, z in zip(rank_count, rank_coverage, rank_score)
            ]
            header_list = [
                "#", "Target region", "Associated Gene", "DBSs Count",
                "DBS coverage", "Score", "Sum of ranks"
            ]
            header_titles = [
                "Rank", "Target regions loaded from the given BED file",
                "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)",
                "Number of DNA Binding Sites within the region",
                "The proportion of the region covered by DBS binding",
                "Scores from BED file", "Sum of all the left-hand-side ranks"
            ]
        else:
            rank_sum = [x + y for x, y in zip(rank_count, rank_coverage)]
            header_list = [
                "#", "Target region", "Associated Gene", "DBSs Count",
                "DBS coverage", "Sum of ranks"
            ]
            header_titles = [
                "Rank", "Target regions loaded from the given BED file",
                "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)",
                "Number of DNA Binding Sites within the region",
                "The proportion of the region covered by DBS binding",
                "Sum of all the left-hand-side ranks"
            ]

        # region.data is overwritten below, so remember the parsed score of
        # each region for the significant-target page.
        score_by_region = {}

        for i, region in enumerate(self.dna_region):
            region_key = region.toString()
            dbs_counts = str(len(self.region_dbs[region_key]))
            dbs_cover = value2str(self.region_coverage[region_key])

            newline = [
                str(i + 1),
                ucsc_link(region),
                split_gene_name(gene_name=region.name, org=self.organism),
                '<a href="region_dbs.html#' + region_key +
                '" style="text-align:left">' + dbs_counts + '</a>', dbs_cover
            ]
            bed_fields = [dbs_counts, dbs_cover]

            if score:
                score_by_region[region_key] = score_list[i]
                dbs_score = value2str(score_list[i])
                bed_fields.append(dbs_score)
                newline.append(dbs_score)

            bed_fields.append(str(rank_sum[i]))
            newline.append(str(rank_sum[i]))
            region.data = "\t".join(bed_fields)
            data_table.append(newline)

        data_table = natsort.natsorted(data_table, key=lambda x: x[-1])
        html.add_zebra_table(header_list,
                             col_size_list,
                             type_list,
                             data_table,
                             align=align,
                             cell_align="left",
                             auto_width=True,
                             header_titles=header_titles,
                             sortable=True)
        html.add_heading("Notes")
        html.add_list(["All target regions without any bindings are ignored."])
        html.add_fixed_rank_sortable()
        html.write(os.path.join(directory, "target_regions.html"))

        self.dna_region.sort_score()
        self.dna_region.write_bed(
            os.path.join(directory, obed + "_target_regions.bed"))

        ##############################################################################################
        # starget_regions.html    for significant target regions

        stargets = GenomicRegionSet("sig_targets")
        sig_dbs = {}
        sig_dbs_coverage = {}
        for r in self.dna_region:
            sig_bindings = self.region_dbs[r.toString()].overlap_rbss(
                rbss=self.data["region"]["sig_region"])
            dbs = sig_bindings.get_dbs()
            if len(dbs) > 0:
                stargets.add(r)
                m_dbs = dbs.merge(w_return=True)
                sig_dbs[r] = len(dbs)
                sig_dbs_coverage[r] = float(m_dbs.total_coverage()) / len(r)

        html = new_page()

        # Select promoters in sig DBD
        if len(self.data["region"]["sig_region"]) == 0:
            html.add_heading("There is no significant DBD.")
        else:
            html.add_heading("Target regions bound by significant DBD")
            data_table = []
            # Calculate the ranking
            rank_count = len(stargets) - rank_array(
                [sig_dbs[p] for p in stargets])
            rank_coverage = len(stargets) - rank_array(
                [sig_dbs_coverage[p] for p in stargets])
            if score:
                # Use the scores parsed above: region.data no longer starts
                # with the BED score at this point.
                score_list = [score_by_region[p.toString()] for p in stargets]
                rank_score = len(stargets) - rank_array(
                    [abs(s) for s in score_list])
                rank_sum = [
                    x + y + z
                    for x, y, z in zip(rank_count, rank_coverage, rank_score)
                ]
            else:
                rank_sum = [x + y for x, y in zip(rank_count, rank_coverage)]

            for i, region in enumerate(stargets):
                dbssount = '<a href="region_dbs.html#' + region.toString() + \
                           '" style="text-align:left">' + str(sig_dbs[region]) + '</a>'

                region_link = region_link_internet(self.organism, region)

                newline = [
                    str(i + 1), region_link,
                    split_gene_name(gene_name=region.name, org=self.organism),
                    dbssount,
                    value2str(sig_dbs_coverage[region])
                ]
                if score:
                    newline.append(value2str(score_list[i]))
                newline.append(str(rank_sum[i]))
                data_table.append(newline)

            data_table = natsort.natsorted(data_table, key=lambda x: x[-1])
            html.add_zebra_table(header_list,
                                 col_size_list,
                                 type_list,
                                 data_table,
                                 align=align,
                                 cell_align="left",
                                 header_titles=header_titles,
                                 border_list=None,
                                 sortable=True)
            html.add_heading("Notes")
            html.add_list([
                "DBS stands for DNA Binding Site on DNA.",
                "DBS coverage is the proportion of the region where has potential to form triple helices with the given RNA."
            ])
            html.add_fixed_rank_sortable()
        # Always write the page: every other page links to it, including
        # when there is no significant DBD to list.
        html.write(os.path.join(directory, "starget_regions.html"))

        ############################
        # Subpages for targeted region centered page
        # region_dbs.html
        header_list = ["RBS", "DBS", "Strand", "Score", "Motif", "Orientation"]

        html = new_page()

        for region in self.dna_region:
            bindings = self.region_dbs[region.toString()]
            if len(bindings) == 0:
                continue

            html.add_heading(
                "Associated gene: " +
                split_gene_name(gene_name=region.name, org=self.organism),
                idtag=region.toString())
            html.add_free_content([ucsc_link(region, style="margin-left:50")])
            data_table = []
            for rd in bindings:
                rbs = rd.rna.str_rna(pa=False)
                # Highlight bindings whose RNA side overlaps a significant DBD.
                for rbsm in self.data["region"]["sig_region"]:
                    if rd.rna.overlap(rbsm):
                        rbs = "<font color=\"red\">" + rbs + "</font>"
                data_table.append([
                    rbs,
                    ucsc_link(rd.dna),
                    rd.dna.orientation, rd.score, rd.motif, rd.orient
                ])
            html.add_zebra_table(header_list,
                                 col_size_list,
                                 type_list,
                                 data_table,
                                 align=align,
                                 cell_align="left",
                                 auto_width=True)
        html.write(os.path.join(directory, "region_dbs.html"))

        ###############################################################################
        ################ Parameters.html

        html = new_page()
        html.add_heading("Parameters")
        header_list = ["Description", "Arguments", "Value"]

        data_table = [
            ["RNA sequence name", "-rn", parameters.rn],
            ["Input RNA sequence file", "-r",
             os.path.basename(parameters.r)],
            ["Input BED file", "-bed",
             os.path.basename(parameters.bed)],
            ["Output directory", "-o",
             os.path.basename(parameters.o)],
            ["Organism", "-organism", parameters.organism],
            ["Number of repitetion of andomization", "-n",
             str(parameters.n)],
            ["Alpha level for rejection p value", "-a",
             str(parameters.a)],
            [
                "Cut off value for filtering out the low counts of DBSs",
                "-ccf",
                str(parameters.ccf)
            ],
            ["Remove temporary files", "-rt",
             str(parameters.rt)],
            [
                "Input BED file for masking in randomization", "-f",
                str(parameters.f)
            ],
            ["Input file for RNA accecibility", "-ac",
             str(parameters.ac)],
            [
                "Cut off value for RNA accecibility", "-accf",
                str(parameters.accf)
            ],
            [
                "Output the BED files for DNA binding sites.", "-obed",
                str(parameters.obed)
            ],
            [
                "Show parallel and antiparallel bindings in the plot separately.",
                "-showpa",
                str(parameters.showpa)
            ],
            ["Minimum length", "-l",
             str(self.triplexator_p[0])],
            ["Maximum error rate", "-e",
             str(self.triplexator_p[1])],
            [
                "Tolerated number of consecutive errors", "-c",
                str(self.triplexator_p[2])
            ],
            ["Filtering repeats", "-fr",
             str(self.triplexator_p[3])],
            ["Filtering mode", "-fm",
             str(self.triplexator_p[4])],
            ["Output format", "-of",
             str(self.triplexator_p[5])],
            ["Merge features", "-mf",
             str(self.triplexator_p[6])]
        ]
        html.add_zebra_table(header_list,
                             col_size_list,
                             type_list,
                             data_table,
                             align=align,
                             cell_align="left",
                             auto_width=True)
        html.add_free_content(
            ['<a href="summary.txt" style="margin-left:100">See details</a>'])
        html.write(os.path.join(directory, "parameters.html"))
Пример #22
0
def find(s, ch):
    """Return the positions of every item of *s* that equals *ch*."""
    positions = []
    for index, letter in enumerate(s):
        if letter == ch:
            positions.append(index)
    return positions

##################################################################################
# Command line interface of the PhyloCSF coding-potential check. The help
# output shows each option's default (ArgumentDefaultsHelpFormatter).
parser = argparse.ArgumentParser(description='Check the coding potential by PhyloCSF', 
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-i', metavar='  ', type=str, help="Input BED file")
parser.add_argument('-o', metavar='  ', type=str, help="Output BED file with the coding-potential score")
parser.add_argument('-organism', metavar='  ', type=str, help="Define the organism")
parser.add_argument('-rmcoding', metavar='  ', type=float, help="Define the cutoff to remove the entries with coding potential")
parser.add_argument('-mafdir', metavar='  ', type=str, help="Define the directory to MAF files")
# python /projects/reg-gen/tools/phylocsf_check.py -i
# Parsing happens at module level, so this runs as soon as the script is loaded.
args = parser.parse_args()

# Load the input regions from the BED file given with -i; `num` is the
# number of regions that were read.
bed = GenomicRegionSet("input")
bed.read_bed(args.i)
num = len(bed)

organisms = { "hg18": "Human",
              "panTro2": "Chimp",
              "rheMac2": "Rhesus",
              "tarSyr1": "Tarsier",
              "micMur1": "Mouse_lemur",
              "otoGar1": "Bushbaby",
              "tupBel1": "Shrew",
              "mm9": "Mouse",
              "rn4": "Rat",
              "dipOrd1": "Kangaroo_Rat",
              "cavPor2": "Guinea_Pig",
              "speTri1": "Squirrel",
              "oryCun1": "Rabbit",
Пример #23
0
    def chip_evaluate(self):
        """
        Evaluate footprint predictions against motif-predicted binding sites
        (MPBSs) combined with TF ChIP-seq data.

        For each footprint file the ROC and precision-recall statistics are
        computed, the AUC values are written to ``<tf_name>_stats.txt`` under
        ``self.output_location``, the curves are plotted when requested and
        the curve points are handed to ``self.output_points``.

        return:
        """

        # Per-method statistics, keyed by the index of the footprint file
        fpr, tpr = {}, {}
        roc_auc, roc_auc_1, roc_auc_2 = {}, {}, {}
        recall, precision, prc_auc = {}, {}, {}

        if "SEG" in self.footprint_type:
            mpbs_regions = GenomicRegionSet("TFBS")
            mpbs_regions.read_bed(self.tfbs_file)
            mpbs_regions.sort()

            # One more than the highest MPBS score (never below the sentinel)
            max_score = max([-99999999] +
                            [int(region.data) for region in mpbs_regions]) + 1

        for i, footprint_path in enumerate(self.footprint_file):
            footprints_regions = GenomicRegionSet("Footprints Prediction")
            footprints_regions.read_bed(footprint_path)

            # Sort footprint prediction bed files
            footprints_regions.sort()

            method_type = self.footprint_type[i]
            if method_type == "SEG":
                # MPBSs overlapping a predicted footprint get their score
                # raised by max_score; the remaining MPBSs keep their score.
                ranked_regions = GenomicRegionSet("Increased Regions")
                overlapping = mpbs_regions.intersect(footprints_regions,
                                                     mode=OverlapType.ORIGINAL)
                for region in overlapping:
                    region.data = str(int(region.data) + max_score)
                    ranked_regions.add(region)

                untouched = mpbs_regions.subtract(footprints_regions,
                                                  whole_region=True)
                for region in untouched:
                    ranked_regions.add(region)

                ranked_regions.sort_score()
            elif method_type == "SC":
                footprints_regions.sort_score()
                ranked_regions = footprints_regions
            else:
                # Other footprint types are not evaluated
                continue

            (fpr[i], tpr[i], roc_auc[i], roc_auc_1[i],
             roc_auc_2[i]) = self.roc_curve(ranked_regions)
            (recall[i], precision[i],
             prc_auc[i]) = self.precision_recall_curve(ranked_regions)

        # Output the statistics results into text
        stats_fname = self.output_location + self.tf_name + "_stats.txt"
        stats_header = ["METHOD", "AUC_100", "AUC_10", "AUC_1", "AUPR"]
        with open(stats_fname, "w") as stats_file:
            stats_file.write("\t".join(stats_header) + "\n")
            for i, method_name in enumerate(self.footprint_name):
                row = [
                    method_name,
                    str(roc_auc[i]),
                    str(roc_auc_1[i]),
                    str(roc_auc_2[i]),
                    str(prc_auc[i])
                ]
                stats_file.write("\t".join(row) + "\n")

        # Output the curves
        if self.print_roc_curve:
            self.plot_curve(fpr, tpr, roc_auc, "False Positive Rate",
                            "True Positive Rate", self.tf_name, "ROC")
        if self.print_pr_curve:
            self.plot_curve(recall, precision, prc_auc, "Recall", "Precision",
                            self.tf_name, "PRC")

        self.output_points(self.tf_name, fpr, tpr, recall, precision)
Пример #24
0
                        seq = "\t".join([ch, line[4], line[3], gn, ".", line[6]])
                    else:
                        continue
                # print(seq)

                if not args.g:
                    print(seq, file=g)
                elif select_genes.check(gn) or select_genes.check(gi):
                    
                    print(seq, file=g)
                else:
                    continue

        if args.b:
            exons = GenomicRegionSet("output")
            exons.read_bed(args.o)
            exons.write_bed_blocks(args.o)

        # sys.exit(1)

        # if args.g:
        #     select_genes = GeneSet("genes")
        #     select_genes.read(args.g)

        # # if args.t == "gene" or args.t == "transcript":
        # with open(args.i, "r") as f,open(args.o, "w") as g:
        #     find_ind = False
        #     for line in f:
        #         if line[0] == "#": 
        #             continue
        #         elif args.known_only:
Пример #25
0
    def read_states_signals(self):
        """Collect the annotated state string and the normalized/slope signals.

        The annotation file (``self.annotate_fname``) is scanned line by line.
        Lines shorter than two characters, or containing ``#`` or ``=``, are
        skipped; from every remaining line the space-separated tokens between
        the first and the last one are concatenated into one string.

        When ``self.estimate_bias_correction`` is set, a bias table is
        estimated from ``self.original_regions`` and written out. When
        ``self.bias_table`` is set (two comma-separated file names), the table
        loaded from those files is used instead.

        Returns:
            tuple: ``(states, norm_signal, slope_signal)``.
        """
        # Read states from the annotation file
        state_tokens = []
        with open(self.annotate_fname) as annotate_file:
            for line in annotate_file:
                if len(line) < 2 or "#" in line or "=" in line:
                    continue
                fields = line.strip().split(" ")
                # Drop the first and the last token of the line
                state_tokens.extend(fields[1:-1])
        states = "".join(state_tokens)

        bias_table = BiasTable(output_loc=self.output_locaiton)
        genome_data = GenomeData(self.organism)
        table = None

        # Estimate the bias table if requested
        if self.estimate_bias_correction:
            regions = GenomicRegionSet("Bias Regions")
            extension = self.original_regions.split(".")[-1]
            if extension == "bed":
                regions.read_bed(self.original_regions)
            elif extension == "fa":
                regions.read_sequence(self.original_regions)

            # Pick the estimator first, call it once below; any other type leaves table as None
            estimator = None
            if self.estimate_bias_type == "FRE":
                estimator = bias_table.estimate_table
            elif self.estimate_bias_type == "PWM":
                estimator = bias_table.estimate_table_pwm

            if estimator is not None:
                table = estimator(regions=regions, dnase_file_name=self.bam_file,
                                  genome_file_name=genome_data.get_genome(),
                                  k_nb=self.k_nb,
                                  forward_shift=self.atac_forward_shift,
                                  reverse_shift=self.atac_reverse_shift)

            bias_dir = os.path.join(self.output_locaiton, "Bias")
            bias_fname = os.path.join(bias_dir, "{}_{}".format(self.k_nb, self.atac_forward_shift))
            bias_table.write_tables(bias_fname, table)

        # A provided bias table takes precedence over an estimated one
        if self.bias_table:
            forward_file, reverse_file = self.bias_table.split(",")[:2]
            table = bias_table.load_table(table_file_name_F=forward_file,
                                          table_file_name_R=reverse_file)

        # Get the normalization and slope signal from the raw bam file
        raw_signal = GenomicSignal(self.bam_file)
        raw_signal.load_sg_coefs(slope_window_size=9)
        signals = raw_signal.get_signal(ref=self.chrom, start=self.start, end=self.end,
                                        downstream_ext=self.atac_downstream_ext,
                                        upstream_ext=self.atac_upstream_ext,
                                        forward_shift=self.atac_forward_shift,
                                        reverse_shift=self.atac_reverse_shift,
                                        initial_clip=self.atac_initial_clip,
                                        bias_table=table,
                                        genome_file_name=genome_data.get_genome(),
                                        print_raw_signal=self.print_raw_signal,
                                        print_bc_signal=self.print_bc_signal,
                                        print_norm_signal=self.print_norm_signal,
                                        print_slope_signal=self.print_slope_signal)
        norm_signal, slope_signal = signals

        if self.print_bed_file:
            self.output_bed_file(states)

        return states, norm_signal, slope_signal
Пример #26
0
class RandomTest:
    def __init__(self, rna_fasta, rna_name, dna_region, organism, showdbs=False):
        """Load the RNA FASTA and the target DNA regions for a randomization test.

        Args:
            rna_fasta: Path to the RNA FASTA file.
            rna_name: Name of the RNA; when falsy, the name of the first
                sequence in ``rna_fasta`` is used for ``self.rna_name``.
            dna_region: Path to the BED file with the target DNA regions.
            organism: Organism identifier passed to ``GenomeData`` and to the
                gene association of the target regions.
            showdbs: Whether DBS-based statistics are computed and reported.
        """
        self.organism = organism
        self.genome_path = GenomeData(organism).get_genome()
        # RNA: Path to the FASTA file
        self.rna_fasta = rna_fasta
        self.showdbs = showdbs

        rna_sequences = SequenceSet(name="rna", seq_type=SequenceType.RNA)
        rna_sequences.read_fasta(self.rna_fasta)
        self.rna_name = rna_name if rna_name else rna_sequences[0].name

        # DNA: GenomicRegionSet annotated with associated genes
        target_regions = GenomicRegionSet(name="target")
        target_regions.read_bed(dna_region)
        self.dna_region = target_regions.gene_association(organism=self.organism, show_dis=True)

        self.topDBD = []
        # NOTE(review): "name" stores the raw rna_name argument, not the resolved
        # self.rna_name, so it stays falsy when the name came from the FASTA file.
        self.stat = OrderedDict(name=rna_name, genome=organism)
        self.stat["target_regions"] = str(len(self.dna_region))


    def get_rna_region_str(self, rna):
        """Parse the RNA regions from the information header with the pattern:
                REGION_chr3_51978050_51983935_-_

        Stores the result of the module-level ``get_rna_region_str`` in
        ``self.rna_regions``. When the first region has exactly five fields,
        its last field is stored as a float in ``self.rna_expression``;
        otherwise ``self.rna_expression`` is the string "n.a.".
        """
        regions = get_rna_region_str(rna)
        self.rna_regions = regions
        has_expression = bool(regions) and len(regions[0]) == 5
        self.rna_expression = float(regions[0][-1]) if has_expression else "n.a."

    def connect_rna(self, rna, temp):
        """Run the module-level ``connect_rna`` and record its first two results.

        The first element of the result is stored (as a string) under
        ``self.stat["exons"]``; the second is kept as ``self.rna_len`` and
        stored (as a string) under ``self.stat["seq_length"]``.
        """
        result = connect_rna(rna, temp, self.rna_name)
        exon_count = result[0]
        self.stat["exons"] = str(exon_count)
        sequence_length = result[1]
        self.stat["seq_length"] = str(sequence_length)
        self.rna_len = sequence_length

    def target_dna(self, temp, remove_temp, cutoff, l, e, c, fr, fm, of, mf, par, obed=False):
        """Calculate the true counts of triplexes on the given dna regions.

        ``find_triplex`` is run twice on ``<temp>/rna_temp.fa`` against
        ``self.dna_region``: once with ``dna_fine_posi=False`` (stored as
        ``self.txp``) and once with ``dna_fine_posi=True`` (stored as
        ``self.txpf``). From these the method fills ``self.rbss``,
        ``self.counts_tr``, ``self.counts_dbs``, ``self.region_dbd``,
        ``self.region_dbs``, ``self.region_dbsm``, ``self.region_coverage``
        and the ``DBSs_target_all`` / ``target_regions`` entries of
        ``self.stat``.

        Args:
            temp: Directory holding ``rna_temp.fa``; also receives BED output.
            remove_temp: Forwarded to ``find_triplex``.
            cutoff: Forwarded to ``merge_rbs`` of the first search.
            l, e, c, fr, fm, of, mf, par: Forwarded unchanged to
                ``find_triplex``; the first seven are kept in
                ``self.triplexator_p``.
            obed: Prefix for the BED files to write; a falsy value disables
                the BED output.
        """
        self.triplexator_p = [l, e, c, fr, fm, of, mf]

        rna_path = os.path.join(temp, "rna_temp.fa")
        # Arguments shared by both find_triplex runs
        search_args = dict(dna_region=self.dna_region, temp=temp, organism=self.organism,
                           remove_temp=remove_temp, l=l, e=e, c=c, fr=fr, fm=fm, of=of, mf=mf,
                           par=par, genome_path=self.genome_path)

        txp = find_triplex(rna_fasta=rna_path, prefix="targeted_region", dna_fine_posi=False,
                           **search_args)
        txp.merge_rbs(rm_duplicate=True, region_set=self.dna_region, asgene_organism=self.organism,
                      cutoff=cutoff)
        self.txp = txp
        # Provisional value; replaced below by the count of the fine-position search
        self.stat["DBSs_target_all"] = str(len(self.txp))
        txp.remove_duplicates()
        # Materialize the keys: a dict view (Python 3) cannot be pickled, and
        # random_test hands self.rbss to multiprocessing workers.
        self.rbss = list(txp.merged_dict.keys())

        txpf = find_triplex(rna_fasta=rna_path, prefix="dbs", dna_fine_posi=True, **search_args)
        txpf.remove_duplicates()
        txpf.merge_rbs(rbss=self.rbss, rm_duplicate=True, asgene_organism=self.organism)
        self.txpf = txpf

        self.stat["DBSs_target_all"] = str(len(self.txpf))

        self.counts_tr = OrderedDict()
        self.counts_dbs = OrderedDict()

        n_regions = len(self.dna_region)
        for rbs in self.rbss:
            bound = len(self.txp.merged_dict[rbs])
            # [regions with binding, regions without binding]
            self.counts_tr[rbs] = [bound, n_regions - bound]
            self.counts_dbs[rbs] = len(self.txpf.merged_dict[rbs])

        self.region_dbd = self.txpf.sort_rbs_by_regions(self.dna_region)

        self.region_dbs = self.txpf.sort_rd_by_regions(regionset=self.dna_region)
        self.region_dbsm = {}
        self.region_coverage = {}

        for region in self.dna_region:
            key = region.toString()
            merged_dbs = self.region_dbs[key].get_dbs().merge(w_return=True)
            self.region_dbsm[key] = merged_dbs
            # Fraction of the region covered by merged binding sites
            self.region_coverage[key] = float(merged_dbs.total_coverage()) / len(region)
        self.stat["target_regions"] = str(len(self.dna_region))

        if obed:
            self.txp.write_bed(filename=os.path.join(temp, obed + "_target_region_dbs.bed"),
                               dbd_tag=False,
                               remove_duplicates=False, associated=self.organism)
            self.txpf.write_bed(filename=os.path.join(temp, obed + "_dbss.bed"),
                                remove_duplicates=False)


    def random_test(self, repeats, temp, remove_temp, l, e, c, fr, fm, of, mf, rm, par, filter_bed, alpha):
        """Perform randomization for the given times.

        ``random_each`` is run ``repeats`` times in a process pool. For every
        DBD in ``self.rbss`` the randomized counts are compared with the
        observed ones (``self.counts_tr`` and, when ``self.showdbs`` is set,
        ``self.counts_dbs``). The empirical p-value is the fraction of
        randomizations whose count is strictly greater than the observed
        count; a DBD is marked significant when that p-value is below
        ``alpha``.

        Results are stored in ``self.data``, ``self.region_matrix``,
        ``self.dbss_matrix``, ``self.topDBD`` and ``self.stat``.

        Args:
            repeats: Number of randomizations.
            temp: Directory holding ``rna_temp.fa``.
            remove_temp: Accepted for interface compatibility; not used here.
            l, e, c, fr, fm, of, mf, rm: Passed (as strings) to every
                ``random_each`` job.
            par: Passed unchanged to every ``random_each`` job.
            filter_bed: Passed unchanged to every ``random_each`` job.
            alpha: Significance threshold for the empirical p-values.
        """
        self.repeats = repeats
        marks = numpy.round(numpy.linspace(0, repeats - 1, num=41)).tolist()
        print("random_test")
        print(par)

        # Snapshot the DBDs as a list: a dict key view cannot be pickled for the workers
        rbss = list(self.rbss)

        # Prepare the input lists for multiprocessing
        rna_path = os.path.join(temp, "rna_temp.fa")
        mp_input = [[str(i), rna_path, self.dna_region,
                     temp, self.organism, rbss, str(marks.count(i)),
                     str(l), str(e), str(c), str(fr), str(fm), str(of), str(mf), str(rm),
                     filter_bed, self.genome_path, par]
                    for i in range(repeats)]

        # Multiprocessing
        print("\t\t|0%                  |                100%|")
        print("\t\t[", end="")
        # Leave two cores free, but never ask for fewer than one worker
        # (Pool raises ValueError for processes < 1).
        n_workers = max(1, multiprocessing.cpu_count() - 2)
        pool = multiprocessing.Pool(processes=n_workers)
        try:
            mp_output = pool.map(random_each, mp_input)
        finally:
            pool.close()
            pool.join()
        print("]")

        # Processing the result
        self.region_matrix = []
        self.dbss_matrix = []
        self.data = {"region": {"ave": [],
                                "sd": [],
                                "p": [],
                                "sig_region": [],
                                "sig_boolean": []},
                     "dbs": {"ave": [],
                             "sd": [],
                             "p": [],
                             "sig_region": [],
                             "sig_boolean": []}}

        region_counts = [v[0] for v in mp_output]
        dbss_counts = [v[1] for v in mp_output]
        region_data = self.data["region"]
        dbs_data = self.data["dbs"]

        for i, rbs in enumerate(rbss):
            # Analysis based on target regions
            counts_regions = [v[i] for v in region_counts]

            region_data["ave"].append(numpy.mean(counts_regions))
            region_data["sd"].append(numpy.std(counts_regions))
            num_sig = len([h for h in counts_regions if h > self.counts_tr[rbs][0]])
            p_region = float(num_sig) / repeats
            region_data["p"].append(p_region)
            self.region_matrix.append(counts_regions)

            if p_region < alpha:
                region_data["sig_region"].append(rbs)
                region_data["sig_boolean"].append(True)
            else:
                region_data["sig_boolean"].append(False)

            # Track the DBD with the lowest p-value; the first DBD initializes it
            if not self.topDBD or p_region < self.topDBD[1]:
                self.topDBD = [rbs.str_rna(pa=False), p_region]

            # Analysis based on DBSs
            if self.showdbs:
                counts_dbss = [v[i] for v in dbss_counts]

                dbs_data["ave"].append(numpy.mean(counts_dbss))
                dbs_data["sd"].append(numpy.std(counts_dbss))
                num_sig = len([h for h in counts_dbss if h > self.counts_dbs[rbs]])
                p_dbs = float(num_sig) / repeats
                dbs_data["p"].append(p_dbs)
                self.dbss_matrix.append(counts_dbss)
                if p_dbs < alpha:
                    dbs_data["sig_region"].append(rbs)
                    dbs_data["sig_boolean"].append(True)
                else:
                    dbs_data["sig_boolean"].append(False)

        self.region_matrix = numpy.array(self.region_matrix)

        if self.showdbs:
            self.dbss_matrix = numpy.array(self.dbss_matrix)

        # Smallest region-based p-value; "1" when there is no DBD at all.
        # Set before DBSs_random_ave to keep the key order of self.stat.
        p_values = region_data["p"]
        self.stat["p_value"] = str(min(p_values)) if p_values else "1"

        if rbss:
            # NOTE(review): as before, only the randomized DBS counts of the
            # last DBD are averaged here - confirm this is the intended statistic.
            last_counts = [v[len(rbss) - 1] for v in dbss_counts]
            self.stat["DBSs_random_ave"] = numpy.mean(last_counts)
        else:
            # Without any DBD there is nothing to average (this case used to crash)
            self.stat["DBSs_random_ave"] = 0.0

    def dbd_regions(self, sig_region, output):
        """Generate the BED file of significant DBD regions and FASTA file of the sequences.

        Delegates the file generation to the module-level ``dbd_regions`` and
        then records in ``self.stat`` the number of all DBDs (``DBD_all``),
        the number of significant DBDs (``DBD_sig``) and the size of the
        intersection between the RBSs of ``self.txp`` and the significant
        DBDs (``DBSs_target_DBD_sig``).
        """
        dbd_regions(exons=self.rna_regions, sig_region=sig_region, rna_name=self.rna_name, output=output)

        self.stat["DBD_all"] = str(len(self.rbss))
        significant_dbds = self.data["region"]["sig_region"]
        self.stat["DBD_sig"] = str(len(significant_dbds))

        sig_set = GenomicRegionSet("DBD_sig")
        sig_set.sequences = significant_dbds
        target_rbss = self.txp.get_rbs()
        shared = target_rbss.intersect(y=sig_set, mode=OverlapType.ORIGINAL)
        self.stat["DBSs_target_DBD_sig"] = str(len(shared))


    def lineplot(self, txp, dirp, ac, cut_off, log, ylabel, linelabel, showpa, sig_region, filename):
        """Generate lineplot for RNA.

        Thin wrapper: every argument is forwarded by keyword to the
        module-level ``lineplot``, together with ``self.rna_len`` and
        ``self.rna_name``.
        """
        plot_args = dict(txp=txp, rnalen=self.rna_len, rnaname=self.rna_name,
                         dirp=dirp, ac=ac, cut_off=cut_off, log=log,
                         ylabel=ylabel, linelabel=linelabel, showpa=showpa,
                         sig_region=sig_region, filename=filename)
        lineplot(**plot_args)

    def boxplot(self, dir, matrix, sig_region, truecounts, sig_boolean, ylabel, filename):
        """Generate the visualized plot.

        Draws one box per DBD from the randomized counts, overlays the
        observed counts as dots, shades the significant DBDs, and saves the
        figure as ``<dir>/<filename>.png`` and ``<dir>/<filename>.pdf``.

        Args:
            dir: Output directory.
            matrix: Array of randomized counts; it is transposed before being
                passed to ``Axes.boxplot``.
            sig_region: Not used by this method.
            truecounts: List of observed counts, one per DBD in ``self.rbss``.
            sig_boolean: One flag per DBD; truthy entries get a shaded band.
            ylabel: Label of the y axis.
            filename: Base name (without extension) of the output files.
        """
        tick_size = 8
        label_size = 9

        f, ax = plt.subplots(1, 1, dpi=300, figsize=(6, 4))
        max_y = int(max([matrix.max()] + truecounts) * 1.1) + 1
        min_y = max(int(matrix.min() * 0.9) - 1, 0)

        # Significant DBD; this first rectangle is never added to the axes and
        # serves as the legend handle when no DBD is significant.
        rect = patches.Rectangle(xy=(1, 0), width=0.8, height=max_y, facecolor=sig_color,
                                 edgecolor="none", alpha=0.5, lw=None, label="Significant DBD")
        for i, r in enumerate(sig_boolean):
            if r:
                rect = patches.Rectangle(xy=(i + 0.6, min_y), width=0.8, height=max_y, facecolor=sig_color,
                                         edgecolor="none", alpha=0.5, lw=None, label="Significant DBD")
                ax.add_patch(rect)

        # Plotting
        bp = ax.boxplot(matrix.transpose(), notch=False, sym='o', vert=True,
                        whis=1.5, positions=None, widths=None,
                        patch_artist=True, bootstrap=None)
        z = 10
        plt.setp(bp['boxes'], color=nontarget_color, alpha=1, edgecolor="none")
        plt.setp(bp['whiskers'], color='black', linestyle='-', linewidth=1, zorder=z, alpha=1)
        plt.setp(bp['fliers'], markerfacecolor='gray', color='white', alpha=0.3, markersize=1.8, zorder=z)
        plt.setp(bp['caps'], color='white', zorder=-1)
        plt.setp(bp['medians'], color='black', linewidth=1.5, zorder=z + 1)

        # Plot target regions
        plt.plot(range(1, len(self.rbss) + 1), truecounts, markerfacecolor=target_color,
                 marker='o', markersize=5, linestyle='None', markeredgecolor="white", zorder=z + 5)

        ax.set_xlabel(self.rna_name + " DNA Binding Domains", fontsize=label_size)
        ax.set_ylabel(ylabel, fontsize=label_size, rotation=90)

        ax.set_ylim([min_y, max_y])
        ax.yaxis.set_major_locator(MaxNLocator(integer=True))

        ax.set_xticklabels([dbd.str_rna(pa=False) for dbd in self.rbss], rotation=35,
                           ha="right", fontsize=tick_size)
        # label1 is the primary tick label; the old "label" alias is gone in recent matplotlib
        for tick in ax.yaxis.get_major_ticks():
            tick.label1.set_fontsize(tick_size)

        for spine in ['top', 'right']:
            ax.spines[spine].set_visible(False)
        # Real booleans: the strings 'on'/'off' are no longer interpreted as
        # booleans by current matplotlib (any non-empty string is truthy).
        ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=True)
        ax.tick_params(axis='y', which='both', left=True, right=False, labelbottom=False)

        # Legend
        dot_legend, = plt.plot([1, 1], color=target_color, marker='o', markersize=5, markeredgecolor="white",
                               linestyle='None')
        bp_legend, = plt.plot([1, 1], color=nontarget_color, linewidth=6, alpha=1)

        ax.legend([dot_legend, bp_legend, rect], ["Target Regions", "Non-target regions", "Significant DBD"],
                  bbox_to_anchor=(0., 1.02, 1., .102), loc=2, mode="expand", borderaxespad=0.,
                  prop={'size': 9}, ncol=3, numpoints=1)
        bp_legend.set_visible(False)
        dot_legend.set_visible(False)

        # NOTE(review): "(plt.gci())" is not a tuple, so bbox_extra_artists
        # receives whatever plt.gci() returns; kept unchanged.
        f.savefig(os.path.join(dir, filename + ".png"), facecolor='w', edgecolor='w',
                  bbox_extra_artists=(plt.gci()), bbox_inches='tight', dpi=300)
        # PDF: larger fonts than in the PNG
        for tick in ax.xaxis.get_major_ticks():
            tick.label1.set_fontsize(12)
        for tick in ax.yaxis.get_major_ticks():
            tick.label1.set_fontsize(12)
        ax.xaxis.label.set_size(14)
        ax.yaxis.label.set_size(14)

        pp = PdfPages(os.path.join(dir, filename + '.pdf'))
        try:
            pp.savefig(f, bbox_extra_artists=(plt.gci()), bbox_inches='tight')
        finally:
            # Always finalize the PDF file, even if rendering fails
            pp.close()

        # Release the figure so repeated calls do not accumulate open figures
        plt.close(f)

    def gen_html(self, directory, parameters, obed, align=50, alpha=0.05, score=False):
        """Generate the HTML file"""
        dir_name = os.path.basename(directory)
        html_header = "Genomic Region Test: " + dir_name
        link_ds = OrderedDict()
        link_ds["RNA"] = "index.html"
        link_ds["Sig Target Regions"] = "starget_regions.html"
        link_ds["Target Regions"] = "target_regions.html"
        link_ds["Parameters"] = "parameters.html"

        ##################################################
        # index.html

        html = Html(name=html_header, links_dict=link_ds,  # fig_dir=os.path.join(directory,"style"),
                    fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html")
        # Plots
        html.add_figure("lineplot_region.png", align="left", width="45%", more_images=["boxplot_regions.png"])
        if self.showdbs:
            html.add_figure("lineplot_dbs.png", align="left", width="45%", more_images=["boxplot_dbs.png"])

        if self.showdbs:
            header_list = [["#", "DBD", "Target Regions", None, "Non-target Regions", None, "Statistics",
                            "Target Regions", "Non-target Regions", None, "Statistics"],
                           ["", "", "with DBS", "without DBS", "with DBS (average)", "s.d.", "<i>p</i>-value",
                            "NO. DBSs", "NO. DBSs (average)", "s.d.", "<i>p</i>-value"]]
            header_titles = [["Rank", "DNA Binding Domain", "Given target regions on DNA", None,
                              "Regions from randomization", None, "Statistics based on target regions",
                              "Given target regions on DNA", "Regions from randomization", None,
                              "Statistics based on DNA Binding Sites"],
                             ["", "",
                              "Number of target regions with DBS binding",
                              "Number of target regions without DBS binding",
                              "Average number of regions from randomization with DBS binding",
                              "Standard deviation", "P value",
                              "Number of related DNA Binding Sites binding to target regions",
                              "Average number of DNA Binding Sites binding to random regions",
                              "Standard deviation", "P-value"]]
            border_list = [" style=\"border-right:1pt solid gray\"",
                           " style=\"border-right:1pt solid gray\"", "",
                           " style=\"border-right:1pt solid gray\"", "",
                           " style=\"border-right:1pt solid gray\"",
                           " style=\"border-right:2pt solid gray\"",
                           " style=\"border-right:1pt solid gray\"", "",
                           " style=\"border-right:1pt solid gray\"",
                           " style=\"border-right:1pt solid gray\""]
        else:
            header_list = [["#", "DBD", "Target Regions", None, "Non-target Regions", None, "Statistics", None],
                           ["", "", "with DBS", "without DBS", "with DBS (average)", "s.d.", "<i>p</i>-value",
                            "z-score"]]
            header_titles = [["Rank", "DNA Binding Domain", "Given target regions on DNA", None,
                              "Regions from randomization", None, "Statistics based on target regions", None],
                             ["", "",
                              "Number of target regions with DBS binding",
                              "Number of target regions without DBS binding",
                              "Average number of regions from randomization with DBS binding",
                              "Standard deviation", "P value", "Z-score"]]
            border_list = [" style=\"border-right:1pt solid gray\"",
                           " style=\"border-right:1pt solid gray\"", "",
                           " style=\"border-right:1pt solid gray\"", "",
                           " style=\"border-right:1pt solid gray\"",
                           " style=\"border-right:1pt solid gray\"", ""]

        type_list = 'ssssssssssssssss'
        col_size_list = [50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]
        data_table = []

        for i, rbs in enumerate(self.rbss):
            if self.data["region"]["p"][i] < alpha:
                p_region = "<font color=\"red\">" + value2str(self.data["region"]["p"][i]) + "</font>"

            else:
                p_region = value2str(self.data["region"]["p"][i])
            zs = (self.counts_tr[rbs][0] - self.data["region"]["ave"][i]) / self.data["region"]["sd"][i]
            new_line = [str(i + 1),
                        rbs.str_rna(pa=False),
                        '<a href="dbd_region.html#' + rbs.str_rna() +
                        '" style="text-align:left">' + str(self.counts_tr[rbs][0]) + '</a>',
                        str(self.counts_tr[rbs][1]),
                        value2str(self.data["region"]["ave"][i]),
                        value2str(self.data["region"]["sd"][i]),
                        p_region,
                        value2str(zs)]
            if self.showdbs:
                if self.data["dbs"]["p"][i] < alpha:
                    p_dbs = "<font color=\"red\">" + value2str(self.data["dbs"]["p"][i]) + "</font>"
                else:
                    p_dbs = value2str(self.data["dbs"]["p"][i])

                new_line += [str(self.counts_dbs[rbs]),
                             value2str(self.data["dbs"]["ave"][i]),
                             value2str(self.data["dbs"]["sd"][i]),
                             p_dbs]
            data_table.append(new_line)

        data_table = natsort.natsorted(data_table, key=lambda x: x[6])
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left",
                             auto_width=True, header_titles=header_titles, border_list=border_list, sortable=True)

        html.add_heading("Notes")
        html.add_list(["RNA name: " + self.rna_name,
                       "Randomization is performed for " + str(self.repeats) + " times.",
                       "DBD stands for DNA Binding Domain on RNA.",
                       "DBS stands for DNA Binding Site on DNA."])
        html.add_fixed_rank_sortable()
        html.write(os.path.join(directory, "index.html"))

        #############################################################
        # RNA subpage: Profile of targeted regions for each merged DNA Binding Domain
        #############################################################

        header_list = ["#", "Target Region",
                       "Associated Gene",
                       "No. of DBSs",
                       "DBS coverage"]
        header_titles = ["Rank", "Given target regions from BED files",
                         "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)",
                         "Number of DNA Binding Sites locate within the region",
                         "The proportion of the region covered by DBS binding"]

        #########################################################
        # dbd_region.html
        html = Html(name=html_header, links_dict=link_ds,  # fig_dir=os.path.join(directory,"style"),
                    fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html")

        for rbsm in self.rbss:
            html.add_heading("DNA Binding Domain: " + rbsm.str_rna(),
                             idtag=rbsm.str_rna())
            data_table = []
            for i, region in enumerate(self.txp.merged_dict[rbsm]):
                # Add information
                data_table.append([str(i + 1),
                                   '<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' + self.organism +
                                   "&position=" + region.chrom + "%3A" + str(region.initial) + "-" + str(region.final) +
                                   '" style="text-align:left">' + region.toString(space=True) + '</a>',
                                   split_gene_name(gene_name=region.name, org=self.organism),
                                   str(len(self.region_dbs[region.toString()])),
                                   value2str(self.region_coverage[region.toString()])
                                   ])

            html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left",
                                 auto_width=True, header_titles=header_titles, sortable=True)
        html.add_fixed_rank_sortable()
        html.write(os.path.join(directory, "dbd_region.html"))

        #############################################################
        # Targeted regions centered
        #############################################################

        ##############################################################################################
        # target_regions.html
        html = Html(name=html_header, links_dict=link_ds,  # fig_dir=os.path.join(directory,"style"),
                    fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html")

        if score:
            header_list = ["#", "Target region", "Associated Gene", "DBSs Count",
                           "DBS coverage", "Score", "Sum of ranks"]
            header_titles = ["Rank",
                             "Target regions loaded from the given BED file",
                             "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)",
                             "Number of DNA Binding Sites within the region",
                             "The proportion of the region covered by DBS binding",
                             "Scores from BED file",
                             "Sum of all the left-hand-side ranks"]
        else:
            header_list = ["#", "Target region", "Associated Gene", "DBSs Count",
                           "DBS coverage", "Sum of ranks"]
            header_titles = ["Rank",
                             "Target regions loaded from the given BED file",
                             "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)",
                             "Number of DNA Binding Sites within the region",
                             "The proportion of the region covered by DBS binding",
                             "Sum of all the left-hand-side ranks"]
        html.add_heading("Target Regions")
        data_table = []

        if not self.dna_region.sorted: self.dna_region.sort()

        # Calculate the ranking
        rank_count = len(self.dna_region) - rank_array([len(self.region_dbs[p.toString()]) for p in self.dna_region])
        rank_coverage = len(self.dna_region) - rank_array([self.region_coverage[p.toString()] for p in self.dna_region])

        if score:
            try:
                score_list = [float(p.data.split("\t")[0]) for p in self.dna_region]
                rank_score = len(self.dna_region) - rank_array([abs(s) for s in score_list])
                rank_sum = [x + y + z for x, y, z in zip(rank_count, rank_coverage, rank_score)]
                # sum_rank = rank_array(rank_sum)  # method='min'
            except ImportError:
                print("There is no score in BED file, please don't use '-score' argument.")
        else:
            rank_sum = [x + y for x, y in zip(rank_count, rank_coverage)]
            sum_rank = rank_array(rank_sum)

        for i, region in enumerate(self.dna_region):
            dbs_counts = str(len(self.region_dbs[region.toString()]))
            dbs_cover = value2str(self.region_coverage[region.toString()])

            newline = [str(i + 1),
                       '<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' + self.organism +
                       "&position=" + region.chrom + "%3A" + str(region.initial) + "-" + str(region.final) +
                       '" style="text-align:left">' + region.toString(space=True) + '</a>',
                       split_gene_name(gene_name=region.name, org=self.organism),
                       '<a href="region_dbs.html#' + region.toString() +
                       '" style="text-align:left">' + dbs_counts + '</a>',
                       dbs_cover]

            if score:
                dbs_score = value2str(score_list[i])
                region.data = "\t".join([dbs_counts, dbs_cover, dbs_score, str(rank_sum[i])])
                newline.append(dbs_score)
                newline.append(str(rank_sum[i]))
            else:
                region.data = "\t".join([dbs_counts, dbs_cover, str(rank_sum[i])])
                newline.append(str(rank_sum[i]))
            data_table.append(newline)

        data_table = natsort.natsorted(data_table, key=lambda x: x[-1])
        # data_table = sorted(data_table, key=lambda x: x[-1])
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left",
                             auto_width=True, header_titles=header_titles, sortable=True)
        html.add_heading("Notes")
        html.add_list(["All target regions without any bindings are ignored."])
        html.add_fixed_rank_sortable()
        html.write(os.path.join(directory, "target_regions.html"))

        self.dna_region.sort_score()
        self.dna_region.write_bed(os.path.join(directory, obed + "_target_regions.bed"))



        ##############################################################################################
        # starget_regions.html    for significant target regions

        stargets = GenomicRegionSet("sig_targets")
        sig_dbs = {}
        sig_dbs_coverage = {}
        for i, r in enumerate(self.dna_region):
            sig_bindings = self.region_dbs[r.toString()].overlap_rbss(rbss=self.data["region"]["sig_region"])
            dbs = sig_bindings.get_dbs()
            if len(dbs) > 0:
                stargets.add(r)
                m_dbs = dbs.merge(w_return=True)
                sig_dbs[r] = len(dbs)
                # self.promoter["de"]["merged_dbs"][promoter.toString()] = len(m_dbs)
                sig_dbs_coverage[r] = float(m_dbs.total_coverage()) / len(r)

        html = Html(name=html_header, links_dict=link_ds,  # fig_dir=os.path.join(directory,"style"),
                    fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html")

        # Select promoters in sig DBD
        if len(self.data["region"]["sig_region"]) == 0:
            html.add_heading("There is no significant DBD.")
        else:
            html.add_heading("Target regions bound by significant DBD")
            data_table = []
            # Calculate the ranking
            rank_count = len(stargets) - rank_array([sig_dbs[p] for p in stargets])
            rank_coverage = len(stargets) - rank_array([sig_dbs_coverage[p] for p in stargets])
            if score:
                score_list = [float(p.data.split("\t")[0]) for p in stargets]
                rank_score = len(stargets) - rank_array([abs(s) for s in score_list])
                rank_sum = [x + y + z for x, y, z in zip(rank_count, rank_coverage, rank_score)]
                sum_rank = rank_array(rank_sum)  # method='min'
            else:
                rank_sum = [x + y for x, y in zip(rank_count, rank_coverage)]
                sum_rank = rank_array(rank_sum)

            for i, region in enumerate(stargets):
                dbssount = '<a href="region_dbs.html#' + region.toString() + \
                           '" style="text-align:left">' + str(sig_dbs[region]) + '</a>'

                region_link = region_link_internet(self.organism, region)

                newline = [str(i + 1), region_link,
                           split_gene_name(gene_name=region.name, org=self.organism),
                           dbssount, value2str(sig_dbs_coverage[region]) ]
                if score:
                    dbs_score = value2str(score_list[i])
                    # region.data = "\t".join([dbs_counts, dbs_cover, dbs_score, str(sum_rank[i])])
                    newline.append(dbs_score)
                    newline.append(str(rank_sum[i]))
                    # print([dbs_score, str(sum_rank[i])])
                else:
                    # region.data = "\t".join([dbs_counts, dbs_cover, str(sum_rank[i])])
                    newline.append(str(rank_sum[i]))

                # newline += ["<i>" + str(rank_sum[i]) + "</i>"]
                # print(newline)
                data_table.append(newline)

            # print(data_table)
            # data_table = sorted(data_table, key=lambda x: x[-1])
            data_table = natsort.natsorted(data_table, key=lambda x: x[-1])
            html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left",
                                 header_titles=header_titles, border_list=None, sortable=True)
            html.add_heading("Notes")
            html.add_list(["DBS stands for DNA Binding Site on DNA.",
                           "DBS coverage is the proportion of the region where has potential to form triple helices with the given RNA."])
            html.add_fixed_rank_sortable()
            html.write(os.path.join(directory, "starget_regions.html"))

        ############################
        # Subpages for targeted region centered page
        # region_dbs.html
        header_list = ["RBS", "DBS", "Strand", "Score", "Motif", "Orientation"]

        html = Html(name=html_header, links_dict=link_ds,  # fig_dir=os.path.join(directory,"style"),
                    fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html")

        for i, region in enumerate(self.dna_region):
            if len(self.region_dbs[region.toString()]) == 0:
                continue
            else:
                html.add_heading("Associated gene: " + split_gene_name(gene_name=region.name, org=self.organism),
                                 idtag=region.toString())
                html.add_free_content(['<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' + self.organism +
                                       "&position=" + region.chrom + "%3A" + str(region.initial) +
                                       "-" + str(region.final) + '" style="margin-left:50">' +
                                       region.toString(space=True) + '</a>'])
                data_table = []
                for rd in self.region_dbs[region.toString()]:
                    rbs = rd.rna.str_rna(pa=False)
                    for rbsm in self.data["region"]["sig_region"]:
                        # rbsm = rbsm.partition(":")[2].split("-")
                        if rd.rna.overlap(rbsm):
                            rbs = "<font color=\"red\">" + rbs + "</font>"
                    data_table.append([rbs,
                                       '<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' + self.organism +
                                       "&position=" + rd.dna.chrom + "%3A" + str(rd.dna.initial) + "-" + str(
                                           rd.dna.final) +
                                       '" style="text-align:left">' + rd.dna.toString(space=True) + '</a>',
                                       rd.dna.orientation, rd.score, rd.motif, rd.orient])
                html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left",
                                     auto_width=True)
        html.write(os.path.join(directory, "region_dbs.html"))

        ###############################################################################33
        ################ Parameters.html

        html = Html(name=html_header, links_dict=link_ds,  # fig_dir=os.path.join(directory,"style"),
                    fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html")
        html.add_heading("Parameters")
        header_list = ["Description", "Arguments", "Value"]

        data_table = [["RNA sequence name", "-rn", parameters.rn],
                      ["Input RNA sequence file", "-r", os.path.basename(parameters.r)],
                      ["Input BED file", "-bed", os.path.basename(parameters.bed)],
                      ["Output directory", "-o", os.path.basename(parameters.o)],
                      ["Organism", "-organism", parameters.organism],
                      ["Number of repitetion of andomization", "-n", str(parameters.n)],
                      ["Alpha level for rejection p value", "-a", str(parameters.a)],
                      ["Cut off value for filtering out the low counts of DBSs", "-ccf", str(parameters.ccf)],
                      ["Remove temporary files", "-rt", str(parameters.rt)],
                      ["Input BED file for masking in randomization", "-f", str(parameters.f)],
                      ["Input file for RNA accecibility", "-ac", str(parameters.ac)],
                      ["Cut off value for RNA accecibility", "-accf", str(parameters.accf)],
                      ["Output the BED files for DNA binding sites.", "-obed", str(parameters.obed)],
                      ["Show parallel and antiparallel bindings in the plot separately.", "-showpa",
                       str(parameters.showpa)],
                      ["Minimum length", "-l", str(self.triplexator_p[0])],
                      ["Maximum error rate", "-e", str(self.triplexator_p[1])],
                      ["Tolerated number of consecutive errors", "-c", str(self.triplexator_p[2])],
                      ["Filtering repeats", "-fr", str(self.triplexator_p[3])],
                      ["Filtering mode", "-fm", str(self.triplexator_p[4])],
                      ["Output format", "-of", str(self.triplexator_p[5])],
                      ["Merge features", "-mf", str(self.triplexator_p[6])]]
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left",
                             auto_width=True)
        html.add_free_content(['<a href="summary.txt" style="margin-left:100">See details</a>'])
        html.write(os.path.join(directory, "parameters.html"))
Пример #27
0
    def line(self):
        """Plot average bias and ATAC-seq signals around motif-predicted binding sites.

        Reads the BAM file (``self.bam_file``), the forward/reverse bias tables
        (``self.bias_table``, two comma-separated file names) and the BED file of
        motif-predicted binding sites (``self.motif_file``).  For every site whose
        name ends in ``:Y`` a window of ``self.window_size`` bp centred on the site
        midpoint is used to accumulate:

        * the raw and bias-corrected signal (combined, or per strand when
          ``self.strands_specific`` is set),
        * a per-position nucleotide count matrix,
        * the per-position forward/reverse bias-table values,
        * optionally (``self.protection_score``) the bias-corrected signal inside
          the site and in the two flanks of the same length.

        The averaged results are written into ``self.output_loc`` as
        ``<motif_name>.pwm``, ``<motif_name>.logo.eps``, ``<motif_name>.txt``,
        ``<motif_name>.line.eps`` and the merged ``<motif_name>.eps``; the three
        EPS files are then converted with the external ``epstopdf`` tool
        (best effort).

        Side effects: sets ``self.k_nb`` to the k-mer length of the forward bias
        table.  Returns ``None``.  When no site is selected a warning is printed
        and nothing is written.
        """
        import subprocess  # local import: only needed for the EPS -> PDF conversion

        signal = GenomicSignal(self.bam_file)
        signal.load_sg_coefs(slope_window_size=9)
        bias_table = BiasTable()
        bias_table_list = self.bias_table.split(",")
        table = bias_table.load_table(table_file_name_F=bias_table_list[0],
                                      table_file_name_R=bias_table_list[1])
        genome_data = GenomeData(self.organism)
        genome_file_name = genome_data.get_genome()
        fasta = Fastafile(genome_file_name)

        # Per-position nucleotide counts; "N" also collects any unexpected symbol.
        pwm_dict = {base: [0.0] * self.window_size for base in "ACGTN"}

        mean_raw_signal = np.zeros(self.window_size)
        mean_bc_signal = np.zeros(self.window_size)
        mean_raw_signal_f = np.zeros(self.window_size)
        mean_bc_signal_f = np.zeros(self.window_size)
        mean_raw_signal_r = np.zeros(self.window_size)
        mean_bc_signal_r = np.zeros(self.window_size)

        mean_bias_signal_f = np.zeros(self.window_size)
        mean_bias_signal_r = np.zeros(self.window_size)
        num_sites = 0

        mpbs_regions = GenomicRegionSet("Motif Predicted Binding Sites")
        mpbs_regions.read_bed(self.motif_file)

        total_nc_signal = 0
        total_nl_signal = 0
        total_nr_signal = 0

        # Keyword arguments shared by every signal query below.
        signal_kwargs = dict(downstream_ext=self.atac_downstream_ext,
                             upstream_ext=self.atac_upstream_ext,
                             forward_shift=self.atac_forward_shift,
                             reverse_shift=self.atac_reverse_shift,
                             genome_file_name=genome_file_name)

        # Bias tables and k-mer size do not depend on the site, so resolve them
        # once; this also guarantees self.k_nb exists even if no site is selected.
        bias_table_f = table[0]
        bias_table_r = table[1]
        self.k_nb = len(next(iter(bias_table_f.keys())))
        half_k = self.k_nb // 2

        for region in mpbs_regions:
            # Only sites whose name carries the ":Y" suffix contribute.
            if str(region.name).split(":")[-1] != "Y":
                continue
            num_sites += 1

            # Window of window_size bp centred on the site midpoint.  Floor
            # division keeps the coordinates integral on Python 2 and 3.
            mid = (region.initial + region.final) // 2
            p1 = mid - (self.window_size // 2)
            p2 = mid + (self.window_size // 2)

            if not self.strands_specific:
                # Raw signal
                raw_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                  **signal_kwargs)
                mean_raw_signal = np.add(mean_raw_signal, raw_signal)

                # Bias-corrected signal
                bc_signal, _ = signal.get_signal(ref=region.chrom, start=p1, end=p2,
                                                 bias_table=table, **signal_kwargs)
                mean_bc_signal = np.add(mean_bc_signal, bc_signal)
            else:
                raw_signal_f, _, raw_signal_r, _ = signal.get_signal_per_strand(
                    ref=region.chrom, start=p1, end=p2, **signal_kwargs)
                mean_raw_signal_f = np.add(mean_raw_signal_f, raw_signal_f)
                mean_raw_signal_r = np.add(mean_raw_signal_r, raw_signal_r)

                bc_signal_f, _, bc_signal_r, _ = signal.get_signal_per_strand(
                    ref=region.chrom, start=p1, end=p2, bias_table=table, **signal_kwargs)
                mean_bc_signal_f = np.add(mean_bc_signal_f, bc_signal_f)
                mean_bc_signal_r = np.add(mean_bc_signal_r, bc_signal_r)

            # Update the nucleotide counts with the sequence read on the strand
            # of the site.  For odd-length sites the reverse-strand window is
            # shifted by one base before reverse-complementing.
            if region.orientation == "+":
                pwm_seq = str(fasta.fetch(region.chrom, p1, p2)).upper()
            elif region.orientation == "-":
                aux_plus = (region.final - region.initial) % 2
                pwm_seq = AuxiliaryFunctions.revcomp(
                    str(fasta.fetch(region.chrom, p1 + aux_plus, p2 + aux_plus)).upper())
            else:
                pwm_seq = ""
            for i, base in enumerate(pwm_seq):
                # Symbols outside A/C/G/T/N (e.g. IUPAC codes) are counted as N.
                pwm_dict.get(base, pwm_dict["N"])[i] += 1

            # Per-position bias values: look up the k-mer centred on every
            # position of the window in the forward and reverse bias tables.
            bias_signal_f = []
            bias_signal_r = []
            p1_wk = p1 - half_k
            p2_wk = p2 + half_k
            bias_seq = str(fasta.fetch(region.chrom, p1_wk, p2_wk - 1)).upper()
            bias_seq_rev = AuxiliaryFunctions.revcomp(
                str(fasta.fetch(region.chrom, p1_wk, p2_wk + 1)).upper())
            seq_len = len(bias_seq)
            for i in range(half_k, seq_len - half_k + 1):
                fseq = bias_seq[i - half_k:i + half_k]
                rseq = bias_seq_rev[seq_len - half_k - i:seq_len + half_k - i]
                # k-mers missing from a table (e.g. containing N) count as 1.
                try:
                    bias_signal_f.append(bias_table_f[fseq])
                except KeyError:
                    bias_signal_f.append(1)
                try:
                    bias_signal_r.append(bias_table_r[rseq])
                except KeyError:
                    bias_signal_r.append(1)

            mean_bias_signal_f = np.add(mean_bias_signal_f, np.array(bias_signal_f))
            mean_bias_signal_r = np.add(mean_bias_signal_r, np.array(bias_signal_r))

            if self.protection_score:
                site_len = region.final - region.initial

                # Signal inside the site
                nc_signal, _ = signal.get_signal(ref=region.chrom,
                                                 start=region.initial, end=region.final,
                                                 bias_table=table, **signal_kwargs)
                total_nc_signal += sum(nc_signal)

                # Right flank: site_len bp starting where the site ends
                nr_signal, _ = signal.get_signal(ref=region.chrom,
                                                 start=region.final,
                                                 end=region.final + site_len,
                                                 bias_table=table, **signal_kwargs)
                total_nr_signal += sum(nr_signal)

                # Left flank: site_len bp ending where the site starts, so the
                # flank does not overlap the site and mirrors the right flank.
                nl_signal, _ = signal.get_signal(ref=region.chrom,
                                                 start=region.initial - site_len,
                                                 end=region.initial,
                                                 bias_table=table, **signal_kwargs)
                total_nl_signal += sum(nl_signal)

        if num_sites == 0:
            # Averages are undefined without sites; stop before dividing by zero.
            print(" *** No motif predicted binding site selected for {}; "
                  "nothing to plot.".format(self.motif_name))
            return

        mean_raw_signal = mean_raw_signal / num_sites
        mean_bc_signal = mean_bc_signal / num_sites

        mean_raw_signal_f = mean_raw_signal_f / num_sites
        mean_raw_signal_r = mean_raw_signal_r / num_sites
        mean_bc_signal_f = mean_bc_signal_f / num_sites
        mean_bc_signal_r = mean_bc_signal_r / num_sites

        mean_bias_signal_f = mean_bias_signal_f / num_sites
        mean_bias_signal_r = mean_bias_signal_r / num_sites

        # Mean flank signal minus mean site signal, averaged over sites.
        protection_score = (total_nl_signal + total_nr_signal - 2 * total_nc_signal) / float(2 * num_sites)

        # Output PWM and create logo
        pwm_fname = os.path.join(self.output_loc, "{}.pwm".format(self.motif_name))
        with open(pwm_fname, "w") as pwm_file:
            for e in ["A", "C", "G", "T"]:
                pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[e]]) + "\n")

        logo_fname = os.path.join(self.output_loc, "{}.logo.eps".format(self.motif_name))
        with open(pwm_fname) as pwm_handle:
            pwm = motifs.read(pwm_handle, "pfm")
        pwm.weblogo(logo_fname, format="eps", stack_width="large", stacks_per_line="100",
                    color_scheme="color_classic", unit_name="", show_errorbars=False, logo_title="",
                    show_xaxis=False, xaxis_label="", show_yaxis=False, yaxis_label="",
                    show_fineprint=False, show_ends=False)

        # Output the raw, bias corrected signal and protection score
        output_fname = os.path.join(self.output_loc, "{}.txt".format(self.motif_name))
        with open(output_fname, "w") as output_file:
            if not self.strands_specific:
                output_file.write("raw signal: \n" + np.array_str(mean_raw_signal) + "\n")
                output_file.write("bias corrected signal: \n" + np.array_str(mean_bc_signal) + "\n")
            else:
                output_file.write("raw forward signal: \n" + np.array_str(mean_raw_signal_f) + "\n")
                output_file.write("bias corrected forward signal: \n" + np.array_str(mean_bc_signal_f) + "\n")
                output_file.write("raw reverse signal: \n" + np.array_str(mean_raw_signal_r) + "\n")
                output_file.write("bias reverse corrected signal: \n" + np.array_str(mean_bc_signal_r) + "\n")
            output_file.write("forward bias signal: \n" + np.array_str(mean_bias_signal_f) + "\n")
            output_file.write("reverse bias signal: \n" + np.array_str(mean_bias_signal_r) + "\n")
            if self.protection_score:
                output_file.write("protection score: \n" + str(protection_score) + "\n")

        # One panel for the bias signal plus one (combined) or two (per strand)
        # panels for the ATAC-seq signal.
        if self.strands_specific:
            fig, (ax1, ax2, ax3) = plt.subplots(3, figsize=(12.0, 10.0))
        else:
            fig, (ax1, ax2) = plt.subplots(2)
        # NOTE(review): the x axis is hard-coded to [-50, 49] regardless of
        # window_size - confirm this is intended for windows other than 100 bp.
        x = np.linspace(-50, 49, num=self.window_size)
        x_ticks = [-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 49]
        x_tick_labels = [str(tick) for tick in x_ticks]

        ax1.plot(x, mean_bias_signal_f, color='red', label='Forward')
        ax1.plot(x, mean_bias_signal_r, color='blue', label='Reverse')

        ax1.xaxis.set_ticks_position('bottom')
        ax1.yaxis.set_ticks_position('left')
        ax1.spines['top'].set_visible(False)
        ax1.spines['right'].set_visible(False)
        ax1.spines['left'].set_position(('outward', 15))
        ax1.spines['bottom'].set_position(('outward', 5))
        ax1.tick_params(direction='out')

        ax1.set_xticks(x_ticks)
        ax1.set_xticklabels(x_tick_labels)
        min_bias_signal = min(min(mean_bias_signal_f), min(mean_bias_signal_r))
        max_bias_signal = max(max(mean_bias_signal_f), max(mean_bias_signal_r))
        ax1.set_yticks([min_bias_signal, max_bias_signal])
        ax1.set_yticklabels([str(round(min_bias_signal, 2)), str(round(max_bias_signal, 2))], rotation=90)

        ax1.text(-48, max_bias_signal, '# Sites = {}'.format(str(num_sites)), fontweight='bold')
        ax1.set_title(self.motif_name, fontweight='bold')
        ax1.set_xlim(-50, 49)
        ax1.set_ylim([min_bias_signal, max_bias_signal])
        ax1.legend(loc="upper right", frameon=False)
        ax1.set_ylabel("Average Bias \nSignal", rotation=90, fontweight='bold')

        # Signals are passed through self.standardize before plotting; the
        # y axis of the signal panels is fixed to [0, 1].
        if not self.strands_specific:
            mean_raw_signal = self.standardize(mean_raw_signal)
            mean_bc_signal = self.standardize(mean_bc_signal)
            ax2.plot(x, mean_raw_signal, color='red', label='Uncorrected')
            ax2.plot(x, mean_bc_signal, color='green', label='Corrected')
        else:
            mean_raw_signal_f = self.standardize(mean_raw_signal_f)
            mean_raw_signal_r = self.standardize(mean_raw_signal_r)
            mean_bc_signal_f = self.standardize(mean_bc_signal_f)
            mean_bc_signal_r = self.standardize(mean_bc_signal_r)
            ax2.plot(x, mean_raw_signal_f, color='red', label='Forward')
            ax2.plot(x, mean_raw_signal_r, color='green', label='Reverse')
            ax3.plot(x, mean_bc_signal_f, color='red', label='Forward')
            ax3.plot(x, mean_bc_signal_r, color='green', label='Reverse')

        ax2.xaxis.set_ticks_position('bottom')
        ax2.yaxis.set_ticks_position('left')
        ax2.spines['top'].set_visible(False)
        ax2.spines['right'].set_visible(False)
        ax2.spines['left'].set_position(('outward', 15))
        ax2.tick_params(direction='out')
        ax2.set_xticks(x_ticks)
        ax2.set_xticklabels(x_tick_labels)
        ax2.set_yticks([0, 1])
        ax2.set_yticklabels([str(0), str(1)], rotation=90)
        ax2.set_xlim(-50, 49)
        ax2.set_ylim([0, 1])

        if not self.strands_specific:
            # Extra room below the last panel: the sequence logo is overlaid there.
            ax2.spines['bottom'].set_position(('outward', 40))
            ax2.set_xlabel("Coordinates from Motif Center", fontweight='bold')
            ax2.set_ylabel("Average ATAC-seq \nSignal", rotation=90, fontweight='bold')
            ax2.legend(loc="center", frameon=False, bbox_to_anchor=(0.85, 0.06))
        else:
            ax2.spines['bottom'].set_position(('outward', 5))
            ax2.set_ylabel("Average ATAC-seq \n Uncorrected Signal", rotation=90, fontweight='bold')
            ax2.legend(loc="lower right", frameon=False)

            ax3.xaxis.set_ticks_position('bottom')
            ax3.yaxis.set_ticks_position('left')
            ax3.spines['top'].set_visible(False)
            ax3.spines['right'].set_visible(False)
            ax3.spines['left'].set_position(('outward', 15))
            ax3.tick_params(direction='out')
            ax3.set_xticks(x_ticks)
            ax3.set_xticklabels(x_tick_labels)
            ax3.set_yticks([0, 1])
            ax3.set_yticklabels([str(0), str(1)], rotation=90)
            ax3.set_xlim(-50, 49)
            ax3.set_ylim([0, 1])
            ax3.legend(loc="lower right", frameon=False)
            ax3.spines['bottom'].set_position(('outward', 40))
            ax3.set_xlabel("Coordinates from Motif Center", fontweight='bold')
            ax3.set_ylabel("Average ATAC-seq \n Corrected Signal", rotation=90, fontweight='bold')
            ax3.text(-48, 0.05, '# K-mer = {}\n# Forward Shift = {}'.format(str(self.k_nb), str(self.atac_forward_shift)),
                     fontweight='bold')

        figure_name = os.path.join(self.output_loc, "{}.line.eps".format(self.motif_name))
        fig.subplots_adjust(bottom=.2, hspace=.5)
        fig.tight_layout()
        fig.savefig(figure_name, format="eps", dpi=300)
        # Release the figure; otherwise every call keeps one more figure alive.
        plt.close(fig)

        # Creating canvas and printing eps / pdf with merged results
        output_fname = os.path.join(self.output_loc, "{}.eps".format(self.motif_name))
        c = pyx.canvas.canvas()
        c.insert(pyx.epsfile.epsfile(0, 0, figure_name, scale=1.0))
        if self.strands_specific:
            c.insert(pyx.epsfile.epsfile(2.76, 1.58, logo_fname, width=27.2, height=2.45))
        else:
            c.insert(pyx.epsfile.epsfile(2.5, 1.54, logo_fname, width=16, height=1.75))
        c.writeEPSfile(output_fname)

        # Convert the EPS outputs to PDF.  The argument-list form keeps paths
        # with spaces or shell metacharacters intact; conversion stays best
        # effort when the external tool is unavailable.
        for eps_path in (figure_name, logo_fname, output_fname):
            try:
                subprocess.call(["epstopdf", eps_path])
            except OSError:
                print(" *** Could not run epstopdf on " + eps_path)