def gen_html_distribution(self, outputname, title, align=50):
    """Write ``distribution.html`` with the distribution figures and tables.

    The page contains one ``distribution_test_<i>.png`` figure per entry of
    ``self.fig`` and, for every key of ``self.disperDict``, a heading followed
    by a zebra table holding one row per chromosome in ``self.chrom_list``.

    Args:
        outputname: Output folder name (joined below the base directory).
        title: Experiment title; used as sub-folder name and as the label of
            the navigation link.
        align: Alignment/margin value forwarded to the ``Html`` helpers.
    """
    # NOTE(review): ``dir`` is not bound inside this method, so it resolves to
    # a module-level name (or the builtin) -- presumably the base output
    # directory; confirm against the module.
    fp = os.path.join(dir, outputname, title)
    link_d = {title: "distribution.html"}
    html = Html(name="Viz", links_dict=link_d,
                fig_dir=os.path.join(dir, outputname, "fig"),
                other_logo="viz", homepage="../index.html")

    for i, _ in enumerate(self.fig):
        html.add_figure("distribution_test_" + str(i) + ".png", align="center")

    html.add_free_content(['<p style=\"margin-left: ' + str(align + 150) + '">' + '** </p>'])

    type_list = 'ssssssssssssssssssssssssssssssssssssssssssssss'
    col_size_list = [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                     10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]

    # NOTE(review): the rows are collected in ONE list shared by all types, so
    # each table also repeats the rows of the types rendered before it. This
    # is kept as in the original; confirm whether a per-type reset is wanted.
    data_table = []
    for ty in self.disperDict.keys():
        per_type = self.disperDict[ty]
        # ``keys()`` is a view on Python 3 and cannot be added to a list.
        names = list(per_type.keys())
        header_list = ["Chromosome"] + names
        html.add_heading(ty, size=4, bold=False)
        for i, ch in enumerate(self.chrom_list):
            data_table.append(
                [ch] + ["{:.3f} %".format(100 * per_type[r][i]) for r in names])
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align)

    html.add_free_content(['<a href="parameters.txt" style="margin-left:100">See parameters</a>'])
    html.add_free_content([
        '<a href="reference_experimental_matrix.txt" style="margin-left:100">See reference experimental matrix</a>'])
    html.add_free_content(
        ['<a href="query_experimental_matrix.txt" style="margin-left:100">See query experimental matrix</a>'])
    html.write(os.path.join(fp, "distribution.html"))
def list_all_index(path):
    """Create an ``index.html`` in *path* that lists the experiments below it.

    The navigation bar links every directory found while walking the parent
    of *path* that holds an ``index.html`` and whose own parent directory is
    named like the parent of *path*. The table gets one row for every
    directory below *path* (other than one named like *path* itself) that
    contains an ``index.html``.

    Args:
        path: Directory to index; the page is written to
            ``<path>/index.html``.
    """
    dirname = os.path.basename(path)
    parent = os.path.dirname(path)
    parentdir = os.path.basename(parent)

    # Navigation links. ``os.path`` is used instead of ``root.split('/')`` so
    # that a single-component root no longer raises IndexError and the code
    # does not depend on '/' being the separator.
    link_d = {}
    for root, _dirnames, filenames in os.walk(parent):
        if not fnmatch.filter(filenames, 'index.html'):
            continue
        if os.path.basename(os.path.dirname(root)) == parentdir:
            name = os.path.basename(root)
            link_d[name] = "../" + name + "/index.html"
    # Keys are unique, so sorting the items orders them by key only. (The
    # former ``lambda (key, value): key`` is a SyntaxError on Python 3.)
    link_d = OrderedDict(sorted(link_d.items()))

    html = Html(name="Directory: " + dirname, links_dict=link_d,
                fig_dir=os.path.join(path, "style"), fig_rpath="./style",
                RGT_header=False, other_logo="viz")
    header_list = ["No.", "Experiments"]
    html.add_heading("All experiments in: " + dirname + "/")

    data_table = []
    type_list = 'ssss'
    col_size_list = [10, 10, 10]
    for root, _dirnames, filenames in os.walk(path):
        name = os.path.basename(root)
        if 'index.html' not in filenames or name == dirname:
            continue
        # The <font> element is now closed before the anchor ends.
        data_table.append([
            str(len(data_table) + 1),
            '<a href="' + os.path.join(name, 'index.html') + '"><font size="4">' + name + "</font></a>",
        ])

    html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                         align=50, cell_align="left", sortable=True)
    html.add_fixed_rank_sortable()
    html.write(os.path.join(path, "index.html"))
def list_all_index(path):
    """Create an ``index.html`` in *path* that lists the experiments below it.

    The navigation bar links every directory found while walking the parent
    of *path* that holds an ``index.html`` and whose own parent directory is
    named like the parent of *path*. The table gets one row for every
    directory below *path* (other than one named like *path* itself) that
    contains an ``index.html``.

    Args:
        path: Directory to index; the page is written to
            ``<path>/index.html``.
    """
    dirname = os.path.basename(path)
    parent = os.path.dirname(path)
    parentdir = os.path.basename(parent)

    # Navigation links. ``os.path`` is used instead of ``root.split('/')`` so
    # that a single-component root no longer raises IndexError and the code
    # does not depend on '/' being the separator.
    link_d = {}
    for root, _dirnames, filenames in os.walk(parent):
        if not fnmatch.filter(filenames, 'index.html'):
            continue
        if os.path.basename(os.path.dirname(root)) == parentdir:
            name = os.path.basename(root)
            link_d[name] = "../" + name + "/index.html"
    # Keys are unique, so sorting the items orders them by key only. (The
    # former ``lambda (key, value): key`` is a SyntaxError on Python 3.)
    link_d = OrderedDict(sorted(link_d.items()))

    html = Html(name="Directory: " + dirname, links_dict=link_d,
                fig_dir=os.path.join(path, "style"), fig_rpath="./style",
                RGT_header=False, other_logo="viz")
    header_list = ["No.", "Experiments"]
    html.add_heading("All experiments in: " + dirname + "/")

    data_table = []
    type_list = 'ssss'
    col_size_list = [10, 10, 10]
    for root, _dirnames, filenames in os.walk(path):
        name = os.path.basename(root)
        if 'index.html' not in filenames or name == dirname:
            continue
        # The <font> element is now closed before the anchor ends.
        data_table.append([
            str(len(data_table) + 1),
            '<a href="' + os.path.join(name, 'index.html') + '"><font size="4">' + name + "</font></a>",
        ])

    html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                         align=50, cell_align="left", sortable=True)
    html.add_fixed_rank_sortable()
    html.write(os.path.join(path, "index.html"))
def list_all_index(path):
    """Create an ``index.html`` in *path* that lists the experiments below it.

    Every directory below *path* (other than one named like *path* itself)
    that contains an ``index.html`` becomes one table row. When a directory
    name contains ``_``, its first and last ``_``-separated parts are shown
    in two extra "Tag" columns.

    Args:
        path: Directory to index; the page is written to
            ``<path>/index.html``.
    """
    dirname = os.path.basename(path)
    link_d = {"List": "index.html"}
    html = Html(name="Directory: " + dirname, links_dict=link_d,
                fig_dir=os.path.join(path, "style"), fig_rpath="./style",
                RGT_header=False, other_logo="TDF")
    html.add_heading("All experiments in: " + dirname + "/")

    type_list = 'sssss'
    col_size_list = [10, 10, 10, 10, 10]

    rows = []
    has_tags = False
    for root, _dirnames, filenames in os.walk(path):
        name = os.path.basename(root)
        if 'index.html' not in filenames or name == dirname:
            continue
        row = [
            str(len(rows) + 1),
            '<a href="' + os.path.join(name, 'index.html') + '">' + name + "</a>",
        ]
        if "_" in name:
            tags = name.split("_")
            row += [tags[0], tags[-1]]
            has_tags = True
        rows.append(row)

    # The header used to be taken from whichever row happened to be visited
    # last and was unbound when no experiment was found. It is now decided
    # once for the whole table, and short rows are padded so that every row
    # has as many cells as the header.
    if has_tags:
        header_list = ["No.", "Experiments", "Tag1", "Tag2"]
        data_table = [row + [""] * (len(header_list) - len(row)) for row in rows]
    else:
        header_list = ["No.", "Experiments"]
        data_table = rows

    html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                         align=50, cell_align="left", sortable=True)
    html.add_fixed_rank_sortable()
    html.write(os.path.join(path, "index.html"))
def gen_html(self, directory, title, align=50):
    """Write the lineplot report: ``index.html`` and ``parameters.html``.

    Both pages are written to ``<directory>/<title>/``. The index page shows
    one heading and one ``lineplot_<tag>.png`` figure per entry of
    ``self.group_tags``; the parameters page lists the settings read from
    this object.

    Args:
        directory: Output directory.
        title: Experiment title (sub-folder of *directory*).
        align: Alignment value forwarded to the parameters table.
    """
    page_title = os.path.basename(directory) + " / " + title
    link_d = OrderedDict([
        ("Lineplot", "index.html"),
        ("Parameters", "parameters.html"),
    ])
    target_dir = os.path.join(directory, title)

    # --- index.html: one figure per group tag ---
    html = Html(name=page_title, links_dict=link_d, fig_rpath="../style",
                RGT_header=False, other_logo="viz", homepage="../index.html")
    for tag in self.group_tags:
        html.add_heading(heading=tag)
        html.add_figure("lineplot_" + tag + ".png", align="center", width="80%")
    html.write(os.path.join(target_dir, "index.html"))

    # --- parameters.html: single-column table of the settings ---
    html = Html(name=page_title, links_dict=link_d, fig_rpath="../style",
                RGT_header=False, other_logo="viz", homepage="../index.html")
    type_list = 'ssssssssss'
    col_size_list = [20, 20, 20, 20, 20, 20, 20, 20, 20]
    header_list = ["Assumptions and hypothesis"]

    notes = []
    if self.annotation:
        notes.append(
            "Genomic annotation: TSS - Transcription Start Site; TTS - Transcription Termination Site.")
    notes.append("Directory: " + directory.rpartition("/")[2])
    notes.append("Title: " + title)
    notes.append("Extend length: " + str(self.extend))
    notes.append("Read size: " + str(self.rs))
    notes.append("Bin size: " + str(self.bs))
    notes.append("Step size: " + str(self.ss))
    notes.append("Center mode: " + self.center)
    data_table = [[note] for note in notes]

    html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                         align=align, cell_align="left")
    for target, label in (("parameters.txt", "See parameters"),
                          ("experimental_matrix.txt", "See experimental matrix")):
        html.add_free_content(
            ['<a href="' + target + '" style="margin-left:100">' + label + '</a>'])
    html.write(os.path.join(target_dir, "parameters.html"))
def gen_html(self, directory, title, args, align=50):
    """Write the projection-test report: ``index.html`` and ``parameters.html``.

    For every key of ``self.plist`` the index page gets a heading and a
    sortable table of reference/query pairs, and the accumulated plain-text
    statistics are passed to ``output_array``. Pairs whose p-value is
    ``"na"`` are skipped and their reference is reported in the notes; pairs
    whose proportion is below ``args.cfp`` are skipped silently.

    Args:
        directory: Output directory.
        title: Experiment title (sub-folder of *directory*).
        args: Parsed command-line arguments; ``r``, ``q``, ``o``, ``t``,
            ``organism`` and ``cfp`` are read.
        align: Alignment value forwarded to the tables.
    """
    page_title = "Projection Test: " + os.path.basename(directory)
    link_d = OrderedDict([
        ("Projection test", "index.html"),
        ("Parameters", "parameters.html"),
    ])

    # ---------------- index.html ----------------
    html = Html(name=page_title, links_dict=link_d, fig_rpath="../style",
                RGT_header=False, other_logo="viz", homepage="../index.html")
    html.add_figure("projection_test.png", align="center")

    header_list = ["No.", "Reference<br>name", "Query<br>name", "Reference<br>number",
                   "Query<br>number", "Proportion", "Background<br>proportion",
                   "Positive<br>association<br>p-value", "Negative<br>association<br>p-value"]
    # The plain-text statistics start with their own header row and keep
    # growing across all types.
    statistic_table = [["Reference_name", "Query_name", "Reference_number", "Query_number",
                        "Proportion", "Background_proportion",
                        "Positive_association_p-value", "Negative_association_p-value"]]
    type_list = 'ssssssssssssssss'
    col_size_list = [5, 10, 10, 10, 10, 10, 10, 15, 15]

    nalist = []
    for ind_ty, ty in enumerate(self.plist.keys()):
        html.add_heading(ty, size=4, bold=False)
        data_table = []
        for r in self.plist[ty].keys():
            rlen = str(self.lenlist[r])
            for q in self.plist[ty][r].keys():
                qlen = str(self.lenlist[q])
                background = self.qlist[ty][r]['Background']
                proportion = self.qlist[ty][r][q]
                backv = value2str(background)
                propor = value2str(proportion)
                pv = self.plist[ty][r][q]

                if pv == "na":
                    nalist.append(r)
                    continue
                if proportion < args.cfp:
                    continue

                pvn = 1 - pv
                pv_text = value2str(pv)
                pvn_text = value2str(pvn)
                if pv < 0.05:
                    highlighted = "<font color=\"red\">" + pv_text + "</font>"
                    if background < proportion:
                        # Significant positive association.
                        html_cells = [highlighted, pvn_text]
                        text_cells = [pv_text, pvn_text]
                    else:
                        # Significant negative association: columns swapped.
                        html_cells = [pvn_text, highlighted]
                        text_cells = [pvn_text, pv_text]
                else:
                    html_cells = [pv_text, pvn_text]
                    text_cells = [pv_text, pvn_text]

                shared = [r, q, rlen, qlen, propor, backv]
                data_table.append([str(ind_ty)] + shared + html_cells)
                statistic_table.append(shared + text_cells)

        html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                             align=align, sortable=True)
        output_array(statistic_table, directory=directory, folder=title,
                     filename="statistics" + ty + ".txt")

    header_list = ["Assumptions and hypothesis"]
    data_table = [
        ['If the background proportion is too small, it may cause bias in p value.'],
        ['For projection test, the reference GenomicRegionSet should have non-zero length in order to calculate its background proportion.'],
        ['P values are corrected by multiple test correction.'],
        ['Positive association is defined by: Proportion > Background.'],
        ['Negative association is defined by: Proportion < Background.'],
    ]
    nalist = set(nalist)
    if len(nalist) > 0:
        data_table.append([
            'The following references contain zero-length region which cause error in proportion calculation, please check it:<br>'
            + '     <font color=\"red\">' + ', '.join(nalist) + '</font></p>'])
    html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                         align=align, cell_align="left")
    html.add_fixed_rank_sortable()
    html.write(os.path.join(directory, title, "index.html"))

    # ---------------- parameters.html ----------------
    html = Html(name=page_title, links_dict=link_d, fig_rpath="../style",
                RGT_header=False, other_logo="viz", homepage="../index.html")
    header_list = ["Description", "Argument", "Value"]
    data_table = [
        ["Reference", "-r", args.r],
        ["Query", "-q", args.q],
        ["Output directory", "-o", os.path.basename(args.o)],
        ["Experiment title", "-t", args.t],
        ["Organism", "-organism", args.organism],
        ["Cutoff of proportion", "-cfp", str(args.cfp)],
    ]
    html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                         align=align, cell_align="left")
    html.add_free_content([
        '<a href="reference_experimental_matrix.txt" style="margin-left:100">See reference experimental matrix</a>'])
    html.add_free_content(
        ['<a href="query_experimental_matrix.txt" style="margin-left:100">See query experimental matrix</a>'])
    html.add_free_content(['<a href="parameters.txt" style="margin-left:100">See details</a>'])
    html.write(os.path.join(directory, title, "parameters.html"))
def gen_html(self, directory, parameters, obed, align=50, alpha=0.05, score=False):
    """Generate the HTML report of the genomic region test.

    Written into *directory*: ``index.html``, ``dbd_region.html``,
    ``target_regions.html``, ``starget_regions.html``, ``region_dbs.html``,
    ``parameters.html`` and ``<obed>_target_regions.bed``.

    Args:
        directory: Output directory.
        parameters: Parsed command-line arguments shown on the parameters page.
        obed: Prefix of the BED file written for the target regions.
        align: Alignment value forwarded to the tables.
        alpha: p-values below this threshold are highlighted in red.
        score: When true, the first tab-separated field of each region's
            ``data`` is parsed as a score and added to the ranking.

    Raises:
        ValueError, AttributeError: Re-raised (after printing a hint) when
            *score* is set but a region carries no parsable score.
    """
    dir_name = os.path.basename(directory)
    html_header = "Genomic Region Test: " + dir_name
    link_ds = OrderedDict()
    link_ds["RNA"] = "index.html"
    link_ds["Sig Target Regions"] = "starget_regions.html"
    link_ds["Target Regions"] = "target_regions.html"
    link_ds["Parameters"] = "parameters.html"

    def new_page():
        # Every page of the report shares the same frame.
        return Html(name=html_header, links_dict=link_ds, fig_rpath="../style",
                    RGT_header=False, other_logo="TDF", homepage="../index.html")

    def ucsc_anchor(region, style):
        # Anchor opening the UCSC genome browser at *region*.
        return ('<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' + self.organism
                + "&position=" + region.chrom + "%3A" + str(region.initial)
                + "-" + str(region.final) + '" style="' + style + '">'
                + region.toString(space=True) + '</a>')

    type_list = 'ssssssssssssssss'
    col_size_list = [50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]
    thin = " style=\"border-right:1pt solid gray\""
    thick = " style=\"border-right:2pt solid gray\""

    ##################################################
    # index.html
    html = new_page()
    html.add_figure("lineplot_region.png", align="left", width="45%",
                    more_images=["boxplot_regions.png"])
    if self.showdbs:
        html.add_figure("lineplot_dbs.png", align="left", width="45%",
                        more_images=["boxplot_dbs.png"])

    # Region-based columns. Every data row carries the z-score as its eighth
    # cell; the former ``showdbs`` header had no column for it, which shifted
    # all DBS columns by one. The DBS columns are now appended to the same
    # base header so header and rows always have the same width.
    header_list = [
        ["#", "DBD", "Target Regions", None, "Non-target Regions", None,
         "Statistics", None],
        ["", "", "with DBS", "without DBS", "with DBS (average)", "s.d.",
         "<i>p</i>-value", "z-score"],
    ]
    header_titles = [
        ["Rank", "DNA Binding Domain", "Given target regions on DNA", None,
         "Regions from randomization", None,
         "Statistics based on target regions", None],
        ["", "", "Number of target regions with DBS binding",
         "Number of target regions without DBS binding",
         "Average number of regions from randomization with DBS binding",
         "Standard deviation", "P value", "Z-score"],
    ]
    border_list = [thin, thin, "", thin, "", thin, thin, ""]
    if self.showdbs:
        header_list[0] += ["Target Regions", "Non-target Regions", None, "Statistics"]
        header_list[1] += ["NO. DBSs", "NO. DBSs (average)", "s.d.", "<i>p</i>-value"]
        header_titles[0] += ["Given target regions on DNA", "Regions from randomization",
                             None, "Statistics based on DNA Binding Sites"]
        header_titles[1] += [
            "Number of related DNA Binding Sites binding to target regions",
            "Average number of DNA Binding Sites binding to random regions",
            "Standard deviation", "P-value"]
        # Thick separator between the region block and the DBS block.
        border_list[-1] = thick
        border_list += [thin, "", thin, thin]

    data_table = []
    for i, rbs in enumerate(self.rbss):
        region_p = self.data["region"]["p"][i]
        if region_p < alpha:
            p_region = "<font color=\"red\">" + value2str(region_p) + "</font>"
        else:
            p_region = value2str(region_p)
        zs = (self.counts_tr[rbs][0] - self.data["region"]["ave"][i]) / self.data["region"]["sd"][i]
        new_line = [
            str(i + 1),
            rbs.str_rna(pa=False),
            '<a href="dbd_region.html#' + rbs.str_rna() + '" style="text-align:left">'
            + str(self.counts_tr[rbs][0]) + '</a>',
            str(self.counts_tr[rbs][1]),
            value2str(self.data["region"]["ave"][i]),
            value2str(self.data["region"]["sd"][i]),
            p_region,
            value2str(zs),
        ]
        if self.showdbs:
            dbs_p = self.data["dbs"]["p"][i]
            if dbs_p < alpha:
                p_dbs = "<font color=\"red\">" + value2str(dbs_p) + "</font>"
            else:
                p_dbs = value2str(dbs_p)
            new_line += [
                str(self.counts_dbs[rbs]),
                value2str(self.data["dbs"]["ave"][i]),
                value2str(self.data["dbs"]["sd"][i]),
                p_dbs,
            ]
        data_table.append(new_line)

    # Order rows by the (formatted) region p-value cell.
    data_table = natsort.natsorted(data_table, key=lambda x: x[6])
    html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                         align=align, cell_align="left", auto_width=True,
                         header_titles=header_titles, border_list=border_list,
                         sortable=True)
    html.add_heading("Notes")
    html.add_list([
        "RNA name: " + self.rna_name,
        "Randomization is performed for " + str(self.repeats) + " times.",
        "DBD stands for DNA Binding Domain on RNA.",
        "DBS stands for DNA Binding Site on DNA.",
    ])
    html.add_fixed_rank_sortable()
    html.write(os.path.join(directory, "index.html"))

    #############################################################
    # dbd_region.html: targeted regions for each merged DNA Binding Domain
    header_list = ["#", "Target Region", "Associated Gene", "No. of DBSs", "DBS coverage"]
    header_titles = [
        "Rank",
        "Given target regions from BED files",
        "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)",
        "Number of DNA Binding Sites locate within the region",
        "The proportion of the region covered by DBS binding",
    ]
    html = new_page()
    for rbsm in self.rbss:
        html.add_heading("DNA Binding Domain: " + rbsm.str_rna(), idtag=rbsm.str_rna())
        data_table = []
        for i, region in enumerate(self.txp.merged_dict[rbsm]):
            data_table.append([
                str(i + 1),
                ucsc_anchor(region, "text-align:left"),
                split_gene_name(gene_name=region.name, org=self.organism),
                str(len(self.region_dbs[region.toString()])),
                value2str(self.region_coverage[region.toString()]),
            ])
        html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                             align=align, cell_align="left", auto_width=True,
                             header_titles=header_titles, sortable=True)
    html.add_fixed_rank_sortable()
    html.write(os.path.join(directory, "dbd_region.html"))

    #############################################################
    # target_regions.html
    html = new_page()
    header_list = ["#", "Target region", "Associated Gene", "DBSs Count", "DBS coverage"]
    header_titles = [
        "Rank",
        "Target regions loaded from the given BED file",
        "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)",
        "Number of DNA Binding Sites within the region",
        "The proportion of the region covered by DBS binding",
    ]
    if score:
        header_list.append("Score")
        header_titles.append("Scores from BED file")
    header_list.append("Sum of ranks")
    header_titles.append("Sum of all the left-hand-side ranks")

    html.add_heading("Target Regions")
    data_table = []
    if not self.dna_region.sorted:
        self.dna_region.sort()

    # Calculate the ranking
    rank_count = len(self.dna_region) - rank_array(
        [len(self.region_dbs[p.toString()]) for p in self.dna_region])
    rank_coverage = len(self.dna_region) - rank_array(
        [self.region_coverage[p.toString()] for p in self.dna_region])
    if score:
        try:
            score_list = [float(p.data.split("\t")[0]) for p in self.dna_region]
        except (ValueError, AttributeError):
            # A missing or non-numeric score field raises one of these; the
            # former ``except ImportError`` could never catch it, so the hint
            # below was never shown. The error is re-raised because no
            # ranking can be built without the scores.
            print("There is no score in BED file, please don't use '-score' argument.")
            raise
        rank_score = len(self.dna_region) - rank_array([abs(s) for s in score_list])
        rank_sum = [x + y + z for x, y, z in zip(rank_count, rank_coverage, rank_score)]
    else:
        rank_sum = [x + y for x, y in zip(rank_count, rank_coverage)]

    for i, region in enumerate(self.dna_region):
        dbs_counts = str(len(self.region_dbs[region.toString()]))
        dbs_cover = value2str(self.region_coverage[region.toString()])
        newline = [
            str(i + 1),
            ucsc_anchor(region, "text-align:left"),
            split_gene_name(gene_name=region.name, org=self.organism),
            '<a href="region_dbs.html#' + region.toString() + '" style="text-align:left">'
            + dbs_counts + '</a>',
            dbs_cover,
        ]
        # ``region.data`` is rewritten so that the BED file written below
        # carries the computed columns.
        if score:
            dbs_score = value2str(score_list[i])
            region.data = "\t".join([dbs_counts, dbs_cover, dbs_score, str(rank_sum[i])])
            newline.append(dbs_score)
            newline.append(str(rank_sum[i]))
        else:
            region.data = "\t".join([dbs_counts, dbs_cover, str(rank_sum[i])])
            newline.append(str(rank_sum[i]))
        data_table.append(newline)

    data_table = natsort.natsorted(data_table, key=lambda x: x[-1])
    html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                         align=align, cell_align="left", auto_width=True,
                         header_titles=header_titles, sortable=True)
    html.add_heading("Notes")
    html.add_list(["All target regions without any bindings are ignored."])
    html.add_fixed_rank_sortable()
    html.write(os.path.join(directory, "target_regions.html"))

    self.dna_region.sort_score()
    self.dna_region.write_bed(os.path.join(directory, obed + "_target_regions.bed"))

    #############################################################
    # starget_regions.html for significant target regions
    stargets = GenomicRegionSet("sig_targets")
    sig_dbs = {}
    sig_dbs_coverage = {}
    for r in self.dna_region:
        sig_bindings = self.region_dbs[r.toString()].overlap_rbss(
            rbss=self.data["region"]["sig_region"])
        dbs = sig_bindings.get_dbs()
        if len(dbs) > 0:
            stargets.add(r)
            m_dbs = dbs.merge(w_return=True)
            sig_dbs[r] = len(dbs)
            sig_dbs_coverage[r] = float(m_dbs.total_coverage()) / len(r)

    html = new_page()
    # Select promoters in sig DBD
    if len(self.data["region"]["sig_region"]) == 0:
        html.add_heading("There is no significant DBD.")
    else:
        html.add_heading("Target regions bound by significant DBD")
        data_table = []
        # Calculate the ranking
        rank_count = len(stargets) - rank_array([sig_dbs[p] for p in stargets])
        rank_coverage = len(stargets) - rank_array([sig_dbs_coverage[p] for p in stargets])
        if score:
            # NOTE(review): ``data`` of these regions was rewritten in the
            # target_regions loop above, where the first field became the DBS
            # count -- so this presumably no longer reads the BED score.
            # Kept as in the original; confirm the intended field.
            score_list = [float(p.data.split("\t")[0]) for p in stargets]
            rank_score = len(stargets) - rank_array([abs(s) for s in score_list])
            rank_sum = [x + y + z for x, y, z in zip(rank_count, rank_coverage, rank_score)]
        else:
            rank_sum = [x + y for x, y in zip(rank_count, rank_coverage)]

        for i, region in enumerate(stargets):
            dbssount = '<a href="region_dbs.html#' + region.toString() + \
                       '" style="text-align:left">' + str(sig_dbs[region]) + '</a>'
            region_link = region_link_internet(self.organism, region)
            newline = [
                str(i + 1),
                region_link,
                split_gene_name(gene_name=region.name, org=self.organism),
                dbssount,
                value2str(sig_dbs_coverage[region]),
            ]
            if score:
                newline.append(value2str(score_list[i]))
            newline.append(str(rank_sum[i]))
            data_table.append(newline)

        data_table = natsort.natsorted(data_table, key=lambda x: x[-1])
        # Reuses the header of the target-regions page.
        html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                             align=align, cell_align="left",
                             header_titles=header_titles, border_list=None,
                             sortable=True)
        html.add_heading("Notes")
        html.add_list([
            "DBS stands for DNA Binding Site on DNA.",
            "DBS coverage is the proportion of the region where has potential to form triple helices with the given RNA.",
        ])
    html.add_fixed_rank_sortable()
    html.write(os.path.join(directory, "starget_regions.html"))

    #############################################################
    # region_dbs.html: binding sites of every target region that has any
    header_list = ["RBS", "DBS", "Strand", "Score", "Motif", "Orientation"]
    html = new_page()
    for region in self.dna_region:
        bindings = self.region_dbs[region.toString()]
        if len(bindings) == 0:
            continue
        html.add_heading(
            "Associated gene: " + split_gene_name(gene_name=region.name, org=self.organism),
            idtag=region.toString())
        html.add_free_content([ucsc_anchor(region, "margin-left:50")])
        data_table = []
        for rd in bindings:
            rbs = rd.rna.str_rna(pa=False)
            # Highlight binding sites overlapping a significant domain.
            for rbsm in self.data["region"]["sig_region"]:
                if rd.rna.overlap(rbsm):
                    rbs = "<font color=\"red\">" + rbs + "</font>"
            data_table.append([
                rbs,
                ucsc_anchor(rd.dna, "text-align:left"),
                rd.dna.orientation,
                rd.score,
                rd.motif,
                rd.orient,
            ])
        html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                             align=align, cell_align="left", auto_width=True)
    html.write(os.path.join(directory, "region_dbs.html"))

    #############################################################
    # parameters.html
    html = new_page()
    html.add_heading("Parameters")
    header_list = ["Description", "Arguments", "Value"]
    data_table = [
        ["RNA sequence name", "-rn", parameters.rn],
        ["Input RNA sequence file", "-r", os.path.basename(parameters.r)],
        ["Input BED file", "-bed", os.path.basename(parameters.bed)],
        ["Output directory", "-o", os.path.basename(parameters.o)],
        ["Organism", "-organism", parameters.organism],
        ["Number of repitetion of andomization", "-n", str(parameters.n)],
        ["Alpha level for rejection p value", "-a", str(parameters.a)],
        ["Cut off value for filtering out the low counts of DBSs", "-ccf", str(parameters.ccf)],
        ["Remove temporary files", "-rt", str(parameters.rt)],
        ["Input BED file for masking in randomization", "-f", str(parameters.f)],
        ["Input file for RNA accecibility", "-ac", str(parameters.ac)],
        ["Cut off value for RNA accecibility", "-accf", str(parameters.accf)],
        ["Output the BED files for DNA binding sites.", "-obed", str(parameters.obed)],
        ["Show parallel and antiparallel bindings in the plot separately.", "-showpa",
         str(parameters.showpa)],
        ["Minimum length", "-l", str(self.triplexator_p[0])],
        ["Maximum error rate", "-e", str(self.triplexator_p[1])],
        ["Tolerated number of consecutive errors", "-c", str(self.triplexator_p[2])],
        ["Filtering repeats", "-fr", str(self.triplexator_p[3])],
        ["Filtering mode", "-fm", str(self.triplexator_p[4])],
        ["Output format", "-of", str(self.triplexator_p[5])],
        ["Merge features", "-mf", str(self.triplexator_p[6])],
    ]
    html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                         align=align, cell_align="left", auto_width=True)
    html.add_free_content(
        ['<a href="summary.txt" style="margin-left:100">See details</a>'])
    html.write(os.path.join(directory, "parameters.html"))
def make_html(self):
    """Assemble the THOR report and write it to ``FOLDER_REPORT/index.html``.

    The pre-/post-processing, references and contact sections are always
    written. Every other section is optional: if building it raises, the
    remainder of that section is skipped, a message is written to stderr and
    the report continues with the next section.
    """
    import sys

    from rgt.THOR.dpc_help import FOLDER_REPORT

    data_note = "The data the plot is based on can be found at report/pics/data for further downstream analysis."

    def best_effort(label, build):
        # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt and
        # SystemExit still propagate; the failure is reported, not hidden.
        try:
            build()
        except Exception as err:
            sys.stderr.write("THOR report: section '%s' skipped (%s: %s)\n"
                             % (label, type(err).__name__, err))

    # Links
    links_dict = OrderedDict()
    links_dict['Experimental Configuration'] = 'index.html#extinfo'
    links_dict['Sample Information'] = 'index.html#sampleinfo'
    links_dict['HMM Information'] = 'index.html#hmminfo'
    links_dict['Mean Variance Function Estimate'] = 'index.html#mvfunction'
    if path.isfile(path.join(FOLDER_REPORT, 'pics/fragment_size_estimate.png')):
        links_dict['Fragment Size Estimate'] = 'index.html#fsestimate'
    # NOTE(review): this link depends on sample.data, while the heading with
    # idtag 'norm' below is only written when gene.data exists -- confirm.
    if path.isfile(path.join(FOLDER_REPORT, 'pics/data/sample.data')):
        links_dict['Housekeeping Gene Normalization'] = 'index.html#norm'
    links_dict['References'] = 'index.html#ref'
    links_dict['Contact'] = 'index.html#contact'

    # copy basic rgt logo, style etc to local directory inside report
    fig_path = path.join(FOLDER_REPORT, "fig")
    html = Html(name="THOR", links_dict=links_dict, fig_dir=fig_path, fig_rpath="fig")

    def experimental_configuration():
        html.add_heading("Experimental Configuration", idtag='extinfo')
        self.make_ext_config(html)

    def sample_information():
        html.add_heading("Sample Information", idtag='sampleinfo')
        self.make_ext_scaling_table(html)

    def hmm_information():
        html.add_heading("HMM Information", idtag='hmminfo')
        self.make_hmm(html)

    def mean_variance_function():
        p = path.join(FOLDER_REPORT, 'pics/mean_variance_func_cond_0_original.png')
        if path.isfile(p):
            html.add_heading("Mean Variance Function", idtag='mvfunction')
            # Figures are referenced relative to the report folder.
            html.add_figure(path.relpath(p, FOLDER_REPORT), align="left", width="45%",
                            more_images=['pics/mean_variance_func_cond_1_original.png'])
            info = ("THOR uses a polynomial function to empirically describe the relationship "
                    "between mean and variance in the data. " + data_note)
            self._write_text(html, info)

    def fragment_size_estimate():
        p = path.join(FOLDER_REPORT, 'pics/fragment_size_estimate.png')
        if path.isfile(p):
            html.add_heading("Fragment Size Estimate", idtag='fsestimate')
            html.add_figure(path.relpath(p, FOLDER_REPORT), align="left", width="45%")
            info = ("THOR estimates the fragmentation sizes of each sample's reads. Here, the "
                    "cross-correlation function [1] is shown. Their maxima give the "
                    "fragmentation extension sizes.<br> " + data_note)
            self._write_text(html, info)

    def housekeeping_normalization():
        p = path.join(FOLDER_REPORT, 'pics/data/gene.data')
        if path.isfile(p):
            d = self._read_hk(p)
            html.add_heading("Housekeeping Gene Normalization", idtag='norm')
            html.add_zebra_table(header_list=['gene', 'quality q'], col_size_list=[1, 150],
                                 type_list='s' * len(d), data_table=d)
            info = ("For active histone marks, housekeeping genes given by [4] can be used for "
                    "normalization [1]. Here, the genes for the experiments are evaluated. "
                    "For each gene i, we estimate the normalization factors with gene i and "
                    "without gene i and compute the sums of squared deviations q. "
                    "High values (higher than 2) indicate striking genes which should be "
                    "considered to be left our for normalization.,<br> One can also "
                    "use other genes or regions for normalization.<br> " + data_note)
            self._write_text(html, info)

        p = path.join(FOLDER_REPORT, 'pics/data/sample.data')
        if path.isfile(p):
            d = self._read_hk(p)
            html.add_zebra_table(header_list=['sample', 'quality p'], col_size_list=[1, 150],
                                 type_list='s' * len(d), data_table=d)
            info = ("We evaluate the effect of samples to the normalization factors. For sample "
                    "j, we estimate the normalization factors with sample j and without sample "
                    "j and compute the sums of squared deviations p. High values (higher than "
                    "2) indicate striking samples which should be considered to be left out "
                    "for the analysis.<br> " + data_note)
            self._write_text(html, info)

    best_effort("Experimental Configuration", experimental_configuration)

    html.add_heading("Pre- and post-processing Features", idtag='prepostinfo')
    self.make_pre_post(html)

    best_effort("Sample Information", sample_information)
    best_effort("HMM Information", hmm_information)
    best_effort("Mean Variance Function", mean_variance_function)
    best_effort("Fragment Size Estimate", fragment_size_estimate)
    best_effort("Housekeeping Gene Normalization", housekeeping_normalization)

    html.add_heading("References", idtag='ref')
    info = ("[1] M. Allhoff, J. F. Pires, K. Seré, M. Zenke, and I. G. Costa. Differential Peak "
            "Calling of ChIP-Seq Signals with Replicates with THOR. <i>submitted.</i> <br> "
            "[2] A. Mammana, M. Vingron, and H.-R. Chung. Inferring nucleosome positions with "
            "their histone mark annotation from chip data. "
            "Bioinformatics, 29(20):2547-2554, 2013. <br> "
            "[3] M. D. Robinson and A. Oshlack. A scaling normalization method for differential "
            "expression analysis of RNA-seq data. Genome Biology, 11(3):R25, 2010. <br> "
            "[4] E. Eisenberg and E. Y. Levanon. Human housekeeping genes, revisited. Trends in "
            "genetics: TIG, 29(10):569-574, 2013.")
    self._write_text(html, info)

    html.add_heading("Contact", idtag='contact')
    info = "If you have any questions, please don't hesitate to contact us: [email protected]"
    self._write_text(html, info)

    html.write(path.join(FOLDER_REPORT, "index.html"))
def make_html(self):
    """Build the THOR HTML report and write it to FOLDER_REPORT/index.html.

    The page is assembled section by section. Apart from the
    "Pre- and post-processing Features" section, every section is
    best-effort: if building it raises, a warning is written to stderr and
    the remaining sections are still generated, so a partial report is
    always written.
    """
    import sys

    # Imported here, as in the original code, rather than at module level.
    from rgt.THOR.dpc_help import FOLDER_REPORT

    def warn(section, err):
        # Keep the report best-effort, but do not hide the reason a section
        # is missing from the final page.
        sys.stderr.write("Warning: report section '%s' could not be built: %s\n" % (section, err))

    mv_png_0 = path.join(FOLDER_REPORT, "pics/mean_variance_func_cond_0_original.png")
    mv_png_1 = path.join(FOLDER_REPORT, 'pics/mean_variance_func_cond_1_original.png')
    fragment_png = path.join(FOLDER_REPORT, 'pics/fragment_size_estimate.png')
    gene_data = path.join(FOLDER_REPORT, 'pics/data/gene.data')
    sample_data = path.join(FOLDER_REPORT, 'pics/data/sample.data')

    # Navigation links
    links_dict = OrderedDict()
    links_dict['Experimental Configuration'] = 'index.html#extinfo'
    links_dict['Sample Information'] = 'index.html#sampleinfo'
    links_dict['HMM Information'] = 'index.html#hmminfo'
    links_dict['Mean Variance Function Estimate'] = 'index.html#mvfunction'
    if path.isfile(fragment_png):
        links_dict['Fragment Size Estimate'] = 'index.html#fsestimate'
    # The '#norm' anchor is only created below when gene.data exists, so the
    # link must depend on the same file (it used to test sample.data).
    if path.isfile(gene_data):
        links_dict['Housekeeping Gene Normalization'] = 'index.html#norm'
    links_dict['References'] = 'index.html#ref'
    links_dict['Contact'] = 'index.html#contact'

    config_class = ConfigurationFile()
    html = Html(name="THOR", links_dict=links_dict, fig_rpath=config_class.data_dir + '/fig/')

    try:
        html.add_heading("Experimental Configuration", idtag='extinfo')
        self.make_ext_config(html)
    except Exception as err:
        warn("Experimental Configuration", err)

    # This section was never guarded in the original code; keep it that way.
    html.add_heading("Pre- and post-processing Features", idtag='prepostinfo')
    self.make_pre_post(html)

    try:
        html.add_heading("Sample Information", idtag='sampleinfo')
        self.make_ext_scaling_table(html)
    except Exception as err:
        warn("Sample Information", err)

    # Run info
    try:
        html.add_heading("HMM Information", idtag='hmminfo')
        self.make_hmm(html)
    except Exception as err:
        warn("HMM Information", err)

    # Mean variance function
    try:
        if path.isfile(mv_png_0):
            html.add_heading("Mean Variance Function", idtag='mvfunction')
            html.add_figure(mv_png_0, align="left", width="45%", more_images=[mv_png_1])
            info = ("THOR uses a polynomial function to empirically describe the relationship "
                    "between mean and variance in the data. "
                    "The data the plot is based on can be found at report/pics/data for further "
                    "downstream analysis.")
            self._write_text(html, info)
    except Exception as err:
        warn("Mean Variance Function", err)

    # Fragment size estimate
    try:
        if path.isfile(fragment_png):
            html.add_heading("Fragment Size Estimate", idtag='fsestimate')
            html.add_figure(fragment_png, align="left", width="45%")
            info = ("THOR estimates the fragmentation sizes of each sample's reads. Here, the "
                    "cross-correlation function [1] is shown. Their maxima give the "
                    "fragmentation extension sizes.<br> The data the plot is based on can be "
                    "found at report/pics/data for further downstream analysis.")
            self._write_text(html, info)
    except Exception as err:
        warn("Fragment Size Estimate", err)

    # Housekeeping gene normalization
    try:
        if path.isfile(gene_data):
            d = self._read_hk(gene_data)
            html.add_heading("Housekeeping Gene Normalization", idtag='norm')
            # NOTE(review): type_list is sized by the number of rows, not by the
            # two columns of the table -- kept as in the original, confirm
            # against Html.add_zebra_table.
            html.add_zebra_table(header_list=['gene', 'quality q'], col_size_list=[1, 150],
                                 type_list='s' * len(d), data_table=d)
            info = ("For active histone marks, housekeeping genes given by [4] can be used for "
                    "normalization [1]. Here, the genes for the experiments are "
                    "evaluated. For each gene i, we estimate the normalization factors with gene i "
                    "and without gene i and compute the sums of squared deviations q. "
                    "High values (higher than 2) indicate striking genes which should be considered "
                    "to be left our for normalization.,<br> One can also "
                    "use other genes or regions for normalization.<br> The data the plot is based "
                    "on can be found at report/pics/data for further downstream analysis.")
            self._write_text(html, info)

        if path.isfile(sample_data):
            d = self._read_hk(sample_data)
            html.add_zebra_table(header_list=['sample', 'quality p'], col_size_list=[1, 150],
                                 type_list='s' * len(d), data_table=d)
            info = ("We evaluate the effect of samples to the normalization factors. For sample j, "
                    "we estimate the normalization factors with sample j "
                    "and without sample j and compute the sums of squared deviations p. "
                    "High values (higher than 2) indicate striking samples which should be "
                    "considered to be left out for the analysis.<br> The data the plot is based on "
                    "can be found at report/pics/data for further downstream analysis.")
            self._write_text(html, info)
    except Exception as err:
        warn("Housekeeping Gene Normalization", err)

    html.add_heading("References", idtag='ref')
    info = ("[1] M. Allhoff, J. F. Pires, K. Seré, M. Zenke, and I. G. Costa. Differential Peak "
            "Calling of ChIP-Seq "
            "Signals with Replicates with THOR. <i>submitted.</i> <br>"
            "[2] A. Mammana, M. Vingron, and H.-R. Chung. Inferring nucleosome positions with their "
            "histone mark annotation from chip data. "
            "Bioinformatics, 29(20):2547-2554, 2013. <br>"
            "[3] M. D. Robinson and A. Oshlack. A scaling normalization method for differential "
            "expression analysis of RNA-seq data. "
            "Genome Biology, 11(3):R25, 2010. <br>"
            "[4] E. Eisenberg and E. Y. Levanon. Human housekeeping genes, revisited. Trends in "
            "genetics: TIG, 29(10):569-574, 2013.")
    self._write_text(html, info)

    html.add_heading("Contact", idtag='contact')
    info = "If you have any questions, please don't hesitate to contact us: [email protected]"
    self._write_text(html, info)

    html.write(path.join(FOLDER_REPORT, "index.html"))
def list_all_index(path, link_d=None):
    """Create an 'index.html' in `path` listing the experiments of profile.txt.

    `path` is the directory that holds a tab-separated "profile.txt"; the
    generated page is written to `path`/index.html. `link_d` is the
    navigation-link dictionary handed to Html; when it is empty or None a
    single "List" link is used.

    Every profile row (except the header row starting with "Experiment" and
    rows with five or fewer fields) becomes one table row. Rows are ranked by
    the sum of three descending ranks (normalised DBD, normalised DBS and
    expression) and the table is naturally sorted by that sum.
    """
    dirname = os.path.basename(path)
    if not link_d:
        link_d = {"List": "index.html"}

    html = Html(name="Directory: " + dirname, links_dict=link_d,
                fig_rpath="./style", fig_dir=os.path.join(path, "style"),
                RGT_header=False, other_logo="TDF", homepage="../index.html")
    html.add_heading("All experiments in: " + dirname + "/")

    type_list = 'sssssssssssssssssss'
    col_size_list = [20] * 20
    header_list = ["No.", "Experiments", "RNA", "Closest genes",
                   "Exon", "Length", "Expression*",
                   "Norm DBS*", "Norm DBD*", "No sig. DBD",
                   "Organism", "Target region", "Rank*"]

    # `with` guarantees the handle is released even if a line fails to parse.
    profile = {}
    with open(os.path.join(path, "profile.txt"), 'r') as profile_f:
        for raw in profile_f:
            fields = raw.strip().split("\t")
            if fields[0] == "Experiment":
                continue
            if len(fields) > 5:
                profile[fields[0]] = fields[1:]

    data_table = []
    for number, exp in enumerate(profile, 1):
        rec = profile[exp]
        if rec[10] == "-":
            # No top DBD recorded: show the plain name without a link.
            name_cell = exp
        else:
            name_cell = '<a href="' + os.path.join(exp, "index.html") + '">' + exp + "</a>"
        row = [str(number), name_cell, rec[0],
               rec[12],   # 3 close genes
               rec[1],    # 4 exon
               rec[2],    # 5 length
               rec[13]]   # 6 exp
        # rec[11] is compared against 0.05 (presumably the p-value -- confirm
        # against the profile writer); above it the normalised values are zeroed.
        if float(rec[11]) < 0.05:
            row += [rec[6],    # 7 norm DBS
                    rec[8],    # 8 norm DBD
                    rec[9]]    # 9 sig DBD
        else:
            row += [str(0),    # 7 norm DBS
                    str(0),    # 8 norm DBD
                    rec[9]]    # 9 sig DBD
        row += [rec[4], rec[5]]
        data_table.append(row)

    n_rows = len(data_table)
    rank_dbd = n_rows - rank_array([float(x[8]) for x in data_table])
    rank_dbs = n_rows - rank_array([float(x[7]) for x in data_table])
    rank_exp = n_rows - rank_array([0 if x[6] == "n.a." else float(x[6]) for x in data_table])
    rank_sum = [x + y + z for x, y, z in zip(rank_dbd, rank_dbs, rank_exp)]

    ranked = [row + [str(rank_sum[i])] for i, row in enumerate(data_table)]
    ranked = natsort_ob.natsorted(ranked, key=lambda x: x[-1])

    html.add_zebra_table(header_list, col_size_list, type_list, ranked,
                         align=10, cell_align="left", sortable=True)
    html.add_fixed_rank_sortable()
    html.write(os.path.join(path, "index.html"))
def list_all_index(path, show_RNA_ass_gene=False):
    """Create an 'index.html' in `path` listing the experiment sub-directories.

    "profile.txt" in `path` is read as a tab-separated table keyed by its
    first column. Every directory found by walking `path` whose name is a
    key of that table becomes one table row. With `show_RNA_ass_gene` a
    "Closest genes" column is added.

    Rows whose p-value field is missing or not numeric are reported on
    stdout and left out of the table.
    """
    dirname = os.path.basename(path)
    link_d = {"List": "index.html"}
    html = Html(name="Directory: " + dirname, links_dict=link_d,
                fig_dir=os.path.join(path, "style"), fig_rpath="./style",
                RGT_header=False, other_logo="TDF")
    html.add_heading("All experiments in: " + dirname + "/")

    type_list = 'sssssssssssss'
    col_size_list = [20] * 11

    header_list = ["No.", "Experiments", "RNA"]
    if show_RNA_ass_gene:
        header_list.append("Closest genes")
    header_list += ["Organism", "Target region", "No significant DBD", "Top DBD", "p-value"]

    # The original never closed this handle; `with` releases it reliably.
    profile = {}
    with open(os.path.join(path, "profile.txt"), 'r') as profile_f:
        for raw in profile_f:
            fields = raw.strip().split("\t")
            profile[fields[0]] = fields[1:]

    data_table = []
    c = 0
    for root, dirnames, filenames in os.walk(path):
        for subdir in dirnames:
            if subdir not in profile:
                continue
            c += 1
            rec = profile[subdir]
            if rec[5] == "-":
                # No top DBD: plain name, no link to a result page.
                row = [str(c), subdir, rec[0]]
            else:
                row = [str(c),
                       '<a href="' + os.path.join(subdir, "index.html") + '">' + subdir + "</a>",
                       rec[0]]
            if show_RNA_ass_gene:
                row.append(split_gene_name(gene_name=rec[7], org=rec[2]))
            try:
                pval = rec[6]
                # "-" marks a missing p-value; significant ones are shown in red.
                if pval != "-" and float(pval) < 0.05:
                    pval = "<font color=\"red\">" + pval + "</font>"
                row += [rec[2], rec[3], rec[4], rec[5], pval]
            except (IndexError, ValueError):
                # Short row or non-numeric p-value: skip it but say so.
                print("Error in loading profile: " + subdir)
                continue
            data_table.append(row)

    html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                         align=50, cell_align="left", sortable=True)
    html.add_fixed_rank_sortable()
    html.write(os.path.join(path, "index.html"))
def list_all_index(path, link_d=None, show_RNA_ass_gene=False):
    """Create an 'index.html' in `path` listing the experiments of profile.txt.

    "profile.txt" in `path` is read as a tab-separated table keyed by its
    first column, and every key becomes one table row. `link_d` is the
    navigation-link dictionary handed to Html; when it is empty or None a
    single "List" link is used. With `show_RNA_ass_gene` a "Closest genes"
    column is added.

    Rows that cannot be converted are left out of the table and reported on
    stdout, except the row keyed "Experiment" (the header row), which is
    skipped silently.
    """
    dirname = os.path.basename(path)
    if not link_d:
        link_d = {"List": "index.html"}

    html = Html(name="Directory: " + dirname, links_dict=link_d,
                fig_rpath="./style", fig_dir=os.path.join(path, "style"),
                RGT_header=False, other_logo="TDF", homepage="../index.html")
    html.add_heading("All experiments in: " + dirname + "/")

    type_list = 'sssssssssssss'
    col_size_list = [20] * 11

    header_list = ["No.", "Experiments", "RNA"]
    if show_RNA_ass_gene:
        header_list.append("Closest genes")
    header_list += ["No sig. DBD", "Top DBD", "p-value", "Organism", "Target region"]

    # The original never closed this handle; `with` releases it reliably.
    profile = {}
    with open(os.path.join(path, "profile.txt"), 'r') as profile_f:
        for raw in profile_f:
            fields = raw.strip().split("\t")
            profile[fields[0]] = fields[1:]

    data_table = []
    # The running number also counts rows that are skipped below, exactly as
    # the original counter did.
    for number, exp in enumerate(profile, 1):
        rec = profile[exp]
        try:
            if rec[5] == "-":
                # No top DBD: plain name, no link to a result page.
                row = [str(number), exp, rec[0]]
            else:
                row = [str(number),
                       '<a href="' + os.path.join(exp, "index.html") + '">' + exp + "</a>",
                       rec[0]]
            if show_RNA_ass_gene:
                row.append(split_gene_name(gene_name=rec[7], org=rec[2]))

            pval = rec[6]
            # "-" marks a missing p-value; significant ones are shown in red.
            if pval != "-" and float(pval) < 0.05:
                pval = "<font color=\"red\">" + pval + "</font>"
            row += [rec[4], rec[5], pval, rec[2], rec[3]]
        except Exception:
            # Best-effort listing: one malformed row must not abort the page.
            # `Exception` (not a bare except) lets Ctrl-C and SystemExit through.
            if exp != "Experiment":
                print("Error in loading profile: " + exp)
            continue
        data_table.append(row)

    html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                         align=10, cell_align="left", sortable=True)
    html.add_fixed_rank_sortable()
    html.write(os.path.join(path, "index.html"))
def gen_html(self, directory, title, align=50):
    """Write the boxplot report pages below `directory`/`title`.

    Two pages are produced:

    * index.html -- the boxplot figure followed by, for every group in
      self.sortDict, a table of pairwise Mann-Whitney U p-values between all
      (first-level key, second-level key) combinations of that group.
      p-values that are not above 0.05 are printed in red.
    * parameters.html -- the assumptions of the test and links to the
      parameter and experimental-matrix text files.

    `title` is used as page title and as the output sub-directory name;
    `align` is forwarded to the table layout (the p-value tables use
    `align + 50`).
    """
    link_d = OrderedDict()
    link_d["Boxplot"] = "index.html"
    link_d["Parameters"] = "parameters.html"

    html = Html(name=title, links_dict=link_d, fig_rpath="../style",
                RGT_header=False, other_logo="viz", homepage="../index.html")
    html.add_figure("boxplot.png", align="center")

    type_list = 'ssssssssssssssssssssssssssssssssssssssssssssss'

    def pairs_of(group):
        # All (s, c) key pairs of one group, in nested-iteration order.
        return [(s, c) for s in self.sortDict[group] for c in self.sortDict[group][s]]

    #### Calculate p value ####
    plist = {}
    for g in self.sortDict:
        plist[g] = {}
        pairs = pairs_of(g)
        for s1, c1 in pairs:
            data1 = self.sortDict[g][s1][c1]
            plist[g][s1 + c1] = {}
            for s2, c2 in pairs:
                if (s2, c2) == (s1, c1):
                    continue
                data2 = self.sortDict[g][s2][c2]
                _, p_value = mannwhitneyu(data1, data2)
                plist[g][s1 + c1][s2 + c2] = p_value

    print("Multiple test correction.")
    # The p-values shown below are read from plist only after this call.
    multiple_correction(plist)

    for g in self.sortDict:
        html.add_heading(g, size=4, bold=False)
        pairs = pairs_of(g)

        # One header cell per (s, c) pair. The original looked the inner keys
        # up with a stale `s1` left over from the loops above, which produced
        # the wrong (or a mismatching number of) column headers whenever the
        # inner keys differ between first-level keys.
        header_list = ["p-value"] + [s + "\n" + c for s, c in pairs]
        col_size_list = [15] * len(header_list)

        data_table = []
        for s1, c1 in pairs:
            row = [s1 + "\n" + c1]
            for s2, c2 in pairs:
                if (s2, c2) == (s1, c1):
                    row.append("-")
                    continue
                p = plist[g][s1 + c1][s2 + c2]
                if p > 0.05:
                    row.append(value2str(p))
                else:
                    row.append("<font color=\"red\">" + value2str(p) + "</font>")
            data_table.append(row)

        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align + 50)

    html.write(os.path.join(directory, title, "index.html"))

    ## Parameters
    html = Html(name=title, links_dict=link_d, fig_rpath="../style",
                RGT_header=False, other_logo="viz", homepage="../index.html")
    header_list = ["Assumptions and hypothesis"]
    col_size_list = [50]
    data_table = [['All the regions among different BED files are normalized by quantile normalization.'],
                  ['If there is any grouping problem, please check all the optional columns in input experimental matrix.']]
    html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left")
    html.add_free_content(['<a href="parameters.txt" style="margin-left:100">See parameters</a>'])
    html.add_free_content(['<a href="experimental_matrix.txt" style="margin-left:100">See experimental matrix</a>'])
    html.write(os.path.join(directory, title, "parameters.html"))
def gen_html(self, directory, parameters, obed, align=50, alpha=0.05, score=False):
    """Generate the HTML report of the genomic region test.

    Pages written into `directory`:

    * index.html           -- per-DBD statistics (plus DBS statistics when
                              self.showdbs is set)
    * dbd_region.html      -- target regions listed per DNA binding domain
    * target_regions.html  -- all target regions ranked by DBS count,
                              DBS coverage and (optionally) score
    * starget_regions.html -- target regions bound by significant DBDs
    * region_dbs.html      -- binding sites listed per target region
    * parameters.html      -- the command line parameters

    In addition self.dna_region is re-sorted with sort_score() and written
    to `directory`/<obed>_target_regions.bed; the `data` field of every
    region is overwritten with the values shown in target_regions.html.

    directory  -- output directory; its base name is used in the page title
    parameters -- parsed command line arguments shown in parameters.html
    obed       -- file-name prefix of the exported BED file
    align      -- alignment value forwarded to the table layout
    alpha      -- p-values below this level are printed in red
    score      -- read a score from the first field of each region's `data`
                  and include it in the ranking; if the scores cannot be
                  read, a message is printed and the report is generated
                  without scores
    """
    dir_name = os.path.basename(directory)
    html_header = "Genomic Region Test: " + dir_name
    link_ds = OrderedDict()
    link_ds["RNA"] = "index.html"
    link_ds["Sig Target Regions"] = "starget_regions.html"
    link_ds["Target Regions"] = "target_regions.html"
    link_ds["Parameters"] = "parameters.html"

    type_list = 'ssssssssssssssss'
    col_size_list = [50] * 15

    def new_page():
        # Every page of the report shares the same header and navigation.
        return Html(name=html_header, links_dict=link_ds,
                    fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html")

    def ucsc_link(reg, style):
        # Anchor that opens the region in the UCSC genome browser.
        return ('<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' + self.organism +
                "&position=" + reg.chrom + "%3A" + str(reg.initial) + "-" + str(reg.final) +
                '" style="' + style + '">' + reg.toString(space=True) + '</a>')

    def red(text):
        return "<font color=\"red\">" + text + "</font>"

    def p_cell(p):
        # Significant p-values are highlighted.
        return red(value2str(p)) if p < alpha else value2str(p)

    ##################################################
    # index.html
    html = new_page()
    html.add_figure("lineplot_region.png", align="left", width="45%", more_images=["boxplot_regions.png"])
    if self.showdbs:
        html.add_figure("lineplot_dbs.png", align="left", width="45%", more_images=["boxplot_dbs.png"])

    thin = " style=\"border-right:1pt solid gray\""
    thick = " style=\"border-right:2pt solid gray\""

    # Columns describing the target regions. Each data row always carries a
    # z-score cell, so the header needs one in both modes.
    header_list = [["#", "DBD", "Target Regions", None, "Non-target Regions", None, "Statistics", None],
                   ["", "", "with DBS", "without DBS", "with DBS (average)", "s.d.", "<i>p</i>-value",
                    "z-score"]]
    header_titles = [["Rank", "DNA Binding Domain", "Given target regions on DNA", None,
                      "Regions from randomization", None, "Statistics based on target regions", None],
                     ["", "", "Number of target regions with DBS binding",
                      "Number of target regions without DBS binding",
                      "Average number of regions from randomization with DBS binding",
                      "Standard deviation", "P value", "Z-score"]]
    border_list = [thin, thin, "", thin, "", thin, thin, ""]

    if self.showdbs:
        # The original DBS header had 11 columns for 12-cell rows (the z-score
        # column was missing), which shifted every DBS statistic one column to
        # the right. Append the DBS columns after the z-score instead.
        header_list[0] += ["Target Regions", "Non-target Regions", None, "Statistics"]
        header_list[1] += ["NO. DBSs", "NO. DBSs (average)", "s.d.", "<i>p</i>-value"]
        header_titles[0] += ["Given target regions on DNA", "Regions from randomization", None,
                             "Statistics based on DNA Binding Sites"]
        header_titles[1] += ["Number of related DNA Binding Sites binding to target regions",
                             "Average number of DNA Binding Sites binding to random regions",
                             "Standard deviation", "P-value"]
        # Thick separator between the region statistics and the DBS statistics.
        border_list[-1] = thick
        border_list += [thin, "", thin, thin]

    data_table = []
    for i, rbs in enumerate(self.rbss):
        region_ave = self.data["region"]["ave"][i]
        region_sd = self.data["region"]["sd"][i]
        if region_sd:
            z_cell = value2str((self.counts_tr[rbs][0] - region_ave) / region_sd)
        else:
            # Identical counts in every randomization: the z-score is undefined.
            z_cell = "n.a."

        new_line = [str(i + 1),
                    rbs.str_rna(pa=False),
                    '<a href="dbd_region.html#' + rbs.str_rna() +
                    '" style="text-align:left">' + str(self.counts_tr[rbs][0]) + '</a>',
                    str(self.counts_tr[rbs][1]),
                    value2str(region_ave),
                    value2str(region_sd),
                    p_cell(self.data["region"]["p"][i]),
                    z_cell]
        if self.showdbs:
            new_line += [str(self.counts_dbs[rbs]),
                         value2str(self.data["dbs"]["ave"][i]),
                         value2str(self.data["dbs"]["sd"][i]),
                         p_cell(self.data["dbs"]["p"][i])]
        data_table.append(new_line)

    # Sort on the region p-value column.
    data_table = natsort.natsorted(data_table, key=lambda x: x[6])
    html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left",
                         auto_width=True, header_titles=header_titles, border_list=border_list, sortable=True)
    html.add_heading("Notes")
    html.add_list(["RNA name: " + self.rna_name,
                   "Randomization is performed for " + str(self.repeats) + " times.",
                   "DBD stands for DNA Binding Domain on RNA.",
                   "DBS stands for DNA Binding Site on DNA."])
    html.add_fixed_rank_sortable()
    html.write(os.path.join(directory, "index.html"))

    #############################################################
    # dbd_region.html: target regions for each merged DNA Binding Domain
    header_list = ["#", "Target Region", "Associated Gene", "No. of DBSs", "DBS coverage"]
    header_titles = ["Rank", "Given target regions from BED files",
                     "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)",
                     "Number of DNA Binding Sites locate within the region",
                     "The proportion of the region covered by DBS binding"]
    html = new_page()
    for rbsm in self.rbss:
        html.add_heading("DNA Binding Domain: " + rbsm.str_rna(), idtag=rbsm.str_rna())
        data_table = []
        for i, region in enumerate(self.txp.merged_dict[rbsm]):
            data_table.append([str(i + 1),
                               ucsc_link(region, "text-align:left"),
                               split_gene_name(gene_name=region.name, org=self.organism),
                               str(len(self.region_dbs[region.toString()])),
                               value2str(self.region_coverage[region.toString()])])
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left",
                             auto_width=True, header_titles=header_titles, sortable=True)
    html.add_fixed_rank_sortable()
    html.write(os.path.join(directory, "dbd_region.html"))

    #############################################################
    # target_regions.html: target regions centered
    if not self.dna_region.sorted:
        self.dna_region.sort()

    # Read the scores before anything depends on them. The original caught
    # ImportError here, which float()/split() never raise, and would have
    # continued with an undefined rank_sum. Fall back to a score-less report.
    score_list = []
    if score:
        try:
            score_list = [float(p.data.split("\t")[0]) for p in self.dna_region]
        except (AttributeError, ValueError):
            print("There is no score in BED file, please don't use '-score' argument.")
            score = False
            score_list = []
    # Remember the scores per region: region.data is overwritten below, so
    # the significant-target page cannot re-read them from there.
    score_by_region = dict((p.toString(), s) for p, s in zip(self.dna_region, score_list))

    header_list = ["#", "Target region", "Associated Gene", "DBSs Count", "DBS coverage"]
    header_titles = ["Rank", "Target regions loaded from the given BED file",
                     "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)",
                     "Number of DNA Binding Sites within the region",
                     "The proportion of the region covered by DBS binding"]
    if score:
        header_list.append("Score")
        header_titles.append("Scores from BED file")
    header_list.append("Sum of ranks")
    header_titles.append("Sum of all the left-hand-side ranks")

    html = new_page()
    html.add_heading("Target Regions")

    # Calculate the ranking
    n_regions = len(self.dna_region)
    rank_count = n_regions - rank_array([len(self.region_dbs[p.toString()]) for p in self.dna_region])
    rank_coverage = n_regions - rank_array([self.region_coverage[p.toString()] for p in self.dna_region])
    if score:
        rank_score = n_regions - rank_array([abs(s) for s in score_list])
        rank_sum = [x + y + z for x, y, z in zip(rank_count, rank_coverage, rank_score)]
    else:
        rank_sum = [x + y for x, y in zip(rank_count, rank_coverage)]

    data_table = []
    for i, region in enumerate(self.dna_region):
        dbs_counts = str(len(self.region_dbs[region.toString()]))
        dbs_cover = value2str(self.region_coverage[region.toString()])
        newline = [str(i + 1),
                   ucsc_link(region, "text-align:left"),
                   split_gene_name(gene_name=region.name, org=self.organism),
                   '<a href="region_dbs.html#' + region.toString() +
                   '" style="text-align:left">' + dbs_counts + '</a>',
                   dbs_cover]
        if score:
            dbs_score = value2str(score_list[i])
            region.data = "\t".join([dbs_counts, dbs_cover, dbs_score, str(rank_sum[i])])
            newline.append(dbs_score)
        else:
            region.data = "\t".join([dbs_counts, dbs_cover, str(rank_sum[i])])
        newline.append(str(rank_sum[i]))
        data_table.append(newline)

    data_table = natsort.natsorted(data_table, key=lambda x: x[-1])
    html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left",
                         auto_width=True, header_titles=header_titles, sortable=True)
    html.add_heading("Notes")
    html.add_list(["All target regions without any bindings are ignored."])
    html.add_fixed_rank_sortable()
    html.write(os.path.join(directory, "target_regions.html"))

    self.dna_region.sort_score()
    self.dna_region.write_bed(os.path.join(directory, obed + "_target_regions.bed"))

    #############################################################
    # starget_regions.html for significant target regions
    stargets = GenomicRegionSet("sig_targets")
    sig_dbs = {}
    sig_dbs_coverage = {}
    for r in self.dna_region:
        sig_bindings = self.region_dbs[r.toString()].overlap_rbss(rbss=self.data["region"]["sig_region"])
        dbs = sig_bindings.get_dbs()
        if len(dbs) > 0:
            stargets.add(r)
            m_dbs = dbs.merge(w_return=True)
            sig_dbs[r] = len(dbs)
            sig_dbs_coverage[r] = float(m_dbs.total_coverage()) / len(r)

    html = new_page()
    if len(self.data["region"]["sig_region"]) == 0:
        html.add_heading("There is no significant DBD.")
    else:
        html.add_heading("Target regions bound by significant DBD")
        # Calculate the ranking
        n_targets = len(stargets)
        rank_count = n_targets - rank_array([sig_dbs[p] for p in stargets])
        rank_coverage = n_targets - rank_array([sig_dbs_coverage[p] for p in stargets])
        if score:
            # Use the scores remembered above; region.data now starts with the
            # DBS count, which the original mistakenly read as the score.
            score_list = [score_by_region[p.toString()] for p in stargets]
            rank_score = n_targets - rank_array([abs(s) for s in score_list])
            rank_sum = [x + y + z for x, y, z in zip(rank_count, rank_coverage, rank_score)]
        else:
            rank_sum = [x + y for x, y in zip(rank_count, rank_coverage)]

        data_table = []
        for i, region in enumerate(stargets):
            dbs_count_link = '<a href="region_dbs.html#' + region.toString() + \
                             '" style="text-align:left">' + str(sig_dbs[region]) + '</a>'
            newline = [str(i + 1),
                       region_link_internet(self.organism, region),
                       split_gene_name(gene_name=region.name, org=self.organism),
                       dbs_count_link,
                       value2str(sig_dbs_coverage[region])]
            if score:
                newline.append(value2str(score_list[i]))
            newline.append(str(rank_sum[i]))
            data_table.append(newline)

        data_table = natsort.natsorted(data_table, key=lambda x: x[-1])
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left",
                             header_titles=header_titles, border_list=None, sortable=True)
        html.add_heading("Notes")
        html.add_list(["DBS stands for DNA Binding Site on DNA.",
                       "DBS coverage is the proportion of the region where has potential to form triple helices with the given RNA."])
        html.add_fixed_rank_sortable()
    # Always written: the navigation bar of every page links to this file.
    html.write(os.path.join(directory, "starget_regions.html"))

    #############################################################
    # region_dbs.html: binding sites for every target region
    header_list = ["RBS", "DBS", "Strand", "Score", "Motif", "Orientation"]
    html = new_page()
    for region in self.dna_region:
        bindings = self.region_dbs[region.toString()]
        if len(bindings) == 0:
            continue
        html.add_heading("Associated gene: " + split_gene_name(gene_name=region.name, org=self.organism),
                         idtag=region.toString())
        html.add_free_content([ucsc_link(region, "margin-left:50")])
        data_table = []
        for rd in bindings:
            rbs = rd.rna.str_rna(pa=False)
            # Highlight once, even when several significant DBDs overlap.
            if any(rd.rna.overlap(rbsm) for rbsm in self.data["region"]["sig_region"]):
                rbs = red(rbs)
            data_table.append([rbs,
                               ucsc_link(rd.dna, "text-align:left"),
                               rd.dna.orientation, rd.score, rd.motif, rd.orient])
        html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left",
                             auto_width=True)
    html.write(os.path.join(directory, "region_dbs.html"))

    #############################################################
    # parameters.html
    html = new_page()
    html.add_heading("Parameters")
    header_list = ["Description", "Arguments", "Value"]
    data_table = [["RNA sequence name", "-rn", parameters.rn],
                  ["Input RNA sequence file", "-r", os.path.basename(parameters.r)],
                  ["Input BED file", "-bed", os.path.basename(parameters.bed)],
                  ["Output directory", "-o", os.path.basename(parameters.o)],
                  ["Organism", "-organism", parameters.organism],
                  ["Number of repitetion of andomization", "-n", str(parameters.n)],
                  ["Alpha level for rejection p value", "-a", str(parameters.a)],
                  ["Cut off value for filtering out the low counts of DBSs", "-ccf", str(parameters.ccf)],
                  ["Remove temporary files", "-rt", str(parameters.rt)],
                  ["Input BED file for masking in randomization", "-f", str(parameters.f)],
                  ["Input file for RNA accecibility", "-ac", str(parameters.ac)],
                  ["Cut off value for RNA accecibility", "-accf", str(parameters.accf)],
                  ["Output the BED files for DNA binding sites.", "-obed", str(parameters.obed)],
                  ["Show parallel and antiparallel bindings in the plot separately.", "-showpa",
                   str(parameters.showpa)],
                  ["Minimum length", "-l", str(self.triplexator_p[0])],
                  ["Maximum error rate", "-e", str(self.triplexator_p[1])],
                  ["Tolerated number of consecutive errors", "-c", str(self.triplexator_p[2])],
                  ["Filtering repeats", "-fr", str(self.triplexator_p[3])],
                  ["Filtering mode", "-fm", str(self.triplexator_p[4])],
                  ["Output format", "-of", str(self.triplexator_p[5])],
                  ["Merge features", "-mf", str(self.triplexator_p[6])]]
    html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left",
                         auto_width=True)
    html.add_free_content(['<a href="summary.txt" style="margin-left:100">See details</a>'])
    html.write(os.path.join(directory, "parameters.html"))
def _add_triplexator_arguments(subparser, include_par=True):
    """Register the Triplexator options shared by the TDF sub-commands.

    The options (and their defaults/help texts) are identical for
    ``promotertest``, ``regiontest`` and ``get_dbss``; only ``-par`` is
    skipped for ``get_dbss`` (``include_par=False``).
    """
    subparser.add_argument('-l', type=int, default=20, metavar=' ',
                           help="[Triplexator] Define the minimum length of triplex (default: %(default)s)")
    subparser.add_argument('-e', type=int, default=20, metavar=' ',
                           help="[Triplexator] Set the maximal error-rate in %% tolerated (default: %(default)s)")
    subparser.add_argument('-c', type=int, default=2, metavar=' ',
                           help="[Triplexator] Sets the tolerated number of consecutive errors with respect to the canonical triplex rules as such were found to greatly destabilize triplexes in vitro (default: %(default)s)")
    subparser.add_argument('-fr', type=str, default="off", metavar=' ',
                           help="[Triplexator] Activates the filtering of low complexity regions and repeats in the sequence data (default: %(default)s)")
    subparser.add_argument('-fm', type=int, default=0, metavar=' ',
                           help="[Triplexator] Method to quickly discard non-hits (default: %(default)s).'0' = greedy approach; '1' = q-gram filtering.")
    subparser.add_argument('-of', type=int, default=1, metavar=' ',
                           help="[Triplexator] Define output formats of Triplexator (default: %(default)s)")
    subparser.add_argument('-mf', action="store_true", default=False,
                           help="[Triplexator] Merge overlapping features into a cluster and report the spanning region.")
    subparser.add_argument('-rm', type=int, default=0, metavar=' ',
                           help="[Triplexator] Set the multiprocessing")
    if include_par:
        subparser.add_argument('-par', type=str, default="", metavar=' ',
                               help="[Triplexator] Define other parameters for Triplexator")


def _build_parser():
    """Create the argument parser with all TDF sub-commands.

    Sub-commands: ``promotertest``, ``regiontest``, ``get_dbss``,
    ``integrate`` and ``updatehtml``. The chosen one is stored in ``mode``.
    """
    parser = argparse.ArgumentParser(
        description='Triplex Domain Finder is a statistical framework '
                    'for detection of triple helix potential of '
                    'lncRNAs from genome-wide functional data. '
                    'Author: Chao-Chung Kuo'
                    '\nVersion: ' + __version__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    subparsers = parser.add_subparsers(help='sub-command help', dest='mode')

    ################### Promoter test ##########################################
    h_promotor = "Promoter test evaluates the association between the given lncRNA to the target promoters."
    p_promoter = subparsers.add_parser('promotertest', help=h_promotor)
    p_promoter.add_argument('-r', type=str, metavar=' ',
                            help="Input file name for RNA sequence (in fasta format)")
    p_promoter.add_argument('-rl', type=str, default=None, metavar=' ',
                            help="Input list for paths to all RNA sequences (in fasta format)")
    p_promoter.add_argument('-rn', type=str, default=None, metavar=' ',
                            help="Define the RNA name")
    p_promoter.add_argument('-de', default=False, metavar=' ',
                            help="Input file for target gene list (gene symbols or Ensembl ID)")
    p_promoter.add_argument('-bed', default=False, metavar=' ',
                            help="Input BED file of the promoter regions of target genes")
    p_promoter.add_argument('-bg', default=False, metavar=' ',
                            help="Input BED file of the promoter regions of background genes")
    p_promoter.add_argument('-o', metavar=' ',
                            help="Output directory name for all the results")
    p_promoter.add_argument('-t', metavar=' ', default=False,
                            help="Define the title name for the results under the Output name. (default: %(default)s)")
    p_promoter.add_argument('-organism', metavar=' ',
                            help='Define the organism (hg19 or mm9)')
    p_promoter.add_argument('-gtf', metavar=' ', default=None,
                            help='Define the GTF file for annotation (optional)')
    p_promoter.add_argument('-pl', type=int, default=1000, metavar=' ',
                            help="Define the promotor length (default: %(default)s)")
    p_promoter.add_argument('-showdbs', action="store_true",
                            help="Show the plots and statistics of DBS (DNA Binding sites)")
    p_promoter.add_argument('-score', action="store_true",
                            help="Load score column from input gene list or BED file for analysis.")
    p_promoter.add_argument('-scoreh', action="store_true",
                            help="Use the header of scores from the given gene list or BED file.")
    p_promoter.add_argument('-a', type=float, default=0.05, metavar=' ',
                            help="Define significance level for rejection null hypothesis (default: %(default)s)")
    p_promoter.add_argument('-ccf', type=int, default=100, metavar=' ',
                            help="Define the cut off value for promoter counts (default: %(default)s)")
    p_promoter.add_argument('-rt', action="store_true", default=False,
                            help="Remove temporary files (fa, txp...etc)")
    p_promoter.add_argument('-log', action="store_true", default=False,
                            help="Set the plots in log scale")
    p_promoter.add_argument('-ac', type=str, default=False, metavar=' ',
                            help="Input file for RNA accecibility ")
    p_promoter.add_argument('-accf', type=float, default=500, metavar=' ',
                            help="Define the cut off value for RNA accecibility")
    p_promoter.add_argument('-obed', action="store_true", default=True,
                            help="Output the BED files for DNA binding sites.")
    p_promoter.add_argument('-showpa', action="store_true", default=False,
                            help="Show parallel and antiparallel bindings in the plot separately.")
    p_promoter.add_argument('-filter_havana', type=str, default="F", metavar=' ',
                            help="Apply filtering to remove HAVANA entries.")
    p_promoter.add_argument('-protein_coding', type=str, default="F", metavar=' ',
                            help="Apply filtering to get only protein coding genes.")
    p_promoter.add_argument('-known_only', type=str, default="F", metavar=' ',
                            help="Apply filtering to get only known genes.")
    p_promoter.add_argument('-dump', action="store_true", default=False,
                            help="Only dump the experimental file and leave the program.")
    p_promoter.add_argument('-rnaexp', type=str, default=None, metavar=' ',
                            help="Given a file with RNA name and the expression value")
    _add_triplexator_arguments(p_promoter)

    ################### Genomic Region Test ####################################
    h_region = "Genomic region test evaluates the association between the given lncRNA to the target regions by randomization."
    p_region = subparsers.add_parser('regiontest', help=h_region)
    p_region.add_argument('-r', type=str, metavar=' ',
                          help="Input file name for RNA sequence (in fasta format)")
    p_region.add_argument('-rl', type=str, default=None, metavar=' ',
                          help="Input list for paths to all RNA sequences (in fasta format)")
    p_region.add_argument('-rn', type=str, default=False, metavar=' ',
                          help="Define the RNA name")
    p_region.add_argument('-bed', metavar=' ',
                          help="Input BED file for interested regions on DNA")
    p_region.add_argument('-o', metavar=' ',
                          help="Output directory name for all the results and temporary files")
    p_region.add_argument('-t', metavar=' ', default=False,
                          help="Define the title name for the results under the Output name. (default: %(default)s)")
    p_region.add_argument('-n', type=int, default=10000, metavar=' ',
                          help="Number of times for randomization (default: %(default)s)")
    p_region.add_argument('-organism', metavar=' ',
                          help='Define the organism (hg19 or mm9)')
    p_region.add_argument('-showdbs', action="store_true",
                          help="Show the plots and statistics of DBS (DNA Binding sites)")
    p_region.add_argument('-score', action="store_true",
                          help="Load score column from input BED file")
    p_region.add_argument('-a', type=float, default=0.05, metavar=' ',
                          help="Define significance level for rejection null hypothesis (default: %(default)s)")
    p_region.add_argument('-ccf', type=int, default=40, metavar=' ',
                          help="Define the cut off value for DBS counts (default: %(default)s)")
    p_region.add_argument('-rt', action="store_true", default=False,
                          help="Remove temporary files (fa, txp...etc)")
    p_region.add_argument('-log', action="store_true", default=False,
                          help="Set the plots in log scale")
    p_region.add_argument('-f', type=str, default=False, metavar=' ',
                          help="Input BED file as mask in randomization")
    p_region.add_argument('-ac', type=str, default=False, metavar=' ',
                          help="Input file for RNA accecibility ")
    p_region.add_argument('-accf', type=float, default=500, metavar=' ',
                          help="Define the cut off value for RNA accecibility")
    p_region.add_argument('-obed', action="store_true", default=True,
                          help="Output the BED files for DNA binding sites.")
    p_region.add_argument('-showpa', action="store_true", default=False,
                          help="Show parallel and antiparallel bindings in the plot separately.")
    _add_triplexator_arguments(p_region)

    ################### get_dbss ###############################################
    p_bed2bed = subparsers.add_parser('get_dbss', help="Get DBSs in BED format from the single BED file")
    p_bed2bed.add_argument('-i', type=str, metavar=' ', help='Input BED file of the target regions')
    p_bed2bed.add_argument('-dbs', type=str, metavar=' ', help='Output BED file of the DBSs')
    p_bed2bed.add_argument('-rbs', type=str, metavar=' ', help='Output BED file of the RBSs')
    p_bed2bed.add_argument('-r', type=str, metavar=' ', help='Input FASTA file of the RNA')
    p_bed2bed.add_argument('-organism', metavar=' ', help='Define the organism (hg19 or mm9)')
    _add_triplexator_arguments(p_bed2bed, include_par=False)

    ################### integrate ##############################################
    p_integrate = subparsers.add_parser('integrate',
                                        help="Integrate the project's links and generate project-level statistics.")
    p_integrate.add_argument('-path', type=str, metavar=' ', help='Define the path of the project.')

    ################### updatehtml #############################################
    p_updatehtml = subparsers.add_parser('updatehtml', help="Update the project's html.")
    p_updatehtml.add_argument('-path', type=str, metavar=' ', help='Define the path of the project.')
    p_updatehtml.add_argument('-exp', type=str, metavar=' ', help='Define file with expression data.')

    return parser


def _remove_quietly(path):
    """Delete *path*; a failure to remove it (e.g. file absent) is ignored."""
    try:
        os.remove(path)
    except OSError:
        pass


def _elapsed(start, end):
    """Return the time between two ``time.time()`` stamps as an H:MM:SS string."""
    return str(datetime.timedelta(seconds=round(end - start)))


def _build_batch_command(args, rna_path, rna_name):
    """Build the ``rgt-TDF`` command line for one RNA of an ``-rl`` batch.

    Every element of the returned list is a string, as required by
    ``subprocess``. Options that only exist for ``promotertest`` are read
    only in that mode so the function also works for ``regiontest``.
    Only the options listed here are forwarded to the child process.
    """
    command = ["rgt-TDF", args.mode, "-r", rna_path, "-rn", rna_name,
               "-o", os.path.join(args.o, rna_name), "-organism", args.organism]

    # -de / -bg are defined by the promotertest sub-parser only.
    gene_list = getattr(args, "de", False)
    background = getattr(args, "bg", False)
    if gene_list and not args.bed:
        command += ["-de", gene_list]
    if args.bed and background:
        command += ["-bed", args.bed, "-bg", background]
    elif args.bed and args.mode == "regiontest":
        command += ["-bed", args.bed]

    if args.score:
        command.append("-score")
    if args.rt:
        command.append("-rt")
    if args.mode == "promotertest":
        if args.pl != 1000:
            command += ["-pl", str(args.pl)]
        command += ["-filter_havana", args.filter_havana,
                    "-protein_coding", args.protein_coding,
                    "-known_only", args.known_only]
    # -ccf and -l are always forwarded: their defaults differ between the
    # sub-commands, so comparing against one fixed default would drop values.
    command += ["-ccf", str(args.ccf)]
    if args.obed:
        command.append("-obed")
    if args.a != 0.05:
        command += ["-a", str(args.a)]
    if args.rm > 0:
        command += ["-rm", str(args.rm)]
    if args.fr != 'off':
        command += ["-fr", args.fr]
    if args.c != 2:
        command += ["-c", str(args.c)]
    if args.e != 20:
        command += ["-e", str(args.e)]
    if args.of != 1:
        command += ["-of", str(args.of)]
    command += ["-l", str(args.l)]
    return command


def _integrate_project(args):
    """Run the ``integrate`` mode: write the project-level index and statistics.

    Every sub-directory of ``args.path`` (except ``style``) holding a
    ``profile.txt`` is counted as one condition; the number of tests and of
    tests with a value below 0.05 in column 8 are listed in ``index.html``.
    """
    condition_list = []  # name, link, no. tests, no. sig.
    for item in os.listdir(args.path):
        if item == "style":
            continue
        if not os.path.isdir(os.path.join(args.path, item)):
            continue
        h = os.path.join(item, "index.html")
        pro = os.path.join(args.path, item, "profile.txt")
        if not os.path.isfile(pro):
            continue
        integrate_stat(path=os.path.join(args.path, item))
        nt = 0
        ns = 0
        with open(pro) as f:
            for line in f:
                line = line.strip().split("\t")
                if line[0] == "Experiment":
                    continue
                nt += 1
                if float(line[7]) < 0.05:
                    ns += 1
        condition_list.append([item, h, str(nt), str(ns)])

    if not condition_list:
        # Without any condition there is no style directory to link to.
        print("No condition with profile.txt is found in: " + str(args.path))
        sys.exit(1)

    link_d = {"List": "index.html"}
    fp = condition_list[0][0] + "/style"
    html = Html(name="Directory: " + args.path, links_dict=link_d,
                fig_rpath=fp, RGT_header=False, other_logo="TDF")
    html.add_heading("All conditions in: " + args.path + "/")
    type_list = 'sssssssssssss'
    col_size_list = [20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20]
    header_list = ["No.", "Conditions", "No. tests", "No. sig. tests"]
    data_table = [[str(c), '<a href="' + exp[1] + '">' + exp[0] + "</a>", exp[2], exp[3]]
                  for c, exp in enumerate(condition_list, 1)]
    html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                         align=10, cell_align="left", sortable=True)
    html.add_fixed_rank_sortable()
    html.write(os.path.join(args.path, "index.html"))
    gen_heatmap(path=args.path)
    generate_rna_exp_pv_table(root=args.path, multi_corr=False)
    merge_DBD_regions(path=args.path)


def _promoter_test(args, summary, t0):
    """Run the ``promotertest`` pipeline and write its plots, HTML and statistics."""
    if args.bed and not args.bg:
        print("Please add background promoters in BED format. (-bg)")
        sys.exit(1)
    if args.scoreh and not args.score:
        print("Score header (-scoreh) can only be used when scores (-score) are loaded.")
        print("Please add '-score'.")
        sys.exit(1)

    print2(summary, "\n" + "*************** Promoter Test ****************")
    print2(summary, "*** Input RNA sequence: " + args.r)

    # Show at most the last three components of a deep output path.
    if args.o.count("/") < 3:
        print2(summary, "*** Output directory: " + args.o)
    else:
        n = args.o.count("/") - 3 + 1
        print2(summary, "*** Output directory: " + args.o.split("/", n)[-1])

    args.r = os.path.normpath(os.path.join(dir, args.r))
    if args.de:
        args.de = os.path.normpath(os.path.join(dir, args.de))
    if args.bed:
        args.bed = os.path.normpath(os.path.join(dir, args.bed))
    if args.bg:
        args.bg = os.path.normpath(os.path.join(dir, args.bg))

    print2(summary, "Step 1: Calculate the triplex forming sites on RNA and DNA.")
    promoter = PromoterTest(gene_list_file=args.de, gtf=args.gtf, rna_name=args.rn, bed=args.bed, bg=args.bg,
                            organism=args.organism, promoterLength=args.pl, summary=summary, temp=dir,
                            output=args.o, showdbs=args.showdbs, score=args.score, scoreh=args.scoreh,
                            filter_havana=args.filter_havana, protein_coding=args.protein_coding,
                            known_only=args.known_only)
    if args.dump:
        sys.exit(0)
    promoter.get_rna_region_str(rna=args.r, expfile=args.rnaexp)
    promoter.connect_rna(rna=args.r, temp=args.o)
    promoter.search_triplex(temp=args.o, l=args.l, e=args.e, remove_temp=args.rt,
                            c=args.c, fr=args.fr, fm=args.fm, of=args.of, mf=args.mf, par=args.par)
    t1 = time.time()
    print2(summary, "\tRunning time is: " + _elapsed(t0, t1))

    print2(summary, "Step 2: Calculate the frequency of DNA binding sites within the promotors.")
    obedp = os.path.basename(args.o) if args.obed else None
    promoter.count_frequency(temp=args.o, remove_temp=args.rt, obedp=obedp, cutoff=args.ccf, l=args.l)
    promoter.fisher_exact(alpha=args.a)
    t2 = time.time()
    print2(summary, "\tRunning time is: " + _elapsed(t1, t2))

    if len(promoter.rbss) == 0:
        no_binding_response(args=args, rna_regions=promoter.rna_regions,
                            rna_name=promoter.rna_name, organism=promoter.organism,
                            stat=promoter.stat, expression=promoter.rna_expression)

    promoter.dbd_regions(output=args.o)
    os.remove(os.path.join(args.o, "rna_temp.fa"))
    _remove_quietly(os.path.join(args.o, "rna_temp.fa.fai"))

    print2(summary, "Step 3: Establishing promoter profile.")
    t3 = time.time()
    print2(summary, "\tRunning time is: " + _elapsed(t2, t3))

    print2(summary, "Step 4: Generate plot and output html files.")
    promoter.plot_lines(txp=promoter.txp_def, rna=args.r, dirp=args.o, ac=args.ac,
                        cut_off=args.accf, log=args.log, showpa=args.showpa,
                        sig_region=promoter.sig_DBD,
                        ylabel="Number of DBSs",
                        linelabel="No. DBSs", filename="plot_promoter.png")
    promoter.barplot(dirp=args.o, filename="bar_promoter.png", sig_region=promoter.sig_DBD)
    promoter.gen_html(directory=args.o, parameters=args, ccf=args.ccf, align=50, alpha=args.a)
    promoter.gen_html_genes(directory=args.o, align=50, alpha=args.a, nonDE=False)
    t4 = time.time()
    print2(summary, "\tRunning time is: " + _elapsed(t3, t4))
    print2(summary, "\nTotal running time is: " + _elapsed(t0, t4))

    output_summary(summary, args.o, "summary.txt")
    save_profile(rna_regions=promoter.rna_regions, rna_name=promoter.rna_name,
                 organism=promoter.organism, output=args.o, bed=args.bed,
                 geneset=args.de, stat=promoter.stat, topDBD=promoter.topDBD,
                 sig_DBD=promoter.sig_DBD, expression=promoter.rna_expression)
    revise_index(root=os.path.dirname(os.path.dirname(args.o)))
    _remove_quietly(os.path.join(args.o, "de.fa"))
    _remove_quietly(os.path.join(args.o, "nde.fa"))
    write_stat(stat=promoter.stat, filename=os.path.join(args.o, "stat.txt"))


def _region_test(args, summary, t0):
    """Run the ``regiontest`` pipeline and write its plots, HTML and statistics."""
    print2(summary, "\n" + "*************** Genomic Region Test ***************")
    print2(summary, "*** Input RNA sequence: " + args.r)
    print2(summary, "*** Input regions in BED: " + os.path.basename(args.bed))
    print2(summary, "*** Number of randomization: " + str(args.n))
    print2(summary, "*** Output directoey: " + os.path.basename(args.o))

    args.r = os.path.normpath(os.path.join(dir, args.r))

    print2(summary, "\nStep 1: Calculate the triplex forming sites on RNA and the given regions")
    randomtest = RandomTest(rna_fasta=args.r, rna_name=args.rn, dna_region=args.bed,
                            organism=args.organism, showdbs=args.showdbs)
    randomtest.get_rna_region_str(rna=args.r)
    obed = os.path.basename(args.o)
    randomtest.connect_rna(rna=args.r, temp=args.o)
    randomtest.target_dna(temp=args.o, remove_temp=args.rt, l=args.l, e=args.e, obed=obed,
                          c=args.c, fr=args.fr, fm=args.fm, of=args.of, mf=args.mf, par=args.par,
                          cutoff=args.ccf)
    t1 = time.time()
    print2(summary, "\tRunning time is: " + _elapsed(t0, t1))

    if len(randomtest.rbss) == 0:
        no_binding_response(args=args, rna_regions=randomtest.rna_regions,
                            rna_name=randomtest.rna_name, organism=randomtest.organism,
                            stat=randomtest.stat, expression=randomtest.rna_expression)

    print2(summary, "Step 2: Randomization and counting number of binding sites")
    randomtest.random_test(repeats=args.n, temp=args.o, remove_temp=args.rt, l=args.l, e=args.e,
                           c=args.c, fr=args.fr, fm=args.fm, of=args.of, mf=args.mf, par=args.par,
                           rm=args.rm, filter_bed=args.f, alpha=args.a)
    t2 = time.time()
    print2(summary, "\tRunning time is: " + _elapsed(t1, t2))

    print2(summary, "Step 3: Generating plot and output HTML")
    sig_region = randomtest.data["region"]["sig_region"]
    randomtest.dbd_regions(sig_region=sig_region, output=args.o)
    os.remove(os.path.join(args.o, "rna_temp.fa"))
    _remove_quietly(os.path.join(args.o, "rna_temp.fa.fai"))

    randomtest.lineplot(txp=randomtest.txpf, dirp=args.o, ac=args.ac, cut_off=args.accf, showpa=args.showpa,
                        log=args.log, ylabel="Number of DBS",
                        sig_region=randomtest.data["region"]["sig_region"],
                        linelabel="No. DBS", filename="lineplot_region.png")
    randomtest.boxplot(dir=args.o, matrix=randomtest.region_matrix,
                       sig_region=randomtest.data["region"]["sig_region"],
                       truecounts=[r[0] for r in randomtest.counts_tr.values()],
                       sig_boolean=randomtest.data["region"]["sig_boolean"],
                       ylabel="Number of target regions",
                       filename="boxplot_regions")
    randomtest.gen_html(directory=args.o, parameters=args, align=50, alpha=args.a,
                        score=args.score, obed=obed)
    t3 = time.time()
    print2(summary, "\tRunning time is: " + _elapsed(t2, t3))
    print2(summary, "\nTotal running time is: " + _elapsed(t0, t3))

    output_summary(summary, args.o, "summary.txt")
    save_profile(rna_regions=randomtest.rna_regions, rna_name=randomtest.rna_name,
                 organism=randomtest.organism, output=args.o, bed=args.bed,
                 stat=randomtest.stat, topDBD=randomtest.topDBD,
                 sig_DBD=randomtest.data["region"]["sig_region"],
                 expression=randomtest.rna_expression)
    list_all_index(path=os.path.dirname(args.o))

    # NOTE(review): these patterns are regular expressions, not globs --
    # "dna*.fa" means "dn", any number of "a", any one character, "fa".
    # Kept as-is; confirm whether a glob such as dna*.fa was intended.
    for f in os.listdir(args.o):
        if re.search("dna*.fa", f) or re.search("dna*.txp", f):
            os.remove(os.path.join(args.o, f))
    write_stat(stat=randomtest.stat, filename=os.path.join(args.o, "stat.txt"))


def main():
    """Command-line entry point of Triplex Domain Finder (rgt-TDF).

    Parses ``sys.argv`` and dispatches on the sub-command:

    * ``integrate``, ``updatehtml`` and ``get_dbss`` run and exit with 0.
    * ``promotertest`` / ``regiontest`` validate their arguments, optionally
      fan out over a list of RNAs (``-rl``, one ``rgt-TDF`` child process per
      listed FASTA path), then run the corresponding test.

    Called without arguments it prints the general help; called with only a
    sub-command name it prints that sub-command's help. Both exit with 1.
    """
    # Local import so this function does not rely on a file-level import.
    import getpass

    parser = _build_parser()

    ################### Parsing the arguments ################################
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    elif len(sys.argv) == 2:
        # A lone sub-command name: show the help of that sub-command.
        subparsers_actions = [action for action in parser._actions
                              if isinstance(action, argparse._SubParsersAction)]
        for subparsers_action in subparsers_actions:
            for choice, subparser in subparsers_action.choices.items():
                if choice == sys.argv[1]:
                    print("\nYou need more arguments.")
                    print("\nSubparser '{}'".format(choice))
                    subparser.print_help()
        sys.exit(1)

    args = parser.parse_args()

    ################### Modes that finish on their own #######################
    if args.mode == "integrate":
        _integrate_project(args)
        sys.exit(0)

    elif args.mode == "updatehtml":
        for item in os.listdir(args.path):
            pro = os.path.join(args.path, item, "profile.txt")
            if os.path.isfile(pro):
                update_profile(dirpath=os.path.join(args.path, item), expression=args.exp)
        revise_index(root=args.path)
        generate_rna_exp_pv_table(root=args.path, multi_corr=True)
        sys.exit(0)

    elif args.mode == "get_dbss":
        get_dbss(input_BED=args.i, output_BED=args.dbs, rna_fasta=args.r, output_rbss=args.rbs,
                 organism=args.organism, l=args.l, e=args.e, c=args.c,
                 fr=args.fr, fm=args.fm, of=args.of, mf=args.mf, rm=args.rm, temp=dir)
        os.remove("dna_targeted_region.fa")
        os.remove("dna_targeted_region.txp")
        os.remove("rna_temp.fa")
        sys.exit(0)

    ################### Checking arguments ###################################
    if not args.o:
        print("Please define the output directory name. \n")
        sys.exit(1)
    if not args.organism:
        print("Please define the organism. (hg19 or mm9)")
        sys.exit(1)
    if not args.rn and not args.rl:
        print("Please define RNA sequence name.")
        sys.exit(1)
    if args.r and args.rl:
        print("Both -r and -rl are given. TDF will skip -r and process -rl ")

    ################### Batch mode: one child process per listed RNA #########
    if args.rl:
        with open(args.rl) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # An empty line would yield an empty RNA name.
                    continue
                rn = os.path.basename(line).rpartition(".")[0]
                print("\tProcessing: " + rn)
                subprocess.call(_build_batch_command(args, line, rn))
        sys.exit(0)

    if not args.r:
        print("Please define the input RNA sequence file. (-r)")
        sys.exit(1)
    if args.mode == 'regiontest' and not args.bed:
        print("Please define the input BED file. (-bed)")
        sys.exit(1)

    t0 = time.time()

    # Normalised output path: <cwd>/<output>/<title>
    title = args.t if args.t else args.rn
    args.o = os.path.normpath(os.path.join(dir, args.o, title))
    check_dir(os.path.dirname(os.path.dirname(args.o)))
    check_dir(os.path.dirname(args.o))
    check_dir(args.o)

    # Header of the run summary
    summary = []
    summary.append("Time: " + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    summary.append("User: " + getpass.getuser())
    summary.append("\nCommand:\n\t$ " + " ".join(sys.argv))

    if args.mode == 'promotertest':
        _promoter_test(args, summary, t0)

    if args.mode == 'regiontest':
        _region_test(args, summary, t0)
def _add_triplexator_arguments(subparser, with_par=True):
    """Register the Triplexator options shared by several sub-commands.

    ``with_par`` controls whether the free-form ``-par`` option is added;
    the ``get_dbss`` sub-command does not define it.
    """
    subparser.add_argument(
        '-l', type=int, default=20, metavar=' ',
        help="[Triplexator] Define the minimum length of triplex "
             "(default: %(default)s)")
    subparser.add_argument(
        '-e', type=int, default=20, metavar=' ',
        help="[Triplexator] Set the maximal error-rate in %% tolerated "
             "(default: %(default)s)")
    subparser.add_argument(
        '-c', type=int, default=2, metavar=' ',
        help="[Triplexator] Sets the tolerated number of consecutive errors "
             "with respect to the canonical triplex rules as such were found "
             "to greatly destabilize triplexes in vitro "
             "(default: %(default)s)")
    subparser.add_argument(
        '-fr', type=str, default="off", metavar=' ',
        help="[Triplexator] Activates the filtering of low complexity "
             "regions and repeats in the sequence data "
             "(default: %(default)s)")
    subparser.add_argument(
        '-fm', type=int, default=0, metavar=' ',
        help="[Triplexator] Method to quickly discard non-hits "
             "(default: %(default)s).'0' = greedy approach; "
             "'1' = q-gram filtering.")
    subparser.add_argument(
        '-of', type=int, default=1, metavar=' ',
        help="[Triplexator] Define output formats of Triplexator "
             "(default: %(default)s)")
    subparser.add_argument(
        '-mf', action="store_true", default=False,
        help="[Triplexator] Merge overlapping features into a cluster and "
             "report the spanning region.")
    subparser.add_argument('-rm', type=int, default=0, metavar=' ',
                           help="[Triplexator] Set the multiprocessing")
    if with_par:
        subparser.add_argument(
            '-par', type=str, default="", metavar=' ',
            help="[Triplexator] Define other parameters for Triplexator")


def _build_parser():
    """Build the top-level argument parser with all five sub-commands."""
    parser = argparse.ArgumentParser(
        description='Triplex Domain Finder is a statistical framework '
                    'for detection of triple helix potential of '
                    'lncRNAs from genome-wide functional data. '
                    'Author: Chao-Chung Kuo'
                    '\nVersion: ' + __version__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    subparsers = parser.add_subparsers(help='sub-command help', dest='mode')

    # ----- Promoter test -------------------------------------------------
    h_promotor = "Promoter test evaluates the association between the given lncRNA to the target promoters."
    p_promoter = subparsers.add_parser('promotertest', help=h_promotor)
    p_promoter.add_argument(
        '-r', type=str, metavar=' ',
        help="Input file name for RNA sequence (in fasta format)")
    p_promoter.add_argument(
        '-rl', type=str, default=None, metavar=' ',
        help="Input list for paths to all RNA sequences (in fasta format)")
    p_promoter.add_argument('-rn', type=str, default=None, metavar=' ',
                            help="Define the RNA name")
    p_promoter.add_argument(
        '-de', default=False, metavar=' ',
        help="Input file for target gene list (gene symbols or Ensembl ID)")
    p_promoter.add_argument(
        '-bed', default=False, metavar=' ',
        help="Input BED file of the promoter regions of target genes")
    p_promoter.add_argument(
        '-bg', default=False, metavar=' ',
        help="Input BED file of the promoter regions of background genes")
    p_promoter.add_argument(
        '-o', metavar=' ', help="Output directory name for all the results")
    p_promoter.add_argument(
        '-t', metavar=' ', default=False,
        help="Define the title name for the results under the Output name. "
             "(default: %(default)s)")
    p_promoter.add_argument('-organism', metavar=' ',
                            help='Define the organism (hg19 or mm9)')
    p_promoter.add_argument(
        '-gtf', metavar=' ', default=None,
        help='Define the GTF file for annotation (optional)')
    p_promoter.add_argument(
        '-pl', type=int, default=1000, metavar=' ',
        help="Define the promotor length (default: %(default)s)")
    p_promoter.add_argument(
        '-showdbs', action="store_true",
        help="Show the plots and statistics of DBS (DNA Binding sites)")
    p_promoter.add_argument(
        '-score', action="store_true",
        help="Load score column from input gene list or BED file for analysis.")
    p_promoter.add_argument(
        '-scoreh', action="store_true",
        help="Use the header of scores from the given gene list or BED file.")
    p_promoter.add_argument(
        '-a', type=float, default=0.05, metavar=' ',
        help="Define significance level for rejection null hypothesis "
             "(default: %(default)s)")
    p_promoter.add_argument(
        '-ccf', type=int, default=100, metavar=' ',
        help="Define the cut off value for promoter counts "
             "(default: %(default)s)")
    p_promoter.add_argument(
        '-rt', action="store_true", default=False,
        help="Remove temporary files (fa, txp...etc)")
    p_promoter.add_argument('-log', action="store_true", default=False,
                            help="Set the plots in log scale")
    p_promoter.add_argument('-ac', type=str, default=False, metavar=' ',
                            help="Input file for RNA accecibility ")
    p_promoter.add_argument(
        '-accf', type=float, default=500, metavar=' ',
        help="Define the cut off value for RNA accecibility")
    p_promoter.add_argument(
        '-obed', action="store_true", default=True,
        help="Output the BED files for DNA binding sites.")
    p_promoter.add_argument(
        '-showpa', action="store_true", default=False,
        help="Show parallel and antiparallel bindings in the plot separately.")
    p_promoter.add_argument(
        '-filter_havana', type=str, default="F", metavar=' ',
        help="Apply filtering to remove HAVANA entries.")
    p_promoter.add_argument(
        '-protein_coding', type=str, default="F", metavar=' ',
        help="Apply filtering to get only protein coding genes.")
    p_promoter.add_argument(
        '-known_only', type=str, default="F", metavar=' ',
        help="Apply filtering to get only known genes.")
    p_promoter.add_argument(
        '-dump', action="store_true", default=False,
        help="Only dump the experimental file and leave the program.")
    p_promoter.add_argument(
        '-rnaexp', type=str, default=None, metavar=' ',
        help="Given a file with RNA name and the expression value")
    _add_triplexator_arguments(p_promoter)

    # ----- Genomic region test -------------------------------------------
    h_region = "Genomic region test evaluates the association between the given lncRNA to the target regions by randomization."
    p_region = subparsers.add_parser('regiontest', help=h_region)
    p_region.add_argument(
        '-r', type=str, metavar=' ',
        help="Input file name for RNA sequence (in fasta format)")
    p_region.add_argument(
        '-rl', type=str, default=None, metavar=' ',
        help="Input list for paths to all RNA sequences (in fasta format)")
    p_region.add_argument('-rn', type=str, default=False, metavar=' ',
                          help="Define the RNA name")
    p_region.add_argument(
        '-bed', metavar=' ',
        help="Input BED file for interested regions on DNA")
    p_region.add_argument(
        '-o', metavar=' ',
        help="Output directory name for all the results and temporary files")
    p_region.add_argument(
        '-t', metavar=' ', default=False,
        help="Define the title name for the results under the Output name. "
             "(default: %(default)s)")
    p_region.add_argument(
        '-n', type=int, default=10000, metavar=' ',
        help="Number of times for randomization (default: %(default)s)")
    p_region.add_argument('-organism', metavar=' ',
                          help='Define the organism (hg19 or mm9)')
    p_region.add_argument(
        '-showdbs', action="store_true",
        help="Show the plots and statistics of DBS (DNA Binding sites)")
    p_region.add_argument(
        '-score', action="store_true",
        help="Load score column from input BED file")
    p_region.add_argument(
        '-a', type=float, default=0.05, metavar=' ',
        help="Define significance level for rejection null hypothesis "
             "(default: %(default)s)")
    p_region.add_argument(
        '-ccf', type=int, default=40, metavar=' ',
        help="Define the cut off value for DBS counts (default: %(default)s)")
    p_region.add_argument(
        '-rt', action="store_true", default=False,
        help="Remove temporary files (fa, txp...etc)")
    p_region.add_argument('-log', action="store_true", default=False,
                          help="Set the plots in log scale")
    p_region.add_argument(
        '-f', type=str, default=False, metavar=' ',
        help="Input BED file as mask in randomization")
    p_region.add_argument('-ac', type=str, default=False, metavar=' ',
                          help="Input file for RNA accecibility ")
    p_region.add_argument(
        '-accf', type=float, default=500, metavar=' ',
        help="Define the cut off value for RNA accecibility")
    p_region.add_argument(
        '-obed', action="store_true", default=True,
        help="Output the BED files for DNA binding sites.")
    p_region.add_argument(
        '-showpa', action="store_true", default=False,
        help="Show parallel and antiparallel bindings in the plot separately.")
    _add_triplexator_arguments(p_region)

    # ----- get_dbss --------------------------------------------------------
    p_bed2bed = subparsers.add_parser(
        'get_dbss', help="Get DBSs in BED format from the single BED file")
    p_bed2bed.add_argument('-i', type=str, metavar=' ',
                           help='Input BED file of the target regions')
    p_bed2bed.add_argument('-dbs', type=str, metavar=' ',
                           help='Output BED file of the DBSs')
    p_bed2bed.add_argument('-rbs', type=str, metavar=' ',
                           help='Output BED file of the RBSs')
    p_bed2bed.add_argument('-r', type=str, metavar=' ',
                           help='Input FASTA file of the RNA')
    p_bed2bed.add_argument('-organism', metavar=' ',
                           help='Define the organism (hg19 or mm9)')
    _add_triplexator_arguments(p_bed2bed, with_par=False)

    # ----- integrate (rgt-TDF integrate -path) -----------------------------
    p_integrate = subparsers.add_parser(
        'integrate',
        help="Integrate the project's links and generate project-level statistics.")
    p_integrate.add_argument('-path', type=str, metavar=' ',
                             help='Define the path of the project.')

    # ----- updatehtml ------------------------------------------------------
    p_updatehtml = subparsers.add_parser(
        'updatehtml', help="Update the project's html.")
    p_updatehtml.add_argument('-path', type=str, metavar=' ',
                              help='Define the path of the project.')
    p_updatehtml.add_argument('-exp', type=str, metavar=' ',
                              help='Define file with expression data.')
    return parser


def _integrate_project(path):
    """Write the project-level index.html and statistics for ``path``, then exit."""
    # Each entry: name, link, no. tests, no. sig. tests
    condition_list = []
    for item in os.listdir(path):
        if item == "style":
            continue
        item_path = os.path.join(path, item)
        if not os.path.isdir(item_path):
            continue
        pro = os.path.join(item_path, "profile.txt")
        if not os.path.isfile(pro):
            continue
        integrate_stat(path=item_path)
        nt = 0
        ns = 0
        with open(pro) as f:
            for line in f:
                fields = line.strip().split("\t")
                if fields[0] == "Experiment":  # header row
                    continue
                nt += 1
                if float(fields[7]) < 0.05:
                    ns += 1
        condition_list.append(
            [item, os.path.join(item, "index.html"), str(nt), str(ns)])

    if not condition_list:
        # The style directory is taken from the first condition, so there is
        # nothing sensible to render without at least one.
        print("No condition with a profile.txt was found in: " + path)
        sys.exit(1)

    link_d = {"List": "index.html"}
    fp = condition_list[0][0] + "/style"
    html = Html(name="Directory: " + path, links_dict=link_d, fig_rpath=fp,
                RGT_header=False, other_logo="TDF")
    html.add_heading("All conditions in: " + path + "/")
    type_list = 'sssssssssssss'
    col_size_list = [20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20]
    header_list = ["No.", "Conditions", "No. tests", "No. sig. tests"]
    data_table = [
        [str(rank), '<a href="' + exp[1] + '">' + exp[0] + "</a>",
         exp[2], exp[3]]
        for rank, exp in enumerate(condition_list, 1)
    ]
    html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                         align=10, cell_align="left", sortable=True)
    html.add_fixed_rank_sortable()
    html.write(os.path.join(path, "index.html"))
    gen_heatmap(path=path)
    generate_rna_exp_pv_table(root=path, multi_corr=False)
    merge_DBD_regions(path=path)
    sys.exit(0)


def _build_batch_command(args, rna_path, rna_name):
    """Return the ``rgt-TDF`` command line that processes one RNA of an ``-rl`` list.

    Every element is a string, as ``subprocess`` requires.  Valued options are
    always forwarded with their parsed value so that the child run cannot fall
    back to a different default than the one in effect here.
    """
    command = ["rgt-TDF", args.mode, "-r", rna_path, "-rn", rna_name,
               "-o", os.path.join(args.o, rna_name),
               "-organism", args.organism]
    valued = []
    if args.mode == "promotertest":
        if args.de and not args.bed:
            command += ["-de", args.de]
        if args.bed and args.bg:
            command += ["-bed", args.bed, "-bg", args.bg]
        valued += [("-pl", args.pl),
                   ("-filter_havana", args.filter_havana),
                   ("-protein_coding", args.protein_coding),
                   ("-known_only", args.known_only)]
    else:
        # regiontest has no -de/-bg/-pl/filter options; it needs -bed instead.
        if args.bed:
            command += ["-bed", args.bed]
        if args.f:
            command += ["-f", args.f]
        valued.append(("-n", args.n))
    for flag, enabled in (("-score", args.score), ("-rt", args.rt),
                          ("-obed", args.obed)):
        if enabled:
            command.append(flag)
    valued += [("-ccf", args.ccf), ("-a", args.a), ("-rm", args.rm),
               ("-fr", args.fr), ("-c", args.c), ("-e", args.e),
               ("-of", args.of), ("-l", args.l)]
    for flag, value in valued:
        command += [flag, str(value)]
    return command


def _run_promoter_test(args, summary, t0):
    """Run the promoter test pipeline and write its plots, HTML and profile."""
    if args.bed and not args.bg:
        print("Please add background promoters in BED format. (-bg)")
        sys.exit(1)
    if args.scoreh and not args.score:
        print("Score header (-scoreh) can only be used when scores (-score) are loaded.")
        print("Please add '-score'.")
        sys.exit(1)

    print2(summary, "\n" + "*************** Promoter Test ****************")
    print2(summary, "*** Input RNA sequence: " + args.r)
    # Only show the last three path components of long output paths.
    if args.o.count("/") < 3:
        print2(summary, "*** Output directory: " + args.o)
    else:
        n = args.o.count("/") - 3 + 1
        print2(summary, "*** Output directory: " + args.o.split("/", n)[-1])

    args.r = os.path.normpath(os.path.join(dir, args.r))
    if args.de:
        args.de = os.path.normpath(os.path.join(dir, args.de))
    if args.bed:
        args.bed = os.path.normpath(os.path.join(dir, args.bed))
    if args.bg:
        args.bg = os.path.normpath(os.path.join(dir, args.bg))

    # Get GenomicRegionSet from the given genes
    print2(summary, "Step 1: Calculate the triplex forming sites on RNA and DNA.")
    promoter = PromoterTest(gene_list_file=args.de, gtf=args.gtf,
                            rna_name=args.rn, bed=args.bed, bg=args.bg,
                            organism=args.organism, promoterLength=args.pl,
                            summary=summary, temp=dir, output=args.o,
                            showdbs=args.showdbs, score=args.score,
                            scoreh=args.scoreh,
                            filter_havana=args.filter_havana,
                            protein_coding=args.protein_coding,
                            known_only=args.known_only)
    if args.dump:
        sys.exit(0)
    promoter.get_rna_region_str(rna=args.r, expfile=args.rnaexp)
    promoter.connect_rna(rna=args.r, temp=args.o)
    promoter.search_triplex(temp=args.o, l=args.l, e=args.e,
                            remove_temp=args.rt, c=args.c, fr=args.fr,
                            fm=args.fm, of=args.of, mf=args.mf, par=args.par)
    t1 = time.time()
    print2(summary, "\tRunning time is: " +
           str(datetime.timedelta(seconds=round(t1 - t0))))

    print2(summary, "Step 2: Calculate the frequency of DNA binding sites within the promotors.")
    obedp = os.path.basename(args.o) if args.obed else None
    promoter.count_frequency(temp=args.o, remove_temp=args.rt, obedp=obedp,
                             cutoff=args.ccf, l=args.l)
    promoter.fisher_exact(alpha=args.a)
    t2 = time.time()
    print2(summary, "\tRunning time is: " +
           str(datetime.timedelta(seconds=round(t2 - t1))))

    if len(promoter.rbss) == 0:
        # NOTE(review): presumably this reports the empty result and exits;
        # confirm against no_binding_response.
        no_binding_response(args=args, rna_regions=promoter.rna_regions,
                            rna_name=promoter.rna_name,
                            organism=promoter.organism, stat=promoter.stat,
                            expression=promoter.rna_expression)
    promoter.dbd_regions(output=args.o)
    os.remove(os.path.join(args.o, "rna_temp.fa"))
    try:
        os.remove(os.path.join(args.o, "rna_temp.fa.fai"))
    except OSError:
        # The FASTA index is optional; ignore it when it was never created.
        pass

    print2(summary, "Step 3: Establishing promoter profile.")
    t3 = time.time()
    print2(summary, "\tRunning time is: " +
           str(datetime.timedelta(seconds=round(t3 - t2))))

    print2(summary, "Step 4: Generate plot and output html files.")
    promoter.plot_lines(txp=promoter.txp_def, rna=args.r, dirp=args.o,
                        ac=args.ac, cut_off=args.accf, log=args.log,
                        showpa=args.showpa, sig_region=promoter.sig_DBD,
                        ylabel="Number of DBSs", linelabel="No. DBSs",
                        filename="plot_promoter.png")
    promoter.barplot(dirp=args.o, filename="bar_promoter.png",
                     sig_region=promoter.sig_DBD)
    promoter.gen_html(directory=args.o, parameters=args, ccf=args.ccf,
                      align=50, alpha=args.a)
    promoter.gen_html_genes(directory=args.o, align=50, alpha=args.a,
                            nonDE=False)
    t4 = time.time()
    print2(summary, "\tRunning time is: " +
           str(datetime.timedelta(seconds=round(t4 - t3))))
    print2(summary, "\nTotal running time is: " +
           str(datetime.timedelta(seconds=round(t4 - t0))))

    output_summary(summary, args.o, "summary.txt")
    save_profile(rna_regions=promoter.rna_regions,
                 rna_name=promoter.rna_name, organism=promoter.organism,
                 output=args.o, bed=args.bed, geneset=args.de,
                 stat=promoter.stat, topDBD=promoter.topDBD,
                 sig_DBD=promoter.sig_DBD,
                 expression=promoter.rna_expression)
    revise_index(root=os.path.dirname(os.path.dirname(args.o)))
    for leftover in ("de.fa", "nde.fa"):
        try:
            os.remove(os.path.join(args.o, leftover))
        except OSError:
            pass
    write_stat(stat=promoter.stat, filename=os.path.join(args.o, "stat.txt"))


def _run_region_test(args, summary, t0):
    """Run the genomic region (randomization) test and write its outputs."""
    print2(summary, "\n" + "*************** Genomic Region Test ***************")
    print2(summary, "*** Input RNA sequence: " + args.r)
    print2(summary, "*** Input regions in BED: " + os.path.basename(args.bed))
    print2(summary, "*** Number of randomization: " + str(args.n))
    print2(summary, "*** Output directoey: " + os.path.basename(args.o))
    args.r = os.path.normpath(os.path.join(dir, args.r))

    print2(summary, "\nStep 1: Calculate the triplex forming sites on RNA and the given regions")
    randomtest = RandomTest(rna_fasta=args.r, rna_name=args.rn,
                            dna_region=args.bed, organism=args.organism,
                            showdbs=args.showdbs)
    randomtest.get_rna_region_str(rna=args.r)
    obed = os.path.basename(args.o)
    randomtest.connect_rna(rna=args.r, temp=args.o)
    randomtest.target_dna(temp=args.o, remove_temp=args.rt, l=args.l,
                          e=args.e, obed=obed, c=args.c, fr=args.fr,
                          fm=args.fm, of=args.of, mf=args.mf, par=args.par,
                          cutoff=args.ccf)
    t1 = time.time()
    print2(summary, "\tRunning time is: " +
           str(datetime.timedelta(seconds=round(t1 - t0))))

    if len(randomtest.rbss) == 0:
        # NOTE(review): presumably this reports the empty result and exits;
        # confirm against no_binding_response.
        no_binding_response(args=args, rna_regions=randomtest.rna_regions,
                            rna_name=randomtest.rna_name,
                            organism=randomtest.organism,
                            stat=randomtest.stat,
                            expression=randomtest.rna_expression)

    print2(summary, "Step 2: Randomization and counting number of binding sites")
    randomtest.random_test(repeats=args.n, temp=args.o, remove_temp=args.rt,
                           l=args.l, e=args.e, c=args.c, fr=args.fr,
                           fm=args.fm, of=args.of, mf=args.mf, par=args.par,
                           rm=args.rm, filter_bed=args.f, alpha=args.a)
    t2 = time.time()
    print2(summary, "\tRunning time is: " +
           str(datetime.timedelta(seconds=round(t2 - t1))))

    print2(summary, "Step 3: Generating plot and output HTML")
    sig_region = randomtest.data["region"]["sig_region"]
    randomtest.dbd_regions(sig_region=sig_region, output=args.o)
    os.remove(os.path.join(args.o, "rna_temp.fa"))
    try:
        os.remove(os.path.join(args.o, "rna_temp.fa.fai"))
    except OSError:
        # The FASTA index is optional; ignore it when it was never created.
        pass
    randomtest.lineplot(txp=randomtest.txpf, dirp=args.o, ac=args.ac,
                        cut_off=args.accf, showpa=args.showpa, log=args.log,
                        ylabel="Number of DBS", sig_region=sig_region,
                        linelabel="No. DBS", filename="lineplot_region.png")
    randomtest.boxplot(
        dir=args.o, matrix=randomtest.region_matrix, sig_region=sig_region,
        truecounts=[r[0] for r in randomtest.counts_tr.values()],
        sig_boolean=randomtest.data["region"]["sig_boolean"],
        ylabel="Number of target regions", filename="boxplot_regions")
    randomtest.gen_html(directory=args.o, parameters=args, align=50,
                        alpha=args.a, score=args.score, obed=obed)
    t3 = time.time()
    print2(summary, "\tRunning time is: " +
           str(datetime.timedelta(seconds=round(t3 - t2))))
    print2(summary, "\nTotal running time is: " +
           str(datetime.timedelta(seconds=round(t3 - t0))))

    output_summary(summary, args.o, "summary.txt")
    save_profile(rna_regions=randomtest.rna_regions,
                 rna_name=randomtest.rna_name, organism=randomtest.organism,
                 output=args.o, bed=args.bed, stat=randomtest.stat,
                 topDBD=randomtest.topDBD, sig_DBD=sig_region,
                 expression=randomtest.rna_expression)
    list_all_index(path=os.path.dirname(args.o))
    # NOTE(review): these patterns are evaluated as regular expressions, not
    # globs ("dna*.fa" means "dn", any number of "a", one character, "fa"),
    # so names such as "dna_targeted_region.fa" do not match.  Left unchanged
    # because fixing it would start deleting files; confirm the intent.
    for f in os.listdir(args.o):
        if re.search("dna*.fa", f) or re.search("dna*.txp", f):
            os.remove(os.path.join(args.o, f))
    write_stat(stat=randomtest.stat, filename=os.path.join(args.o, "stat.txt"))


def main():
    """Command-line entry point of Triplex Domain Finder.

    Parses ``sys.argv`` and dispatches on the selected sub-command:
    ``integrate``, ``updatehtml`` and ``get_dbss`` run and exit immediately;
    ``promotertest`` and ``regiontest`` first validate ``-o``, ``-organism``
    and the RNA inputs, optionally fan out over an ``-rl`` list by invoking
    ``rgt-TDF`` once per RNA, and otherwise run the corresponding test in the
    output directory ``<cwd>/<-o>/<title>``.

    Exits with status 1 when no arguments or only a sub-command name are
    given (after printing help) and when a required option is missing.
    """
    import getpass  # local import: only needed for the run summary

    parser = _build_parser()

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    if len(sys.argv) == 2:
        # A bare sub-command name: show that sub-command's help.
        for action in parser._actions:
            if not isinstance(action, argparse._SubParsersAction):
                continue
            for choice, subparser in action.choices.items():
                if choice == sys.argv[1]:
                    print("\nYou need more arguments.")
                    print("\nSubparser '{}'".format(choice))
                    subparser.print_help()
                    sys.exit(1)
    # Anything else (including a lone "-h" or an unknown word) is handled by
    # argparse, which prints help or a usage error itself.
    args = parser.parse_args()

    if args.mode == "integrate":
        _integrate_project(args.path)
    elif args.mode == "updatehtml":
        for item in os.listdir(args.path):
            pro = os.path.join(args.path, item, "profile.txt")
            if os.path.isfile(pro):
                update_profile(dirpath=os.path.join(args.path, item),
                               expression=args.exp)
        revise_index(root=args.path)
        generate_rna_exp_pv_table(root=args.path, multi_corr=True)
        sys.exit(0)
    elif args.mode == "get_dbss":
        get_dbss(input_BED=args.i, output_BED=args.dbs, rna_fasta=args.r,
                 output_rbss=args.rbs, organism=args.organism, l=args.l,
                 e=args.e, c=args.c, fr=args.fr, fm=args.fm, of=args.of,
                 mf=args.mf, rm=args.rm, temp=dir)
        os.remove("dna_targeted_region.fa")
        os.remove("dna_targeted_region.txp")
        os.remove("rna_temp.fa")
        sys.exit(0)

    # ----- Checking arguments (promotertest / regiontest only) -------------
    if not args.o:
        print("Please define the output directory name. \n")
        sys.exit(1)
    if not args.organism:
        print("Please define the organism. (hg19 or mm9)")
        sys.exit(1)
    if not args.rn and not args.rl:
        print("Please define RNA sequence name.")
        sys.exit(1)
    if not args.r and not args.rl:
        print("Please define the input RNA sequence. (-r or -rl)")
        sys.exit(1)
    if args.r and args.rl:
        print("Both -r and -rl are given. TDF will skip -r and process -rl ")

    if args.rl:
        # Batch mode: one child rgt-TDF run per listed FASTA file.
        with open(args.rl) as f:
            for line in f:
                rna_path = line.strip()
                if not rna_path:
                    continue  # skip blank lines in the list
                rn = os.path.basename(rna_path).rpartition(".")[0]
                print("\tProcessing: " + rn)
                subprocess.call(_build_batch_command(args, rna_path, rn))
        sys.exit(0)

    t0 = time.time()
    # Normalised output path: <cwd>/<-o>/<title>, title defaulting to -rn.
    title = args.t if args.t else args.rn
    args.o = os.path.normpath(os.path.join(dir, args.o, title))
    check_dir(os.path.dirname(os.path.dirname(args.o)))
    check_dir(os.path.dirname(args.o))
    check_dir(args.o)

    # Header of the run summary
    summary = []
    summary.append("Time: " +
                   datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    summary.append("User: " + getpass.getuser())
    summary.append("\nCommand:\n\t$ " + " ".join(sys.argv))

    if args.mode == 'promotertest':
        _run_promoter_test(args, summary, t0)
    if args.mode == 'regiontest':
        _run_region_test(args, summary, t0)
def gen_html(self, directory, title, align=50):
    """Write the boxplot report pages ``index.html`` and ``parameters.html``.

    Both pages are written under ``<directory>/<title>/``.  The index page
    shows ``boxplot.png`` and, for every group in ``self.sortDict``, a table
    of pairwise Mann-Whitney U p-values between all (sort, colour) data
    series of that group; values that are not above 0.05 are rendered in red.
    The p-values are passed through ``multiple_correction`` before display.

    Args:
        directory: Base output directory.
        title: Page title; also the sub-directory the pages are written to.
        align: Left alignment passed to the tables (the p-value tables use
            ``align + 50``).
    """
    html_header = title
    link_d = OrderedDict()
    link_d["Boxplot"] = "index.html"
    link_d["Parameters"] = "parameters.html"

    html = Html(name=html_header, links_dict=link_d, fig_rpath="../style",
                RGT_header=False, other_logo="viz", homepage="../index.html")
    html.add_figure("boxplot.png", align="center")

    type_list = 'ssssssssssssssssssssssssssssssssssssssssssssss'

    #### Calculate p value ####
    # plist[group][s1 + c1][s2 + c2] -> p-value of series (s1, c1) vs (s2, c2)
    plist = {}
    for g in self.sortDict.keys():
        plist[g] = {}
        for s1 in self.sortDict[g].keys():
            for c1 in self.sortDict[g][s1].keys():
                data1 = self.sortDict[g][s1][c1]
                plist[g][s1 + c1] = {}
                for s2 in self.sortDict[g].keys():
                    for c2 in self.sortDict[g][s2].keys():
                        if s2 == s1 and c2 == c1:
                            continue  # no self-comparison
                        data2 = self.sortDict[g][s2][c2]
                        u, p_value = mannwhitneyu(data1, data2)
                        plist[g][s1 + c1][s2 + c2] = p_value

    print("Multiple test correction.")
    multiple_correction(plist)

    for g in self.sortDict.keys():
        html.add_heading(g, size=4, bold=False)
        data_table = []
        col_size_list = [15]
        header_list = ["p-value"]
        for s in self.sortDict[g].keys():
            # Use the colours of *this* sort key so the header columns line
            # up with the cells appended to each row below.
            for c in self.sortDict[g][s].keys():
                header_list.append(s + "\n" + c)
                col_size_list.append(15)
        for s1 in self.sortDict[g].keys():
            for c1 in self.sortDict[g][s1].keys():
                row = [s1 + "\n" + c1]
                for s2 in self.sortDict[g].keys():
                    for c2 in self.sortDict[g][s2].keys():
                        if s2 == s1 and c2 == c1:
                            row.append("-")
                            continue
                        p = plist[g][s1 + c1][s2 + c2]
                        if p > 0.05:
                            row.append(value2str(p))
                        else:
                            row.append("<font color=\"red\">" +
                                       value2str(p) + "</font>")
                data_table.append(row)
        html.add_zebra_table(header_list, col_size_list, type_list,
                             data_table, align=align + 50)

    html.write(os.path.join(directory, title, "index.html"))

    ## Parameters
    html = Html(name=html_header, links_dict=link_d, fig_rpath="../style",
                RGT_header=False, other_logo="viz", homepage="../index.html")
    header_list = ["Assumptions and hypothesis"]
    col_size_list = [50]
    data_table = [
        ['All the regions among different BED files are normalized by quantile normalization.'],
        ['If there is any grouping problem, please check all the optional columns in input experimental matrix.'],
    ]
    html.add_zebra_table(header_list, col_size_list, type_list, data_table,
                         align=align, cell_align="left")
    html.add_free_content([
        '<a href="parameters.txt" style="margin-left:100">See parameters</a>'
    ])
    html.add_free_content([
        '<a href="experimental_matrix.txt" style="margin-left:100">See experimental matrix</a>'
    ])
    html.write(os.path.join(directory, title, "parameters.html"))