def validate_cores(args):
    """Compare the old and new ProjectDB builds for recently opened projects.

    Every project returned by the LIMS is inspected.  Projects without an
    open date, or opened ``args.days`` or more days ago, are skipped with an
    info message.  For the rest, ``oldDB.ProjectDB`` and ``newDB.ProjectDB``
    are built from the same inputs and their ``obj`` attributes are compared
    for equality.  A summary is logged at the end.

    Args:
        args: object read for ``conf`` (passed to ``load_couch_server``),
            ``days`` (age limit in days) and passed whole to ``setupLog``.
    """
    lims = Lims(BASEURI, USERNAME, PASSWORD)
    projects = lims.get_projects()
    couch = load_couch_server(args.conf)
    # The projects database is looked up but not used below; the lookup is
    # kept so that a missing database still fails at the same point.
    proj_db = couch['projects']
    samp_db = couch['samples']
    log = setupLog(args)

    tested = 0
    failed = []
    for project in projects:
        if not project.open_date:
            log.info("skipping project {0}, no open date".format(project.name))
            continue

        opened = datetime.datetime.strptime(project.open_date, '%Y-%m-%d')
        age = datetime.datetime.now() - opened
        if age >= datetime.timedelta(days=args.days):
            log.info("skipping project {0}, too old".format(project.name))
            continue

        tested += 1
        log.info("comparing project {0}".format(project.name))
        old_project = oldDB.ProjectDB(lims, project.id, samp_db)
        new_project = newDB.ProjectDB(lims, project.id, samp_db)
        if old_project.obj == new_project.obj:
            log.info("passed")
        else:
            log.error("PROJECT {0} FAILED COMPARISON".format(project.name))
            failed.append(project.name)

    # Every tested project either passed or was appended to ``failed``.
    passed = tested - len(failed)
    log.info("Final stats :")
    log.info("{0}/{1} passed".format(passed, tested))
    if failed:
        log.error("Failed projects : {0}".format(", ".join(failed)))
def generate_report(config_file, proj_conf, single_end, stranded):
    """Collect the field values used to fill in the RNA-seq report.

    Each section is best effort: when a section cannot be produced a message
    is printed and the corresponding field keeps its empty-string default, so
    one missing input never prevents the rest of the report from being built.

    Input files are read from, and ``stat.json``, ``RSeQC_rd.json`` and
    ``gbc.pdf`` are written to, paths relative to the current working
    directory.

    Args:
        config_file: passed to ``load_couch_server`` to reach StatusDB.
        proj_conf: mapping read for the keys ``"id"``, ``"samples"`` (an
            iterable of sample names) and ``"config"``.
        single_end: forwarded unchanged to ``make_stat``.
        stranded: accepted for interface compatibility; not used here.

    Returns:
        dict mapping report field names to their values.

    Raises:
        Whatever ``load_couch_server``, ``find_proj_from_view`` or the
        project document lookup raise; those calls are not guarded.
    """
    d = {
        "project_id": proj_conf["id"],
        "samplenames": " ".join(proj_conf["samples"]),
        "latex_opt": "",
        "uppnex": "",
        "mapping": "",
        "dup_rem": "",
        "read_count": "",
        "quantifyer": "",
        "gene_body_cov": "",
        "FPKM_heatmap": "",
        "Mapping_statistics": "",
        "Read_Distribution": "",
        "rRNA_table": "",
        "GBC": "",
        "strandness_table": "",
        "complexity_plot": "",
        "species": "",
        "genombuild": "",
        "rseqc_version": "",
        "Preseq": "",
        "date": date.today(),
        "anotation_version": "",
    }

    ## Latex option (no of floats per page)
    # The backslash is escaped explicitly; the resulting text is unchanged.
    d["latex_opt"] = ".. raw:: latex\n\n \\setcounter{totalnumber}{8}"

    ## Project information fetched from StatusDB
    couch = load_couch_server(config_file)
    proj_db = couch["projects"]
    proj_key = find_proj_from_view(proj_db, proj_conf["id"])
    info = proj_db[proj_key]

    # The uppnex id and the reference genome are looked up independently so
    # that a missing uppnex id no longer discards the species information.
    try:
        uppnex_proj = info["uppnex_id"]
    except (KeyError, TypeError):
        uppnex_proj = ""
        print("No uppnex ID fetched")
    d["uppnex"] = uppnex_proj

    species_by_genome = {
        "hg19": "Human",
        "mm9": "Mouse",
        "rn4": "Rat",
        # Both zebrafish builds map to the same species.  The previous
        # comparison against ("Zv9" or "Zv8") only ever matched "Zv9".
        "Zv9": "Zebrafish",
        "Zv8": "Zebrafish",
        "sacCer2": "Saccharomyces cerevisiae",
        "dm3": "Drosophila melanogaster",
    }
    reference_genome = None
    try:
        reference_genome = info["reference_genome"]
        # Unknown builds are reported under their own name.
        d["species"] = species_by_genome.get(reference_genome, reference_genome)
    except (KeyError, TypeError):
        print("No reference genome fetched")

    ## RNA-seq tools fetched from config file post_process.yaml
    # Fields are assigned one by one, so the ones read before a failing
    # lookup are kept.
    try:
        tools = proj_conf["config"]["custom_algorithms"]["RNA-seq analysis"]
        d["mapping"] = os.path.join(tools["aligner"], tools["aligner_version"])
        d["dup_rem"] = os.path.join(tools["dup_remover"], tools["dup_remover_version"])
        d["read_count"] = os.path.join(tools["counts"], tools["counts_version"])
        d["quantifyer"] = os.path.join(tools["quantifyer"], tools["quantifyer_version"])
        d["genombuild"] = tools[reference_genome]["name"]
        d["rseqc_version"] = tools["rseqc_version"]
        d["Preseq"] = tools["preseq"]
        d["anotation_version"] = tools[reference_genome]["annotation_release"]
    except Exception:
        print("Could not fetched RNA-seq tools from config file post_process.yaml")

    ## Mapping Statistics
    tab = Texttable()
    tab.set_cols_dtype(["t", "t", "t", "t"])
    tab.header(["Sample", "Tot NO Reads", "UniqMapped", "UniqMapped DuplRem"])
    statistics = {}
    try:
        for sample_name in proj_conf["samples"]:
            try:
                prep_log = "tophat_out_" + sample_name + "/logs/prep_reads.log"
                with open(prep_log, "r") as handle:
                    # Fourth whitespace-separated token on the third line.
                    tot_NO_read_pairs = handle.readlines()[2].split()[3]
                stat_path = "tophat_out_" + sample_name + "/stat" + sample_name
                with open(stat_path, "r") as handle:
                    sample_stat = make_stat(handle, tot_NO_read_pairs, single_end)
                tab.add_row(
                    [
                        sample_name,
                        tot_NO_read_pairs,
                        str(sample_stat["bef_dup_rem"]["%uniq_mapped"]) + "%",
                        str(sample_stat["aft_dup_rem"]["%uniq_mapped"]) + "%",
                    ]
                )
                statistics[sample_name] = sample_stat
            except Exception:
                print("Could not make mapping statistics for sample " + sample_name)
        d["Mapping_statistics"] = indent_texttable_for_rst(tab)
        # The file receives the str() form of the dict followed by a newline,
        # exactly as before (it is not serialized with the json module).
        with open("stat.json", "w") as stat_json:
            stat_json.write(str(statistics) + "\n")
    except Exception:
        print("Could not make Mapping Statistics table")

    ## Read Distribution
    try:
        tab = Texttable()
        tab.set_cols_dtype(["t", "t", "t", "t", "t", "t", "t", "t"])
        tab.header(["Sample", "CDS", "5'UTR", "3'UTR", "Intron", "TSS", "TES", "mRNA"])
        read_dist = {}
        for sample_name in proj_conf["samples"]:
            try:
                with open("RSeQC_rd_" + sample_name + ".out", "r") as handle:
                    sample_dist = read_RSeQC_rd233(handle)
                tab.add_row(
                    [
                        sample_name,
                        sample_dist["CDS_Exons"]["Tags/Kb"],
                        sample_dist["5'UTR_Exons"]["Tags/Kb"],
                        sample_dist["3'UTR_Exons"]["Tags/Kb"],
                        sample_dist["Introns"]["Tags/Kb"],
                        sample_dist["TSS_up_1kb"]["Tags/Kb"],
                        sample_dist["TES_down_1kb"]["Tags/Kb"],
                        sample_dist["mRNA_frac"],
                    ]
                )
                read_dist[sample_name] = sample_dist
            except Exception:
                print("Could not make read distribution for sample " + sample_name)
        # Same str()-of-dict output format as stat.json above.
        with open("RSeQC_rd.json", "w") as rd_json:
            rd_json.write(str(read_dist) + "\n")
        d["Read_Distribution"] = indent_texttable_for_rst(tab)
    except Exception:
        print("Could not make Read Distribution table")

    ## Gene Body Coverage
    try:
        figure()
        x = list(range(101))
        for sample_name in proj_conf["samples"]:
            y = zeros(101)
            with open(sample_name + ".geneBodyCoverage.txt", "r") as handle:
                for line in handle:
                    fields = line.split()
                    # Lines that are not "<percentile> <count>" integer pairs
                    # within range (headers, blanks) are skipped.
                    try:
                        percentile = int(fields[0])
                        y[percentile] = int(fields[1])
                    except (ValueError, IndexError, OverflowError):
                        continue
            # Curves are drawn without labels; no legend is added.
            plot(x, y)
        ylabel("read number")
        xlabel("percentile of gene body (5'->3')")
        savefig("gbc.pdf")
        d["GBC"] = image("gbc.pdf", width="100%")
    except Exception:
        print("could not make GBC plot")

    ## FPKM_heatmap
    if os.path.exists("FPKM_heatmap.pdf"):
        d["FPKM_heatmap"] = image("FPKM_heatmap.pdf", width="100%")
    else:
        print("could not make FPKM heatmap")

    ## complexity plot
    if os.path.exists("complexity_curves.pdf"):
        d["complexity_plot"] = image("complexity_curves.pdf", width="100%")
    else:
        print("could not make complexity plot")

    ## rRNA_table
    try:
        tab = Texttable()
        tab.set_cols_dtype(["t", "t"])
        tab.header(["Sample", "rRNA"])
        rrna_by_sample = {}
        with open("rRNA.quantification", "r") as handle:
            for line in handle:
                fields = line.split("\t")
                if len(fields) < 2:
                    # A blank or malformed line must not discard the table.
                    continue
                rrna_by_sample[fields[0].strip()] = fields[1].strip()
        for sample_name in proj_conf["samples"]:
            if sample_name in rrna_by_sample:
                tab.add_row([sample_name, rrna_by_sample[sample_name]])
        d["rRNA_table"] = indent_texttable_for_rst(tab)
    except Exception:
        print("could not generate rRNA table")

    ## strandness_table
    try:
        tab = Texttable()
        tab.set_cols_dtype(["t", "t"])
        tab.header(["Sample", "strand-specific reads"])
        try:
            with open("infer_experiment.json", "rb") as handle:
                strandness = json.load(handle)
        except Exception:
            print("can't open infer_experiment.json\n")
            # Hand over to the section-level handler explicitly instead of
            # relying on a NameError from an unset variable.
            raise
        for sample_name in proj_conf["samples"]:
            if sample_name in strandness:
                fraction = float(strandness[sample_name])
                tab.add_row([sample_name, str(fraction * 100) + "%"])
        d["strandness_table"] = indent_texttable_for_rst(tab)
    except Exception:
        print("could not generate strandness_table")

    return d
def generate_report(config_file, proj_conf, single_end, stranded, genome):
    """Collect the field values used to fill in the RNA-seq report.

    Each section is best effort: when a section cannot be produced a message
    is printed and the corresponding field keeps its empty-string default, so
    one missing input never prevents the rest of the report from being built.

    Input files are read from, and ``stat.json``, ``RSeQC_rd.json`` and
    ``geneBodyCoverage.pdf`` are written to, paths relative to the current
    working directory.

    Args:
        config_file: passed to ``load_couch_server`` to reach StatusDB.
        proj_conf: mapping read for the keys ``'id'``, ``'samples'`` (a
            sequence of sample names) and ``'config'``.
        single_end: forwarded unchanged to ``make_stat``.
        stranded: accepted for interface compatibility; not used here.
        genome: reference genome name; when falsy the project document's
            ``'reference_genome'`` entry is used instead.

    Returns:
        dict mapping report field names to their values.

    Raises:
        Whatever ``load_couch_server``, ``find_proj_from_view`` or the
        project document lookup raise; those calls are not guarded.
    """
    d = {
        'project_id': proj_conf['id'],
        'samplenames': ' '.join(proj_conf['samples']),
        'latex_opt': "",
        'uppnex': "",
        'mapping': "",
        'dup_rem': "",
        'read_count': "",
        'quantifyer': "",
        'gene_body_cov': "",
        'FPKM_heatmap': "",
        'Mapping_statistics': "",
        'Read_Distribution': "",
        'rRNA_table': "",
        'GBC': "",
        'strandness_table': "",
        'complexity_plot': "",
        'species': "",
        'genombuild': "",
        'rseqc_version': '',
        'Preseq': '',
        'date': date.today(),
        'anotation_version': '',
    }

    ## Latex option (no of floats per page)
    # The backslash is escaped explicitly; the resulting text is unchanged.
    d['latex_opt'] = '.. raw:: latex\n\n \\setcounter{totalnumber}{8}'

    ## Project information fetched from StatusDB
    couch = load_couch_server(config_file)
    proj_db = couch['projects']
    proj_key = find_proj_from_view(proj_db, proj_conf['id'])
    info = proj_db[proj_key]
    species = {
        'hg19': 'Human',
        'mm9': 'Mouse',
        'rn4': 'Rat',
        'rn5': 'Rat',
        'Zv8': 'Zebrafish',
        'Zv9': 'Zebrafish',
        'Zv10': 'Zebrafish',
        'sacCer2': 'Saccharomyces cerevisiae',
        'dm3': 'Drosophila melanogaster',
    }

    # The uppnex id and the reference genome are resolved independently so
    # that a missing uppnex id no longer discards the genome, in particular
    # one passed explicitly by the caller.
    try:
        uppnex_proj = info['uppnex_id']
    except (KeyError, TypeError):
        uppnex_proj = ""
        print("No uppnex ID fetched")
    d['uppnex'] = uppnex_proj

    reference_genome = None
    try:
        reference_genome = genome if genome else info['reference_genome']
        # Unknown builds are reported under their own name.
        d['species'] = species.get(reference_genome, reference_genome)
    except (KeyError, TypeError):
        print("No reference genome fetched")

    ## RNA-seq tools fetched from config file post_process.yaml
    # Fields are assigned one by one, so the ones read before a failing
    # lookup are kept.
    try:
        tools = proj_conf['config']['custom_algorithms']['RNA-seq analysis']
        d['mapping'] = os.path.join(tools['aligner'], tools['aligner_version'])
        d['dup_rem'] = os.path.join(tools['dup_remover'], tools['dup_remover_version'])
        d['read_count'] = os.path.join(tools['counts'], tools['counts_version'])
        d['quantifyer'] = os.path.join(tools['quantifyer'], tools['quantifyer_version'])
        d['genombuild'] = tools[reference_genome]['name']
        d['rseqc_version'] = tools['rseqc_version']
        d['Preseq'] = os.path.join(tools['Preseq'], tools['Preseq_version'])
        d['anotation_version'] = tools[reference_genome]['annotation_release']
    except Exception:
        print("Could not fetched RNA-seq tools from config file post_process.yaml")

    ## Mapping Statistics
    tab = Texttable()
    tab.set_cols_dtype(['t', 't', 't', 't'])
    tab.header(['Sample', 'Tot NO Reads', 'UniqMapped', 'UniqMapped DuplRem'])
    statistics = {}
    try:
        for sample_name in proj_conf['samples']:
            try:
                prep_log = 'tophat_out_' + sample_name + '/logs/prep_reads.log'
                with open(prep_log, 'r') as handle:
                    # Fourth whitespace-separated token on the third line.
                    tot_NO_read_pairs = handle.readlines()[2].split()[3]
                stat_path = 'tophat_out_' + sample_name + '/stat' + sample_name
                with open(stat_path, 'r') as handle:
                    sample_stat = make_stat(handle, tot_NO_read_pairs, single_end)
                tab.add_row([
                    sample_name,
                    tot_NO_read_pairs,
                    str(sample_stat['bef_dup_rem']['%uniq_mapped']) + '%',
                    str(sample_stat['aft_dup_rem']['%uniq_mapped']) + '%',
                ])
                statistics[sample_name] = sample_stat
            except Exception:
                print('Could not make mapping statistics for sample ' + sample_name)
        d['Mapping_statistics'] = indent_texttable_for_rst(tab)
        # The file receives the str() form of the dict followed by a newline,
        # exactly as before (it is not serialized with the json module).
        with open('stat.json', 'w') as stat_json:
            stat_json.write(str(statistics) + "\n")
    except Exception:
        print("Could not make Mapping Statistics table")

    ## Read Distribution
    try:
        tab = Texttable()
        tab.set_cols_dtype(['t', 't', 't', 't', 't', 't', 't', 't'])
        tab.header(["Sample", "CDS", "5'UTR", "3'UTR", "Intron", "TSS", "TES", "mRNA"])
        read_dist = {}
        for sample_name in proj_conf['samples']:
            try:
                with open('RSeQC_rd_' + sample_name + '.out', 'r') as handle:
                    sample_dist = read_RSeQC_rd233(handle)
                tab.add_row([
                    sample_name,
                    sample_dist['CDS_Exons']['Tags/Kb'],
                    sample_dist["5'UTR_Exons"]['Tags/Kb'],
                    sample_dist["3'UTR_Exons"]['Tags/Kb'],
                    sample_dist['Introns']['Tags/Kb'],
                    sample_dist['TSS_up_1kb']['Tags/Kb'],
                    sample_dist['TES_down_1kb']['Tags/Kb'],
                    sample_dist['mRNA_frac'],
                ])
                read_dist[sample_name] = sample_dist
            except Exception:
                print("Could not make read distribution for sample " + sample_name)
        # Same str()-of-dict output format as stat.json above.
        with open('RSeQC_rd.json', 'w') as rd_json:
            rd_json.write(str(read_dist) + "\n")
        d['Read_Distribution'] = indent_texttable_for_rst(tab)
    except Exception:
        print("Could not make Read Distribution table")

    ## Gene Body Coverage
    fig = None
    try:
        pdf_fn = "geneBodyCoverage.pdf"
        fig = plt.figure()
        axes = fig.add_subplot(111)
        # Leave room on the right-hand side for the legend.
        plt.subplots_adjust(right=0.7)
        samples = proj_conf['samples']
        sample_num = len(samples)
        # Repeat the 12-colour palette often enough to cover every sample.
        palette_repeats = int(math.ceil(float(sample_num) / 12))
        # One legend column per 40 samples, never fewer than one.
        legend_columns = max(1, int(math.ceil(float(sample_num) / 40)))
        colours = ['#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99', '#e31a1c',
                   '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a', '#ffff99', '#b15928'] * palette_repeats
        for index, sample_name in enumerate(samples):
            fn = os.path.realpath(sample_name + '.geneBodyCoverage.txt')
            try:
                with open(fn) as handle:
                    lines = handle.readlines()
            except IOError:
                # Skip the sample entirely: plotting here would draw another
                # sample's (or placeholder) values under this sample's name.
                print("Could not load input file: {}".format(fn))
                continue
            # Fresh buffer per sample so values never carry over.
            # NOTE(review): index 0 keeps the initial 1 when the file only
            # lists percentiles 1..100 - confirm this is intended.
            values = [1] * 101
            percentiles = lines[0].split()
            counts = lines[1].split()
            # Column 0 of both rows is skipped; columns 1..100 hold the data.
            for column in range(1, 101):
                percentile = percentiles[column]
                if not percentile.isdigit():
                    continue
                values[int(percentile)] = float(counts[column]) / 1000000
            # Colours are tied to the sample position, so a skipped sample
            # does not shift the colours of the following ones.
            axes.plot(values, label=sample_name, color=colours[index])

        # Tidy the axes
        axes.tick_params(which='both', labelsize=8, direction='out',
                         top=False, right=False)
        # Make the x axis labels percentages
        axes.set_xticklabels(["%d%%" % p for p in axes.get_xticks()])
        # Labels
        matplotlib.rcParams['mathtext.default'] = 'regular'
        plt.xlabel(r"Gene body position ($5' \rightarrow 3'$)")
        plt.ylabel(r'Cumulative Read Count ($\times 10^6$)')
        plt.title('Gene Body Coverage')
        # Legend
        axes.legend(loc='upper left', bbox_to_anchor=(1.02, 1.02),
                    fontsize=6, ncol=legend_columns)
        plt.savefig(pdf_fn)
        d['GBC'] = image(pdf_fn, width="100%")
    except Exception:
        print(sys.exc_info())
        print("could not make GBC plot")
    finally:
        # Release the figure even when plotting failed part-way.
        if fig is not None:
            plt.close(fig)

    ## FPKM_heatmap
    if os.path.exists("FPKM_heatmap.pdf"):
        d['FPKM_heatmap'] = image("FPKM_heatmap.pdf", width="100%")
    else:
        print("could not make FPKM heatmap")

    ## complexity plot
    if os.path.exists("complexity_curves.pdf"):
        d['complexity_plot'] = image("complexity_curves.pdf", width="100%")
    else:
        print("could not make complexity plot")

    ## rRNA_table
    try:
        tab = Texttable()
        tab.set_cols_dtype(['t', 't'])
        tab.header(["Sample", "rRNA"])
        rrna_by_sample = {}
        with open('rRNA.quantification', 'r') as handle:
            for line in handle:
                fields = line.split('\t')
                if len(fields) < 2:
                    # A blank or malformed line must not discard the table.
                    continue
                rrna_by_sample[fields[0].strip()] = fields[1].strip()
        for sample_name in proj_conf['samples']:
            if sample_name in rrna_by_sample:
                tab.add_row([sample_name, rrna_by_sample[sample_name]])
        d['rRNA_table'] = indent_texttable_for_rst(tab)
    except Exception:
        print("could not generate rRNA table")

    ## strandness_table
    try:
        tab = Texttable()
        tab.set_cols_dtype(['t', 't'])
        tab.header(["Sample", "strand-specific reads"])
        try:
            with open('infer_experiment.json', 'rb') as handle:
                strandness = json.load(handle)
        except Exception:
            print("can't open infer_experiment.json\n")
            # Hand over to the section-level handler explicitly instead of
            # relying on a NameError from an unset variable.
            raise
        for sample_name in proj_conf['samples']:
            if sample_name in strandness:
                fraction = float(strandness[sample_name])
                tab.add_row([sample_name, str(fraction * 100) + '%'])
        d['strandness_table'] = indent_texttable_for_rst(tab)
    except Exception:
        print("could not generate strandness_table")

    return d