def uniqStat(bams):
    """Submit one "stat unique mapped reads" step per BAM and collect outputs.

    NOTE(review): this looks like an unfinished stub -- the command string is
    empty and ``out`` is never assigned inside the function, so unless a
    module-level ``out`` exists the first iteration raises ``NameError``.

    Args:
        bams: iterable of BAM entries; the elements are iterated over but
            not used when building the command.

    Returns:
        list: one ``out`` value per element of ``bams`` (an empty list when
        ``bams`` is empty).
    """
    outs = []
    for bam in bams:
        # placeholder: no command text has been written for this step
        cmd = ""
        log.run("stat unique mapped reads", cmd, para=2)
        # NOTE(review): `out` is not defined in this scope -- confirm intent
        outs.append(out)
    return outs
def statCov(parms):
    '''Submit sambamba depth statistics (per region and per base) for each BAM.

    Args:
        parms (dict) : which has the following keys::

            {
                bams: a list, [[bam1, prefix1],[bam2, prefix2], ...]
                bed : bed file
            }

    Returns:
        dict : ``{"regionStats": [...], "baseStats": [...]}`` holding the
        ``<prefix>.coverage.region.txt`` and ``<prefix>.coverage.base.txt``
        paths, in the same order as ``bams``.
    '''
    bam_pairs = parms['bams']
    bed_file = parms['bed']
    stats = {'region': [], 'base': []}
    for bam_file, sample_prefix in bam_pairs:
        # region mode is submitted before base mode for every BAM
        for mode in ('region', 'base'):
            report_path = sample_prefix + '.coverage.' + mode + '.txt'
            command = "{0} depth {1} -L {2} {3} -o {4}".format(
                sambamba, mode, bed_file, bam_file, report_path)
            log.run("stat coverage", command, para=2)
            stats[mode].append(report_path)
    return {'regionStats': stats['region'], 'baseStats': stats['base']}
def data_cycle(params):
    """Write one transfer JSON per SV annotation and submit its circos steps.

    Args:
        params (dict): shared settings; must contain ``sv_annos`` (a mapping
            of key -> SV input) and ``circos_sv_tmp`` (passed to ``get_cmd``
            as the template). Every other entry is copied into the JSON.

    Returns:
        dict: ``{key: {"conf": "<key>_circos.sv.conf",
        "png": "<key>_circos.sv.png"}}`` for every key of ``sv_annos``.
    """
    shared = params.copy()
    del shared["sv_annos"]
    sv_inputs = params["sv_annos"]
    results = {}
    for sample, sv_input in sv_inputs.items():
        name = str(sample)
        # the same dict is reused; these two entries are overwritten per sample
        shared["svinp"] = sv_input
        shared["prefix"] = name
        transfer_json = name + "_transfer.json"
        template = shared['circos_sv_tmp']
        # NOTE(review): the JSON is wrapped in single quotes for the shell;
        # a value containing ' would break the resulting command.
        dump_cmd = "echo '{0}' > {1} ".format(json.dumps(shared), transfer_json)
        log.run("produce a sv json file", dump_cmd, i=None, o=[transfer_json])
        results[sample] = {
            "conf": name + '_circos.sv.conf',
            "png": name + "_circos.sv.png",
        }
        # submit the follow-up scripts that consume the JSON file
        get_cmd(template, transfer_json)
    return results
def sam2bam(sam, prefix):
    """Submit a ``samtools view -bS`` conversion of ``sam``.

    Args:
        sam: path of the SAM file to convert.
        prefix: output prefix; the BAM is written to ``<prefix>.raw.bam``.

    Returns:
        str: the path of the BAM file named in the command.
    """
    raw_bam = prefix + '.raw.bam'
    convert = "{0} view -bS {1} > {2}".format(samtools, sam, raw_bam)
    log.run("sam2bam", convert)
    return raw_bam
def aln(parms):
    """Align reads with bwa and convert the resulting SAM to BAM.

    Args:
        parms (dict) : which has the following keys::

            {
                fq1      : the first fastq file
                fq2      : the second fastq file (falsy for single-end)
                prefix   : prefix of output file
                reference: reference file for bwa
                args     : extra args for map
            }

    Returns:
        dict: ``{"bam": "<prefix>.bam"}``
    """
    read1 = parms['fq1']
    read2 = parms['fq2']
    out_prefix = parms['prefix']
    genome = parms['reference']
    extra = parms['args']

    # a falsy second fastq selects the single-end wrapper
    if not read2:
        sam_file = SingleQ(read1, out_prefix, genome, extra)
    else:
        sam_file = DoubleQ(read1, read2, out_prefix, genome, extra)

    bam_file = out_prefix + '.bam'
    convert = "{0} view -bS {1} > {2}".format(samtools, sam_file, bam_file)
    log.run('sam to bam', convert)
    return {'bam': bam_file}
def report(params):
    """Render the enrichreport template to markdown and convert it to html.

    Args:
        params (dict): must contain the key ``yaml`` whose value is the path
            of the yaml file handed to the renderer.

    Returns:
        The value returned by ``yamladd(yamlin, {"enrichreport": out})`` with
        the extra key ``enrichreport_outdir`` set to the current working
        directory.
    """
    # handle input
    yamlin = params["yaml"]
    # The parsed content is not used further; the file is still loaded so an
    # unreadable or malformed yaml fails before any command is submitted.
    # The handle is now closed deterministically instead of being leaked.
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary objects and newer PyYAML requires the Loader argument --
    # consider yaml.safe_load if the input is plain data.
    with open(yamlin) as handle:
        yaml.load(handle)

    enrichreportl = get_template("enrichreport")
    out = "enrichreport.md"
    cmd = "%s -t %s -j %s -o %s -y" % (render, enrichreportl, yamlin, out)
    log.run("render enrichreport template", cmd)

    cmd = "%s %s" % (md2html, out)
    log.run("md2html enrichreport ", cmd, o=["html.tgz"])

    outdict = {"enrichreport": out}
    yamlout = yamladd(yamlin, outdict)
    yamlout["enrichreport_outdir"] = os.getcwd()
    return yamlout
def SingleQ(fq1, prefix, ref, parms):
    """Submit a single-end ``bwa mem`` alignment.

    Args:
        fq1: fastq file with the reads.
        prefix: output prefix; also used as ID, SM and LB of the read group.
        ref: reference passed to bwa.
        parms: extra command-line arguments inserted right after ``mem``.

    Returns:
        str: ``<prefix>.bwa.raw.sam``, the file the command redirects into.
    """
    sam_path = prefix + ".bwa.raw.sam"
    # {2} is reused so ID, SM and LB all carry the prefix
    align = '{0} mem {1} -R "@RG\\tID:{2}\\tSM:{2}\\tLB:{2}\\tPL:ILLUMINA" {3} {4} > {5} '.format(
        bwa, parms, prefix, ref, fq1, sam_path)
    log.run("bwa mem", align)
    return sam_path
def flagstat(bams, suffix):
    """Submit ``samtools flagstat`` for every BAM.

    Args:
        bams: iterable of ``(bam, prefix)`` pairs.
        suffix: appended to each prefix (after a dot) to name the output.

    Returns:
        list: the ``<prefix>.<suffix>`` output paths, in input order.
    """
    stat_files = []
    for bam_file, sample_prefix in bams:
        target = sample_prefix + '.' + suffix
        command = "{0} flagstat {1} > {2}".format(samtools, bam_file, target)
        log.run("mapping stat", command, para=2)
        stat_files.append(target)
    return stat_files
def get_cmd(tpl, jsonfile):
    """Submit the SV helper scripts that operate on ``jsonfile``.

    Args:
        tpl: template file, declared as an input of the final step.
        jsonfile: JSON file passed as the only argument to each script and
            declared as the output of every step.

    Returns:
        None
    """
    def _submit(script, label, inputs):
        # every step has the form "<script> <jsonfile>"
        log.run(label, "%s %s" % (script, jsonfile), i=inputs, o=[jsonfile])

    _submit(sv_filing,
            "use the sv_filing script to calculate the MM value",
            [jsonfile])
    _submit(sv_proconf,
            "get the conf file of circos",
            [tpl, jsonfile])
def DoubleQ(fq1, fq2, prefix, ref, parms):
    """Submit a paired-end ``bwa mem`` alignment.

    Args:
        fq1: first fastq file.
        fq2: second fastq file.
        prefix: output prefix; also used as ID, SM and LB of the read group.
        ref: reference passed to bwa.
        parms: extra command-line arguments inserted right after ``mem``.

    Returns:
        str: ``<prefix>.bwa.raw.sam``, the file the command redirects into.
    """
    sam_path = prefix + ".bwa.raw.sam"
    # {2} is reused so ID, SM and LB all carry the prefix
    align = '{0} mem {1} -R "@RG\\tID:{2}\\tSM:{2}\\tLB:{2}\\tPL:ILLUMINA" {3} {4} {5} > {6} '.format(
        bwa, parms, prefix, ref, fq1, fq2, sam_path)
    log.run("bwa mem", align)
    return sam_path
def data_cycle(params):
    """Submit one circos invocation per entry of ``params['circos_res']``.

    Args:
        params (dict): must contain ``circos_res``, a mapping whose values
            are dicts with the keys ``conf`` (circos configuration file) and
            ``png`` (declared output image).

    Returns:
        None
    """
    # only the values are needed; the mapping keys are not used
    for entry in params['circos_res'].values():
        conf_file = entry['conf']
        image = entry['png']
        command = "perl {0} -conf {1}".format(circostool, conf_file)
        log.run("Get the cmd of circos:", command, i=[conf_file], o=[image])
def genTempltRendrParms(mapRateFile, meanCovFile, nxs):
    """Submit a command that writes the template parameters as JSON.

    Args:
        mapRateFile: stored under the key ``mappingRateStat``.
        meanCovFile: stored under the key ``targetRegionCovStat``.
        nxs: stored under the key ``nXimages``.

    Returns:
        str: ``"mapping_template.json"``, the file the command writes to.
    """
    target = "mapping_template.json"
    payload = json.dumps({
        'mappingRateStat': mapRateFile,
        'targetRegionCovStat': meanCovFile,
        'nXimages': nxs,
    })
    # the JSON text is emitted through a shell echo, wrapped in single quotes
    log.run('generate template json', "echo '{0}' > {1} ".format(payload, target))
    return target
def statCov(bams, bed):
    """Submit sambamba depth statistics (per region and per base) per BAM.

    Args:
        bams: iterable of ``(bam, prefix)`` pairs.
        bed: bed file passed to sambamba with ``-L``.

    Returns:
        tuple: ``(region_outs, base_outs)`` -- two lists with the
        ``<prefix>.coverage.region.txt`` and ``<prefix>.coverage.base.txt``
        paths, in input order.
    """
    def _depth(mode, bam_file, sample_prefix):
        # build and submit one "sambamba depth <mode>" command
        target = sample_prefix + '.coverage.' + mode + '.txt'
        command = "{0} depth {1} -L {2} {3} -o {4}".format(
            sambamba, mode, bed, bam_file, target)
        log.run("stat coverage", command, para=2)
        return target

    per_region = []
    per_base = []
    for bam_file, sample_prefix in bams:
        per_region.append(_depth('region', bam_file, sample_prefix))
        per_base.append(_depth('base', bam_file, sample_prefix))
    return per_region, per_base
def report(parms):
    """Render the gene-enrichment report to markdown and convert it to html.

    Args:
        parms (dict): with the keys

            * ``templtJson`` -- json file passed to the renderer with ``-j``
            * ``template`` -- report template; when it starts with
              ``http://`` it is first resolved through ``get_templt``
            * ``resultsDirectory`` -- optional output directory, defaults to
              ``./`` when the key is absent

    Returns:
        None
    """
    ijson = parms['templtJson']
    enrichTemplt = parms['template']
    if enrichTemplt.startswith("http://"):
        enrichTemplt = get_templt(enrichTemplt)

    # Only a missing key should trigger the default; the previous bare
    # ``except:`` would also have hidden unrelated errors.
    try:
        targetDir = parms['resultsDirectory']
    except KeyError:
        targetDir = './'

    out = os.path.join(targetDir, "geneEnrich_report.md")
    cmd = "%s -t %s -j %s -o %s" % (render, enrichTemplt, ijson, out)
    log.run("generating cancer drug report templete", cmd)

    cmd = "%s %s" % (md2html, out)
    log.run("generating mapping report", cmd)
def indexs(parms):
    """Submit ``sambamba index`` for a BAM file.

    Args:
        parms (dict) : which has the following keys::

            {
                bam: bam file
            }

    Returns:
        None
    """
    bam_file = parms['bam']
    log.run('bam index', "{0} index {1}".format(sambamba, bam_file))
def dedups(parms):
    """Submit a picard duplicate-removal step for a BAM file.

    Args:
        parms (dict) : which has the following keys::

            {
                bam   : bam file
                prefix: prefix of output
            }

    Returns:
        dict : ``{"bam": "<prefix>.dedup.bam", "prefix": prefix}``
    """
    source_bam = parms['bam']
    out_prefix = parms['prefix']
    deduped = out_prefix + ".dedup.bam"
    metrics = out_prefix + '.MarkDuplicates.stat'
    command = (
        "java -jar {0} INPUT={1} OUTPUT={2} REMOVE_DUPLICATES=true AS=true "
        "VALIDATION_STRINGENCY=SILENT M={3} "
    ).format(picard_dedup, source_bam, deduped, metrics)
    log.run('dedup bam', command)
    return {'bam': deduped, 'prefix': out_prefix}
def get_cmd(tpl, jsonfile):
    """Submit the SNV, CNV and SV helper scripts that operate on ``jsonfile``.

    Each step has the form ``<script> <jsonfile>`` and declares ``jsonfile``
    as its output; the steps are submitted in the order listed below.

    Args:
        tpl: template file, declared as an input of the final step.
        jsonfile: JSON file passed as the only argument to each script.

    Returns:
        None
    """
    def _submit(script, label, inputs):
        log.run(label, "%s %s" % (script, jsonfile), i=inputs, o=[jsonfile])

    _submit(snv_filing,
            "use the snv_filing script to get the separated file",
            [jsonfile])
    _submit(snv_count,
            "use the snv_count script to get the counted file",
            [jsonfile])
    _submit(cnv_filing,
            "use the cnv_filing script to calculate the MM value",
            [jsonfile])
    _submit(sv_filing,
            "use the sv_filing script to get the circos input",
            [jsonfile])
    _submit(multi_proconf,
            "get the conf file of circos",
            [tpl, jsonfile])
def report(params):
    """Render the circos_report template to markdown and convert it to html.

    Args:
        params (dict): must contain the key ``yaml`` whose value is the path
            of the input yaml file.

    Returns:
        The value returned by ``yamladd(yamlin, {"circos_report": out})`` with
        the extra key ``circos_report_outdir`` set to the current working
        directory.
    """
    # handle input
    yamlin = params["yaml"]
    # Close the handle deterministically instead of leaking it.
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary objects and newer PyYAML requires the Loader argument --
    # consider yaml.safe_load if the input is plain data.
    with open(yamlin) as handle:
        indict = yaml.load(handle)

    # the yaml content is re-serialised as JSON for the renderer
    yamlfile = "circos_report.json"
    paramstr = json.dumps(indict)
    cmd1 = "echo '%s' > %s " % (paramstr, yamlfile)
    tag1 = "make sure the existed yaml file"
    # pass the tag variable; previously the literal string "tag1" was used
    log.run(tag1, cmd1, i=None, o=[yamlfile])

    templ = get_template("circos_report")
    out = "circos_report.md"
    cmd = "%s -t %s -j %s -o %s -y" % (render, templ, yamlfile, out)
    log.run("render circos_report template", cmd)

    cmd = "%s %s" % (md2html, out)
    log.run("md2html circos_report ", cmd)

    outdict = {"circos_report": out}
    yamlout = yamladd(yamlin, outdict)
    yamlout["circos_report_outdir"] = os.getcwd()
    return yamlout
def sorts(parms):
    """Submit ``sambamba sort`` for a BAM file.

    Args:
        parms (dict) : which has the following keys::

            {
                bam   : bam file
                prefix: prefix of output
                args  : args of sambamba sort
            }

    Returns:
        dict : ``{"bam": "<prefix>.sort.bam", "prefix": prefix}``
    """
    source_bam = parms['bam']
    out_prefix = parms['prefix']
    extra = parms['args']
    sorted_bam = out_prefix + ".sort.bam"
    command = "{0} sort {1} {2} -o {3} ".format(
        sambamba, source_bam, extra, sorted_bam)
    log.run('sort bam', command)
    return {"bam": sorted_bam, 'prefix': out_prefix}
def intersects(parms):
    """Submit an intersectBed step restricting a BAM to a bed file.

    Args:
        parms (dict) : which has the following keys::

            {
                bam   : bam file
                bed   : bed file
                prefix: prefix of output
            }

    Returns:
        dict : ``{"bam": "<prefix>.target.bam", "prefix": prefix, "bed": bed}``
    """
    source_bam = parms['bam']
    bed_file = parms['bed']
    out_prefix = parms['prefix']
    on_target = out_prefix + ".target.bam"
    command = "{0} -abam {1} -b {2} -wa -u > {3}".format(
        intersectBed, source_bam, bed_file, on_target)
    log.run('intersect bam', command)
    return {'bam': on_target, 'prefix': out_prefix, 'bed': bed_file}
def report(parms):
    """Render the mapping report to markdown and convert it to html.

    Args:
        parms (dict) : which has the following keys::

            {
                template  : template of report; when it starts with
                            ``http://`` it is resolved through ``get_templt``
                templtJson: json file, input parameter for template
            }

    Returns:
        dict : ``{"outfile": "mapping_report.md"}``
    """
    template = parms['template']
    if template.startswith("http://"):
        template = get_templt(template)
    template_json = parms['templtJson']

    markdown = "mapping_report.md"
    render_cmd = "{0} -t {1} -j {2} -o {3}".format(
        render, template, template_json, markdown)
    log.run("generating mapping report templete", render_cmd)

    log.run("generating mapping report", "{0} {1}".format(md2html, markdown))
    return {'outfile': markdown}
def statMappingRate(parms):
    """Submit mapping-rate, coverage and nX plotting steps for a set of BAMs.

    Args:
        parms (dict) : which has the following keys::

            {
                sortBams   : a list, [[bam1, prefix1],[bam2, prefix2], ...]
                dedupBams  : a list, [[bam1, prefix1],[bam2, prefix2], ...]
                targetBams : a list, [[bam1, prefix1],[bam2, prefix2], ...]
                bed        : bed file
                samples    : a list, prefixs of bams
            }

    Returns:
        dict : ``{"regionStats": region_covs, "mapRateFile": mapRateFile,
        "meanCovFile": meanCovFile, "nXs": nxs}``
    """
    sort_bams = parms['sortBams']
    dedup_bams = parms['dedupBams']
    target_bams = parms['targetBams']
    bed_file = parms['bed']
    sample_names = parms['samples']

    # flagstat for each processing stage, then coverage on the dedup BAMs
    sort_stats = flagstat(sort_bams, 'sort.mapping.stat')
    dedup_stats = flagstat(dedup_bams, 'dedup.mapping.stat')
    target_stats = flagstat(target_bams, 'target.mapping.stat')
    region_covs, base_covs = statCov(dedup_bams, bed_file)

    map_dir = 'report/mapping'
    nx_dir = 'report/mapping/nX'

    # list arguments are handed to the helper scripts joined with '-'
    rate_file = os.path.join(map_dir, "readsMappingRateStat.xlsx")
    rate_cmd = "{0} {1} {2} {3} {4} {5}".format(
        mapRate, rate_file, '-'.join(sort_stats), '-'.join(dedup_stats),
        '-'.join(target_stats), '-'.join(sample_names))
    log.run("mapping rate stats", rate_cmd)

    mean_cov_file = os.path.join(map_dir, "AllFile.mean.coverage.xlsx")
    cov_cmd = "{0} {1} {2} {3} ".format(
        covFormat, '-'.join(base_covs), mean_cov_file, '-'.join(sample_names))
    log.run('bam coverage stats', cov_cmd)

    plots = []
    for name in sample_names:
        cov_table = name + '.cov.txt'
        png_path = os.path.join(nx_dir, name + '.region.coverage.png')
        plot_cmd = "{0} {1} {2} {3}".format(
            nXplot, cov_table, os.path.join(nx_dir, name), name)
        log.run('plot target region coverage rate', plot_cmd)
        plots.append(png_path)

    return {
        'regionStats': region_covs,
        'mapRateFile': rate_file,
        'meanCovFile': mean_cov_file,
        'nXs': plots,
    }
def get_cmd(tpl, jsonfile):
    """Submit the SNV helper scripts that operate on ``jsonfile``.

    Each step has the form ``<script> <jsonfile>`` and declares ``jsonfile``
    as its output.

    Args:
        tpl: template file, declared as an input of the final step.
        jsonfile: JSON file passed as the only argument to each script.

    Returns:
        None
    """
    def _submit(script, label, inputs):
        log.run(label, "%s %s" % (script, jsonfile), i=inputs, o=[jsonfile])

    _submit(snv_filing,
            "use the snv_filing script to get the separated file",
            [jsonfile])
    _submit(snv_count,
            "use the snv_count script to get the counted file",
            [jsonfile])
    _submit(snv_proconf,
            "get the conf file of circos",
            [tpl, jsonfile])
def enrich(parms):
    """Submit GO annotation, GO enrichment and KEGG enrichment for one file.

    Args:
        parms (dict): with the keys ``enrichFile`` (input file passed to each
            script) and ``prefix`` (output prefix passed to each script).

    Returns:
        dict: ``{"func": "<prefix>.func.go.txt", "go": "<prefix>.enrich.go.txt",
        "kegg": "<prefix>.enrich.kegg.txt"}``
    """
    gene_file = parms['enrichFile']
    out_prefix = parms['prefix']

    def _submit(label, script):
        # all three scripts take "<input file> <prefix>"
        log.run(label, "%s %s %s" % (script, gene_file, out_prefix))

    _submit('func annotation', funcAnnoGO)
    _submit('GO enrich analysis', enrichGO)
    _submit('KEGG enrich analysis', enrichKEGG)

    return {
        'func': out_prefix + '.func.go.txt',
        'go': out_prefix + '.enrich.go.txt',
        'kegg': out_prefix + '.enrich.kegg.txt',
    }
def report(ymlfile):
    """ {{projName}} to markdown file and html file

    The ``{{projName}}`` markers are literal text in this source.

    Args:
        ymlfile: path of the yaml file with the report arguments.

    Returns:
        ``ymlfile`` unchanged.
        NOTE(review): the dict returned by ``yamladd`` (with the
        ``{{projName}}_outdir`` key added) is built but not returned --
        confirm whether callers expect it instead.
    """
    # handle input; close the handle deterministically instead of leaking it.
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary objects and newer PyYAML requires the Loader argument --
    # consider yaml.safe_load if the input is plain data.
    with open(ymlfile) as handle:
        indict = yaml.load(handle)

    render_yml = "{{projName}}_render.yml"
    cmd = "echo '%s' > %s" % (json.dumps(indict), render_yml)
    log.run("get {{projName}} args to render", cmd)

    templ = get_template("{{projName}}")
    out = "{{projName}}.md"
    cmd = "%s -t %s -j %s -o %s -y" % (render, templ, render_yml, out)
    log.run("render {{projName}} template", cmd,
            docker="jbioi/report", singularity="report.img")

    cmd = "%s %s" % (md2html, out)
    log.run("md2html {{projName}} ", cmd,
            docker="jbioi/report", singularity="report.img")

    outdict = {"{{projName}}": out}
    # was ``yamladd(yamlin, ...)``: ``yamlin`` is not defined in this
    # function, the input path is the ``ymlfile`` parameter
    yamlout = yamladd(ymlfile, outdict)
    yamlout["{{projName}}_outdir"] = os.getcwd()
    return ymlfile
def arranger(parms):
    '''Arranging final results and generating report directory

    Args:
        parms (dict) : which has the following keys::

            {
                regionStats: a list, results of sambamba depth region
                mapRateFile: statistics of BAM mapping information
                meanCovFile: summary of mean coverage for bams
                nXs        : a list, plots of bam coverage
            }

    Returns:
        dict : ``{"templtJson": templtParms}`` where ``templtParms`` is the
        value returned by ``genTempltRendrParms``.
    '''
    regionStats = parms['regionStats']
    mapRateFile = parms['mapRateFile']
    meanCovFile = parms['meanCovFile']
    nxs = parms['nXs']

    mapDir = 'report/mapping'
    nXdir = os.path.join(mapDir, 'nX')
    mapStat = os.path.join(mapDir, 'mapStat')

    # Each directory is checked on its own; the second check used to test
    # nXdir again, so whether mapStat was created depended on the wrong path.
    for directory in (nXdir, mapStat):
        if not os.path.exists(directory):
            cmd = "mkdir -p %s" % directory
            log.run('mkdir', cmd)

    for region in regionStats:
        cmd = 'mv %s %s' % (region, mapStat)
        log.run('mv coverage stat files', cmd)

    templtParms = genTempltRendrParms(mapRateFile, meanCovFile, nxs)
    return {'templtJson': templtParms}
def enrich(parms):
    '''Gene GO functional annotation, GO enrichment and KEGG enrichment

    Args:
        parms (dict) : which has the following keys::

            {
                yaml : a yaml file including parms for gene enrich analysis;
                       its ``enrichFiles`` entry maps prefix -> input file
            }

    Returns:
        The value returned by ``yamladd(yaml_file, out_dict)``, where
        ``out_dict`` lists the expected pdf/csv/xls output names per prefix.
    '''
    yaml_file = parms["yaml"]
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary objects and newer PyYAML requires the Loader argument --
    # consider yaml.safe_load if the input is plain data.
    with open(yaml_file) as fp:
        enrich_dict = yaml.load(fp.read())
    files = enrich_dict['enrichFiles']

    func_pdfs, func_csvs, func_xls = [], [], []
    go_dot_pdfs, go_net_pdfs, go_csvs, go_csvs_all, go_xls, go_xls_all = [], [], [], [], [], []
    kegg_pdfs, kegg_csvs, kegg_csvs_all, kegg_xls, kegg_xls_all = [], [], [], [], []
    # collected but not part of out_dict below
    # NOTE(review): confirm whether the gene_id files should be exported too
    id_files = []

    for prefix, enrichFile in files.items():
        # check file #
        with open(enrichFile, 'r') as f:
            # strip the line ending first, otherwise a "Gene" column in the
            # last position reads as "Gene\n" and is reported as missing
            head = f.readline().rstrip('\r\n').split('\t')
        if "Gene" not in head:
            # warning only: the analysis steps are still submitted
            print("\n\nInput file: {} need the head include 'Gene'.\n\n".
                  format(enrichFile))

        # functation annotation #
        cmd = "%s %s %s" % (funcAnnoGO, enrichFile, prefix)
        log.run('func annotation', cmd, i=[enrichFile])
        # GO enrich analysis #
        cmd = "%s %s %s" % (enrichGO, enrichFile, prefix)
        log.run('GO enrich analysis', cmd)
        # KEGG enrich analysis #
        cmd = "%s %s %s" % (enrichKEGG, enrichFile, prefix)
        log.run('KEGG enrich analysis', cmd)

        # one set of GO outputs per item code
        for item in ['CC', 'BP', 'MF']:
            stem = '{prefix}.go.{item}'.format(prefix=prefix, item=item)
            func_pdfs.append(stem + '.bar.func.pdf')
            func_csvs.append(stem + '.func.csv')
            func_xls.append(stem + '.func.xls')
            go_dot_pdfs.append(stem + '.dot.enrich.pdf')
            go_net_pdfs.append(stem + '.net.enrich.pdf')
            go_csvs.append(stem + '.enrich.csv')
            go_csvs_all.append(stem + '.enrich.all.csv')
            go_xls.append(stem + '.enrich.xls')
            go_xls_all.append(stem + '.enrich.all.xls')

        # one set of id/KEGG outputs per prefix
        id_files.append("{}.gene_id.csv".format(prefix))
        kegg_pdfs.append('{}.KEGG.enrich.pdf'.format(prefix))
        kegg_csvs.append('{}.KEGG.enrich.csv'.format(prefix))
        kegg_csvs_all.append('{}.KEGG.enrich.all.csv'.format(prefix))
        kegg_xls.append('{}.KEGG.enrich.xls'.format(prefix))
        kegg_xls_all.append('{}.KEGG.enrich.all.xls'.format(prefix))

    out_dict = {
        'func_pdfs': func_pdfs,
        'go_dot_pdfs': go_dot_pdfs,
        'go_net_pdfs': go_net_pdfs,
        'kegg_pdfs': kegg_pdfs,
        'func_csvs': func_csvs,
        'func_xls': func_xls,
        'go_csvs': go_csvs,
        'go_xls': go_xls,
        'go_csvs_all': go_csvs_all,
        'go_xls_all': go_xls_all,
        'kegg_csvs': kegg_csvs,
        'kegg_csvs_all': kegg_csvs_all,
        'kegg_xls': kegg_xls,
        'kegg_xls_all': kegg_xls_all,
    }
    res = yamladd(yaml_file, out_dict)

    # csv2xls #
    cmd = "{csv2xls} {yamlfile}".format(csv2xls=csv2xls, yamlfile=yaml_file)
    log.run('csv2xls', cmd)
    return res
def arrange(parms):
    """Submit the arranger script for the three enrichment result files.

    Args:
        parms (dict): with the keys ``func``, ``go`` and ``kegg``; their
            values are passed to the script in that order.

    Returns:
        None
    """
    func_file = parms['func']
    go_file = parms['go']
    kegg_file = parms['kegg']
    command = "{0} {1} {2} {3}".format(arnge, func_file, go_file, kegg_file)
    log.run('arranger', command)