def main(**job_inputs):
    """Concatenate the input VCFs, normalise them and call a consensus FASTA.

    Reads from ``job_inputs``:
        input_vcfs: list of links to VCF files (downloaded without
            decompression).
        ref_fasta: link to the reference FASTA.
        output_prefix: optional string prepended to every output file name
            (defaults to the empty string).

    Returns:
        dict with ``consensus_fasta`` (link to the uploaded consensus FASTA)
        and ``consensus_vcf`` (link to the uploaded filtered VCF).
    """
    input_vcfs = [
        dx_utils.download_and_gunzip_file(f, skip_decompress=True)
        for f in job_inputs['input_vcfs']
    ]
    input_ref = dx_utils.download_and_gunzip_file(job_inputs['ref_fasta'])

    # create index files for inputs
    dx_utils.run_cmd(['samtools', 'faidx', input_ref])
    # Explicit loop rather than map(): map() is lazy on Python 3, so using it
    # purely for side effects would silently skip the tabix indexing.
    for vcf in input_vcfs:
        dx_utils.run_cmd('tabix {0}'.format(vcf))

    # bcftools concat reads the list of inputs from a file-of-file-names
    with open(VCF_FOFN, 'w') as fh:
        fh.write('\n'.join(input_vcfs))

    # get the bcftools version and help doc
    dx_utils.run_cmd(['bcftools', '--help'])

    output_prefix = job_inputs.get('output_prefix', '')
    output_bcf = output_prefix + 'concat' + '.bcf'
    threads_opt = '--threads={0}'.format(multiprocessing.cpu_count())

    # concatenate the bcf/vcf files, drop type="ref" records, then normalise
    concat_cmd = ['bcftools', 'concat', '-f', VCF_FOFN]
    view_cmd = ['bcftools', 'view', '-Ou', '-e\'type="ref"\'']
    norm_cmd = [
        'bcftools', 'norm', '-Ob', '-f', input_ref, '-o', output_bcf,
        threads_opt
    ]

    # run the commands
    dx_utils.run_pipe(concat_cmd, view_cmd, norm_cmd)

    # index the concatenated bcf file
    dx_utils.run_cmd(['bcftools', 'index', output_bcf])

    # call consensus
    output_fasta = output_prefix + 'consensus.fasta'
    consensus_filter = 'QUAL>1 && (GT="AA" || GT="Aa")'
    consensus_cmd = [
        'bcftools', 'consensus', '-i', consensus_filter, '-Hla', '-f',
        input_ref, output_bcf
    ]
    dx_utils.run_pipe(consensus_cmd, outputFile=output_fasta)

    # save the changes to vcf (same filter as the consensus call)
    output_vcf = output_prefix + 'changes.vcf.gz'
    vcf_cmd = [
        'bcftools', 'view', '-i', consensus_filter, '-Oz', threads_opt,
        output_bcf
    ]
    dx_utils.run_pipe(vcf_cmd, outputFile=output_vcf)

    consensus_link = dx_utils.gzip_and_upload(output_fasta)
    print(consensus_link)

    return {
        'consensus_fasta': consensus_link,
        'consensus_vcf': dxpy.dxlink(dxpy.upload_local_file(output_vcf)),
    }
def main(input_assembly, hic_alignments, restriction_enzyme_bases, filter_alignments, input_assembly_graph=None):
    """Scaffold an assembly with SALSA from Hi-C alignments and upload results.

    Args:
        input_assembly: link to the assembly FASTA; downloaded (and gunzipped)
            locally.
        hic_alignments: iterable of ``{'$dnanexus_link': ...}`` dicts pointing
            at BAM files. A file whose name ends in ``.gz`` is piped through
            ``gunzip`` before conversion to BED.
        restriction_enzyme_bases: sequence of strings, joined with ``,`` and
            passed to SALSA via ``-e``.
        filter_alignments: when truthy, keep only alignments that intersect
            the contigs listed in the assembly's ``.fai`` index.
        input_assembly_graph: optional link to an assembly graph; when given
            it is downloaded and passed to SALSA via ``-g``.

    Returns:
        dict with links for ``final_scaffold_fasta``, ``final_scaffold_agp``,
        ``alignment_iteration_1``, ``scaffold_length_iteration_1`` and
        ``scaffold`` (tarball of the remaining scaffold FASTA/AGP files).
    """
    # make sure we can run salsa
    dx_utils.run_cmd("python /opt/SALSA/run_pipeline.py -h")

    input_assembly = dx_utils.download_and_gunzip_file(input_assembly)
    alignment_prefix = input_assembly.split(".fasta")[0]

    # process inputs and convert to bed; the first file truncates the output
    # (>), every following file appends to it (>>)
    for index, bam_file in enumerate(hic_alignments):
        file_id = bam_file['$dnanexus_link']
        fn = dxpy.describe(file_id)['name']
        cmd = 'dx cat {0}'.format(file_id)
        if os.path.splitext(fn)[1] == '.gz':
            cmd += '| gunzip '
        cmd += '| bedtools bamtobed -i stdin'
        redirect = '>' if index == 0 else '>>'
        cmd += ' {0} {1}.bed'.format(redirect, alignment_prefix)
        dx_utils.run_cmd(cmd)

    # index the ref
    dx_utils.run_cmd('samtools faidx {0} '.format(input_assembly))

    # if we were asked to filter by contig names, make a bed file and subset
    # the input bed
    if filter_alignments:
        # Context managers guarantee both handles are closed even if a
        # malformed index line raises part-way through.
        with open('%s.fai' % input_assembly) as fai:
            with open("%s.contigs.bed" % alignment_prefix, 'w') as contigs_bed:
                for line in fai:
                    fields = line.strip().split()
                    if len(fields) < 2:
                        # skip blank/short lines instead of raising IndexError
                        continue
                    contigs_bed.write("%s\t1\t%s\n" % (fields[0], fields[1]))

        cmd = 'bedtools intersect -wa -a {0}.bed -b {0}.contigs.bed > {0}.filtered.bed'.format(
            alignment_prefix)
    else:
        cmd = 'ln -s {0}.bed {0}.filtered.bed'.format(alignment_prefix)
    dx_utils.run_cmd(cmd)

    # now sort the bed file
    cmd = "sort -T . -k4 {0}.filtered.bed > {0}.sorted.bed".format(
        alignment_prefix)
    dx_utils.run_cmd(cmd)

    cmd = 'python /opt/SALSA/run_pipeline.py -a {0} -b {1}.sorted.bed -l {0}.fai -o {2} -e {3} -m yes -p yes '
    cmd = cmd.format(input_assembly, alignment_prefix, './',
                     ','.join(restriction_enzyme_bases))
    if input_assembly_graph is not None:
        # The graph must be downloaded and its local path passed to -g;
        # previously the literal text "input_assembly_graph" was passed.
        graph_file = dx_utils.download_and_gunzip_file(input_assembly_graph)
        cmd = "%s -g %s" % (cmd, graph_file)
    dx_utils.run_cmd(cmd)

    output = {}

    # final scaffold
    final_fasta = glob.glob('scaffold*FINAL.fasta')[0]
    output['final_scaffold_fasta'] = dx_utils.gzip_and_upload(final_fasta)

    # final agp
    final_agp = glob.glob('scaffold*FINAL.agp')[0]
    output['final_scaffold_agp'] = dx_utils.gzip_and_upload(final_agp)

    # alignment_iteration_1 bed
    alignment_iteration_1 = glob.glob('alignment_iteration_1.bed')[0]
    output['alignment_iteration_1'] = dx_utils.gzip_and_upload(
        alignment_iteration_1)

    # scaffold_length_iteration_1
    scaffold_length_iteration_1 = glob.glob('scaffold_length_iteration_1')[0]
    output['scaffold_length_iteration_1'] = dx_utils.gzip_and_upload(
        scaffold_length_iteration_1)

    # all others: every scaffold FASTA/AGP except the two FINAL files, which
    # were already uploaded on their own
    files = glob.glob('scaffold*fasta')
    files.extend(glob.glob('scaffold*agp'))
    files = [f for f in files if f not in (final_fasta, final_agp)]
    # single-argument call form prints identically on Python 2 and 3
    print(files)
    output['scaffold'] = dx_utils.tar_files_and_upload(files, alignment_prefix)

    return output
def main(**job_inputs):
    """Run the two-enzyme hybrid scaffold (runTGH.R) and upload its outputs.

    Reads from ``job_inputs``:
        bng_enzyme1, bng_enzyme2: links to the two Bionano CMAP files.
        ngs_fasta_or_cmap: link to the NGS FASTA (or CMAP).
        enzyme1_name, enzyme2_name: enzyme names passed via ``-e1``/``-e2``.
        args_xml: optional link to an arguments XML; otherwise the bundled
            ``TGH/hybridScaffold_two_enzymes.xml`` under HYBRID_DIR is used.
        cuts1_file, cuts2_file: optional links passed via ``-m1``/``-m2``
            (only used when both are supplied).

    Returns:
        dict with ``scaffold_fasta``, ``scaffold_output``,
        ``ncbi_scaffold_final``, ``unscaffolded_final`` and ``scaffold_targz``.

    Raises:
        dxpy.AppError: if the run produced no hybrid scaffolds, or the
            NCBI / not-scaffolded FASTA files cannot be found.
    """
    bionano_cmap_1_link = job_inputs['bng_enzyme1']
    bionano_cmap_2_link = job_inputs['bng_enzyme2']
    ngs_fasta_link = job_inputs['ngs_fasta_or_cmap']
    args_xml_link = job_inputs.get('args_xml')

    # Download all the inputs
    bionano_cmap_1_filename = os.path.join(
        '/home/dnanexus', download_and_gunzip_file(bionano_cmap_1_link))
    bionano_cmap_2_filename = os.path.join(
        '/home/dnanexus', download_and_gunzip_file(bionano_cmap_2_link))
    ngs_fasta_filename = os.path.join('/home/dnanexus',
                                      download_and_gunzip_file(ngs_fasta_link))

    if args_xml_link:
        args_xml_filename = download_and_gunzip_file(args_xml_link)
    else:
        args_xml_filename = os.path.join(HYBRID_DIR, 'TGH',
                                         'hybridScaffold_two_enzymes.xml')

    output_dir = "hybrid_scaffold_output"
    run_cmd('mkdir {0}'.format(output_dir))
    results_tar = output_dir + '_results.tar'

    # print the tool's help text before the real run
    cmd = "Rscript {dir}/runTGH.R --help".format(dir=HYBRID_DIR)
    run_cmd(cmd)

    scaffold_cmd = (
        "Rscript {dir}/runTGH.R -N {ngs_fasta} "
        "-b1 {bng1} -b2 {bng2} -O {outdir} -R {refaligner} -t {results} ".
        format(dir=HYBRID_DIR,
               ngs_fasta=ngs_fasta_filename,
               bng1=bionano_cmap_1_filename,
               bng2=bionano_cmap_2_filename,
               outdir=output_dir,
               refaligner=os.path.join(TOOLS_DIR, 'RefAligner'),
               results=results_tar))

    scaffold_cmd += '-e1 {enzyme1} -e2 {enzyme2} '.format(
        enzyme1=job_inputs['enzyme1_name'], enzyme2=job_inputs['enzyme2_name'])

    if job_inputs.get("cuts1_file") and job_inputs.get("cuts2_file"):
        cuts1_file = download_and_gunzip_file(job_inputs["cuts1_file"])
        cuts2_file = download_and_gunzip_file(job_inputs["cuts2_file"])
        scaffold_cmd += '-m1 {cuts1} -m2 {cuts2} '.format(cuts1=cuts1_file,
                                                          cuts2=cuts2_file)

    # the arguments XML is the final positional argument
    scaffold_cmd += ' {args_xml}'.format(args_xml=args_xml_filename)

    run_cmd(scaffold_cmd)

    scaffold_final = glob.glob(
        os.path.join(output_dir, 'TGH_M1', 'AGPExport', '*HYBRID*'))

    if not scaffold_final:
        # Show the end of the tool log, then fail explicitly. Previously the
        # code only printed a message and carried on into an IndexError.
        hybrid_scaffold_log = os.path.join(output_dir, 'TGH.log')
        run_cmd('tail -n 50 {0}'.format(hybrid_scaffold_log))
        raise dxpy.AppError("ERROR: No hybrid scaffolds produced.")

    scaffold_final_ncbi = glob.glob(
        os.path.join(output_dir, 'hybrid_scaffolds*',
                     '*_HYBRID_SCAFFOLD_NCBI.fasta'))
    unscaffolded_final = glob.glob(
        os.path.join(output_dir, 'hybrid_scaffolds*',
                     '*_HYBRID_SCAFFOLD_NOT_SCAFFOLDED.fasta'))
    if not scaffold_final_ncbi or not unscaffolded_final:
        raise dxpy.AppError(
            "ERROR: NCBI or NOT_SCAFFOLDED FASTA not found under {0}".format(
                output_dir))

    output = {
        "scaffold_fasta": [
            dxpy.dxlink(dxpy.upload_local_file(f)) for f in scaffold_final
            if f.endswith(".fasta")
        ],
        "scaffold_output":
        [dxpy.dxlink(dxpy.upload_local_file(f)) for f in scaffold_final],
        "ncbi_scaffold_final":
        dx_utils.gzip_and_upload(scaffold_final_ncbi[0]),
        "unscaffolded_final":
        dx_utils.gzip_and_upload(unscaffolded_final[0])
    }

    # also ship the whole output directory as a single tarball
    tar_name = "hybrid_scaffold_output.tar.gz"
    tar_cmd = "tar czvf {tar_name} {outdir}".format(tar_name=tar_name,
                                                    outdir=output_dir)
    run_cmd(tar_cmd)
    output_id = dxpy.upload_local_file(tar_name)

    output["scaffold_targz"] = dxpy.dxlink(output_id)

    return output
def main(**job_inputs):
    """Run the single-enzyme hybrid scaffold (hybridScaffold.pl) and upload outputs.

    Reads from ``job_inputs``:
        refinefinal_merged_cmap: link to the Bionano CMAP.
        ngs_fasta_or_cmap: link to the NGS FASTA (or CMAP).
        args_xml: optional link to a config XML; otherwise the bundled
            ``hybridScaffold_config.xml`` under HYBRID_DIR is used.
        conflict_resolution_file: optional link, passed via ``-M``. When
            absent, ``b_conflict_filter`` / ``n_conflict_filter`` are passed
            via ``-B`` / ``-N`` instead.
        generate_molecules: when ``True`` adds ``-x`` and requires
            ``molecules_bnx_file`` and ``optargs_xml``.
        generate_chimeric: when ``True`` adds ``-y`` and requires
            ``molecules_bnx_file``.
        err_files: optional list of links, each passed via ``-e``.

    Returns:
        dict with ``scaffold_final``, ``scaffold_output``,
        ``cut_and_conflict``, ``ncbi_scaffold_final``, ``unscaffolded_final``
        and ``scaffold_targz``.

    Raises:
        dxpy.AppError: when a file required by ``-x`` / ``-y`` is missing.
    """
    bionano_cmap_link = job_inputs['refinefinal_merged_cmap']
    ngs_fasta_link = job_inputs['ngs_fasta_or_cmap']
    args_xml_link = job_inputs.get('args_xml')

    # Download all the inputs
    bionano_cmap_filename = download_and_gunzip_file(bionano_cmap_link)
    ngs_fasta_filename = download_and_gunzip_file(ngs_fasta_link)
    if args_xml_link:
        args_xml_filename = download_and_gunzip_file(args_xml_link)
    else:
        args_xml_filename = os.path.join(HYBRID_DIR, 'hybridScaffold_config.xml')

    output_dir = "hybrid_scaffold_output"

    # The command is assembled as one string; every fragment must end with a
    # space so that the next flag is not glued onto the previous value.
    scaffold_cmd = (
        "perl {dir}/hybridScaffold.pl -n {ngs_fasta} -b {cmap} "
        "-o {outdir} -c {args_xml} "
        .format(dir=HYBRID_DIR, ngs_fasta=ngs_fasta_filename, cmap=bionano_cmap_filename,
                args_xml=args_xml_filename, outdir=output_dir))
    scaffold_cmd += '-r {refaligner} '.format(refaligner=os.path.join(TOOLS_DIR, 'RefAligner'))

    if "conflict_resolution_file" in job_inputs:
        conflict_resolution_file = download_and_gunzip_file(job_inputs["conflict_resolution_file"])
        # named field needs a keyword argument; a positional one raises KeyError
        scaffold_cmd += '-M {cr_file} '.format(cr_file=conflict_resolution_file)
    else:
        scaffold_cmd += '-B {b_level} -N {n_level} '.format(
            b_level=job_inputs["b_conflict_filter"], n_level=job_inputs["n_conflict_filter"])

    # Initialised up front so the generate_chimeric branch can test it even
    # when generate_molecules is not set.
    molecules_bnx_file = None

    if job_inputs["generate_molecules"] is True:
        scaffold_cmd += '-x '
        # trailing space added: previously the following -m flag was
        # concatenated directly onto the scripts directory path
        scaffold_cmd += '-p {0} '.format(SCRIPTS_DIR)
        try:
            molecules_bnx_file = download_and_gunzip_file(job_inputs["molecules_bnx_file"])
            scaffold_cmd += '-m {0} '.format(molecules_bnx_file)
        except KeyError:
            raise dxpy.AppError("Molecules BNX file required for Align Molecules flag (-x)")
        try:
            optargs_xml = download_and_gunzip_file(job_inputs["optargs_xml"])
            scaffold_cmd += '-q {0} '.format(optargs_xml)
        except KeyError:
            raise dxpy.AppError("OptArgs XML file required for Align Molecules flag (-x)")

    if job_inputs["generate_chimeric"] is True:
        scaffold_cmd += '-y '
        if molecules_bnx_file:
            # already downloaded for -x; reuse the local copy
            scaffold_cmd += '-m {0} '.format(molecules_bnx_file)
        else:
            try:
                molecules_bnx_file = download_and_gunzip_file(job_inputs["molecules_bnx_file"])
                scaffold_cmd += '-m {0} '.format(molecules_bnx_file)
            except KeyError:
                raise dxpy.AppError("Molecules BNX file required for Generate Molecules flag")

    if "err_files" in job_inputs:
        err_files = [download_and_gunzip_file(err_file) for err_file in job_inputs["err_files"]]
        err_cmd = ' '.join(['-e {0}'.format(err) for err in err_files])
        scaffold_cmd += err_cmd

    run_cmd(scaffold_cmd)

    # list the output tree in the job log
    run_cmd('tree {0}'.format(output_dir))

    scaffold_final_ncbi = glob.glob(
        os.path.join(output_dir, 'hybrid_scaffolds*', '*_HYBRID_SCAFFOLD_NCBI.fasta'))
    unscaffolded_final = glob.glob(
        os.path.join(output_dir, 'hybrid_scaffolds*', '*_HYBRID_SCAFFOLD_NOT_SCAFFOLDED.fasta'))
    scaffold_final = glob.glob(
        os.path.join(output_dir, 'hybrid_scaffolds*', '*_HYBRID_SCAFFOLD.fasta'))
    scaffold_final.extend(glob.glob(
        os.path.join(output_dir, 'hybrid_scaffolds*', '*_HYBRID_SCAFFOLD.cmap')))
    scaffold_final.extend(glob.glob(
        os.path.join(output_dir, 'hybrid_scaffolds*', '*_HYBRID_SCAFFOLD.agp')))
    scaffold_output = glob.glob(
        os.path.join(output_dir, 'hybrid_scaffolds*', '*_HYBRID_SCAFFOLD*'))
    cut_and_conflict = glob.glob(os.path.join(output_dir, 'hybrid_scaffolds*', 'conflicts*.txt'))
    cut_and_conflict.extend(glob.glob(os.path.join(output_dir, 'hybrid_scaffolds*', '*_annotations.bed')))

    # scaffold_output excludes anything already uploaded as scaffold_final
    output = {"scaffold_final": [dx_utils.gzip_and_upload(f) for f in scaffold_final],
              "scaffold_output": [dx_utils.gzip_and_upload(f) for f in scaffold_output
                                  if f not in scaffold_final],
              "cut_and_conflict": [dxpy.dxlink(dxpy.upload_local_file(f)) for f in cut_and_conflict],
              "ncbi_scaffold_final": dx_utils.gzip_and_upload(scaffold_final_ncbi[0]),
              "unscaffolded_final": dx_utils.gzip_and_upload(unscaffolded_final[0])}

    # also ship the whole output directory as a single tarball
    tar_name = "hybrid_scaffold_output.tar.gz"
    tar_cmd = "tar czvf {tar_name} {outdir}".format(
        tar_name=tar_name,
        outdir=output_dir)
    dx_utils.run_cmd(tar_cmd)
    output_id = dxpy.upload_local_file(tar_name)

    output["scaffold_targz"] = dxpy.dxlink(output_id)

    return output
def main(**job_inputs):
    """Run hybridScaffold.pl (single enzyme) and upload the resulting files.

    Reads from ``job_inputs``:
        refinefinal_merged_cmap, ngs_fasta_or_cmap: required input links.
        args_xml: optional config XML link (falls back to the bundled
            ``hybridScaffold_config.xml`` under HYBRID_DIR).
        conflict_resolution_file: optional; passed via ``-M``. Otherwise
            ``b_conflict_filter`` / ``n_conflict_filter`` go to ``-B`` / ``-N``.
        generate_molecules / generate_chimeric: booleans enabling ``-x`` / ``-y``.
        molecules_bnx_file, optargs_xml: required by the flags above.
        err_files: optional list of links, each passed via ``-e``.

    Returns:
        dict with ``scaffold_final``, ``scaffold_output``, ``cut_and_conflict``,
        ``ncbi_scaffold_final``, ``unscaffolded_final`` and ``scaffold_targz``.
    """
    fetch = dx_utils.download_and_gunzip_file

    cmap_link = job_inputs['refinefinal_merged_cmap']
    fasta_link = job_inputs['ngs_fasta_or_cmap']
    xml_link = job_inputs.get('args_xml')

    # Download all the inputs
    cmap_path = fetch(cmap_link)
    fasta_path = fetch(fasta_link)
    if xml_link:
        xml_path = fetch(xml_link)
    else:
        xml_path = os.path.join(HYBRID_DIR, 'hybridScaffold_config.xml')

    output_dir = "hybrid_scaffold_output"

    scaffold_cmd = [
        "perl", os.path.join(HYBRID_DIR, "hybridScaffold.pl"),
        "-n", fasta_path,
        "-b", cmap_path,
        "-o", output_dir,
        "-c", xml_path,
        "-r", os.path.join(TOOLS_DIR, 'RefAligner'),
    ]

    # either an explicit conflict-resolution file or the two filter levels
    if "conflict_resolution_file" in job_inputs:
        resolution_path = fetch(job_inputs["conflict_resolution_file"])
        scaffold_cmd.extend(["-M", resolution_path])
    else:
        scaffold_cmd.extend(["-B", str(job_inputs["b_conflict_filter"]),
                             "-N", str(job_inputs["n_conflict_filter"])])

    molecules_bnx_file = None

    if job_inputs["generate_molecules"] is True:
        scaffold_cmd.extend(["-x", "-p", SCRIPTS_DIR])
        try:
            molecules_bnx_file = fetch(job_inputs["molecules_bnx_file"])
        except KeyError:
            raise dxpy.AppError("Molecules BNX file required for Align Molecules flag (-x)")
        scaffold_cmd.extend(["-m", molecules_bnx_file])
        try:
            optargs_path = fetch(job_inputs["optargs_xml"])
        except KeyError:
            raise dxpy.AppError("OptArgs XML file required for Align Molecules flag (-x)")
        scaffold_cmd.extend(["-q", optargs_path])

    if job_inputs["generate_chimeric"] is True:
        scaffold_cmd.append("-y")
        if not molecules_bnx_file:
            # not fetched for -x, so fetch it now
            try:
                molecules_bnx_file = fetch(job_inputs["molecules_bnx_file"])
            except KeyError:
                raise dxpy.AppError("Molecules BNX file required for Generate Molecules flag")
        scaffold_cmd.extend(["-m", molecules_bnx_file])

    if "err_files" in job_inputs:
        # download everything first, then append one -e per file
        err_paths = [fetch(link) for link in job_inputs["err_files"]]
        for err_path in err_paths:
            scaffold_cmd.extend(["-e", err_path])

    dx_utils.run_cmd(scaffold_cmd)
    dx_utils.run_cmd(["tree", output_dir])

    def in_scaffolds(pattern):
        # glob inside <output_dir>/hybrid_scaffolds
        return glob.glob(os.path.join(output_dir, 'hybrid_scaffolds', pattern))

    def in_scaffolds_any(pattern):
        # glob inside any <output_dir>/hybrid_scaffolds* directory
        return glob.glob(os.path.join(output_dir, 'hybrid_scaffolds*', pattern))

    scaffold_final_ncbi = in_scaffolds('*_HYBRID_SCAFFOLD_NCBI.fasta')[0]
    unscaffolded_final = in_scaffolds('*_HYBRID_SCAFFOLD_NOT_SCAFFOLDED.fasta')[0]

    scaffold_final = []
    for pattern in ('*_HYBRID_SCAFFOLD.fasta',
                    '*_HYBRID_SCAFFOLD.cmap',
                    '*_HYBRID_SCAFFOLD.agp'):
        scaffold_final.extend(in_scaffolds(pattern))

    extra_candidates = []
    for pattern in ('*_HYBRID_SCAFFOLD.xmap',
                    '*_HYBRID_SCAFFOLD_q.cmap',
                    '*_HYBRID_SCAFFOLD_r.cmap'):
        extra_candidates.extend(in_scaffolds(pattern))
    scaffold_output = [path for path in extra_candidates if path not in scaffold_final]

    cut_and_conflict = (in_scaffolds_any('conflicts*.txt')
                        + in_scaffolds_any('*_annotations.bed'))

    # make sure output files don't have colons
    for fasta in (scaffold_final_ncbi, unscaffolded_final):
        dx_utils.run_cmd(["sed", "-i.bak", "s/:/_/g", fasta])

    # upload outputs (same order as the keys below)
    output = {}
    output["scaffold_final"] = [dx_utils.gzip_and_upload(path) for path in scaffold_final]
    output["scaffold_output"] = [dx_utils.gzip_and_upload(path) for path in scaffold_output]
    output["cut_and_conflict"] = [dxpy.dxlink(dxpy.upload_local_file(path))
                                  for path in cut_and_conflict]
    output["ncbi_scaffold_final"] = dx_utils.gzip_and_upload(scaffold_final_ncbi)
    output["unscaffolded_final"] = dx_utils.gzip_and_upload(unscaffolded_final)

    # tarball of the whole output directory
    tar_name = "hybrid_scaffold_output.tar.gz"
    dx_utils.run_cmd("tar czvf {tar_name} {outdir}".format(
        tar_name=tar_name, outdir=output_dir))
    output["scaffold_targz"] = dxpy.dxlink(dxpy.upload_local_file(tar_name))

    return output
def main(**job_inputs):
    """Run the two-enzyme hybrid scaffold (runTGH.R) and upload its outputs.

    Reads from ``job_inputs``:
        bng_enzyme1, bng_enzyme2, ngs_fasta_or_cmap: required input links.
        enzyme1_name, enzyme2_name: passed via ``-e1`` / ``-e2``.
        args_xml: optional config XML link (falls back to the bundled
            ``TGH/hybridScaffold_two_enzymes.xml`` under HYBRID_DIR).
        cuts1_file, cuts2_file: optional; passed via ``-m1`` / ``-m2`` when
            both are supplied.

    Returns:
        dict with ``scaffold_fasta``, ``scaffold_output``,
        ``ncbi_scaffold_final``, ``unscaffolded_final`` and ``scaffold_targz``.

    Raises:
        dxpy.AppError: when none of the candidate output directories holds a
            ``*HYBRID_Export.fasta`` file.
    """
    fetch = dx_utils.download_and_gunzip_file
    home = '/home/dnanexus'

    cmap1_link = job_inputs['bng_enzyme1']
    cmap2_link = job_inputs['bng_enzyme2']
    fasta_link = job_inputs['ngs_fasta_or_cmap']
    xml_link = job_inputs.get('args_xml')

    # Download all the inputs
    cmap1_path = os.path.join(home, fetch(cmap1_link))
    cmap2_path = os.path.join(home, fetch(cmap2_link))
    fasta_path = os.path.join(home, fetch(fasta_link))

    if xml_link:
        xml_path = fetch(xml_link)
    else:
        xml_path = os.path.join(HYBRID_DIR, 'TGH', 'hybridScaffold_two_enzymes.xml')

    output_dir = "hybrid_scaffold_output"
    dx_utils.run_cmd(['mkdir', output_dir])
    results_tar = output_dir + '_results.tar'

    # print the tool's help text before the real run
    dx_utils.run_cmd(["Rscript", os.path.join(HYBRID_DIR, "runTGH.R"), "--help"])

    scaffold_cmd = [
        "Rscript", os.path.join(HYBRID_DIR, "runTGH.R"),
        "-N", fasta_path,
        "-b1", cmap1_path,
        "-b2", cmap2_path,
        "-O", output_dir,
        "-R", os.path.join(TOOLS_DIR, 'RefAligner'),
        "-t", results_tar,
        "-e1", job_inputs['enzyme1_name'],
        "-e2", job_inputs['enzyme2_name'],
    ]

    if job_inputs.get("cuts1_file") and job_inputs.get("cuts2_file"):
        cuts1_path = fetch(job_inputs["cuts1_file"])
        cuts2_path = fetch(job_inputs["cuts2_file"])
        scaffold_cmd.extend(["-m1", cuts1_path, "-m2", cuts2_path])

    # the arguments XML is the final positional argument
    scaffold_cmd.append(xml_path)

    dx_utils.run_cmd(scaffold_cmd)

    # try locating the outputs; the first directory with an export FASTA wins
    for candidate in ("TGH_M2", "TGH_M1",
                      "two_enzyme_hybrid_scaffold_M2",
                      "two_enzyme_hybrid_scaffold_M1"):
        candidate_dir = os.path.join(output_dir, candidate)
        export_dir = os.path.join(candidate_dir, 'AGPExport')

        scaffold_final = glob.glob(os.path.join(export_dir, '*HYBRID_Export.fasta'))
        if not scaffold_final:
            continue

        scaffold_final_ncbi = glob.glob(
            os.path.join(export_dir, '*HYBRID_Export_NCBI.fasta'))[0]
        unscaffolded_final = glob.glob(
            os.path.join(export_dir, '*HYBRID_Export_NOT_SCAFFOLDED.fasta'))[0]

        # these are searched for directly in the candidate directory
        extras = []
        for pattern in ('*_HYBRID_Export.agp',
                        '*_HYBRID_Export.xmap',
                        '*_HYBRID_Export_q.cmap',
                        '*_HYBRID_Export_r.cmap'):
            extras.extend(glob.glob(os.path.join(candidate_dir, pattern)))
        scaffold_output = [path for path in extras if path not in scaffold_final]
        break

    # if still not found, something went wrong
    if not scaffold_final:
        dx_utils.run_cmd(["tail", "-n", "50", os.path.join(output_dir, 'TGH.log')])
        raise dxpy.AppError("ERROR: No hybrid scaffolds produced.")

    # make sure output files don't have colons
    for fasta in (scaffold_final_ncbi, unscaffolded_final):
        dx_utils.run_cmd(["sed", "-i.bak", "s/:/_/g", fasta])

    def upload_link(path):
        # upload a local file as-is and return a link to it
        return dxpy.dxlink(dxpy.upload_local_file(path))

    output = {}
    output["scaffold_fasta"] = [upload_link(path) for path in scaffold_final
                                if path.endswith(".fasta")]
    output["scaffold_output"] = [upload_link(path) for path in scaffold_output]
    output["ncbi_scaffold_final"] = dx_utils.gzip_and_upload(scaffold_final_ncbi)
    output["unscaffolded_final"] = dx_utils.gzip_and_upload(unscaffolded_final)

    # tarball of the whole output directory
    tar_name = "hybrid_scaffold_output.tar.gz"
    dx_utils.run_cmd("tar czvf {tar_name} {outdir}".format(
        tar_name=tar_name, outdir=output_dir))
    output["scaffold_targz"] = upload_link(tar_name)

    return output