def run(self): """ 1. extract contigs.fasta and read-contig.sam 2. run pile up """ contigs, _scaffolds, read_contig_sam, _stats = self.input_files_local[ 0] coverage_json, coverage_summary_csv = self.output_files_local() if os.path.getsize(contigs) < MIN_CONTIG_FILE_SIZE: command.write_text_to_file('{}', coverage_json) command.write_text_to_file('No Contigs', coverage_summary_csv) return # generate bam files bam_file = read_contig_sam.replace(".sam", ".bam") command.execute( command_patterns.ShellScriptCommand( script= r'''samtools view -S -b "${read_contig_sam}" | samtools sort - -o "${bam_file}";''', named_args={ 'read_contig_sam': read_contig_sam, 'bam_file': bam_file })) command.execute( command_patterns.SingleCommand(cmd="samtools", args=["index", bam_file])) # run coverage info output_csv, output_json = self.calc_contig2coverage(bam_file) os.rename(output_csv, coverage_summary_csv) os.rename(output_json, coverage_json)
def validate(self):
    ''' Make sure all the output files are generated. '''
    for f in self.output_files_local():
        if not os.path.exists(f):
            raise RuntimeError("output file %s should be generated after run" % f)
        # Tag the done files
        done_file = self.done_file(f)
        fmt_now = datetime.datetime.now(tz=pytz.UTC).strftime("%a %b %e %H:%M:%S %Z %Y")
        command.write_text_to_file(fmt_now, done_file)
    self.count_reads()
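# A hypothetical sketch of the done-file convention used above: the done
# marker sits next to its output file with a ".done" suffix, and its content
# is a human-readable UTC timestamp. Note that %e (space-padded day of
# month) is a glibc strftime extension and is not portable to all platforms.
import datetime
import pytz

def done_file_sketch(filename):
    return filename + ".done"  # assumed naming scheme, not the real helper

def utc_timestamp_sketch():
    # e.g. "Mon Jan  6 12:34:56 UTC 2020"
    return datetime.datetime.now(tz=pytz.UTC).strftime("%a %b %e %H:%M:%S %Z %Y")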
def run(self):
    ''' Dummy implementation. Just copy the files over.
    Real thing to be implemented later. '''
    input_files = self.input_files_local[0]
    output_files = self.output_files_local()
    for i in range(len(input_files)):
        command.copy_file(input_files[i], output_files[i])
    command.write_text_to_file('1234', output_files[4])
def fetch_input_files_from_s3(input_files, input_dir_s3, result_dir_local):
    for f in input_files:
        s3_file = os.path.join(input_dir_s3, f)
        local_file = os.path.join(result_dir_local, f)
        local_dir = os.path.dirname(local_file)
        command.make_dirs(local_dir)
        # copy the file over
        output_file = idseq_dag.util.s3.fetch_from_s3(s3_file, local_dir, allow_s3mi=True)
        if output_file:
            # write the done_file
            done_file = PipelineStep.done_file(local_file)
            fmt_now = datetime.datetime.now(tz=pytz.UTC).strftime("%a %b %e %H:%M:%S %Z %Y")
            command.write_text_to_file(fmt_now, done_file)
        else:
            raise RuntimeError(f"{s3_file} likely doesn't exist")
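# A minimal, hypothetical stand-in for idseq_dag.util.s3.fetch_from_s3 built
# on plain boto3 (the real helper also supports s3mi for faster downloads,
# per the allow_s3mi flag above). It returns the local path on success and
# None on failure, matching how the caller checks the result.
import os
import boto3
import botocore.exceptions

def fetch_from_s3_sketch(s3_path, local_dir):
    bucket, key = s3_path[len("s3://"):].split("/", 1)
    local_file = os.path.join(local_dir, os.path.basename(key))
    try:
        boto3.client("s3").download_file(bucket, key, local_file)
        return local_file
    except botocore.exceptions.ClientError:
        return None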
def make_star_index(fasta_file, gtf_file, output_star_genome_path, max_star_part_size):
    star_genome_dir_name = output_star_genome_path[:-4]
    # star genome organization
    # STAR_genome/part-${i}, parts.txt
    fasta_file_list = []
    if max_star_part_size and os.path.getsize(fasta_file) > max_star_part_size:
        fasta_file_list = PipelineStepGenerateHostGenome.split_fasta(
            fasta_file, max_star_part_size)
    else:
        fasta_file_list.append(fasta_file)

    for i in range(len(fasta_file_list)):
        log.write("start making STAR index part %d" % i)
        gtf_command_part = []
        if i == 0 and gtf_file:
            gtf_command_part = ["--sjdbGTFfile", gtf_file]

        star_genome_part_dir = f"{star_genome_dir_name}/part-{i}"
        command.make_dirs(star_genome_part_dir)
        star_command_params = [
            '--runThreadN', str(multiprocessing.cpu_count()),
            '--runMode', 'genomeGenerate',
            *gtf_command_part,
            '--genomeDir', star_genome_part_dir,
            '--genomeFastaFiles', fasta_file_list[i],
            '--limitGenomeGenerateRAM', virtual_memory().available
        ]
        command.execute(
            command_patterns.SingleCommand(cmd='STAR', args=star_command_params))
        log.write(f"finished making STAR index part {i}")

    # record the number of parts in parts.txt
    command.write_text_to_file(
        len(fasta_file_list),
        os.path.join(star_genome_dir_name, "parts.txt"))

    star_genome = os.path.basename(star_genome_dir_name)
    star_work_dir = os.path.dirname(star_genome_dir_name)
    command.execute(
        command_patterns.SingleCommand(
            cmd="tar",
            args=["cvf", output_star_genome_path, "-C", star_work_dir, star_genome]))
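# Hypothetical sketch of the split_fasta helper assumed above: greedily pack
# whole FASTA records into parts of at most max_size characters (a single
# record larger than max_size still becomes its own part). Sizes are counted
# in characters rather than bytes, which is close enough for sizing STAR
# index parts. Names and layout are assumptions, not the real implementation.
def split_fasta_sketch(fasta_file, max_size):
    # read whole records (header line plus its sequence lines)
    records, current = [], []
    with open(fasta_file) as f:
        for line in f:
            if line.startswith(">") and current:
                records.append("".join(current))
                current = []
            current.append(line)
    if current:
        records.append("".join(current))

    part_files, buf, buf_size = [], [], 0
    for rec in records:
        if buf and buf_size + len(rec) > max_size:
            part_files.append(write_part_sketch(fasta_file, len(part_files), buf))
            buf, buf_size = [], 0
        buf.append(rec)
        buf_size += len(rec)
    if buf:
        part_files.append(write_part_sketch(fasta_file, len(part_files), buf))
    return part_files

def write_part_sketch(fasta_file, index, records):
    part_path = f"{fasta_file}.part-{index}"
    with open(part_path, "w") as out:
        out.writelines(records)
    return part_path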
def assemble(input_fasta,
             input_fasta2,
             bowtie_fasta,  # fasta file for running bowtie against contigs
             duplicate_cluster_sizes_path,
             assembled_contig,
             assembled_scaffold,
             bowtie_sam,
             contig_stats,
             read2contig,
             memory=100):
    basedir = os.path.dirname(assembled_contig)
    assembled_dir = os.path.join(basedir, 'spades')
    command.make_dirs(assembled_dir)
    assembled_contig_tmp = os.path.join(assembled_dir, 'contigs.fasta')
    assembled_scaffold_tmp = os.path.join(assembled_dir, 'scaffolds.fasta')

    try:
        if input_fasta2:
            command.execute(
                command_patterns.SingleCommand(
                    cmd="spades.py",
                    args=["-1", input_fasta, "-2", input_fasta2,
                          "-o", assembled_dir, "-m", memory,
                          "-t", 32, "--only-assembler"]))
        else:
            command.execute(
                command_patterns.SingleCommand(
                    cmd="spades.py",
                    args=["-s", input_fasta,
                          "-o", assembled_dir, "-m", memory,
                          "-t", 32, "--only-assembler"]))
        command.move_file(assembled_contig_tmp, assembled_contig)
        command.move_file(assembled_scaffold_tmp, assembled_scaffold)

        PipelineStepRunAssembly.generate_read_to_contig_mapping(
            assembled_contig, bowtie_fasta, read2contig,
            duplicate_cluster_sizes_path, bowtie_sam, contig_stats)
    except Exception:  # narrowed from a bare except, which would also swallow KeyboardInterrupt
        # Assembly failed. Create dummy output files.
        command.write_text_to_file(';ASSEMBLY FAILED', assembled_contig)
        command.write_text_to_file(';ASSEMBLY FAILED', assembled_scaffold)
        command.write_text_to_file('@NO INFO', bowtie_sam)
        command.write_text_to_file('{}', contig_stats)
        traceback.print_exc()
    command.remove_rf(assembled_dir)
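# SPAdes names its contigs like "NODE_1_length_2845_cov_12.7". A hypothetical
# helper for pulling length and k-mer coverage out of such headers, the kind
# of information generate_read_to_contig_mapping can fold into contig_stats:
import re

SPADES_HEADER_SKETCH = re.compile(r"NODE_(\d+)_length_(\d+)_cov_([\d.]+)")

def parse_spades_header_sketch(header):
    match = SPADES_HEADER_SKETCH.search(header)
    if not match:
        return None
    node, length, cov = match.groups()
    return {"node": int(node), "length": int(length), "coverage": float(cov)}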
def run(self):
    '''
    1. summarize hits
    2. build blast index
    3. blast assembled contigs against the index
    4. update the summary
    '''
    _align_m8, deduped_m8, hit_summary, orig_counts_with_dcr = self.input_files_local[0]
    assembled_contig, _assembled_scaffold, bowtie_sam, _contig_stats = self.input_files_local[1]
    reference_fasta, = self.input_files_local[2]
    duplicate_cluster_sizes_path, = self.input_files_local[3]

    blast_m8, refined_m8, refined_hit_summary, refined_counts_with_dcr, contig_summary_json, blast_top_m8 = self.output_files_local()
    assert refined_counts_with_dcr.endswith("with_dcr.json"), self.output_files_local()
    assert orig_counts_with_dcr.endswith("with_dcr.json"), self.output_files_local()
    db_type = self.additional_attributes["db_type"]

    no_assembled_results = (
        os.path.getsize(assembled_contig) < MIN_ASSEMBLED_CONTIG_SIZE or
        os.path.getsize(reference_fasta) < MIN_REF_FASTA_SIZE)

    if no_assembled_results:
        # No assembled results or refseq fasta available.
        # Create empty output files.
        command.write_text_to_file(' ', blast_m8)
        command.write_text_to_file(' ', blast_top_m8)
        command.copy_file(deduped_m8, refined_m8)
        command.copy_file(hit_summary, refined_hit_summary)
        command.copy_file(orig_counts_with_dcr, refined_counts_with_dcr)
        command.write_text_to_file('[]', contig_summary_json)
        return  # return in the middle of the function

    (read_dict, accession_dict, _selected_genera) = m8.summarize_hits(hit_summary)
    PipelineStepBlastContigs.run_blast(db_type, blast_m8, assembled_contig,
                                       reference_fasta, blast_top_m8)
    read2contig = {}
    PipelineStepRunAssembly.generate_info_from_sam(bowtie_sam, read2contig,
                                                   duplicate_cluster_sizes_path)

    (updated_read_dict, read2blastm8, contig2lineage, added_reads) = self.update_read_dict(
        read2contig, blast_top_m8, read_dict, accession_dict, db_type)
    self.generate_m8_and_hit_summary(updated_read_dict, added_reads,
                                     read2blastm8, hit_summary, deduped_m8,
                                     refined_hit_summary, refined_m8)

    # Generating taxon counts based on updated results
    lineage_db = s3.fetch_reference(
        self.additional_files["lineage_db"],
        self.ref_dir_local,
        allow_s3mi=False)  # Too small to waste s3mi

    deuterostome_db = None
    if self.additional_files.get("deuterostome_db"):
        deuterostome_db = s3.fetch_reference(self.additional_files["deuterostome_db"],
                                             self.ref_dir_local,
                                             allow_s3mi=False)  # Too small for s3mi

    blacklist_s3_file = self.additional_files.get('taxon_blacklist', DEFAULT_BLACKLIST_S3)
    taxon_blacklist = s3.fetch_reference(blacklist_s3_file, self.ref_dir_local)

    taxon_whitelist = None
    if self.additional_attributes.get("use_taxon_whitelist"):
        taxon_whitelist = s3.fetch_reference(
            self.additional_files.get("taxon_whitelist", DEFAULT_WHITELIST_S3),
            self.ref_dir_local)

    with TraceLock("PipelineStepBlastContigs-CYA", PipelineStepBlastContigs.cya_lock, debug=False):
        with log.log_context("PipelineStepBlastContigs",
                             {"substep": "generate_taxon_count_json_from_m8",
                              "db_type": db_type,
                              "refined_counts": refined_counts_with_dcr}):
            m8.generate_taxon_count_json_from_m8(
                refined_m8, refined_hit_summary, db_type.upper(),
                lineage_db, deuterostome_db, taxon_whitelist, taxon_blacklist,
                duplicate_cluster_sizes_path, refined_counts_with_dcr)

    # generate contig stats at genus/species level
    with log.log_context("PipelineStepBlastContigs", {"substep": "generate_taxon_summary"}):
        contig_taxon_summary = self.generate_taxon_summary(
            read2contig,
            contig2lineage,
            updated_read_dict,
            added_reads,
            db_type,
            duplicate_cluster_sizes_path,
            # same filter as applied in generate_taxon_count_json_from_m8
            m8.build_should_keep_filter(deuterostome_db, taxon_whitelist, taxon_blacklist)
        )

    with log.log_context("PipelineStepBlastContigs",
                         {"substep": "generate_taxon_summary_json",
                          "contig_summary_json": contig_summary_json}):
        with open(contig_summary_json, 'w') as contig_outf:
            json.dump(contig_taxon_summary, contig_outf)

    # Upload additional file
    contig2lineage_json = os.path.join(os.path.dirname(contig_summary_json),
                                       f"contig2lineage.{db_type}.json")
    with log.log_context("PipelineStepBlastContigs",
                         {"substep": "contig2lineage_json",
                          "contig2lineage_json": contig2lineage_json}):
        with open(contig2lineage_json, 'w') as c2lf:
            json.dump(contig2lineage, c2lf)

    self.additional_output_files_hidden.append(contig2lineage_json)
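# Hypothetical sketch of the BLAST invocation that run_blast likely wraps:
# for a nucleotide database (db_type "nt") contigs go through blastn, for a
# protein database ("nr") through blastx, and "-outfmt 6" produces the
# tab-separated m8 format used throughout this step. Any extra flags the
# real method passes (e.g. e-value cutoffs) are not shown here.
import subprocess

def run_blast_sketch(db_type, assembled_contig, reference_fasta, blast_m8):
    subprocess.check_call([
        "makeblastdb", "-in", reference_fasta,
        "-dbtype", "nucl" if db_type == "nt" else "prot"])
    blast_cmd = "blastn" if db_type == "nt" else "blastx"
    subprocess.check_call([
        blast_cmd, "-query", assembled_contig, "-db", reference_fasta,
        "-outfmt", "6", "-out", blast_m8])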
def test_write_text_to_file(self):
    '''WHEN write_text_to_file is invoked, THEN write the provided string to the file'''
    command.write_text_to_file("done", TMP_FILE)

    self.assertEqual(file_contents(TMP_FILE), "done\n")
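# Given the trailing "\n" this test asserts, a minimal implementation
# consistent with the observed behavior would be (hypothetical sketch; the
# real command.write_text_to_file lives in idseq_dag.util.command):
def write_text_to_file_sketch(text, file_path):
    with open(file_path, "w") as f:
        print(text, file=f)  # print() appends the newline the test expects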
def run(self):
    '''
    1. summarize hits
    2. build blast index
    3. blast assembled contigs against the index
    4. update the summary
    '''
    (_align_m8, deduped_m8, hit_summary, orig_counts) = self.input_files_local[0]
    assembled_contig, _assembled_scaffold, bowtie_sam, _contig_stats = self.input_files_local[1]
    reference_fasta = self.input_files_local[2][0]
    (blast_m8, refined_m8, refined_hit_summary, refined_counts,
     contig_summary_json, blast_top_m8) = self.output_files_local()
    db_type = self.additional_attributes["db_type"]

    if os.path.getsize(assembled_contig) < MIN_ASSEMBLED_CONTIG_SIZE or \
       os.path.getsize(reference_fasta) < MIN_REF_FASTA_SIZE:
        # No assembled results or refseq fasta available.
        # Create empty output files.
        command.write_text_to_file(' ', blast_m8)
        command.write_text_to_file(' ', blast_top_m8)
        command.copy_file(deduped_m8, refined_m8)
        command.copy_file(hit_summary, refined_hit_summary)
        command.copy_file(orig_counts, refined_counts)
        command.write_text_to_file('[]', contig_summary_json)
        return  # return in the middle of the function

    (read_dict, accession_dict, _selected_genera) = m8.summarize_hits(hit_summary)
    PipelineStepBlastContigs.run_blast(db_type, blast_m8, assembled_contig,
                                       reference_fasta, blast_top_m8)

    read2contig = {}
    contig_stats = defaultdict(int)
    PipelineStepRunAssembly.generate_info_from_sam(bowtie_sam, read2contig, contig_stats)

    (updated_read_dict, read2blastm8, contig2lineage,
     added_reads) = self.update_read_dict(read2contig, blast_top_m8, read_dict,
                                          accession_dict, db_type)
    self.generate_m8_and_hit_summary(updated_read_dict, added_reads,
                                     read2blastm8, hit_summary, deduped_m8,
                                     refined_hit_summary, refined_m8)

    # Generating taxon counts based on updated results
    lineage_db = s3.fetch_reference(
        self.additional_files["lineage_db"],
        self.ref_dir_local,
        allow_s3mi=False)  # Too small to waste s3mi
    deuterostome_db = None
    evalue_type = 'raw'
    if self.additional_files.get("deuterostome_db"):
        deuterostome_db = s3.fetch_reference(
            self.additional_files["deuterostome_db"],
            self.ref_dir_local,
            allow_s3mi=False)  # Too small for s3mi

    with TraceLock("PipelineStepBlastContigs-CYA", PipelineStepBlastContigs.cya_lock, debug=False):
        with log.log_context("PipelineStepBlastContigs",
                             {"substep": "generate_taxon_count_json_from_m8",
                              "db_type": db_type,
                              "refined_counts": refined_counts}):
            m8.generate_taxon_count_json_from_m8(
                refined_m8, refined_hit_summary, evalue_type, db_type.upper(),
                lineage_db, deuterostome_db, refined_counts)

    # generate contig stats at genus/species level
    with log.log_context("PipelineStepBlastContigs", {"substep": "generate_taxon_summary"}):
        contig_taxon_summary = self.generate_taxon_summary(
            read2contig, contig2lineage, updated_read_dict, added_reads, db_type)

    with log.log_context("PipelineStepBlastContigs",
                         {"substep": "generate_taxon_summary_json",
                          "contig_summary_json": contig_summary_json}):
        with open(contig_summary_json, 'w') as contig_outf:
            json.dump(contig_taxon_summary, contig_outf)

    # Upload additional file
    contig2lineage_json = os.path.join(os.path.dirname(contig_summary_json),
                                       f"contig2lineage.{db_type}.json")
    with log.log_context("PipelineStepBlastContigs",
                         {"substep": "contig2lineage_json",
                          "contig2lineage_json": contig2lineage_json}):
        with open(contig2lineage_json, 'w') as c2lf:
            json.dump(contig2lineage, c2lf)

    self.additional_output_files_hidden.append(contig2lineage_json)
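# The "m8" files handled above are BLAST tabular (outfmt 6) output. A
# hypothetical parser for the 12 standard columns, the sort of record
# update_read_dict-style post-processing reads from blast_top_m8:
def parse_m8_line_sketch(line):
    (qseqid, sseqid, pident, length, mismatch, gapopen,
     qstart, qend, sstart, send, evalue, bitscore) = line.rstrip("\n").split("\t")
    return {
        "read_or_contig_id": qseqid,
        "accession": sseqid,
        "pident": float(pident),
        "alignment_length": int(length),
        "evalue": float(evalue),
        "bitscore": float(bitscore),
    }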