def run(self):
    (_align_m8, _deduped_m8, hit_summary, _orig_counts) = self.input_files_local[0]
    output_reference_fasta = self.output_files_local()[0]
    loc_db = s3.fetch_reference(
        self.additional_files["loc_db"],
        self.ref_dir_local,
        auto_unzip=True,  # This is the default for references, but let's be explicit.
        allow_s3mi=ALLOW_S3MI)
    db_s3_path = self.additional_attributes["db"]
    # db_type = self.additional_attributes["db_type"]

    (_read_dict, accession_dict, _selected_genera) = m8.summarize_hits(hit_summary)

    with open_file_db_by_extension(loc_db, IdSeqDictValue.VALUE_TYPE_ARRAY) as loc_dict:
        db_path = s3.fetch_reference(
            db_s3_path,
            self.ref_dir_local,
            auto_unzip=True,  # This is the default for references, but let's be explicit.
            allow_s3mi=ALLOW_S3MI)
        self.download_ref_sequences_from_file(accession_dict, loc_dict, db_path,
                                              output_reference_fasta)
def run(self):
    ''' Run alignment remotely '''
    alignment_algorithm_inputs = PipelineStepRunAlignment._alignment_algorithm_inputs(
        self.input_files_local[0])
    duplicate_cluster_sizes_path, = self.input_files_local[1]
    output_m8, deduped_output_m8, output_hitsummary, output_counts_with_dcr_json = self.output_files_local()
    assert output_counts_with_dcr_json.endswith("_with_dcr.json"), self.output_files_local()

    if self.is_local_run:
        self.run_locally(alignment_algorithm_inputs[self.alignment_algorithm], output_m8)
    else:
        self.run_remotely(alignment_algorithm_inputs[self.alignment_algorithm], output_m8)

    # get database
    lineage_db = fetch_reference(self.additional_files["lineage_db"], self.ref_dir_local)
    accession2taxid_db = fetch_reference(
        self.additional_files["accession2taxid_db"],
        self.ref_dir_local,
        allow_s3mi=True)
    min_alignment_length = NT_MIN_ALIGNMENT_LEN if self.alignment_algorithm == 'gsnap' else 0
    m8.call_hits_m8(output_m8, lineage_db, accession2taxid_db,
                    deduped_output_m8, output_hitsummary, min_alignment_length)

    db_type = 'NT' if self.alignment_algorithm == 'gsnap' else 'NR'

    deuterostome_db = None
    if self.additional_files.get("deuterostome_db"):
        deuterostome_db = fetch_reference(
            self.additional_files["deuterostome_db"],
            self.ref_dir_local,
            allow_s3mi=True)

    blacklist_s3_file = self.additional_files.get('taxon_blacklist', DEFAULT_BLACKLIST_S3)
    taxon_blacklist = fetch_reference(blacklist_s3_file, self.ref_dir_local)

    taxon_whitelist = None
    if self.additional_attributes.get("use_taxon_whitelist"):
        taxon_whitelist = fetch_reference(
            self.additional_files.get("taxon_whitelist", DEFAULT_WHITELIST_S3),
            self.ref_dir_local)

    m8.generate_taxon_count_json_from_m8(
        deduped_output_m8, output_hitsummary, db_type,
        lineage_db, deuterostome_db, taxon_whitelist, taxon_blacklist,
        duplicate_cluster_sizes_path, output_counts_with_dcr_json)
def create_taxon_count_file(self):
    # TODO: Can this be consolidated throughout the pipeline?
    # This setup is mostly repeated in three steps. The list of taxa does not seem to change.
    count_type = 'merged_NT_NR'
    lineage_db = fetch_reference(
        self.additional_files["lineage_db"],
        self.ref_dir_local,
        allow_s3mi=False)

    deuterostome_db = None
    if self.additional_files.get("deuterostome_db"):
        deuterostome_db = fetch_reference(
            self.additional_files["deuterostome_db"],
            self.ref_dir_local,
            allow_s3mi=False)  # Too small for s3mi

    taxon_whitelist = None
    if self.additional_attributes.get("use_taxon_whitelist"):
        taxon_whitelist = fetch_reference(
            self.additional_files.get("taxon_whitelist", DEFAULT_WHITELIST_S3),
            self.ref_dir_local)

    blacklist_s3_file = self.additional_files.get('taxon_blacklist', DEFAULT_BLACKLIST_S3)
    taxon_blacklist = fetch_reference(blacklist_s3_file, self.ref_dir_local)

    cdhit_cluster_sizes_path = self.inputs.cluster_sizes_filename

    generate_taxon_count_json_from_m8(
        self.outputs.merged_m8_filename,
        self.outputs.merged_hit_filename,
        count_type,
        lineage_db,
        deuterostome_db,
        taxon_whitelist,
        taxon_blacklist,
        cdhit_cluster_sizes_path,
        self.outputs.merged_taxon_count_filename)
def generate_mapped_reads_tsv(self):
    """Use bedtools to generate a table of mapped reads for each genome in the ARG ANNOT database.
    If a new resistance gene db is used, the .bed file will need to be updated manually."""
    bed_file_path = fetch_reference(
        self.additional_files["resist_genome_bed"],
        self.ref_dir_local,
        allow_s3mi=False)
    sample_bam_file_path = self.output_files_local()[5]

    tmp_sort_dir = os.path.join(self.output_dir_local, "tmp_sort")
    command.make_dirs(tmp_sort_dir)

    # Convert the sorted.bam output from SRST2 to the bed format, then sort the bed file.
    # This allows us to use the "sorted" mode of bedtools coverage, which is memory-efficient.
    # Otherwise, large sorted.bam files will cause our machines to run out of RAM.
    #
    # Note that despite being called "sorted.bam", the bam is not sorted the way we need it to be.
    #
    # env LC_ALL=C ensures that the sort command uses the same sort order on all machines.
    #
    # The -T flag with tmp_sort_dir ensures that we make tmp files inside /mnt, which is where
    # our huge AWS volumes are mounted. By default, the sort command creates temp files in /tmp,
    # which has very little disk space.
    command.execute(
        command_patterns.ShellScriptCommand(
            script='''
                bedtools bamtobed -i "$1" |
                env LC_ALL=C sort -T "$2" -k1,1 -k2,2n |
                bedtools coverage -sorted -a "$3" -b stdin > "$4";''',
            args=[
                sample_bam_file_path,
                tmp_sort_dir,
                bed_file_path,
                os.path.join(self.output_dir_local, MATCHED_READS_FILE)
            ]))
    command.remove_rf(tmp_sort_dir)
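# Illustrative sketch (not part of this step) of the same memory-frugal bedtools pattern,
# assuming bedtools, bash, and GNU sort are on PATH; the helper name and file paths are
# hypothetical.
import os
import subprocess


def coverage_table(bam_path, bed_path, out_path, tmp_dir):
    """Stream BAM -> BED -> sorted BED -> per-interval coverage without loading the BAM into RAM."""
    os.makedirs(tmp_dir, exist_ok=True)
    script = (
        f'bedtools bamtobed -i "{bam_path}" '
        f'| env LC_ALL=C sort -T "{tmp_dir}" -k1,1 -k2,2n '  # byte-wise order, temp files on the big volume
        f'| bedtools coverage -sorted -a "{bed_path}" -b stdin > "{out_path}"'
    )
    subprocess.run(["bash", "-c", script], check=True)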
def run(self):
    input_fas = self.input_files_local[0][0:2]
    output_fas = self.output_files_local()
    output_sam_file = os.path.join(
        self.output_dir_local,
        self.additional_attributes["output_sam_file"])
    self.additional_output_files_hidden.append(output_sam_file)

    genome_dir = fetch_reference(self.additional_files["gsnap_genome"],
                                 self.ref_dir_local,
                                 allow_s3mi=True,
                                 auto_untar=True)
    gsnap_base_dir = os.path.dirname(genome_dir)
    gsnap_index_name = os.path.basename(genome_dir)

    # Run GSNAP
    gsnap_params = [
        '-A', 'sam', '--batch=0', '--use-shared-memory=0',
        '--gmap-mode=all', '--npaths=1', '--ordered', '-t', 32,
        '--max-mismatches=40', '-D', gsnap_base_dir, '-d', gsnap_index_name,
        '-o', output_sam_file
    ] + input_fas
    command.execute(
        command_patterns.SingleCommand(cmd='gsnapl', args=gsnap_params))
    log.write("Finished GSNAP alignment.")

    # Extract unmapped reads from the sam file
    if len(input_fas) == 2:
        convert.generate_unmapped_pairs_from_sam(output_sam_file, output_fas)
    else:
        convert.generate_unmapped_singles_from_sam(output_sam_file, output_fas[0])
def run(self):
    # Setup
    if len(self.input_files_local) > 1:
        input_fa_name = self.input_files_local[0][0]
        hit_summary_files = {
            'NT': self.input_files_local[1][2],
            'NR': self.input_files_local[2][2]
        }
    else:
        # TODO(yf): Old implementation. TO BE DEPRECATED once 3.1 is fully deployed
        input_files = self.input_files_local[0]
        input_fa_name = input_files[0]
        hit_summary_files = {'NT': input_files[1], 'NR': input_files[2]}

    # Open lineage db
    lineage_db = s3.fetch_reference(self.additional_files["lineage_db"],
                                    self.ref_dir_local,
                                    allow_s3mi=True)

    # Get primary hit mappings
    valid_hits = PipelineStepGenerateTaxidFasta.parse_hits(hit_summary_files)

    with open(input_fa_name, 'rb') as input_fa, \
         open(self.output_files_local()[0], 'wb') as output_fa, \
         open_file_db_by_extension(lineage_db, IdSeqDictValue.VALUE_TYPE_ARRAY) as lineage_map:  # noqa
        seq_name = input_fa.readline()
        seq_data = input_fa.readline()
        while len(seq_name) > 0 and len(seq_data) > 0:
            # Example read_id: "NR::NT:CP010376.2:NB501961:14:HM7TLBGX2:1:23109
            # :12720:8743/2"
            # Translate the read information into our custom format with fake
            # taxids at non-specific hit levels.
            annotated_read_id = seq_name.decode("utf-8").rstrip().lstrip('>')
            read_id = annotated_read_id.split(":", 4)[-1]

            nr_taxid_species, nr_taxid_genus, nr_taxid_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                valid_hits, lineage_map, read_id, 'NR')
            nt_taxid_species, nt_taxid_genus, nt_taxid_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                valid_hits, lineage_map, read_id, 'NT')

            fields = ["family_nr", nr_taxid_family, "family_nt", nt_taxid_family]
            fields += ["genus_nr", nr_taxid_genus, "genus_nt", nt_taxid_genus]
            fields += ["species_nr", nr_taxid_species, "species_nt", nt_taxid_species]
            fields += [annotated_read_id]
            new_read_name = ('>' + ':'.join(fields) + '\n').encode()

            output_fa.write(new_read_name)
            output_fa.write(seq_data)
            seq_name = input_fa.readline()
            seq_data = input_fa.readline()
def run(self):
    if len(self.input_files_local) > 1:
        input_fa_name = self.input_files_local[0][0]
        nt_hit_summary_path, nr_hit_summary_path = self.input_files_local[1][2], self.input_files_local[2][2]
    else:
        # This is used in `short-read-mngs/experimental.wdl`
        input_fa_name = self.input_files_local[0][0]
        nt_hit_summary_path, nr_hit_summary_path = self.input_files_local[0][1], self.input_files_local[0][2]

    # Open lineage db
    lineage_db = s3.fetch_reference(self.additional_files["lineage_db"],
                                    self.ref_dir_local,
                                    allow_s3mi=True)

    with open(nt_hit_summary_path) as nt_hit_summary_f, open(nr_hit_summary_path) as nr_hit_summary_f:
        nr_hits_by_read_id = {
            row["read_id"]: (row["taxid"], row["level"])
            for row in HitSummaryMergedReader(nr_hit_summary_f)
        }
        nt_hits_by_read_id = {
            row["read_id"]: (row["taxid"], row["level"])
            for row in HitSummaryMergedReader(nt_hit_summary_f)
        }

    with open(self.output_files_local()[0], "w") as output_fa, \
         open_file_db_by_extension(lineage_db) as lineage_map:  # noqa
        for read in fasta.iterator(input_fa_name):
            # Example read_id: "NR::NT:CP010376.2:NB501961:14:HM7TLBGX2:1:23109
            # :12720:8743/2"
            # Translate the read information into our custom format with fake
            # taxids at non-specific hit levels.
            # TODO: (tmorse) fasta parsing
            annotated_read_id = read.header.lstrip('>')
            read_id = annotated_read_id.split(":", 4)[-1]

            nr_taxid_species, nr_taxid_genus, nr_taxid_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                nr_hits_by_read_id, lineage_map, read_id)
            nt_taxid_species, nt_taxid_genus, nt_taxid_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                nt_hits_by_read_id, lineage_map, read_id)

            fields = ["family_nr", nr_taxid_family, "family_nt", nt_taxid_family]
            fields += ["genus_nr", nr_taxid_genus, "genus_nt", nt_taxid_genus]
            fields += ["species_nr", nr_taxid_species, "species_nt", nt_taxid_species]
            fields += [annotated_read_id]
            new_read_name = ('>' + ':'.join(fields) + '\n')

            output_fa.write(new_read_name)
            output_fa.write(read.sequence + "\n")
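# Toy illustration (not from the pipeline) of the header rewrite performed above; the helper
# name and the taxid values are made up.
def annotate_header(annotated_read_id, nr, nt):
    """nr/nt are (species, genus, family) taxid strings for the NR and NT hits."""
    fields = ["family_nr", nr[2], "family_nt", nt[2],
              "genus_nr", nr[1], "genus_nt", nt[1],
              "species_nr", nr[0], "species_nt", nt[0],
              annotated_read_id]
    return '>' + ':'.join(fields)


# annotate_header("NR::NT:CP010376.2:read1/2", ("562", "561", "543"), ("562", "561", "543"))
# => '>family_nr:543:family_nt:543:genus_nr:561:genus_nt:561:species_nr:562:species_nt:562:NR::NT:CP010376.2:read1/2'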
def get_common_params(self):
    """Helper that gets srst2 parameters common to both paired and single reads."""
    # fetch_reference caches the gene db locally; it is too small to benefit from s3mi.
    db_file_path = fetch_reference(
        self.additional_files["resist_gene_db"],
        self.ref_dir_local,
        allow_s3mi=False)  # too small for s3mi

    # srst2 expects these to be strings; in the dag they could be passed in as numbers.
    min_cov = str(self.additional_attributes['min_cov'])
    n_threads = str(self.additional_attributes['n_threads'])
    return ['--min_coverage', min_cov, '--threads', n_threads,
            '--output', os.path.join(self.output_dir_local, 'output'),
            '--log', '--gene_db', db_file_path]
def run(self):
    input_fas = self.input_fas()
    output_fas = self.output_files_local()
    genome_dir = fetch_reference(
        self.additional_files["bowtie2_genome"],
        self.ref_dir_local,
        allow_s3mi=True,
        auto_untar=True)
    output_sam_file = os.path.join(
        self.output_dir_local,
        self.additional_attributes["output_sam_file"])
    self.additional_output_files_hidden.append(output_sam_file)

    # The file structure looks like
    # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2"
    genome_basename = command.glob(f"{genome_dir}/*.bt2*", max_results=1)[0]
    # remove two extensions: ex: hg38_phiX_rRNA_mito_ERCC.3.bt2 -> hg38_phiX_rRNA_mito_ERCC
    genome_basename = os.path.splitext(os.path.splitext(genome_basename)[0])[0]
    bowtie2_params = [
        '-q', '-x', genome_basename, '-f',
        '--very-sensitive-local', '-S', output_sam_file
    ]

    # --seed cannot be used with -p multithreading
    # We have observed the lack of multithreading resulting in
    # severe performance degradation in some cases. So for the
    # time being multithreading is being chosen over determinism.
    # To seed bowtie2 do something similar to:
    #   bowtie2_params.extend(['--seed', '4'])
    bowtie2_params.extend(['-p', str(multiprocessing.cpu_count())])

    if len(input_fas) == 2:
        bowtie2_params.extend(['-1', input_fas[0], '-2', input_fas[1]])
    else:
        bowtie2_params.extend(['-U', input_fas[0]])

    # Example:
    # bowtie2 -q -x /mnt/idseq/ref/bowtie2_genome/hg38_phiX_rRNA_mito_ERCC -f \
    #   --very-sensitive-local -S /mnt/idseq/results/589/bowtie2_human.sam \
    #   -p 32 \
    #   -1 /mnt/idseq/results/589/unmapped_human_1.fa -2 /mnt/idseq/results/589/unmapped_human_2.fa
    command.execute(
        command_patterns.SingleCommand(
            cmd='bowtie2',
            args=bowtie2_params
        )
    )
    log.write("Finished Bowtie alignment.")

    if len(input_fas) == 2:
        convert.generate_unmapped_pairs_from_sam(output_sam_file, output_fas)
    else:
        convert.generate_unmapped_singles_from_sam(output_sam_file, output_fas[0])
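# Small standalone sketch of the index-name derivation used above, assuming genome_dir
# holds files like "hg38_phiX_rRNA_mito_ERCC.3.bt2" (names are examples); the helper is
# hypothetical.
import glob
import os


def bowtie2_index_basename(genome_dir):
    """Strip the ".<n>.bt2"/".bt2l" suffix from one index file to get bowtie2's -x argument."""
    first_index = sorted(glob.glob(f"{genome_dir}/*.bt2*"))[0]
    without_bt2 = os.path.splitext(first_index)[0]   # drops ".bt2" or ".bt2l"
    return os.path.splitext(without_bt2)[0]          # drops the part number, e.g. ".3"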
def run(self):
    input_fas = self.input_files_local[0][0:2]
    output_fas = self.output_files_local()
    genome_dir = fetch_reference(
        self.additional_files["bowtie2_genome"],
        self.ref_dir_local,
        allow_s3mi=True,
        auto_untar=True)
    output_sam_file = os.path.join(
        self.output_dir_local,
        self.additional_attributes["output_sam_file"])
    self.additional_output_files_hidden.append(output_sam_file)

    # The file structure looks like
    # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2"
    genome_basename = command.glob(f"{genome_dir}/*.bt2*", max_results=1)[0]
    # remove two extensions: ex: hg38_phiX_rRNA_mito_ERCC.3.bt2 -> hg38_phiX_rRNA_mito_ERCC
    genome_basename = os.path.splitext(os.path.splitext(genome_basename)[0])[0]
    bowtie2_params = [
        '-q', '-x', genome_basename, '-f',
        '--very-sensitive-local', '-S', output_sam_file
    ]

    seed = self.additional_attributes.get("random_seed")
    if seed:
        bowtie2_params.extend(['--seed', str(seed)])
    else:
        # Seed option won't work with -p threading option.
        bowtie2_params.extend(['-p', str(multiprocessing.cpu_count())])

    if len(input_fas) == 2:
        bowtie2_params.extend(['-1', input_fas[0], '-2', input_fas[1]])
    else:
        bowtie2_params.extend(['-U', input_fas[0]])

    # Example:
    # bowtie2 -q -x /mnt/idseq/ref/bowtie2_genome/hg38_phiX_rRNA_mito_ERCC -f \
    #   --very-sensitive-local -S /mnt/idseq/results/589/bowtie2_human.sam \
    #   -p 32 \
    #   -1 /mnt/idseq/results/589/unmapped_human_1.fa -2 /mnt/idseq/results/589/unmapped_human_2.fa
    command.execute(
        command_patterns.SingleCommand(
            cmd='bowtie2',
            args=bowtie2_params
        )
    )
    log.write("Finished Bowtie alignment.")

    if len(input_fas) == 2:
        convert.generate_unmapped_pairs_from_sam(output_sam_file, output_fas)
    else:
        convert.generate_unmapped_singles_from_sam(output_sam_file, output_fas[0])
def run(self): input_fas = self.input_fas() output_fas = self.output_files_local() genome_dir = fetch_reference(self.additional_files["bowtie2_genome"], self.ref_dir_local, allow_s3mi=True, auto_untar=True) output_sam_file = os.path.join( self.output_dir_local, self.additional_attributes["output_sam_file"]) self.additional_output_files_hidden.append(output_sam_file) # The file structure looks like # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2" genome_basename = command.glob(f"{genome_dir}/*.bt2*", max_results=1)[0] # remove two extensions: ex: hg38_phiX_rRNA_mito_ERCC.3.bt2 -> hg38_phiX_rRNA_mito_ERCC genome_basename = os.path.splitext( os.path.splitext(genome_basename)[0])[0] bowtie2_params = [ '-q', '-x', genome_basename, '-f', '--very-sensitive-local', '-S', output_sam_file ] # FIXME: https://jira.czi.team/browse/IDSEQ-2738 # We want to move towards a general randomness solution in which # all randomness is seeded based on the content of the original input. # This is currently introducing non-determinism and hard coding # an arbitrary seed here shouldn't impact correctness. bowtie2_params.extend( ['--seed', '4']) # chosen by fair dice role, guaranteed to be random if len(input_fas) == 2: bowtie2_params.extend(['-1', input_fas[0], '-2', input_fas[1]]) else: bowtie2_params.extend(['-U', input_fas[0]]) # Example: # bowtie2 -q -x /mnt/idseq/ref/bowtie2_genome/hg38_phiX_rRNA_mito_ERCC -f \ # --very-sensitive-local -S /mnt/idseq/results/589/bowtie2_human.sam \ # -p 32 \ # -1 /mnt/idseq/results/589/unmapped_human_1.fa -2 /mnt/idseq/results/589/unmapped_human_2.fa command.execute( command_patterns.SingleCommand(cmd='bowtie2', args=bowtie2_params)) log.write("Finished Bowtie alignment.") if len(input_fas) == 2: convert.generate_unmapped_pairs_from_sam(output_sam_file, output_fas) else: convert.generate_unmapped_singles_from_sam(output_sam_file, output_fas[0])
def run(self):
    ''' Run alignment remotely '''
    input_fas = self.get_input_fas()
    [output_m8, deduped_output_m8, output_hitsummary, output_counts_json] = self.output_files_local()
    service = self.additional_attributes["service"]
    assert service in ("gsnap", "rapsearch2")
    min_alignment_length = 36 if service == 'gsnap' else 0  # alignments < 36-NT are false positives
    self.run_remotely(input_fas, output_m8, service)

    # get database
    lineage_db = fetch_reference(self.additional_files["lineage_db"], self.ref_dir_local)
    accession2taxid_db = fetch_reference(
        self.additional_files["accession2taxid_db"],
        self.ref_dir_local,
        allow_s3mi=True)
    blacklist_s3_file = self.additional_attributes.get('taxon_blacklist', DEFAULT_BLACKLIST_S3)
    taxon_blacklist = fetch_reference(blacklist_s3_file, self.ref_dir_local)
    m8.call_hits_m8(output_m8, lineage_db, accession2taxid_db,
                    deduped_output_m8, output_hitsummary, min_alignment_length,
                    taxon_blacklist)

    # check deuterostome
    deuterostome_db = None
    db_type = 'NT' if service == 'gsnap' else 'NR'
    evalue_type = 'log10' if service == 'rapsearch2' else 'raw'
    if self.additional_files.get("deuterostome_db"):
        deuterostome_db = fetch_reference(
            self.additional_files["deuterostome_db"],
            self.ref_dir_local,
            allow_s3mi=True)
    m8.generate_taxon_count_json_from_m8(
        deduped_output_m8, output_hitsummary, evalue_type, db_type,
        lineage_db, deuterostome_db, output_counts_json)
def run(self): """ Trim any residual Illumina adapters. Discard any reads that become too short. See: http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/TrimmomaticManual_V0.32.pdf """ input_files = self.input_files_local[0][0:2] output_files = self.output_files_local() is_paired = (len(input_files) == 2) adapter_fasta = s3.fetch_reference( self.additional_files["adapter_fasta"], self.ref_dir_local) if fasta.input_file_type(input_files[0]) != 'fastq': # Not fastq for in_file, out_file in zip(input_files, output_files): command.copy_file(in_file, out_file) return if is_paired: paired_arg = "PE" output_args = [ output_files[0], # R1, paired, to be kept f"{output_files[0]}__unpaired", # R1, no longer paired, to be discarded output_files[1], # R2, paired, to be kept f"{output_files[1]}__unpaired" ] # R2, no longer paired, to be discarded else: paired_arg = "SE" output_args = output_files params = [ "-jar", "/usr/local/bin/trimmomatic-0.38.jar", paired_arg, "-phred33", *input_files, *output_args, f"ILLUMINACLIP:{adapter_fasta}:2:30:10:8:true", # Remove Illumina adapters provided in the fasta file. Initially, look for seed matches # allowing maximally *2* mismatches. These seeds will be extended and clipped if in the case of paired end # reads a score of *30* is reached, or in the case of single ended reads a # score of *10*. # additional parameters: minAdapterLength = 8, keepBothReads = true; these are set to require pairs to be # kept even when an adapter read-through occurs and R2 is a direct reverse complement of R1. "MINLEN:35" # Discard reads which are less than *75* bases long after these steps. ] command.execute(command_patterns.SingleCommand(cmd="java", args=params))
def run(self): """ Extract data from input files. Generate coverage viz data. Output JSON output files. """ max_num_bins_coverage = self.additional_attributes.get( "max_num_bins_coverage", MAX_NUM_BINS_COVERAGE) num_accessions_per_taxon = self.additional_attributes.get( "num_accessions_per_taxon", NUM_ACCESSIONS_PER_TAXON) min_contig_size = self.additional_attributes.get( "min_contig_size", MIN_CONTIG_SIZE) info_db = s3.fetch_reference(self.additional_files["info_db"], self.ref_dir_local, allow_s3mi=True) with open_file_db_by_extension( info_db, IdSeqDictValue.VALUE_TYPE_ARRAY) as info_dict: # Extract data from input files. (taxon_data, accession_data, contig_data, read_data) = self.prepare_data(self.input_files_local, info_dict, min_contig_size, num_accessions_per_taxon) # Generate the coverage viz data for each accession. coverage_viz_data = self.generate_coverage_viz_data( accession_data, contig_data, read_data, max_num_bins_coverage) # Generate the summary data, which contains a dict of all taxons for which coverage viz data is available. # For each taxon, summary data for the best accessions, plus the number of total accessions, is included. coverage_viz_summary_data = self.generate_coverage_viz_summary_data( taxon_data, accession_data, coverage_viz_data) coverage_viz_summary = self.output_files_local()[0] # Write the summary JSON file which is initially loaded on the report page. with open(coverage_viz_summary, 'w') as cvs: json.dump(coverage_viz_summary_data, cvs) # Create a separate coverage viz JSON file for each accession. # This file will be passed to the front-end when the user views that particular accession. coverage_viz_dir = os.path.join(self.output_dir_local, "coverage_viz") command.make_dirs(coverage_viz_dir) for accession_id in coverage_viz_data: upload_file = os.path.join(coverage_viz_dir, f"{accession_id}_coverage_viz.json") with open(upload_file, 'w') as uf: json.dump(coverage_viz_data[accession_id], uf) self.additional_output_folders_hidden.append(coverage_viz_dir)
def run(self):
    input_fas = self.input_fas()
    output_fas = self.output_files_local()
    output_sam_file = os.path.join(self.output_dir_local,
                                   self.additional_attributes["output_sam_file"])
    self.additional_output_files_hidden.append(output_sam_file)

    genome_dir = fetch_reference(self.additional_files["gsnap_genome"],
                                 self.ref_dir_local,
                                 allow_s3mi=True,
                                 auto_untar=True)
    gsnap_base_dir = os.path.dirname(genome_dir)
    gsnap_index_name = os.path.basename(genome_dir)

    # Hack to determine gsnap vs gsnapl
    error_message = subprocess.run(
        ['gsnapl', '-D', gsnap_base_dir, '-d', gsnap_index_name],
        input='>'.encode('utf-8'),
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE
    ).stderr
    gsnap_exe = "gsnap" if 'please run gsnap instead' in error_message.decode('utf-8') else "gsnapl"

    # Run GSNAP
    gsnap_params = [
        '-A', 'sam', '--batch=0', '--use-shared-memory=0',
        '--gmap-mode=all', '--npaths=1', '--ordered', '-t', 32,
        '--max-mismatches=40', '-D', gsnap_base_dir, '-d', gsnap_index_name,
        '-o', output_sam_file
    ] + input_fas
    command.execute(
        command_patterns.SingleCommand(
            cmd=gsnap_exe,
            args=gsnap_params
        )
    )
    log.write("Finished GSNAP alignment.")

    # Extract unmapped reads from the sam file
    if len(input_fas) == 2:
        convert.generate_unmapped_pairs_from_sam(output_sam_file, output_fas)
    else:
        convert.generate_unmapped_singles_from_sam(output_sam_file, output_fas[0])
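# The gsnap-vs-gsnapl probe above, restated as a standalone helper for clarity. It assumes
# the gsnapl binary is installed and on PATH; the function name is hypothetical.
import subprocess


def pick_gsnap_executable(gsnap_base_dir, gsnap_index_name):
    """Return 'gsnap' when gsnapl refuses the index (small k-mer build), else 'gsnapl'."""
    probe = subprocess.run(
        ['gsnapl', '-D', gsnap_base_dir, '-d', gsnap_index_name],
        input='>'.encode('utf-8'),  # minimal stdin so the probe exits immediately
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    stderr_text = probe.stderr.decode('utf-8', errors='replace')
    return 'gsnap' if 'please run gsnap instead' in stderr_text else 'gsnapl'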
def get_accession_sequences(self, dest_dir, taxid, n=10):
    '''
    Retrieve NCBI NT references for the most-matched accession in each hitsummary2 file,
    up to a maximum of n references. Write each reference to a separate fasta file.
    '''
    if n == 0:
        return {}

    # Retrieve files
    nt_db = self.additional_attributes["nt_db"]
    nt_loc_db = s3.fetch_reference(
        self.additional_files["nt_loc_db"],
        self.ref_dir_local,
        allow_s3mi=True)

    # Choose accessions to process.
    s3_hitsummary2_files = self.additional_attributes["hitsummary2_files"].values()
    accessions = defaultdict(lambda: 0)
    # TODO: Address issue where accessions in nr can be chosen in the following code.
    # These accessions will not be found in nt_loc and will be subsequently omitted.
    for file_list in s3_hitsummary2_files:
        tally = defaultdict(lambda: 0)
        for s3_file in file_list:
            local_basename = s3_file.replace("/", "-").replace(":", "-")
            local_file = s3.fetch_from_s3(
                s3_file,
                os.path.join(self.output_dir_local, local_basename))
            if local_file is None:
                continue
            with open(local_file, 'r') as f:
                for line in f:
                    acc, species_taxid, genus_taxid, family_taxid = line.rstrip().split("\t")[3:7]
                    if any(int(hit_taxid) == taxid for hit_taxid in [species_taxid, genus_taxid, family_taxid]):
                        tally[acc] += 1
        if tally:
            best_acc, max_count = max(tally.items(), key=lambda x: x[1])
            accessions[best_acc] += max_count
    if len(accessions) > n:
        accessions = dict(sorted(accessions.items(), key=lambda x: x[1], reverse=True)[:n])
    accessions = set(accessions.keys())

    # Make map of accession to sequence file
    accession2info = dict((acc, {}) for acc in accessions)
    with open_file_db_by_extension(nt_loc_db) as nt_loc_dict:
        PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_s3(
            accession2info, nt_loc_dict, nt_db)

    # Put 1 fasta file per accession into the destination directory
    accession_fastas = {}
    for acc, info in accession2info.items():
        if 'seq_file' not in info or info['seq_file'] is None:
            log.write(f"WARNING: No sequence retrieved for {acc}")
            continue
        clean_accession = self.clean_name_for_ksnp3(acc)
        local_fasta = f"{dest_dir}/NCBI_NT_accession_{clean_accession}.fasta"
        command.execute(
            command_patterns.SingleCommand(
                cmd="ln",
                args=["-s", info['seq_file'], local_fasta]))
        command.execute_with_output(
            command_patterns.ShellScriptCommand(
                script=r'''echo ">${acc}" | cat - "${local_fasta}" > temp_file;''',
                named_args={
                    'acc': acc,
                    'local_fasta': local_fasta
                }))
        command.move_file('temp_file', local_fasta)
        accession_fastas[acc] = local_fasta

    # Return kept accessions and the paths of their fasta files
    return accession_fastas
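# Condensed sketch of the accession-selection logic above, operating on in-memory rows
# instead of files downloaded from S3; the helper name and row format are assumptions.
from collections import defaultdict


def pick_top_accessions(rows_per_hitsummary2, taxid, n=10):
    """rows_per_hitsummary2: one list per hitsummary2 file of (acc, species, genus, family) tuples."""
    accessions = defaultdict(int)
    for rows in rows_per_hitsummary2:
        tally = defaultdict(int)
        for acc, species_taxid, genus_taxid, family_taxid in rows:
            if taxid in (int(species_taxid), int(genus_taxid), int(family_taxid)):
                tally[acc] += 1
        if tally:
            best_acc, max_count = max(tally.items(), key=lambda kv: kv[1])
            accessions[best_acc] += max_count
    top_n = sorted(accessions.items(), key=lambda kv: kv[1], reverse=True)[:n]
    return {acc for acc, _count in top_n}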
def run(self):
    '''
        1. summarize hits
        2. build blast index
        3. blast assembled contigs to the index
        4. update the summary
    '''
    _align_m8, deduped_m8, hit_summary, orig_counts_with_dcr = self.input_files_local[0]
    assembled_contig, _assembled_scaffold, bowtie_sam, _contig_stats = self.input_files_local[1]
    reference_fasta, = self.input_files_local[2]
    duplicate_cluster_sizes_path, = self.input_files_local[3]

    blast_m8, refined_m8, refined_hit_summary, refined_counts_with_dcr, contig_summary_json, blast_top_m8 = self.output_files_local()
    assert refined_counts_with_dcr.endswith("with_dcr.json"), self.output_files_local()
    assert orig_counts_with_dcr.endswith("with_dcr.json"), self.output_files_local()
    db_type = self.additional_attributes["db_type"]

    no_assembled_results = (
        os.path.getsize(assembled_contig) < MIN_ASSEMBLED_CONTIG_SIZE or
        os.path.getsize(reference_fasta) < MIN_REF_FASTA_SIZE)
    if no_assembled_results:
        # No assembled results or refseq fasta available.
        # Create empty output files.
        command.write_text_to_file(' ', blast_m8)
        command.write_text_to_file(' ', blast_top_m8)
        command.copy_file(deduped_m8, refined_m8)
        command.copy_file(hit_summary, refined_hit_summary)
        command.copy_file(orig_counts_with_dcr, refined_counts_with_dcr)
        command.write_text_to_file('[]', contig_summary_json)
        return  # return in the middle of the function

    (read_dict, accession_dict, _selected_genera) = m8.summarize_hits(hit_summary)
    PipelineStepBlastContigs.run_blast(db_type, blast_m8, assembled_contig, reference_fasta, blast_top_m8)
    read2contig = {}
    PipelineStepRunAssembly.generate_info_from_sam(bowtie_sam, read2contig, duplicate_cluster_sizes_path)

    (updated_read_dict, read2blastm8, contig2lineage, added_reads) = self.update_read_dict(
        read2contig, blast_top_m8, read_dict, accession_dict, db_type)
    self.generate_m8_and_hit_summary(updated_read_dict, added_reads, read2blastm8,
                                     hit_summary, deduped_m8,
                                     refined_hit_summary, refined_m8)

    # Generating taxon counts based on updated results
    lineage_db = s3.fetch_reference(
        self.additional_files["lineage_db"],
        self.ref_dir_local,
        allow_s3mi=False)  # Too small to waste s3mi

    deuterostome_db = None
    if self.additional_files.get("deuterostome_db"):
        deuterostome_db = s3.fetch_reference(self.additional_files["deuterostome_db"],
                                             self.ref_dir_local,
                                             allow_s3mi=False)  # Too small for s3mi

    blacklist_s3_file = self.additional_files.get('taxon_blacklist', DEFAULT_BLACKLIST_S3)
    taxon_blacklist = s3.fetch_reference(blacklist_s3_file, self.ref_dir_local)

    taxon_whitelist = None
    if self.additional_attributes.get("use_taxon_whitelist"):
        taxon_whitelist = s3.fetch_reference(
            self.additional_files.get("taxon_whitelist", DEFAULT_WHITELIST_S3),
            self.ref_dir_local)

    with TraceLock("PipelineStepBlastContigs-CYA", PipelineStepBlastContigs.cya_lock, debug=False):
        with log.log_context("PipelineStepBlastContigs",
                             {"substep": "generate_taxon_count_json_from_m8",
                              "db_type": db_type,
                              "refined_counts": refined_counts_with_dcr}):
            m8.generate_taxon_count_json_from_m8(refined_m8, refined_hit_summary, db_type.upper(),
                                                 lineage_db, deuterostome_db, taxon_whitelist,
                                                 taxon_blacklist, duplicate_cluster_sizes_path,
                                                 refined_counts_with_dcr)

        # generate contig stats at genus/species level
        with log.log_context("PipelineStepBlastContigs", {"substep": "generate_taxon_summary"}):
            contig_taxon_summary = self.generate_taxon_summary(
                read2contig,
                contig2lineage,
                updated_read_dict,
                added_reads,
                db_type,
                duplicate_cluster_sizes_path,
                # same filter as applied in generate_taxon_count_json_from_m8
                m8.build_should_keep_filter(deuterostome_db, taxon_whitelist, taxon_blacklist)
            )

        with log.log_context("PipelineStepBlastContigs",
                             {"substep": "generate_taxon_summary_json",
                              "contig_summary_json": contig_summary_json}):
            with open(contig_summary_json, 'w') as contig_outf:
                json.dump(contig_taxon_summary, contig_outf)

    # Upload additional file
    contig2lineage_json = os.path.join(os.path.dirname(contig_summary_json),
                                       f"contig2lineage.{db_type}.json")
    with log.log_context("PipelineStepBlastContigs",
                         {"substep": "contig2lineage_json",
                          "contig2lineage_json": contig2lineage_json}):
        with open(contig2lineage_json, 'w') as c2lf:
            json.dump(contig2lineage, c2lf)

    self.additional_output_files_hidden.append(contig2lineage_json)
def run(self): # Setup nt_db = self.additional_attributes["nt_db"] nt_loc_db = s3.fetch_reference( self.additional_files["nt_loc_db"], self.ref_dir_local, auto_unzip= True, # This is default for reference download, just being explicit. allow_s3mi=True) db_type = "nt" # Only NT supported for now # TODO: Design a way to map in/out files more robustly, e.g. by name/type annotated_m8 = self.input_files_local[0][0] annotated_fasta = self.input_files_local[1][0] output_json_dir = os.path.join(self.output_dir_local, "align_viz") # Go through annotated_fasta with a db_type (NT/NR match). Infer the # family/genus/species info read2seq = PipelineStepGenerateAlignmentViz.parse_reads( annotated_fasta, db_type) log.write(f"Read to Seq dictionary size: {len(read2seq)}") groups, line_count = self.process_reads_from_m8_file( annotated_m8, read2seq) # If nt_db is not yet downloaded, then do download nt_db here if nt_db.startswith("s3://"): # TODO: Handle this better. We might be poorly provisioned to allow s3mi speed # for this step, on the instance where it is running. nt_db = s3.fetch_reference( nt_db, self.ref_dir_local, auto_unzip= True, # this is default for reference uploads, just being explicit allow_s3mi=True ) # s3mi probably okay here because we tend to download only NT and little else in this stage with open_file_db_by_extension( nt_loc_db, IdSeqDictValue.VALUE_TYPE_ARRAY) as nt_loc_dict: log.write("Getting sequences by accession list from file...") PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_file( groups, nt_loc_dict, nt_db) for _accession_id, ad in groups.items(): ad['coverage_summary'] = PipelineStepGenerateAlignmentViz.calculate_alignment_coverage( ad) result_dict, to_be_deleted = self.populate_reference_sequences(groups) # Delete temp files def safe_multi_delete(files): for f in files: try: os.remove(f) except: pass deleter_thread = threading.Thread(target=safe_multi_delete, args=[to_be_deleted]) deleter_thread.start() self.dump_align_viz_json(output_json_dir, db_type, result_dict) deleter_thread.join() # Write summary file summary_msg = f"Read2Seq Size: {len(read2seq)}, M8 lines {line_count}, " \ f"{len(groups)} unique accession ids " summary_file_name = f"{output_json_dir}.summary" with open(summary_file_name, 'w') as summary_f: summary_f.write(summary_msg)
def run(self): """Run STAR to filter out host reads.""" # Setup if self.sequence_input_files is not None and self.validated_input_counts_file is not None: validated_input_counts_file = self.validated_input_counts_file input_files = self.sequence_input_files else: validated_input_counts_file = self.input_files_local[0][0] input_files = self.input_files_local[0][1:3] num_inputs = len(input_files) scratch_dir = os.path.join(self.output_dir_local, "scratch_star") output_files_local = self.output_files_local() output_gene_file = self.additional_attributes.get("output_gene_file") output_log_file = self.additional_attributes.get("output_log_file") genome_dir = s3.fetch_reference( self.additional_files["star_genome"], self.ref_dir_local, allow_s3mi=True, auto_untar=True) # Check parts file for the number of partitioned indexes parts_file = os.path.join(genome_dir, "parts.txt") assert os.path.isfile(parts_file) with open(parts_file, 'rb') as parts_f: num_parts = int(parts_f.read()) # Don't compute insert size metrics if the STAR index has more than one part # Logic for combining BAM output from STAR or insert size metrics not implemented if self.collect_insert_size_metrics_for and num_parts != 1: log.write("Insert size metrics were expected to be collected for sample but were not because the STAR index has more than one part") self.collect_insert_size_metrics_for = None # Run STAR on each partition and save the unmapped read info unmapped = input_files with open(validated_input_counts_file) as validated_input_counts_f: validated_input_counts = json.load(validated_input_counts_f) use_starlong = validated_input_counts[vc.BUCKET_LONG] > 1 or \ validated_input_counts[vc.BUCKET_TOO_LONG] > 1 for part_idx in range(num_parts): tmp = f"{scratch_dir}/star-part-{part_idx}" genome_part = f"{genome_dir}/part-{part_idx}" count_genes = part_idx == 0 self.run_star_part(tmp, genome_part, unmapped, count_genes, use_starlong) unmapped, too_discrepant = PipelineStepRunStar.sync_pairs( PipelineStepRunStar.unmapped_files_in(tmp, num_inputs)) if too_discrepant: raise BrokenReadPairError("Broken pairs") # Run part 0 in gene-counting mode: # (a) ERCCs are doped into part 0 and we want their counts. # (b) If there is only 1 part (e.g. human), the host gene counts also # make sense. 
if part_idx == 0: gene_count_file = os.path.join(tmp, "ReadsPerGene.out.tab") if os.path.isfile(gene_count_file) and output_gene_file: moved = os.path.join(self.output_dir_local, output_gene_file) command.move_file(gene_count_file, moved) self.additional_output_files_hidden.append(moved) log_file = os.path.join(tmp, "Log.final.out") if os.path.isfile(log_file) and output_log_file: moved = os.path.join(self.output_dir_local, output_log_file) command.move_file(log_file, moved) # STAR names the output BAM file Aligned.out.bam without TranscriptomeSAM and # Aligned.toTranscriptome.out.bam with TranscriptomeSAM, this doesn't # appear to be configurable is_dna = self.collect_insert_size_metrics_for == "dna" bam_filename = "Aligned.out.bam" if is_dna else "Aligned.toTranscriptome.out.bam" if self.collect_insert_size_metrics_for: bam_path = os.path.join(tmp, bam_filename) # If this file wasn't generated but self.collect_insert_size_metrics_for has a value # something unexpected has gone wrong assert(os.path.isfile(bam_path)), \ "Expected STAR to generate Aligned.out.bam but it was not found" try: self.collect_insert_size_metrics(tmp, bam_path, self.output_metrics_file, self.output_histogram_file) if os.path.exists(self.output_metrics_file): self.additional_output_files_visible.append(self.output_metrics_file) else: message = "expected picard to generate a metrics file but none was found" log.write(message=message, warning=True) if os.path.exists(self.output_histogram_file): self.additional_output_files_visible.append(self.output_histogram_file) else: message = "expected picard to generate a histogram file but none was found" log.write(message=message, warning=True) except Exception as e: log.write(message=f"encountered error while running picard: {type(e).__name__}: {e}", warning=True) # Sort unmapped files for deterministic output for unmapped_file in unmapped: sort_fastx_by_entry_id(unmapped_file) # Cleanup for src, dst in zip(unmapped, output_files_local): command.move_file(src, dst) # Move out of scratch dir command.remove_rf(f"{scratch_dir}/*")
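# The partitioned STAR index convention assumed above: the unpacked reference directory
# contains a parts.txt holding the part count and per-part subdirectories named part-0,
# part-1, ... (this layout is inferred from the step itself, not from STAR documentation;
# the helper name is hypothetical).
import os


def star_index_parts(genome_dir):
    """Return the list of per-part index directories declared by parts.txt."""
    with open(os.path.join(genome_dir, "parts.txt")) as parts_f:
        num_parts = int(parts_f.read())
    return [os.path.join(genome_dir, f"part-{part_idx}") for part_idx in range(num_parts)]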
def run(self):
    '''
        1. summarize hits
        2. build blast index
        3. blast assembled contigs to the index
        4. update the summary
    '''
    (_align_m8, deduped_m8, hit_summary, orig_counts) = self.input_files_local[0]
    assembled_contig, _assembled_scaffold, bowtie_sam, _contig_stats = self.input_files_local[1]
    reference_fasta = self.input_files_local[2][0]

    (blast_m8, refined_m8, refined_hit_summary, refined_counts,
     contig_summary_json, blast_top_m8) = self.output_files_local()
    db_type = self.additional_attributes["db_type"]

    if os.path.getsize(assembled_contig) < MIN_ASSEMBLED_CONTIG_SIZE or \
       os.path.getsize(reference_fasta) < MIN_REF_FASTA_SIZE:
        # No assembled results or refseq fasta available.
        # Create empty output files.
        command.write_text_to_file(' ', blast_m8)
        command.write_text_to_file(' ', blast_top_m8)
        command.copy_file(deduped_m8, refined_m8)
        command.copy_file(hit_summary, refined_hit_summary)
        command.copy_file(orig_counts, refined_counts)
        command.write_text_to_file('[]', contig_summary_json)
        return  # return in the middle of the function

    (read_dict, accession_dict, _selected_genera) = m8.summarize_hits(hit_summary)
    PipelineStepBlastContigs.run_blast(db_type, blast_m8, assembled_contig,
                                       reference_fasta, blast_top_m8)
    read2contig = {}
    contig_stats = defaultdict(int)
    PipelineStepRunAssembly.generate_info_from_sam(bowtie_sam, read2contig, contig_stats)

    (updated_read_dict, read2blastm8, contig2lineage, added_reads) = self.update_read_dict(
        read2contig, blast_top_m8, read_dict, accession_dict, db_type)
    self.generate_m8_and_hit_summary(updated_read_dict, added_reads, read2blastm8,
                                     hit_summary, deduped_m8,
                                     refined_hit_summary, refined_m8)

    # Generating taxon counts based on updated results
    lineage_db = s3.fetch_reference(
        self.additional_files["lineage_db"],
        self.ref_dir_local,
        allow_s3mi=False)  # Too small to waste s3mi

    deuterostome_db = None
    evalue_type = 'raw'
    if self.additional_files.get("deuterostome_db"):
        deuterostome_db = s3.fetch_reference(
            self.additional_files["deuterostome_db"],
            self.ref_dir_local,
            allow_s3mi=False)  # Too small for s3mi

    with TraceLock("PipelineStepBlastContigs-CYA", PipelineStepBlastContigs.cya_lock, debug=False):
        with log.log_context("PipelineStepBlastContigs",
                             {"substep": "generate_taxon_count_json_from_m8",
                              "db_type": db_type,
                              "refined_counts": refined_counts}):
            m8.generate_taxon_count_json_from_m8(
                refined_m8, refined_hit_summary, evalue_type, db_type.upper(),
                lineage_db, deuterostome_db, refined_counts)

        # generate contig stats at genus/species level
        with log.log_context("PipelineStepBlastContigs", {"substep": "generate_taxon_summary"}):
            contig_taxon_summary = self.generate_taxon_summary(
                read2contig, contig2lineage, updated_read_dict, added_reads, db_type)

        with log.log_context("PipelineStepBlastContigs",
                             {"substep": "generate_taxon_summary_json",
                              "contig_summary_json": contig_summary_json}):
            with open(contig_summary_json, 'w') as contig_outf:
                json.dump(contig_taxon_summary, contig_outf)

    # Upload additional file
    contig2lineage_json = os.path.join(os.path.dirname(contig_summary_json),
                                       f"contig2lineage.{db_type}.json")
    with log.log_context("PipelineStepBlastContigs",
                         {"substep": "contig2lineage_json",
                          "contig2lineage_json": contig2lineage_json}):
        with open(contig2lineage_json, 'w') as c2lf:
            json.dump(contig2lineage, c2lf)

    self.additional_output_files_hidden.append(contig2lineage_json)