def test_cd(self):
    '''WHEN using cd parameter, THEN it executes the command in the selected directory and resets it to previous dir before executing next command'''
    _shared_test_cd_parameter(
        test_context=self,
        pwd_command_pattern_with_cd=command_patterns.ShellScriptCommand(
            cd=TMP_FOLDER,
            script="pwd",
            args=[]),
        pwd_command_pattern_without_cd=command_patterns.ShellScriptCommand(
            script="pwd",
            args=[]))

def generate_read_to_contig_mapping(assembled_contig,
                                    fasta_file,
                                    read2contig,
                                    duplicate_cluster_sizes_path,
                                    output_bowtie_sam,
                                    output_contig_stats):
    ''' read -> contig mapping through bowtie2 alignment '''
    base_output_dir = os.path.dirname(fasta_file)
    # build bowtie index based on assembled_contig
    bowtie_index_path = os.path.join(base_output_dir, 'bowtie-contig')
    command.make_dirs(bowtie_index_path)
    command.execute(
        command_patterns.SingleCommand(
            cmd='bowtie2-build',
            args=[assembled_contig, bowtie_index_path]))
    command.execute(
        command_patterns.ShellScriptCommand(
            script=r'''bowtie2 -x "${bowtie_index_path}" -f -U "${fasta_file}" --very-sensitive -p 32 > "${output_bowtie_sam}";''',
            named_args={
                'bowtie_index_path': bowtie_index_path,
                'fasta_file': fasta_file,
                'output_bowtie_sam': output_bowtie_sam
            }))
    contig_stats = PipelineStepRunAssembly.generate_info_from_sam(
        output_bowtie_sam, read2contig, duplicate_cluster_sizes_path)
    with open(output_contig_stats, 'w') as ocf:
        json.dump(contig_stats, ocf)

def generate_mapped_reads_tsv(self):
    """Use bedtools to generate a table of mapped reads for each genome in the ARG ANNOT database.
    If a new resistance gene db is used, the .bed file will need to be updated manually.
    """
    bed_file_path = fetch_reference(
        self.additional_files["resist_genome_bed"],
        self.ref_dir_local,
        allow_s3mi=False)
    sample_bam_file_path = self.output_files_local()[5]

    tmp_sort_dir = os.path.join(self.output_dir_local, "tmp_sort")
    command.make_dirs(tmp_sort_dir)

    # Convert the sorted.bam output from SRST2 to the bed format, then sort the bed file.
    # This allows us to use the "sorted" mode of bedtools coverage, which is memory-efficient.
    # Otherwise, large sorted.bam files will cause our machines to run out of RAM.
    #
    # Note that despite being called "sorted.bam", the bam is not sorted the way we need it to be.
    #
    # env LC_ALL=C ensures that the sort command uses the same sort order on all machines.
    #
    # The -T flag with tmp_sort_dir ensures that we make tmp files inside /mnt, which is where our huge AWS volumes are mounted.
    # By default, the sort command creates temp files in /tmp, which has very little disk space.
    command.execute(
        command_patterns.ShellScriptCommand(
            script='''
                bedtools bamtobed -i "$1" |
                env LC_ALL=C sort -T "$2" -k1,1 -k2,2n |
                bedtools coverage -sorted -a "$3" -b stdin > "$4";''',
            args=[
                sample_bam_file_path,
                tmp_sort_dir,
                bed_file_path,
                os.path.join(self.output_dir_local, MATCHED_READS_FILE)
            ]))

    command.remove_rf(tmp_sort_dir)

def reads(local_file_path, max_reads=None):
    ''' Count reads in a local file based on file format inferred from extension, up to a maximum of max_reads. '''
    if local_file_path.endswith(".gz"):
        cmd = r'''zcat "${local_file_path}"'''
        file_format = local_file_path.split(".")[-2]
    else:
        cmd = r'''cat "${local_file_path}"'''
        file_format = local_file_path.split(".")[-1]
    named_args = {
        'local_file_path': local_file_path
    }

    if max_reads:
        max_lines = reads2lines(max_reads, file_format)
        assert max_lines is not None, "Could not convert max_reads to max_lines"
        cmd += r''' | head -n "${max_lines}"'''
        named_args.update({
            'max_lines': max_lines
        })

    cmd += " | wc -l"
    cmd_output = command.execute_with_output(
        command_patterns.ShellScriptCommand(
            script=cmd,
            named_args=named_args
        )
    )
    line_count = int(cmd_output.strip().split(' ')[0])
    return lines2reads(line_count, file_format)

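# A minimal usage sketch of the counter above. The file path is hypothetical;
# reads2lines/lines2reads are the format-aware helpers this module is assumed
# to provide alongside reads().
def _example_count_reads(fastq_gz_path="/tmp/sample.fastq.gz"):
    # Count every read in the gzipped FASTQ, then again capped at ~1000 reads.
    total = reads(fastq_gz_path)
    capped = reads(fastq_gz_path, max_reads=1000)
    return total, capped
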
def run(self):
    """
    1. extract contigs.fasta and read-contig.sam
    2. run pile up
    """
    contigs, _scaffolds, read_contig_sam, _stats = self.input_files_local[0]
    coverage_json, coverage_summary_csv = self.output_files_local()

    if os.path.getsize(contigs) < MIN_CONTIG_FILE_SIZE:
        command.write_text_to_file('{}', coverage_json)
        command.write_text_to_file('No Contigs', coverage_summary_csv)
        return

    # generate bam files
    bam_file = read_contig_sam.replace(".sam", ".bam")
    command.execute(
        command_patterns.ShellScriptCommand(
            script=r'''samtools view -S -b "${read_contig_sam}" | samtools sort - -o "${bam_file}";''',
            named_args={
                'read_contig_sam': read_contig_sam,
                'bam_file': bam_file
            }))
    command.execute(
        command_patterns.SingleCommand(
            cmd="samtools",
            args=["index", bam_file]))
    # run coverage info
    output_csv, output_json = self.calc_contig2coverage(bam_file)
    os.rename(output_csv, coverage_summary_csv)
    os.rename(output_json, coverage_json)

def generate_unidentified_fasta(input_fa, output_fa):
    # TODO remove annotated fasta intermediate file and replace > with : below
    command.execute(
        command_patterns.ShellScriptCommand(
            script=r'''grep -A 1 '>NR::NT::' "$1" | sed '/^--$/d' > "$2";''',
            args=[input_fa, output_fa]))

def execute(
    command: Union[command_patterns.CommandPattern, str],
    progress_file: str = None,
    timeout: int = None,
    grace_period: int = None,
    capture_stdout: bool = False,
    merge_stderr: bool = False,
    log_context_mode: log.LogContextMode = log.LogContextMode.START_END_LOG_EVENTS
) -> Union[str, None]:
    """Primary way to start external commands in subprocesses and handle
    execution with logging.
    """
    if not isinstance(command, command_patterns.CommandPattern):
        # log warning if using legacy format
        log.write(
            warning=True,
            message="Command parameter is using legacy type str. Use idseq_dag.util.command_patterns.",
            obj_data={
                "cmd": command,
                "type": type(command)
            })
        cmd = command_patterns.ShellScriptCommand(script=command, args=[])
    else:
        cmd = command

    with CommandTracker() as ct:
        log_values = {"cid": f"Command {ct.id}", "command": cmd.as_dict()}
        with log.log_context('command_execute', values=log_values,
                             log_context_mode=log_context_mode) as lctx:
            with ProgressFile(progress_file):
                if timeout:
                    ct.timeout = timeout
                if grace_period:
                    ct.grace_period = grace_period
                if capture_stdout:
                    # Capture only stdout. Child stderr = parent stderr unless
                    # merge_stderr specified. Child input = parent stdin.
                    ct.proc = cmd.open(
                        stdin=sys.stdin.fileno(),
                        stdout=subprocess.PIPE,
                        stderr=subprocess.STDOUT if merge_stderr else sys.stderr.fileno())
                    stdout, _ = ct.proc.communicate()
                else:
                    # Capture nothing. Child inherits parent stdin/out/err.
                    ct.proc = cmd.open()
                    ct.proc.wait()
                    stdout = None

                lctx.values.update({"returncode": ct.proc.returncode})

                if ct.proc.returncode:
                    raise subprocess.CalledProcessError(
                        ct.proc.returncode, str(command), stdout)

                if capture_stdout:
                    return stdout

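# A minimal usage sketch of execute() with the ShellScriptCommand pattern used
# throughout these snippets. The echo command and file paths are illustrative
# only; command_patterns is assumed to be imported as in the function above.
def _example_execute():
    # Preferred form: pass a CommandPattern. Positional args arrive in the
    # script as "$1", "$2", ... and are not re-interpreted by the shell.
    execute(
        command_patterns.ShellScriptCommand(
            script=r'''echo "$1" > "$2";''',
            args=["hello world", "/tmp/example_output.txt"]))
    # capture_stdout=True returns the child's stdout (raw, undecoded);
    # execute_with_output, used elsewhere in these snippets, appears to wrap
    # this behavior.
    return execute(
        command_patterns.ShellScriptCommand(
            script=r'''cat "$1";''',
            args=["/tmp/example_output.txt"]),
        capture_stdout=True)
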
def grab_wgs_accessions(self, source_file, dest_file):
    command.execute(
        command_patterns.ShellScriptCommand(
            script=r'''grep '^>' "${source_file}" | grep 'complete genome' | cut -f 1 -d' ' > "${dest_file}";''',
            named_args={
                'source_file': source_file,
                'dest_file': dest_file
            }))

def test_open_2(self):
    '''WHEN script uses shell variables within the script THEN it can get access to those variables, but can't use them from parameters'''
    cp1 = command_patterns.ShellScriptCommand(
        script=r'abc=123; echo abc=$abc, \$1=$1',
        args=["$abc"])

    p = cp1.open(stdout=subprocess.PIPE)
    stdout, stderr = p.communicate()

    self.assertFalse(stderr)
    self.assertEqual(stdout.decode(), "abc=123, $1=$abc\n")

def multilinefa2singlelinefa(input_fasta, output_fasta):
    ''' Multi-line FASTA to Single-line FASTA conversion '''
    command.execute(
        command_patterns.ShellScriptCommand(
            script=r'''awk 'NR==1 {print $0} NR>1 && /^>/ {printf("\n%s\n",$0);next; } NR>1 { printf("%s",$0);} END {printf("\n");}' <"${input_fasta}" > "${output_fasta}";''',
            named_args={
                'input_fasta': input_fasta,
                'output_fasta': output_fasta
            }))

def fq2fa(input_fastq, output_fasta):
    ''' FASTQ to FASTA conversion '''
    command.execute(
        command_patterns.ShellScriptCommand(
            script=r'''sed -n '1~4s/^@/>/p;2~4p' <"${input_fastq}" > "${output_fasta}";''',
            named_args={
                'input_fastq': input_fastq,
                'output_fasta': output_fasta
            }))

def test_shellscript_with_param_array(self):
    '''WHEN using ShellScriptCommand to invoke a command with an array of parameters, THEN it works as expected'''
    cp1 = command_patterns.ShellScriptCommand(
        script=r'''paste "${slice_outputs[@]}"''',
        named_args={
            'slice_outputs': ["-d", r"\n", TESTFILE_ABC_TXT, TESTFILE_BCD_TXT]
        })

    result = command.execute_with_output(cp1)

    self.assertEqual(result, "abc\nbcd\n")

def test_specific_pattern_1(self):
    '''WHEN using ShellScriptCommand with multiline script, THEN it works as expected'''
    cp1 = command_patterns.ShellScriptCommand(
        script=r'''
            a=123;
            echo May $a the force be with you \
                | sed "s/a/Z/g"
        ''',
        args=[])

    result = command.execute_with_output(cp1)

    self.assertEqual(result, "MZy 123 the force be with you\n")

def _vcf_replace_column_description(input_file, output_file, new_column_description):
    escaped_new_column_description = new_column_description.replace("\\", "\\\\").replace("&", "\\&").replace("/", r"\/")
    command.execute(
        command_patterns.ShellScriptCommand(
            script=r'''sed "${sed_pattern}" "${input_file}" > "${output_file}"''',
            named_args={
                'sed_pattern': f"s/^#CHROM.*/{escaped_new_column_description}/",
                'input_file': input_file,
                'output_file': output_file
            }
        )
    )

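# A small illustration (hypothetical values) of why the escaping above matters:
# the replacement text is spliced into an s/// expression, so "\", "&" and "/"
# would otherwise change the meaning of the sed command.
def _example_vcf_header_escaping():
    description = "#CHROM\tPOS\tID\tREF/ALT"  # contains a literal "/"
    escaped = description.replace("\\", "\\\\").replace("&", "\\&").replace("/", r"\/")
    # yields "#CHROM\tPOS\tID\tREF\/ALT", which is safe inside "s/^#CHROM.*/.../"
    return f"s/^#CHROM.*/{escaped}/"
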
def truncate_file(self, infile, outfile, is_fastq, max_fragments):
    num_lines = self.calc_max_num_lines(is_fastq, max_fragments)
    command.execute(
        command_patterns.ShellScriptCommand(
            script=r'''head -n "${num_lines}" "${infile}" > "${outfile}";''',
            named_args={
                'num_lines': num_lines,
                'infile': infile,
                'outfile': outfile
            }))
    num_fragments = count.reads(outfile)
    self.summary_dict[vc.BUCKET_NORMAL] += num_fragments
    return num_fragments

def generate_nonhost_fastq(
    nonhost_headers: str,
    fastq: str,
    output_file: str
) -> None:
    command.execute(
        command_patterns.ShellScriptCommand(
            script=r'''seqtk subseq "$1" "$2" > "$3";''',
            args=[
                fastq,
                nonhost_headers,
                output_file
            ]
        )
    )

def delimit_fasta(input_fa, tmp, taxid_field_num, output_fa):
    # Put every 2-line fasta record on a single line with delimiter ":lineseparator:":
    script = r'''awk 'NR % 2 == 1 { o=$0 ; next } { print o ":lineseparator:" $0 }' "${input_fa}" '''
    # Sort the records based on the field containing the taxids
    script += r''' | sort -T "${tmp}" --key "${taxid_field_num}" --field-separator ':' --numeric-sort '''
    # Split every record back over 2 lines
    script += r''' | sed 's/:lineseparator:/\n/g' > "${output_fa}";'''
    command.execute(
        command_patterns.ShellScriptCommand(
            script=script,
            named_args={
                "input_fa": input_fa,
                "tmp": tmp,
                "taxid_field_num": taxid_field_num,
                "output_fa": output_fa
            }))

def test_execute_shell_script_command_2(self):
    '''WHEN using ShellScriptCommand with args that have special shell characters, THEN it doesn't execute subcommands'''
    cp1 = command_patterns.ShellScriptCommand(
        script=r'echo ${@: 1:$#-1} | sed ${@: $#}',
        args=[
            "1",
            2,
            "$(pwd)",
            ";ls",
            ";",
            "ls",
            "\n",
            "ls",
            "&& ls",
            "`pwd`",
            ">",
            "test.txt",
            "> test.txt",
            ">> test.txt",
            "&&",
            "ls",
            "$",
            "abc\nls"  # note: implicit concatenation with "pwd" below yields the single arg "abc\nlspwd"
            "pwd",
            "s/w/a/g"
        ])

    result = command.execute_with_output(cp1)

    self.assertEqual(
        result,
        "1 2 $(pad) ;ls ; ls ls && ls `pad` > test.txt > test.txt >> test.txt && ls $ abc lspad\n"
    )

def lzw_compute(input_files, threshold_readlength, cutoff, slice_step=NUM_SLICES):
    """Spawn subprocesses on NUM_SLICES of the input files, then coalesce the
    scores into a temp file, and return that file's name."""
    temp_file_names = [
        f"lzwslice_{slice_step}_{slice_start}.txt"
        for slice_start in range(slice_step + 1)
    ]
    for tfn in temp_file_names:
        assert not os.path.exists(tfn)

    @run_in_subprocess
    def lzw_compute_slice(slice_start):
        """For each read, or read pair, in input_files, such that
        read_index % slice_step == slice_start, output the lzw score for the
        read, or the min lzw score for the pair."""
        lzw_score = PipelineStepRunLZW.lzw_score
        with open(temp_file_names[slice_start], "a") as slice_output:
            for i, reads in enumerate(
                    fasta.synchronized_iterator(input_files)):
                if i % slice_step == slice_start:
                    lzw_min_score = min(
                        lzw_score(r.sequence, threshold_readlength, cutoff)
                        for r in reads)
                    slice_output.write(str(lzw_min_score) + "\n")

    # slices run in parallel
    mt_map(lzw_compute_slice, range(slice_step))

    slice_outputs = temp_file_names[:-1]
    coalesced_score_file = temp_file_names[-1]
    # Paste can insert newlines at the end; we grep those out.
    command.execute(
        command_patterns.ShellScriptCommand(
            script=r'''paste -d '\n' "${slice_outputs[@]}" | grep -v ^$ > "${coalesced_score_file}";''',
            named_args={
                'coalesced_score_file': coalesced_score_file,
                'slice_outputs': slice_outputs
            }))
    for tfn in slice_outputs:
        os.remove(tfn)
    return coalesced_score_file

def test_named_args(self):
    '''WHEN parameter named_args is used THEN variables are automatically expanded'''
    cp1 = command_patterns.ShellScriptCommand(
        script=r'''
            set -e;
            echo "original_string = \"${original_string}\"";
            echo "sed_patterns = \"${sed_patterns[@]}\"";
            echo "end_sed_options = \"${end_sed_options[@]}\"";
            echo "sed_patterns[0] = \"${sed_patterns[0]}\"";
            echo "sed_patterns[1] = \"${sed_patterns[1]}\"";
            echo "sed_patterns[2] = \"${sed_patterns[2]}\"";
            echo "sed_patterns[3] = \"${sed_patterns[3]}\"";
            echo "sed_patterns[4] = \"${sed_patterns[4]}\"";
            echo "sed_patterns[5] = \"${sed_patterns[5]}\"";
            echo "empty_array = \"${empty_array[@]}\"";
            echo "empty_str = \"${empty_str}\"";
            echo "${original_string}" | $sed_command "${sed_patterns[@]}" | sed "${end_sed_options[@]}"
        ''',
        named_args={
            'original_string': "ABCDEF",
            'sed_command': 'sed',
            'end_sed_options': ['-e', 's/X/Y/'],
            'sed_patterns': ['-e', 's/A/Z /', '-e', 's/B/X/', '-e', 's/;&`/^/'],
            'empty_str': '',
            'empty_array': []
        })

    p = cp1.open(stdout=subprocess.PIPE)
    stdout, stderr = p.communicate()

    self.assertFalse(stderr)
    self.assertEqual(
        stdout.decode(),
        'original_string = "ABCDEF"\n'
        'sed_patterns = "-e s/A/Z / -e s/B/X/ -e s/;&`/^/"\n'
        'end_sed_options = "-e s/X/Y/"\n'
        'sed_patterns[0] = "-e"\n'
        'sed_patterns[1] = "s/A/Z /"\n'
        'sed_patterns[2] = "-e"\n'
        'sed_patterns[3] = "s/B/X/"\n'
        'sed_patterns[4] = "-e"\n'
        'sed_patterns[5] = "s/;&`/^/"\n'
        'empty_array = ""\n'
        'empty_str = ""\n'
        'Z YCDEF\n')

def get_taxid_genomes(genome_list_local, taxid, n_per_taxid):
    cmd = command_patterns.ShellScriptCommand(
        script=(
            # columns: 1 = assembly_accession; 6 = taxid; 7 = species_taxid, 8 = organism_name, 20 = ftp_path
            r'''cut -f1,6,7,8,20 "${genome_list_local}" '''
            # try to find taxid in the taxid column (2nd column of the piped input)
            r''' | awk -F '\t' "${awk_find_pattern}" '''
            # take only top n_per_taxid results
            r''' | head -n "${n_per_taxid}";'''
        ),
        named_args={
            'genome_list_local': genome_list_local,
            'awk_find_pattern': f'$2 == "{taxid}"',
            'n_per_taxid': n_per_taxid
        }
    )
    taxid_genomes = list(filter(None, command.execute_with_output(cmd).split("\n")))
    return taxid_genomes

def test_open_1(self):
    '''WHEN args have special shell characters, THEN it doesn't execute subcommands'''
    cp1 = command_patterns.ShellScriptCommand(
        script=r'''echo "${@:1:$#-1}" ${@:1:$#-1} | sed "${@:$#}";''',
        args=[
            "1",
            2,
            "$(pwd)",
            ";ls",
            ";",
            "ls",
            "\n",
            "ls",
            "&& ls",
            "`pwd`",
            ">",
            "test.txt",
            "> test.txt",
            ">> test.txt",
            "&&",
            "ls",
            "$",
            "abc\nls",
            "pwd",
            '"quotes"',
            "s/w/a/g"
        ])

    p = cp1.open(stdout=subprocess.PIPE)
    stdout, stderr = p.communicate()

    self.assertFalse(stderr)
    self.assertEqual(
        stdout.decode(),
        "1 2 $(pad) ;ls ; ls \n ls && ls `pad` > test.txt > test.txt >> test.txt && ls $ abc\nls pad \"quotes\" "
        "1 2 $(pad) ;ls ; ls ls && ls `pad` > test.txt > test.txt >> test.txt && ls $ abc ls pad \"quotes\"\n"
    )

def test_execute_shell_script_command_1(self):
    '''WHEN using ShellScriptCommand with args that contain spaces or special characters, THEN it doesn't split them into separate arguments'''
    cp1 = command_patterns.ShellScriptCommand(
        script=r'''
            echo May the force be with you >> "$1"
            echo The truth is out there > "$1"
            echo Live longer and prosper >> "$1"
            grep "${@:3}" "$1" > "$2"
            cat "$2"
            rm "$1" "$2"
        ''',
        args=[
            r'''/tmp/tmp file with spaces, 'quotes', "double-quotes" and other bizarre characters `~&>.txt''',
            r'''/tmp/another tmp output file.txt''',
            "-e",
            "is out",
            "-e",
            "longer and prosper"
        ])

    result = command.execute_with_output(cp1)

    self.assertEqual(result, "The truth is out there\nLive longer and prosper\n")

def run(self):
    """
    Generate host genome indexes for STAR and bowtie2
    """
    # Set up
    input_fasta_path = self.input_files_local[0][0]
    ercc_fasta_path = s3.fetch_from_s3(
        self.additional_files["ercc_fasta"],
        self.output_dir_local,
        allow_s3mi=True,
        auto_unzip=True)

    if input_fasta_path[-3:] == '.gz':
        # unzip the file
        dest_path = input_fasta_path[:-3]
        command.execute(
            command_patterns.ShellScriptCommand(
                script=r'''gzip -dc "${input_fasta_path}" > "${dest_path}";''',
                named_args={
                    'input_fasta_path': input_fasta_path,
                    'dest_path': dest_path
                }))
        input_fasta_path = dest_path

    input_gtf_path = None
    if self.additional_files.get("input_gtf"):
        input_gtf_path = s3.fetch_from_s3(
            self.additional_files["input_gtf"],
            self.output_dir_local,
            allow_s3mi=True)

    ercc_gtf_path = s3.fetch_from_s3(
        self.additional_files["ercc_gtf"],
        self.output_dir_local,
        allow_s3mi=True,
        auto_unzip=True)

    host_name = self.additional_attributes["host_name"]
    max_star_part_size = self.additional_attributes.get("max_star_part_size")

    input_fasta_with_ercc = f"{input_fasta_path}.with_ercc"
    command.execute(
        command_patterns.ShellScriptCommand(
            script=r'''cat "${ercc_fasta_path}" "${input_fasta_path}" > "${input_fasta_with_ercc}";''',
            named_args={
                'ercc_fasta_path': ercc_fasta_path,
                'input_fasta_path': input_fasta_path,
                'input_fasta_with_ercc': input_fasta_with_ercc
            }))

    input_gtf_with_ercc = ercc_gtf_path
    if input_gtf_path:
        input_gtf_with_ercc = f"{input_gtf_path}.with_ercc"
        command.execute(
            command_patterns.ShellScriptCommand(
                script=r'''cat "${ercc_gtf_path}" "${input_gtf_path}" > "${input_gtf_with_ercc}";''',
                named_args={
                    'ercc_gtf_path': ercc_gtf_path,
                    'input_gtf_path': input_gtf_path,
                    'input_gtf_with_ercc': input_gtf_with_ercc
                }))

    output_fasta_file, output_gtf_file, output_star_index, output_bowtie2_index = self.output_files_local()

    command.copy_file(input_fasta_with_ercc, output_fasta_file)
    command.copy_file(input_gtf_with_ercc, output_gtf_file)

    # make STAR index
    self.make_star_index(input_fasta_with_ercc, input_gtf_with_ercc,
                         output_star_index, max_star_part_size)

    # make bowtie2 index
    self.make_bowtie2_index(host_name, input_fasta_with_ercc,
                            output_bowtie2_index)

def run(self):
    # Setup
    input_files = self.input_files_local[0][0:2]
    num_inputs = len(input_files)
    assert num_inputs in [1, 2], 'Invalid number of input files'
    output_files = self.output_files_local()[1:3]
    summary_file = self.output_files_local()[0]
    max_fragments = self.additional_attributes["truncate_fragments_to"]

    file_ext = self.additional_attributes.get("file_ext")
    assert file_ext in ['fastq', 'fasta'], 'Invalid file extension'
    is_fastq = file_ext == 'fastq'

    try:
        for i in range(num_inputs):
            input_file = input_files[i]
            splited_input_file_name, splited_input_file_ext = os.path.splitext(input_file)
            num_lines = self.calc_max_num_lines(is_fastq, max_fragments)

            # unzip if .gz file
            if splited_input_file_ext == '.gz':
                input_files[i] = splited_input_file_name
                try:
                    # test if a valid gzip file
                    command.execute(
                        command_patterns.SingleCommand(
                            cmd="gzip",
                            args=["-t", input_file]))
                    # then decompress it
                    command.execute(
                        command_patterns.ShellScriptCommand(
                            script=r'''gzip -dc "${input_file}" | cut -c -"$[max_line_length+1]" | head -n "${num_lines}" | awk -f "${awk_script_file}" -v max_line_length="${max_line_length}" > "${output_file}";''',
                            named_args={
                                "input_file": input_file,
                                "awk_script_file": command.get_resource_filename("scripts/fastq-fasta-line-validation.awk"),
                                "max_line_length": vc.MAX_LINE_LENGTH,
                                "num_lines": num_lines,
                                "output_file": splited_input_file_name
                            }))
                except:
                    raise InvalidFileFormatError("Invalid fastq/fasta/gzip file")
            else:
                # Validate and truncate the input file to keep behavior consistent with gz input files
                try:
                    tmp_file = splited_input_file_name + ".tmp"
                    command.execute(
                        command_patterns.ShellScriptCommand(
                            script=r'''cat "${input_file}" | cut -c -"$[max_line_length+1]" | head -n "${num_lines}" | awk -f "${awk_script_file}" -v max_line_length="${max_line_length}" > "${output_file}";''',
                            named_args={
                                "input_file": input_file,
                                "awk_script_file": command.get_resource_filename("scripts/fastq-fasta-line-validation.awk"),
                                "max_line_length": vc.MAX_LINE_LENGTH,
                                "num_lines": num_lines,
                                "output_file": tmp_file
                            }))
                    input_files[i] = tmp_file
                except:
                    raise InvalidFileFormatError("Invalid fastq/fasta file")

        # keep a dictionary of the distribution of read lengths in the files
        self.summary_dict = {
            vc.BUCKET_TOO_SHORT: 0,
            vc.BUCKET_NORMAL: 0,
            vc.BUCKET_LONG: 0,
            vc.BUCKET_TOO_LONG: 0
        }
        quick_check_passed = \
            self.quick_check_file(input_files[0], is_fastq) and \
            (num_inputs == 1 or self.quick_check_file(input_files[1], is_fastq))

        all_fragments = []

        for infile, outfile in zip(input_files, output_files):
            if quick_check_passed:
                num_fragments = self.truncate_file(infile, outfile, is_fastq, max_fragments)
            else:
                num_fragments = self._full_check_and_truncate_file(
                    infile, outfile, is_fastq, max_fragments, num_inputs)
            all_fragments.append(num_fragments)

        if len(all_fragments) == 2 and abs(all_fragments[1] - all_fragments[0]) > 1000:
            raise InvalidFileFormatError("Paired input files need to contain the same number of reads")

        with open(summary_file, 'w') as summary_f:
            json.dump(self.summary_dict, summary_f)

    except Exception as e:
        with open(summary_file, 'w') as summary_f:
            json.dump({'Validation error': str(e)}, summary_f)
        s3_path = self.s3_path(summary_file)
        s3.upload_with_retries(summary_file, s3_path)
        raise e

    return

def calc_contig2coverage(bam_filename):
    # PySAM pileup is CPU-intensive. Each CPU core is assigned a slice of the input bam file
    # on which to perform pileup. The slice contigs are selected by slice_idx modulo num_slices.
    # Each slice gets its own pair of temporary output files, one in CSV format and one in JSON.
    # In the end, these slice outputs are concatenated. This is a similar pattern to run_lzw.
    num_physical_cpu = (cpu_count() + 1) // 2
    num_slices = num_physical_cpu
    output_csv_filenames = [
        f"tmp_slice_{num_slices}_{slice}.csv" for slice in range(num_slices + 1)
    ]
    output_json_filenames = [
        f"tmp_slice_{num_slices}_{slice}.json" for slice in range(num_slices + 1)
    ]
    for fn in output_csv_filenames + output_json_filenames:
        if os.path.exists(fn):
            os.remove(fn)

    @run_in_subprocess
    def compute_slice(slice_idx):
        with open(output_csv_filenames[slice_idx], "w") as output_csv, \
             open(output_json_filenames[slice_idx], "w") as output_json, \
             pysam.AlignmentFile(bam_filename, "rb") as input_bam:  # noqa: E126
            for contig_idx, contig_name in enumerate(input_bam.references):
                if contig_idx % num_slices == slice_idx:
                    PipelineStepGenerateCoverageStats._process_contig(
                        input_bam, output_csv, output_json, contig_name)

    # Compute pileup for each slice
    with LongRunningCodeSection(
            "PipelineStepGenerateCoverageStats.calc_contig2coverage.mt_map"):
        mt_map(compute_slice, range(num_slices))
    # Output CSV headers
    with open(output_csv_filenames[-1], "w") as ocsv:
        ocsv.write(",".join(COVERAGE_STATS_SCHEMA))
        ocsv.write("\n")
    # Output JSON dict open paren
    with open(output_json_filenames[-1], "w") as ojson:
        ojson.write("{")
    # Collate CSV slices
    command.execute(
        command_patterns.ShellScriptCommand(
            script=r'''cat "${individual_slice_outputs[@]}" >> "${collated_csv}";''',  # note >> for appending
            named_args={
                'collated_csv': output_csv_filenames[-1],
                'individual_slice_outputs': output_csv_filenames[:-1]
            }))
    for tfn in output_csv_filenames[:-1]:
        os.remove(tfn)
    # Collate JSON slices, replacing final ", " with "}"
    command.execute(
        command_patterns.ShellScriptCommand(
            script=r'''cat "${individual_slice_outputs[@]}" | sed 's=, $=}=' >> "${collated_json}";''',  # note >> for appending
            named_args={
                'collated_json': output_json_filenames[-1],
                'individual_slice_outputs': output_json_filenames[:-1]
            }))
    for tfn in output_json_filenames[:-1]:
        os.remove(tfn)
    return (output_csv_filenames[-1], output_json_filenames[-1])

def get_accession_sequences(self, dest_dir, taxid, n=10):
    '''
    Retrieve NCBI NT references for the most-matched accession in each hitsummary2 file,
    up to a maximum of n references. Write each reference to a separate fasta file.
    '''
    if n == 0:
        return {}

    # Retrieve files
    nt_db = self.additional_attributes["nt_db"]
    nt_loc_db = s3.fetch_reference(
        self.additional_files["nt_loc_db"],
        self.ref_dir_local,
        allow_s3mi=True)

    # Choose accessions to process.
    s3_hitsummary2_files = self.additional_attributes["hitsummary2_files"].values()
    accessions = defaultdict(lambda: 0)
    # TODO: Address issue where accessions in nr can be chosen in the following code.
    # These accessions will not be found in nt_loc and will be subsequently omitted.
    for file_list in s3_hitsummary2_files:
        tally = defaultdict(lambda: 0)
        for s3_file in file_list:
            local_basename = s3_file.replace("/", "-").replace(":", "-")
            local_file = s3.fetch_from_s3(
                s3_file,
                os.path.join(self.output_dir_local, local_basename))
            if local_file is None:
                continue
            with open(local_file, 'r') as f:
                for line in f:
                    acc, species_taxid, genus_taxid, family_taxid = line.rstrip().split("\t")[3:7]
                    if any(int(hit_taxid) == taxid for hit_taxid in [species_taxid, genus_taxid, family_taxid]):
                        tally[acc] += 1
        if tally:
            best_acc, max_count = max(tally.items(), key=lambda x: x[1])
            accessions[best_acc] += max_count
    if len(accessions) > n:
        accessions = dict(sorted(accessions.items(), key=lambda x: x[1], reverse=True)[:n])
    accessions = set(accessions.keys())

    # Make map of accession to sequence file
    accession2info = dict((acc, {}) for acc in accessions)
    with open_file_db_by_extension(nt_loc_db) as nt_loc_dict:
        PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_s3(
            accession2info, nt_loc_dict, nt_db)

    # Put 1 fasta file per accession into the destination directory
    accession_fastas = {}
    for acc, info in accession2info.items():
        if 'seq_file' not in info or info['seq_file'] is None:
            log.write(f"WARNING: No sequence retrieved for {acc}")
            continue
        clean_accession = self.clean_name_for_ksnp3(acc)
        local_fasta = f"{dest_dir}/NCBI_NT_accession_{clean_accession}.fasta"
        command.execute(
            command_patterns.SingleCommand(
                cmd="ln",
                args=["-s", info['seq_file'], local_fasta]
            )
        )
        command.execute_with_output(
            command_patterns.ShellScriptCommand(
                script=r'''echo ">${acc}" | cat - "${local_fasta}" > temp_file;''',
                named_args={
                    'acc': acc,
                    'local_fasta': local_fasta
                }
            )
        )
        command.move_file('temp_file', local_fasta)

        accession_fastas[acc] = local_fasta

    # Return kept accessions and paths of their fasta files
    return accession_fastas

def fetch_from_s3(src,  # pylint: disable=dangerous-default-value
                  dst,
                  auto_unzip=DEFAULT_AUTO_UNZIP,
                  auto_untar=DEFAULT_AUTO_UNTAR,
                  allow_s3mi=DEFAULT_ALLOW_S3MI,
                  okay_if_missing=False,
                  is_reference=False,
                  touch_only=False,
                  mutex=TraceLock("fetch_from_s3", multiprocessing.RLock()),
                  locks={}):
    """Fetch a file from S3 if needed, using either s3mi or aws cp.

    IT IS NOT SAFE TO CALL THIS FUNCTION FROM MULTIPLE PROCESSES.
    It is totally fine to call it from multiple threads (it is designed for that).

    When is_reference=True, "dst" must be an existing directory.

    If src does not exist or there is a failure fetching it, the function returns None,
    without raising an exception. If the download is successful, it returns the path to
    the downloaded file or folder. If the download already exists, it is touched to
    update its timestamp.

    When touch_only=True, if the destination does not already exist, the function simply
    returns None (as if the download failed). If the destination does exist, it is touched
    as usual. This is useful in implementing an LRU cache policy.

    An exception is raised only if there is a coding error or equivalent problem,
    not if src simply doesn't exist.
    """
    # FIXME: this is a compatibility hack so we can replace this function
    # We are removing ad-hoc s3 downloads from within steps and converting
    # additional_files to wdl inputs. These files will be transparently
    # downloaded by miniwdl. miniwdl will also handle the caching that
    # is currently done here. This hack bypasses the s3 download if the
    # source is already a local file, and returns the source (which is
    # a local file path). This way, when we change the additional_files
    # to inputs we can provide the local file path to the step instead
    # of the s3 path and seamlessly transition without a coordinated
    # change between idseq-dag and the idseq monorepo.
    if not src.startswith("s3://"):
        log.write(
            f"fetch_from_s3 is skipping download because source: {src} does not start with s3://"
        )
        if not os.path.isfile(src):
            return None
        if auto_untar and src.endswith(".tar"):
            dst = src[:-4]
            if not os.path.isdir(dst):
                command.make_dirs(dst + ".untarring")
                script = 'tar xvf "${src}" -C "${tmp_destdir}"'
                named_args = {"src": src, "tmp_destdir": dst + ".untarring"}
                command.execute(
                    command_patterns.ShellScriptCommand(
                        script=script,
                        named_args=named_args))
                command.rename(dst + ".untarring/" + os.path.basename(dst), dst)
            return dst
        return src

    # Do not be misled by the multiprocessing.RLock() above -- that just means it won't deadlock
    # if called from multiple processes but does not mean the behavior will be correct. It will
    # be incorrect, because the locks dict (containing per-file locks) cannot be shared across
    # processes, the way it can be shared across threads.
    if is_reference:
        assert config["REF_DIR"], "The is_reference code path becomes available only after initializing global config['REF_DIR']"

    if os.path.exists(dst) and os.path.isdir(dst):
        dirname, basename = os.path.split(src)
        if is_reference or os.path.abspath(dst).startswith(config["REF_DIR"]):
            # Downloads to the reference dir are persisted from job to job, so we must include
            # version information from the full s3 path.
            #
            # The final destination for s3://path/to/source.db will look like /mnt/ref/s3__path__to/source.db
            # The final destination for s3://path/to/myarchive.tar will look like /mnt/ref/s3__path__to/myarchive/...
            #
            # We considered some other alternatives, for example /mnt/ref/s3__path__to__source.db, but unfortunately,
            # some tools incorporate the base name of their database input into the output filenames, so any approach
            # that changes the basename causes problems downstream. An example such tool is srst2.
            is_reference = True
            if dirname.startswith("s3://"):
                dirname = dirname.replace("s3://", "s3__", 1)
            # If dirname contains slashes, it has to be flattened to single level.
            dirname = dirname.replace("/", "__")
            dst = os.path.join(dst, dirname, basename)
        else:
            dst = os.path.join(dst, basename)
    else:
        assert not is_reference, f"When fetching references, dst must be an existing directory: {dst}"

    unzip = ""
    if auto_unzip:
        file_without_ext, ext = os.path.splitext(dst)
        if ext in ZIP_EXTENSIONS:
            unzip = " | " + ZIP_EXTENSIONS[ext]  # this command will be used to decompress stdin to stdout
            dst = file_without_ext  # remove file extension from dst
    untar = auto_untar and dst.lower().endswith(".tar")
    if untar:
        dst = dst[:-4]  # Remove .tar

    # Downloads are staged under tmp_destdir. Only after a download completes successfully it is moved to dst.
    destdir = os.path.dirname(dst)
    tmp_destdir = os.path.join(destdir, "tmp_downloads")
    tmp_dst = os.path.join(tmp_destdir, os.path.basename(dst))

    abspath = os.path.abspath(dst)
    with mutex:
        if abspath not in locks:
            locks[abspath] = TraceLock(f"fetch_from_s3: {abspath}", multiprocessing.RLock())
        destination_lock = locks[abspath]

    # shouldn't happen and makes it impossible to ensure that any dst that exists is complete and correct.
    assert tmp_dst != dst, f"Problematic use of fetch_from_s3 with tmp_dst==dst=='{dst}'"

    with destination_lock:
        # This check is a bit imperfect when untarring... unless you follow the discipline that
        # all contents of file foo.tar are under directory foo/... (which we do follow in IDseq)
        if os.path.exists(dst):
            command.touch(dst)
            return dst

        if touch_only:
            return None

        for (kind, ddir) in [("destination", destdir), ("temporary download", tmp_destdir)]:
            try:
                if ddir:
                    command.make_dirs(ddir)
            except OSError as e:
                # It's okay if the parent directory already exists, but all other
                # errors fail the download.
                if e.errno != errno.EEXIST:
                    log.write(f"Error in creating {kind} directory.")
                    return None

        with IOSTREAM:
            try:
                if allow_s3mi:
                    wait_start = time.time()
                    allow_s3mi = S3MI_SEM.acquire(timeout=MAX_S3MI_WAIT)
                    wait_duration = time.time() - wait_start
                    if not allow_s3mi:
                        log.write(
                            f"Failed to acquire S3MI semaphore after waiting {wait_duration} seconds for {src}."
                        )
                    elif wait_duration >= 5:
                        log.write(
                            f"Waited {wait_duration} seconds to acquire S3MI semaphore for {src}."
                        )
                if untar:
                    write_dst = r''' | tar xvf - -C "${tmp_destdir}";'''
                    named_args = {'tmp_destdir': tmp_destdir}
                else:
                    write_dst = r''' > "${tmp_dst}";'''
                    named_args = {'tmp_dst': tmp_dst}
                command_params = f"{unzip} {write_dst}"

                named_args.update({'src': src})

                try_cli = not allow_s3mi
                if allow_s3mi:
                    if os.path.exists(tmp_dst):
                        command.remove_rf(tmp_dst)
                    try:
                        command.execute(
                            command_patterns.ShellScriptCommand(
                                script=r'set -o pipefail; s3mi cat --quiet "${src}" ' + command_params,
                                named_args=named_args))
                    except subprocess.CalledProcessError:
                        try_cli = not okay_if_missing
                        allow_s3mi = False
                        S3MI_SEM.release()
                        if try_cli:
                            log.write(
                                "Failed to download with s3mi. Trying with aws s3 cp..."
                            )
                        else:
                            raise
                if try_cli:
                    if os.path.exists(tmp_dst):
                        command.remove_rf(tmp_dst)
                    if okay_if_missing:
                        script = r'set -o pipefail; aws s3 cp --quiet "${src}" - ' + command_params
                    else:
                        script = r'set -o pipefail; aws s3 cp --only-show-errors "${src}" - ' + command_params
                    command.execute(
                        command_patterns.ShellScriptCommand(
                            script=script,
                            named_args=named_args,
                            env=dict(os.environ, **refreshed_credentials())))
                # Move staged download into final location. Leave this last, so it only happens if no exception has occurred.
                # By this point we have already asserted that tmp_dst != dst.
                command.rename(tmp_dst, dst)
                return dst
            except BaseException as e:
                # Deliberately super broad to make doubly certain that dst will be
                # removed if there has been any exception
                if os.path.exists(dst):
                    command.remove_rf(dst)
                if not isinstance(e, subprocess.CalledProcessError):
                    # Coding error of some sort. Best not hide it.
                    raise
                if okay_if_missing:
                    # We presume.
                    log.write("File most likely does not exist in S3.")
                else:
                    log.write("Failed to fetch file from S3.")
                return None
            finally:
                if allow_s3mi:
                    S3MI_SEM.release()
                if os.path.exists(tmp_dst):
                    # by this point we have asserted that tmp_dst != dst (and that assert may have failed, but so be it)
                    command.remove_rf(tmp_dst)

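# A minimal usage sketch for fetch_from_s3. The bucket and paths are
# hypothetical; the DEFAULT_* flag values and config["REF_DIR"] behavior come
# from this module as shown above (config["REF_DIR"] must be initialized
# before using is_reference=True).
def _example_fetch_reference(ref_dir="/mnt/idseq/ref"):
    # Fetch a .tar reference into the reference directory; with auto_untar=True
    # the returned path is the untarred directory, versioned by its s3 path.
    return fetch_from_s3(
        "s3://example-bucket/references/host_genome.tar",
        ref_dir,
        auto_untar=True,
        allow_s3mi=True,
        is_reference=True)
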
def run(self):
    output_files = self.output_files_local()
    local_taxon_fasta_files = [f for input_item in self.input_files_local for f in input_item]
    taxid = self.additional_attributes["taxid"]
    # Note: will only produce a result if species-level or below
    reference_taxids = self.additional_attributes.get("reference_taxids", [taxid])
    # During phylo tree creation, if the taxon is in an unknown superkingdom then the k selected from k_config is supposed to be from the key None.
    superkingdom_name = self.additional_attributes.get("superkingdom_name") if self.additional_attributes.get("superkingdom_name") != '' else None

    # knsp3 has a command (MakeKSNP3infile) for making a ksnp3-compatible input file from a directory of fasta files.
    # Before we can use the command, we symlink all fasta files to a dedicated directory.
    # The command makes certain unreasonable assumptions:
    # - current directory is parent directory of the fasta file directory
    # - file names do not have dots except before extension (also no spaces)
    # - file names cannot be too long (for kSNP3 tree building).
    input_dir_for_ksnp3 = f"{self.output_dir_local}/inputs_for_ksnp3"
    command.make_dirs(input_dir_for_ksnp3)
    for local_file in local_taxon_fasta_files:
        command.execute(
            command_patterns.SingleCommand(
                cmd="ln",
                args=[
                    "-s",
                    local_file,
                    os.path.join(input_dir_for_ksnp3, os.path.basename(local_file))
                ]
            )
        )

    # Retrieve Genbank references (full assembled genomes).
    genbank_fastas = self.get_genbank_genomes(reference_taxids, input_dir_for_ksnp3, superkingdom_name, 0)

    # Retrieve NCBI NT references for the accessions in the alignment viz files.
    # These are the accessions (not necessarily full genomes) that were actually matched
    # by the sample's reads during GSNAP alignment.
    accession_fastas = self.get_accession_sequences(input_dir_for_ksnp3, taxid, 10)

    # Retrieve NCBI metadata for the accessions
    metadata_by_node = self.get_metadata_by_tree_node({**accession_fastas, **genbank_fastas})
    metadata_output = output_files[1]
    with open(metadata_output, 'w') as f:
        json.dump(metadata_by_node, f)

    # Run MakeKSNP3infile.
    ksnp3_input_file = f"{self.output_dir_local}/inputs.txt"
    command.execute(
        command_patterns.SingleCommand(
            cd=self.output_dir_local,
            cmd='MakeKSNP3infile',
            args=[
                os.path.basename(input_dir_for_ksnp3),
                ksnp3_input_file,
                "A"
            ]
        )
    )

    # Specify the names of finished reference genomes.
    # Used for annotation & variant-calling.
    annotated_genome_input = f"{self.output_dir_local}/annotated_genomes"
    reference_fasta_files = list(genbank_fastas.values()) + list(accession_fastas.values())
    if reference_fasta_files:
        grep_options = (("-e", path) for path in reference_fasta_files)
        grep_options = list(itertools.chain.from_iterable(grep_options))  # flatmap
        command.execute(
            command_patterns.ShellScriptCommand(
                script=r'''grep "${grep_options[@]}" "${ksnp3_input_file}" | cut -f2 > "${annotated_genome_input}";''',
                named_args={
                    'ksnp3_input_file': ksnp3_input_file,
                    'annotated_genome_input': annotated_genome_input,
                    'grep_options': grep_options
                }
            )
        )

    # Now build ksnp3 command:
    k_config = {
        # All entries to be revisited and benchmarked.
        # Values for viruses and bacteria come from kSNP3 recommendations (13-15 / 19-21).
        "Viruses": 13,
        "Bacteria": 19,
        "Eukaryota": 19,
        None: 13
    }
    k = k_config[superkingdom_name]
    ksnp_output_dir = f"{self.output_dir_local}/ksnp3_outputs"
    command.make_dirs(ksnp_output_dir)
    ksnp_cd = os.path.dirname(ksnp_output_dir)
    ksnp_cmd = "kSNP3"
    ksnap_args = [
        "-in", "inputs.txt",
        "-outdir", os.path.basename(ksnp_output_dir),
        "-k", k
    ]

    # Annotate SNPs using reference genomes:
    # TODO: fix gi vs accession problem
    if os.path.isfile(annotated_genome_input):
        ksnap_args.extend([
            "-annotate",
            os.path.basename(annotated_genome_input)
        ])
        snps_all_annotated = f"{ksnp_output_dir}/SNPs_all_annotated"
        if os.path.isfile(snps_all_annotated):
            self.additional_output_files_hidden.append(snps_all_annotated)

    # Produce VCF file with respect to first reference genome in annotated_genome_input:
    if os.path.isfile(annotated_genome_input):
        ksnap_args.append("-vcf")

    # Run ksnp3 command:
    command.execute(
        command_patterns.SingleCommand(
            cd=ksnp_cd,
            cmd=ksnp_cmd,
            args=ksnap_args
        )
    )

    # Postprocess output names in preparation for upload:
    command.move_file(os.path.join(ksnp_output_dir, "tree.parsimony.tre"), output_files[0])
    ksnp_vcf_file = glob.glob(f"{ksnp_output_dir}/*.vcf")
    if ksnp_vcf_file:
        target_vcf_file = f"{ksnp_output_dir}/variants_reference1.vcf"
        self.name_samples_vcf(ksnp_vcf_file[0], target_vcf_file)
        self.additional_output_files_hidden.append(target_vcf_file)

    # Upload all kSNP3 output files for potential future reference
    supplementary_files = [f for f in glob.glob(f"{ksnp_output_dir}/*")
                           if os.path.isfile(f) and f not in self.additional_output_files_hidden]
    self.additional_output_files_hidden.extend(supplementary_files)