def test_move_file_to_folder(self):
    '''WHEN move_file is invoked with a file path and a folder name, THEN it moves the file into the destination folder (the source path no longer exists)'''
    command.move_file(TMP_SOURCE_FILE_PATH, TMP_DEST_FOLDER)

    # The file keeps its original name inside the destination folder.
    expected_path = os.path.join(TMP_DEST_FOLDER, TMP_FILE_NAME)
    source_still_there = os.path.exists(TMP_SOURCE_FILE_PATH)
    self.assertTrue(os.path.exists(expected_path), f"file {expected_path} doesn't exist")
    self.assertFalse(source_still_there, f"file {TMP_SOURCE_FILE_PATH} shouldn't exist")
def test_move_file_with_different_name(self):
    '''WHEN move_file is invoked with a file path and a full path with a different file name, THEN it moves the file to that path under the new file name (the source path no longer exists)'''
    # Destination is a full file path, not just a folder.
    renamed_target = os.path.join(TMP_DEST_FOLDER, TMP_FILE_NAME + ".new")

    command.move_file(TMP_SOURCE_FILE_PATH, renamed_target)

    source_still_there = os.path.exists(TMP_SOURCE_FILE_PATH)
    self.assertTrue(os.path.exists(renamed_target), f"file {renamed_target} doesn't exist")
    self.assertFalse(source_still_there, f"file {TMP_SOURCE_FILE_PATH} shouldn't exist")
def trim_adapters_in_place(local_file):
    """Run cutadapt on ``local_file`` and replace it with the trimmed output.

    cutadapt writes to a sibling file named ``trimmed_<basename>`` in the same
    directory; that file is then moved over ``local_file``.
    """
    parent_dir, base_name = os.path.split(local_file)
    trimmed_path = os.path.join(parent_dir, "trimmed_" + base_name)

    cutadapt_args = [
        "-a", "AGATCGGAAGAGCACACGTCT",  # adapter sequence handed to cutadapt
        "-o", trimmed_path,
        local_file,
    ]
    cutadapt = command_patterns.SingleCommand(cmd='cutadapt', args=cutadapt_args)
    command.execute(cutadapt)

    # Overwrite the input with the trimmed reads.
    command.move_file(trimmed_path, local_file)
def assemble(
        input_fasta,
        input_fasta2,
        bowtie_fasta,  # fasta file for running bowtie against contigs
        duplicate_cluster_sizes_path,
        assembled_contig,
        assembled_scaffold,
        bowtie_sam,
        contig_stats,
        read2contig,
        memory=100):
    """Assemble reads with spades.py and map the reads back to the contigs.

    spades.py is run in a ``spades`` scratch directory created next to
    ``assembled_contig``. On success its ``contigs.fasta`` and
    ``scaffolds.fasta`` are moved to ``assembled_contig`` and
    ``assembled_scaffold``, and
    ``PipelineStepRunAssembly.generate_read_to_contig_mapping`` is invoked.

    If any of those steps raises, the failure is not propagated: placeholder
    contents are written to ``assembled_contig``, ``assembled_scaffold``,
    ``bowtie_sam`` and ``contig_stats`` and the traceback is printed.

    Args:
        input_fasta: reads file (passed as ``-1`` when paired, ``-s`` otherwise).
        input_fasta2: second reads file; falsy for single-end input.
        bowtie_fasta: fasta file for running bowtie against contigs.
        duplicate_cluster_sizes_path: forwarded to the read-to-contig mapping.
        assembled_contig: destination path for the contigs fasta.
        assembled_scaffold: destination path for the scaffolds fasta.
        bowtie_sam: path forwarded to the mapping; gets a placeholder on failure.
        contig_stats: path forwarded to the mapping; gets ``{}`` on failure.
        read2contig: path forwarded to the read-to-contig mapping.
        memory: value passed to spades.py ``-m``.
    """
    basedir = os.path.dirname(assembled_contig)
    assembled_dir = os.path.join(basedir, 'spades')
    command.make_dirs(assembled_dir)
    assembled_contig_tmp = os.path.join(assembled_dir, 'contigs.fasta')
    assembled_scaffold_tmp = os.path.join(assembled_dir, 'scaffolds.fasta')

    # Paired-end and single-end runs differ only in how the inputs are named.
    if input_fasta2:
        input_args = ["-1", input_fasta, "-2", input_fasta2]
    else:
        input_args = ["-s", input_fasta]
    spades_args = input_args + [
        "-o", assembled_dir,
        "-m", memory,
        "-t", 32,
        "--only-assembler"
    ]

    try:
        command.execute(
            command_patterns.SingleCommand(cmd="spades.py", args=spades_args))
        command.move_file(assembled_contig_tmp, assembled_contig)
        command.move_file(assembled_scaffold_tmp, assembled_scaffold)

        PipelineStepRunAssembly.generate_read_to_contig_mapping(
            assembled_contig, bowtie_fasta, read2contig,
            duplicate_cluster_sizes_path, bowtie_sam, contig_stats)
    except Exception:
        # Assembly failed. create dummy output files.
        # Only Exception is caught so that KeyboardInterrupt / SystemExit
        # still propagate instead of being reported as a failed assembly.
        command.write_text_to_file(';ASSEMBLY FAILED', assembled_contig)
        command.write_text_to_file(';ASSEMBLY FAILED', assembled_scaffold)
        command.write_text_to_file('@NO INFO', bowtie_sam)
        command.write_text_to_file('{}', contig_stats)
        traceback.print_exc()
    finally:
        # Always drop the scratch directory, whatever the outcome.
        command.remove_rf(assembled_dir)
def run(self):
    """Run STAR to filter out host reads.

    STAR is run once per partition of the genome index (the count is read
    from ``parts.txt`` in the fetched genome directory). The unmapped reads of
    each partition are the input of the next one, and the final unmapped
    files are sorted and moved to ``self.output_files_local()``.

    Side outputs, when present and configured in ``additional_attributes``:
    the part-0 gene count table (``output_gene_file``) and the part-0 STAR
    log (``output_log_file``). Insert size metrics are collected only when
    ``self.collect_insert_size_metrics_for`` is set and the index has exactly
    one part.

    Raises:
        AssertionError: if ``parts.txt`` is missing, or if the BAM file needed
            for insert size metrics was not produced.
        BrokenReadPairError: if ``sync_pairs`` reports the unmapped pairs as
            too discrepant.
    """
    # Setup
    if self.sequence_input_files is not None and self.validated_input_counts_file is not None:
        validated_input_counts_file = self.validated_input_counts_file
        input_files = self.sequence_input_files
    else:
        validated_input_counts_file = self.input_files_local[0][0]
        input_files = self.input_files_local[0][1:3]

    num_inputs = len(input_files)
    scratch_dir = os.path.join(self.output_dir_local, "scratch_star")

    output_files_local = self.output_files_local()
    output_gene_file = self.additional_attributes.get("output_gene_file")
    output_log_file = self.additional_attributes.get("output_log_file")

    genome_dir = s3.fetch_reference(
        self.additional_files["star_genome"],
        self.ref_dir_local,
        allow_s3mi=True,
        auto_untar=True)

    # Check parts file for the number of partitioned indexes
    parts_file = os.path.join(genome_dir, "parts.txt")
    assert os.path.isfile(parts_file), \
        f"Expected parts file {parts_file} but it was not found"
    with open(parts_file, 'rb') as parts_f:
        num_parts = int(parts_f.read())

    # Don't compute insert size metrics if the STAR index has more than one part
    # Logic for combining BAM output from STAR or insert size metrics not implemented
    if self.collect_insert_size_metrics_for and num_parts != 1:
        log.write("Insert size metrics were expected to be collected for sample but were not because the STAR index has more than one part")
        self.collect_insert_size_metrics_for = None

    # Run STAR on each partition and save the unmapped read info
    unmapped = input_files

    with open(validated_input_counts_file) as validated_input_counts_f:
        validated_input_counts = json.load(validated_input_counts_f)

    use_starlong = validated_input_counts[vc.BUCKET_LONG] > 1 or \
        validated_input_counts[vc.BUCKET_TOO_LONG] > 1

    for part_idx in range(num_parts):
        tmp = f"{scratch_dir}/star-part-{part_idx}"
        genome_part = f"{genome_dir}/part-{part_idx}"
        count_genes = part_idx == 0
        self.run_star_part(tmp, genome_part, unmapped, count_genes, use_starlong)

        # The reads left unmapped by this part feed the next part.
        unmapped, too_discrepant = PipelineStepRunStar.sync_pairs(
            PipelineStepRunStar.unmapped_files_in(tmp, num_inputs))

        if too_discrepant:
            raise BrokenReadPairError("Broken pairs")

        # Run part 0 in gene-counting mode:
        # (a) ERCCs are doped into part 0 and we want their counts.
        # (b) If there is only 1 part (e.g. human), the host gene counts also
        # make sense.
        if part_idx == 0:
            gene_count_file = os.path.join(tmp, "ReadsPerGene.out.tab")
            if os.path.isfile(gene_count_file) and output_gene_file:
                moved = os.path.join(self.output_dir_local, output_gene_file)
                command.move_file(gene_count_file, moved)
                self.additional_output_files_hidden.append(moved)
            log_file = os.path.join(tmp, "Log.final.out")
            if os.path.isfile(log_file) and output_log_file:
                moved = os.path.join(self.output_dir_local, output_log_file)
                command.move_file(log_file, moved)

        # STAR names the output BAM file Aligned.out.bam without TranscriptomeSAM and
        # Aligned.toTranscriptome.out.bam with TranscriptomeSAM, this doesn't
        # appear to be configurable
        is_dna = self.collect_insert_size_metrics_for == "dna"
        bam_filename = "Aligned.out.bam" if is_dna else "Aligned.toTranscriptome.out.bam"
        if self.collect_insert_size_metrics_for:
            bam_path = os.path.join(tmp, bam_filename)

            # If this file wasn't generated but self.collect_insert_size_metrics_for has a value
            # something unexpected has gone wrong
            assert os.path.isfile(bam_path), \
                f"Expected STAR to generate {bam_filename} but it was not found"
            try:
                self.collect_insert_size_metrics(tmp, bam_path, self.output_metrics_file, self.output_histogram_file)
                if os.path.exists(self.output_metrics_file):
                    self.additional_output_files_visible.append(self.output_metrics_file)
                else:
                    message = "expected picard to generate a metrics file but none was found"
                    log.write(message=message, warning=True)
                if os.path.exists(self.output_histogram_file):
                    self.additional_output_files_visible.append(self.output_histogram_file)
                else:
                    message = "expected picard to generate a histogram file but none was found"
                    log.write(message=message, warning=True)
            except Exception as e:
                # Metrics are best-effort: log the failure and carry on.
                log.write(message=f"encountered error while running picard: {type(e).__name__}: {e}", warning=True)

    # Sort unmapped files for deterministic output
    for unmapped_file in unmapped:
        sort_fastx_by_entry_id(unmapped_file)
    # Cleanup
    for src, dst in zip(unmapped, output_files_local):
        command.move_file(src, dst)  # Move out of scratch dir
    command.remove_rf(f"{scratch_dir}/*")
def run(self):
    """Build a phylogenetic tree with kSNP3 from the input fasta files.

    Steps, in order:
      1. Symlink every input fasta into a dedicated kSNP3 input directory.
      2. Add reference sequences (Genbank genomes and NCBI NT accessions) to
         that directory and write their metadata as JSON to ``output_files[1]``.
      3. Run ``MakeKSNP3infile`` and, if there are reference fastas, write the
         list of annotated genomes.
      4. Run ``kSNP3`` and move ``tree.parsimony.tre`` to ``output_files[0]``.
      5. Register the remaining kSNP3 output files as hidden additional outputs.
    """
    output_files = self.output_files_local()
    local_taxon_fasta_files = [f for input_item in self.input_files_local for f in input_item]
    taxid = self.additional_attributes["taxid"]
    reference_taxids = self.additional_attributes.get("reference_taxids", [taxid])  # Note: will only produce a result if species-level or below
    # During phylo tree creation, if the taxon is in an unknown superkingdom then the k selected from k_config is supposed to be from the key None.
    superkingdom_name = self.additional_attributes.get("superkingdom_name")
    if superkingdom_name == '':
        superkingdom_name = None

    # knsp3 has a command (MakeKSNP3infile) for making a ksnp3-compatible input file from a directory of fasta files.
    # Before we can use the command, we symlink all fasta files to a dedicated directory.
    # The command makes certain unreasonable assumptions:
    # - current directory is parent directory of the fasta file directory
    # - file names do not have dots except before extension (also no spaces)
    # - file names cannot be too long (for kSNP3 tree building).
    input_dir_for_ksnp3 = f"{self.output_dir_local}/inputs_for_ksnp3"
    command.make_dirs(input_dir_for_ksnp3)
    for local_file in local_taxon_fasta_files:
        command.execute(
            command_patterns.SingleCommand(
                cmd="ln",
                args=[
                    "-s",
                    local_file,
                    os.path.join(input_dir_for_ksnp3, os.path.basename(local_file))
                ]
            )
        )

    # Retrieve Genbank references (full assembled genomes).
    genbank_fastas = self.get_genbank_genomes(reference_taxids, input_dir_for_ksnp3, superkingdom_name, 0)

    # Retrieve NCBI NT references for the accessions in the alignment viz files.
    # These are the accessions (not necessarily full genomes) that were actually matched
    # by the sample's reads during GSNAP alignment.
    accession_fastas = self.get_accession_sequences(input_dir_for_ksnp3, taxid, 10)

    # Retrieve NCBI metadata for the accessions
    metadata_by_node = self.get_metadata_by_tree_node({**accession_fastas, **genbank_fastas})
    metadata_output = output_files[1]
    with open(metadata_output, 'w') as f:
        json.dump(metadata_by_node, f)

    # Run MakeKSNP3infile.
    ksnp3_input_file = f"{self.output_dir_local}/inputs.txt"
    command.execute(
        command_patterns.SingleCommand(
            cd=self.output_dir_local,
            cmd='MakeKSNP3infile',
            args=[
                os.path.basename(input_dir_for_ksnp3),
                ksnp3_input_file,
                "A"
            ]
        )
    )

    # Specify the names of finished reference genomes.
    # Used for annotation & variant-calling.
    annotated_genome_input = f"{self.output_dir_local}/annotated_genomes"
    reference_fasta_files = list(genbank_fastas.values()) + list(accession_fastas.values())
    if reference_fasta_files:
        grep_options = (("-e", path) for path in reference_fasta_files)
        grep_options = list(itertools.chain.from_iterable(grep_options))  # flatmap
        command.execute(
            command_patterns.ShellScriptCommand(
                script=r'''grep "${grep_options[@]}" "${ksnp3_input_file}" | cut -f2 > "${annotated_genome_input}";''',
                named_args={
                    'ksnp3_input_file': ksnp3_input_file,
                    'annotated_genome_input': annotated_genome_input,
                    'grep_options': grep_options
                }
            )
        )

    # Now build ksnp3 command:
    k_config = {
        # All entries to be revisited and benchmarked.
        # Values for viruses and bacteria come from kSNP3 recommendations (13-15 / 19-21).
        "Viruses": 13,
        "Bacteria": 19,
        "Eukaryota": 19,
        None: 13
    }
    # Any superkingdom without its own entry uses the None entry, as the
    # comment above requires, rather than raising KeyError.
    k = k_config.get(superkingdom_name, k_config[None])
    ksnp_output_dir = f"{self.output_dir_local}/ksnp3_outputs"
    command.make_dirs(ksnp_output_dir)
    ksnp_cd = os.path.dirname(ksnp_output_dir)
    ksnp_cmd = "kSNP3"
    ksnap_args = [
        "-in",
        "inputs.txt",
        "-outdir",
        os.path.basename(ksnp_output_dir),
        "-k",
        k
    ]
    # Annotate SNPs using reference genomes:
    # TODO: fix gi vs accession problem
    if os.path.isfile(annotated_genome_input):
        ksnap_args.extend([
            "-annotate",
            os.path.basename(annotated_genome_input)
        ])
        # NOTE(review): this existence check runs before kSNP3 is invoked
        # below, so a file found here would predate this run - confirm intent.
        # The catch-all at the end registers every output file regardless.
        snps_all_annotated = f"{ksnp_output_dir}/SNPs_all_annotated"
        if os.path.isfile(snps_all_annotated):
            self.additional_output_files_hidden.append(snps_all_annotated)
    # Produce VCF file with respect to first reference genome in annotated_genome_input:
    if os.path.isfile(annotated_genome_input):
        ksnap_args.append("-vcf")
    # Run ksnp3 command:
    command.execute(
        command_patterns.SingleCommand(
            cd=ksnp_cd,
            cmd=ksnp_cmd,
            args=ksnap_args
        )
    )

    # Postprocess output names in preparation for upload:
    command.move_file(os.path.join(ksnp_output_dir, "tree.parsimony.tre"), output_files[0])
    ksnp_vcf_file = glob.glob(f"{ksnp_output_dir}/*.vcf")
    if ksnp_vcf_file:
        target_vcf_file = f"{ksnp_output_dir}/variants_reference1.vcf"
        self.name_samples_vcf(ksnp_vcf_file[0], target_vcf_file)
        self.additional_output_files_hidden.append(target_vcf_file)

    # Upload all kSNP3 output files for potential future reference
    supplementary_files = [f for f in glob.glob(f"{ksnp_output_dir}/*") if os.path.isfile(f) and
                           f not in self.additional_output_files_hidden]
    self.additional_output_files_hidden.extend(supplementary_files)
def get_accession_sequences(self, dest_dir, taxid, n=10):
    '''
    Retrieve NCBI NT references for the most-matched accession in each hitsummary2 file, up to a maximum of n references.
    Write each reference to a separate fasta file.

    Args:
        dest_dir: directory that receives one ``NCBI_NT_accession_<name>.fasta`` per kept accession.
        taxid: taxon id compared (after ``int()`` conversion of the file values) against the
            species, genus and family taxids of every hit.
        n: maximum number of accessions to keep; ``0`` short-circuits to an empty result.

    Returns:
        dict mapping each kept accession to the path of its fasta file. Accessions for
        which no sequence could be retrieved are logged and left out.
    '''
    if n == 0:
        return {}

    # Retrieve files
    nt_db = self.additional_attributes["nt_db"]
    nt_loc_db = s3.fetch_reference(
        self.additional_files["nt_loc_db"],
        self.ref_dir_local,
        allow_s3mi=True)

    # Choose accessions to process.
    s3_hitsummary2_files = self.additional_attributes["hitsummary2_files"].values()
    accessions = defaultdict(int)
    # TODO: Address issue where accessions in nr can be chosen in the following code.
    # These accessions will not be found in nt_loc and will be subsequently omitted.
    for file_list in s3_hitsummary2_files:
        tally = defaultdict(int)
        for s3_file in file_list:
            local_basename = s3_file.replace("/", "-").replace(":", "-")
            local_file = s3.fetch_from_s3(
                s3_file,
                os.path.join(self.output_dir_local, local_basename))
            if local_file is None:
                continue
            with open(local_file, 'r') as f:
                for line in f:
                    # Tab-separated columns 3..6: accession, species, genus, family taxid.
                    acc, species_taxid, genus_taxid, family_taxid = line.rstrip().split("\t")[3:7]
                    if any(int(hit_taxid) == taxid for hit_taxid in [species_taxid, genus_taxid, family_taxid]):
                        tally[acc] += 1
        if tally:
            # Only the single best accession of each file list is carried forward.
            best_acc, max_count = max(tally.items(), key=lambda x: x[1])
            accessions[best_acc] += max_count
    if len(accessions) > n:
        accessions = dict(sorted(accessions.items(), key=lambda x: x[1], reverse=True)[:n])
    accessions = set(accessions.keys())

    # Make map of accession to sequence file
    accession2info = {acc: {} for acc in accessions}
    with open_file_db_by_extension(nt_loc_db) as nt_loc_dict:
        PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_s3(
            accession2info, nt_loc_dict, nt_db)

    # Put 1 fasta file per accession into the destination directory
    accession_fastas = {}
    for acc, info in accession2info.items():
        if 'seq_file' not in info or info['seq_file'] is None:
            log.write(f"WARNING: No sequence retrieved for {acc}")
            continue
        clean_accession = self.clean_name_for_ksnp3(acc)
        local_fasta = f"{dest_dir}/NCBI_NT_accession_{clean_accession}.fasta"
        command.execute(
            command_patterns.SingleCommand(
                cmd="ln",
                args=[
                    "-s",
                    info['seq_file'],
                    local_fasta
                ]
            )
        )
        # Prepend a ">accession" header line. The intermediate file sits next
        # to its target rather than under a fixed name in the working
        # directory, so it cannot collide with unrelated work.
        temp_fasta = f"{local_fasta}.tmp"
        command.execute_with_output(
            command_patterns.ShellScriptCommand(
                script=r'''echo ">${acc}" | cat - "${local_fasta}" > "${temp_fasta}";''',
                named_args={
                    'acc': acc,
                    'local_fasta': local_fasta,
                    'temp_fasta': temp_fasta
                }
            )
        )
        command.move_file(temp_fasta, local_fasta)

        accession_fastas[acc] = local_fasta

    # Return kept accessions and paths of their fasta files
    return accession_fastas
def generate_lzw_filtered(self, fasta_files, output_files, cutoff_scores, threshold_readlength):
    """Filter reads by LZW score and write the surviving reads to ``output_files``.

    Scores are computed once by ``PipelineStepRunLZW.lzw_compute``. Each read
    is then written to the bin of the first (highest) cutoff its score
    exceeds. The highest cutoff whose bin is non-empty wins, and its files are
    moved to ``output_files``.

    If no read exceeds any cutoff, ``self.input_file_error`` and
    ``self.status`` are set to mark the input as invalid and nothing is moved.

    Args:
        fasta_files: input fasta files, iterated in lockstep.
        output_files: destination paths, one per input file.
        cutoff_scores: candidate cutoffs; sorted in place from high to low.
        threshold_readlength: forwarded to ``lzw_compute``.

    Raises:
        AssertionError: if ``fasta_files`` and ``output_files`` differ in length.
    """
    assert len(fasta_files) == len(output_files)

    cutoff_scores.sort(reverse=True)  # Make sure cutoff is from high to low

    # This is the bulk of the computation. Everything else below is just binning by cutoff score.
    coalesced_score_file = PipelineStepRunLZW.lzw_compute(
        fasta_files, threshold_readlength, cutoff_scores[0])

    def score_iterator(score_file: str) -> Iterator[float]:
        with open(score_file, "r") as sf:
            for line in sf:
                yield float(line)

    readcount_list = []  # one item per cutoff
    outstreams_list = []  # one item per cutoff
    outfiles_list = []  # one item per cutoff
    total_reads = 0

    try:
        for cutoff in cutoff_scores:
            readcount_list.append(0)
            outstreams = []
            outfiles = []
            # Register the lists before opening anything so that streams
            # opened so far are still closed if a later open() fails.
            outstreams_list.append(outstreams)
            outfiles_list.append(outfiles)
            for f in output_files:
                outfile_name = "%s-%f" % (f, cutoff)
                outfiles.append(outfile_name)
                outstreams.append(open(outfile_name, 'w'))

        outstreams_for_cutoff = list(zip(outstreams_list, cutoff_scores))

        for reads, score in zip(fasta.synchronized_iterator(fasta_files), score_iterator(coalesced_score_file)):
            total_reads += 1
            for i, (outstreams, cutoff) in enumerate(outstreams_for_cutoff):
                if score > cutoff:
                    readcount_list[i] += 1
                    for ostr, r in zip(outstreams, reads):
                        ostr.write(r.header + "\n")
                        ostr.write(r.sequence + "\n")
                    # A read lands only in the highest bin it qualifies for.
                    break
    finally:
        # closing all the streams, even if opening or binning failed
        for outstreams in outstreams_list:
            for ostr in outstreams:
                ostr.close()

    os.remove(coalesced_score_file)

    # get the right output file and metrics
    kept_count = 0
    filtered = total_reads
    cutoff_frac = None
    for cutoff_frac, readcount, outfiles in zip(cutoff_scores, readcount_list, outfiles_list):
        if readcount > 0:
            # found the right bin
            kept_count = readcount
            filtered = total_reads - kept_count
            # move the output files over
            for outfile, output_file in zip(outfiles, output_files):
                command.move_file(outfile, output_file)
            break

    if kept_count == 0:
        self.input_file_error = InputFileErrors.INSUFFICIENT_READS
        self.status = StepStatus.INVALID_INPUT
        return

    kept_ratio = float(kept_count) / float(total_reads)
    msg = "LZW filter: cutoff_frac: %f, total reads: %d, filtered reads: %d, " \
          "kept ratio: %f" % (cutoff_frac, total_reads, filtered, kept_ratio)
    log.write(msg)