def get_accession_sequences(self, dest_dir, n=10):
    '''
    Retrieve NCBI NT references for the most-matched accession in each
    alignment viz file, up to a maximum of n references.
    Write each reference to a separate fasta file.

    Returns a dict mapping each kept accession to the path of its fasta file.
    '''
    if n == 0:
        return {}

    # Retrieve files: the NT database location index and the per-taxon
    # align_viz JSON files produced upstream.
    nt_db = self.additional_attributes["nt_db"]
    nt_loc_db = s3.fetch_from_s3(
        self.additional_files["nt_loc_db"],
        self.ref_dir_local,
        allow_s3mi=True)
    s3_align_viz_files = self.additional_attributes["align_viz_files"].values()
    local_align_viz_files = []
    for s3_file in s3_align_viz_files:
        local_basename = s3_file.replace("/", "-").replace(":", "-")  # needs to be unique locally
        local_file = s3.fetch_from_s3(
            s3_file,
            os.path.join(self.ref_dir_local, local_basename))
        if local_file is not None:
            local_align_viz_files.append(local_file)

    # Choose accessions to process: the single most-read-covered accession
    # from each viz file.  align_viz files are a bit brittle, so we just log
    # exceptions rather than failing the job.
    accessions = set()
    for local_file in local_align_viz_files:
        try:
            with open(local_file, 'r') as f:
                align_viz_dict = json.load(f)
            most_matched_accession = None
            max_num_reads = 0
            flat_align_viz_dict = {}
            self.parse_tree(align_viz_dict, flat_align_viz_dict)
            for acc, info in flat_align_viz_dict.items():
                num_reads = info["coverage_summary"]["num_reads"]
                if num_reads > max_num_reads:
                    max_num_reads = num_reads
                    most_matched_accession = acc
            # Guard against empty viz files: never add a None "accession",
            # which would later yield a bogus NCBI_NT_accession_None.fasta.
            if most_matched_accession is not None:
                accessions.add(most_matched_accession)
            if len(accessions) >= n:
                break
        except Exception:
            # Narrow from bare except: keep best-effort behavior but let
            # KeyboardInterrupt/SystemExit propagate.
            log.write(f"Warning: couldn't get accession from {local_file}!")
            traceback.print_exc()
    if len(accessions) > n:
        accessions = set(list(accessions)[0:n])

    # Make map of accession to sequence file.
    accession2info = dict((acc, {}) for acc in accessions)
    nt_loc_dict = shelve.open(nt_loc_db.replace(".db", ""))
    try:
        PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_s3(
            accession2info, nt_loc_dict, nt_db)
    finally:
        # Always release the shelve handle, even if the S3 fetch fails.
        nt_loc_dict.close()

    # Put 1 fasta file per accession into the destination directory,
    # prepending a '>accession' header line to each.
    accession_fastas = {}
    for acc, info in accession2info.items():
        clean_accession = self.clean_name_for_ksnp3(acc)
        local_fasta = f"{dest_dir}/NCBI_NT_accession_{clean_accession}.fasta"
        command.execute(f"ln -s {info['seq_file']} {local_fasta}")
        command.execute(f"echo '>{acc}' | cat - {local_fasta} > temp_file && mv temp_file {local_fasta}")
        accession_fastas[acc] = local_fasta

    # Return kept accessions and paths of their fasta files.
    return accession_fastas
def get_accession_sequences(self, dest_dir, taxid, n=10):
    '''
    Retrieve NCBI NT references for the most-matched accession in each
    hitsummary2 file, up to a maximum of n references.
    Write each reference to a separate fasta file.

    Returns a dict mapping each kept accession to the path of its fasta file.
    '''
    if n == 0:
        return {}

    # Retrieve files: the NT database plus its location index.
    nt_db = self.additional_attributes["nt_db"]
    nt_loc_db = s3.fetch_reference(
        self.additional_files["nt_loc_db"],
        self.ref_dir_local,
        allow_s3mi=True)

    # Choose accessions to process: for every group of hitsummary2 files,
    # keep the accession with the most hits matching taxid, accumulating
    # its count across groups.
    s3_hitsummary2_files = self.additional_attributes["hitsummary2_files"].values()
    accessions = defaultdict(int)
    # TODO: Address issue where accessions in nr can be chosen in the following code.
    # These accessions will not be found in nt_loc and will be subsequently omitted.
    for summary_group in s3_hitsummary2_files:
        hit_counts = defaultdict(int)
        for s3_file in summary_group:
            unique_name = s3_file.replace("/", "-").replace(":", "-")
            fetched = s3.fetch_from_s3(
                s3_file, os.path.join(self.output_dir_local, unique_name))
            if fetched is None:
                continue
            with open(fetched, 'r') as summary:
                for record in summary:
                    fields = record.rstrip().split("\t")
                    acc = fields[3]
                    # Columns 4-6 hold species/genus/family taxids; a hit on
                    # any of them counts toward this accession.
                    if any(int(t) == taxid for t in fields[4:7]):
                        hit_counts[acc] += 1
        if not hit_counts:
            continue
        top_acc, top_count = max(hit_counts.items(), key=lambda kv: kv[1])
        accessions[top_acc] += top_count

    # Keep only the n highest-scoring accessions overall.
    if len(accessions) > n:
        ranked = sorted(accessions.items(), key=lambda kv: kv[1], reverse=True)
        accessions = dict(ranked[:n])
    accessions = set(accessions.keys())

    # Make map of accession to sequence file.
    accession2info = {acc: {} for acc in accessions}
    with open_file_db_by_extension(nt_loc_db) as nt_loc_dict:
        PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_s3(
            accession2info, nt_loc_dict, nt_db)

    # Put 1 fasta file per accession into the destination directory,
    # prepending a '>accession' header line to each.
    accession_fastas = {}
    for acc, info in accession2info.items():
        if 'seq_file' not in info or info['seq_file'] is None:
            # Accession never resolved in nt_loc (see TODO above) — skip it.
            log.write(f"WARNING: No sequence retrieved for {acc}")
            continue
        clean_accession = self.clean_name_for_ksnp3(acc)
        local_fasta = f"{dest_dir}/NCBI_NT_accession_{clean_accession}.fasta"
        command.execute(
            command_patterns.SingleCommand(
                cmd="ln",
                args=[
                    "-s",
                    info['seq_file'],
                    local_fasta
                ]
            )
        )
        command.execute_with_output(
            command_patterns.ShellScriptCommand(
                script=r'''echo ">${acc}" | cat - "${local_fasta}" > temp_file;''',
                named_args={
                    'acc': acc,
                    'local_fasta': local_fasta
                }
            )
        )
        command.move_file('temp_file', local_fasta)
        accession_fastas[acc] = local_fasta

    # Return kept accessions and paths of their fasta files.
    return accession_fastas