Пример #1
0
    def get_accession_sequences(self, dest_dir, n=10):
        '''
        Retrieve NCBI NT references for the most-matched accession in each alignment viz file,
        up to a maximum of n references. Write each reference to a separate fasta file.

        Args:
            dest_dir: directory in which one fasta file per kept accession is created.
            n: maximum number of accessions to keep. A non-positive value keeps nothing and
                returns immediately, before any file is fetched.

        Returns:
            dict mapping each kept accession to the path of its fasta file under dest_dir.
            Accessions for which no sequence file was retrieved are logged and left out.
        '''
        if n <= 0:
            return {}

        # Local import: only needed to quote the shell command strings built below.
        import shlex

        # Retrieve files
        nt_db = self.additional_attributes["nt_db"]
        nt_loc_db = s3.fetch_from_s3(
            self.additional_files["nt_loc_db"],
            self.ref_dir_local,
            allow_s3mi=True)
        local_align_viz_files = []
        for s3_file in self.additional_attributes["align_viz_files"].values():
            # Flatten the S3 path into a basename so downloads do not collide locally.
            local_basename = s3_file.replace("/", "-").replace(":", "-")
            local_file = s3.fetch_from_s3(
                s3_file,
                os.path.join(self.ref_dir_local, local_basename))
            if local_file is not None:
                local_align_viz_files.append(local_file)

        # Choose accessions to process.
        # align_viz files are a bit brittle, so we just log exceptions rather than failing the job.
        accessions = set()
        for local_file in local_align_viz_files:
            try:
                with open(local_file, 'rb') as f:
                    align_viz_dict = json.load(f)
                flat_align_viz_dict = {}
                self.parse_tree(align_viz_dict, flat_align_viz_dict)
                most_matched_accession = None
                max_num_reads = 0
                for acc, info in flat_align_viz_dict.items():
                    num_reads = info["coverage_summary"]["num_reads"]
                    if num_reads > max_num_reads:
                        max_num_reads = num_reads
                        most_matched_accession = acc
            except Exception:
                # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
                log.write(f"Warning: couldn't get accession from {local_file}!")
                traceback.print_exc()
                continue
            if most_matched_accession is None:
                # No entry had a positive read count; never record None as an accession.
                log.write(f"Warning: no accession with reads found in {local_file}!")
                continue
            accessions.add(most_matched_accession)
            # At most one accession is added per file, so the set never grows beyond n.
            if len(accessions) >= n:
                break

        # Make map of accession to sequence file
        accession2info = {acc: {} for acc in accessions}
        # The context manager closes the shelf even if the lookup raises.
        with shelve.open(nt_loc_db.replace(".db", "")) as nt_loc_dict:
            # Presumably fills in info['seq_file'] for each accession it can find -- the
            # helper is defined elsewhere; verify against its implementation.
            PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_s3(
                accession2info, nt_loc_dict, nt_db)

        # Put 1 fasta file per accession into the destination directory
        accession_fastas = {}
        for acc, info in accession2info.items():
            seq_file = info.get('seq_file')
            if seq_file is None:
                log.write(f"WARNING: No sequence retrieved for {acc}")
                continue
            clean_accession = self.clean_name_for_ksnp3(acc)
            local_fasta = f"{dest_dir}/NCBI_NT_accession_{clean_accession}.fasta"
            # Quote everything interpolated into the shell so that spaces or shell
            # metacharacters in paths/accessions cannot change the command.
            quoted_fasta = shlex.quote(local_fasta)
            quoted_header = shlex.quote(f">{acc}")
            command.execute(f"ln -s {shlex.quote(str(seq_file))} {quoted_fasta}")
            # Prepend a '>accession' header line, then replace the symlink with the result.
            command.execute(
                f"echo {quoted_header} | cat - {quoted_fasta} > temp_file && mv temp_file {quoted_fasta}")
            accession_fastas[acc] = local_fasta

        # Return kept accessions and paths of their fasta files
        return accession_fastas
    def get_accession_sequences(self, dest_dir, taxid, n=10):
        '''
        Pick reference accessions for taxid from hitsummary2 files and write one fasta file
        per accession into dest_dir.

        For every list of hitsummary2 files, lines are tallied per accession (tab-separated
        field 3) whenever the species, genus or family taxid (fields 4-6) equals taxid. The
        top accession of each list contributes its count; if more than n accessions result,
        only the n with the highest totals are kept.

        Returns a dict mapping each kept accession to the path of its fasta file. Accessions
        for which no sequence file was retrieved are logged and omitted. Returns {} when n is 0.
        '''
        if n == 0:
            return {}

        # Reference inputs
        nt_db = self.additional_attributes["nt_db"]
        nt_loc_db = s3.fetch_reference(
            self.additional_files["nt_loc_db"],
            self.ref_dir_local,
            allow_s3mi=True)

        # Tally hits and pick the best accession per file list.
        # TODO: accessions from nr can be chosen here; they will not be found in nt_loc
        # and end up omitted further down.
        hitsummary2_groups = self.additional_attributes["hitsummary2_files"].values()
        totals = defaultdict(int)
        for group in hitsummary2_groups:
            hits = defaultdict(int)
            for remote_path in group:
                flattened_name = remote_path.replace("/", "-").replace(":", "-")
                fetched = s3.fetch_from_s3(
                    remote_path,
                    os.path.join(self.output_dir_local, flattened_name))
                if fetched is not None:
                    with open(fetched, 'r') as handle:
                        for row in handle:
                            fields = row.rstrip().split("\t")
                            accession, species, genus, family = fields[3:7]
                            lineage = (species, genus, family)
                            if any(int(candidate) == taxid for candidate in lineage):
                                hits[accession] += 1
            if not hits:
                continue
            top_accession, top_count = max(hits.items(), key=lambda pair: pair[1])
            totals[top_accession] += top_count

        # Keep only the n accessions with the most reads.
        if len(totals) > n:
            ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
            totals = dict(ranked[:n])
        chosen = set(totals.keys())

        # Look up a sequence file for every chosen accession.
        accession2info = {accession: {} for accession in chosen}
        with open_file_db_by_extension(nt_loc_db) as nt_loc_dict:
            PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_s3(
                accession2info, nt_loc_dict, nt_db)

        # One fasta per accession: symlink the sequence, then prepend a header line.
        accession_fastas = {}
        for acc, info in accession2info.items():
            seq_file = info.get('seq_file')
            if seq_file is None:
                log.write(f"WARNING: No sequence retrieved for {acc}")
                continue

            clean_accession = self.clean_name_for_ksnp3(acc)
            local_fasta = f"{dest_dir}/NCBI_NT_accession_{clean_accession}.fasta"

            link_command = command_patterns.SingleCommand(
                cmd="ln",
                args=["-s", seq_file, local_fasta]
            )
            command.execute(link_command)

            header_command = command_patterns.ShellScriptCommand(
                script=r'''echo ">${acc}" | cat - "${local_fasta}" > temp_file;''',
                named_args={'acc': acc, 'local_fasta': local_fasta}
            )
            command.execute_with_output(header_command)
            # Replace the symlink with the header-prefixed copy.
            command.move_file('temp_file', local_fasta)

            accession_fastas[acc] = local_fasta

        # Kept accessions and the paths of their fasta files
        return accession_fastas