def _count_reads_expanding_duplicates(local_file_path, cluster_sizes, cluster_key):
    # See documentation for the reads_in_group use case with cluster_sizes, below.
    unique_count, nonunique_count = 0, 0
    for read in fasta.iterator(local_file_path):
        # A read header looks something like
        #
        # >M05295:357:000000000-CRPNR:1:1101:22051:10534 OPTIONAL RANDOM STUFF
        #  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        #
        # where the first character on the line is '>' and the read ID (underlined above)
        # extends from '>' to the first whitespace character, not including '>' itself.
        #
        # The fasta iterator already asserts that read.header[0] is '>'.
        #
        # As we proceed down the pipeline, read IDs get annotated with taxonomic information,
        # changing the above into something like
        #
        # >NT:ABC2433.1:NR:ABC5656.2:M05295:357:000000000-CRPNR:1:1101:22051:10534 OPTIONAL RANDOM STUFF
        #  ^^^^^^^^^^^^^^^^^^^^^^^^^^
        #
        # The underlined annotation has to be stripped out by the cluster_key function,
        # so that we can use the original read ID to look up the cluster size.
        #
        read_id = read.header.split(None, 1)[0][1:]
        unique_count += 1
        nonunique_count += get_read_cluster_size(cluster_sizes, cluster_key(read_id))
    return unique_count, nonunique_count
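
# Hedged illustration (not pipeline code) of the parsing described above.  It shows that
# splitting on the first whitespace and dropping '>' recovers the annotated read ID, and that
# a cluster_key can then strip the taxonomic prefix to recover the original sequencer read ID.
# The cluster_key shown here is hypothetical; the real one is supplied by the caller.
def _example_read_id_parsing():
    header = ">NT:ABC2433.1:NR:ABC5656.2:M05295:357:000000000-CRPNR:1:1101:22051:10534 OPTIONAL RANDOM STUFF"
    read_id = header.split(None, 1)[0][1:]
    assert read_id == "NT:ABC2433.1:NR:ABC5656.2:M05295:357:000000000-CRPNR:1:1101:22051:10534"
    # A hypothetical cluster_key that strips the 'NT:...:NR:...:' annotation:
    assert read_id.split(":", 4)[-1] == "M05295:357:000000000-CRPNR:1:1101:22051:10534"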
def run(self):
    input_fa_name = self.input_files_local[0][0]
    if len(self.input_files_local) > 1:
        nt_hit_summary_path, nr_hit_summary_path = self.input_files_local[1][2], self.input_files_local[2][2]
    else:
        # This is used in `short-read-mngs/experimental.wdl`
        nt_hit_summary_path, nr_hit_summary_path = self.input_files_local[0][1], self.input_files_local[0][2]

    # Open lineage db
    lineage_db = s3.fetch_reference(
        self.additional_files["lineage_db"],
        self.ref_dir_local,
        allow_s3mi=True)

    with open(nt_hit_summary_path) as nt_hit_summary_f, open(nr_hit_summary_path) as nr_hit_summary_f:
        nr_hits_by_read_id = {
            row["read_id"]: (row["taxid"], row["level"])
            for row in HitSummaryMergedReader(nr_hit_summary_f)
        }
        nt_hits_by_read_id = {
            row["read_id"]: (row["taxid"], row["level"])
            for row in HitSummaryMergedReader(nt_hit_summary_f)
        }

    with open(self.output_files_local()[0], "w") as output_fa, \
         open_file_db_by_extension(lineage_db) as lineage_map:  # noqa
        for read in fasta.iterator(input_fa_name):
            # Example read_id: "NR::NT:CP010376.2:NB501961:14:HM7TLBGX2:1:23109:12720:8743/2"
            # Translate the read information into our custom format with fake
            # taxids at non-specific hit levels.
            # TODO: (tmorse) fasta parsing
            annotated_read_id = read.header.lstrip('>')
            read_id = annotated_read_id.split(":", 4)[-1]

            nr_taxid_species, nr_taxid_genus, nr_taxid_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                nr_hits_by_read_id, lineage_map, read_id)
            nt_taxid_species, nt_taxid_genus, nt_taxid_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                nt_hits_by_read_id, lineage_map, read_id)

            fields = ["family_nr", nr_taxid_family, "family_nt", nt_taxid_family]
            fields += ["genus_nr", nr_taxid_genus, "genus_nt", nt_taxid_genus]
            fields += ["species_nr", nr_taxid_species, "species_nt", nt_taxid_species]
            fields += [annotated_read_id]
            new_read_name = '>' + ':'.join(fields) + '\n'
            output_fa.write(new_read_name)
            output_fa.write(read.sequence + "\n")
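
# Hedged sketch (not pipeline code): how the split in run() recovers the original read ID from
# the annotated header shown in the comment.  split(":", 4) performs at most four splits, so the
# sequencer read ID, which itself contains ':', survives intact as the final element.
def _example_annotated_read_id_split():
    annotated_read_id = "NR::NT:CP010376.2:NB501961:14:HM7TLBGX2:1:23109:12720:8743/2"
    read_id = annotated_read_id.split(":", 4)[-1]
    assert read_id == "NB501961:14:HM7TLBGX2:1:23109:12720:8743/2"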
def generate_unidentified_fasta(self, input_fa, output_fa, clusters_dict=None, unique_output_fa=None):
    """
    Generate files with all unmapped reads.  If COUNT_ALL (added in v4), also include
    the non-unique reads extracted upstream by idseq-dedup.

    unique_output_fa exists primarily for counting.  See count_reads above.
    """
    unique_output_file = open(unique_output_fa, "w") if clusters_dict else None
    with open(output_fa, "w") as output_file:
        for read in fasta.iterator(input_fa):
            if not read.header.startswith(UNMAPPED_HEADER_PREFIX):
                continue
            output_file.write(read.header + "\n")
            output_file.write(read.sequence + "\n")
            if unique_output_file:
                unique_output_file.write(read.header + "\n")
                unique_output_file.write(read.sequence + "\n")
            if clusters_dict:
                # Get the inner part of a header like
                # '>NR::NT::NB501961:14:HM7TLBGX2:4:23511:18703:20079/2'
                line = read.header
                header_suffix = ""
                if line[-2:-1] == "/":  # /1 or /2
                    line, header_suffix = line[:-2], line[-2:]
                    assert header_suffix in ('/1', '/2')
                    assert len(read.header) == len(line) + len(header_suffix)
                key = line.split(UNMAPPED_HEADER_PREFIX)[1]
                other_keys = clusters_dict[key][1:]  # key should always be present
                for other_key in other_keys:
                    other_header = UNMAPPED_HEADER_PREFIX + other_key + header_suffix
                    output_file.write(other_header + "\n")
                    output_file.write(read.sequence + "\n")  # write duplicate seq
    if unique_output_file:
        unique_output_file.close()
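
# Worked illustration (hedged; assumes UNMAPPED_HEADER_PREFIX is ">NR::NT::", as the example
# header in the comment above suggests).  For
#     read.header == ">NR::NT::NB501961:14:HM7TLBGX2:4:23511:18703:20079/2"
# the code computes
#     header_suffix == "/2"
#     key           == "NB501961:14:HM7TLBGX2:4:23511:18703:20079"
# and, if clusters_dict[key] == (3, "dupA", "dupB"), it writes two additional records,
# ">NR::NT::dupA/2" and ">NR::NT::dupB/2", each carrying the representative's sequence.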
def parse_clusters_file(
    cdhit_clusters_path: str,
    deduped_fasta_path: str,
) -> Dict[str, Optional[Tuple]]:
    # First identify the cluster representative reads emitted by cd-hit-dup.  Originally we
    # used the ".clstr" output of cd-hit-dup for this, but it turns out that for unpaired
    # reads the actual deduped output of cd-hit-dup contains different representatives.
    clusters_dict: Dict[str, Optional[Tuple]] = {}
    for read in iterator(deduped_fasta_path):
        # A read header looks something like
        #
        # >M05295:357:000000000-CRPNR:1:1101:22051:10534 OPTIONAL RANDOM STUFF
        #  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        #
        # where the first character on the line is '>' and the read ID (underlined above)
        # extends from '>' to the first whitespace character, not including '>' itself.
        #
        # The fasta iterator already asserts that read.header[0] is '>'.
        #
        read_id = read.header.split(None, 1)[0][1:]
        clusters_dict[read_id] = None  # not yet known

    def record_cluster_size(
        cluster_size: int,
        emitted_reads_from_cluster: set,
        other_reads_from_cluster: set,
        line_number: int,
    ):
        assert emitted_reads_from_cluster, f"""If this assertion fails, cd-hit-dup has forgotten
        to emit a read for this cluster.  In that case, just use the current read_id as
        cluster_representative.  Everything will work fine, aside from reduced sensitivity.
        {line_number}"""
        assert len(emitted_reads_from_cluster) == 1, f"""If this assertion fails, cd-hit-dup has
        emitted multiple reads from the same cluster.  Feel free to comment out this assertion
        if that happens a lot in practice.  Everything will run fine, but read counts contributed
        by that cluster will be exaggerated.  If you want to fix that, make the cluster sizes a
        float --- divide the actual cluster size by the number of reads emitted for the cluster,
        i.e. by len(emitted_reads_from_cluster).  Probably an even better fix would be to emit
        your own fasta based on the .clstr file if that's reliable, or to use a tool other than
        cd-hit-dup that doesn't have this bug.  {line_number}: {emitted_reads_from_cluster}"""
        cluster_representative = emitted_reads_from_cluster.pop()
        assert cluster_representative in clusters_dict, "If this fails, it's a bug in this function."
        assert cluster_size - 1 == len(other_reads_from_cluster), """other_reads_from_cluster
        should contain every read in the cluster except the representative, i.e. cluster_size - 1
        reads: {}, {}""".format(cluster_size, other_reads_from_cluster)
        clusters_dict[cluster_representative] = (cluster_size,) + tuple(other_reads_from_cluster)

    with open(cdhit_clusters_path, "r") as clusters_file:
        # Set of reads present in both the deduped fasta and the current cluster; cardinality 1!
        emitted_reads_from_cluster: Set[str] = set()
        other_reads_from_cluster: Set[str] = set()
        cluster_size = 0
        read_id = None
        line_number = 0

        for line in clusters_file:
            line_number += 1
            if line.startswith(">"):
                continue
            parts = line.strip().split()
            serial = int(parts[0])
            assert parts[2][0] == ">", line
            assert parts[2].endswith("..."), line
            if serial == 0 and cluster_size > 0:
                # We've just encountered the first read of a new cluster.  Emit
                # all data held for the old cluster.
                record_cluster_size(
                    cluster_size,
                    emitted_reads_from_cluster,
                    other_reads_from_cluster,
                    line_number,
                )
                emitted_reads_from_cluster = set()
                other_reads_from_cluster = set()
                cluster_size = 0
            assert cluster_size == serial, f"{line_number}: {cluster_size}, {serial}, {line}"
            read_id = parts[2][1:-3]
            cluster_size += 1
            if read_id in clusters_dict:
                emitted_reads_from_cluster.add(read_id)
            else:
                other_reads_from_cluster.add(read_id)

        # Record the last cluster.
        if cluster_size > 0:
            record_cluster_size(
                cluster_size,
                emitted_reads_from_cluster,
                other_reads_from_cluster,
                line_number,
            )

    return clusters_dict
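
# Minimal sketch of the expected inputs/outputs (hypothetical file contents, not real test data):
# if the deduped fasta kept reads "A" and "C", and the .clstr file groups "A" with duplicate "B"
# while "C" forms a singleton cluster, i.e.
#
#   dedup1.fa        dedup1.fa.clstr
#   ---------        ---------------
#   >A               >Cluster 0
#   ACGT             0   4nt, >A... *
#   >C               1   4nt, >B... at +/100.00%
#   ACCT             >Cluster 1
#                    0   4nt, >C... *
#
# then parse_clusters_file returns {"A": (2, "B"), "C": (1,)}: each cluster representative maps
# to (cluster_size, *other_read_ids_in_its_cluster).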