def get_valid_lineage(valid_hits, lineage_map, read_id, count_type):
    """Look up and validate the lineage of the hit recorded for a read.

    Args:
        valid_hits: mapping indexed by count type; each entry maps a read id
            to a (taxid, level) pair of strings.
        lineage_map: mapping from a taxid string to its lineage.
        read_id: id of the read to look up.
        count_type: selects which entry of ``valid_hits`` is consulted.

    Returns:
        The result of ``lineage.validate_taxid_lineage`` for the lineage,
        taxid and level found for the read.
    """
    # A read that aligned to something is present in the summary file for
    # the count type and therefore in valid_hits[count_type], even when its
    # hits disagree so much that the entry is just ("-1", "-1").  A read
    # that aligned to nothing is represented by the same ("-1", "-1") pair,
    # which gets translated to NULL_LINEAGE.
    no_hit = ("-1", "-1")
    hits_for_count_type = valid_hits[count_type]
    taxid_str, level_str = hits_for_count_type.get(read_id, no_hit)
    raw_lineage = lineage_map.get(taxid_str, lineage.NULL_LINEAGE)
    return lineage.validate_taxid_lineage(raw_lineage, taxid_str, level_str)
def get_valid_lineage(hits_by_read_id, lineage_map, read_id: str) -> List[str]:
    """Look up and validate the lineage of the hit recorded for a read.

    Args:
        hits_by_read_id: mapping from read id to a (taxid, level) pair.
        lineage_map: mapping from a taxid, as a string, to its lineage.
        read_id: id of the read to look up.

    Returns:
        The result of ``lineage.validate_taxid_lineage`` for the lineage,
        taxid and level found for the read.
    """
    # A read that aligned to something is present in the summary file and
    # therefore in hits_by_read_id, even when its hits disagree so much that
    # the entry is just the "-1"/"-1" placeholder.  A read that aligned to
    # nothing is represented here by (-1, -1), which gets translated to
    # NULL_LINEAGE.
    no_hit = (-1, -1)
    taxid, level = hits_by_read_id.get(read_id, no_hit)
    # lineage_map is keyed by string, while the taxid is passed on unchanged.
    raw_lineage = lineage_map.get(str(taxid), lineage.NULL_LINEAGE)
    return lineage.validate_taxid_lineage(raw_lineage, taxid, level)
def generate_taxon_count_json_from_m8(
        blastn_6_path, hit_level_path, count_type, lineage_map_path,
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path,
        duplicate_cluster_sizes_path, output_json_file):
    """Aggregate per-read hits into per-taxon statistics and dump them as JSON.

    The hit summary file and the m8 (BLAST format 6) file are read in
    lockstep; row i of one must describe the same read as row i of the other
    (this is asserted).  Every read whose cleaned lineage passes the
    should-keep filter contributes to one bucket per lineage suffix, i.e. to
    the full lineage and to each progressively higher rank.

    Args:
        blastn_6_path: path of the m8 file with the alignment statistics.
        hit_level_path: path of the hit summary file (read id, level, taxid).
        count_type: label stored on every output row.  "NT" and
            "merged_NT_NR" additionally alter how e-values and alignment
            lengths are treated.
        lineage_map_path: path handed to ``open_file_db_by_extension``.
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path:
            inputs to ``build_should_keep_filter``.
        duplicate_cluster_sizes_path: input to
            ``load_duplicate_cluster_sizes``.
        output_json_file: path the resulting JSON document is written to.

    Raises:
        AssertionError: if the two files disagree on a read id, or a row
            carries an alignment length <= 0, a percent identity outside
            (-0.25, 100.25), a NaN e-value, or a lineage of unexpected size.
    """
    cluster_sizes = load_duplicate_cluster_sizes(duplicate_cluster_sizes_path)
    keep_lineage = build_should_keep_filter(
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path)

    # Lineage suffix (tuple of taxids) -> running totals for that taxon.
    totals_by_key = {}
    with open(hit_level_path) as hit_level_f, \
            open(blastn_6_path) as blastn_6_f, \
            open_file_db_by_extension(lineage_map_path) as lineage_map:
        num_ranks = len(lineage.NULL_LINEAGE)
        # Smallest normal positive double, see
        # https://en.wikipedia.org/wiki/Double-precision_floating-point_format
        smallest_normal_double = 2.0**-1022

        with log.log_context("generate_taxon_count_json_from_m8", {"substep": "loop_1"}):
            # Rows in the m8 file and the hit level file correspond
            # (same read_id).
            row_pairs = zip(HitSummaryMergedReader(hit_level_f),
                            BlastnOutput6NTRerankedReader(blastn_6_f))
            for hit_row, blastn_6_row in row_pairs:
                read_id = hit_row["read_id"]
                level = hit_row["level"]
                taxid = hit_row["taxid"]
                if level < 0:
                    # Only logged; the row is still processed below.
                    log.write('hit_level < 0', debug=True)
                source_type = hit_row.get("source_count_type")

                mismatch_msg = "read_ids in %s and %s do not match: %s vs. %s" % (
                    os.path.basename(blastn_6_path),
                    os.path.basename(hit_level_path),
                    blastn_6_row["qseqid"], read_id)
                assert blastn_6_row["qseqid"] == read_id, mismatch_msg

                pident = blastn_6_row["pident"]
                aln_len = blastn_6_row["length"]
                if count_type == 'merged_NT_NR' and source_type == 'NR':
                    # NOTE: At the moment of the change, applied ONLY in the
                    # scope of the prototype of the NT/NR consensus project.
                    # Protein alignments (NR) are done at amino acid level
                    # and each amino acid is composed of 3 nucleotides.  To
                    # make alignment lengths comparable across NT and NR
                    # alignments (for combined statistics), NR alignment
                    # lengths are multiplied by 3.
                    aln_len *= 3
                e_value = blastn_6_row["evalue"]

                # Violations have been filtered out before the creation of
                # blastn_6_f and hit_level_f.
                assert aln_len > 0
                assert -0.25 < pident < 100.25
                assert e_value == e_value  # a NaN is not equal to itself

                if count_type == "NT" or source_type == "NT":
                    # e_value could be 0 when large contigs are mapped, so
                    # clamp before taking the logarithm.
                    e_value = math.log10(max(e_value, smallest_normal_double))

                # Retrieve the taxon lineage and mark meaningless calls with
                # fake taxids.  lineage_map expects string ids.
                raw_lineage = lineage_map.get(str(taxid), lineage.NULL_LINEAGE)
                clean_lineage = lineage.validate_taxid_lineage(
                    raw_lineage, taxid, level)
                assert num_ranks == len(clean_lineage)

                if not keep_lineage(clean_lineage):
                    continue

                # Aggregate at every level: the full lineage first, then the
                # same lineage with the lowest remaining rank chomped off.
                lineage_key = tuple(clean_lineage)
                for start in range(len(lineage_key)):
                    suffix = lineage_key[start:]
                    totals = totals_by_key.get(suffix)
                    if totals is None:
                        totals = totals_by_key[suffix] = {
                            'nonunique_count': 0,
                            'unique_count': 0,
                            'sum_percent_identity': 0.0,
                            'sum_alignment_length': 0.0,
                            'sum_e_value': 0.0,
                        }
                    totals['nonunique_count'] += get_read_cluster_size(
                        cluster_sizes, read_id)
                    totals['unique_count'] += 1
                    totals['sum_percent_identity'] += pident
                    totals['sum_alignment_length'] += aln_len
                    totals['sum_e_value'] += e_value
                    if source_type:
                        totals.setdefault('source_count_type', set()).add(source_type)

    # Produce the final output: one row per aggregated lineage suffix.
    taxon_counts_attributes = []
    with log.log_context("generate_taxon_count_json_from_m8", {"substep": "loop_2"}):
        for lineage_suffix, totals in totals_by_key.items():
            unique_count = totals['unique_count']
            nonunique_count = totals['nonunique_count']
            # A full-length key is level 1; each dropped rank adds one.
            tax_level = num_ranks - len(lineage_suffix) + 1
            if READ_COUNTING_MODE == ReadCountingMode.COUNT_ALL:
                reported_count = nonunique_count
            else:
                reported_count = unique_count
            # TODO: Extend taxonomic ranks as indicated on the commented out lines.
            row = {
                "tax_id": lineage_suffix[0],
                "tax_level": tax_level,
                # 'species_taxid' : agg_key[tax_level - 1] if tax_level == 1 else "-100",
                'genus_taxid': lineage_suffix[2 - tax_level] if tax_level <= 2 else "-200",
                'family_taxid': lineage_suffix[3 - tax_level] if tax_level <= 3 else "-300",
                # 'order_taxid' : agg_key[4 - tax_level] if tax_level <= 4 else "-400",
                # 'class_taxid' : agg_key[5 - tax_level] if tax_level <= 5 else "-500",
                # 'phyllum_taxid' : agg_key[6 - tax_level] if tax_level <= 6 else "-600",
                # 'kingdom_taxid' : agg_key[7 - tax_level] if tax_level <= 7 else "-700",
                # 'domain_taxid' : agg_key[8 - tax_level] if tax_level <= 8 else "-800",
                # this field will be consumed by the webapp
                "count": reported_count,
                "nonunique_count": nonunique_count,
                "unique_count": unique_count,
                "dcr": nonunique_count / unique_count,
                "percent_identity": totals['sum_percent_identity'] / unique_count,
                "alignment_length": totals['sum_alignment_length'] / unique_count,
                "e_value": totals['sum_e_value'] / unique_count,
                "count_type": count_type,
            }
            source_types = totals.get('source_count_type')
            if source_types:
                # Sets are not JSON serializable, so emit a list.
                row['source_count_type'] = list(totals['source_count_type'])
            taxon_counts_attributes.append(row)

    output_dict = {
        "pipeline_output": {
            "taxon_counts_attributes": taxon_counts_attributes
        }
    }

    dump_context = log.log_context(
        "generate_taxon_count_json_from_m8",
        {"substep": "json_dump", "output_json_file": output_json_file})
    with dump_context:
        with open(output_json_file, 'w') as outf:
            json.dump(output_dict, outf)
            outf.flush()
def generate_taxon_count_json_from_m8(m8_file, hit_level_file, e_value_type,
                                      count_type, lineage_map_path,
                                      deuterostome_path, output_json_file):
    """Aggregate per-read hits into per-taxon statistics and dump them as JSON.

    Parses the hit level file and the m8 file in lockstep (line i of one
    must describe the same read as line i of the other; this is asserted)
    and writes a JSON file with aggregated statistics per taxon.  Reads with
    a negative hit level are skipped.  Every remaining read whose cleaned
    lineage contains no taxid listed in the deuterostome file contributes to
    one bucket per lineage suffix, i.e. to the full lineage and to each
    progressively higher rank.

    Args:
        m8_file: path of the tab-separated BLAST format 6 file.
        hit_level_file: path of the tab-separated hit file whose first three
            columns are read id, hit level and hit taxid.
        e_value_type: e-values are converted to log10 unless this equals
            'log10'.
        count_type: label stored on every output row.
        lineage_map_path: path of the shelve database mapping taxid to
            lineage; a '.db' substring is removed before opening.
        deuterostome_path: path of a file with taxids to exclude; when
            falsy, nothing is excluded.
        output_json_file: path the resulting JSON document is written to.

    Raises:
        AssertionError: if the two files disagree on a read id, or a line
            carries an alignment length <= 0, a percent identity outside
            (-0.25, 100.25), a NaN e-value, or a lineage of unexpected size.
    """
    if deuterostome_path:
        taxids_to_remove = read_file_into_set(deuterostome_path)

    def any_hits_to_remove(hits):
        # Negative taxids are never matched against the removal set.
        if not deuterostome_path:
            return False
        return any(int(taxid) >= 0 and taxid in taxids_to_remove
                   for taxid in hits)

    # Setup
    aggregation = {}
    num_ranks = len(lineage.NULL_LINEAGE)
    # See https://en.wikipedia.org/wiki/Double-precision_floating-point_format
    MIN_NORMAL_POSITIVE_DOUBLE = 2.0**-1022

    # All three handles are managed by `with` so they are closed even when an
    # assertion or a parse error aborts the loop.
    with open(hit_level_file, 'r', encoding='utf-8') as hit_f, \
            open(m8_file, 'r', encoding='utf-8') as m8_f, \
            shelve.open(lineage_map_path.replace('.db', ''), 'r') as lineage_map:
        # Lines in m8_file and hit_level_file correspond (same read_id);
        # iteration stops as soon as either file is exhausted.
        for hit_line, m8_line in zip(hit_f, m8_f):
            # Retrieve data values from files
            hit_line_columns = hit_line.rstrip("\n").split("\t")
            hit_level = hit_line_columns[1]
            hit_taxid = hit_line_columns[2]
            if int(hit_level) < 0:
                # Skip negative levels and continue
                continue

            # m8 files correspond to BLAST tabular output format 6:
            # Columns: read_id | _ref_id | percent_identity | alignment_length...
            #
            # * read_id = query (e.g., gene) sequence id
            # * _ref_id = subject (e.g., reference genome) sequence id
            # * percent_identity = percentage of identical matches
            # * alignment_length = length of the alignments
            # * e_value = the expect value
            #
            # See:
            # * http://www.metagenomics.wiki/tools/blast/blastn-output-format-6
            # * http://www.metagenomics.wiki/tools/blast/evalue
            m8_line_columns = m8_line.split("\t")
            msg = "read_ids in %s and %s do not match: %s vs. %s" % (
                os.path.basename(m8_file), os.path.basename(hit_level_file),
                m8_line_columns[0], hit_line_columns[0])
            assert m8_line_columns[0] == hit_line_columns[0], msg
            percent_identity = float(m8_line_columns[2])
            alignment_length = float(m8_line_columns[3])
            e_value = float(m8_line_columns[10])

            # These have been filtered out before the creation of m8_f and hit_f
            assert alignment_length > 0
            assert -0.25 < percent_identity < 100.25
            assert e_value == e_value  # a NaN is not equal to itself
            if e_value_type != 'log10':
                # e_value could be 0 when large contigs are mapped
                if e_value <= MIN_NORMAL_POSITIVE_DOUBLE:
                    e_value = MIN_NORMAL_POSITIVE_DOUBLE
                e_value = math.log10(e_value)

            # Retrieve the taxon lineage and mark meaningless calls with fake
            # taxids.
            hit_taxids_all_levels = lineage_map.get(hit_taxid,
                                                    lineage.NULL_LINEAGE)
            cleaned_hit_taxids_all_levels = lineage.validate_taxid_lineage(
                hit_taxids_all_levels, hit_taxid, hit_level)
            assert num_ranks == len(cleaned_hit_taxids_all_levels)

            if any_hits_to_remove(cleaned_hit_taxids_all_levels):
                continue

            # Aggregate each level and collect statistics
            agg_key = tuple(cleaned_hit_taxids_all_levels)
            while agg_key:
                agg_bucket = aggregation.get(agg_key)
                if not agg_bucket:
                    agg_bucket = {
                        'count': 0,
                        'sum_percent_identity': 0.0,
                        'sum_alignment_length': 0.0,
                        'sum_e_value': 0.0
                    }
                    aggregation[agg_key] = agg_bucket
                agg_bucket['count'] += 1
                agg_bucket['sum_percent_identity'] += percent_identity
                agg_bucket['sum_alignment_length'] += alignment_length
                agg_bucket['sum_e_value'] += e_value
                # Chomp off the lowest rank as we aggregate up the tree
                agg_key = agg_key[1:]

    # Produce the final output
    taxon_counts_attributes = []
    for agg_key, agg_bucket in aggregation.items():
        count = agg_bucket['count']
        # A full-length key is level 1; each dropped rank adds one.
        tax_level = num_ranks - len(agg_key) + 1
        # TODO: Extend taxonomic ranks as indicated on the commented out lines.
        taxon_counts_attributes.append({
            "tax_id": agg_key[0],
            "tax_level": tax_level,
            # 'species_taxid' : agg_key[tax_level - 1] if tax_level == 1 else "-100",
            'genus_taxid': agg_key[2 - tax_level] if tax_level <= 2 else "-200",
            'family_taxid': agg_key[3 - tax_level] if tax_level <= 3 else "-300",
            # 'order_taxid' : agg_key[4 - tax_level] if tax_level <= 4 else "-400",
            # 'class_taxid' : agg_key[5 - tax_level] if tax_level <= 5 else "-500",
            # 'phyllum_taxid' : agg_key[6 - tax_level] if tax_level <= 6 else "-600",
            # 'kingdom_taxid' : agg_key[7 - tax_level] if tax_level <= 7 else "-700",
            # 'domain_taxid' : agg_key[8 - tax_level] if tax_level <= 8 else "-800",
            "count": count,
            "percent_identity": agg_bucket['sum_percent_identity'] / count,
            "alignment_length": agg_bucket['sum_alignment_length'] / count,
            "e_value": agg_bucket['sum_e_value'] / count,
            "count_type": count_type
        })
    output_dict = {
        "pipeline_output": {
            "taxon_counts_attributes": taxon_counts_attributes
        }
    }
    with open(output_json_file, 'w') as outf:
        json.dump(output_dict, outf)
def generate_taxon_count_json_from_m8(m8_file, hit_level_file, e_value_type,
                                      count_type, lineage_map_path,
                                      deuterostome_path, taxon_whitelist_path,
                                      taxon_blacklist_path,
                                      cdhit_cluster_sizes_path,
                                      output_json_file):
    """Aggregate per-read hits into per-taxon statistics and dump them as JSON.

    Parses the hit level file and the m8 file in lockstep (line i of one
    must describe the same read as line i of the other; this is asserted)
    and writes a JSON file with aggregated statistics per taxon.  Reads with
    a negative hit level are skipped.  Every remaining read whose cleaned
    lineage passes the should-keep filter contributes to one bucket per
    lineage suffix, i.e. to the full lineage and to each progressively
    higher rank.

    Args:
        m8_file: path of the tab-separated BLAST format 6 file.
        hit_level_file: path of the tab-separated hit file whose first three
            columns are read id, hit level and hit taxid.
        e_value_type: e-values are converted to log10 unless this equals
            'log10'.
        count_type: label stored on every output row.
        lineage_map_path: path handed to ``open_file_db_by_extension``.
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path:
            inputs to ``build_should_keep_filter``.
        cdhit_cluster_sizes_path: input to ``load_cdhit_cluster_sizes``.
        output_json_file: path the resulting JSON document is written to.

    Raises:
        AssertionError: if the two files disagree on a read id, or a line
            carries an alignment length <= 0, a percent identity outside
            (-0.25, 100.25), a NaN e-value, or a lineage of unexpected size.
    """
    cluster_sizes = load_cdhit_cluster_sizes(cdhit_cluster_sizes_path)
    lineage_passes_filter = build_should_keep_filter(
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path)

    # Lineage suffix (tuple of taxids) -> running totals for that taxon.
    stats_by_key = {}

    def accumulate(lineage_key, read_id, pident, aln_len, e_value):
        # Add one read to the bucket of every suffix of its lineage: the
        # full lineage first, then with the lowest remaining rank removed.
        for start in range(len(lineage_key)):
            suffix = lineage_key[start:]
            bucket = stats_by_key.get(suffix)
            if bucket is None:
                bucket = stats_by_key[suffix] = {
                    'nonunique_count': 0,
                    'unique_count': 0,
                    'sum_percent_identity': 0.0,
                    'sum_alignment_length': 0.0,
                    'sum_e_value': 0.0,
                }
            bucket['nonunique_count'] += get_read_cluster_size(
                cluster_sizes, read_id)
            bucket['unique_count'] += 1
            bucket['sum_percent_identity'] += pident
            bucket['sum_alignment_length'] += aln_len
            bucket['sum_e_value'] += e_value

    with open(hit_level_file, 'r', encoding='utf-8') as hit_f, \
            open(m8_file, 'r', encoding='utf-8') as m8_f, \
            open_file_db_by_extension(lineage_map_path, IdSeqDictValue.VALUE_TYPE_ARRAY) as lineage_map:  # noqa
        # Lines in m8_file and hit_level_file correspond (same read_id)
        hit_line = hit_f.readline()
        m8_line = m8_f.readline()
        num_ranks = len(lineage.NULL_LINEAGE)
        # Smallest normal positive double, see
        # https://en.wikipedia.org/wiki/Double-precision_floating-point_format
        smallest_normal_double = 2.0**-1022

        with log.log_context("generate_taxon_count_json_from_m8", {"substep": "loop_1"}):
            while hit_line and m8_line:
                # Retrieve data values from files
                hit_fields = hit_line.rstrip("\n").split("\t")
                read_id = hit_fields[0]
                hit_level = hit_fields[1]
                hit_taxid = hit_fields[2]

                # Lines with a negative level are skipped entirely.
                if int(hit_level) >= 0:
                    # m8 files correspond to BLAST tabular output format 6:
                    # Columns: read_id | _ref_id | percent_identity | alignment_length...
                    #
                    # * read_id = query (e.g., gene) sequence id
                    # * _ref_id = subject (e.g., reference genome) sequence id
                    # * percent_identity = percentage of identical matches
                    # * alignment_length = length of the alignments
                    # * e_value = the expect value
                    #
                    # See:
                    # * http://www.metagenomics.wiki/tools/blast/blastn-output-format-6
                    # * http://www.metagenomics.wiki/tools/blast/evalue
                    m8_fields = m8_line.split("\t")
                    mismatch_msg = "read_ids in %s and %s do not match: %s vs. %s" % (
                        os.path.basename(m8_file),
                        os.path.basename(hit_level_file),
                        m8_fields[0], hit_fields[0])
                    assert m8_fields[0] == hit_fields[0], mismatch_msg
                    pident = float(m8_fields[2])
                    aln_len = float(m8_fields[3])
                    e_value = float(m8_fields[10])

                    # Violations have been filtered out before the creation
                    # of m8_f and hit_f.
                    assert aln_len > 0
                    assert -0.25 < pident < 100.25
                    assert e_value == e_value  # a NaN is not equal to itself
                    if e_value_type != 'log10':
                        # e_value could be 0 when large contigs are mapped,
                        # so clamp before taking the logarithm.
                        e_value = math.log10(
                            max(e_value, smallest_normal_double))

                    # Retrieve the taxon lineage and mark meaningless calls
                    # with fake taxids.
                    raw_lineage = lineage_map.get(
                        hit_taxid, lineage.NULL_LINEAGE)
                    clean_lineage = lineage.validate_taxid_lineage(
                        raw_lineage, hit_taxid, hit_level)
                    assert num_ranks == len(clean_lineage)

                    if lineage_passes_filter(clean_lineage):
                        # Aggregate each level and collect statistics
                        accumulate(tuple(clean_lineage), read_id,
                                   pident, aln_len, e_value)

                hit_line = hit_f.readline()
                m8_line = m8_f.readline()

    # Produce the final output: one row per aggregated lineage suffix.
    taxon_counts_attributes = []
    with log.log_context("generate_taxon_count_json_from_m8", {"substep": "loop_2"}):
        for lineage_suffix, bucket in stats_by_key.items():
            unique_count = bucket['unique_count']
            nonunique_count = bucket['nonunique_count']
            # A full-length key is level 1; each dropped rank adds one.
            tax_level = num_ranks - len(lineage_suffix) + 1
            if READ_COUNTING_MODE == ReadCountingMode.COUNT_ALL:
                reported_count = nonunique_count
            else:
                reported_count = unique_count
            # TODO: Extend taxonomic ranks as indicated on the commented out lines.
            row = {
                "tax_id": lineage_suffix[0],
                "tax_level": tax_level,
                # 'species_taxid' : agg_key[tax_level - 1] if tax_level == 1 else "-100",
                'genus_taxid': lineage_suffix[2 - tax_level] if tax_level <= 2 else "-200",
                'family_taxid': lineage_suffix[3 - tax_level] if tax_level <= 3 else "-300",
                # 'order_taxid' : agg_key[4 - tax_level] if tax_level <= 4 else "-400",
                # 'class_taxid' : agg_key[5 - tax_level] if tax_level <= 5 else "-500",
                # 'phyllum_taxid' : agg_key[6 - tax_level] if tax_level <= 6 else "-600",
                # 'kingdom_taxid' : agg_key[7 - tax_level] if tax_level <= 7 else "-700",
                # 'domain_taxid' : agg_key[8 - tax_level] if tax_level <= 8 else "-800",
                # this field will be consumed by the webapp
                "count": reported_count,
                "nonunique_count": nonunique_count,
                "unique_count": unique_count,
                "dcr": nonunique_count / unique_count,
                "percent_identity": bucket['sum_percent_identity'] / unique_count,
                "alignment_length": bucket['sum_alignment_length'] / unique_count,
                "e_value": bucket['sum_e_value'] / unique_count,
                "count_type": count_type,
            }
            taxon_counts_attributes.append(row)

    output_dict = {
        "pipeline_output": {
            "taxon_counts_attributes": taxon_counts_attributes
        }
    }

    dump_context = log.log_context("generate_taxon_count_json_from_m8", {
        "substep": "json_dump",
        "output_json_file": output_json_file
    })
    with dump_context:
        with open(output_json_file, 'w') as outf:
            json.dump(output_dict, outf)
            outf.flush()