Example #1
def get_valid_lineage(valid_hits, lineage_map, read_id, count_type):
    # If the read aligned to something, then it would be present in the
    # summary file for the count type, and correspondingly in valid_hits[
    # count_type], even if the hits disagree so much that the
    # "valid_hits" entry is just ("-1", "-1"). If the read didn't align
    # to anything, we also represent that with ("-1", "-1"). This ("-1",
    # "-1") gets translated to NULL_LINEAGE.
    hit_taxid_str, hit_level_str = valid_hits[count_type].get(
        read_id, ("-1", "-1"))
    hit_lineage = lineage_map.get(hit_taxid_str, lineage.NULL_LINEAGE)
    return lineage.validate_taxid_lineage(hit_lineage, hit_taxid_str,
                                          hit_level_str)
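
A minimal sketch of the fallback path above (hypothetical stub data, not pipeline output): a read absent from valid_hits[count_type] falls back to ("-1", "-1"), which then misses in lineage_map and resolves to lineage.NULL_LINEAGE.

# Hypothetical stubs, for illustration only.
valid_hits = {"NT": {"read_1": ("562", "1")}}       # read_1 aligned: taxid 562, species level
hit = valid_hits["NT"].get("read_2", ("-1", "-1"))  # read_2 never aligned
assert hit == ("-1", "-1")                          # "-1" later resolves to NULL_LINEAGE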
Example #2
def get_valid_lineage(hits_by_read_id, lineage_map,
                      read_id: str) -> List[str]:
    # If the read aligned to something, then it would be present in the
    # summary file for the count type, and correspondingly in
    # hits_by_read_id, even if the hits disagree so much that the entry
    # is just (-1, -1). If the read didn't align to anything, we also
    # represent that with (-1, -1). This (-1, -1) gets translated to
    # NULL_LINEAGE.
    hit_taxid, hit_level = hits_by_read_id.get(read_id, (-1, -1))
    hit_lineage = lineage_map.get(str(hit_taxid), lineage.NULL_LINEAGE)
    return lineage.validate_taxid_lineage(hit_lineage, hit_taxid,
                                          hit_level)
Example #3
def generate_taxon_count_json_from_m8(
        blastn_6_path, hit_level_path, count_type, lineage_map_path,
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path,
        duplicate_cluster_sizes_path, output_json_file):
    # Parse through hit file and m8 input file and format a JSON file with
    # our desired attributes, including aggregated statistics.

    duplicate_cluster_sizes = load_duplicate_cluster_sizes(duplicate_cluster_sizes_path)

    should_keep = build_should_keep_filter(
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path)
    # Setup
    aggregation = {}
    with open(hit_level_path) as hit_level_f, \
         open(blastn_6_path) as blastn_6_f, \
         open_file_db_by_extension(lineage_map_path) as lineage_map:

        num_ranks = len(lineage.NULL_LINEAGE)
        # See https://en.wikipedia.org/wiki/Double-precision_floating-point_format
        MIN_NORMAL_POSITIVE_DOUBLE = 2.0**-1022

        with log.log_context("generate_taxon_count_json_from_m8", {"substep": "loop_1"}):
            # Rows in blastn_6_path and hit_level_path correspond (same read_id)
            for hit_row, blastn_6_row in zip(HitSummaryMergedReader(hit_level_f), BlastnOutput6NTRerankedReader(blastn_6_f)):
                # Retrieve data values from files
                read_id = hit_row["read_id"]
                hit_level = hit_row["level"]
                hit_taxid = hit_row["taxid"]
                if hit_level < 0:
                    log.write('hit_level < 0', debug=True)
                hit_source_count_type = hit_row.get("source_count_type")

                msg = "read_ids in %s and %s do not match: %s vs. %s" % (
                    os.path.basename(blastn_6_path), os.path.basename(hit_level_path),
                    blastn_6_row["qseqid"], read_id)
                assert blastn_6_row["qseqid"] == read_id, msg
                percent_identity = blastn_6_row["pident"]
                alignment_length = blastn_6_row["length"]

                if count_type == 'merged_NT_NR' and hit_source_count_type == 'NR':
                    # NOTE: As of this change, applied ONLY within the scope of
                    # the NT/NR consensus project prototype.
                    # Protein alignments (NR) are done at the amino acid level,
                    # and each amino acid is encoded by 3 nucleotides. To make
                    # alignment length values comparable across NT and NR
                    # alignments (for combined statistics), the NR alignment
                    # lengths are multiplied by 3.
                    alignment_length *= 3
                e_value = blastn_6_row["evalue"]

                # These have been filtered out before the creation of blastn_6_f and hit_level_f
                assert alignment_length > 0
                assert -0.25 < percent_identity < 100.25
                assert e_value == e_value  # fails only if e_value is NaN (NaN != NaN)

                if count_type == "NT" or hit_source_count_type == "NT":
                    # e_value could be 0 when large contigs are mapped
                    if e_value <= MIN_NORMAL_POSITIVE_DOUBLE:
                        e_value = MIN_NORMAL_POSITIVE_DOUBLE
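                    # log10 of this floor is about -307.65, so a zero
                    # e-value maps to a finite minimum rather than -inf.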
                    e_value = math.log10(e_value)

                # Retrieve the taxon lineage and mark meaningless calls with fake
                # taxids.
                # lineage_map expects string ids
                hit_taxids_all_levels = lineage_map.get(
                    str(hit_taxid), lineage.NULL_LINEAGE)
                cleaned_hit_taxids_all_levels = lineage.validate_taxid_lineage(
                    hit_taxids_all_levels, hit_taxid, hit_level)
                assert num_ranks == len(cleaned_hit_taxids_all_levels)

                if should_keep(cleaned_hit_taxids_all_levels):
                    # Aggregate each level and collect statistics
                    agg_key = tuple(cleaned_hit_taxids_all_levels)
                    while agg_key:
                        agg_bucket = aggregation.get(agg_key)
                        if not agg_bucket:
                            agg_bucket = {
                                'nonunique_count': 0,
                                'unique_count': 0,
                                'sum_percent_identity': 0.0,
                                'sum_alignment_length': 0.0,
                                'sum_e_value': 0.0
                            }
                            aggregation[agg_key] = agg_bucket
                        agg_bucket['nonunique_count'] += get_read_cluster_size(
                            duplicate_cluster_sizes, read_id)
                        agg_bucket['unique_count'] += 1
                        agg_bucket['sum_percent_identity'] += percent_identity
                        agg_bucket['sum_alignment_length'] += alignment_length
                        agg_bucket['sum_e_value'] += e_value
                        if hit_source_count_type:
                            agg_bucket.setdefault('source_count_type', set()).add(hit_source_count_type)
                        # Chomp off the lowest rank as we aggregate up the tree
                        agg_key = agg_key[1:]

    # Produce the final output
    taxon_counts_attributes = []
    with log.log_context("generate_taxon_count_json_from_m8", {"substep": "loop_2"}):
        for agg_key, agg_bucket in aggregation.items():
            unique_count = agg_bucket['unique_count']
            nonunique_count = agg_bucket['nonunique_count']
            tax_level = num_ranks - len(agg_key) + 1
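            # E.g., with num_ranks == 3, a full (species, genus, family) key
            # has len 3, so tax_level == 1 and agg_key[2 - tax_level] is the
            # genus taxid; a genus-level key has len 2 and tax_level == 2.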
            # TODO: Extend taxonomic ranks as indicated on the commented out lines.
            taxon_counts_row = {
                "tax_id":
                agg_key[0],
                "tax_level":
                tax_level,
                # 'species_taxid' : agg_key[tax_level - 1] if tax_level == 1 else "-100",
                'genus_taxid':
                agg_key[2 - tax_level] if tax_level <= 2 else "-200",
                'family_taxid':
                agg_key[3 - tax_level] if tax_level <= 3 else "-300",
                # 'order_taxid' : agg_key[4 - tax_level] if tax_level <= 4 else "-400",
                # 'class_taxid' : agg_key[5 - tax_level] if tax_level <= 5 else "-500",
                # 'phyllum_taxid' : agg_key[6 - tax_level] if tax_level <= 6 else "-600",
                # 'kingdom_taxid' : agg_key[7 - tax_level] if tax_level <= 7 else "-700",
                # 'domain_taxid' : agg_key[8 - tax_level] if tax_level <= 8 else "-800",
                "count":  # this field will be consumed by the webapp
                nonunique_count if READ_COUNTING_MODE == ReadCountingMode.COUNT_ALL else unique_count,
                "nonunique_count":
                nonunique_count,
                "unique_count":
                unique_count,
                "dcr":
                nonunique_count / unique_count,
                "percent_identity":
                agg_bucket['sum_percent_identity'] / unique_count,
                "alignment_length":
                agg_bucket['sum_alignment_length'] / unique_count,
                "e_value":
                agg_bucket['sum_e_value'] / unique_count,
                "count_type":
                count_type
            }
            if agg_bucket.get('source_count_type'):
                taxon_counts_row['source_count_type'] = list(agg_bucket['source_count_type'])

            taxon_counts_attributes.append(taxon_counts_row)
        output_dict = {
            "pipeline_output": {
                "taxon_counts_attributes": taxon_counts_attributes
            }
        }

    with log.log_context(
        "generate_taxon_count_json_from_m8",
        {"substep": "json_dump", "output_json_file": output_json_file}
    ):
        with open(output_json_file, 'w') as outf:
            json.dump(output_dict, outf)
            outf.flush()
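
A minimal, self-contained sketch of the roll-up in loop_1 above (stub taxids, not pipeline data): truncating the aggregation key one rank at a time makes each read count toward its species, genus, and family buckets.

# Hypothetical illustration of the agg_key roll-up.
aggregation = {}
agg_key = ("562", "561", "543")  # (species, genus, family) taxids
while agg_key:
    aggregation[agg_key] = aggregation.get(agg_key, 0) + 1
    agg_key = agg_key[1:]  # chomp off the lowest rank
assert set(aggregation) == {("562", "561", "543"), ("561", "543"), ("543",)}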
Example #4
def generate_taxon_count_json_from_m8(m8_file, hit_level_file, e_value_type,
                                      count_type, lineage_map_path,
                                      deuterostome_path, output_json_file):
    # Parse through hit file and m8 input file and format a JSON file with
    # our desired attributes, including aggregated statistics.

    if deuterostome_path:
        taxids_to_remove = read_file_into_set(deuterostome_path)

    def any_hits_to_remove(hits):
        if not deuterostome_path:
            return False
        for taxid in hits:
            if int(taxid) >= 0 and taxid in taxids_to_remove:
                return True
        return False

    # Setup
    aggregation = {}
    hit_f = open(hit_level_file, 'r', encoding='utf-8')
    m8_f = open(m8_file, 'r', encoding='utf-8')
    # Lines in m8_file and hit_level_file correspond (same read_id)
    hit_line = hit_f.readline()
    m8_line = m8_f.readline()
    lineage_map = shelve.open(lineage_map_path.replace('.db', ''), 'r')
    num_ranks = len(lineage.NULL_LINEAGE)
    # See https://en.wikipedia.org/wiki/Double-precision_floating-point_format
    MIN_NORMAL_POSITIVE_DOUBLE = 2.0**-1022

    while hit_line and m8_line:
        # Retrieve data values from files
        hit_line_columns = hit_line.rstrip("\n").split("\t")
        _read_id = hit_line_columns[0]
        hit_level = hit_line_columns[1]
        hit_taxid = hit_line_columns[2]
        if int(hit_level) < 0:  # Skip negative levels and continue
            hit_line = hit_f.readline()
            m8_line = m8_f.readline()
            continue

        # m8 files correspond to BLAST tabular output format 6:
        # Columns: read_id | _ref_id | percent_identity | alignment_length...
        #
        # * read_id = query (e.g., gene) sequence id
        # * _ref_id = subject (e.g., reference genome) sequence id
        # * percent_identity = percentage of identical matches
        # * alignment_length = length of the alignment
        # * e_value = the expect value
        #
        # See:
        # * http://www.metagenomics.wiki/tools/blast/blastn-output-format-6
        # * http://www.metagenomics.wiki/tools/blast/evalue

        m8_line_columns = m8_line.split("\t")
        msg = "read_ids in %s and %s do not match: %s vs. %s" % (
            os.path.basename(m8_file), os.path.basename(hit_level_file),
            m8_line_columns[0], hit_line_columns[0])
        assert m8_line_columns[0] == hit_line_columns[0], msg
        percent_identity = float(m8_line_columns[2])
        alignment_length = float(m8_line_columns[3])
        e_value = float(m8_line_columns[10])

        # These have been filtered out before the creation of m8_f and hit_f
        assert alignment_length > 0
        assert -0.25 < percent_identity < 100.25
        assert e_value == e_value  # fails only if e_value is NaN (NaN != NaN)
        if e_value_type != 'log10':
            # e_value could be 0 when large contigs are mapped
            if e_value <= MIN_NORMAL_POSITIVE_DOUBLE:
                e_value = MIN_NORMAL_POSITIVE_DOUBLE
            e_value = math.log10(e_value)

        # Retrieve the taxon lineage and mark meaningless calls with fake
        # taxids.
        hit_taxids_all_levels = lineage_map.get(hit_taxid,
                                                lineage.NULL_LINEAGE)
        cleaned_hit_taxids_all_levels = lineage.validate_taxid_lineage(
            hit_taxids_all_levels, hit_taxid, hit_level)
        assert num_ranks == len(cleaned_hit_taxids_all_levels)

        if not any_hits_to_remove(cleaned_hit_taxids_all_levels):
            # Aggregate each level and collect statistics
            agg_key = tuple(cleaned_hit_taxids_all_levels)
            while agg_key:
                agg_bucket = aggregation.get(agg_key)
                if not agg_bucket:
                    agg_bucket = {
                        'count': 0,
                        'sum_percent_identity': 0.0,
                        'sum_alignment_length': 0.0,
                        'sum_e_value': 0.0
                    }
                    aggregation[agg_key] = agg_bucket
                agg_bucket['count'] += 1
                agg_bucket['sum_percent_identity'] += percent_identity
                agg_bucket['sum_alignment_length'] += alignment_length
                agg_bucket['sum_e_value'] += e_value
                # Chomp off the lowest rank as we aggregate up the tree
                agg_key = agg_key[1:]

        hit_line = hit_f.readline()
        m8_line = m8_f.readline()

    # Produce the final output
    taxon_counts_attributes = []
    for agg_key, agg_bucket in aggregation.items():
        count = agg_bucket['count']
        tax_level = num_ranks - len(agg_key) + 1
        # TODO: Extend taxonomic ranks as indicated on the commented out lines.
        taxon_counts_attributes.append({
            "tax_id":
            agg_key[0],
            "tax_level":
            tax_level,
            # 'species_taxid' : agg_key[tax_level - 1] if tax_level == 1 else "-100",
            'genus_taxid':
            agg_key[2 - tax_level] if tax_level <= 2 else "-200",
            'family_taxid':
            agg_key[3 - tax_level] if tax_level <= 3 else "-300",
            # 'order_taxid' : agg_key[4 - tax_level] if tax_level <= 4 else "-400",
            # 'class_taxid' : agg_key[5 - tax_level] if tax_level <= 5 else "-500",
            # 'phyllum_taxid' : agg_key[6 - tax_level] if tax_level <= 6 else "-600",
            # 'kingdom_taxid' : agg_key[7 - tax_level] if tax_level <= 7 else "-700",
            # 'domain_taxid' : agg_key[8 - tax_level] if tax_level <= 8 else "-800",
            "count":
            count,
            "percent_identity":
            agg_bucket['sum_percent_identity'] / count,
            "alignment_length":
            agg_bucket['sum_alignment_length'] / count,
            "e_value":
            agg_bucket['sum_e_value'] / count,
            "count_type":
            count_type
        })
    output_dict = {
        "pipeline_output": {
            "taxon_counts_attributes": taxon_counts_attributes
        }
    }
    with open(output_json_file, 'w') as outf:
        json.dump(output_dict, outf)
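
A hedged sketch of the column indexing this version relies on, using a made-up BLAST outfmt-6 row (12 tab-separated columns; the values are illustrative only):

# Hypothetical m8 row, for illustration only.
m8_line = "read_1\tNC_000913.3\t98.5\t100\t1\t0\t1\t100\t5000\t5099\t1e-30\t180"
cols = m8_line.split("\t")
percent_identity = float(cols[2])   # 98.5
alignment_length = float(cols[3])   # 100.0
e_value = float(cols[10])           # 1e-30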
Example #5
def generate_taxon_count_json_from_m8(m8_file, hit_level_file, e_value_type,
                                      count_type, lineage_map_path,
                                      deuterostome_path, taxon_whitelist_path,
                                      taxon_blacklist_path,
                                      cdhit_cluster_sizes_path,
                                      output_json_file):
    # Parse through hit file and m8 input file and format a JSON file with
    # our desired attributes, including aggregated statistics.

    cdhit_cluster_sizes = load_cdhit_cluster_sizes(cdhit_cluster_sizes_path)

    should_keep = build_should_keep_filter(deuterostome_path,
                                           taxon_whitelist_path,
                                           taxon_blacklist_path)
    # Setup
    aggregation = {}
    with open(hit_level_file, 'r', encoding='utf-8') as hit_f, \
         open(m8_file, 'r', encoding='utf-8') as m8_f, \
         open_file_db_by_extension(lineage_map_path, IdSeqDictValue.VALUE_TYPE_ARRAY) as lineage_map:  # noqa
        # Lines in m8_file and hit_level_file correspond (same read_id)
        hit_line = hit_f.readline()
        m8_line = m8_f.readline()
        num_ranks = len(lineage.NULL_LINEAGE)
        # See https://en.wikipedia.org/wiki/Double-precision_floating-point_format
        MIN_NORMAL_POSITIVE_DOUBLE = 2.0**-1022

        with log.log_context("generate_taxon_count_json_from_m8",
                             {"substep": "loop_1"}):
            while hit_line and m8_line:
                # Retrieve data values from files
                hit_line_columns = hit_line.rstrip("\n").split("\t")
                read_id = hit_line_columns[0]
                hit_level = hit_line_columns[1]
                hit_taxid = hit_line_columns[2]
                if int(hit_level) < 0:  # Skip negative levels and continue
                    hit_line = hit_f.readline()
                    m8_line = m8_f.readline()
                    continue

                # m8 files correspond to BLAST tabular output format 6:
                # Columns: read_id | _ref_id | percent_identity | alignment_length...
                #
                # * read_id = query (e.g., gene) sequence id
                # * _ref_id = subject (e.g., reference genome) sequence id
                # * percent_identity = percentage of identical matches
                # * alignment_length = length of the alignment
                # * e_value = the expect value
                #
                # See:
                # * http://www.metagenomics.wiki/tools/blast/blastn-output-format-6
                # * http://www.metagenomics.wiki/tools/blast/evalue

                m8_line_columns = m8_line.split("\t")
                msg = "read_ids in %s and %s do not match: %s vs. %s" % (
                    os.path.basename(m8_file),
                    os.path.basename(hit_level_file), m8_line_columns[0],
                    hit_line_columns[0])
                assert m8_line_columns[0] == hit_line_columns[0], msg
                percent_identity = float(m8_line_columns[2])
                alignment_length = float(m8_line_columns[3])
                e_value = float(m8_line_columns[10])

                # These have been filtered out before the creation of m8_f and hit_f
                assert alignment_length > 0
                assert -0.25 < percent_identity < 100.25
                assert e_value == e_value  # fails only if e_value is NaN (NaN != NaN)
                if e_value_type != 'log10':
                    # e_value could be 0 when large contigs are mapped
                    if e_value <= MIN_NORMAL_POSITIVE_DOUBLE:
                        e_value = MIN_NORMAL_POSITIVE_DOUBLE
                    e_value = math.log10(e_value)

                # Retrieve the taxon lineage and mark meaningless calls with fake
                # taxids.
                hit_taxids_all_levels = lineage_map.get(
                    hit_taxid, lineage.NULL_LINEAGE)
                cleaned_hit_taxids_all_levels = lineage.validate_taxid_lineage(
                    hit_taxids_all_levels, hit_taxid, hit_level)
                assert num_ranks == len(cleaned_hit_taxids_all_levels)

                if should_keep(cleaned_hit_taxids_all_levels):
                    # Aggregate each level and collect statistics
                    agg_key = tuple(cleaned_hit_taxids_all_levels)
                    while agg_key:
                        agg_bucket = aggregation.get(agg_key)
                        if not agg_bucket:
                            agg_bucket = {
                                'nonunique_count': 0,
                                'unique_count': 0,
                                'sum_percent_identity': 0.0,
                                'sum_alignment_length': 0.0,
                                'sum_e_value': 0.0
                            }
                            aggregation[agg_key] = agg_bucket
                        agg_bucket['nonunique_count'] += get_read_cluster_size(
                            cdhit_cluster_sizes, read_id)
                        agg_bucket['unique_count'] += 1
                        agg_bucket['sum_percent_identity'] += percent_identity
                        agg_bucket['sum_alignment_length'] += alignment_length
                        agg_bucket['sum_e_value'] += e_value
                        # Chomp off the lowest rank as we aggregate up the tree
                        agg_key = agg_key[1:]

                hit_line = hit_f.readline()
                m8_line = m8_f.readline()

    # Produce the final output
    taxon_counts_attributes = []
    with log.log_context("generate_taxon_count_json_from_m8",
                         {"substep": "loop_2"}):
        for agg_key, agg_bucket in aggregation.items():
            unique_count = agg_bucket['unique_count']
            nonunique_count = agg_bucket['nonunique_count']
            tax_level = num_ranks - len(agg_key) + 1
            # TODO: Extend taxonomic ranks as indicated on the commented out lines.
            taxon_counts_attributes.append({
                "tax_id":
                agg_key[0],
                "tax_level":
                tax_level,
                # 'species_taxid' : agg_key[tax_level - 1] if tax_level == 1 else "-100",
                'genus_taxid':
                agg_key[2 - tax_level] if tax_level <= 2 else "-200",
                'family_taxid':
                agg_key[3 - tax_level] if tax_level <= 3 else "-300",
                # 'order_taxid' : agg_key[4 - tax_level] if tax_level <= 4 else "-400",
                # 'class_taxid' : agg_key[5 - tax_level] if tax_level <= 5 else "-500",
                # 'phyllum_taxid' : agg_key[6 - tax_level] if tax_level <= 6 else "-600",
                # 'kingdom_taxid' : agg_key[7 - tax_level] if tax_level <= 7 else "-700",
                # 'domain_taxid' : agg_key[8 - tax_level] if tax_level <= 8 else "-800",
                "count":  # this field will be consumed by the webapp
                nonunique_count if READ_COUNTING_MODE == ReadCountingMode.COUNT_ALL else unique_count,
                "nonunique_count":
                nonunique_count,
                "unique_count":
                unique_count,
                "dcr":
                nonunique_count / unique_count,
                "percent_identity":
                agg_bucket['sum_percent_identity'] / unique_count,
                "alignment_length":
                agg_bucket['sum_alignment_length'] / unique_count,
                "e_value":
                agg_bucket['sum_e_value'] / unique_count,
                "count_type":
                count_type
            })
        output_dict = {
            "pipeline_output": {
                "taxon_counts_attributes": taxon_counts_attributes
            }
        }

    with log.log_context("generate_taxon_count_json_from_m8", {
            "substep": "json_dump",
            "output_json_file": output_json_file
    }):
        with open(output_json_file, 'w') as outf:
            json.dump(output_dict, outf)
            outf.flush()
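
A small worked example of the dcr ("duplicate compression ratio") field this version computes, with made-up counts: unique_count tallies deduplicated reads, while nonunique_count adds back the sizes of their CD-HIT clusters.

# Hypothetical counts: 3 unique reads whose clusters cover 7 raw reads.
unique_count = 3
nonunique_count = 7
dcr = nonunique_count / unique_count  # ~2.33; 1.0 would mean no duplicates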