Exemplo n.º 1
0
def process_deltadir(delta_dir, org_lengths):
    """Returns a tuple of ANIm results for .deltas in passed directory.

    - delta_dir - path to the directory containing .delta files
    - org_lengths - dictionary of total sequence lengths, keyed by sequence

    Returns the following pandas dataframes in a tuple; query sequences are
    rows, subject sequences are columns:

    - alignment_lengths - symmetrical: total length of alignment
    - percentage_identity - symmetrical: percentage identity of alignment
    - alignment_coverage - non-symmetrical: coverage of query and subject
    - similarity_errors - symmetrical: count of similarity errors

    Every dataframe is indexed on both axes by the keys of org_lengths.
    Cells for which no .delta file is found keep their initial values:
    NaN for alignment_lengths (apart from the diagonal, which holds the
    sequence's own length), 0 for similarity_errors, and 1.0 for
    percentage_identity and alignment_coverage.

    May throw a ZeroDivisionError if one or more NUCmer runs failed, or a
    very distant sequence was included in the analysis. A ValueError is
    raised when a filename does not split into exactly two names on
    '_vs_'.
    """
    # Process directory to identify input files
    deltafiles = pyani_files.get_input_files(delta_dir, '.delta')
    # Materialise the labels once so that every dataframe axis is built
    # from the same concrete list
    labels = list(org_lengths.keys())
    # Hold data in pandas dataframes
    alignment_lengths = pd.DataFrame(index=labels, columns=labels, dtype=float)
    similarity_errors = pd.DataFrame(index=labels, columns=labels,
                                     dtype=float).fillna(0)
    percentage_identity = pd.DataFrame(index=labels,
                                       columns=labels,
                                       dtype=float).fillna(1.0)
    alignment_coverage = pd.DataFrame(index=labels,
                                      columns=labels,
                                      dtype=float).fillna(1.0)
    # Fill diagonal NA values for alignment_length with org_lengths.
    # A single .loc call is used because chained indexing (df[a][b] = x)
    # may assign to a temporary object rather than to the dataframe.
    for org, length in org_lengths.items():
        alignment_lengths.loc[org, org] = length
    # Process .delta files assuming that the filename format holds:
    # org1_vs_org2.delta
    for deltafile in deltafiles:
        basename = os.path.split(deltafile)[-1]
        qname, sname = os.path.splitext(basename)[0].split('_vs_')
        tot_length, tot_sim_error = parse_delta(deltafile)
        query_cover = float(tot_length) / org_lengths[qname]
        sbjct_cover = float(tot_length) / org_lengths[sname]
        # Calculate percentage ID of aligned length. This may fail if
        # total length is zero.
        # The ZeroDivisionError that would arise is deliberately not
        # caught here, so callers should handle it.
        # Common causes are that a NUCmer run failed, or that a very
        # distant sequence was included in the analysis.
        perc_id = 1 - float(tot_sim_error) / tot_length
        # Populate dataframes: .loc takes [row, column] order. The
        # symmetrical tables receive the same value in both cells.
        alignment_lengths.loc[qname, sname] = tot_length
        alignment_lengths.loc[sname, qname] = tot_length
        similarity_errors.loc[qname, sname] = tot_sim_error
        similarity_errors.loc[sname, qname] = tot_sim_error
        percentage_identity.loc[qname, sname] = perc_id
        percentage_identity.loc[sname, qname] = perc_id
        # Coverage is asymmetrical: query coverage is written to
        # row=subject, column=query; subject coverage to row=query,
        # column=subject.
        # NOTE(review): this looks transposed relative to the docstring's
        # "query sequences are rows" - confirm against downstream users
        # before changing.
        alignment_coverage.loc[sname, qname] = query_cover
        alignment_coverage.loc[qname, sname] = sbjct_cover
    return (alignment_lengths, percentage_identity, alignment_coverage,
            similarity_errors)
Exemplo n.º 2
0
def process_deltadir(delta_dir, org_lengths):
    """Returns a tuple of ANIm results for .deltas in passed directory.

    - delta_dir - path to the directory containing .delta files
    - org_lengths - dictionary of total sequence lengths, keyed by sequence

    Returns the following pandas dataframes in a tuple; query sequences are
    rows, subject sequences are columns:

    - alignment_lengths - symmetrical: total length of alignment
    - percentage_identity - symmetrical: percentage identity of alignment
    - alignment_coverage - non-symmetrical: coverage of query and subject
    - similarity_errors - symmetrical: count of similarity errors

    All four dataframes share the keys of org_lengths as both row and
    column labels. Before any .delta file is read, alignment_lengths holds
    each sequence's own length on the diagonal (NaN elsewhere),
    similarity_errors is all 0, and percentage_identity and
    alignment_coverage are all 1.0; pairs without a .delta file keep
    those values.

    May throw a ZeroDivisionError if one or more NUCmer runs failed, or a
    very distant sequence was included in the analysis.
    """
    # Process directory to identify input files
    deltafiles = pyani_files.get_input_files(delta_dir, '.delta')
    labels = list(org_lengths.keys())

    def _square(fill=None):
        # Build one labels x labels float table, optionally pre-filled
        frame = pd.DataFrame(index=labels, columns=labels, dtype=float)
        return frame if fill is None else frame.fillna(fill)

    # Hold data in pandas dataframe
    alignment_lengths = _square()
    similarity_errors = _square(0)
    percentage_identity = _square(1.0)
    alignment_coverage = _square(1.0)
    # Fill diagonal NA values for alignment_length with org_lengths.
    # .loc[row, column] writes into the dataframe itself; the chained
    # form df[col][row] = x can end up writing to a temporary instead.
    for org, length in org_lengths.items():
        alignment_lengths.loc[org, org] = length
    # Process .delta files assuming that the filename format holds:
    # org1_vs_org2.delta
    for deltafile in deltafiles:
        stem = os.path.splitext(os.path.split(deltafile)[-1])[0]
        qname, sname = stem.split('_vs_')
        tot_length, tot_sim_error = parse_delta(deltafile)
        query_cover = float(tot_length) / org_lengths[qname]
        sbjct_cover = float(tot_length) / org_lengths[sname]
        # Calculate percentage ID of aligned length. This may fail if
        # total length is zero; the resulting ZeroDivisionError is left
        # to propagate to the caller.
        # Common causes are that a NUCmer run failed, or that a very
        # distant sequence was included in the analysis.
        perc_id = 1 - float(tot_sim_error) / tot_length
        # Populate dataframes using .loc[row, column]. Symmetrical
        # tables are mirrored across the diagonal.
        for row, col in ((qname, sname), (sname, qname)):
            alignment_lengths.loc[row, col] = tot_length
            similarity_errors.loc[row, col] = tot_sim_error
            percentage_identity.loc[row, col] = perc_id
        # Asymmetrical data: query coverage lands at row=subject,
        # column=query, and subject coverage at row=query, column=subject.
        # NOTE(review): orientation appears to disagree with the docstring
        # ("query sequences are rows") - verify before relying on it.
        alignment_coverage.loc[sname, qname] = query_cover
        alignment_coverage.loc[qname, sname] = sbjct_cover
    return (alignment_lengths, percentage_identity, alignment_coverage,
            similarity_errors)
Exemplo n.º 3
0
def process_blast(blast_dir, org_lengths, fraglengths=None, mode="ANIb"):
    """Returns a tuple of ANIb results for .blast_tab files in the output dir.

    - blast_dir - path to the directory containing .blast_tab files
    - org_lengths - the base count for each input sequence
    - fraglengths - dictionary of query sequence fragment lengths, only
    needed for BLASTALL output
    - mode - parsing BLASTN+ or BLASTALL output?

    Returns the following pandas dataframes in a tuple, with query
    sequences as rows and subject sequences as columns:

    - alignment_lengths - non-symmetrical: total length of alignment
    - percentage_identity - non-symmetrical: ANIb (Goris) percentage identity
    - alignment_coverage - non-symmetrical: coverage of query
    - similarity_errors - non-symmetrical: count of similarity errors

    Both axes of every dataframe carry the keys of org_lengths. Cells with
    no matching .blast_tab file keep their initial values: NaN for
    alignment_lengths (the diagonal holds the sequence's own length), 0 for
    similarity_errors, and 1.0 for percentage_identity and
    alignment_coverage.

    May throw a ZeroDivisionError if one or more BLAST runs failed, or a
    very distant sequence was included in the analysis (presumably raised
    while parsing in parse_blast_tab; the only division in this function
    is by the query's entry in org_lengths).
    """
    # Process directory to identify input files
    blastfiles = pyani_files.get_input_files(blast_dir, '.blast_tab')
    # Use one concrete list of labels for every dataframe axis
    labels = list(org_lengths.keys())
    # Hold data in pandas dataframe
    alignment_lengths = pd.DataFrame(index=labels, columns=labels, dtype=float)
    similarity_errors = pd.DataFrame(index=labels, columns=labels,
                                     dtype=float).fillna(0)
    percentage_identity = pd.DataFrame(index=labels,
                                       columns=labels,
                                       dtype=float).fillna(1.0)
    alignment_coverage = pd.DataFrame(index=labels,
                                      columns=labels,
                                      dtype=float).fillna(1.0)
    # Fill diagonal NA values for alignment_length with org_lengths.
    # Assign through a single .loc call: chained indexing
    # (df[a][b] = x) may modify a temporary rather than the dataframe.
    for org, length in org_lengths.items():
        alignment_lengths.loc[org, org] = length
    # Process .blast_tab files assuming that the filename format holds:
    # org1_vs_org2.blast_tab:
    for blastfile in blastfiles:
        basename = os.path.split(blastfile)[-1]
        qname, sname = os.path.splitext(basename)[0].split('_vs_')
        tot_length, tot_sim_error, ani_pid = parse_blast_tab(
            blastfile, fraglengths, mode)
        query_cover = float(tot_length) / org_lengths[qname]
        # Populate dataframes: .loc takes [row, column] order, so each
        # comparison fills only the (query, subject) cell - the tables
        # are not mirrored.
        alignment_lengths.loc[qname, sname] = tot_length
        similarity_errors.loc[qname, sname] = tot_sim_error
        # ani_pid is scaled by 0.01 before storing
        percentage_identity.loc[qname, sname] = 0.01 * ani_pid
        alignment_coverage.loc[qname, sname] = query_cover
    return (alignment_lengths, percentage_identity, alignment_coverage,
            similarity_errors)
Exemplo n.º 4
0
Arquivo: anib.py Projeto: brwnj/pyani
def process_blast(blast_dir, org_lengths, fraglengths=None, mode="ANIb"):
    """Returns a tuple of ANIb results for .blast_tab files in the output dir.

    - blast_dir - path to the directory containing .blast_tab files
    - org_lengths - the base count for each input sequence
    - fraglengths - dictionary of query sequence fragment lengths, only
    needed for BLASTALL output
    - mode - parsing BLASTN+ or BLASTALL output?

    Returns the following pandas dataframes in a tuple (rows are query
    sequences, columns are subject sequences, both labelled with the keys
    of org_lengths):

    - alignment_lengths - non-symmetrical: total length of alignment
    - percentage_identity - non-symmetrical: ANIb (Goris) percentage identity
    - alignment_coverage - non-symmetrical: coverage of query
    - similarity_errors - non-symmetrical: count of similarity errors

    Initial cell values, kept wherever no .blast_tab file supplies data,
    are: own sequence length on the alignment_lengths diagonal and NaN
    elsewhere; 0 for similarity_errors; 1.0 for percentage_identity and
    alignment_coverage.

    May throw a ZeroDivisionError if one or more BLAST runs failed, or a
    very distant sequence was included in the analysis.
    """
    # Process directory to identify input files
    blastfiles = pyani_files.get_input_files(blast_dir, '.blast_tab')
    labels = list(org_lengths.keys())

    def _square(fill=None):
        # Build one labels x labels float table, optionally pre-filled
        frame = pd.DataFrame(index=labels, columns=labels, dtype=float)
        return frame if fill is None else frame.fillna(fill)

    # Hold data in pandas dataframe
    alignment_lengths = _square()
    similarity_errors = _square(0)
    percentage_identity = _square(1.0)
    alignment_coverage = _square(1.0)
    # Fill diagonal NA values for alignment_length with org_lengths.
    # .loc[row, column] is used instead of chained df[col][row] = x,
    # which can silently assign to a temporary object.
    for org, length in org_lengths.items():
        alignment_lengths.loc[org, org] = length
    # Process .blast_tab files assuming that the filename format holds:
    # org1_vs_org2.blast_tab:
    for blastfile in blastfiles:
        stem = os.path.splitext(os.path.split(blastfile)[-1])[0]
        qname, sname = stem.split('_vs_')
        tot_length, tot_sim_error, ani_pid = parse_blast_tab(blastfile,
                                                             fraglengths,
                                                             mode)
        query_cover = float(tot_length) / org_lengths[qname]
        # Populate dataframes with .loc[row, column]; only the
        # (query, subject) cell is written for each file, as the data
        # are asymmetrical.
        alignment_lengths.loc[qname, sname] = tot_length
        similarity_errors.loc[qname, sname] = tot_sim_error
        # The parsed identity is multiplied by 0.01 before being stored
        percentage_identity.loc[qname, sname] = 0.01 * ani_pid
        alignment_coverage.loc[qname, sname] = query_cover
    return (alignment_lengths, percentage_identity, alignment_coverage,
            similarity_errors)
Exemplo n.º 5
0
def process_deltadir(delta_dir, org_lengths, logger=None):
    """Returns a tuple of ANIm results for .deltas in passed directory.

    - delta_dir - path to the directory containing .delta files
    - org_lengths - dictionary of total sequence lengths, keyed by sequence
    - logger - optional logger used to report zero-length alignments; when
    None, nothing is logged

    Returns a tuple of four pandas dataframes followed by a boolean flag;
    query sequences are rows, subject sequences are columns:

    - alignment_lengths - symmetrical: total length of alignment
    - percentage_identity - symmetrical: percentage identity of alignment
    - alignment_coverage - non-symmetrical: coverage of query and subject
    - similarity_errors - symmetrical: count of similarity errors
    - zero_error - True if calculating percentage identity raised a
    ZeroDivisionError for at least one .delta file (the identity for that
    comparison is then recorded as 0)

    Pairs with no .delta file keep the initial values: NaN for
    alignment_lengths off the diagonal, 0 for similarity_errors, and 1.0
    for percentage_identity and alignment_coverage.

    A zero total alignment length no longer aborts the run; it is reported
    through zero_error instead. A ZeroDivisionError can still be raised if
    an entry of org_lengths is zero, as coverage is divided by it.
    """
    # Process directory to identify input files
    deltafiles = pyani_files.get_input_files(delta_dir, '.delta')
    # Use one concrete list of labels for every dataframe axis
    labels = list(org_lengths.keys())
    # Hold data in pandas dataframe
    alignment_lengths = pd.DataFrame(index=labels, columns=labels,
                                     dtype=float)
    similarity_errors = pd.DataFrame(index=labels, columns=labels,
                                     dtype=float).fillna(0)
    percentage_identity = pd.DataFrame(index=labels, columns=labels,
                                       dtype=float).fillna(1.0)
    alignment_coverage = pd.DataFrame(index=labels, columns=labels,
                                      dtype=float).fillna(1.0)
    # Fill diagonal NA values for alignment_length with org_lengths.
    # A single .loc call is used because chained indexing (df[a][b] = x)
    # may assign to a temporary object rather than to the dataframe.
    for org, length in org_lengths.items():
        alignment_lengths.loc[org, org] = length
    # Process .delta files assuming that the filename format holds:
    # org1_vs_org2.delta
    zero_error = False  # flag to register a divide-by-zero error
    for deltafile in deltafiles:
        basename = os.path.split(deltafile)[-1]
        qname, sname = os.path.splitext(basename)[0].split('_vs_')
        tot_length, tot_sim_error = parse_delta(deltafile)
        if tot_length == 0 and logger is not None:
            logger.warning("Total alignment length reported in %s is zero!",
                           deltafile)
        query_cover = float(tot_length) / org_lengths[qname]
        sbjct_cover = float(tot_length) / org_lengths[sname]
        # Calculate percentage ID of aligned length. This fails if
        # total length is zero.
        # Common causes are that a NUCmer run failed, or that a very
        # distant sequence was included in the analysis.
        try:
            perc_id = 1 - float(tot_sim_error) / tot_length
        except ZeroDivisionError:
            # logger is optional: only report when one was supplied, so
            # that the fallback below is reached instead of an
            # AttributeError on None
            if logger is not None:
                logger.error("One or more NUCmer output files has a problem.")
                logger.error("This is possibly due to a NUCmer comparison "
                             "being too distant for use. If so, please "
                             "consider using the --maxmatch option.")
                logger.error("Alternatively, this may be due to NUCmer run "
                             "failure: analysis may continue, but please "
                             "investigate.")
            perc_id = 0  # set arbitrary value of zero identity
            zero_error = True
        # Populate dataframes: .loc takes [row, column] order. The
        # symmetrical tables receive the same value in both cells.
        alignment_lengths.loc[qname, sname] = tot_length
        alignment_lengths.loc[sname, qname] = tot_length
        similarity_errors.loc[qname, sname] = tot_sim_error
        similarity_errors.loc[sname, qname] = tot_sim_error
        percentage_identity.loc[qname, sname] = perc_id
        percentage_identity.loc[sname, qname] = perc_id
        # Coverage is asymmetrical: query coverage is written to
        # row=subject, column=query; subject coverage to row=query,
        # column=subject.
        # NOTE(review): this looks transposed relative to the docstring's
        # "query sequences are rows" - confirm against downstream users
        # before changing.
        alignment_coverage.loc[sname, qname] = query_cover
        alignment_coverage.loc[qname, sname] = sbjct_cover
    return (alignment_lengths, percentage_identity, alignment_coverage,
            similarity_errors, zero_error)