Example #1
def test_maybe_write_parquet(reads, single_alphabet_ksize_true_scores,
                             true_scores_parquet):
    create_ss = CreateSaveSummary(
        # Positional arguments; meanings inferred from the matching
        # CreateSaveSummary(...) call inside cli() in Example #7.
        reads,                              # input read file(s)
        True,                               # csv output
        true_scores_parquet,                # parquet output path
        True,                               # json summary output
        "bloom_filter.nodegraph",           # peptide bloom filter filename
        "protein",                          # peptide alphabet
        7,                                  # peptide k-mer size
        0.5,                                # jaccard threshold
        single_alphabet_ksize_true_scores,  # per-read coding scores
    )
    create_ss.maybe_write_parquet()
Example #2
def test_maybe_write_json_summary_empty(coding_scores_empty, alphabet,
                                        peptide_bloom_filter_path,
                                        peptide_ksize):
    create_ss = CreateSaveSummary(
        ["nonexistent.fa"],
        True,
        True,
        True,
        peptide_bloom_filter_path,
        alphabet,
        peptide_ksize,
        DEFAULT_JACCARD_THRESHOLD,
        coding_scores_empty,
    )
    summary = create_ss.maybe_write_json_summary()
    assert summary["input_files"] == ["nonexistent.fa"]
    assert summary["jaccard_info"]["count"] == 0
Example #3
def test_make_empty_coding_categories(single_alphabet_ksize_true_scores):
    create_ss = CreateSaveSummary(
        ["nonexistent.fa"],
        True,
        True,
        True,
        "bloom_filter.nodegraph",
        "protein",
        7,
        0.5,
        single_alphabet_ksize_true_scores,
    )
    test_coding_categories = {
        "Translation is shorter than peptide k-mer size + 1": 0,
        "Translation frame has stop codon(s)": 0,
        "Coding": 0,
        "Non-coding": 0,
        "Low complexity nucleotide": 0,
        "Read length was shorter than 3 * peptide k-mer size": 0,
        "Low complexity peptide in protein20 alphabet": 0,
    }
    true_coding_categories = create_ss.make_empty_coding_categories()
    assert true_coding_categories == test_coding_categories
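For reference, a minimal sketch of what make_empty_coding_categories presumably builds: the expected dict above, with the low-complexity-peptide key parameterised by the canonical alphabet name (cf. the ALIAS_TO_ALPHABET lookup in Example #6, where "protein" presumably maps to "protein20"). This is an illustration of that assumption, not orpheum's implementation.

# Hypothetical sketch, not orpheum's code.
def make_empty_coding_categories_sketch(canonical_alphabet="protein20"):
    return {
        "Translation is shorter than peptide k-mer size + 1": 0,
        "Translation frame has stop codon(s)": 0,
        "Coding": 0,
        "Non-coding": 0,
        "Low complexity nucleotide": 0,
        "Read length was shorter than 3 * peptide k-mer size": 0,
        f"Low complexity peptide in {canonical_alphabet} alphabet": 0,
    }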
Example #4
def test_get_n_translated_frames_per_read(coding_scores_nonempty, alphabet,
                                          peptide_bloom_filter_path,
                                          peptide_ksize):
    create_ss = CreateSaveSummary(
        ["nonexistent.fa"],
        True,
        True,
        True,
        peptide_bloom_filter_path,
        alphabet,
        peptide_ksize,
        DEFAULT_JACCARD_THRESHOLD,
        coding_scores_nonempty,
    )
    percentages, histogram = create_ss.get_n_translated_frames_per_read()
    assert histogram == {
        "Number of reads with 1 putative protein-coding translations": 5,
        "Number of reads with 2 putative protein-coding translations": 2,
        "Number of reads with 6 putative protein-coding translations": 1,
        "Number of reads with 5 putative protein-coding translations": 1,
        "Number of reads with 4 putative protein-coding translations": 1,
        "Number of reads with 3 putative protein-coding translations": 1,
    }
    assert percentages == {
        "Number of reads with 1 putative protein-coding translations":
        45.45454545454545,
        "Number of reads with 2 putative protein-coding translations":
        18.181818181818183,
        "Number of reads with 6 putative protein-coding translations":
        9.090909090909092,
        "Number of reads with 5 putative protein-coding translations":
        9.090909090909092,
        "Number of reads with 4 putative protein-coding translations":
        9.090909090909092,
        "Number of reads with 3 putative protein-coding translations":
        9.090909090909092,
    }
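The expected values are consistent with coding_scores_nonempty (a pytest fixture not shown here) covering 11 reads (5 + 2 + 1 + 1 + 1 + 1), with each percentage equal to 100 * count / 11. A minimal re-derivation of that arithmetic, independent of orpheum's implementation:

# Hypothetical re-derivation of the expected percentages above.
histogram = {1: 5, 2: 2, 3: 1, 4: 1, 5: 1, 6: 1}  # n coding frames -> n reads
total_reads = sum(histogram.values())  # 11
percentages = {n: 100 * count / total_reads for n, count in histogram.items()}
assert round(percentages[1], 3) == 45.455  # 5 of 11 reads have one coding frame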
Example #5
def test_generate_coding_summary(reads, data_folder,
                                 single_alphabet_ksize_true_scores):
    create_ss = CreateSaveSummary(
        reads,
        True,
        True,
        True,
        "bloom_filter.nodegraph",
        "protein",
        7,
        0.5,
        single_alphabet_ksize_true_scores,
    )
    test_summary = create_ss.generate_coding_summary()
    true_summary = {
        "input_files":
        ["SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled_n22.fq"],
        "jaccard_info": {
            "count": 44,
            "mean": 0.085830733808675,
            "std": 0.2503210514088884,
            "min": 0.0,
            "25%": 0.0,
            "50%": 0.0,
            "75%": 0.0588235294117647,
            "max": 1.0,
        },
        "categorization_counts": {
            "Translation is shorter than peptide k-mer size + 1": 0,
            "Translation frame has stop codon(s)": 3,
            "Coding": 3,
            "Non-coding": 14,
            "Low complexity nucleotide": 0,
            "Read length was shorter than 3 * peptide k-mer size": 2,
            "Low complexity peptide in protein20 alphabet": 1,
        },
        "categorization_percentages": {
            "Translation is shorter than peptide k-mer size + 1": 0.0,
            "Translation frame has stop codon(s)": 13.043478260869565,
            "Coding": 13.043478260869565,
            "Non-coding": 60.869565217391305,
            "Low complexity nucleotide": 0.0,
            "Read length was shorter than 3 * peptide k-mer size":
            8.695652173913043,
            "Low complexity peptide in protein20 alphabet": 4.3478260869565215,
        },
        "histogram_n_coding_frames_per_read": {
            "Number of reads with 1 putative protein-coding translations": 3
        },
        "histogram_n_coding_frames_per_read_percentages": {
            "Number of reads with 1 putative protein-coding translations":
            100.0
        },
        "peptide_bloom_filter": "bloom_filter.nodegraph",
        "peptide_alphabet": "protein",
        "peptide_ksize": 7,
        "jaccard_threshold": 0.5,
    }
    for key, value in test_summary.items():
        if type(value) is str:
            assert value == true_summary[key]
        elif type(value) is dict:
            for key_value, value_value in value.items():
                if type(value_value) is str:
                    assert value_value == true_summary[key][key_value]
                else:
                    assert_almost_equal(
                        float(value_value),
                        float(true_summary[key][key_value]),
                        decimal=13,
                    )
        else:
            assert value == true_summary[key]
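The jaccard_info block has exactly the keys produced by pandas' describe() on a numeric column (count, mean, std, min, 25%, 50%, 75%, max), so the summary is presumably derived from a DataFrame of the coding scores. A minimal sketch of that assumption; the column name "jaccard_in_peptide_db" is hypothetical:

import pandas as pd

# Sketch only: build a describe()-style dict like the jaccard_info above.
scores = pd.DataFrame(
    [["read1", 0.9], ["read2", 0.0]],
    columns=["read_id", "jaccard_in_peptide_db"],  # hypothetical column names
)
jaccard_info = scores["jaccard_in_peptide_db"].describe().to_dict()
# keys: 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'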
Example #6
def test_get_n_per_coding_category(
    coding_scores_nonempty,
    alphabet,
    peptide_bloom_filter_path,
    peptide_ksize,
    jaccard_threshold,
):
    from orpheum.sequence_encodings import ALIAS_TO_ALPHABET

    data = [
        ["read1", 0.9, 0, "Non-coding", 0, ""],
        ["read1", 0.9, 0, "Coding", 0, ""],
        ["read1", 0.9, 0, "Non-coding", 0, ""],
        ["read2", 0.9, 0, "Translation frame has stop codon(s)", 0, ""],
        ["read3", 0.9, 0, "Coding", 0, ""],
        ["read4", 0.9, 0, "Non-coding", 0, ""],
        ["read5", 0.9, 0, "Low complexity nucleotide", 0, ""],
        [
            "read6", 0.9, 0,
            "Read length was shorter than 3 * peptide k-mer size", 0, ""
        ],
        ["read7", 0.9, 0, LOW_COMPLEXITY_CATEGORIES[alphabet], 0, ""],
    ]

    create_ss = CreateSaveSummary(
        ["nonexistent.fa"],
        True,
        True,
        True,
        peptide_bloom_filter_path,
        alphabet,
        peptide_ksize,
        jaccard_threshold,
        data,
    )

    test_counts, test_percentages = create_ss.get_n_per_coding_category()
    canonical_alphabet = ALIAS_TO_ALPHABET[alphabet]
    # read1 and read3 each count as coding; no read falls in the
    # too-short-peptide category
    true_counts = {
        "Translation is shorter than peptide k-mer size + 1":
        0.0,
        "Translation frame has stop codon(s)":
        14.285714285714286,
        "Coding":
        28.571428571428573,
        "Non-coding":
        14.285714285714286,
        "Low complexity nucleotide":
        14.285714285714286,
        "Read length was shorter than 3 * peptide k-mer size":
        14.285714285714286,
        f"Low complexity peptide in {canonical_alphabet} alphabet":
        14.285714285714286,
    }
    true_percentages = {
        "Translation is shorter than peptide k-mer size + 1": 0,
        "Translation frame has stop codon(s)": 1,
        "Coding": 2,
        "Non-coding": 1,
        "Low complexity nucleotide": 1,
        "Read length was shorter than 3 * peptide k-mer size": 1,
        f"Low complexity peptide in {canonical_alphabet} alphabet": 1,
    }
    assert test_counts == true_counts
    assert test_percentages == true_percentages
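Judging by the expected values, get_n_per_coding_category appears to return (percentages, counts) in that order, mirroring get_n_translated_frames_per_read in Example #4: true_counts above holds percentages summing to 100, while true_percentages holds per-read counts summing to 7. Under the assumption that each read contributes exactly one category, and that a read counts as "Coding" if any of its frames is coding (which is how read1 ends up as Coding despite its Non-coding rows), the expected numbers can be re-derived as follows. This sketch is illustrative, not orpheum's code:

from collections import Counter

# Hypothetical per-read categories matching the data above; the low-complexity
# peptide category string depends on the alphabet fixture.
per_read_category = {
    "read1": "Coding",  # has Coding and Non-coding frames; Coding wins
    "read2": "Translation frame has stop codon(s)",
    "read3": "Coding",
    "read4": "Non-coding",
    "read5": "Low complexity nucleotide",
    "read6": "Read length was shorter than 3 * peptide k-mer size",
    "read7": "Low complexity peptide in <canonical alphabet> alphabet",
}
counts = Counter(per_read_category.values())
percentages = {cat: 100 * n / len(per_read_category) for cat, n in counts.items()}
# counts["Coding"] == 2 and percentages["Coding"] == 100 * 2 / 7 ≈ 28.57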
Example #7
def cli(
    peptides,
    reads,
    peptide_ksize=None,
    save_peptide_bloom_filter=True,
    peptides_are_bloom_filter=False,
    jaccard_threshold=None,
    alphabet="protein",
    csv=None,
    parquet=None,
    json_summary=None,
    coding_nucleotide_fasta=None,
    noncoding_nucleotide_fasta=None,
    low_complexity_nucleotide_fasta=None,
    low_complexity_peptide_fasta=None,
    tablesize=constants_index.DEFAULT_MAX_TABLESIZE,
    n_tables=constants_index.DEFAULT_N_TABLES,
    long_reads=False,
    verbose=False,
):
    """Writes coding peptides from reads to standard output

    \b
    Sane defaults for peptide_ksize for different peptide encodings:
    - with "protein" or "peptide" --> --peptide-ksize = 5-10
      7 is pretty universal but can go down to 5 for less species specificity
      and up to 10 to be very specific
    - with "dayhoff" --> --peptide-ksize = 10-15
    - with "hydrophobic-polar" or "hp" --> --peptide-ksize = 15-21
      15 is pretty good but can go up to 21

    \b
    Parameters
    ----------
    reads : str
        Sequence file of reads to filter
    peptides : str
        Sequence file of peptides
    peptide_ksize : int
        Number of characters in amino acid words
    save_peptide_bloom_filter : str or bool
        Whether or not to save the created bloom filter to file. If a string,
        save to this filename
    peptides_are_bloom_filter : bool
        Input file of peptides is already a bloom filter
    jaccard_threshold : float
        Value between 0 and 1. By default, the (empirically-chosen) "best"
        threshold is chosen for each alphabet. For "protein" and "dayhoff",
        the default is 0.5, and for "hydrophobic-polar," it is 0.8, since it is
        so lossy it's more likely to match random sequence. These thresholds
        were determined empirically with a pre-chosen human RNA-seq dataset and
        human peptides.
    alphabet : str
        One of "protein"|"peptide", "dayhoff", or "hydrophobic-polar"|"hp" to
        encode the protein-coding space. Where "protein"|"peptide" is the
        original 20-letter amino acid encoding, Dayhoff ("dayhoff") is a lossy
        6-letter encoding that categorizes the amino acids into:
            1. Cysteine
            2. Small (A, G, P, S, T)
            3. Acid and Amide (D, E, N, Q)
            4. Basic (H, K, R)
            5. Hydrophobic (I, L, M, V)
            6. Aromatic (F, W, Y)
        Hydrophobic-polar maps to a mere two categories:
            1. Hydrophobic (A, F, G, I, L, M, P, V, W, Y)
            2. Polar (C, D, E, H, K, N, Q, R, S, T)
    csv : str
        Save the coding scores as a csv to this file
    parquet : str
        Save the coding scores as a parquet to this file
    long_reads : bool -- NOT IMPLEMENTED!!
        Input sequencing reads are long reads. Not implemented, but the plan
        is, instead of doing 6-frame translation as on the short reads, test
        all ATG (start codon) to stop codon reading frames for the one(s) that
        matches the known peptide database best. Unknown whether this requires
        new thresholds
    coding_nucleotide_fasta : None or str
        If specified, save coding nucleotide sequence to this file
    noncoding_nucleotide_fasta : None or str
        If specified, save noncoding nucleotide sequence to this file
    low_complexity_nucleotide_fasta : None or str
        If specified, save low complexity nucleotide sequence to this file
    low_complexity_peptide_fasta : None or str
        If specified, save low complexity peptide sequence to this file
    verbose : bool
        Whether or not to print lots of stuff. Can be specified multiple
        times, e.g. -vv, if you really like having everything in stdout

    \b
    Returns
    -------
    coding_peptides : str
        Outputs a fasta-formatted sequence of translated peptides
    """
    # \b above prevents re-wrapping of paragraphs
    translate_obj = Translate(locals())
    translate_obj.set_coding_scores_all_files()
    assemble_summary_obj = CreateSaveSummary(
        reads,
        csv,
        parquet,
        json_summary,
        translate_obj.peptide_bloom_filter_filename,
        alphabet,
        translate_obj.peptide_ksize,
        translate_obj.jaccard_threshold,
        translate_obj.coding_scores,
    )
    del translate_obj.coding_scores
    assemble_summary_obj.maybe_write_csv()
    assemble_summary_obj.maybe_write_parquet()
    assemble_summary_obj.maybe_write_json_summary()
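The Dayhoff and hydrophobic-polar groupings spelled out in the docstring can be written out as plain dictionaries. The sketch below only restates the docstring; the variable names are illustrative, not identifiers from orpheum.sequence_encodings:

# Illustrative restatement of the docstring's alphabet groupings; the real
# mappings live in orpheum.sequence_encodings under other names.
DAYHOFF_GROUPS = {
    "Cysteine": "C",
    "Small": "AGPST",
    "Acid and amide": "DENQ",
    "Basic": "HKR",
    "Hydrophobic": "ILMV",
    "Aromatic": "FWY",
}
HYDROPHOBIC_POLAR_GROUPS = {
    "Hydrophobic": "AFGILMPVWY",
    "Polar": "CDEHKNQRST",
}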