def test_section_input_process(sections, lookup):
    """Run DarwinianShift on explicit sections and compare with pickled results.

    Args:
        sections: Section definitions passed straight to ``DarwinianShift``.
            Must expose ``.columns``; if it contains a ``'start'`` column the
            ``sections1`` reference files are used, otherwise ``sections2``.
        lookup: Lookup passed straight to ``DarwinianShift``.

    Raises:
        AssertionError: If ``d.scored_data`` or ``d.results`` differ from the
            stored reference data frames.
    """
    np.random.seed(0)

    d = DarwinianShift(data=MUTATION_DATA_FILE,
                       exon_file=EXON_FILE,
                       reference_fasta=REFERENCE_FASTA_FILE,
                       lookup=lookup,
                       sections=sections,
                       statistics=[CDFMonteCarloTest(), ChiSquareTest()],
                       testing_random_seed=1,
                       verbose=True
                       )
    d.run_all()

    # The reference files are keyed on which kind of sections input was given.
    test_name = 'sections1' if 'start' in sections.columns else 'sections2'

    scored_data_path = os.path.join(RESULTS_DIR, "scored_data_{}.pickle".format(test_name))
    results_path = os.path.join(RESULTS_DIR, "results_{}.pickle".format(test_name))

    # Output new test files. Do not uncomment unless results have changed and you are confident
    # the new results are correct.
    # Be careful to only overwrite the particular test(s) that you want.
    # This function runs with many parameters. Uncommenting and running all will overwrite all cases.
    # with open(scored_data_path, 'wb') as f:
    #     pickle.dump(d.scored_data, f)
    # with open(results_path, 'wb') as f:
    #     pickle.dump(d.results, f)

    # Context managers make sure the fixture files are closed even if unpickling fails.
    with open(scored_data_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(d.scored_data), sort_dataframe(expected), check_dtype=False)

    with open(results_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(d.results), sort_dataframe(expected))
def run_full_process(spectra, lookup, gene_list, transcript_list, deduplicate, excluded_positions,
                      use_longest_transcript_only, exclude_synonymous, exclude_nonsense):
    """Run the whole DarwinianShift process for one option combination and check the output.

    This will be a very slow test. It checks that the process works from start
    to finish for all options by comparing against pickled reference results.

    Args:
        spectra: Mapping whose ``'spectra'`` entry is passed to ``DarwinianShift``.
        lookup: Lookup passed to ``DarwinianShift``; its class name is part of
            the reference file name.
        gene_list: Passed to ``DarwinianShift``; may be ``None``.
        transcript_list: Passed to ``DarwinianShift``; may be ``None``.
        deduplicate: Passed to ``DarwinianShift``; ``int(deduplicate)`` is part
            of the reference file name.
        excluded_positions: Passed to ``DarwinianShift``; may be ``None``.
        use_longest_transcript_only: Passed to ``DarwinianShift``; its ``int``
            value is part of the reference file name.
        exclude_synonymous: If truthy, ``'synonymous'`` is added to the
            excluded mutation types.
        exclude_nonsense: If truthy, ``'nonsense'`` is added to the excluded
            mutation types.

    Raises:
        AssertionError: If ``d.scored_data`` or ``d.results`` differ from the
            stored reference data frames.
    """
    np.random.seed(0)

    if gene_list is not None or transcript_list is not None:
        # Results should be same if any of these are defined. Always testing the same transcripts
        gene_list_name = '2genes'
    else:
        gene_list_name = 'None'

    ep = '1' if excluded_positions is not None else '0'

    excluded_mutations = []
    if exclude_synonymous:
        excluded_mutations.append('synonymous')
    if exclude_nonsense:
        excluded_mutations.append('nonsense')

    # One reference file pair exists per option combination; the name encodes the options.
    test_name = "_".join([lookup.__class__.__name__, gene_list_name, str(int(deduplicate)), ep,
                          str(int(use_longest_transcript_only)), str(int(exclude_synonymous)),
                          str(int(exclude_nonsense))])

    statistics = [CDFMonteCarloTest(num_draws=1000), ChiSquareTest(),
                  MonteCarloTest(stat_function=np.mean, num_draws=1000),
                  KSTest()]

    d = DarwinianShift(data=MUTATION_DATA_FILE,
                       exon_file=EXON_FILE,
                       reference_fasta=REFERENCE_FASTA_FILE,
                       lookup=lookup,
                       statistics=statistics,
                       spectra=spectra['spectra'],
                       gene_list=gene_list,
                       transcript_list=transcript_list,
                       deduplicate=deduplicate,
                       excluded_positions=excluded_positions,
                       use_longest_transcript_only=use_longest_transcript_only,
                       excluded_mutation_types=excluded_mutations,
                       testing_random_seed=1,
                       verbose=True
                       )
    d.run_all()

    scored_data_path = os.path.join(RESULTS_DIR, "scored_data_{}.pickle".format(test_name))
    results_path = os.path.join(RESULTS_DIR, "results_{}.pickle".format(test_name))

    # Output new test files. Do not uncomment unless results have changed and you are confident
    # the new results are correct.
    # Be careful to only overwrite the particular test(s) that you want.
    # This function runs with many parameters. Uncommenting and running all will overwrite all cases.
    # with open(scored_data_path, 'wb') as f:
    #     pickle.dump(d.scored_data, f)
    # with open(results_path, 'wb') as f:
    #     pickle.dump(d.results, f)

    # Context managers make sure the fixture files are closed even if unpickling fails.
    with open(scored_data_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(d.scored_data), sort_dataframe(expected), check_dtype=False)

    with open(results_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(d.results), sort_dataframe(expected))
def test_incorrect_bases():
    """Check the reference-mismatch counts for ``test_mutation_data_wrong_bases.tsv``.

    The project-wide mismatch total is checked first, then the count for the
    single transcript ``ENST00000263388``.
    """
    mutation_file = os.path.join(TEST_DATA_DIR, 'test_mutation_data_wrong_bases.tsv')

    project = DarwinianShift(
        data=mutation_file,
        exon_file=EXON_FILE,
        reference_fasta=REFERENCE_FASTA_FILE,
        # Simple lookup made here: one value of 1 per null mutation.
        lookup=lambda x: [1] * len(x.null_mutations),
        spectra=[GlobalKmerSpectrum(k=3)],
        low_mem=False,
    )
    assert project.total_spectrum_ref_mismatch == 87

    transcript_section = project.run_transcript('ENST00000263388')
    assert transcript_section.ref_mismatch_count == 77
# Example #4
# 0
def proj():
    """Build a DarwinianShift project with random dummy scores and run it in full.

    Returns:
        The ``DarwinianShift`` instance after ``run_all()`` has been called.
    """
    np.random.seed(0)

    random_lookup = DummyValuesRandom(random_function=np.random.random, testing_random_seed=0)
    tests = [CDFMonteCarloTest(testing_random_seed=0), ChiSquareTest()]
    mutational_spectra = (
        EvenMutationalSpectrum(),
        TranscriptKmerSpectrum(k=1),
        GlobalKmerSpectrum(k=3),
    )

    project = DarwinianShift(
        data=MUTATION_DATA_FILE,
        exon_file=EXON_FILE,
        reference_fasta=REFERENCE_FASTA_FILE,
        lookup=random_lookup,
        statistics=tests,
        spectra=mutational_spectra,
    )
    project.run_all()
    return project
# Example #5
# 0
def seq_pdb():
    """Create one scored section tied to PDB structure 4ZLP, chain A.

    The project reads PDB and SIFTS files from ``TEST_DATA_DIR`` with
    ``download_sifts=False``.

    Returns:
        The section made for residues 1378-1640 of transcript
        ``ENST00000263388``, after ``apply_scores()`` has been called on it.
    """
    np.random.seed(0)

    random_lookup = DummyValuesRandom(random_function=np.random.random, testing_random_seed=0)
    tests = [CDFMonteCarloTest(testing_random_seed=0), ChiSquareTest()]
    mutational_spectra = (
        EvenMutationalSpectrum(),
        TranscriptKmerSpectrum(k=1),
        GlobalKmerSpectrum(k=3),
    )

    project = DarwinianShift(
        data=MUTATION_DATA_FILE,
        exon_file=EXON_FILE,
        reference_fasta=REFERENCE_FASTA_FILE,
        lookup=random_lookup,
        statistics=tests,
        spectra=mutational_spectra,
        pdb_directory=TEST_DATA_DIR,
        sifts_directory=TEST_DATA_DIR,
        download_sifts=False,
    )

    section_definition = {
        'transcript_id': 'ENST00000263388',
        'pdb_id': '4ZLP',
        'pdb_chain': 'A',
        'start': 1378,
        'end': 1640,
    }
    section = project.make_section(section_definition)
    section.apply_scores()
    return section
# Example #6
# 0
def project():
    """Return a general-purpose DarwinianShift project.

    General project that can be used for many tests but doesn't take too long
    to load. Nothing is run on it here; it is returned right after construction.
    """
    general_project = DarwinianShift(
        data=MUTATION_DATA_FILE,
        exon_file=EXON_FILE,
        reference_fasta=REFERENCE_FASTA_FILE,
        # Simple lookup made here: one value of 1 per null mutation.
        lookup=lambda x: [1] * len(x.null_mutations),
        # stats=[CDFMonteCarloTest(testing_random_seed=0), ChiSquareTest()],
        spectra=[GlobalKmerSpectrum(k=3)],
        low_mem=False,
    )
    return general_project
# Example #7
# 0
def test_neutral():
    """Compare ``homtest_sections`` on the two "neutral" data sets with a stored result.

    Two projects are built on the same exon file and reference FASTA but with
    different mutation data and spectrum files. The ``gene1`` section of each
    is loaded and the pair is passed to ``homtest_sections`` with
    ``use_weights=True``.

    Raises:
        AssertionError: If the result differs from ``res_neutral.pickle``.
    """
    exon_file = os.path.join(FILE_DIR, 'gene1_exons.txt')
    reference_fasta = os.path.join(FILE_DIR, 'gene1.fasta')
    d1 = DarwinianShift(data=os.path.join(FILE_DIR, 'data1_neutral.tsv'),
                        exon_file=exon_file,
                        reference_fasta=reference_fasta,
                        spectra=os.path.join(FILE_DIR, 'spectrum1.txt'))

    d2 = DarwinianShift(data=os.path.join(FILE_DIR, 'data2_neutral.tsv'),
                        exon_file=exon_file,
                        reference_fasta=reference_fasta,
                        spectra=os.path.join(FILE_DIR, 'spectrum2.txt'))

    s1 = d1.make_section(gene='gene1')
    s1.load_section_mutations()

    s2 = d2.make_section(gene='gene1')
    s2.load_section_mutations()

    res = homtest_sections(s1, s2, use_weights=True)

    expected_path = os.path.join(FILE_DIR, "res_neutral.pickle")

    # Output new test file. Do not uncomment unless results have changed and you are confident
    # the new results are correct.
    # with open(expected_path, 'wb') as f:
    #     pickle.dump(res, f)

    # Context manager makes sure the fixture file is closed even if unpickling fails.
    with open(expected_path, 'rb') as f:
        expected = pickle.load(f)
    assert res == expected
# Example #8
# 0
def project_spectrum():
    """Return a DarwinianShift project configured with several global kmer spectra.

    A project that can do lots of the spectra. It needs to be set up with the
    spectra in advance so the correct kmers are collected.

    Returns:
        The constructed ``DarwinianShift`` instance; nothing is run on it here.
    """
    # One row per spectrum: (name, k, deduplicate_spectrum, ignore_strand).
    # k is the size of the kmer nucleotide context. Use 3 for trinucleotides.
    spectrum_options = (
        ('glob_k3', 3, False, False),
        ('glob_k1', 1, False, False),
        ('glob_k5', 5, False, False),
        ('glob_k3_dd', 3, True, False),
        ('glob_k3_is', 3, False, True),
    )

    spectrum_project = DarwinianShift(
        data=MUTATION_DATA_FILE,
        exon_file=EXON_FILE,
        reference_fasta=REFERENCE_FASTA_FILE,
        # Simple lookup made here: one value of 1 per null mutation.
        lookup=lambda x: [1] * len(x.null_mutations),
        # stats=[CDFMonteCarloTest(testing_random_seed=0), ChiSquareTest()],
        low_mem=False,
        spectra=[
            GlobalKmerSpectrum(
                deduplicate_spectrum=dedup,
                k=kmer_size,
                ignore_strand=strand_flag,
                # To replace missing values. Useful to make non-zero in some cases.
                missing_value=0,
                name=label,
            )
            for label, kmer_size, dedup, strand_flag in spectrum_options
        ],
    )
    return spectrum_project