def test_section_input_process(sections, lookup):
    """Run the pipeline with explicit ``sections`` and compare to stored results.

    Args:
        sections: Section definitions passed straight to ``DarwinianShift``.
            Must expose ``.columns`` (presumably a pandas DataFrame -- confirm
            against the parametrisation). Whether it has a ``'start'`` column
            selects which stored result set is used.
        lookup: Lookup object passed straight to ``DarwinianShift``.
    """
    np.random.seed(0)
    d = DarwinianShift(
        data=MUTATION_DATA_FILE,
        exon_file=EXON_FILE,
        reference_fasta=REFERENCE_FASTA_FILE,
        lookup=lookup,
        sections=sections,
        statistics=[CDFMonteCarloTest(), ChiSquareTest()],
        testing_random_seed=1,
        verbose=True,
    )
    d.run_all()

    test_name = 'sections1' if 'start' in sections.columns else 'sections2'
    scored_data_path = os.path.join(
        RESULTS_DIR, "scored_data_{}.pickle".format(test_name))
    results_path = os.path.join(
        RESULTS_DIR, "results_{}.pickle".format(test_name))

    # Output new test files. Do not uncomment unless results have changed and
    # you are confident the new results are correct.
    # Be careful to only overwrite for the particular test(s) that you want.
    # This function runs with many parameters. Uncommenting and running all
    # will overwrite all cases.
    # with open(scored_data_path, 'wb') as f:
    #     pickle.dump(d.scored_data, f)
    # with open(results_path, 'wb') as f:
    #     pickle.dump(d.results, f)

    # Context managers make sure the fixture files are closed after loading.
    with open(scored_data_path, 'rb') as f:
        expected_scored_data = pickle.load(f)
    assert_frame_equal(sort_dataframe(d.scored_data),
                       sort_dataframe(expected_scored_data),
                       check_dtype=False)

    with open(results_path, 'rb') as f:
        expected_results = pickle.load(f)
    assert_frame_equal(sort_dataframe(d.results),
                       sort_dataframe(expected_results))
def run_full_process(spectra, lookup, gene_list, transcript_list, deduplicate,
                     excluded_positions, use_longest_transcript_only,
                     exclude_synonymous, exclude_nonsense):
    """Run the whole pipeline for one option combination and check the output.

    This will be a very slow test. It checks that the process works from
    start to finish for all options by comparing ``scored_data`` and
    ``results`` with pickled expectations stored in ``RESULTS_DIR``.

    Args:
        spectra: Mapping whose ``'spectra'`` entry is passed to
            ``DarwinianShift`` as ``spectra``.
        lookup: Lookup object; its class name forms part of the fixture name.
        gene_list: Passed through to ``DarwinianShift``; may be ``None``.
        transcript_list: Passed through to ``DarwinianShift``; may be ``None``.
        deduplicate: Flag passed through; encoded as 0/1 in the fixture name.
        excluded_positions: Passed through; only ``None``/not-``None`` affects
            the fixture name.
        use_longest_transcript_only: Flag passed through; encoded as 0/1.
        exclude_synonymous: If true, ``'synonymous'`` is added to the excluded
            mutation types.
        exclude_nonsense: If true, ``'nonsense'`` is added to the excluded
            mutation types.
    """
    np.random.seed(0)

    # Results should be the same if either list is defined: the same
    # transcripts are always tested.
    if gene_list is not None or transcript_list is not None:
        gene_list_name = '2genes'
    else:
        gene_list_name = 'None'
    ep = '1' if excluded_positions is not None else '0'

    excluded_mutations = [
        mutation_type
        for mutation_type, excluded in (('synonymous', exclude_synonymous),
                                        ('nonsense', exclude_nonsense))
        if excluded
    ]

    test_name = "_".join([
        lookup.__class__.__name__,
        gene_list_name,
        str(int(deduplicate)),
        ep,
        str(int(use_longest_transcript_only)),
        str(int(exclude_synonymous)),
        str(int(exclude_nonsense)),
    ])

    statistics = [
        CDFMonteCarloTest(num_draws=1000),
        ChiSquareTest(),
        MonteCarloTest(stat_function=np.mean, num_draws=1000),
        KSTest(),
    ]
    d = DarwinianShift(
        data=MUTATION_DATA_FILE,
        exon_file=EXON_FILE,
        reference_fasta=REFERENCE_FASTA_FILE,
        lookup=lookup,
        statistics=statistics,
        spectra=spectra['spectra'],
        gene_list=gene_list,
        transcript_list=transcript_list,
        deduplicate=deduplicate,
        excluded_positions=excluded_positions,
        use_longest_transcript_only=use_longest_transcript_only,
        excluded_mutation_types=excluded_mutations,
        testing_random_seed=1,
        verbose=True,
    )
    d.run_all()

    scored_data_path = os.path.join(
        RESULTS_DIR, "scored_data_{}.pickle".format(test_name))
    results_path = os.path.join(
        RESULTS_DIR, "results_{}.pickle".format(test_name))

    # Output new test files. Do not uncomment unless results have changed and
    # you are confident the new results are correct.
    # Be careful to only overwrite for the particular test(s) that you want.
    # This function runs with many parameters. Uncommenting and running all
    # will overwrite all cases.
    # with open(scored_data_path, 'wb') as f:
    #     pickle.dump(d.scored_data, f)
    # with open(results_path, 'wb') as f:
    #     pickle.dump(d.results, f)

    # Context managers make sure the fixture files are closed after loading.
    with open(scored_data_path, 'rb') as f:
        expected_scored_data = pickle.load(f)
    assert_frame_equal(sort_dataframe(d.scored_data),
                       sort_dataframe(expected_scored_data),
                       check_dtype=False)

    with open(results_path, 'rb') as f:
        expected_results = pickle.load(f)
    assert_frame_equal(sort_dataframe(d.results),
                       sort_dataframe(expected_results))
def test_incorrect_bases():
    """Check the reference-mismatch counts for data with wrong reference bases.

    Loads ``test_mutation_data_wrong_bases.tsv`` and asserts the project-wide
    spectrum mismatch total and the mismatch count of one transcript.
    """
    wrong_bases_file = os.path.join(TEST_DATA_DIR,
                                    'test_mutation_data_wrong_bases.tsv')
    project_with_mismatches = DarwinianShift(
        data=wrong_bases_file,
        exon_file=EXON_FILE,
        reference_fasta=REFERENCE_FASTA_FILE,
        # Simple lookup: a score of 1 for every null mutation.
        lookup=lambda section: [1] * len(section.null_mutations),
        spectra=[GlobalKmerSpectrum(k=3)],
        low_mem=False,
    )
    assert project_with_mismatches.total_spectrum_ref_mismatch == 87

    transcript_section = project_with_mismatches.run_transcript(
        'ENST00000263388')
    assert transcript_section.ref_mismatch_count == 77
def proj():
    """Build a seeded ``DarwinianShift`` project, run it fully and return it.

    Uses a random dummy lookup, two statistical tests and three mutational
    spectra; ``run_all()`` is called before the project is returned.
    """
    np.random.seed(0)

    # Components are created in the same order as they are passed below.
    random_lookup = DummyValuesRandom(random_function=np.random.random,
                                      testing_random_seed=0)
    tests = [CDFMonteCarloTest(testing_random_seed=0), ChiSquareTest()]
    spectrum_set = (
        EvenMutationalSpectrum(),
        TranscriptKmerSpectrum(k=1),
        GlobalKmerSpectrum(k=3),
    )

    project_obj = DarwinianShift(
        data=MUTATION_DATA_FILE,
        exon_file=EXON_FILE,
        reference_fasta=REFERENCE_FASTA_FILE,
        lookup=random_lookup,
        statistics=tests,
        spectra=spectrum_set,
    )
    project_obj.run_all()
    return project_obj
def seq_pdb():
    """Return a scored section defined by a transcript and a PDB chain.

    Builds a seeded ``DarwinianShift`` project that reads PDB and SIFTS files
    from ``TEST_DATA_DIR`` (with ``download_sifts=False``), makes a section for
    transcript ``ENST00000263388`` / PDB ``4ZLP`` chain ``A`` covering
    ``start=1378`` to ``end=1640``, applies the scores and returns the section.
    """
    np.random.seed(0)

    # Components are created in the same order as they are passed below.
    random_lookup = DummyValuesRandom(random_function=np.random.random,
                                      testing_random_seed=0)
    tests = [CDFMonteCarloTest(testing_random_seed=0), ChiSquareTest()]
    spectrum_set = (
        EvenMutationalSpectrum(),
        TranscriptKmerSpectrum(k=1),
        GlobalKmerSpectrum(k=3),
    )

    project_obj = DarwinianShift(
        data=MUTATION_DATA_FILE,
        exon_file=EXON_FILE,
        reference_fasta=REFERENCE_FASTA_FILE,
        lookup=random_lookup,
        statistics=tests,
        spectra=spectrum_set,
        pdb_directory=TEST_DATA_DIR,
        sifts_directory=TEST_DATA_DIR,
        download_sifts=False,
    )

    section_definition = {
        'transcript_id': 'ENST00000263388',
        'pdb_id': '4ZLP',
        'pdb_chain': 'A',
        'start': 1378,
        'end': 1640,
    }
    scored_section = project_obj.make_section(section_definition)
    scored_section.apply_scores()
    return scored_section
def project():
    """Return a general-purpose ``DarwinianShift`` project.

    General project that can be used for many tests but doesn't take too long
    to load. No ``statistics`` argument is supplied, and ``run_all()`` is not
    called here.
    """
    return DarwinianShift(
        data=MUTATION_DATA_FILE,
        exon_file=EXON_FILE,
        reference_fasta=REFERENCE_FASTA_FILE,
        # Simple lookup: a score of 1 for every null mutation.
        lookup=lambda section: [1] * len(section.null_mutations),
        spectra=[GlobalKmerSpectrum(k=3)],
        low_mem=False,
    )
def test_neutral():
    """Compare ``homtest_sections`` on the two 'neutral' datasets to a pickle.

    Two projects are built from ``data1_neutral.tsv``/``spectrum1.txt`` and
    ``data2_neutral.tsv``/``spectrum2.txt`` over the same exon and reference
    files. A ``gene1`` section is loaded from each, and the weighted
    comparison result must equal the stored ``res_neutral.pickle``.
    """
    exon_file = os.path.join(FILE_DIR, 'gene1_exons.txt')
    reference_fasta = os.path.join(FILE_DIR, 'gene1.fasta')

    d1 = DarwinianShift(
        data=os.path.join(FILE_DIR, 'data1_neutral.tsv'),
        exon_file=exon_file,
        reference_fasta=reference_fasta,
        spectra=os.path.join(FILE_DIR, 'spectrum1.txt'),
    )
    d2 = DarwinianShift(
        data=os.path.join(FILE_DIR, 'data2_neutral.tsv'),
        exon_file=exon_file,
        reference_fasta=reference_fasta,
        spectra=os.path.join(FILE_DIR, 'spectrum2.txt'),
    )

    s1 = d1.make_section(gene='gene1')
    s1.load_section_mutations()
    s2 = d2.make_section(gene='gene1')
    s2.load_section_mutations()

    res = homtest_sections(s1, s2, use_weights=True)

    expected_path = os.path.join(FILE_DIR, "res_neutral.pickle")

    # Output new test file. Do not uncomment unless results have changed and
    # you are confident the new results are correct.
    # with open(expected_path, 'wb') as f:
    #     pickle.dump(res, f)

    # Context manager makes sure the fixture file is closed after loading.
    with open(expected_path, 'rb') as f:
        expected = pickle.load(f)
    assert res == expected
def project_spectrum():
    """Return a ``DarwinianShift`` project preloaded with five global spectra.

    A project that can do lots of the spectra. It needs to be set up with the
    spectra in advance so that the correct kmers are collected.
    """
    # One row per spectrum: (name, k, deduplicate_spectrum, ignore_strand).
    # k is the size of the kmer nucleotide context (3 means trinucleotides).
    spectrum_settings = (
        ('glob_k3', 3, False, False),
        ('glob_k1', 1, False, False),
        ('glob_k5', 5, False, False),
        ('glob_k3_dd', 3, True, False),
        ('glob_k3_is', 3, False, True),
    )
    global_spectra = [
        GlobalKmerSpectrum(
            deduplicate_spectrum=dedup,
            k=kmer_size,
            ignore_strand=strand_agnostic,
            # Replacement for missing values; can be useful to make non-zero
            # in some cases.
            missing_value=0,
            name=spectrum_name,
        )
        for spectrum_name, kmer_size, dedup, strand_agnostic in spectrum_settings
    ]

    return DarwinianShift(
        data=MUTATION_DATA_FILE,
        exon_file=EXON_FILE,
        reference_fasta=REFERENCE_FASTA_FILE,
        # Simple lookup: a score of 1 for every null mutation.
        lookup=lambda section: [1] * len(section.null_mutations),
        low_mem=False,
        spectra=global_spectra,
    )