def test_get_various_datatypes(tmpdir):
    """Request three genome data types for the phage and check each file exists."""
    genomes = GenomeCollection(data_dir=str(tmpdir))
    requested_types = ("protein_fasta", "genomic_fasta", "genomic_genbank")
    for requested_type in requested_types:
        data_path = genomes.get_taxid_genome_data_path(
            taxid=PHAGE_TAXID, data_type=requested_type
        )
        assert os.path.exists(data_path)
def test_blast_against_taxid(tmpdir):
    """Run blastn against the phage taxid and check the size of the results file."""
    genomes = GenomeCollection(data_dir=str(tmpdir))
    results_path = os.path.join(str(tmpdir), "results.txt")
    query_path = os.path.join("tests", "queries.fa")

    # The results file must only appear as a consequence of the BLAST call.
    assert not os.path.exists(results_path)

    blast_command = ["blastn", "-query", query_path, "-out", results_path]
    genomes.blast_against_taxid(PHAGE_TAXID, "nucl", blast_command)

    results_size = os.path.getsize(results_path)
    assert 800 < results_size < 1200
def test_avoid_matches_with_phage():
    """Check AvoidMatches finds 5 breaches against the phage index, then resolves them."""
    phage_taxid = "697289"
    genomes = GenomeCollection()
    bowtie_index = genomes.get_taxid_bowtie_index_path(phage_taxid, version="1")

    # Build the sequence before the constraint, as the original call did.
    sequence = random_dna_sequence(30, seed=123)
    avoid_phage = AvoidMatches(bowtie_index=bowtie_index, match_length=10)
    problem = DnaOptimizationProblem(
        sequence=sequence, constraints=[avoid_phage], logger=None
    )

    evaluations = problem.constraints_evaluations()
    breach_locations = evaluations.all_locations()
    assert len(breach_locations) == 5

    problem.resolve_constraints()
    assert problem.all_constraints_pass()
def test_avoid_phage_blast_matches():
    """Check AvoidBlastMatches fails on a random sequence, then passes once resolved."""
    phage_taxid = "697289"
    genomes = GenomeCollection()
    phage_blastdb = genomes.get_taxid_blastdb_path(phage_taxid, db_type="nucl")

    # Build the sequence before the constraint, as the original call did.
    sequence = random_dna_sequence(30, seed=123)
    avoid_phage = AvoidBlastMatches(
        blast_db=phage_blastdb, min_align_length=10, word_size=7
    )
    problem = DnaOptimizationProblem(
        sequence=sequence, constraints=[avoid_phage], logger=None
    )

    assert not problem.all_constraints_pass()
    problem.resolve_constraints()
    assert problem.all_constraints_pass()
def test_get_blast_database(tmpdir):
    """Build nucleotide and protein BLAST databases and check their file sizes."""
    genomes = GenomeCollection(data_dir=str(tmpdir))
    phage = PHAGE_TAXID
    expected_path = genomes.datafile_path(phage, data_type="blast_nucl")

    # Nothing should exist on disk before the database is requested.
    assert not os.path.exists(expected_path + ".nsq")

    # Nucleotide database: returned path must match the expected location.
    nucl_db_path = genomes.get_taxid_blastdb_path(phage, db_type="nucl")
    assert expected_path == nucl_db_path
    nucl_size = os.path.getsize(nucl_db_path + ".nsq")
    assert 30_000 < nucl_size < 50_000

    # Protein database.
    prot_db_path = genomes.get_taxid_blastdb_path(phage, db_type="prot")
    prot_size = os.path.getsize(prot_db_path + ".psq")
    assert 40_000 < prot_size < 60_000
def test_delete_all_data_files(tmpdir):
    """Fetch infos for three taxids, then check removing all local files leaves none."""
    genomes = GenomeCollection(data_dir=str(tmpdir))
    for taxid in ("224308", "511145", "559292"):
        genomes.get_taxid_infos(taxid)

    available_before = genomes.list_locally_available_taxids("infos")
    assert len(available_before) == 3

    genomes.remove_all_local_data_files()

    available_after = genomes.list_locally_available_taxids("infos")
    assert len(available_after) == 0
def test_autodownload_false(tmpdir):
    """Check that lookups raise FileNotFoundError when autodownload is disabled."""
    taxid = "224308"
    genomes = GenomeCollection(data_dir=str(tmpdir))
    genomes.autodownload = False

    # Missing infos file.
    with pytest.raises(FileNotFoundError) as infos_error:
        genomes.get_taxid_infos(taxid)
    infos_message = str(infos_error.value)
    assert "No infos" in infos_message

    # Missing genome data file.
    with pytest.raises(FileNotFoundError) as genome_error:
        genomes.get_taxid_genome_data_path(taxid)
    genome_message = str(genome_error.value)
    assert "No genome" in genome_message
def test_get_biopython_records(tmpdir):
    """Check the phage genome loads as a single record of the expected length."""
    genomes = GenomeCollection(data_dir=str(tmpdir))
    phage_records = genomes.get_taxid_biopython_records(PHAGE_TAXID)
    assert len(phage_records) == 1

    genome_length = len(phage_records[0])
    assert 168000 < genome_length < 170000
"""Example of use for AvoidBlastMatches. In this example we create a 1000bp random sequence, then edit out every match with E. coli that is 14bp or longer. """ from dnachisel import DnaOptimizationProblem, random_dna_sequence, AvoidMatches from genome_collector import GenomeCollection # THIS CREATES THE ECOLI BLAST DATABASE ON YOUR MACHINE IF NOT ALREADY HERE collection = GenomeCollection() ecoli_index = collection.get_taxid_bowtie_index_path(511145, version="1") # DEFINE AND SOLVE THE PROBLEM problem = DnaOptimizationProblem( sequence=random_dna_sequence(500, seed=123), constraints=[ AvoidMatches(bowtie_index=ecoli_index, match_length=15, mismatches=1) ], ) print( "Constraints validity before optimization\n", problem.constraints_text_summary(), ) print("\nNow resolving the problems\n") problem.resolve_constraints(final_check=True)
In this example we create a 1000bp random sequence, then edit out every match with E. coli that is 14bp or longer. """ import os from genome_collector import GenomeCollection from dnachisel import ( DnaOptimizationProblem, random_dna_sequence, AvoidBlastMatches, ) # THIS CREATES THE ECOLI BLAST DATABASE ON YOUR MACHINE IF NOT ALREADY HERE collection = GenomeCollection() ecoli_blastdb = collection.get_taxid_blastdb_path(511145, db_type="nucl") # DEFINE AND SOLVE THE PROBLEM problem = DnaOptimizationProblem( sequence=random_dna_sequence(500, seed=123), constraints=[ AvoidBlastMatches( blast_db=ecoli_blastdb, min_align_length=13, perc_identity=100, word_size=5, # The bigger the word size, the faster e_value=1e20, # ungapped=False )
# This is the basic example of the README to get you started.
from genome_collector import GenomeCollection
import subprocess

# GET A BLAST PATH
collection = GenomeCollection()
db_path = collection.get_taxid_blastdb_path(taxid=511145, db_type="nucl")

# RUN BLASTN ON THE QUERIES FILE, WRITING THE HITS TO A RESULTS FILE
process = subprocess.run(
    [
        "blastn",
        "-db",
        db_path,
        "-query",
        "basic_example_queries.fa",
        "-out",
        "basic_example_results.txt",
    ],
    stderr=subprocess.PIPE,
    # Decode stderr to str so the error below shows readable text rather
    # than a bytes repr (b'...').
    universal_newlines=True,
)

# A non-zero return code means blastn failed; surface what it wrote to stderr.
if process.returncode:
    raise OSError("BLAST failed: %s" % process.stderr)
print("All good! see basic_example_results.txt for results.")