def test_get_blast_database(tmpdir):
    """Request the nucleotide and protein BLAST databases for PHAGE_TAXID.

    The collection stores its data under ``tmpdir``. The test expects the
    nucleotide database to land at the path reported by ``datafile_path``
    and both sequence files to have a size within a fixed range.
    """
    genomes = GenomeCollection(data_dir=str(tmpdir))
    expected_nucl_path = genomes.datafile_path(
        PHAGE_TAXID, data_type="blast_nucl"
    )
    # The .nsq file must not be there before the database is requested.
    assert not os.path.exists(expected_nucl_path + ".nsq")

    # Nucleotide database: check its location and the size of the .nsq file.
    nucl_db = genomes.get_taxid_blastdb_path(PHAGE_TAXID, db_type="nucl")
    assert expected_nucl_path == nucl_db
    nsq_size = os.path.getsize(nucl_db + ".nsq")
    assert 30_000 < nsq_size < 50_000

    # Protein database: only the size of the .psq file is checked.
    prot_db = genomes.get_taxid_blastdb_path(PHAGE_TAXID, db_type="prot")
    psq_size = os.path.getsize(prot_db + ".psq")
    assert 40_000 < psq_size < 60_000
Exemplo n.º 2
0
def test_avoid_phage_blast_matches():
    """Check that an AvoidBlastMatches constraint fails, then gets resolved.

    A seeded 30-nucleotide random sequence is constrained against the
    nucleotide BLAST database of taxid 697289. The constraint is expected
    to fail initially and to pass after ``resolve_constraints``.
    """
    phage_taxid = "697289"
    phage_db = GenomeCollection().get_taxid_blastdb_path(
        phage_taxid, db_type="nucl"
    )

    # Build the sequence before the constraint, as in a single nested call.
    sequence = random_dna_sequence(30, seed=123)
    no_phage_matches = AvoidBlastMatches(
        blast_db=phage_db,
        min_align_length=10,
        word_size=7,
    )
    problem = DnaOptimizationProblem(
        sequence=sequence,
        constraints=[no_phage_matches],
        logger=None,
    )

    assert not problem.all_constraints_pass()
    problem.resolve_constraints()
    assert problem.all_constraints_pass()
Exemplo n.º 3
0
In this example we create a 500bp random sequence, then edit out every match
with E. coli that is 13bp or longer.

"""
import os
from genome_collector import GenomeCollection
from dnachisel import (
    DnaOptimizationProblem,
    random_dna_sequence,
    AvoidBlastMatches,
)

# THIS CREATES THE ECOLI BLAST DATABASE ON YOUR MACHINE IF NOT ALREADY HERE
# 511145 is the taxid passed for E. coli; db_type="nucl" asks for the
# nucleotide database. The returned path is what AvoidBlastMatches receives
# as its blast_db argument.

collection = GenomeCollection()
ecoli_blastdb = collection.get_taxid_blastdb_path(511145, db_type="nucl")

# DEFINE AND SOLVE THE PROBLEM

problem = DnaOptimizationProblem(
    sequence=random_dna_sequence(500, seed=123),
    constraints=[
        AvoidBlastMatches(
            blast_db=ecoli_blastdb,
            min_align_length=13,
            perc_identity=100,
            word_size=5, # The bigger the word size, the faster
            e_value=1e20,
            # ungapped=False
        )
    ],
# This is the basic example of the README to get you started.

from genome_collector import GenomeCollection
import subprocess

# GET A BLAST PATH
# Ask the collection for the path of the nucleotide BLAST database of
# taxid 511145.
collection = GenomeCollection()
db_path = collection.get_taxid_blastdb_path(taxid=511145, db_type="nucl")

# RUN BLASTN
# The queries are read from basic_example_queries.fa and the results are
# written to basic_example_results.txt. stderr is captured so that it can be
# reported if blastn exits with a non-zero return code.
process = subprocess.run(
    [
        "blastn",
        "-db",
        db_path,
        "-query",
        "basic_example_queries.fa",
        "-out",
        "basic_example_results.txt",
    ],
    stderr=subprocess.PIPE,
)
if process.returncode:
    # stderr is captured as bytes: decode it so the error message shows the
    # actual text rather than a b'...' repr with escaped newlines.
    raise OSError(
        "BLAST failed: %s" % process.stderr.decode(errors="replace")
    )

print("All good! see basic_example_results.txt for results.")