def test_wellSeparatedExons(self):
    """wellSeparatedExons leaves 3 exons from the query_blastn_output3.csv fixture."""
    blast_table = os.path.join(self.cwd, "BLAST", "query_blastn_output3.csv")
    largest = BLAST.getLargestExon(
        blast_table, E_value=0.001, ident=98, exon_len=300
    )
    separated = BLAST.wellSeparatedExons(largest)
    self.assertEqual(len(separated), 3)
def test_do_blast(self):
    """Run a hand-built blastn command via BLAST.do_blast and check the first hit.

    The first line of the CSV output must mention gene BGIBMGA000001.
    The BLAST database files and the CSV output are removed afterwards,
    even when the run or the assertion fails.
    """
    blast_dir = os.path.join(self.cwd, "BLAST")
    f = os.path.join(blast_dir, "query.fas")
    genome = os.path.join(blast_dir, "silkcds.fa")
    out_csv = f + "_out.csv"
    command = (
        "blastn -query "
        + f
        + " -db "
        + genome
        + " -task blastn "
        + "-evalue 0.0001 "
        + " -out "
        + out_csv
        + " -num_threads 1 "
        + " -outfmt 10"
    )
    try:
        BLAST.makeblastdb(genome)
        BLAST.do_blast(command)
        with open(out_csv, "r") as output:
            first_line = output.readline()
        # An empty output yields "" here, so the test fails instead of
        # silently passing without ever asserting anything.
        self.assertTrue("BGIBMGA000001" in first_line)
    finally:
        # Files named "silkcds.fa<something>" are the generated database files;
        # the genome file itself (exactly 10 characters) is kept.
        for name in os.listdir(blast_dir):
            if name[:10] == "silkcds.fa" and len(name) > 10:
                os.remove(os.path.join(blast_dir, name))
        if os.path.exists(out_csv):
            os.remove(out_csv)
def test_get_cds(self):
    """Extracting genes and saving them as fasta file.

    BLAST.get_cds is expected to write ``pulled_seqs.fasta`` into the current
    working directory; the file must contain exactly 54397 characters.
    The file is removed afterwards even when the check fails, so a failed
    run does not leave stale output behind for the next one.
    """
    pulled = "pulled_seqs.fasta"
    try:
        BLAST.get_cds(self.genes, self.genome)
        with open(pulled, "r") as handle:
            result = len(handle.read())
        self.assertEqual(result, 54397)
    finally:
        if os.path.exists(pulled):
            os.remove(pulled)
def test_makeblastdb_false(self):
    """makeblastdb with mask=False should produce database files.

    The original test asserted ``"true" == "true"`` and therefore could never
    fail. It now checks that at least one generated file was found.
    """
    mask = False
    BLAST.makeblastdb(self.genome, mask)
    blast_dir = os.path.join(self.cwd, "BLAST")
    # NOTE(review): assumes self.genome is <cwd>/BLAST/silkcds.fa, so that the
    # database files appear in that folder as "silkcds.fa<suffix>" -- this is
    # what the original cleanup loop implied; confirm against setUp.
    created = [
        name
        for name in os.listdir(blast_dir)
        if name[:10] == "silkcds.fa" and len(name) > 10
    ]
    # Clean up before asserting so a failure leaves nothing behind.
    for name in created:
        os.remove(os.path.join(blast_dir, name))
    self.assertTrue(created, "makeblastdb did not create any database file")
def test_blastn(self):
    """BLAST.blastn of query.fas against silkcds.fa: first hit is BGIBMGA000001-TA.

    The second comma-separated field of the first output line is compared
    with the expected subject id.
    """
    blast_dir = self.cwd + "/BLAST/"
    try:
        BLAST.blastn(blast_dir + "query.fas", blast_dir + "silkcds.fa")
        with open(blast_dir + "query_blastn_out.csv", "r") as handle:
            first_line = handle.readline()
    finally:
        # Remove the generated database files ("silkcds.fa<suffix>") while
        # keeping the genome file itself.
        for name in os.listdir(blast_dir):
            if name[:10] == "silkcds.fa" and len(name) > 10:
                os.remove(blast_dir + name)
    # query_blastn_out.csv is deliberately left in place, as in the original;
    # NOTE(review): another test in this file reads that same path -- confirm
    # whether it relies on this output.
    self.assertTrue(first_line, "blastn produced an empty output file")
    result = first_line.split(",")[1]
    self.assertEqual(result, "BGIBMGA000001-TA")
def test_filterByMinDist(self):
    """filterByMinDist reports the genes lying too close to their neighbours."""
    min_distance = 810000
    gene1 = ("gene1", 1, 350)
    gene2 = ("gene2", 360, 670)
    gene3 = ("gene3", 821001, 821351)
    scenarios = [
        # gene2 is too close to other genes
        ([gene1, gene2, gene3], ["gene2"]),
        # the outcome does not depend on the order of the genes
        ([gene3, gene1, gene2], ["gene2"]),
        # these genes are well separated already
        ([gene1, gene3], []),
    ]
    for genes_loci, expected in scenarios:
        result = BLAST.filterByMinDist(genes_loci, min_distance)
        self.assertEqual(expected, result)
def test_storeExonsInFrame(self):
    """storeExonsInFrame writes sequences whose translation starts with MRRVVWFALV.

    The output must contain at least one record (the original loop-only
    assertion passed silently on an empty file), and the output file is
    removed even when an assertion fails.
    """
    # These seqs are in frame
    # (each value looks like one row of BLAST tabular output plus a trailing
    # field -- presumably a frame/strand marker; verify against BLAST module)
    exons_dict = {
        ("BGIBMGA000001-TA", "nscaf1070"): [
            "BGIBMGA000001-TA",
            "nscaf1070",
            "100.00",
            450,
            0,
            0,
            1,
            450,
            1,
            450,
            "0.0",
            " 812",
            1,
        ],
        ("BGIBMGA000002-TA", "nscaf1071"): [
            "BGIBMGA000002-TA",
            "nscaf1071",
            "100.00",
            350,
            0,
            0,
            1,
            350,
            1,
            350,
            "0.0",
            " 632",
            1,
        ],
    }
    queries_db = self.cwd + "/BLAST/queries_db1.fas"
    out_file = self.cwd + "/BLAST/outfile_storeExonsInFrame.csv"
    try:
        BLAST.storeExonsInFrame(exons_dict, queries_db, out_file)
        # Despite its .csv name the output is parsed as FASTA.
        records = list(SeqIO.parse(out_file, "fasta"))
        self.assertTrue(records, "storeExonsInFrame wrote no sequences")
        for record in records:
            translated_seq = record.seq.translate()[0:10]
            self.assertEqual("MRRVVWFALV", translated_seq)
    finally:
        if os.path.exists(out_file):
            os.remove(out_file)
def test_storeExonsInFrame_not_in_frame3(self):
    """storeExonsInFrame output for hits starting at position 2 translates to RVVWFALVRL.

    The output must contain at least one record (the original loop-only
    assertion passed silently on an empty file), and the output file is
    removed even when an assertion fails.
    """
    # Length of seq is not multiple of 3
    exons_dict = {
        ("BGIBMGA000001-TA", "nscaf1070"): [
            "BGIBMGA000001-TA",
            "nscaf1070",
            "100.00",
            449,
            0,
            0,
            2,
            451,
            2,
            451,
            "0.0",
            " 812",
            1,
        ],
        ("BGIBMGA000002-TA", "nscaf1071"): [
            "BGIBMGA000002-TA",
            "nscaf1071",
            "100.00",
            449,
            0,
            0,
            2,
            451,
            2,
            451,
            "0.0",
            " 832",
            1,
        ],
    }
    queries_db = self.cwd + "/BLAST/queries_db4.fas"
    out_file = self.cwd + "/BLAST/outfile_storeExonsInFrame.csv"
    try:
        BLAST.storeExonsInFrame(exons_dict, queries_db, out_file)
        # Despite its .csv name the output is parsed as FASTA.
        records = list(SeqIO.parse(out_file, "fasta"))
        self.assertTrue(records, "storeExonsInFrame wrote no sequences")
        for record in records:
            translated_seq = record.seq.translate()[0:10]
            self.assertEqual("RVVWFALVRL", translated_seq)
    finally:
        if os.path.exists(out_file):
            os.remove(out_file)
def test_blastn_big_query_file(self):
    """BLAST.blastn on the large (gzipped) query fixture: first hit is BGIBMGA000001-TA.

    The fixture ``query_big.fas.gz`` is gunzipped for the run and always
    gzipped again afterwards, so a failing run no longer leaves the fixture
    in a state where the next ``gunzip`` cannot find its input.
    """
    blast_dir = os.path.join(self.cwd, "BLAST")
    query_file = os.path.join(blast_dir, "query_big.fas.gz")
    gunzipped_query_file = os.path.join(blast_dir, "query_big.fas")
    out_csv = os.path.join(blast_dir, "query_big_blastn_out.csv")

    # check_call raises CalledProcessError on a non-zero exit status, so no
    # manual return-code check is needed. Passing an argument list avoids
    # the shell and therefore any quoting problems with the path.
    subprocess.check_call(["gunzip", query_file])
    try:
        BLAST.blastn(gunzipped_query_file, self.cwd + "/BLAST/silkcds.fa")
        with open(out_csv, "r") as handle:
            first_line = handle.readline()
        self.assertTrue(first_line, "blastn produced an empty output file")
        result = first_line.split(",")[1]
        self.assertEqual(result, "BGIBMGA000001-TA")
    finally:
        # Remove generated database files ("silkcds.fa<suffix>"), keeping the
        # genome file itself.
        for name in os.listdir(blast_dir):
            if name[:10] == "silkcds.fa" and len(name) > 10:
                os.remove(os.path.join(blast_dir, name))
        if os.path.exists(out_csv):
            os.remove(out_csv)
        # Restore the compressed fixture.
        subprocess.check_call(["gzip", gunzipped_query_file])
def test_blastParser(self):
    """blastParser extracts 2 sequences from each of the two BLAST tables."""
    blast_dir = self.cwd + "/BLAST/"
    query = blast_dir + "queries_db1.fas"
    sbj = blast_dir + "silkcds.fa"
    out = blast_dir + "output.txt"
    table_names = (
        "queries_db1_blastn_out.csv",
        "queries_db2_blastn_out.csv",  # with header row
    )
    for table_name in table_names:
        blast_table = blast_dir + table_name
        BLAST.blastn(query, sbj)
        BLAST.blastParser(blast_table, sbj, out)
        seq_ids = []
        for record in SeqIO.parse(out, "fasta"):
            seq_ids.append(record.id)
        self.assertEqual(2, len(seq_ids))
        os.remove(out)
import sys import glob import os from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord if len(sys.argv) < 2: print """This script takes as input a FASTA file and will put all sequences in same direction using blast results.""" sys.exit() infile = sys.argv[1].strip() BLAST.blastn(infile, infile) # remove BLAST files for file in glob.glob(infile + ".*"): os.remove(file) if os.path.isfile(infile + "_dust.asnb"): os.remove(infile + "_dust.asnb") # parse BLAST output file blast_file = infile.replace(".fasta", "_blastn_out.csv") f = open(blast_file, "r") lines = f.readlines() f.close() def reverse(id, infile): # reverse complement sequence of ID in FASTA file infile
#!/usr/bin/env python
"""BLASTn the single-copy gene sequences against the Bombyx mori genome.

Inputs are the FASTA file holding the pulled single-copy gene sequences
(data/pulled_seqs.fasta) and the Bombyx mori genome in FASTA format
(data/silkgenome.fa).
"""
import os
from pyphylogenomics import BLAST

query_seqs = 'data/pulled_seqs.fasta'
genome = 'data/silkgenome.fa'
BLAST.blastn(query_seqs, genome)
"""Command-line wrapper around BLAST.blastn.

Usage: <script> <query_seqs> <genome>

Both arguments are file paths; surrounding whitespace is stripped before
they are handed to BLAST.blastn.
"""
from pyphylogenomics import BLAST
import sys

# Fail with a usage hint instead of a bare IndexError traceback.
if len(sys.argv) < 3:
    sys.exit("Usage: %s <query_seqs> <genome>" % sys.argv[0])

query_seqs = sys.argv[1].strip()
genome = sys.argv[2].strip()
BLAST.blastn(query_seqs, genome)
#!/usr/bin/env python
"""Pull the CDS of all single-copy Bombyx mori genes into data/pulled_seqs.fasta."""
import os
from pyphylogenomics import OrthoDB
from pyphylogenomics import BLAST

# Find all single-copy genes for the silk moth Bombyx mori, using the table
# from OrthoDB as input file.
in_file = 'data/OrthoDB6_Arthropoda_tabtext.csv'
genes = OrthoDB.single_copy_genes(in_file, 'Bombyx mori')

# Pull all sequences for our gene IDs from the CDS file. The output
# (pulled_seqs.fasta, created in the current directory) is then moved into
# data/. Nothing is done when data/pulled_seqs.fasta already exists.
cds_file = "data/silkcds.fa"
if not os.path.exists("data/pulled_seqs.fasta"):
    BLAST.get_cds(genes, cds_file)
    os.rename("pulled_seqs.fasta", "data/pulled_seqs.fasta")
    # Report only after the move succeeded, and name the real destination
    # (the old message said "pulled_seqs.fa"). The call form with a single
    # string works under both Python 2 and Python 3.
    print("File moved to data/pulled_seqs.fasta")
def test_getLargestExon_output_has_headers(self):
    """getLargestExon returns 38 exons for the query_blastn_output2.csv fixture."""
    blast_table = self.cwd + "/BLAST/query_blastn_output2.csv"
    filters = {"E_value": 0.001, "ident": 98, "exon_len": 300}
    exons = BLAST.getLargestExon(blast_table, **filters)
    self.assertEqual(len(exons), 38)
def test_eraseFalsePosi(self):
    """eraseFalsePosi keeps 3 of the exons found in query_blastn_out.csv."""
    blast_table = self.cwd + "/BLAST/query_blastn_out.csv"
    candidates = BLAST.getLargestExon(
        blast_table, E_value=0.001, ident=98, exon_len=300
    )
    kept = BLAST.eraseFalsePosi(candidates)
    self.assertEqual(len(kept), 3)
#!/usr/bin/env python
"""Select long, unambiguous, well-separated exons from a BLASTn result table."""
import os
from pyphylogenomics import OrthoDB
from pyphylogenomics import BLAST
from pyphylogenomics import MUSCLE

# As stated before, we prefer long exons for each of the candidate genes
# (> 300 nucleotides).
blast_table = "data/pulled_seqs_blastn_out.csv"
exon_filters = {"E_value": 0.001, "ident": 98, "exon_len": 300}
exons = BLAST.getLargestExon(blast_table, **exon_filters)

# Some small segments of sequences might be similar to non-homologous regions
# of the genome. eraseFalsePosi keeps those matches of longest length, i.e.
# it drops presumable false positives.
exons = BLAST.eraseFalsePosi(exons)

# Ideally we want exons that are not too close to each other in the genome,
# to avoid gene linkage: keep only exons separated by > 810KB.
exons = BLAST.wellSeparatedExons(exons)
"""Pipeline: single-copy Bombyx mori genes -> BLASTn vs. genome -> in-frame exons."""
from pyphylogenomics import OrthoDB
from pyphylogenomics import BLAST

# 1. Single-copy genes of Bombyx mori according to the OrthoDB table.
in_file = 'grefs/OrthoDB7_Arthropoda_tabtext'
genes = OrthoDB.single_copy_genes(in_file, 'Bombyx mori')

# 2. Pull their CDS and BLAST them against the genome.
cds_file = 'grefs/silkcds.fa'
BLAST.get_cds(genes, cds_file)
BLAST.blastn('pulled_seqs.fasta', 'grefs/silkgenome.fa')

# 3. Keep the largest exon per gene, then drop presumable false positives.
exons = BLAST.getLargestExon(
    "pulled_seqs_blastn_out.csv",
    E_value=0.001,
    ident=98,
    exon_len=300,
)
exons = BLAST.eraseFalsePosi(exons)

# 4. Store the surviving exons in frame.
BLAST.storeExonsInFrame(exons, "pulled_seqs.fasta", "grefs/Bombyx_exons.fas")
#!/usr/bin/env python
"""BLASTn the Bombyx exons against the Dp_genome_v2 genome and parse the hits."""
from pyphylogenomics import BLAST

query_exons = "grefs/Bombyx_exons.fas"
target_genome = "grefs/Dp_genome_v2.fasta"
BLAST.blastn(query_exons, target_genome)

# Parse the BLAST table produced by the call above into a FASTA file of
# exons labelled with the species name "Danaus".
BLAST.blastParser(
    "grefs/Bombyx_exons_blastn_out.csv",
    target_genome,
    "grefs/Danaus_exons.fasta",
    sp_name="Danaus",
)
#!/usr/bin/env python
"""Command-line wrapper around BLAST.blastParser.

Usage: <script> <blast_output> <model_genome> <output_file> <species_name>

All four arguments are stripped of surrounding whitespace and passed to
BLAST.blastParser positionally, in that order.
"""
from pyphylogenomics import BLAST
import sys

# Fail with a usage hint instead of a bare IndexError traceback.
if len(sys.argv) < 5:
    sys.exit(
        "Usage: %s <blast_output> <model_genome> <output_file> <species_name>"
        % sys.argv[0]
    )

blast_output = sys.argv[1].strip()
model_genome = sys.argv[2].strip()
output_file = sys.argv[3].strip()
species_name = sys.argv[4].strip()
BLAST.blastParser(blast_output, model_genome, output_file, species_name)