Пример #1
0
 def test_wellSeparatedExons(self):
     """Check how many exons remain after BLAST.wellSeparatedExons.

     The exons are first taken from the ``query_blastn_output3.csv`` fixture
     with BLAST.getLargestExon; 3 of them are expected to remain.
     """
     blast_table = os.path.join(self.cwd, "BLAST", "query_blastn_output3.csv")
     largest = BLAST.getLargestExon(blast_table, E_value=0.001, ident=98, exon_len=300)
     separated = BLAST.wellSeparatedExons(largest)
     self.assertEqual(len(separated), 3)
Пример #2
0
    def test_do_blast(self):
        """Run a hand-built blastn command through BLAST.do_blast.

        The first line of the CSV output must mention ``BGIBMGA000001``.
        An empty output file now fails the test instead of passing vacuously,
        the output handle is closed, and generated files are removed even when
        the assertion fails.
        """
        f = os.path.join(self.cwd, "BLAST", "query.fas")
        genome = os.path.join(self.cwd, "BLAST", "silkcds.fa")
        out_file = f + "_out.csv"
        BLAST.makeblastdb(genome)

        command = (
            "blastn -query "
            + f
            + " -db "
            + genome
            + " -task blastn "
            + "-evalue 0.0001 "
            + " -out "
            + out_file
            + " -num_threads 1 "
            + " -outfmt 10"
        )
        try:
            BLAST.do_blast(command)
            with open(out_file, "r") as output:
                first_line = output.readline()
            self.assertTrue("BGIBMGA000001" in first_line)
        finally:
            # Remove the database files generated next to the genome, but
            # never the genome fixture itself.
            blast_dir = os.path.join(self.cwd, "BLAST")
            for name in os.listdir(blast_dir):
                if name.startswith("silkcds.fa") and name != "silkcds.fa":
                    os.remove(os.path.join(blast_dir, name))
            # Guarded so a failed BLAST run is not masked by a cleanup error.
            if os.path.isfile(out_file):
                os.remove(out_file)
Пример #3
0
 def test_get_cds(self):
     """Extracting genes and saving them as fasta file.

     BLAST.get_cds is expected to leave ``pulled_seqs.fasta`` in the current
     working directory; its content length is compared with 54397. The file
     is removed even when the assertion fails.
     """
     BLAST.get_cds(self.genes, self.genome)
     try:
         with open("pulled_seqs.fasta", "r") as f:
             result = len(f.read())
         self.assertEqual(result, 54397)
     finally:
         if os.path.isfile("pulled_seqs.fasta"):
             os.remove("pulled_seqs.fasta")
Пример #4
0
 def test_makeblastdb_false(self):
     """Call BLAST.makeblastdb with mask=False and expect database files.

     After the call at least one ``silkcds.fa*`` file (other than the genome
     itself) must exist in the BLAST fixture directory. These files are
     removed as they are found.
     """
     mask = False
     BLAST.makeblastdb(self.genome, mask)
     # Start from a definite value so a missing database produces a normal
     # assertion failure rather than a NameError.
     result = "false"
     blast_dir = os.path.join(self.cwd, "BLAST")
     for name in os.listdir(blast_dir):
         if name.startswith("silkcds.fa") and name != "silkcds.fa":
             os.remove(os.path.join(blast_dir, name))
             result = "true"
     self.assertEqual(result, "true")
Пример #5
0
 def test_blastn(self):
     """Blast query.fas against silkcds.fa and check the first subject id.

     The second comma-separated field of the first output line must be
     ``BGIBMGA000001-TA``. The output file itself is intentionally kept,
     as in the original test; only generated database files are removed.
     """
     result = None
     try:
         BLAST.blastn(self.cwd + "/BLAST/query.fas", self.cwd + "/BLAST/silkcds.fa")
         with open(self.cwd + "/BLAST/query_blastn_out.csv", "r") as blast_out:
             first_line = blast_out.readline()
         # An empty output leaves result as None and fails the assertion
         # below instead of raising NameError.
         if first_line:
             result = first_line.split(",")[1]
     finally:
         blast_dir = os.path.join(self.cwd, "BLAST")
         for name in os.listdir(blast_dir):
             if name.startswith("silkcds.fa") and name != "silkcds.fa":
                 os.remove(os.path.join(blast_dir, name))
     self.assertEqual(result, "BGIBMGA000001-TA")
Пример #6
0
    def test_filterByMinDist(self):
        """Exercise BLAST.filterByMinDist with a minimum distance of 810000.

        Each case pairs a list of (name, start, end) loci with the list of
        gene names the function is expected to return.
        """
        min_dist = 810000
        cases = [
            # The gene2 is too close to other genes
            (
                [("gene1", 1, 350), ("gene2", 360, 670), ("gene3", 821001, 821351)],
                ["gene2"],
            ),
            # The function is not affected by order of genes
            (
                [("gene3", 821001, 821351), ("gene1", 1, 350), ("gene2", 360, 670)],
                ["gene2"],
            ),
            # These genes are well separated already
            (
                [("gene1", 1, 350), ("gene3", 821001, 821351)],
                [],
            ),
        ]
        for genes_loci, expected in cases:
            self.assertEqual(expected, BLAST.filterByMinDist(genes_loci, min_dist))
Пример #7
0
    def test_storeExonsInFrame(self):
        """Store exons that are already in frame and check their translation.

        Every record written by BLAST.storeExonsInFrame must translate to a
        protein starting with ``MRRVVWFALV``. The output file is removed even
        when an assertion fails.
        """
        # These seqs are in frame.
        # NOTE(review): each value looks like one parsed BLAST hit row --
        # confirm the column meaning against BLAST.getLargestExon.
        exons_dict = {
            ("BGIBMGA000001-TA", "nscaf1070"): [
                "BGIBMGA000001-TA", "nscaf1070", "100.00",
                450, 0, 0, 1, 450, 1, 450,
                "0.0", " 812", 1,
            ],
            ("BGIBMGA000002-TA", "nscaf1071"): [
                "BGIBMGA000002-TA", "nscaf1071", "100.00",
                350, 0, 0, 1, 350, 1, 350,
                "0.0", " 632", 1,
            ],
        }
        queries_db = self.cwd + "/BLAST/queries_db1.fas"
        out_file = self.cwd + "/BLAST/outfile_storeExonsInFrame.csv"

        try:
            BLAST.storeExonsInFrame(exons_dict, queries_db, out_file)
            for record in SeqIO.parse(out_file, "fasta"):
                translated_seq = record.seq.translate()[0:10]
                self.assertEqual("MRRVVWFALV", translated_seq)
        finally:
            # Guarded so a failure before the file is written is not masked.
            if os.path.isfile(out_file):
                os.remove(out_file)
Пример #8
0
    def test_storeExonsInFrame_not_in_frame3(self):
        """Store exons whose length is not a multiple of 3.

        Every record written by BLAST.storeExonsInFrame must translate to a
        protein starting with ``RVVWFALVRL``. The output file is removed even
        when an assertion fails.
        """
        # Length of seq is not multiple of 3.
        # NOTE(review): each value looks like one parsed BLAST hit row --
        # confirm the column meaning against BLAST.getLargestExon.
        exons_dict = {
            ("BGIBMGA000001-TA", "nscaf1070"): [
                "BGIBMGA000001-TA", "nscaf1070", "100.00",
                449, 0, 0, 2, 451, 2, 451,
                "0.0", " 812", 1,
            ],
            ("BGIBMGA000002-TA", "nscaf1071"): [
                "BGIBMGA000002-TA", "nscaf1071", "100.00",
                449, 0, 0, 2, 451, 2, 451,
                "0.0", " 832", 1,
            ],
        }
        queries_db = self.cwd + "/BLAST/queries_db4.fas"
        out_file = self.cwd + "/BLAST/outfile_storeExonsInFrame.csv"

        try:
            BLAST.storeExonsInFrame(exons_dict, queries_db, out_file)
            for record in SeqIO.parse(out_file, "fasta"):
                translated_seq = record.seq.translate()[0:10]
                self.assertEqual("RVVWFALVRL", translated_seq)
        finally:
            # Guarded so a failure before the file is written is not masked.
            if os.path.isfile(out_file):
                os.remove(out_file)
Пример #9
0
    def test_blastn_big_query_file(self):
        """Blast the gunzipped ``query_big.fas`` fixture against silkcds.fa.

        The second comma-separated field of the first output line must be
        ``BGIBMGA000001-TA``. The fixture is gunzipped for the run and always
        gzipped again afterwards, even when the test fails, so the repository
        fixture is left unchanged.
        """
        blast_dir = os.path.join(self.cwd, "BLAST")
        query_file = os.path.join(blast_dir, "query_big.fas.gz")
        gunzipped_query_file = os.path.join(blast_dir, "query_big.fas")
        blast_out = os.path.join(blast_dir, "query_big_blastn_out.csv")

        # check_call raises CalledProcessError on a non-zero exit status, so
        # no explicit return-code branch is needed. Passing an argument list
        # (no shell) keeps paths containing spaces intact.
        subprocess.check_call(["gunzip", query_file])
        try:
            BLAST.blastn(gunzipped_query_file, self.cwd + "/BLAST/silkcds.fa")
            result = None
            with open(blast_out, "r") as handle:
                first_line = handle.readline()
            if first_line:
                result = first_line.split(",")[1]
            self.assertEqual(result, "BGIBMGA000001-TA")
        finally:
            # Remove generated database files, never the genome fixture.
            for name in os.listdir(blast_dir):
                if name.startswith("silkcds.fa") and name != "silkcds.fa":
                    os.remove(os.path.join(blast_dir, name))
            if os.path.isfile(blast_out):
                os.remove(blast_out)
            # Restore the compressed fixture.
            subprocess.check_call(["gzip", gunzipped_query_file])
Пример #10
0
    def test_blastParser(self):
        """Parse two BLAST tables and expect two sequences from each.

        For each table the query is blasted against the subject, the table is
        passed to BLAST.blastParser, and the resulting FASTA output must hold
        exactly 2 records. The output file is removed after each round.
        """
        query = self.cwd + "/BLAST/queries_db1.fas"
        sbj = self.cwd + "/BLAST/silkcds.fa"
        out = self.cwd + "/BLAST/output.txt"
        blast_tables = (
            self.cwd + "/BLAST/queries_db1_blastn_out.csv",
            # with header row
            self.cwd + "/BLAST/queries_db2_blastn_out.csv",
        )
        for blast_table in blast_tables:
            BLAST.blastn(query, sbj)
            BLAST.blastParser(blast_table, sbj, out)
            seq_ids = [record.id for record in SeqIO.parse(out, "fasta")]
            self.assertEqual(2, len(seq_ids))
            os.remove(out)
import sys
import glob
import os
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord



# Command-line entry: expects the path of a FASTA file as the first argument.
if len(sys.argv) < 2:
    # Parenthesised single-argument print works as both the Python 2
    # statement and the Python 3 function.
    print("""This script takes as input a FASTA file and will put all sequences
            in same direction using blast results.""")
    sys.exit()

infile = sys.argv[1].strip()
# NOTE(review): BLAST is not imported in the visible import block of this
# script -- confirm `from pyphylogenomics import BLAST` exists in the real file.
# Blast the file against itself.
BLAST.blastn(infile, infile)

# remove BLAST files
for blast_db_file in glob.glob(infile + ".*"):
    os.remove(blast_db_file)
if os.path.isfile(infile + "_dust.asnb"):
    os.remove(infile + "_dust.asnb")

# parse BLAST output file
blast_file = infile.replace(".fasta", "_blastn_out.csv")
with open(blast_file, "r") as f:
    lines = f.readlines()

def reverse(id, infile):
    # reverse complement sequence of ID in FASTA file infile
#!/usr/bin/env python

import os
from pyphylogenomics import BLAST


"""
Do a BLASTn of the sequences against the Bombyx mori genome. The input arguments
are your file containing the sequences for single-copy genes (pulled_seqs.fa) 
and your file with the genome of Bombyx mori which is in FASTA format (silkgenome.fa).
"""
# First argument is the query file, second is the genome file.
BLAST.blastn(
    'data/pulled_seqs.fasta',
    'data/silkgenome.fa',
)


import sys

from pyphylogenomics import BLAST

# Command-line entry: blast a query file against a genome file.
# Fail with a usage message (non-zero exit status) instead of an IndexError
# traceback when an argument is missing.
if len(sys.argv) < 3:
    sys.exit("usage: %s <query_seqs> <genome>" % sys.argv[0])

query_seqs = sys.argv[1].strip()
genome = sys.argv[2].strip()
BLAST.blastn(query_seqs, genome)
#!/usr/bin/env python

import os
from pyphylogenomics import OrthoDB
from pyphylogenomics import BLAST


"""
We will find all single-copy genes for the silk moth Bombyx mori using the table
from OrthoDB as input file:
"""
in_file = 'data/OrthoDB6_Arthropoda_tabtext.csv'
genes = OrthoDB.single_copy_genes(in_file, 'Bombyx mori')


"""
Pull all sequences for our gene IDs from the CDS file and write them to a file
pulled_seqs.fasta:
"""
cds_file = "data/silkcds.fa"

# Skip the extraction when the result is already present in data/.
if not os.path.exists("data/pulled_seqs.fasta"):
    BLAST.get_cds(genes, cds_file)
    # The file is renamed from pulled_seqs.fasta in the working directory;
    # report the move only after it has happened, under the real name.
    os.rename("pulled_seqs.fasta", "data/pulled_seqs.fasta")
    print("File moved to data/pulled_seqs.fasta")
Пример #15
0
 def test_getLargestExon_output_has_headers(self):
     """BLAST.getLargestExon on ``query_blastn_output2.csv`` should yield 38 exons."""
     blast_table = self.cwd + "/BLAST/query_blastn_output2.csv"
     largest = BLAST.getLargestExon(
         blast_table,
         E_value=0.001,
         ident=98,
         exon_len=300,
     )
     self.assertEqual(len(largest), 38)
Пример #16
0
 def test_eraseFalsePosi(self):
     """BLAST.eraseFalsePosi should leave 3 exons from ``query_blastn_out.csv``."""
     blast_table = self.cwd + "/BLAST/query_blastn_out.csv"
     largest = BLAST.getLargestExon(blast_table, E_value=0.001, ident=98, exon_len=300)
     kept = BLAST.eraseFalsePosi(largest)
     self.assertEqual(len(kept), 3)
#!/usr/bin/env python

import os
from pyphylogenomics import OrthoDB
from pyphylogenomics import BLAST
from pyphylogenomics import MUSCLE


"""
As stated before, we prefer long exons for each of the candidate genes ( > 300
nucleotides):
"""
exons = BLAST.getLargestExon(
    "data/pulled_seqs_blastn_out.csv",
    E_value=0.001,
    ident=98,
    exon_len=300,
)


"""
Some small segments of sequences might be similar to non-homologous regions of
the genome. We will use the function eraseFalsePosi to keep those matches of
longest length:
"""
# Drop presumable false positives.
exons = BLAST.eraseFalsePosi(exons)


"""
Ideally we want exons that are not too close to each other in the genome to
avoid gene linkage. So we will keep only those exons that are apart by 810
kilobases:
"""
# Keep exons separated by > 810KB
exons = BLAST.wellSeparatedExons(exons)

from pyphylogenomics import OrthoDB
from pyphylogenomics import BLAST

# Input locations.
in_file = 'grefs/OrthoDB7_Arthropoda_tabtext'
cds_file = 'grefs/silkcds.fa'

# Single-copy genes of Bombyx mori, then their CDS sequences.
genes = OrthoDB.single_copy_genes(in_file, 'Bombyx mori')
BLAST.get_cds(genes, cds_file)

# Blast the pulled sequences against the genome.
BLAST.blastn('pulled_seqs.fasta', 'grefs/silkgenome.fa')

# Select exons from the blast table, drop presumable false positives,
# and store the result.
exons = BLAST.getLargestExon(
    "pulled_seqs_blastn_out.csv",
    E_value=0.001,
    ident=98,
    exon_len=300,
)
exons = BLAST.eraseFalsePosi(exons)
BLAST.storeExonsInFrame(
    exons,
    "pulled_seqs.fasta",
    "grefs/Bombyx_exons.fas",
)
Пример #19
0
#!/usr/bin/env python

from pyphylogenomics import BLAST

# Blast the Bombyx exons against the Dp_genome_v2 genome file.
BLAST.blastn(
    "grefs/Bombyx_exons.fas",
    "grefs/Dp_genome_v2.fasta",
)

# Parse the blast table and write the result to grefs/Danaus_exons.fasta.
BLAST.blastParser(
    "grefs/Bombyx_exons_blastn_out.csv",
    "grefs/Dp_genome_v2.fasta",
    "grefs/Danaus_exons.fasta",
    sp_name="Danaus",
)
#!/usr/bin/env python


from pyphylogenomics import BLAST
import sys


# Command-line entry: pass the four positional arguments on to
# BLAST.blastParser. Fail with a usage message (non-zero exit status) instead
# of an IndexError traceback when an argument is missing.
if len(sys.argv) < 5:
    sys.exit(
        "usage: %s <blast_output> <model_genome> <output_file> <species_name>"
        % sys.argv[0]
    )

blast_output = sys.argv[1].strip()
model_genome = sys.argv[2].strip()
output_file = sys.argv[3].strip()
species_name = sys.argv[4].strip()

BLAST.blastParser(blast_output, model_genome, output_file, species_name)