Python BLAST示例，pyphylogenomics.BLAST Python示例

示例#1

0

显示文件

文件： test_BLAST.py 项目： carlosp420/PyPhyloGenomics

 def test_wellSeparatedExons(self):
     """wellSeparatedExons should leave 3 exons from the sample BLAST table."""
     blast_table = os.path.join(self.cwd, "BLAST", "query_blastn_output3.csv")
     # Candidate exons are first selected with the same thresholds used elsewhere
     # in this suite, then passed through the separation filter.
     candidates = BLAST.getLargestExon(blast_table, E_value=0.001, ident=98, exon_len=300)
     separated = BLAST.wellSeparatedExons(candidates)
     self.assertEqual(len(separated), 3)

示例#2

0

显示文件

文件： test_BLAST.py 项目： carlosp420/PyPhyloGenomics

    def test_do_blast(self):
        """Run a hand-built blastn command via BLAST.do_blast and check its output.

        The first line of the CSV written by the command must mention
        "BGIBMGA000001". An empty output file now fails the test instead of
        silently skipping the assertion, and the generated database files and
        CSV are removed even when the test fails.
        """
        blast_dir = os.path.join(self.cwd, "BLAST")
        f = os.path.join(blast_dir, "query.fas")
        genome = os.path.join(blast_dir, "silkcds.fa")
        out_csv = f + "_out.csv"

        command = (
            "blastn -query "
            + f
            + " -db "
            + genome
            + " -task blastn "
            + "-evalue 0.0001 "
            + " -out "
            + f
            + "_out.csv"
            + " -num_threads 1 "
            + " -outfmt 10"
        )
        try:
            BLAST.makeblastdb(genome)
            BLAST.do_blast(command)
            # Only the first line is inspected; readline() returns "" for an
            # empty file, which makes the assertion below fail explicitly.
            with open(out_csv, "r") as output:
                first_line = output.readline()
            self.assertTrue("BGIBMGA000001" in first_line)
        finally:
            # Remove files whose names start with "silkcds.fa" but are longer,
            # i.e. everything except the genome FASTA itself.
            for name in os.listdir(blast_dir):
                if name[:10] == "silkcds.fa" and len(name) > 10:
                    os.remove(os.path.join(blast_dir, name))
            # Guarded so that a missing output file does not mask the real error.
            if os.path.exists(out_csv):
                os.remove(out_csv)

示例#3

0

显示文件

文件： test_BLAST.py 项目： carlosp420/PyPhyloGenomics

 def test_get_cds(self):
     """Extracting genes and saving them as fasta file.

     BLAST.get_cds is expected to leave "pulled_seqs.fasta" in the current
     working directory; the file must be exactly 54397 characters long. The
     file is removed afterwards whether or not the assertion passes.
     """
     pulled = "pulled_seqs.fasta"
     try:
         BLAST.get_cds(self.genes, self.genome)
         # Context manager closes the handle even if read() raises.
         with open(pulled, "r") as handle:
             result = len(handle.read())
         self.assertEqual(result, 54397)
     finally:
         # Guarded so that a missing file does not mask the original failure.
         if os.path.exists(pulled):
             os.remove(pulled)

示例#4

0

显示文件

文件： test_BLAST.py 项目： carlosp420/PyPhyloGenomics

 def test_makeblastdb_false(self):
     """makeblastdb with mask=False should create database files next to the genome.

     The test passes when at least one file named "silkcds.fa<suffix>" exists
     in the BLAST fixture directory after the call; those files are deleted
     as they are found.
     NOTE(review): assumes self.genome points at <cwd>/BLAST/silkcds.fa -- confirm in setUp.
     """
     mask = False
     BLAST.makeblastdb(self.genome, mask)
     blast_dir = os.path.join(self.cwd, "BLAST")
     # Start from a definite value so that "no files created" is reported as
     # an assertion failure rather than a NameError on an unbound variable.
     result = "false"
     for name in os.listdir(blast_dir):
         # Longer-than-prefix check keeps the genome FASTA itself untouched.
         if name[:10] == "silkcds.fa" and len(name) > 10:
             os.remove(os.path.join(blast_dir, name))
             result = "true"
     self.assertEqual(result, "true")

示例#5

0

显示文件

文件： test_BLAST.py 项目： carlosp420/PyPhyloGenomics

 def test_blastn(self):
     """BLAST.blastn should report BGIBMGA000001-TA as the subject of the first hit.

     The second comma-separated field of the first line of
     query_blastn_out.csv is compared with the expected ID. Generated
     "silkcds.fa*" database files are removed even if the run fails.
     """
     blast_dir = os.path.join(self.cwd, "BLAST")
     # None means "no output line was read"; the final assertion then fails
     # cleanly instead of raising NameError.
     result = None
     try:
         BLAST.blastn(self.cwd + "/BLAST/query.fas", self.cwd + "/BLAST/silkcds.fa")
         with open(os.path.join(blast_dir, "query_blastn_out.csv"), "r") as handle:
             first_line = handle.readline()
         if first_line:
             result = first_line.split(",")[1]
     finally:
         for name in os.listdir(blast_dir):
             # Skip the genome FASTA itself; only derived files are longer.
             if name[:10] == "silkcds.fa" and len(name) > 10:
                 os.remove(os.path.join(blast_dir, name))
     # The CSV is deliberately left in place, exactly as the original test did.
     self.assertEqual(result, "BGIBMGA000001-TA")

示例#6

0

显示文件

文件： test_BLAST.py 项目： carlosp420/PyPhyloGenomics

    def test_filterByMinDist(self):
        """filterByMinDist should return the names of genes closer than the minimum distance."""
        min_dist = 810000
        gene1 = ("gene1", 1, 350)
        gene2 = ("gene2", 360, 670)
        gene3 = ("gene3", 821001, 821351)

        scenarios = [
            # The gene2 is too close to other genes
            ([gene1, gene2, gene3], ["gene2"]),
            # The function is not affected by order of genes
            ([gene3, gene1, gene2], ["gene2"]),
            # These genes are well separated already
            ([gene1, gene3], []),
        ]
        for genes_loci, expected in scenarios:
            self.assertEqual(expected, BLAST.filterByMinDist(genes_loci, min_dist))

示例#7

0

显示文件

文件： test_BLAST.py 项目： carlosp420/PyPhyloGenomics

    def test_storeExonsInFrame(self):
        """Every record written by storeExonsInFrame should translate to "MRRVVWFALV..."."""
        # These seqs are in frame
        # NOTE(review): rows look like parsed BLAST table lines plus one
        # trailing value -- confirm the column meaning against BLAST.getLargestExon.
        first_hit = ["BGIBMGA000001-TA", "nscaf1070", "100.00", 450, 0, 0, 1, 450, 1, 450, "0.0", " 812", 1]
        second_hit = ["BGIBMGA000002-TA", "nscaf1071", "100.00", 350, 0, 0, 1, 350, 1, 350, "0.0", " 632", 1]
        exons_dict = {
            ("BGIBMGA000001-TA", "nscaf1070"): first_hit,
            ("BGIBMGA000002-TA", "nscaf1071"): second_hit,
        }

        fixture_dir = self.cwd + "/BLAST/"
        queries_db = fixture_dir + "queries_db1.fas"
        out_file = fixture_dir + "outfile_storeExonsInFrame.csv"

        BLAST.storeExonsInFrame(exons_dict, queries_db, out_file)
        for record in SeqIO.parse(out_file, "fasta"):
            # Only the first ten translated residues are compared.
            peptide_start = record.seq.translate()[0:10]
            self.assertEqual("MRRVVWFALV", peptide_start)
        os.remove(out_file)

示例#8

0

显示文件

文件： test_BLAST.py 项目： carlosp420/PyPhyloGenomics

    def test_storeExonsInFrame_not_in_frame3(self):
        """Every record written by storeExonsInFrame should translate to "RVVWFALVRL..."."""
        # Length of seq is not multiple of 3
        # NOTE(review): rows look like parsed BLAST table lines plus one
        # trailing value -- confirm the column meaning against BLAST.getLargestExon.
        first_hit = ["BGIBMGA000001-TA", "nscaf1070", "100.00", 449, 0, 0, 2, 451, 2, 451, "0.0", " 812", 1]
        second_hit = ["BGIBMGA000002-TA", "nscaf1071", "100.00", 449, 0, 0, 2, 451, 2, 451, "0.0", " 832", 1]
        exons_dict = {
            ("BGIBMGA000001-TA", "nscaf1070"): first_hit,
            ("BGIBMGA000002-TA", "nscaf1071"): second_hit,
        }

        fixture_dir = self.cwd + "/BLAST/"
        queries_db = fixture_dir + "queries_db4.fas"
        out_file = fixture_dir + "outfile_storeExonsInFrame.csv"

        BLAST.storeExonsInFrame(exons_dict, queries_db, out_file)
        for record in SeqIO.parse(out_file, "fasta"):
            # Only the first ten translated residues are compared.
            peptide_start = record.seq.translate()[0:10]
            self.assertEqual("RVVWFALVRL", peptide_start)
        os.remove(out_file)

示例#9

0

显示文件

文件： test_BLAST.py 项目： carlosp420/PyPhyloGenomics

    def test_blastn_big_query_file(self):
        """BLAST.blastn on the large (gzipped) query fixture should hit BGIBMGA000001-TA first.

        The fixture query_big.fas.gz is decompressed in place, BLASTed against
        silkcds.fa, and the second comma-separated field of the first output
        line is checked. The fixture is always re-compressed and generated
        files removed, so a failing run does not break later runs.

        Raises:
            subprocess.CalledProcessError: if gunzip or gzip exits non-zero.
        """
        blast_dir = os.path.join(self.cwd, "BLAST")
        query_file = os.path.join(blast_dir, "query_big.fas.gz")
        gunzipped_query_file = os.path.join(blast_dir, "query_big.fas")
        blast_out = os.path.join(blast_dir, "query_big_blastn_out.csv")

        # check_call raises on a non-zero exit status, so no return-code test
        # is needed. Argument lists avoid shell quoting problems with paths.
        subprocess.check_call(["gunzip", query_file])

        # None means "no output line was read"; the assertion then fails cleanly.
        result = None
        try:
            BLAST.blastn(gunzipped_query_file, self.cwd + "/BLAST/silkcds.fa")
            with open(blast_out, "r") as handle:
                first_line = handle.readline()
            if first_line:
                result = first_line.split(",")[1]
        finally:
            for name in os.listdir(blast_dir):
                # Skip the genome FASTA itself; only derived files are longer.
                if name[:10] == "silkcds.fa" and len(name) > 10:
                    os.remove(os.path.join(blast_dir, name))
            if os.path.exists(blast_out):
                os.remove(blast_out)
            # Restore the compressed fixture for subsequent runs.
            subprocess.check_call(["gzip", gunzipped_query_file])

        self.assertEqual(result, "BGIBMGA000001-TA")

示例#10

0

显示文件

文件： test_BLAST.py 项目： carlosp420/PyPhyloGenomics

    def test_blastParser(self):
        """blastParser should write two FASTA records for each sample BLAST table."""
        fixture_dir = self.cwd + "/BLAST/"
        query = fixture_dir + "queries_db1.fas"
        sbj = fixture_dir + "silkcds.fa"
        out = fixture_dir + "output.txt"

        # The second table is the "with header row" variant.
        table_names = ("queries_db1_blastn_out.csv", "queries_db2_blastn_out.csv")
        for table_name in table_names:
            blast_table = fixture_dir + table_name
            # blastn is re-run before each parse, exactly as in the original flow.
            BLAST.blastn(query, sbj)
            BLAST.blastParser(blast_table, sbj, out)
            record_ids = [record.id for record in SeqIO.parse(out, "fasta")]
            self.assertEqual(2, len(record_ids))
            os.remove(out)

示例#11

0

显示文件

文件： get_seqs_same_direction.py 项目： carlosp420/dy_genome

import sys
import glob
import os
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord



# Usage check: the script needs the path of a FASTA file as its only argument.
# The parenthesised print works as a statement on Python 2 and as a function
# call on Python 3.
if len(sys.argv) < 2:
    print("""This script takes as input a FASTA file and will put all sequences
            in same direction using blast results.""")
    sys.exit()

infile = sys.argv[1].strip()
# NOTE(review): no import of BLAST is visible in this excerpt -- confirm that
# `from pyphylogenomics import BLAST` is present in the full script.
# The file is BLASTed against itself.
BLAST.blastn(infile, infile)

# remove BLAST files
for db_file in glob.glob(infile + ".*"):
    os.remove(db_file)
if os.path.isfile(infile + "_dust.asnb"):
    os.remove(infile + "_dust.asnb")

# parse BLAST output file
# NOTE(review): the output name is derived by replacing ".fasta"; inputs with
# another extension would leave blast_file equal to infile -- confirm intent.
blast_file = infile.replace(".fasta", "_blastn_out.csv")
with open(blast_file, "r") as f:
    lines = f.readlines()

def reverse(id, infile):
    # reverse complement sequence of ID in FASTA file infile

示例#12

0

显示文件

文件： gene_search_blast1.py 项目： carlosp420/PyPhyloGenomics_ms

#!/usr/bin/env python

import os
from pyphylogenomics import BLAST


# Do a BLASTn of the sequences against the Bombyx mori genome. The input
# arguments are the file containing the sequences for single-copy genes
# (data/pulled_seqs.fasta) and the file with the genome of Bombyx mori in
# FASTA format (data/silkgenome.fa).
BLAST.blastn('data/pulled_seqs.fasta', 'data/silkgenome.fa')

示例#13

0

显示文件

文件： blast_against_illumina_reads.py 项目： carlosp420/transcriptome_analyser

from pyphylogenomics import BLAST
import sys

# Fail with a usage message (exit status 1, written to stderr) instead of an
# IndexError traceback when either positional argument is missing.
if len(sys.argv) < 3:
    sys.exit("usage: %s QUERY_SEQS GENOME" % sys.argv[0])

# First argument: query sequences file; second argument: genome file.
query_seqs = sys.argv[1].strip()
genome = sys.argv[2].strip()
BLAST.blastn(query_seqs, genome)

示例#14

0

显示文件

文件： start_gene_search.py 项目： carlosp420/PyPhyloGenomics_ms

#!/usr/bin/env python

import os
from pyphylogenomics import OrthoDB
from pyphylogenomics import BLAST


# We will find all single-copy genes for the silk moth Bombyx mori using the
# table from OrthoDB as input file:
in_file = 'data/OrthoDB6_Arthropoda_tabtext.csv'
genes = OrthoDB.single_copy_genes(in_file, 'Bombyx mori')


# Pull all sequences for our gene IDs from the CDS file. BLAST.get_cds leaves
# its result in ./pulled_seqs.fasta, which is then moved into data/.
cds_file = "data/silkcds.fa"

# Skip the extraction when a previous run already produced the file.
if not os.path.exists("data/pulled_seqs.fasta"):
    BLAST.get_cds(genes, cds_file)
    os.rename("pulled_seqs.fasta", "data/pulled_seqs.fasta")
    # Report only after the move succeeded, and name the real destination.
    print("File moved to data/pulled_seqs.fasta")

示例#15

0

显示文件

文件： test_BLAST.py 项目： carlosp420/PyPhyloGenomics

 def test_getLargestExon_output_has_headers(self):
     """getLargestExon should return 38 exons for query_blastn_output2.csv.

     NOTE(review): per the test name this table presumably starts with a
     header row -- confirm against the fixture.
     """
     blast_table = self.cwd + "/BLAST/query_blastn_output2.csv"
     found = BLAST.getLargestExon(blast_table, E_value=0.001, ident=98, exon_len=300)
     self.assertEqual(len(found), 38)

示例#16

0

显示文件

文件： test_BLAST.py 项目： carlosp420/PyPhyloGenomics

 def test_eraseFalsePosi(self):
     """eraseFalsePosi should leave 3 exons from query_blastn_out.csv."""
     blast_table = self.cwd + "/BLAST/query_blastn_out.csv"
     candidates = BLAST.getLargestExon(blast_table, E_value=0.001, ident=98, exon_len=300)
     kept = BLAST.eraseFalsePosi(candidates)
     self.assertEqual(len(kept), 3)

示例#17

0

显示文件

文件： gene_search_blast_filtering_exons.py 项目： carlosp420/PyPhyloGenomics_ms

#!/usr/bin/env python

import os
from pyphylogenomics import OrthoDB
from pyphylogenomics import BLAST
from pyphylogenomics import MUSCLE


# As stated before, we prefer long exons for each of the candidate genes
# (> 300 nucleotides):
blast_table = "data/pulled_seqs_blastn_out.csv"
exons = BLAST.getLargestExon(blast_table, E_value=0.001, ident=98, exon_len=300)


# Some small segments of sequences might be similar to non-homologous regions
# of the genome. We will use the function eraseFalsePosi to keep those matches
# of longest length (drop presumable false positives):
exons = BLAST.eraseFalsePosi(exons)


# Ideally we want exons that are not too close to each other in the genome to
# avoid gene linkage. So we will keep only those exons that are apart by
# > 810 kilobases:
exons = BLAST.wellSeparatedExons(exons)

示例#18

0

显示文件

文件： search_genes_from_Bmori.py 项目： carlosp420/dy_genome

from pyphylogenomics import OrthoDB
from pyphylogenomics import BLAST

# Input tables and sequence files.
in_file = 'grefs/OrthoDB7_Arthropoda_tabtext'
cds_file = 'grefs/silkcds.fa'

# Single-copy genes of Bombyx mori according to the OrthoDB table.
genes = OrthoDB.single_copy_genes(in_file, 'Bombyx mori')

# Extract their sequences, then BLAST the extracted file against the genome.
BLAST.get_cds(genes, cds_file)
BLAST.blastn('pulled_seqs.fasta', 'grefs/silkgenome.fa')

# Select exons from the BLAST table, drop presumable false positives, and
# store the remaining exons.
exons = BLAST.eraseFalsePosi(
    BLAST.getLargestExon("pulled_seqs_blastn_out.csv", E_value=0.001, ident=98, exon_len=300)
)
BLAST.storeExonsInFrame(exons, "pulled_seqs.fasta", "grefs/Bombyx_exons.fas")

示例#19

0

显示文件

文件： get_danaus_seqs.py 项目： carlosp420/dy_genome

#!/usr/bin/env python

from pyphylogenomics import BLAST

# BLAST the Bombyx exons against the Dp_genome_v2 FASTA, then parse the
# resulting table into a FASTA file labelled with the species name "Danaus".
danaus_genome = "grefs/Dp_genome_v2.fasta"

BLAST.blastn("grefs/Bombyx_exons.fas", danaus_genome)
BLAST.blastParser(
    "grefs/Bombyx_exons_blastn_out.csv",
    danaus_genome,
    "grefs/Danaus_exons.fasta",
    sp_name="Danaus",
)

示例#20

0

显示文件

文件： parse_blast_against_models.py 项目： carlosp420/PyPhyloGenomics_ms

#!/usr/bin/env python


from pyphylogenomics import BLAST
import sys


# Fail with a usage message (exit status 1, written to stderr) instead of an
# IndexError traceback when any of the four positional arguments is missing.
if len(sys.argv) < 5:
    sys.exit("usage: %s BLAST_OUTPUT MODEL_GENOME OUTPUT_FILE SPECIES_NAME" % sys.argv[0])

blast_output = sys.argv[1].strip()
model_genome = sys.argv[2].strip()
output_file = sys.argv[3].strip()
species_name = sys.argv[4].strip()

# Arguments are forwarded positionally, in the order they were given.
BLAST.blastParser(blast_output, model_genome, output_file, species_name)