Пример #1
0
                print "failed to handle Genbank file"
                break
            else:
                print "...",
                seq_format = 'gbk'

        elif filename.find(".fas") > 0:
            # process fasta (for mfas, load first record)
            try:
                record = load_fasta(seq_dir+filename)
            except IOError:
                print "failed to load Fasta file as single-record file"
                break
            except Exception:
                try:
                    record = load_multifasta(seq_dir+filename)[0]
                except IOError:
                    print "failed to load Fasta file as multi-record file"
                    break
                except Exception:
                    print "failed to handle Fasta file"
                    break
            print "...",
            seq_format = 'fas'

        else:
            # reject as bad format
            print "invalid file format"
            break

        if len(record) < int(min_size):
Пример #2
0
# Translate every nucleotide sequence in a multifasta file into its
# protein sequence and write the results to a sibling "_aa.fas" file.
#
# Usage: script.py <dataset-dir> <multifasta-filename>

from sys import argv
from libs.common import load_multifasta, write_fasta
from Bio.SeqRecord import SeqRecord

origin_dir = "data/" + argv[1] + "/"
in_file = origin_dir + argv[2]
# same basename with the 4-character extension swapped for "_aa.fas"
outfile = in_file[:-4] + "_aa.fas"

# translate each record, keeping its original identifier
proteins = [SeqRecord(id=record.id, seq=record.seq.translate())
            for record in load_multifasta(in_file)]

write_fasta(outfile, proteins)
Пример #3
0
data_dir = "data/"+argv[1]+"/"
dir_in = data_dir+argv[2]+"/"
infile = data_dir+argv[3] # must be a fasta file with query sequences
file_ext = argv[4]
blast_mode = argv[5]

if len(argv) > 5:
    blast_mode = argv[5]
else:
    blast_mode = 'n' # nucleotide blast by default

blast_out = data_dir+"blast_out/"

ensure_dir([blast_out])

queries = load_multifasta(infile)

filenames = from_dir(dir_in, re.compile(r'.*\.'+file_ext))

for filename in filenames:

    # strip the extension to get the genome/record name
    rec_name = filename[:filename.find("."+file_ext)]
    print rec_name,

    genome_path = dir_in+filename
    dbfile_path = "data/blast_db/"+rec_name

    # NOTE(review): fragment -- the loop body continues beyond this
    # excerpt; the ".nhr" check suggests a BLAST nucleotide DB is built
    # when its index file is missing, but the rest is not visible.
    while True:
        if not path.exists(dbfile_path+".nhr"):
            if file_ext == 'gbk':
                try:
Пример #4
0
# script to rename contigs in multifasta files

# FIXME(review): unresolved Git merge conflict -- the closing ">>>>>>>"
# marker lies beyond this excerpt, so neither side can be safely chosen
# from here. The file will not parse until the conflict is resolved.
<<<<<<< HEAD
from genomes import all as genomes
from libs.common import load_multifasta, write_fasta

for genome in genomes:
    print genome['file']
    file_path = "data/genomes/"+genome['file']
    outfile_path = "data/renamed/"+genome['file']
    contigs = load_multifasta(file_path)
    renamed = []
    counter = 1
    for contig in contigs:
        # rename contig as <genome name>_<running index>
        contig.id = genome['name']+"_"+str(counter)
        contig_path = "data/contigs/"+contig.id+".fas"
        # each renamed contig also gets its own single-record file
        write_fasta(contig_path, contig)
        renamed.append(contig)
        counter +=1
    # write all renamed contigs back as one multifasta
    write_fasta(outfile_path, renamed)
=======
# (incoming branch: argv-driven variant; continues past this excerpt)
from sys import argv
from libs.common import load_multifasta, write_fasta, ensure_dir

from genomes import all as genome_list

origin_dir = "data/"+argv[1]+"/"
destin_dir = "data/"+argv[2]+"/"

ensure_dir([destin_dir])
Пример #5
0
data_dir = "data/" + argv[1] + "/"
dir_in = data_dir + argv[2] + "/"
infile = data_dir + argv[3]  # must be a fasta file with query sequences
file_ext = argv[4]
blast_mode = argv[5]

if len(argv) > 5:
    blast_mode = argv[5]
else:
    blast_mode = 'n'  # nucleotide blast by default

blast_out = data_dir + "blast_out/"

ensure_dir([blast_out])

queries = load_multifasta(infile)

filenames = from_dir(dir_in, re.compile(r'.*\.' + file_ext))

for filename in filenames:

    # strip the extension to get the genome/record name
    rec_name = filename[:filename.find("." + file_ext)]
    print rec_name,

    genome_path = dir_in + filename
    dbfile_path = "data/blast_db/" + rec_name

    # NOTE(review): fragment -- the loop body continues beyond this
    # excerpt; the ".nhr" check suggests a BLAST nucleotide DB is built
    # when its index file is missing, but the rest is not visible.
    while True:
        if not path.exists(dbfile_path + ".nhr"):
            if file_ext == 'gbk':
                try:
Пример #6
0
import re
from sys import argv
from libs.common import load_multifasta, from_dir
import matplotlib.pyplot as plt
import numpy as np

data_dir = "data/" + argv[1]

filenames = from_dir(data_dir, re.compile(r'.*\.fas.*'))

ctg_ns = []
n50s = []

for filename in filenames:
    # load contigs from file
    contig_list = load_multifasta(data_dir + "/" + filename)
    # count contigs
    ctg_count = len(contig_list)
    if ctg_count < 200:
        ctg_ns.append(ctg_count)
    else:
        ctg_ns.append(200)

    # sort contig list by size
    contig_list.sort(key=len)
    contig_list.reverse()

    # count full sequence length
    full_seq_length = 0
    for contig in contig_list:
        full_seq_length += len(contig.seq)
Пример #7
0
                print "failed to handle Genbank file"
                break
            else:
                print "...",
                seq_format = 'gbk'

        elif filename.find(".fas") > 0:
            # process fasta (for mfas, load first record)
            try:
                record = load_fasta(seq_dir + filename)
            except IOError:
                print "failed to load Fasta file as single-record file"
                break
            except Exception:
                try:
                    record = load_multifasta(seq_dir + filename)[0]
                except IOError:
                    print "failed to load Fasta file as multi-record file"
                    break
                except Exception:
                    print "failed to handle Fasta file"
                    break
            print "...",
            seq_format = 'fas'

        else:
            # reject as bad format
            print "invalid file format"
            break

        if len(record) < int(min_size):
Пример #8
0
# Script: translate all records of a multifasta file into protein
# sequences, writing them next to the input as "<name>_aa.fas".
#
# Usage: script.py <dataset-dir> <multifasta-filename>

from sys import argv
from libs.common import load_multifasta, write_fasta
from Bio.SeqRecord import SeqRecord

origin_dir = "data/"+argv[1]+"/"
in_file = origin_dir+argv[2]
# same basename with the 4-character extension swapped for "_aa.fas"
outfile = in_file[:-4]+"_aa.fas"

proteins = []
for nuc_rec in load_multifasta(in_file):
    # keep the record id; translate nucleotides to amino acids
    proteins.append(SeqRecord(id=nuc_rec.id, seq=nuc_rec.seq.translate()))

write_fasta(outfile, proteins)
Пример #9
0
    else:
        # else-branch of a check outside this excerpt: input appears to
        # already be GenBank; convert to fasta for prodigal and load it
        gbk_file = origin_dir+"/"+filename
        fas_file = gbk2fas(gbk_file)
        record = load_genbank(gbk_file)

    # run prediction
    annot_aa = annot_aa_dir+rec_name+"_ann.fas"
    annot_gbk = annot_gbk_dir+rec_name+"_ann.gbk"
    # train/run prodigal only when the outputs are not already on disk
    if not path.exists(trn_file):
        train_prodigal(fas_file, trn_file, "-q")
    if not path.exists(annot_aa):
        run_prodigal(fas_file, annot_gbk, annot_aa, trn_file, "-q")

    # collect orfs
    record.features = []
    aa_record = load_multifasta(annot_aa)
    counter = 1
    for aa_rec in aa_record:
        this_prot = rec_name+"_"+str(counter)
        # get feature details from description line
        # because prodigal output fails to load as valid genbank
        defline = aa_rec.description
        # NOTE(review): pattern should be a raw string (r'...'); also, if
        # the defline does not match, `match` is None and the group()
        # calls below raise AttributeError -- confirm the prodigal
        # defline format is guaranteed upstream.
        pattern = re.compile('.+#\s(\d+)\s#\s(\d+)\s#\s(\S*1)\s#\sID.+')
        match = pattern.match(defline)
        start_pos = int(match.group(1))
        end_pos = int(match.group(2))
        strand_pos = int(match.group(3))
        feat_loc = FeatureLocation(start_pos, end_pos)
        l_tag = rec_name+"_"+str(counter)
        # consolidation feature annotations
        quals = {'note': defline, 'locus_tag': l_tag,
Пример #10
0
            # NOTE(review): fragment -- the enclosing conditional (likely
            # a genome['input'] == 'gbk' branch) starts before this excerpt.
            try:
                records = [load_genbank(origin_file)]
            except IOError:
                print "failed to load file"
                break

        elif genome['input'] == 'fas':
            # single-record fasta
            try:
                records = [load_fasta(origin_file)]
            except IOError:
                print "failed to load file"
                break

        elif genome['input'] == 'mfas':
            # multi-record fasta
            try:
                records = load_multifasta(origin_file)
            except IOError:
                print "failed to load file"
                break

        else:
            print "input not recognized"
            break

        # write each record out as its own contig fasta file
        for record in records:
            try:
                write_fasta(destin_dir+record.id+".fas", record)
            except Exception:
                # NOTE(review): this break leaves the remaining records of
                # this genome unwritten -- confirm that is intended
                print "failed to write contig file"
                break
            else:
Пример #11
0
import re
from sys import argv
from libs.common import load_multifasta, from_dir
import matplotlib.pyplot as plt
import numpy as np

data_dir = "data/"+argv[1]

filenames = from_dir(data_dir, re.compile(r'.*\.fas.*'))

ctg_ns = []
n50s = []

for filename in filenames:
    # load contigs from file
    contig_list = load_multifasta(data_dir+"/"+filename)
    # count contigs
    ctg_count = len(contig_list)
    if ctg_count < 200:
        ctg_ns.append(ctg_count)
    else:
        ctg_ns.append(200)

    # sort contig list by size
    contig_list.sort(key=len)
    contig_list.reverse()

    # count full sequence length
    full_seq_length = 0
    for contig in contig_list:
        full_seq_length += len(contig.seq)
Пример #12
0
        record = load_genbank(gbk_file)

    assert record.id

    # run prediction
    annot_aa = annot_aa_dir+rec_name+"_ann.fas"
    annot_gbk = annot_gbk_dir+rec_name+"_ann.gbk"
    # train/run prodigal only when the outputs are not already on disk
    if not path.exists(trn_file):
        train_prodigal(fas_file, trn_file, "-q")
    if not path.exists(annot_aa):
        run_prodigal(fas_file, annot_gbk, annot_aa, trn_file, "-q")

    # blast the protein sequences against the remote DB
    record.features = []
    evalue = 0.01
    proteins = load_multifasta(annot_aa)
    for protein in proteins:
        print "  ", protein.id
        rec_hits_dir = hits_dir+rec_name+"/"
        ensure_dir([rec_hits_dir])
        # NOTE(review): hits_out is never closed within this excerpt --
        # verify it is closed further down, or use a with-block
        hits_out = open(rec_hits_dir+protein.id+".txt", 'w')
        hits_out.write(" ".join([protein.id, "vs.", remote_prot_db,
                                 "@evalue =", str(evalue), "\n"]))
        temp_out = remote_blastp_2file(protein.seq, remote_prot_db,
                                       blast_dir+rec_name+"_temp.xml",
                                       evalue)
        #temp_out = blast_dir+rec_name+"_temp.xml"
        # collect best 10 hits
        rec_hits = collect_topNhits(temp_out, 10)
        for hit in rec_hits:
            if hasattr(hit, 'hsps'):
Пример #13
0
        record = load_genbank(gbk_file)

    assert record.id

    # run prediction
    annot_aa = annot_aa_dir + rec_name + "_ann.fas"
    annot_gbk = annot_gbk_dir + rec_name + "_ann.gbk"
    # train/run prodigal only when the outputs are not already on disk
    if not path.exists(trn_file):
        train_prodigal(fas_file, trn_file, "-q")
    if not path.exists(annot_aa):
        run_prodigal(fas_file, annot_gbk, annot_aa, trn_file, "-q")

    # blast the protein sequences against the remote DB
    record.features = []
    evalue = 0.01
    proteins = load_multifasta(annot_aa)
    for protein in proteins:
        print "  ", protein.id
        rec_hits_dir = hits_dir + rec_name + "/"
        ensure_dir([rec_hits_dir])
        # NOTE(review): hits_out is never closed within this excerpt --
        # verify it is closed further down, or use a with-block
        hits_out = open(rec_hits_dir + protein.id + ".txt", 'w')
        hits_out.write(" ".join([
            protein.id, "vs.", remote_prot_db, "@evalue =",
            str(evalue), "\n"
        ]))
        temp_out = remote_blastp_2file(protein.seq, remote_prot_db,
                                       blast_dir + rec_name + "_temp.xml",
                                       evalue)
        #temp_out = blast_dir+rec_name+"_temp.xml"
        # collect best 10 hits
        rec_hits = collect_topNhits(temp_out, 10)