示例#1
0
## script to strip trailing tails from genome file names

import re
from sys import argv
from libs.common import from_dir, ensure_dir
from shutil import copyfile

origin_dir = "data/" + argv[1] + "/"
destin_dir = origin_dir + argv[2] + "/"
file_ext = argv[3]
tail = argv[4]

ensure_dir([destin_dir])

filenames = from_dir(origin_dir, re.compile(r'.*\.' + file_ext))

counter = 0

for filename in filenames:
    # identify strain name
    pattern = re.compile(r'^(.*)' + tail + '\.' + file_ext + '$')
    capture = re.match(pattern, filename)
    # substitute new name
    if capture:
        counter += 1
        new_filename = capture.group(1) + ".fas"
        # copy file
        copyfile(origin_dir + filename, destin_dir + new_filename)
        print capture.group(1)
示例#2
0
# script to generate a genome set file for bb_mapper from dir contents
# NOTE(review): truncated fragment -- the body of the final
# `except Exception:` and the rest of the loop lie outside this view.

import re
from sys import argv
from libs.common import from_dir, load_fasta, load_multifasta, load_genbank

# argv: data subdir, sequence subdir, output set name, minimum size
data_dir = "data/"+argv[1]+"/"
seq_dir = data_dir+argv[2]+"/"
py_out = data_dir+argv[3]+"_set.py"
min_size = argv[4]  # kept as a string; presumably compared later -- TODO confirm

# accumulates the python source lines of the generated *_set.py file
set_lines = ["all = ["]

# every file in seq_dir that has an extension
filenames = from_dir(seq_dir, re.compile(r'.*\..*'))

counter = 1

for filename in filenames:

    print filename,

    # single-pass block: `break` below abandons the current file
    while True:

        if filename.find(".gbk") > 0:
            # process genbank
            try:
                record = load_genbank(seq_dir+filename)
            except IOError:
                print "failed to load Genbank file"
                break
            except Exception:
示例#3
0
## script to strip trailing tails from genome file names

import re
from sys import argv
from libs.common import from_dir, ensure_dir
from shutil import copyfile

origin_dir = "data/"+argv[1]+"/"
destin_dir = origin_dir+argv[2]+"/"
file_ext = argv[3]
tail = argv[4]

ensure_dir([destin_dir])

filenames = from_dir(origin_dir, re.compile(r'.*\.'+file_ext))

counter = 0

for filename in filenames:
    # identify strain name
    pattern = re.compile(r'^(.*)'+tail+'\.'+file_ext+'$')
    capture = re.match(pattern, filename)
    # substitute new name
    if capture:
        counter +=1
        new_filename = capture.group(1)+".fas"
        # copy file
        copyfile(origin_dir+filename, destin_dir+new_filename)
        print capture.group(1)

示例#4
0
## script to compile basic stats about sets of contigs
# NOTE(review): truncated fragment -- the loop body continues (and the
# plotting that matplotlib/numpy are imported for presumably happens)
# past this view.

import re
from sys import argv
from libs.common import load_multifasta, from_dir
import matplotlib.pyplot as plt
import numpy as np

data_dir = "data/" + argv[1]

# every fasta-like file (.fas*) in the data directory
filenames = from_dir(data_dir, re.compile(r'.*\.fas.*'))

ctg_ns = []  # per-file contig counts, capped at 200
n50s = []    # presumably filled with N50 values further down -- TODO confirm

for filename in filenames:
    # load contigs from file
    contig_list = load_multifasta(data_dir + "/" + filename)
    # count contigs, clamping at 200 (presumably to bound a plot axis)
    ctg_count = len(contig_list)
    if ctg_count < 200:
        ctg_ns.append(ctg_count)
    else:
        ctg_ns.append(200)

    # sort contig list by size, longest first
    contig_list.sort(key=len)
    contig_list.reverse()

    # count full sequence length
    full_seq_length = 0
示例#5
0
infile = data_dir+argv[3] # must be a fasta file with query sequences
file_ext = argv[4]
blast_mode = argv[5]

if len(argv) > 5:
    blast_mode = argv[5]
else:
    blast_mode = 'n' # nucleotide blast by default

blast_out = data_dir+"blast_out/"

ensure_dir([blast_out])

queries = load_multifasta(infile)

filenames = from_dir(dir_in, re.compile(r'.*\.'+file_ext))

for filename in filenames:

    rec_name = filename[:filename.find("."+file_ext)]
    print rec_name,

    genome_path = dir_in+filename
    dbfile_path = "data/blast_db/"+rec_name

    while True:
        if not path.exists(dbfile_path+".nhr"):
            if file_ext == 'gbk':
                try:
                    print "converting,",
                    record = load_genbank(genome_path)
示例#6
0
## script to rename and copy sets of files

import re
from sys import argv
from libs.common import from_dir, ensure_dir
from shutil import copyfile

origin_dir = "data/"+argv[1]
destin_dir = "data/"+argv[2]+"/"
prefix = argv[3]
postfix = argv[4]
#sub_base = argv[5]

ensure_dir([destin_dir])

filenames = from_dir(origin_dir, re.compile(r'.*\.fas.*'))

counter = 0

for filename in filenames:
    # identify strain name
    pattern = re.compile(r'^'+prefix+'(.*)'+postfix+'$')
    capture = re.match(pattern, filename)
    # substitute new name
    if capture:
        counter +=1
        #new_filename = sub_base+"_"+str(counter)+".fas"
        new_filename = capture.group(1)+".fas"
        # copy file
        copyfile(origin_dir+"/"+filename, destin_dir+new_filename)
        print capture.group(1), str(counter)
示例#7
0
infile = data_dir + argv[3]  # must be a fasta file with query sequences
file_ext = argv[4]
blast_mode = argv[5]

if len(argv) > 5:
    blast_mode = argv[5]
else:
    blast_mode = 'n'  # nucleotide blast by default

blast_out = data_dir + "blast_out/"

ensure_dir([blast_out])

queries = load_multifasta(infile)

filenames = from_dir(dir_in, re.compile(r'.*\.' + file_ext))

for filename in filenames:

    rec_name = filename[:filename.find("." + file_ext)]
    print rec_name,

    genome_path = dir_in + filename
    dbfile_path = "data/blast_db/" + rec_name

    while True:
        if not path.exists(dbfile_path + ".nhr"):
            if file_ext == 'gbk':
                try:
                    print "converting,",
                    record = load_genbank(genome_path)
示例#8
0
import re
from os import path
from sys import argv
from Bio.SeqRecord import SeqRecord
from libs.common import from_dir, read_array, blast_dtypes, load_fasta, write_fasta

# NOTE(review): truncated fragment -- per-query output writing and the
# close of `main_out` (opened without `with`) are outside this view.

data_dir = "data/" + argv[1] + "/"
blast_out_dir = "data/" + argv[1] + "/blast_out/"
idp = int(argv[2])  # threshold; presumably percent identity -- TODO confirm

main_out = open(data_dir + "comp_results.txt", 'w')

records_dict = {}

# list files in blast results directory
filenames = from_dir(blast_out_dir, re.compile(r'.*\.txt.*'))
for filename in filenames:
    counter = 0
    # load text into a typed array using the project's blast dtypes
    rec_array = read_array(blast_out_dir + filename, blast_dtypes)
    # parse lines
    for line in rec_array:
        # keep only hits whose column-2 value exceeds the idp threshold
        if line[2] > idp:
            query = line[0]
            subject = line[1]
            # write line to compiled results file
            main_out.write("\t".join([str(item) for item in line]) + "\n")
            outfile = data_dir + query + "_results.txt"
            if not path.exists(outfile):
                # create file
示例#9
0
# script to generate a genome set file for bb_mapper from dir contents
# NOTE(review): truncated fragment -- the `except Exception:` handler body
# and the remainder of the loop are not visible here.

import re
from sys import argv
from libs.common import from_dir, load_fasta, load_multifasta, load_genbank

# argv: data subdir, sequence subdir, output set name, minimum size
data_dir = "data/" + argv[1] + "/"
seq_dir = data_dir + argv[2] + "/"
py_out = data_dir + argv[3] + "_set.py"
min_size = argv[4]  # string as read from argv; conversion not visible -- TODO confirm

# source lines of the generated *_set.py file, opened with a list literal
set_lines = ["all = ["]

# every file in seq_dir that has an extension
filenames = from_dir(seq_dir, re.compile(r'.*\..*'))

counter = 1

for filename in filenames:

    print filename,

    # single-pass block: `break` below skips the rest of this file
    while True:

        if filename.find(".gbk") > 0:
            # process genbank
            try:
                record = load_genbank(seq_dir + filename)
            except IOError:
                print "failed to load Genbank file"
                break
            except Exception:
示例#10
0
import re
from sys import argv
from libs.common import load_genbank, write_fasta, ensure_dir, from_dir

# NOTE(review): truncated fragment -- the sequence extraction and the
# `except` matching the `try` below are outside this view.

# argv: data subdir, genbank subdir, feature type, qualifier key,
#       qualifier value to search for
data_dir = "data/"+argv[1]+"/"
dir_in = "data/"+argv[2]+"/"
feat_type = argv[3]
feat_tag = argv[4]
feat_name = argv[5]

# destination multifasta for all matching feature sequences
main_out = data_dir+feat_name+"_seqs.fas"

records = []
ensure_dir([data_dir])

filenames = from_dir(dir_in, re.compile(r'.*\.gbk'))

for filename in filenames:
    # record name = filename with the .gbk extension stripped
    rec_name = filename[:filename.find(".gbk")]
    print '.',

    # load data
    record = load_genbank(dir_in+"/"+filename)

    # scan annotations for features of the requested type
    for feat in record.features:
        if feat.type == feat_type:
            try:
                # qualifiers map tags to lists of values, hence `in`
                if feat_name in feat.qualifiers[feat_tag]:
                    print '\nfound', feat_name, 'in', rec_name
                    # extract sequence
示例#11
0
import re
from sys import argv
from libs.common import load_genbank, write_fasta, ensure_dir, from_dir

# NOTE(review): truncated fragment -- extraction logic and the `except`
# for the `try` below continue past this view.

# argv: data subdir, genbank subdir, feature type, qualifier key,
#       qualifier value to search for
data_dir = "data/" + argv[1] + "/"
dir_in = "data/" + argv[2] + "/"
feat_type = argv[3]
feat_tag = argv[4]
feat_name = argv[5]

# destination multifasta for all matching feature sequences
main_out = data_dir + feat_name + "_seqs.fas"

records = []
ensure_dir([data_dir])

filenames = from_dir(dir_in, re.compile(r'.*\.gbk'))

for filename in filenames:
    # record name = filename with the .gbk extension stripped
    rec_name = filename[:filename.find(".gbk")]
    print '.',

    # load data
    record = load_genbank(dir_in + "/" + filename)

    # scan annotations for features of the requested type
    for feat in record.features:
        if feat.type == feat_type:
            try:
                # qualifiers map tags to lists of values, hence `in`
                if feat_name in feat.qualifiers[feat_tag]:
                    print '\nfound', feat_name, 'in', rec_name
                    # extract sequence
示例#12
0
## script to rename and copy sets of files

import re
from sys import argv
from libs.common import from_dir, ensure_dir
from shutil import copyfile

origin_dir = "data/" + argv[1]
destin_dir = "data/" + argv[2] + "/"
prefix = argv[3]
postfix = argv[4]
#sub_base = argv[5]

ensure_dir([destin_dir])

filenames = from_dir(origin_dir, re.compile(r'.*\.fas.*'))

counter = 0

for filename in filenames:
    # identify strain name
    pattern = re.compile(r'^' + prefix + '(.*)' + postfix + '$')
    capture = re.match(pattern, filename)
    # substitute new name
    if capture:
        counter += 1
        #new_filename = sub_base+"_"+str(counter)+".fas"
        new_filename = capture.group(1) + ".fas"
        # copy file
        copyfile(origin_dir + "/" + filename, destin_dir + new_filename)
        print capture.group(1), str(counter)
示例#13
0
## script to compile basic stats about sets of contigs
# NOTE(review): truncated fragment -- the per-file loop continues (and the
# matplotlib/numpy usage presumably appears) beyond this view.

import re
from sys import argv
from libs.common import load_multifasta, from_dir
import matplotlib.pyplot as plt
import numpy as np

data_dir = "data/"+argv[1]

# every fasta-like file (.fas*) in the data directory
filenames = from_dir(data_dir, re.compile(r'.*\.fas.*'))

ctg_ns = []  # per-file contig counts, capped at 200
n50s = []    # presumably N50 values appended later -- TODO confirm

for filename in filenames:
    # load contigs from file
    contig_list = load_multifasta(data_dir+"/"+filename)
    # count contigs, clamping at 200 (presumably to bound a plot axis)
    ctg_count = len(contig_list)
    if ctg_count < 200:
        ctg_ns.append(ctg_count)
    else:
        ctg_ns.append(200)

    # sort contig list by size, longest first
    contig_list.sort(key=len)
    contig_list.reverse()

    # count full sequence length
    full_seq_length = 0
示例#14
0
import re
from os import path
from sys import argv
from Bio.SeqRecord import SeqRecord
from libs.common import from_dir, read_array, blast_dtypes, load_fasta, write_fasta

# NOTE(review): truncated fragment -- the per-query file creation and the
# close of `main_out` (opened without `with`) lie outside this view.

data_dir = "data/"+argv[1]+"/"
blast_out_dir = "data/"+argv[1]+"/blast_out/"
idp = int(argv[2])  # threshold; presumably percent identity -- TODO confirm

main_out = open(data_dir+"comp_results.txt", 'w')

records_dict = {}

# list files in blast results directory
filenames = from_dir(blast_out_dir, re.compile(r'.*\.txt.*'))
for filename in filenames:
    counter = 0
    # load text into a typed array using the project's blast dtypes
    rec_array = read_array(blast_out_dir+filename, blast_dtypes)
    # parse lines
    for line in rec_array:
        # keep only hits whose column-2 value exceeds the idp threshold
        if line[2] > idp:
            query = line[0]
            subject = line[1]
            # write line to compiled results file
            main_out.write("\t".join([str(item) for item in line])+"\n")
            outfile = data_dir+query+"_results.txt"
            if not path.exists(outfile):
                # create file
示例#15
0
# NOTE(review): truncated fragment -- argv parsing above (origin_dir,
# seq_dir, file_ext, fas2gbk, gbk2fas, ...) and the rest of the per-file
# loop are outside this view.

# optional 5th argument: id tail to trim from filenames (defaults to '')
if len(argv) < 5:
    trim_ids = ''
else:
    trim_ids = argv[4]

blast_dir = origin_dir+"blast/"
hits_dir = origin_dir+"hits/"
remote_prot_db = "nr"  # presumably the NCBI nr protein db -- TODO confirm

annot_gbk_dir = origin_dir+"annot_gbk/"
annot_aa_dir = origin_dir+"annot_aa/"
trn_file = origin_dir+"prodigal.trn"  # presumably a Prodigal training file

ensure_dir([annot_gbk_dir, annot_aa_dir, blast_dir, hits_dir])

filenames = from_dir(seq_dir, re.compile(r'.*\.'+file_ext+'.*'))

for filename in filenames:
    # record name = filename minus the trim tail and extension
    # NOTE(review): if find() misses, this silently yields filename[:-1]
    rec_name = filename[:filename.find(trim_ids+"."+file_ext)]

    print rec_name, "..."

    # load data, converting so both a fasta and a genbank version exist
    if file_ext == 'fas':
        fas_file = seq_dir+"/"+filename
        gbk_file = fas2gbk(fas_file)
        record = load_genbank(gbk_file)
    else:
        gbk_file = seq_dir+"/"+filename
        fas_file = gbk2fas(gbk_file)
        record = load_genbank(gbk_file)
示例#16
0
# NOTE(review): truncated fragment -- the definitions of origin_dir,
# seq_dir, file_ext, fas2gbk, gbk2fas and the rest of the loop are not
# visible in this view.

# optional 5th argument: id tail to trim from filenames (defaults to '')
if len(argv) < 5:
    trim_ids = ''
else:
    trim_ids = argv[4]

blast_dir = origin_dir + "blast/"
hits_dir = origin_dir + "hits/"
remote_prot_db = "nr"  # presumably the NCBI nr protein db -- TODO confirm

annot_gbk_dir = origin_dir + "annot_gbk/"
annot_aa_dir = origin_dir + "annot_aa/"
trn_file = origin_dir + "prodigal.trn"  # presumably a Prodigal training file

ensure_dir([annot_gbk_dir, annot_aa_dir, blast_dir, hits_dir])

filenames = from_dir(seq_dir, re.compile(r'.*\.' + file_ext + '.*'))

for filename in filenames:
    # record name = filename minus the trim tail and extension
    # NOTE(review): if find() misses, this silently yields filename[:-1]
    rec_name = filename[:filename.find(trim_ids + "." + file_ext)]

    print rec_name, "..."

    # load data, converting so both a fasta and a genbank version exist
    if file_ext == 'fas':
        fas_file = seq_dir + "/" + filename
        gbk_file = fas2gbk(fas_file)
        record = load_genbank(gbk_file)
    else:
        gbk_file = seq_dir + "/" + filename
        fas_file = gbk2fas(gbk_file)
        record = load_genbank(gbk_file)