## script to strip trailing tails from genome file names
#
# Usage: <script> <data_subdir> <dest_subdir> <file_ext> <tail>
#
# Every file name returned for data/<data_subdir>/ that has the form
# <strain><tail>.<file_ext> is copied to
# data/<data_subdir>/<dest_subdir>/<strain>.fas, and <strain> is printed
# once per copied file. Names that do not match are skipped.
#
# NOTE: <file_ext> and <tail> are spliced into regular expressions as-is,
# so regex metacharacters in them are interpreted rather than matched
# literally.

import re
from sys import argv
from shutil import copyfile

from libs.common import from_dir, ensure_dir

# fail with a readable message instead of an IndexError traceback
if len(argv) < 5:
    raise SystemExit(
        "usage: %s <data_subdir> <dest_subdir> <file_ext> <tail>" % argv[0]
    )

origin_dir = "data/" + argv[1] + "/"
destin_dir = origin_dir + argv[2] + "/"
file_ext = argv[3]
tail = argv[4]

ensure_dir([destin_dir])

# presumably from_dir returns the names in origin_dir that match the given
# pattern -- confirm against libs.common
filenames = from_dir(origin_dir, re.compile(r'.*\.' + file_ext))

# The pattern is the same for every file, so it is compiled once here.
# Group 1 captures the strain name, i.e. everything before the tail.
# Raw strings keep the backslash from being read as a string escape.
pattern = re.compile(r'^(.*)' + tail + r'\.' + file_ext + r'$')

for filename in filenames:
    # identify strain name
    capture = pattern.match(filename)
    if not capture:
        continue
    strain = capture.group(1)
    # copy file under its stripped name
    copyfile(origin_dir + filename, destin_dir + strain + ".fas")
    # a parenthesised single argument prints identically on Python 2 and 3
    print(strain)
# script to generate a genome set file for bb_mapper from dir contents import re from sys import argv from libs.common import from_dir, load_fasta, load_multifasta, load_genbank data_dir = "data/"+argv[1]+"/" seq_dir = data_dir+argv[2]+"/" py_out = data_dir+argv[3]+"_set.py" min_size = argv[4] set_lines = ["all = ["] filenames = from_dir(seq_dir, re.compile(r'.*\..*')) counter = 1 for filename in filenames: print filename, while True: if filename.find(".gbk") > 0: # process genbank try: record = load_genbank(seq_dir+filename) except IOError: print "failed to load Genbank file" break except Exception:
## script to strip trailing tails from genome file names
#
# Usage: <script> <data_subdir> <dest_subdir> <file_ext> <tail>
#
# Every file name returned for data/<data_subdir>/ that has the form
# <strain><tail>.<file_ext> is copied to
# data/<data_subdir>/<dest_subdir>/<strain>.fas, and <strain> is printed
# once per copied file. Names that do not match are skipped.
#
# NOTE: <file_ext> and <tail> are spliced into regular expressions as-is,
# so regex metacharacters in them are interpreted rather than matched
# literally.

import re
from sys import argv
from shutil import copyfile

from libs.common import from_dir, ensure_dir

# fail with a readable message instead of an IndexError traceback
if len(argv) < 5:
    raise SystemExit(
        "usage: %s <data_subdir> <dest_subdir> <file_ext> <tail>" % argv[0]
    )

origin_dir = "data/"+argv[1]+"/"
destin_dir = origin_dir+argv[2]+"/"
file_ext = argv[3]
tail = argv[4]

ensure_dir([destin_dir])

# presumably from_dir returns the names in origin_dir that match the given
# pattern -- confirm against libs.common
filenames = from_dir(origin_dir, re.compile(r'.*\.'+file_ext))

# The pattern is the same for every file, so it is compiled once here.
# Group 1 captures the strain name, i.e. everything before the tail.
# Raw strings keep the backslash from being read as a string escape.
pattern = re.compile(r'^(.*)'+tail+r'\.'+file_ext+r'$')

for filename in filenames:
    # identify strain name
    capture = pattern.match(filename)
    if not capture:
        continue
    strain = capture.group(1)
    # copy file under its stripped name
    copyfile(origin_dir+filename, destin_dir+strain+".fas")
    # a parenthesised single argument prints identically on Python 2 and 3
    print(strain)
## script to compile basic stats about sets of contigs import re from sys import argv from libs.common import load_multifasta, from_dir import matplotlib.pyplot as plt import numpy as np data_dir = "data/" + argv[1] filenames = from_dir(data_dir, re.compile(r'.*\.fas.*')) ctg_ns = [] n50s = [] for filename in filenames: # load contigs from file contig_list = load_multifasta(data_dir + "/" + filename) # count contigs ctg_count = len(contig_list) if ctg_count < 200: ctg_ns.append(ctg_count) else: ctg_ns.append(200) # sort contig list by size contig_list.sort(key=len) contig_list.reverse() # count full sequence length full_seq_length = 0
infile = data_dir+argv[3] # must be a fasta file with query sequences file_ext = argv[4] blast_mode = argv[5] if len(argv) > 5: blast_mode = argv[5] else: blast_mode = 'n' # nucleotide blast by default blast_out = data_dir+"blast_out/" ensure_dir([blast_out]) queries = load_multifasta(infile) filenames = from_dir(dir_in, re.compile(r'.*\.'+file_ext)) for filename in filenames: rec_name = filename[:filename.find("."+file_ext)] print rec_name, genome_path = dir_in+filename dbfile_path = "data/blast_db/"+rec_name while True: if not path.exists(dbfile_path+".nhr"): if file_ext == 'gbk': try: print "converting,", record = load_genbank(genome_path)
## script to rename and copy sets of files
#
# Usage: <script> <origin_subdir> <dest_subdir> <prefix> <postfix>
#
# Every file name returned for data/<origin_subdir> that has the form
# <prefix><strain><postfix> is copied to data/<dest_subdir>/<strain>.fas.
# For each copied file the strain name and a running count are printed.
# Names that do not match are skipped.
#
# NOTE: <prefix> and <postfix> are spliced into a regular expression as-is,
# so regex metacharacters in them are interpreted rather than matched
# literally.

import re
from sys import argv
from shutil import copyfile

from libs.common import from_dir, ensure_dir

# fail with a readable message instead of an IndexError traceback
if len(argv) < 5:
    raise SystemExit(
        "usage: %s <origin_subdir> <dest_subdir> <prefix> <postfix>" % argv[0]
    )

origin_dir = "data/"+argv[1]
destin_dir = "data/"+argv[2]+"/"
prefix = argv[3]
postfix = argv[4]

ensure_dir([destin_dir])

# presumably from_dir returns the names in origin_dir that match the given
# pattern -- confirm against libs.common
filenames = from_dir(origin_dir, re.compile(r'.*\.fas.*'))

# The pattern is the same for every file, so it is compiled once here.
# Group 1 captures the strain name between prefix and postfix.
pattern = re.compile(r'^'+prefix+r'(.*)'+postfix+r'$')

# number of files copied so far; printed next to each strain name
counter = 0
for filename in filenames:
    # identify strain name
    capture = pattern.match(filename)
    if not capture:
        continue
    counter += 1
    strain = capture.group(1)
    # copy file under its new name
    copyfile(origin_dir+"/"+filename, destin_dir+strain+".fas")
    # %-formatting yields the same "<strain> <count>" line on Python 2 and 3
    print("%s %s" % (strain, counter))
infile = data_dir + argv[3] # must be a fasta file with query sequences file_ext = argv[4] blast_mode = argv[5] if len(argv) > 5: blast_mode = argv[5] else: blast_mode = 'n' # nucleotide blast by default blast_out = data_dir + "blast_out/" ensure_dir([blast_out]) queries = load_multifasta(infile) filenames = from_dir(dir_in, re.compile(r'.*\.' + file_ext)) for filename in filenames: rec_name = filename[:filename.find("." + file_ext)] print rec_name, genome_path = dir_in + filename dbfile_path = "data/blast_db/" + rec_name while True: if not path.exists(dbfile_path + ".nhr"): if file_ext == 'gbk': try: print "converting,", record = load_genbank(genome_path)
import re from os import path from sys import argv from Bio.SeqRecord import SeqRecord from libs.common import from_dir, read_array, blast_dtypes, load_fasta, write_fasta data_dir = "data/" + argv[1] + "/" blast_out_dir = "data/" + argv[1] + "/blast_out/" idp = int(argv[2]) main_out = open(data_dir + "comp_results.txt", 'w') records_dict = {} # list files in blast results directory filenames = from_dir(blast_out_dir, re.compile(r'.*\.txt.*')) for filename in filenames: counter = 0 # load text rec_array = read_array(blast_out_dir + filename, blast_dtypes) # parse lines for line in rec_array: # if idp is higher than spec'd: if line[2] > idp: query = line[0] subject = line[1] # write line to compiled results file main_out.write("\t".join([str(item) for item in line]) + "\n") outfile = data_dir + query + "_results.txt" if not path.exists(outfile): # create file
# script to generate a genome set file for bb_mapper from dir contents import re from sys import argv from libs.common import from_dir, load_fasta, load_multifasta, load_genbank data_dir = "data/" + argv[1] + "/" seq_dir = data_dir + argv[2] + "/" py_out = data_dir + argv[3] + "_set.py" min_size = argv[4] set_lines = ["all = ["] filenames = from_dir(seq_dir, re.compile(r'.*\..*')) counter = 1 for filename in filenames: print filename, while True: if filename.find(".gbk") > 0: # process genbank try: record = load_genbank(seq_dir + filename) except IOError: print "failed to load Genbank file" break except Exception:
import re from sys import argv from libs.common import load_genbank, write_fasta, ensure_dir, from_dir data_dir = "data/"+argv[1]+"/" dir_in = "data/"+argv[2]+"/" feat_type = argv[3] feat_tag = argv[4] feat_name = argv[5] main_out = data_dir+feat_name+"_seqs.fas" records = [] ensure_dir([data_dir]) filenames = from_dir(dir_in, re.compile(r'.*\.gbk')) for filename in filenames: rec_name = filename[:filename.find(".gbk")] print '.', # load data record = load_genbank(dir_in+"/"+filename) # scan annotations for feat in record.features: if feat.type == feat_type: try: if feat_name in feat.qualifiers[feat_tag]: print '\nfound', feat_name, 'in', rec_name # extract sequence
import re from sys import argv from libs.common import load_genbank, write_fasta, ensure_dir, from_dir data_dir = "data/" + argv[1] + "/" dir_in = "data/" + argv[2] + "/" feat_type = argv[3] feat_tag = argv[4] feat_name = argv[5] main_out = data_dir + feat_name + "_seqs.fas" records = [] ensure_dir([data_dir]) filenames = from_dir(dir_in, re.compile(r'.*\.gbk')) for filename in filenames: rec_name = filename[:filename.find(".gbk")] print '.', # load data record = load_genbank(dir_in + "/" + filename) # scan annotations for feat in record.features: if feat.type == feat_type: try: if feat_name in feat.qualifiers[feat_tag]: print '\nfound', feat_name, 'in', rec_name # extract sequence
## script to rename and copy sets of files
#
# Usage: <script> <origin_subdir> <dest_subdir> <prefix> <postfix>
#
# Every file name returned for data/<origin_subdir> that has the form
# <prefix><strain><postfix> is copied to data/<dest_subdir>/<strain>.fas.
# For each copied file the strain name and a running count are printed.
# Names that do not match are skipped.
#
# NOTE: <prefix> and <postfix> are spliced into a regular expression as-is,
# so regex metacharacters in them are interpreted rather than matched
# literally.

import re
from sys import argv
from shutil import copyfile

from libs.common import from_dir, ensure_dir

# fail with a readable message instead of an IndexError traceback
if len(argv) < 5:
    raise SystemExit(
        "usage: %s <origin_subdir> <dest_subdir> <prefix> <postfix>" % argv[0]
    )

origin_dir = "data/" + argv[1]
destin_dir = "data/" + argv[2] + "/"
prefix = argv[3]
postfix = argv[4]

ensure_dir([destin_dir])

# presumably from_dir returns the names in origin_dir that match the given
# pattern -- confirm against libs.common
filenames = from_dir(origin_dir, re.compile(r'.*\.fas.*'))

# The pattern is the same for every file, so it is compiled once here.
# Group 1 captures the strain name between prefix and postfix.
pattern = re.compile(r'^' + prefix + r'(.*)' + postfix + r'$')

# number of files copied so far; printed next to each strain name
counter = 0
for filename in filenames:
    # identify strain name
    capture = pattern.match(filename)
    if not capture:
        continue
    counter += 1
    strain = capture.group(1)
    # copy file under its new name
    copyfile(origin_dir + "/" + filename, destin_dir + strain + ".fas")
    # %-formatting yields the same "<strain> <count>" line on Python 2 and 3
    print("%s %s" % (strain, counter))
## script to compile basic stats about sets of contigs import re from sys import argv from libs.common import load_multifasta, from_dir import matplotlib.pyplot as plt import numpy as np data_dir = "data/"+argv[1] filenames = from_dir(data_dir, re.compile(r'.*\.fas.*')) ctg_ns = [] n50s = [] for filename in filenames: # load contigs from file contig_list = load_multifasta(data_dir+"/"+filename) # count contigs ctg_count = len(contig_list) if ctg_count < 200: ctg_ns.append(ctg_count) else: ctg_ns.append(200) # sort contig list by size contig_list.sort(key=len) contig_list.reverse() # count full sequence length full_seq_length = 0
import re from os import path from sys import argv from Bio.SeqRecord import SeqRecord from libs.common import from_dir, read_array, blast_dtypes, load_fasta, write_fasta data_dir = "data/"+argv[1]+"/" blast_out_dir = "data/"+argv[1]+"/blast_out/" idp = int(argv[2]) main_out = open(data_dir+"comp_results.txt", 'w') records_dict = {} # list files in blast results directory filenames = from_dir(blast_out_dir, re.compile(r'.*\.txt.*')) for filename in filenames: counter = 0 # load text rec_array = read_array(blast_out_dir+filename, blast_dtypes) # parse lines for line in rec_array: # if idp is higher than spec'd: if line[2] > idp: query = line[0] subject = line[1] # write line to compiled results file main_out.write("\t".join([str(item) for item in line])+"\n") outfile = data_dir+query+"_results.txt" if not path.exists(outfile): # create file
if len(argv) < 5: trim_ids = '' else: trim_ids = argv[4] blast_dir = origin_dir+"blast/" hits_dir = origin_dir+"hits/" remote_prot_db = "nr" annot_gbk_dir = origin_dir+"annot_gbk/" annot_aa_dir = origin_dir+"annot_aa/" trn_file = origin_dir+"prodigal.trn" ensure_dir([annot_gbk_dir, annot_aa_dir, blast_dir, hits_dir]) filenames = from_dir(seq_dir, re.compile(r'.*\.'+file_ext+'.*')) for filename in filenames: rec_name = filename[:filename.find(trim_ids+"."+file_ext)] print rec_name, "..." # load data if file_ext == 'fas': fas_file = seq_dir+"/"+filename gbk_file = fas2gbk(fas_file) record = load_genbank(gbk_file) else: gbk_file = seq_dir+"/"+filename fas_file = gbk2fas(gbk_file) record = load_genbank(gbk_file)
if len(argv) < 5: trim_ids = '' else: trim_ids = argv[4] blast_dir = origin_dir + "blast/" hits_dir = origin_dir + "hits/" remote_prot_db = "nr" annot_gbk_dir = origin_dir + "annot_gbk/" annot_aa_dir = origin_dir + "annot_aa/" trn_file = origin_dir + "prodigal.trn" ensure_dir([annot_gbk_dir, annot_aa_dir, blast_dir, hits_dir]) filenames = from_dir(seq_dir, re.compile(r'.*\.' + file_ext + '.*')) for filename in filenames: rec_name = filename[:filename.find(trim_ids + "." + file_ext)] print rec_name, "..." # load data if file_ext == 'fas': fas_file = seq_dir + "/" + filename gbk_file = fas2gbk(fas_file) record = load_genbank(gbk_file) else: gbk_file = seq_dir + "/" + filename fas_file = gbk2fas(gbk_file) record = load_genbank(gbk_file)