def main(args=None):
    info()
    if args is None:
        args = sys.argv[1:]
    if "-h" in args or "--help" in args:
        usage()
        sys.exit(2)
    if "-id" in args:
        run_id = get_arg(args, "-id")
    else:
        run_id = str(int(time.time()))  # use timestamp as unique run identifier
    if "-bm" in args:
        blast_mode = get_arg(args, "-bm")
    else:
        blast_mode = 'n'  # nucleotide blast by default
    if "-resume" in args:
        step = int(get_arg(args, "-resume"))
        resume = True
    else:
        step = 0
        resume = False
    if "-short" in args:
        limit = 6
    else:
        limit = 100  # unnecessarily high cap
    if "-ctg" in args:
        ctg_subset = get_arg(args, "-ctg")
    else:
        ctg_subset = 'exclude'
    if "-g" in args:
        g_select = get_arg(args, "-g")
    else:
        g_select = None
    start_timestamp = str(datetime.now())
    # ensure existence of all directories
    ensure_dir(fixed_dirs.values())
    run_dirs_go = ["".join([r_root_dir, run_id, "/", rdir])
                   for rdir in run_dirs.values()]
    ensure_dir(run_dirs_go)
    # define pickle paths
    pickle_root = r_root_dir+run_id+"/"+run_dirs['pickles']+run_id
    ref_pickles = pickle_root+"_refs.p"
    genome_pickles = pickle_root+"_genomes.p"
    blast_pickles = pickle_root+"_blast.p"
    match_pickles = pickle_root+"_matches.p"
    # check for pickles
    run_refs = []
    run_gs = []
    run_blast = False
    run_matches = []
    if resume:
        # matches
        if step > 4:
            try:
                run_matches = pickle.load(open(match_pickles, 'rb'))
            except IOError:
                print "WARNING: Could not load matches pickle"
                run_matches = []
                step = 4
        # blast
        if step > 3:
            try:
                run_blast = pickle.load(open(blast_pickles, 'rb'))
            except IOError:
                print "WARNING: Could not load blast pickle"
                run_blast = False
                step = 3
        # genomes
        if step > 2:
            try:
                run_gs = pickle.load(open(genome_pickles, 'rb'))
            except IOError:
                print "WARNING: Could not load genomes pickle"
                run_gs = []
                step = 2
        # references
        if step > 1:
            try:
                run_refs = pickle.load(open(ref_pickles, 'rb'))
            except IOError:
                print "WARNING: Could not load refs pickle"
                run_refs = []
                step = 1
    else:
        step = 0

    ## pipeline
    print "starting pipeline"
    print step, limit
    if resume:
        log_resume_run(run_id, base_root, project_id, start_timestamp, step)
    else:
        print "\n###", step, ". Set up logging & reporting ###\n"
        log_start_run(run_id, base_root, project_id, run_dirs, start_timestamp)
        save_datasumm(run_id, blast_mode, r_root_dir, run_dirs, genomes,
                      references, project_id, project_date, start_timestamp)
        init_reports(run_id, fixed_dirs, ctg_thresholds, start_timestamp)
        step += 1
    while step < limit:
        if step == 1:
            print "\n###", step, ". Prepare references ###\n"
            for ref in references:
                timestamp = str(datetime.now())
                ref_obj = process_ref(ref, ref_annot_flag, r_root_dir,
                                      fixed_dirs, run_dirs, run_id, timestamp,
                                      prot_db_name, project_id)
                run_refs.append(ref_obj)
            if os.path.exists(ref_pickles):
                os.remove(ref_pickles)
            pickle.dump(run_refs, open(ref_pickles, 'wb'))
            step += 1
        elif step == 2:
            print "\n###", step, ". Prepare genomes ###\n"
            for genome in genomes:
                unpack_genomes(genome, separator, fixed_dirs, ctg_thresholds)
                make_genome_DB(genome, fixed_dirs)
            run_gs = add_refs_2g(genomes, references)
            if os.path.exists(genome_pickles):
                os.remove(genome_pickles)
            pickle.dump(run_gs, open(genome_pickles, 'wb'))
            step += 1
        elif step == 3:
            print "\n###", step, ". Blast reference segments against genomes ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                run_blast = basic_batch_blast(run_gs, ref, blast_mode,
                                              r_root_dir, run_dirs, fixed_dirs,
                                              blast_prefs, run_id, timestamp)
            if os.path.exists(blast_pickles):
                os.remove(blast_pickles)
            pickle.dump(run_blast, open(blast_pickles, 'wb'))
            step += 1
        elif step == 4:
            print "\n###", step, ". Collect Blast results ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                ref_hits, ctl_scores = glompX_blast_out(
                    run_gs, ref, blast_mode, r_root_dir, run_dirs, run_id,
                    fixed_dirs, blast_dtypes, references, min_nt_match,
                    min_nt_score, min_nt_idp, min_aa_match, min_aa_score,
                    min_aa_idp, capture_span, timestamp)
                ref_matches = {'ref': ref, 'run': run_id,
                               'hits': ref_hits, 'ctl': ctl_scores}
                run_matches.append(ref_matches)
            if os.path.exists(match_pickles):
                os.remove(match_pickles)
            pickle.dump(run_matches, open(match_pickles, 'wb'))
            step += 1
        elif step == 5:
            print "\n###", step, ". Make match results table & graphs ###\n"
            for match_dict in run_matches:
                timestamp = str(datetime.now())
                matches_table(match_dict, r_root_dir, run_dirs, timestamp)
            step += 1
        ### model evaluation and filtering goes here
        elif step == 6:
            print "\n###", step, ". Annotate matching contigs ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                annot_genome_contigs(ref, prot_db_name, fixed_dirs, r_root_dir,
                                     run_id, run_dirs, genomes, project_id,
                                     timestamp, blast_prefs)
            step += 1
        elif step == 7:
            print "\n###", step, ". Align contigs pairwise to reference ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                align_ctg2ref(ref, run_id, timestamp, r_root_dir, run_dirs,
                              genomes, mauve_exec, max_size, chop_mode, mtype)
            step += 1
        elif step == 8:
            print "\n###", step, ". Construct backbone-based scaffolds ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                build_scaffolds(ref, r_root_dir, run_dirs, prox_D, separator,
                                genomes, run_id, timestamp, mtype, ctg_subset)
            step += 1
        elif step == 9:
            print "\n###", step, ". Align constructs pairwise to reference ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                align_cstrct2ref(ref, run_id, timestamp, r_root_dir, run_dirs,
                                 genomes, max_size, chop_mode, mtype, mauve_exec)
            step += 1
        elif step == 10:
            print "\n###", step, ". Generate maps ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                prep_maps(ref, run_id, timestamp, g_select, r_root_dir,
                          run_dirs, genomes, fixed_dirs, segtype, min_size,
                          fct_flags, fct_colors, idpt)
            step += 1
        elif step > 10:
            break
    stop_timestamp = str(datetime.now())
    log_end_run(run_id, base_root, project_id, stop_timestamp)
    print "\n### Nothing more to do! ###\n"
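# Illustrative sketch, not part of the pipeline source: the flag parsing above
# assumes a get_arg() helper that returns the token following a flag on the
# command line. A minimal version consistent with that usage could be:
def get_arg(args, flag):
    """Return the value that immediately follows `flag` in the argument list."""
    return args[args.index(flag)+1]

# Hypothetical invocations (script name illustrative, flags as defined above):
#   python pipeline.py -id run_001 -bm n -short
#   python pipeline.py -resume 4    # reload pickles and restart at step 4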
    counter += 1
    while True:
        try:
            fname = EFetcher(rec_id, data_dir)
        except Exception:
            print "Error retrieving record"
            break
        else:
            if rec_id[0:2] == 'NZ':  # disposition for WGS record sets
                print "fetching WGS dataset",
                # create a dedicated directory
                seqdir = data_dir + rec_id + "/"
                ensure_dir([seqdir])
                # open genome record stub to get the contig count
                fname = data_dir + rec_id + ".gbk"
                try:
                    stub = load_genbank(fname)
                except IOError:
                    print "Error loading", fname
                    break
                base_code = stub.annotations['wgs'][0][:10]  # 7 if not NZ_
                ctg_num = int(stub.annotations['wgs'][-1][10:])  # 7
                records = []
                # fetch contig records
import re
from os import path
from sys import argv
from libs.common import load_multifasta, \
    ensure_dir, from_dir, read_array, blast_dtypes

data_dir = "data/"+argv[1]+"/"
dir_in = data_dir+argv[2]+"/"
infile = data_dir+argv[3]  # must be a fasta file with query sequences
file_ext = argv[4]
if len(argv) > 5:
    blast_mode = argv[5]
else:
    blast_mode = 'n'  # nucleotide blast by default
blast_out = data_dir+"blast_out/"
ensure_dir([blast_out])
queries = load_multifasta(infile)
filenames = from_dir(dir_in, re.compile(r'.*\.'+file_ext))
for filename in filenames:
    rec_name = filename[:filename.find("."+file_ext)]
    print rec_name,
    genome_path = dir_in+filename
    dbfile_path = "data/blast_db/"+rec_name
    while True:
        if not path.exists(dbfile_path+".nhr"):
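# Hypothetical invocation inferred from the argv positions above (script and
# directory names are illustrative, not from the source):
#   python batch_blast.py my_dataset genomes queries.fas fas n
# This would blast data/my_dataset/queries.fas against every .fas file in
# data/my_dataset/genomes/, writing output under data/my_dataset/blast_out/.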
from sys import argv
from libs.common import load_genbank, write_fasta, make_blastDB, \
    local_tblastn_2file, read_array, blast_dtypes, ensure_dir
from libs.tetris import segment_finder
from Bio.SeqRecord import SeqRecord
import cPickle as pickle
from sets.NheA_ctxt_set import test as genomes

data_dir = 'data/'+argv[1]+'/'
seq_dir = data_dir+argv[2]+'/'
out_dir = data_dir+argv[3]+'/'
feat_type = argv[4]
threshold = int(argv[5])
min_com = int(argv[6])  # min number of non-core feats in common within groups
ensure_dir([out_dir])
db_file = out_dir+'ref_DB.fas'
db_path = out_dir+'refs'
core_genome_pickle = out_dir+'core_genome.pik'
cluster_set_file = out_dir+'clusters.py'
new_DB = True
init_DB = False
symbolDB = {}
vectorDB = {}
segmentDB = {}
# vectorDB is a dict that contains genome-keyed dicts,
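# Hypothetical invocation inferred from the argv positions above (script name
# and argument values are illustrative, not from the source):
#   python nhea_ctxt_cluster.py my_dataset annot_gbk clusters CDS 90 3
# i.e. data dir, sequence subdir, output subdir, feature type, threshold, and
# the minimum number of non-core features shared within a group.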
# script to capture sequences from the results of a batch blast
from sys import argv
from Bio.SeqRecord import SeqRecord
from libs.common import read_array, blast_dtypes, load_fasta, write_fasta, \
    ensure_dir

data_dir = "data/" + argv[1] + "/"
main_in = data_dir + argv[2] + "_results.txt"
main_out = data_dir + argv[2] + "_ctxt.fas"
ctx_dir = data_dir + "context/"
capture_span = int(argv[3])
ensure_dir([ctx_dir])
records = []
rec_array = read_array(main_in, blast_dtypes)
descript = "Context of " + argv[2] + " (" + argv[3] + " bp either side)"
for line in rec_array:
    query = line[0]
    subject = line[1]
    print subject
    rev_flag = False
    if line[8] < line[9]:
        q_start, q_stop = line[8] - 1, line[9]
        rev_flag = False
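# Hypothetical invocation inferred from the argv positions above (values are
# illustrative, not from the source):
#   python capture_context.py my_dataset nheA 5000
# which reads data/my_dataset/nheA_results.txt and captures 5000 bp of
# flanking context either side of each hit into data/my_dataset/nheA_ctxt.fas.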
# script to capture features based on annotation tags
import re
from sys import argv
from libs.common import load_genbank, write_fasta, ensure_dir, from_dir

data_dir = "data/" + argv[1] + "/"
dir_in = "data/" + argv[2] + "/"
feat_type = argv[3]
feat_tag = argv[4]
feat_name = argv[5]
main_out = data_dir + feat_name + "_seqs.fas"
records = []
ensure_dir([data_dir])
filenames = from_dir(dir_in, re.compile(r'.*\.gbk'))
for filename in filenames:
    rec_name = filename[:filename.find(".gbk")]
    print '.',
    # load data
    record = load_genbank(dir_in + "/" + filename)
    # scan annotations
    for feat in record.features:
        if feat.type == feat_type:
            try:
                if feat_name in feat.qualifiers[feat_tag]:
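# Hypothetical invocation inferred from the argv positions above (feature
# values are illustrative, not from the source):
#   python capture_features.py my_dataset annot_gbk CDS product nheA
# i.e. output data dir, GenBank input dir, feature type, qualifier tag to
# scan, and the name to match; hits are written to data/my_dataset/nheA_seqs.fas.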
import re
from os import path
from sys import argv
from libs.common import from_dir, ensure_dir, fas2gbk, gbk2fas, write_genbank, \
    load_genbank, train_prodigal, run_prodigal, load_multifasta
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Alphabet import generic_dna

origin_dir = "data/"+argv[1]+"/"
file_ext = argv[2]
annot_gbk_dir = origin_dir+"annot_gbk/"
annot_aa_dir = origin_dir+"annot_aa/"
trn_file = origin_dir+"prodigal.trn"
ensure_dir([annot_gbk_dir, annot_aa_dir])
filenames = from_dir(origin_dir, re.compile(r'.*\.'+file_ext+'.*'))
for filename in filenames:
    rec_name = filename[:filename.find("."+file_ext)]
    print rec_name, "...",
    # load data
    if file_ext == 'fas':
        fas_file = origin_dir+"/"+filename
        gbk_file = fas2gbk(fas_file)
        record = load_genbank(gbk_file)
    else:
        gbk_file = origin_dir+"/"+filename
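# Hypothetical invocation inferred from the argv positions above (values are
# illustrative, not from the source):
#   python annot_prodigal.py my_dataset fas
# which trains and runs Prodigal over every .fas file in data/my_dataset/ and
# writes annotated records to annot_gbk/ and predicted proteins to annot_aa/.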
if len(argv) > 1 and argv[1] == '-h':
    print "Basic usage: \n", \
        "$ python main_script.py [step#]\n"
    exit()
if len(argv) < 2:
    step = 0
else:
    step = int(argv[1])

if step == 0:
    ### STEP 0: Ensure that all base directories exist ###
    print "\n###", step, ". Setting up the work environment ###"
    for dir_name in directories.keys():
        ensure_dir(directories[dir_name])
    step += 1

if step == 1:
    ### STEP 1: Trim & bin reads (based on results of FastQC) ###
    print "\n###", step, ". Trim & bin, then split for batching ###"
    for dataset in datasets:
        dataset['trim_files'] = []
        bin_counts = trim_illumina(dataset)
        print bin_counts  # TO LOG
        dataset['mft_files'] = []
        for trim_file in dataset['trim_files']:
            mft_count = simple_q2a(dataset, trim_file)
            #print mft_count # TO LOG
        for mft_file in dataset['mft_files']:
            print mft_file
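# Illustrative sketch, assuming behaviour only: the ensure_dir() calls used
# throughout these scripts appear to create any listed directory that does not
# yet exist. The project's own libs.common version may differ; a minimal
# stand-in consistent with that usage would be:
import os

def ensure_dir(dir_list):
    """Create every directory in dir_list that does not already exist."""
    for d in dir_list:
        if not os.path.exists(d):
            os.makedirs(d)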
file_ext = argv[3]
if len(argv) < 5:
    trim_ids = ''
else:
    trim_ids = argv[4]
blast_dir = origin_dir+"blast/"
hits_dir = origin_dir+"hits/"
remote_prot_db = "nr"
annot_gbk_dir = origin_dir+"annot_gbk/"
annot_aa_dir = origin_dir+"annot_aa/"
trn_file = origin_dir+"prodigal.trn"
ensure_dir([annot_gbk_dir, annot_aa_dir, blast_dir, hits_dir])
filenames = from_dir(seq_dir, re.compile(r'.*\.'+file_ext+'.*'))
for filename in filenames:
    rec_name = filename[:filename.find(trim_ids+"."+file_ext)]
    print rec_name, "..."
    # load data
    if file_ext == 'fas':
        fas_file = seq_dir+"/"+filename
        gbk_file = fas2gbk(fas_file)
        record = load_genbank(gbk_file)
    else:
        gbk_file = seq_dir+"/"+filename
## script to strip trailing tails from genome file names
import re
from sys import argv
from libs.common import from_dir, ensure_dir
from shutil import copyfile

origin_dir = "data/"+argv[1]+"/"
destin_dir = origin_dir+argv[2]+"/"
file_ext = argv[3]
tail = argv[4]
ensure_dir([destin_dir])
filenames = from_dir(origin_dir, re.compile(r'.*\.'+file_ext))
counter = 0
for filename in filenames:
    # identify strain name
    pattern = re.compile(r'^(.*)'+tail+r'\.'+file_ext+r'$')
    capture = re.match(pattern, filename)
    # substitute new name
    if capture:
        counter += 1
        new_filename = capture.group(1)+".fas"
        # copy file
        copyfile(origin_dir+filename, destin_dir+new_filename)
        print capture.group(1)
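# Hypothetical invocation inferred from the argv positions above (values are
# illustrative, not from the source):
#   python strip_tails.py genomes clean fas _contigs
# which copies data/genomes/STRAIN_contigs.fas to data/genomes/clean/STRAIN.fas
# for every matching file, stripping the trailing "_contigs" tail.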
def main(args=None):
    info()
    if args is None:
        args = sys.argv[1:]
    if "-h" in args or "--help" in args:
        usage()
        sys.exit(2)
    if "-id" in args:
        run_id = get_arg(args, "-id")
    else:
        run_id = str(int(time.time()))  # use timestamp as unique run identifier
    if "-bm" in args:
        blast_mode = get_arg(args, "-bm")
    else:
        blast_mode = 'n'  # nucleotide blast by default
    if "-resume" in args:
        step = int(get_arg(args, "-resume"))
        resume = True
    else:
        step = 0
        resume = False
    if "-short" in args:
        limit = 6
    else:
        limit = 100  # unnecessarily high cap
    if "-filter" in args:
        threshold = int(get_arg(args, "-filter"))
        resume = True
        step = 6
        limit = 7
    else:
        threshold = 5  # reduce for small references
        limit = 100  # unnecessarily high cap
    if "-ctg" in args:
        ctg_subset = get_arg(args, "-ctg")
    else:
        ctg_subset = 'exclude'
    if "-g" in args:
        g_select = get_arg(args, "-g")
    else:
        g_select = None
    start_timestamp = str(datetime.now())
    # ensure existence of all directories
    ensure_dir(fixed_dirs.values())
    run_dirs_go = ["".join([r_root_dir, run_id, "/", rdir])
                   for rdir in run_dirs.values()]
    ensure_dir(run_dirs_go)
    # define pickle paths
    pickle_root = r_root_dir+run_id+"/"+run_dirs['pickles']+run_id
    ref_pickles = pickle_root+"_refs.p"
    genome_pickles = pickle_root+"_genomes.p"
    blast_pickles = pickle_root+"_blast.p"
    match_pickles = pickle_root+"_matches.p"
    norm_pickles = pickle_root+"_norm.p"
    # check for pickles
    run_refs = []
    run_gs = []
    run_blast = False
    run_matches = []
    run_norm_matches = {}
    if resume:
        # normalized matches
        if step > 5:
            try:
                run_norm_matches = pickle.load(open(norm_pickles, 'rb'))
            except IOError:
                print "WARNING: Could not load norm pickle"
                run_norm_matches = {}
                step = 5
        # matches
        if step > 4:
            try:
                run_matches = pickle.load(open(match_pickles, 'rb'))
            except IOError:
                print "WARNING: Could not load matches pickle"
                run_matches = []
                step = 4
        # blast
        if step > 3:
            try:
                run_blast = pickle.load(open(blast_pickles, 'rb'))
            except IOError:
                print "WARNING: Could not load blast pickle"
                run_blast = False
                step = 3
        # genomes
        if step > 2:
            try:
                run_gs = pickle.load(open(genome_pickles, 'rb'))
            except IOError:
                print "WARNING: Could not load genomes pickle"
                run_gs = []
                step = 2
        # references
        if step > 1:
            try:
                run_refs = pickle.load(open(ref_pickles, 'rb'))
            except IOError:
                print "WARNING: Could not load refs pickle"
                run_refs = []
                step = 1
    else:
        step = 0

    ## pipeline
    print "starting pipeline"
    print step, limit
    if resume:
        log_resume_run(run_id, base_root, project_id, start_timestamp, step)
    else:
        print "\n###", step, ". Set up logging & reporting ###\n"
        log_start_run(run_id, base_root, project_id, run_dirs, start_timestamp)
        save_datasumm(run_id, blast_mode, r_root_dir, run_dirs, genomes,
                      references, project_id, project_date, start_timestamp)
        init_reports(run_id, fixed_dirs, ctg_thresholds, start_timestamp)
        step += 1
    while step < limit:
        if step == 1:
            print "\n###", step, ". Prepare references ###\n"
            for ref in references:
                timestamp = str(datetime.now())
                ref_obj = process_ref(ref, ref_annot_flag, r_root_dir,
                                      fixed_dirs, run_dirs, run_id, timestamp,
                                      prot_db_name, project_id)
                run_refs.append(ref_obj)
            if os.path.exists(ref_pickles):
                os.remove(ref_pickles)
            pickle.dump(run_refs, open(ref_pickles, 'wb'))
            step += 1
        elif step == 2:
            print "\n###", step, ". Prepare genomes ###\n"
            for genome in genomes:
                unpack_genomes(genome, separator, fixed_dirs, ctg_thresholds)
                make_genome_DB(genome, fixed_dirs)
            run_gs = add_refs_2g(genomes, references)
            if os.path.exists(genome_pickles):
                os.remove(genome_pickles)
            pickle.dump(run_gs, open(genome_pickles, 'wb'))
            step += 1
        elif step == 3:
            print "\n###", step, ". Blast reference segments against genomes ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                run_blast = basic_batch_blast(run_gs, ref, blast_mode,
                                              r_root_dir, run_dirs, fixed_dirs,
                                              blast_prefs, run_id, timestamp)
            if os.path.exists(blast_pickles):
                os.remove(blast_pickles)
            pickle.dump(run_blast, open(blast_pickles, 'wb'))
            step += 1
        elif step == 4:
            print "\n###", step, ". Collect Blast results ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                ref_hits, ctl_scores = glompX_blast_out(
                    run_gs, ref, blast_mode, r_root_dir, run_dirs, run_id,
                    fixed_dirs, blast_dtypes, references, min_nt_match,
                    min_nt_score, min_nt_idp, min_aa_match, min_aa_score,
                    min_aa_idp, capture_span, timestamp)
                ref_matches = {'ref': ref, 'run': run_id,
                               'hits': ref_hits, 'ctl': ctl_scores}
                run_matches.append(ref_matches)
            if os.path.exists(match_pickles):
                os.remove(match_pickles)
            pickle.dump(run_matches, open(match_pickles, 'wb'))
            step += 1
        elif step == 5:
            print "\n###", step, ". Make match results table & graphs ###\n"
            for ref_matches in run_matches:
                timestamp = str(datetime.now())
                ref_norm_matches = matches_table(ref_matches, r_root_dir,
                                                 run_dirs, timestamp)
                run_norm_matches[ref_matches['ref'].name] = ref_norm_matches
            if os.path.exists(norm_pickles):
                os.remove(norm_pickles)
            pickle.dump(run_norm_matches, open(norm_pickles, 'wb'))
            step += 1
        elif step == 6:
            print "\n###", step, ". Filter matching contigs ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                filter_contigs(ref, run_id, genomes, run_norm_matches[ref.name],
                               chop_size, threshold, r_root_dir, run_dirs,
                               fixed_dirs, timestamp)
            step += 1
        elif step == 7:
            print "\n###", step, ". Annotate matching contigs ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                annot_genome_contigs(ref, prot_db_name, fixed_dirs, r_root_dir,
                                     run_id, run_dirs, genomes, project_id,
                                     timestamp, blast_prefs)
            step += 1
        elif step == 8:
            print "\n###", step, ". Align contigs pairwise to reference ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                align_ctg2ref(ref, run_id, timestamp, r_root_dir, run_dirs,
                              genomes, mauve_exec, max_size, chop_mode, mtype)
            step += 1
        elif step == 9:
            print "\n###", step, ". Construct backbone-based scaffolds ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                build_scaffolds(ref, r_root_dir, run_dirs, prox_D, separator,
                                genomes, run_id, timestamp, mtype, ctg_subset)
            step += 1
        elif step == 10:
            print "\n###", step, ". Align constructs pairwise to reference ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                align_cstrct2ref(ref, run_id, timestamp, r_root_dir, run_dirs,
                                 genomes, max_size, chop_mode, mtype, mauve_exec)
            step += 1
        elif step == 11:
            print "\n###", step, ". Generate maps ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                prep_maps(ref, run_id, timestamp, g_select, r_root_dir,
                          run_dirs, genomes, fixed_dirs, segtype, min_size,
                          fct_flags, fct_colors, idpt)
            step += 1
        elif step > 11:
            break
    stop_timestamp = str(datetime.now())
    log_end_run(run_id, base_root, project_id, stop_timestamp)
    print "\n### Nothing more to do! ###\n"
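# Hypothetical invocations of this extended pipeline, based on the flag
# parsing above (script name illustrative, not from the source):
#   python pipeline.py -id run_002 -bm n -ctg include -g my_genome
#   python pipeline.py -filter 3    # implies resume: rerun only step 6 with
#                                   # a contig-filtering threshold of 3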