def execute_megablast(self, fasta1, fasta2):
    """Search ``fasta1`` against every file of ``fasta2`` with megablast.

    Each target is first indexed as a nucleotide database with
    ``formatdb -p F`` and then searched with ``blastn -task megablast``.
    Tabular hits (``-outfmt 6``) go to ``blast_result_<prefix>.tab``, where
    ``<prefix>`` is the part of the target name before its first ``.``.

    :param fasta1: path of the query fasta file
    :param fasta2: iterable of target fasta paths
    """
    import shell_command

    index_template = "formatdb -i %s -p F"
    search_template = ('blastn -task megablast -query %s -db %s -evalue 1e-5 '
                       '-outfmt 6 -out blast_result_%s.tab')

    for target in fasta2:
        prefix = target.split('.')[0]
        index_cmd = index_template % target
        search_cmd = search_template % (fasta1, target, prefix)
        # stdout, stderr and exit codes are collected but not inspected
        _out, _err, _code = shell_command.shell_command(index_cmd)
        _out, _err, _code = shell_command.shell_command(search_cmd)
def blast_all_vs_all(input_fasta): import shell_command cmd1 = 'formatdb -i %s -n seq.db' % input_fasta a, b, c = shell_command.shell_command(cmd1) print a, b, c cmd2 = 'blastp -db seq.db -query %s -outfmt 6 -num_threads 6 -out blastall.out' % input_fasta a, b, c = shell_command.shell_command(cmd2) print a, b, c return "blastall.out"
def detailed_job_status(job_id):
    """Return the lines of the ``bjobs <job_id>`` output that contain ``EXIT``.

    :param job_id: integer job identifier (formatted with ``%d``)
    :return: list of matching output lines, empty when none matches
    """
    bjobs_stdout = shell_command("bjobs %d" % job_id)[0]
    return [row for row in bjobs_stdout.split('\n') if "EXIT" in row]
def run_circos(self, config_file="circos.config", out_prefix="circos"): import shell_command cmd = 'circos -outputfile %s.svg -conf %s' % (out_prefix, config_file) a, b, c = shell_command.shell_command(cmd) print "out", a, "err", b, "code", c if c == 255: raise CircosException("Circos problem, check files... quitting")
def __init__(self, fasta_file, bam_file): self.working_dir = os.getcwd() self.fasta_file = os.path.join(self.working_dir, fasta_file) self.bam_file = os.path.join(self.working_dir, bam_file) self.prefix = bam_file.split('.')[0] self.data_dir = os.path.join(self.working_dir, 'circosviz/data') self.etc_dir = os.path.join(self.working_dir, 'circosviz/etc') self.sam_file = os.path.join(self.data_dir, self.prefix + '.sam') try: os.makedirs(self.data_dir) except: print '%s already exits' % self.data_dir try: os.makedirs(self.etc_dir) except: print '%s already exits' % self.etc_dir # convert bam to sam print 'converting bam to sam' cmd = ' samtools view -h -o %s %s' % (self.sam_file, self.bam_file) print cmd out, err, code = shell_command.shell_command(cmd) if code != 0: print out, err self.template_config = '''
def blast_fasta(blast_flavor, input, blastdb, evalue, nb_hit, local):
    '''
    Run a BLAST search whose tabular output includes taxonomic columns.

    Scientific names and kingdom will only be retrieved if the taxid
    database has been installed locally:
    ftp://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz

    The result file name is derived from ``input`` by rewriting
    ``<name>.<ext>`` into ``blast<name>.tab``.  Nothing is returned.

    :param blast_flavor: BLAST program; also passed as the ``-task`` value
    :param input: query fasta file
    :param blastdb: database to search
    :param evalue: e-value cutoff
    :param nb_hit: value given to ``-max_target_seqs``
    :param local: when true run locally (command echoed, stderr printed on
        failure); otherwise add ``-remote`` and run silently
    '''
    output_name = re.sub(r"([a-zA-Z_0-9]+)\.([a-zA-Z]+)", "blast\\1.tab", input)

    columns = ("6 qgi qacc sgi sacc sscinames sskingdoms staxids evalue nident "
               "pident positive gaps length qstart qend qcovs sstart send "
               "sstrand stitle")
    remote_flag = "" if local else " -remote"
    cmd = ("%s%s -task %s -query %s -out %s -db %s -evalue %s "
           "-max_target_seqs %s -outfmt '%s'") % (blast_flavor, remote_flag,
                                                  blast_flavor, input,
                                                  output_name, blastdb, evalue,
                                                  nb_hit, columns)

    if not local:
        # remote searches are launched without any reporting
        out, err, code = shell_command(cmd)
        return

    print(cmd)
    out, err, code = shell_command(cmd)
    if code != 0:
        print(err)
def run_hifix(fasta_file, silix_nodes, silix_net): import shell_command cmd1 = 'hifix %s %s %s > seq_HFX.fnodes' % (fasta_file, silix_net, silix_nodes) a, b, c = shell_command.shell_command(cmd1) print a, b, c
def generate_circos_data_files(self): os.chdir(self.data_dir) cmd = 'circosviz.pl -i %s -f %s -e 500 -m 3000 -a 125 -b 1000 -p 1000' % ( self.sam_file, self.fasta_file) out, err, code = shell_command.shell_command(cmd) if code != 0: print out, err os.chdir(self.working_dir)
def run_silix(input_fasta, blastall_file, identity=0.9, overlap=0.8):
    """Cluster sequences with silix; results are redirected to ``seq.fnodes``.

    :param input_fasta: fasta file of the sequences
    :param blastall_file: all-against-all blast result file
    :param identity: identity cutoff passed to ``-i``, between 0 and 1
    :param overlap: overlap cutoff passed to ``-r``, between 0 and 1
    :raises ValueError: when ``identity`` or ``overlap`` is outside [0, 1]
    """
    # validate before importing or running anything so bad input fails fast
    if not (0 <= identity <= 1 and 0 <= overlap <= 1):
        raise ValueError('Identity and overlap should be between 0 and 1')

    import shell_command

    cmd = 'silix %s %s -f FAM -i %s -r %s --net > seq.fnodes' % (
        input_fasta, blastall_file, identity, overlap)
    shell_command.shell_command(cmd)
def execute_blast(self, fasta, database): import shell_command cmd1 = "formatdb -i %s -p T" % database cmd2 = 'blastp -query %s -db %s -evalue 1e-5 -num_threads 8 -outfmt 6 -out blast_result.tab' % ( fasta, database) #a, b, c = shell_command.shell_command(cmd1) a, b, c = shell_command.shell_command(cmd2) print a, b, c
def aafasta2phylogeny(aa_fasta, phylo=False):
    """Align a protein fasta with mafft and optionally build a RAxML tree.

    The alignment is written to ``<prefix>_mafft.fa`` where ``<prefix>`` is
    the part of ``aa_fasta`` before its first ``.``.  With ``phylo`` set,
    RAxML is run twice (tree search, then ``-f J`` SH-like support) and the
    support values of ``RAxML_fastTreeSH_Support.shtest_<prefix>`` are
    rewritten as NHX ``support`` tags before being loaded with ete3 and
    saved to ``RAxML_fastTreeSH_Support.shtest_<prefix>.nwk``.

    :param aa_fasta: protein fasta file
    :param phylo: also reconstruct the phylogeny when true
    :return: the ete3 ``Tree`` when ``phylo`` is true, otherwise ``None``
    :raises RuntimeError: when mafft exits with a non-zero code
    :raises SystemExit: with status 1 when RAxML exits with a non-zero code
    """
    import re
    import sys
    import shell_command
    from ete3 import Tree

    output_prefix = aa_fasta.split('.')[0]
    align_name = output_prefix + '_mafft.fa'

    print('aligning with mafft...')
    cmd_mafft = 'mafft --anysymbol --amino --auto --maxiterate 1000 %s > %s' % (
        aa_fasta, align_name)
    out, err, code = shell_command.shell_command(cmd_mafft)
    if code != 0:
        # a plain string cannot be raised; wrap stderr in a real exception
        raise RuntimeError(err)

    if not phylo:
        return None

    print('reconstructing phylogeny with RAxML...')
    output_tree_name = 'RAxML_result.%s' % output_prefix
    output_shtree_name = 'shtest_%s' % output_prefix
    cmd_raxml = 'raxml -m PROTGAMMALG -p 12345 -s %s -n %s -c 4 -T 8;' \
                'raxml -f J -m PROTGAMMALG -s %s -p 12345 -t %s -n %s -T 8' % (
                    align_name, output_prefix, align_name, output_tree_name,
                    output_shtree_name)
    out, err, code = shell_command.shell_command(cmd_raxml)
    if code != 0:
        print(out, code)
        print(err)
        # exit with a failure status so that callers/shells see the error
        sys.exit(1)

    support_file = "RAxML_fastTreeSH_Support.shtest_%s" % output_prefix
    with open(support_file) as handle:
        raw_newick = handle.read()
    # ":0.123[95]" -> ":0.123[&&NHX:support=95]"
    nw = re.sub(r":(\d+\.\d+)\[(\d+)\]", ":\\1[&&NHX:support=\\2]", raw_newick)

    t = Tree(nw, format=0)
    t.write(outfile=support_file + ".nwk", format=0)
    return t
def reorder_contigs_with_mauve(reference, contigs, output_folder="mauve_ordering"):
    """Order draft contigs against a reference with Mauve's ContigOrderer.

    ``Mauve.jar`` is located with ``which`` and resolved with ``readlink -f``
    before being put on the Java class path.  A message naming the command
    is written to stdout when it exits with a non-zero code.

    :param reference: reference genome file
    :param contigs: draft contigs file
    :param output_folder: folder receiving the ContigOrderer output
    """
    import shell_command

    locate_jar = 'link=$(readlink -f `which Mauve.jar`)'
    run_orderer = ('java -Xmx500m -cp $link org.gel.mauve.contigs.ContigOrderer '
                   '-output %s -ref %s -draft %s') % (output_folder, reference,
                                                      contigs)
    cmd = locate_jar + '; ' + run_orderer
    out, err, code = shell_command.shell_command(cmd)
    if code == 0:
        return
    sys.stdout.write('problem with command: \n %s\n' % cmd)
def execute_promer(self, fasta1, fasta2, out_file="out.coords", algo="nucmer"):
    """Align ``fasta1`` to every file of ``fasta2`` with nucmer or promer.

    After each alignment ``show-coords`` converts ``out.delta`` into
    ``<prefix>.coords``, ``<prefix>`` being the part of the target name
    before its first ``.``.  Nothing is run when ``algo`` is neither
    ``'nucmer'`` nor ``'promer'``.

    :param fasta1: reference fasta file
    :param fasta2: iterable of query fasta files
    :param out_file: not used by this method
    :param algo: ``'nucmer'`` or ``'promer'``
    """
    import shell_command

    if algo == 'nucmer':
        align_template = 'nucmer -mum -b 200 -c 65 -g 90 -l 20 %s %s'
    elif algo == 'promer':
        align_template = 'promer --mum -l 5 %s %s'
    else:
        align_template = None
    coords_template = 'show-coords -T -r -c -L 100 -I 30 out.delta > %s.coords'

    for target in fasta2:
        if align_template is None:
            continue
        align_cmd = align_template % (fasta1, target)
        coords_cmd = coords_template % target.split('.')[0]
        _out, _err, _code = shell_command.shell_command(align_cmd)
        _out, _err, _code = shell_command.shell_command(coords_cmd)
def download_seq_entrez(accession, multifasta, seq_type, append):
    """Download fasta sequences from NCBI with ``Entrez.efetch``.

    :param accession: accessions to download
    :param multifasta: ``""`` to write one file per accession, otherwise the
        prefix of a single ``<multifasta>.fasta`` file
    :param seq_type: Entrez database name (passed as ``db``)
    :param append: ``""`` or the path of an existing file that is placed in
        front of the downloaded sequences in ``<multifasta>.fasta``
    """

    def fetch_to_file(path, ids):
        # fetch first so that a failed request leaves no empty file behind,
        # and close both handles whatever happens
        handle = Entrez.efetch(db=seq_type, id=ids, rettype="fasta")
        try:
            with open(path, "w") as output_handle:
                output_handle.write(handle.read())
        finally:
            handle.close()

    # one fasta file per accession
    if multifasta == "":
        for one_accession in accession:
            output_name = re.sub(r"([a-zA-Z_0-9]+)\.([a-zA-Z]+)", "\\1.fasta",
                                 one_accession)
            fetch_to_file(output_name, one_accession)
        return

    # single multifasta file, nothing to prepend
    if append == "":
        fetch_to_file("%s.fasta" % multifasta, accession)
        return

    # single multifasta file: existing file first, downloaded records after
    fetch_to_file("%s_only.fasta" % multifasta, accession)
    cmd = "cat %s %s_only.fasta > %s.fasta" % (append, multifasta, multifasta)
    print(cmd)
    shell_command(cmd)
    cmd = "rm %s_only.fasta" % multifasta
    print(cmd)
    shell_command(cmd)
def is_job_running(job_id):
    """Query ``bjobs`` and report on the state of a job.

    The statuses ``EXIT``, ``DONE``, ``PEND`` and ``RUN`` found in the
    ``bjobs <job_id>`` output are printed and examined.

    :param job_id: integer job identifier (formatted with ``%d``)
    :return: ``False`` when a ``PEND`` status is present; ``True`` otherwise,
        including when the job could not be queried
    :raises Exception: when an ``EXIT`` status is present
    """
    try:
        merged_text = " ".join(
            shell_command("bjobs %d" % job_id)[0].split('\n'))
        stat = re.findall("EXIT|DONE|PEND|RUN", merged_text)
        print(stat)
        # only membership is tested below, so a set is sufficient
        all_status = set(stat)
    except Exception:
        # "except Exception" lets KeyboardInterrupt/SystemExit propagate
        print("unknown job ID")
        return True

    if "EXIT" in all_status:
        raise Exception('bsub command failed with status: EXIT')
    return "PEND" not in all_status
def execute_circos(self): os.chdir(self.etc_dir) self.config_path = 'circosviz.config' self.plot_path = 'circosviz_plot.svg' with open(self.config_path, 'w') as f: f.write(self.template_config) cmd = 'circos -noparanoid -conf %s -outputfile %s' % (self.config_path, self.plot_path) out, err, code = shell_command.shell_command(cmd) if code != 0: print out, err os.chdir(self.working_dir)
def convert_leaf_labels(input_tree,
                        biodb_name,
                        accession2taxon=False,
                        taxon2accession=False,
                        sqlite=False):
    """Convert a newick tree to phyloxml and rename its leaves.

    ``newick2phyloxml.pl`` is run on ``input_tree``; the ``.phyloxml`` file
    named after the part of ``input_tree`` before its first ``.`` is then
    looked up in the current directory and handed to one of the
    ``manipulate_trees`` converters, which writes ``<prefix>_renamed.tree``.

    :param input_tree: newick tree file
    :param biodb_name: database name forwarded to the converter
    :param accession2taxon: use ``convert_tree_accession2taxon_id``
    :param taxon2accession: use ``convert_tree_taxon_id2accession`` (only
        considered when ``accession2taxon`` is false)
    :param sqlite: forwarded to the converter; with both flags false
        ``convert_tree_taxon2genome`` is used
    """
    import shell_command
    import manipulate_trees
    import os

    print(accession2taxon, taxon2accession)

    conversion_cmd = 'newick2phyloxml.pl -i %s' % input_tree
    prefix = input_tree.split('.')[0]
    stdout, stderr, exit_code = shell_command.shell_command(conversion_cmd)
    print(stdout, stderr, exit_code)

    current_dir = os.getcwd()
    input_file = os.path.join(current_dir, prefix + '.phyloxml')
    output_file = os.path.join(current_dir, prefix + '_renamed.tree')
    print(input_file)
    print(output_file)

    if accession2taxon:
        print("sqlite")
        converter = manipulate_trees.convert_tree_accession2taxon_id
    elif taxon2accession:
        converter = manipulate_trees.convert_tree_taxon_id2accession
    else:
        converter = manipulate_trees.convert_tree_taxon2genome
    converter(biodb_name, input_file, output_file, sqlite=sqlite)
def get_job_status(job_id):
    """Summarise the ``bjobs`` statuses of a job (or job array).

    The statuses ``EXIT``, ``DONE``, ``PEND`` and ``RUN`` are searched in
    the ``bjobs <job_id>`` output and combined as follows:

    * only ``EXIT`` present           -> ``"EXIT"``
    * ``RUN`` and ``EXIT`` present    -> ``"partial EXIT"``
    * ``RUN`` or ``PEND`` present     -> ``"RUN"``
    * ``EXIT`` present (no RUN/PEND)  -> ``"partial DONE"``
    * anything else                   -> ``"DONE"``

    :param job_id: integer job identifier (formatted with ``%d``)
    :return: one of the strings listed above
    :raises Exception: ``"unknown job ID"`` when the job cannot be queried
    """
    try:
        merged_text = " ".join(
            shell_command("bjobs %d" % job_id)[0].split('\n'))
        all_status = set(re.findall("EXIT|DONE|PEND|RUN", merged_text))
    except Exception:
        # "except Exception" lets KeyboardInterrupt/SystemExit propagate
        raise Exception("unknown job ID")

    # return exit only if all jobs (in case of job array) were exited
    if all_status == {"EXIT"}:
        return "EXIT"
    if "RUN" in all_status and "EXIT" in all_status:
        return "partial EXIT"
    if "RUN" in all_status or "PEND" in all_status:
        return "RUN"
    if "EXIT" in all_status:
        return "partial DONE"
    return "DONE"
def relaunch_vital_it_job(status, cmd_data):
    """Resubmit a killed job according to its failure status.

    * ``"MEMKILL"``: resubmitted with the memory limit (``-M`` divided by
      1000000) raised by 2 GB.
    * ``"LIMITKILL"``: resubmitted with the same memory in queue ``long``.
    * ``"SUCCESS"``: nothing is done.
    * ``"UNKNOWN"``: a message is printed.

    After a resubmission the previous log (``-o``) and error (``-e``) files
    are removed with ``rm``.

    :param status: one of the status strings above
    :param cmd_data: mapping holding ``"cmd"``, ``"-M"``, ``"-J"``, ``"-o"``
        and ``"-e"``
    :raises IOError: for any other status
    """
    import generate_bsub_file
    import shell_command
    import sys

    def resubmit(mem_in_gb, **extra):
        script = generate_bsub_file.BSUB_script(command=cmd_data["cmd"],
                                                mem_in_GB=mem_in_gb,
                                                name=cmd_data["-J"],
                                                log_file=cmd_data["-o"],
                                                error_file=cmd_data["-e"],
                                                **extra)
        generate_bsub_file.run_job(script)

    def remove_previous_logs():
        for key in ("-o", "-e"):
            shell_command.shell_command("rm %s" % cmd_data[key])

    if status == "SUCCESS":
        return

    if status == "UNKNOWN":
        print('status unknown for %s' % cmd_data)
        return

    if status == "MEMKILL":
        mem = int(cmd_data["-M"]) / 1000000 + 2
        resubmit(mem)
        sys.stdout.write(
            "Job %s relaunched with increased memory limit: %s GB\n" %
            (cmd_data["-J"], mem))
        remove_previous_logs()
        return

    if status == "LIMITKILL":
        resubmit(int(cmd_data["-M"]) / 1000000, queue="long")
        sys.stdout.write("Job %s relaunched with queue long" % cmd_data["-J"])
        remove_previous_logs()
        return

    raise IOError("Uknwon LFS error status %s\n" % status)
def insert_blast_table(input_file, table_name):
    """Load a tabular blast result into ``temp_tables.<table_name>``.

    The table is created if needed through the ``chlamydia_12_15`` biosql
    connection (a failure is only reported on stdout), then the file is
    loaded with the ``mysql`` client as root, using the password found in
    the ``SQLPSW`` environment variable.  The full command line, password
    included, is echoed to stdout, followed by the command output, error
    stream and exit code.

    :param input_file: blast result file, relative to the current directory
    :param table_name: name of the table inside ``temp_tables``
    """
    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db('chlamydia_12_15')

    # query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
    column_definitions = [
        'query_id varchar(100)',
        'subject_id varchar(100)',
        'identity float',
        'alignment_length int',
        'mismatches int',
        'gap_opens int',
        'q_start int',
        'q_end int',
        's_start int',
        's_end int',
        'evalue float',
        'bit_score float',
    ]
    sql_profiles_table = ('CREATE TABLE IF NOT EXISTS temp_tables.%s (' % table_name
                          + ' ' + ',  '.join(column_definitions) + ')')
    try:
        print(sql_profiles_table)
        server.adaptor.execute(sql_profiles_table)
    except:
        print('problem creating the sql table')

    import shell_command
    import os

    path = os.path.join(os.getcwd(), input_file)
    sqlpsw = os.environ['SQLPSW']
    cmd = 'mysql -uroot -p%s biosqldb -e \'LOAD DATA LOCAL INFILE "%s" INTO TABLE temp_tables.%s;\'' % (
        sqlpsw, path, table_name)
    print(cmd)
    for stream in shell_command.shell_command(cmd):
        print(stream)
def launch(self):
    """Submit this script with ``bsub`` and return the job identifier.

    ``str(self)`` is written to a ``.sub`` file whose name comes from
    ``id_generator(24)``; the file is then fed to ``bsub`` on its standard
    input.  The command, its output, error stream and exit code are printed.

    :return: the first run of digits of the ``bsub`` output, as an ``int``
    :raises Exception: when ``bsub`` exits with a non-zero code
    """
    # submission file written next to the caller's working files
    file_name = id_generator(24) + ".sub"
    with open(file_name, "w") as submission_file:
        submission_file.write(str(self))

    command = 'bsub < ' + file_name
    print('command:', command)
    stdout, stderr, return_code = shell_command(command)
    print("out:", stdout, "err:", stderr, "code:", return_code)

    if return_code != 0:
        print(command)
        raise Exception('bsub submission command failed with exit status: ' +
                        str(return_code))

    print("Job submitted:", stdout)
    job_id = re.search(r"\d+", stdout).group(0)
    print(job_id)
    return int(job_id)
def __init__(self, genbank, database, tabulated_blast_file=False, samtools_depth=None, gc=True, accession2classification=False, identity_cutoff=50, query_coverage_cutoff=0.6, evalue_cutoff=0.00005, execute_blast=True, blast_tab_file=False): import gbk2circos import shell_command from Bio import SeqIO self.reference_records = [ i for i in SeqIO.parse(open(genbank), 'genbank') ] self.contig_list = [record.name for record in self.reference_records] self.contig2gc_percent = circos_utils.contig2gc(self.reference_records) if samtools_depth: self.contig2median_depth = circos_utils.get_median_contig_coverage( self.contig_list, samtools_depth) else: self.contig2median_depth = False self.records2locus_tag2start_stop = circos_utils.records2locus_tag2start_stop( self.reference_records) # coordinates to get concatenated chromosome coord self.contigs_add = circos_utils.get_contigs_coords( self.reference_records) if not blast_tab_file: self.blast_tab_file = "blast_result.tab" else: self.blast_tab_file = blast_tab_file self.fasta_aa = genbank.split(".")[0] + '.faa' self.fasta_nucl = genbank.split(".")[0] + '.fna' print 'extractint aa and fna sequences...' a, b, c = shell_command.shell_command("gbk2fna.py -i %s -o %s" % (genbank, self.fasta_nucl)) a, b, c = shell_command.shell_command("gbk2faa.py -i %s -f -o %s" % (genbank, self.fasta_aa)) self.fasta_aa_records = [ i for i in SeqIO.parse(open(self.fasta_aa), 'fasta') ] self.fasta_nucl_records = [ i for i in SeqIO.parse(open(self.fasta_nucl), 'fasta') ] #else: # self.blast_tab_file = tabulated_blast_file if execute_blast: print 'executing BLASTP...' 
self.execute_blast(self.fasta_aa, database) self.circos_reference = gbk2circos.Circos_config( "circos_contigs.txt", show_ideogram_labels="yes", radius=0.7, show_tick_labels="yes", show_ticks="yes") self.first_contig, self.last_contig = circos_utils.get_karyotype_from_gbk_or_fasta_single_ref( self.reference_records, out="circos_contigs.txt") class_colors = self.blast2barplot( self.fasta_aa_records, database, self.blast_tab_file, bar_file="circos.bar", accession2classification=accession2classification, identity_cutoff=identity_cutoff, query_coverage_cutoff=query_coverage_cutoff, evalue_cutoff=evalue_cutoff) countour_col = 'black' if not class_colors: histo_colors = 'orange' self.last_track = 0.7 else: histo_colors = '' for class_col in class_colors: print class_col, class_colors[class_col] histo_colors += '%s,' % class_colors[class_col] histo_colors = histo_colors[0:-1] self.circos_reference.add_plot("best_hit.bar", type="histogram", r0="0.65r", r1="0.67r", color=histo_colors, fill_color=histo_colors, thickness=0, min=0, max=50) self.last_track = 0.65 add = ''' <axes> <axis> spacing = 0.1r color = lgrey thickness = 2 </axis> </axes> ''' self.circos_reference.add_plot("circos.bar", type="histogram", r0="0.7r", r1="0.9r", color=countour_col, fill_color=histo_colors, thickness=1, min="0", max="50", rules=add) if samtools_depth: #for i, depth_file in enumerate(samtools_depth): all_contigs_median = circos_utils.samtools_depth2circos_data( samtools_depth, self.contigs_add, 1) self.add_samtools_depth_track( 'circos_samtools_depth_1.txt', lower_cutoff=int(all_contigs_median) / 2, top_cutoff=int(all_contigs_median) * 2) if gc: import GC from Bio import SeqIO fasta_records = list(SeqIO.parse(self.fasta_nucl, 'fasta')) out_var_file = ('circos_GC_var.txt') out_skew_file = ('circos_GC_skew.txt') f = open(out_var_file, 'w') g = open(out_skew_file, 'w') out_var = '' out_skew = '' for record in fasta_records: # this function handle scaffolds (split sequence when encountering NNNNN 
regions) out_var += GC.circos_gc_var( record, 1000, shift=self.contigs_add[record.name][0]) out_skew += GC.circos_gc_skew( record, 1000, shift=self.contigs_add[record.name][0]) #print out_skew f.write(out_var) g.write(out_skew) f.close() g.close() rule = """<rule> condition = var(value) < 0 fill_color = lred color = red </rule> <rule> condition = var(value) > 0 fill_color = lblue color = blue </rule> """ rule2 = """<rule> condition = var(value) < 0 fill_color = lgreen color = green </rule> <rule> condition = var(value) > 0 fill_color = lblue color = blue </rule> """ conditions = self.circos_reference.template_rules % (rule) self.circos_reference.add_plot('circos_GC_skew.txt', fill_color="green", r1="%sr" % (self.last_track - 0.02), r0="%sr" % (self.last_track - 0.1), type="line", rules=conditions) conditions = self.circos_reference.template_rules % (rule2) self.circos_reference.add_plot('circos_GC_var.txt', fill_color="green", r1="%sr" % (self.last_track - 0.12), r0="%sr" % (self.last_track - 0.2), type="line", rules=conditions) self.last_track = self.last_track - 0.12 self.config = self.circos_reference.get_file() self.brewer_conf = """
def main(input_reference, input_queries_folder, blast_file, mlst_scheme, input_gbk, skip_parsnp=False, skip_blast=False, input_tree=None, skip_mlst=False, check_overlap=False, id_cutoff=80, blast_type='tblastn', reference_accession='-', blast_filter=False, show_identity_values=True, accession2description=False): import shell_command import os import sys import glob from Bio import SeqIO sys.stdout.write('Building tree using parsnp...\n') wd = os.getcwd() ordered_queries = [ record.name for record in SeqIO.parse(blast_file, 'fasta') ] #print 'input fasta folder', input_queries_folder fasta_folder = os.path.abspath(input_queries_folder) reference_file = os.path.abspath(input_reference) if blast_filter: accession2hit_filter = get_accession_filter_from_blast_list( blast_filter) else: accession2hit_filter = False #print 'fasta folder', fasta_folder pp = fasta_folder + '/*fna' #print pp fasta_files = glob.glob(pp) if len(fasta_files) == 0: pp = fasta_folder + '/*ffn' #print pp fasta_files = glob.glob(pp) if len(fasta_files) == 0: raise ('could not find fasta files') #print 'fasta files', fasta_files reference_phylogeny_folder = os.path.join(wd, 'reference_parsnp_phylogeny') reference_phylogeny = os.path.join(reference_phylogeny_folder, 'parsnp_edit.tree') out_mlst = os.path.join(wd, 'mlst_results/mlst.tab') if not skip_parsnp and not input_tree: cmd = 'parsnp -r %s -d %s -p 6 -c -o parsnp_tree' % (reference_file, fasta_folder) #print cmd out, err, code = shell_command.shell_command(cmd) print code, err print out if not os.path.exists(reference_phylogeny_folder): os.mkdir(reference_phylogeny_folder) sys.stdout.write('Editing parsnp phylogeny...\n') parsnp_raw_phylogeny = os.path.join(wd, 'parsnp_tree/parsnp.tree') cmd = 'cat %s | sed "s/.fna//g"> %s' % (parsnp_raw_phylogeny, reference_phylogeny) out, err, code = shell_command.shell_command(cmd) #print code, err cmd = '''sed -i "s/\'//g" %s''' % reference_phylogeny out, err, code = shell_command.shell_command(cmd) #print 
cmd, '##################' #print code, err cmd = '''sed -i "s/.ref//" %s''' % (reference_phylogeny) out, err, code = shell_command.shell_command(cmd) #print code, err os.chdir(wd) if not skip_mlst: sys.stdout.write('Identifying mlst...\n') if not os.path.exists(os.path.join(wd, 'mlst_results')): os.mkdir(os.path.join(wd, 'mlst_results')) all_fasta = ' '.join(fasta_files) cmd = 'mlst --quiet --nopath --scheme %s %s > %s' % ( mlst_scheme, all_fasta, out_mlst) out, err, code = shell_command.shell_command(cmd) cmd = 'sed -i "s/.fna//g" %s' % out_mlst out, err, code = shell_command.shell_command(cmd) cmd = 'sed -i "s/.ffn//g" %s' % out_mlst out, err, code = shell_command.shell_command(cmd) #print err, code if input_tree: reference_phylogeny = input_tree sys.stdout.write('Blasting %s...\n' % blast_file) blast_file_fullpath = os.path.join(wd, os.path.basename(blast_file)) blastp_folder = os.path.join(wd, 'blastp_results') if not os.path.exists(blastp_folder): os.mkdir(blastp_folder) os.chdir(input_queries_folder) fasta_file_name2fasta_header = {} blast_best_hit_results = [] for genome_file in fasta_files: file_name = os.path.basename(genome_file).split('.')[0] header_name = SeqIO.read(genome_file, 'fasta').name fasta_file_name2fasta_header[file_name] = header_name out_file = 'blast_' + os.path.basename(genome_file).split('.')[0] outpath = os.path.join(blastp_folder, out_file + '.tab') best_hit_path = os.path.join(blastp_folder, 'uniq_' + out_file + '.tab') blast_best_hit_results.append(best_hit_path) if not skip_blast: cmd = 'formatdb -i %s -p F' % genome_file out, err, code = shell_command.shell_command(cmd) if blast_type == 'tblastn': from Bio.Blast.Applications import NcbitblastnCommandline #print err, code out, err, code = shell_command.shell_command( 'export BLASTDB=$BLASTDB:%s' % blastp_folder) blastp_cline = NcbitblastnCommandline( query=blast_file_fullpath, db=genome_file, evalue=0.001, outfmt=6, out=outpath) elif blast_type == 'blastn': from 
Bio.Blast.Applications import NcbiblastnCommandline blastp_cline = NcbiblastnCommandline(query=blast_file_fullpath, db=genome_file, evalue=0.001, outfmt=6, out=outpath) else: raise ('unsupported blast type') stdout, stderr = blastp_cline() result_handle = open(outpath, 'r') best_hit_handle = open(best_hit_path, 'w') hit_list = [] for line in result_handle: if line.split('\t')[0] in hit_list: continue else: hit_list.append(line.split('\t')[0]) best_hit_handle.write(line) best_hit_handle.close() os.chdir(wd) print fasta_file_name2fasta_header if accession2description: id2description = {} with open(accession2description, 'r') as f: for row in f: data = row.rstrip().split('\t') id2description[data[0]] = data[1] else: id2description = gbk2accessiontodefinition.get_coressp(input_gbk) #else: # IOError('either provide id2description og gbk files') accession2st_type = parse_mlst_results(out_mlst) #print reference_phylogeny plot_blast_result(reference_phylogeny, blast_best_hit_results, id2description, accession2st_type, check_overlap, ordered_queries, fasta_file_name2fasta_header, id_cutoff, reference_accession=reference_accession, accession2hit_filter=accession2hit_filter, show_identity_values=show_identity_values)
def prokka_reannotation(seq_record_list, compare=False): ''' Reannotate seq record using prokka INPUT: list of seqrecord objects (one/sequence to reannotate) ''' import re import datetime from Bio.SeqRecord import SeqRecord from Bio import SeqIO import gbk_check all_locus = [] l = open('reannotation_prokka_log.txt', 'w') l.write( 'reference_name\tnew_name\taccession\tref_locus_tag\tnew_locus_tag\tref_n_CDS\tnew_n_CDS\tn_CDS_identical\n' ) #print 'reference_name\tnew_name\taccession\tref_locus_tag\tnew_locus_tag\tref_n_CDS\tnew_n_CDS\tn_CDS_identical' print 'reannotation:' for i, record in enumerate(seq_record_list): print i, record reanotated_gbk_list = [] for record in seq_record_list: print '####### one record #########' print record print print '####### features ###########' print record.features print '########## length ##########', len(record) import shell_command assert type(record) == SeqRecord # raise IOError('Wrong input, only scaffolded genomes should be reannotated with this script') record_annotations = record.annotations try: record_annotations[ 'comment'] += '\nGenome reannotated using PROKKA version 1.1' except KeyError: record_annotations[ 'comment'] = 'Genome reannotated using PROKKA version 1.1' record_name = record.name record_id = record.id record_description = gbk_check.clean_description(record.description) record_dbxrefs = record.dbxrefs # create locus tag based on genus and species name organism = re.sub('\'', '', record_annotations['source']).split(' ') if len(organism) > 2: locus_tag = "P%s%s%s" % (organism[0][0], organism[1][0:2], organism[2][0]) elif len(organism) == 2: locus_tag = "P%s%s" % (organism[0][0:2], organism[1][0]) else: print record # check if the new locus_tag is unique, otherwise add a count i = 2 no_match = False if locus_tag in all_locus: #print record while no_match == False: locus_tag = locus_tag + str(i) #print locus_tag if locus_tag in all_locus: i += 1 else: no_match = True all_locus.append(locus_tag) else: 
all_locus.append(locus_tag) # wite fasta and annotate it using prokka if str(record.seq) == len(record.seq) * 'N': print 'No sequence in record %s' % record.id continue with open('temp_genome.fna', 'w') as f: f.write('>temp_seq\n%s' % record.seq) cmd = 'prokka --force --kingdom Bacteria --compliant --centre CHUV --locustag %s --outdir %s -genus temp_genus -strain temp_strain temp_genome.fna' % ( locus_tag, locus_tag) today = datetime.date.today() date = today.strftime('%m%d%Y') prokka_genbank = '%s/%s_%s.gbk' % (locus_tag, locus_tag, date) cmd2 = 'prokka --kingdom Bacteria --compliant --proteins proteins.faa --locustag Citr -genus Citronella -strain virus citro_spades_123_1000.fa' out, err, n = shell_command.shell_command(cmd) print out reanotated_gbk = SeqIO.read(prokka_genbank, "genbank") reanotated_gbk.id = record_id reanotated_gbk.name = record_name reanotated_gbk.annotations = record_annotations reanotated_gbk.description = record_description reanotated_gbk.dbxrefs = record_dbxrefs reanotated_gbk.features[0] = record.features[0] #count number of identical ORF if compare: ref_CDS = 0 for feature in record.features: if feature.type == 'CDS': if ref_CDS == 0: ref_locus_tag = feature.qualifiers['locus_tag'][ 0].split('_')[0] ref_CDS += 1 new_CDS = 0 identical_CDS = 0 # count number of identical features (exact same location) for new_feature in reanotated_gbk.features: if new_feature.type == 'CDS': new_CDS += 1 for ref_feature in record.features: if ref_feature.type == 'CDS': if ref_feature.qualifiers[ 'translation'] == new_feature.qualifiers[ 'translation']: identical_CDS += 1 break #accession = reanotated_gbk[0].annotations["accessions"][0] #print 'Ref CDS', ref_CDS, 'New CDS', new_CDS, 'Identical CDS', identical_CDS #'reference_file_name\tnew_file_name\taccession\tref_locus_tag\tnew_locus_tag\tref_n_CDS\tnew_n_CDS\tn_CDS_identical' try: l.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (accession, ref_locus_tag, locus_tag, ref_CDS, new_CDS, identical_CDS)) except: pass 
#print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (genbank_file, out_name, accession, ref_locus_tag, locus_tag, ref_CDS, new_CDS, identical_CDS) reanotated_gbk_list.append(reanotated_gbk) return reanotated_gbk_list
# Write the updated records as EMBL, then rewrite some header rows into
# "<input prefix>_locus.embl" and delete the temporary file.
import shell_command

out_name = args.input.split(".")[0] + "_locus.tmp"
embl_name = args.input.split(".")[0] + "_locus.embl"

with open(out_name, "w") as handle2:
    SeqIO.write(updated_records, handle2, "embl")

# check the feature types before creating the final file
for feature_type in set(type_list):
    if feature_type not in ["CDS", "tRNA", "rRNA", 'source']:
        raise IOError('Unexpected feature type: %s' % feature_type)

# replace specific rows
# NOTE(review): EMBL line codes are normally followed by three spaces
# ("ID   ", "AC   "); the literals below use a single space - verify them
# against the text that SeqIO actually writes.
i = 0
with open(out_name) as f, open(embl_name, "w") as out_handle:
    match = False
    for line in f:
        if match:
            # line following a rewritten AC row: append the contig AC row
            line += 'AC * _Contig_%s_%s\nXX\n' % (i + 1, contig_list[i])
            i += 1
            match = False
        if 'ID ' in line:
            line = 'ID XXX; XXX; linear; XXX; XXX; XXX; XXX.\n'
        if 'AC XXX;' in line:
            line = 'AC ;\n'
            match = True
        if 'RA XXX;' in line:
            line += 'RT ;\n'
        if 'OS .' in line or 'OC .' in line:
            continue
        out_handle.write(line)

shell_command.shell_command('rm %s' % out_name)
def gc_coverage_plot(samtool_depth_file,
                     contigs_file,
                     blast_file=False,
                     column1=1,
                     column2=2,
                     main=False,
                     highlight=False):
    """Draw GC content versus sequencing depth bubble plots with R.

    ``infoseq`` writes the name, length and GC percentage of every contig
    to ``/tmp/gc.tab``; R then takes the median depth per contig from the
    (optionally gzipped) samtools depth file, writes the merged table to
    ``gc_coverage_table.tab`` and draws two plots:

    * ``gc_cov_buble.svg``: all contigs;
    * ``gc_cov_buble_2m.svg``: contigs whose depth is below four times the
      depth of the longest contig.

    Without ``blast_file`` the bubbles are red and the contigs listed in
    ``highlight`` are redrawn in blue and labelled.  With ``blast_file``
    the bubbles are coloured by the taxon of each contig.

    :param samtool_depth_file: samtools depth file, plain or gzipped
    :param contigs_file: fasta file of the contigs
    :param blast_file: optional tab-separated file; columns 2 and 6 are
        read as contig name and taxon
    :param column1: not used by this function
    :param column2: not used by this function
    :param main: plot title, defaults to the depth file base name
    :param highlight: optional file, contig names in its first column
    """
    import os
    import shell_command

    if not main:
        main = os.path.basename(samtool_depth_file)

    out, err, code = shell_command.shell_command(
        "infoseq -auto -only -Name -length -pgc %s > /tmp/gc.tab" %
        contigs_file)
    print(out)
    print(err)
    print(code)

    if highlight:
        highlight_code = """
        gc_coverage_table$color <- rep(rgb(1, 0, 0,0.5), length(gc_coverage_table[,1]))
        highlight_table <- read.table("%s", header=FALSE)
        m <- match(highlight_table[,1], gc_coverage_table$Name)
        gc_coverage_subset <- gc_coverage_table[m,]
        print("subset")
        print(m)
        gc_coverage_table[m,]$color<-rgb(0, 0, 1,0.5)
        """ % highlight
        highlight_code2 = """
        m <- match(highlight_table[,1], gc_coverage_table_2m$Name)
        print("subset m2")
        print(m)
        gc_coverage_subset2 <- gc_coverage_table_2m[m,]
        """
    else:
        highlight_code = ''
        highlight_code2 = ''
    print('high', highlight)

    if not blast_file:
        robjects.r("""
        #library(Cairo)
        library(R.utils)
        if (isGzipped("%s")){
            print('Gzipped file')
            all_depth <- read.table(gzfile('%s'), header=FALSE)
        }else{
            print('Not Gzipped')
            all_depth <- read.table('%s', header=FALSE)
        }
        contigs_depth<- aggregate(all_depth["V3"],b=all_depth[c("V1")],FUN=median)
        contigs_gc <- read.table("/tmp/gc.tab", header=TRUE)
        gc_coverage_table <-cbind(contigs_gc,coverage=contigs_depth[match(contigs_gc$Name, contigs_depth$V1),2])
        #w<-which(gc_coverage_table$Length >=1000)
        #gc_coverage_table <- gc_coverage_table[w,]
        write.table(gc_coverage_table, 'gc_coverage_table.tab', sep="\t", row.names=F)
        %s
        svg("gc_cov_buble.svg", width = 12, height = 12)
        symbols(x=gc_coverage_table[,3], y= gc_coverage_table[,4], circles=gc_coverage_table[,2], inches=1/3, ann=T, bg=rgb(1, 0, 0,0.5), fg=rgb(1, 0, 0,0.5), main="%s", xlab="GC(%%)", ylab="Sequencing depth")
        if (any("gc_coverage_subset" %%in%% ls())) {
            symbols(x=gc_coverage_table[,3], y= gc_coverage_table[,4], circles=gc_coverage_table[,2], inches=1/3, ann=T, bg=gc_coverage_table$color, fg=gc_coverage_table$color, add = TRUE)
            l <- gsub('(^[^_]+_[^_]+)_(.*)$', '\\\\1', gc_coverage_subset$Name)
            text(x=gc_coverage_subset[,3], y=gc_coverage_subset[,4], labels = l)
        }else{
            print ('a')
        }
        dev.off()
        cov_biggest <- gc_coverage_table[which(gc_coverage_table$Length==max(gc_coverage_table$Length)),4]
        print('cov biggest:')
        print(cov_biggest)
        w <- which(gc_coverage_table[,4]< (4*cov_biggest))
        gc_coverage_table_2m <- gc_coverage_table[w,]
        %s
        svg("gc_cov_buble_2m.svg", width = 12, height = 12)
        symbols(x=gc_coverage_table_2m[,3], y= gc_coverage_table_2m[,4], circles=gc_coverage_table_2m[,2], inches=1/3, ann=T, bg=rgb(1, 0, 0,0.5), fg=rgb(1, 0, 0,0.5), main="%s", xlab="GC(%%)", ylab="Sequencing depth")
        if (any("gc_coverage_subset" %%in%% ls())) {
            symbols(x=gc_coverage_table_2m[,3], y= gc_coverage_table_2m[,4], circles=gc_coverage_table_2m[,2], inches=1/3, ann=T, bg=gc_coverage_table_2m$color, fg=gc_coverage_table_2m$color, add = TRUE)
            l <- gsub('(^[^_]+_[^_]+)_(.*)$', '\\\\1', gc_coverage_subset2$Name)
            text(x=gc_coverage_subset2[,3], y=gc_coverage_subset2[,4], labels = l)
        }else{
            print ('a')
        }
        dev.off()
        """ % (samtool_depth_file, samtool_depth_file, samtool_depth_file,
               highlight_code, main, highlight_code2, main))
    else:
        # The second plot takes its colours from the filtered table
        # (gc_coverage_table_2m); using the taxon column of the unfiltered
        # table would pair bubbles with the colours of other contigs.
        robjects.r("""
        #library(Cairo)
        library(R.utils)
        if (isGzipped("%s")){
            print('Gzipped file')
            all_depth <- read.table(gzfile('%s'), header=FALSE)
        }else{
            print('Not Gzipped')
            all_depth <- read.table('%s', header=FALSE)
        }
        blast_file <- read.table("%s", header=FALSE, sep="\t")[,c(2,6)]
        contigs_depth<- aggregate(all_depth["V3"],b=all_depth[c("V1")],FUN=median)
        contigs_gc <- read.table("/tmp/gc.tab", header=TRUE)
        gc_coverage_table <-cbind(contigs_gc,coverage=contigs_depth[match(contigs_gc$Name, contigs_depth$V1),2])
        #w<-which(gc_coverage_table$Length >=1000)
        #gc_coverage_table <- gc_coverage_table[w,]
        gc_coverage_table$taxon <- blast_file[,2][match(gc_coverage_table$Name, blast_file[,1])]
        print (is.na(gc_coverage_table$taxon))
        gc_coverage_table$taxon <- as.character(gc_coverage_table$taxon)
        gc_coverage_table$taxon[is.na(gc_coverage_table$taxon)] <- 'undefined'
        gc_coverage_table$taxon <- as.factor(gc_coverage_table$taxon)
        write.table(gc_coverage_table, 'gc_coverage_table.tab', sep="\t", row.names=F)
        svg("gc_cov_buble.svg", width = 12, height = 12,)
        symbols(x=gc_coverage_table[,3], y= gc_coverage_table[,4], circles=gc_coverage_table[,2], inches=1/3, ann=F, bg=gc_coverage_table$taxon, fg=gc_coverage_table$taxon, main="%s", xlab="GC(%%)", ylab="Sequencing depth")
        dev.off()
        cov_biggest <- gc_coverage_table[which(gc_coverage_table$Length==max(gc_coverage_table$Length)),4]
        print('cov biggest:')
        print(cov_biggest)
        w <- which(gc_coverage_table[,4]< (4*cov_biggest))
        gc_coverage_table_2m <- gc_coverage_table[w,]
        svg("gc_cov_buble_2m.svg", width = 12, height = 12,)
        symbols(x=gc_coverage_table_2m[,3], y= gc_coverage_table_2m[,4], circles=gc_coverage_table_2m[,2], inches=1/3, ann=F, bg=gc_coverage_table_2m$taxon, fg=gc_coverage_table_2m$taxon, main="%s", xlab="GC(%%)", ylab="Sequencing depth")
        dev.off()
        """ % (samtool_depth_file, samtool_depth_file, samtool_depth_file,
               blast_file, main, main))
def gc_coverage_plot(contigs_file, contig_depth_table=False, samtool_depth_file=False, blast_file=False, column1=1,
                     column2=2, main=False, highlight=False, taxonomy_file=False, output_prefix=False):
    '''
    Draw GC-content versus sequencing-depth bubble plots for the contigs of an assembly.

    The function runs ``infoseq`` on ``contigs_file`` to write a Name/Length/%GC
    table to ``/tmp/gc.tab``, then hands an R script to rpy2 which merges that table
    with a per-contig depth and writes SVG plots plus ``gc_coverage_table.tab``.
    For each plot a second "2m" version is produced that keeps only contigs whose
    depth is below 4 times the depth of the longest contig.

    :param contigs_file: FASTA file given to ``infoseq``; its basename is the default title
    :param contig_depth_table: optional tab-separated file without header (contig, depth).
        Only honoured when ``blast_file`` is not given: the ``blast_file`` R script always
        recomputes the depth from ``samtool_depth_file``
    :param samtool_depth_file: depth table (optionally gzipped) read without header; the
        median of column 3 is computed per value of column 1
    :param blast_file: optional tab-separated table; columns 2 and 6 are used as contig
        name and taxon to colour the bubbles
    :param column1: unused
    :param column2: unused
    :param main: plot title (defaults to the basename of ``contigs_file``)
    :param highlight: optional file whose first column lists contig names to highlight
        and label (only used without ``blast_file``)
    :param taxonomy_file: optional whitespace-separated file (contig, taxon) used to
        colour the ggplot2 plots (only used without ``blast_file``)
    :param output_prefix: directory prefix of the SVG files written by the branch without
        ``blast_file``; a falsy value writes them to the current directory. The
        ``blast_file`` branch and ``gc_coverage_table.tab`` always use the current directory
    :return: None

    NOTE(review): objects are assigned into the R session used by rpy2
    (``contigs_depth``, ``contig_labels``, ``gc_coverage_subset``...); they presumably
    persist between calls made from the same process - confirm before calling this
    function repeatedly with different options.
    '''
    # The default is False (not a string): indexing it with [-1] raised a TypeError,
    # and an empty string raised an IndexError. Both now mean "current directory".
    if not output_prefix:
        output_prefix = ''
    elif output_prefix[-1] != '/':
        output_prefix += '/'
    print("output_prefix", output_prefix)

    import os
    import shell_command
    import rpy2.robjects as robjects
    import rpy2.robjects.numpy2ri
    from pandas import DataFrame
    import pandas
    import rpy2
    from rpy2.robjects import r
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()

    if not main:
        main = os.path.basename(contigs_file)

    # Name, length and GC percentage of every contig; read back by the R scripts below.
    out, err, code = shell_command.shell_command(
        "infoseq -auto -only -Name -length -pgc %s > /tmp/gc.tab" % contigs_file)

    if contig_depth_table:
        contig_depth = pandas.read_csv(contig_depth_table, sep='\t', names=["contig", "depth"])
        robjects.r.assign('contigs_depth', pandas2ri.py2ri(contig_depth))

    if taxonomy_file:
        df = DataFrame(_contig_taxon_labels(taxonomy_file), columns=['contig', 'label'])
        print(type(df["contig"]))
        print(type(df))
        robjects.r.assign('contig_labels', pandas2ri.py2ri(df))
    else:
        robjects.r.assign('contig_labels', False)

    if highlight:
        # R snippets spliced into the main script: flag the highlighted contigs
        # in the full table (highlight_code) and in the "2m" table (highlight_code2).
        highlight_code = """
            gc_coverage_table$color <- rep(rgb(1, 0, 0,0.5), length(gc_coverage_table[,1]))
            highlight_table <- read.table("%s", header=FALSE)
            m <- match(highlight_table[,1], gc_coverage_table$Name)
            gc_coverage_subset <- gc_coverage_table[m,]
            print("subset")
            print(m)
            gc_coverage_table[m,]$color<-rgb(0, 0, 1,0.5)
        """ % highlight
        highlight_code2 = """
            m <- match(highlight_table[,1], gc_coverage_table_2m$Name)
            #print("subset m2")
            #print(m)
            gc_coverage_subset2 <- gc_coverage_table_2m[m,]
        """
    else:
        highlight_code = ''
        highlight_code2 = ''

    if not blast_file:
        robjects.r("""
            #library(Cairo)
            library(R.utils)
            library(ggplot2)

            if (exists("contigs_depth")==FALSE){
                if (isGzipped("%s")){
                    #print('Gzipped file')
                    all_depth <- read.table(gzfile('%s'), header=FALSE)
                }else{
                    #print('Not Gzipped')
                    all_depth <- read.table('%s', header=FALSE)
                }
                contigs_depth<- aggregate(all_depth["V3"],b=all_depth[c("V1")],FUN=median)
                colnames(contigs_depth) <- c('contig', 'depth')
            }
            #print(contigs_depth)
            #print(contig_labels)
            contigs_gc <- read.table("/tmp/gc.tab", header=TRUE)
            gc_coverage_table <-cbind(contigs_gc,coverage=contigs_depth[match(contigs_gc$Name, contigs_depth$contig),2])
            #w<-which(gc_coverage_table$Length >=1000)
            #gc_coverage_table <- gc_coverage_table[w,]

            cov_biggest <- gc_coverage_table[which(gc_coverage_table$Length==max(gc_coverage_table$Length)),4]
            #print('cov biggest:')
            #print(cov_biggest)
            w <- which(gc_coverage_table[,4]< (4*cov_biggest))
            gc_coverage_table_2m <- gc_coverage_table[w,]

            # contig_labels is either FALSE or a data.frame: test the class so that
            # the condition always has length one
            if (is.data.frame(contig_labels)) {
                library(RColorBrewer)
                color_palette <- c('red', 'blue','green', brewer.pal(12,"Paired"), brewer.pal(12,"Set3"))
                m <- match(contig_labels$contig, gc_coverage_table$Name)
                gc_coverage_table$color <- rep("Unclassified", length(gc_coverage_table[,1]))
                gc_coverage_table$contig_alpha <- rep(0.5, length(gc_coverage_table[,1]))
                gc_coverage_table$color[m] <- as.character(contig_labels$label)
                gc_coverage_table$contig_alpha[m] <- rep(1,length(contig_labels$label))
                w<-which(gc_coverage_table$Length >=1000)
                gc_coverage_table <- gc_coverage_table[w,]
                #w2 <- which(gc_coverage_table$color != "Chlamydiae")
                #gc_coverage_table$contig_alpha[w2] <- 0.7
                svg("%sgc_cov_buble_test.svg", width = 12, height = 12)
                p6 <- ggplot(gc_coverage_table, aes(x = X.GC, y = coverage, size = Length, fill = color, colour = color, alpha = contig_alpha)) +
                    geom_point(shape = 21) +
                    ggtitle("Scaffold GC vs Depth") +
                    labs(x = "GC (%%)", y = "Sequencing depth") +
                    scale_size(range = c(1, 10))
                p6 <- p6 + scale_fill_manual(values=color_palette[0:length(unique(gc_coverage_table$color))])+ guides(color = guide_legend(override.aes = list(size=5)))
                p6 <- p6 + scale_colour_manual(values=color_palette[0:length(unique(gc_coverage_table$color))])
                #print (max(gc_coverage_table$Length))
                p6 <- p6 + scale_alpha_continuous(range=c(0.1, 1), limits=c(0.1,1)) #+ scale_alpha_continuous(range=c(0, max(gc_coverage_table$Length)), limits=c(0,max(gc_coverage_table$Length)))
                print(p6 + theme_bw())
                dev.off()

                gc_coverage_table_2m$color <- rep("Unclassified", length(gc_coverage_table_2m[,1]))
                gc_coverage_table_2m$contig_alpha <- rep(0.5, length(gc_coverage_table_2m[,1]))
                gc_coverage_table_2m$color[m] <- as.character(contig_labels$label)
                gc_coverage_table_2m$contig_alpha[m] <- rep(1,length(contig_labels$label))
                svg("%sgc_cov_buble_test_2m.svg", width = 12, height = 12)
                p6 <- ggplot(gc_coverage_table_2m, aes(x = X.GC, y = coverage, size = Length, fill = color, colour = color, alpha = contig_alpha)) +
                    geom_point(shape = 21) +
                    ggtitle("Scaffold GC vs Depth") +
                    labs(x = "GC (%%)", y = "Sequencing depth") +
                    scale_size(range = c(1, 10))
                p6 <- p6 + scale_fill_manual(values=color_palette[0:length(unique(gc_coverage_table$color))])+ guides(color = guide_legend(override.aes = list(size=5)))
                p6 <- p6 + scale_colour_manual(values=color_palette[0:length(unique(gc_coverage_table$color))])
                #print (max(gc_coverage_table$Length))
                p6 <- p6 + scale_alpha_continuous(range=c(0.1, 1), limits=c(0.1,1)) #+ scale_alpha_continuous(range=c(0, max(gc_coverage_table$Length)), limits=c(0,max(gc_coverage_table$Length)))
                print(p6 + theme_bw())
                dev.off()
            }else{
                #print('NO contig_labels')
            }

            write.table(gc_coverage_table, 'gc_coverage_table.tab', sep="\t", row.names=F)

            %s

            svg("%sgc_cov_buble.svg", width = 12, height = 12)
            symbols(x=gc_coverage_table[,3], y= gc_coverage_table[,4], circles=gc_coverage_table[,2], inches=1/3, ann=T, bg=rgb(1, 0, 0,0.5), fg=rgb(1, 0, 0,0.5), main="%s", xlab="GC(%%)", ylab="Sequencing depth")
            if (any("gc_coverage_subset" %%in%% ls())) {
                symbols(x=gc_coverage_table[,3], y= gc_coverage_table[,4], circles=gc_coverage_table[,2], inches=1/3, ann=T, bg=gc_coverage_table$color, fg=gc_coverage_table$color, add = TRUE)
                l <- gsub('(^[^_]+_[^_]+)_(.*)$', '\\\\1', gc_coverage_subset$Name)
                text(x=gc_coverage_subset[,3], y=gc_coverage_subset[,4], labels = l)
            }else{
                print ('a')
            }
            dev.off()

            %s

            svg("%sgc_cov_buble_2m.svg", width = 12, height = 12)
            symbols(x=gc_coverage_table_2m[,3], y= gc_coverage_table_2m[,4], circles=gc_coverage_table_2m[,2], inches=1/3, ann=T, bg=rgb(1, 0, 0,0.5), fg=rgb(1, 0, 0,0.5), main="%s", xlab="GC(%%)", ylab="Sequencing depth")
            if (any("gc_coverage_subset" %%in%% ls())) {
                symbols(x=gc_coverage_table_2m[,3], y= gc_coverage_table_2m[,4], circles=gc_coverage_table_2m[,2], inches=1/3, ann=T, bg=gc_coverage_table_2m$color, fg=gc_coverage_table_2m$color, add = TRUE)
                l <- gsub('(^[^_]+_[^_]+)_(.*)$', '\\\\1', gc_coverage_subset2$Name)
                text(x=gc_coverage_subset2[,3], y=gc_coverage_subset2[,4], labels = l)
            }else{
                print ('a')
            }
            dev.off()
        """ % (samtool_depth_file,
               samtool_depth_file,
               samtool_depth_file,
               output_prefix,
               output_prefix,
               highlight_code,
               output_prefix,
               main,
               highlight_code2,
               output_prefix,
               main))
    else:
        robjects.r("""
            #library(Cairo)
            library(R.utils)
            if (isGzipped("%s")){
                #print('Gzipped file')
                all_depth <- read.table(gzfile('%s'), header=FALSE)
            }else{
                #print('Not Gzipped')
                all_depth <- read.table('%s', header=FALSE)
            }
            blast_file <- read.table("%s", header=FALSE, sep="\t")[,c(2,6)]
            contigs_depth<- aggregate(all_depth["V3"],b=all_depth[c("V1")],FUN=median)
            contigs_gc <- read.table("/tmp/gc.tab", header=TRUE)
            gc_coverage_table <-cbind(contigs_gc,coverage=contigs_depth[match(contigs_gc$Name, contigs_depth$V1),2])
            #w<-which(gc_coverage_table$Length >=1000)
            #gc_coverage_table <- gc_coverage_table[w,]
            gc_coverage_table$taxon <- blast_file[,2][match(gc_coverage_table$Name, blast_file[,1])]
            #print (is.na(gc_coverage_table$taxon))
            gc_coverage_table$taxon <- as.character(gc_coverage_table$taxon)
            gc_coverage_table$taxon[is.na(gc_coverage_table$taxon)] <- 'undefined'
            gc_coverage_table$taxon <- as.factor(gc_coverage_table$taxon)

            write.table(gc_coverage_table, 'gc_coverage_table.tab', sep="\t", row.names=F)

            svg("gc_cov_buble.svg", width = 12, height = 12,)
            symbols(x=gc_coverage_table[,3], y= gc_coverage_table[,4], circles=gc_coverage_table[,2], inches=1/3, ann=F, bg=gc_coverage_table$taxon, fg=gc_coverage_table$taxon, main="%s", xlab="GC(%%)", ylab="Sequencing depth")
            dev.off()

            cov_biggest <- gc_coverage_table[which(gc_coverage_table$Length==max(gc_coverage_table$Length)),4]
            #print('cov biggest:')
            #print(cov_biggest)
            w <- which(gc_coverage_table[,4]< (4*cov_biggest))
            gc_coverage_table_2m <- gc_coverage_table[w,]

            svg("gc_cov_buble_2m.svg", width = 12, height = 12,)
            # colours are taken from the filtered table so that they stay aligned with the plotted rows
            symbols(x=gc_coverage_table_2m[,3], y= gc_coverage_table_2m[,4], circles=gc_coverage_table_2m[,2], inches=1/3, ann=F, bg=gc_coverage_table_2m$taxon, fg=gc_coverage_table_2m$taxon, main="%s", xlab="GC(%%)", ylab="Sequencing depth")
            dev.off()
        """ % (samtool_depth_file,
               samtool_depth_file,
               samtool_depth_file,
               blast_file,
               main,
               main))


def _contig_taxon_labels(taxonomy_file):
    '''
    Build one taxon label per contig from a whitespace-separated (contig, taxon) file.

    A contig seen with a single taxon gets that taxon as label; a contig seen with
    several taxa gets "taxonA (countA) /taxonB (countB)". Labels carried by at most
    two contigs are replaced by 'rare_taxon'.

    :param taxonomy_file: path of the file; lines with fewer than two fields are skipped
    :return: list of [contig, label] lists
    '''
    contigs2taxon2count = {}
    with open(taxonomy_file, 'r') as f:
        for row in f:
            data = row.rstrip().split()
            if len(data) < 2:
                # blank or truncated line: nothing to count
                continue
            taxon2count = contigs2taxon2count.setdefault(data[0], {})
            taxon2count[data[1]] = taxon2count.get(data[1], 0) + 1

    contig2label = []
    for contig, taxon2count in contigs2taxon2count.items():
        if len(taxon2count) > 1:
            # more than one taxon
            label = ' /'.join('%s (%s)' % (taxon, count) for taxon, count in taxon2count.items())
        else:
            label = list(taxon2count)[0]
        contig2label.append([contig, label])

    # The frequencies must be complete before any label is rewritten.
    label2freq = {}
    for pair in contig2label:
        label2freq[pair[1]] = label2freq.get(pair[1], 0) + 1
    for pair in contig2label:
        if label2freq[pair[1]] <= 2:
            pair[1] = 'rare_taxon'
    return contig2label
def locate_origin(contig_file, reference_dnaa=False, base_add=517):
    '''
    Perform a tblastn to identify the location of dnaA in a set of contigs.

    The contigs are formatted as a nucleotide BLAST database with ``formatdb``, the
    reference protein is written to ``dnaa.temp`` and searched with tblastn
    (evalue 0.001, XML output in ``dnaa_blast2.xml``). Only the first HSP of the
    first alignment of the first record is considered.

    :param contig_file: nucleotide FASTA file used as BLAST database
    :param reference_dnaa: SeqRecord object with the reference DnaA protein sequence;
        when falsy, a built-in DnaA record (id ADC36215.1) is used
    :param base_add: offset applied to the hit boundary to get the split position
        (subtracted from ``sbjct_start`` when ``sbjct_start > sbjct_end``, otherwise
        added to ``sbjct_end``)
    :return: tuple (contig name taken from the hit definition, split position)
    :raises IndexError: when tblastn reports no hit
    '''
    import sys
    from Bio import SeqIO, Seq, SeqRecord
    from Bio.Alphabet import generic_protein
    from Bio.Blast.Applications import NcbitblastnCommandline
    import shell_command
    from Bio.Blast import NCBIXML

    if not reference_dnaa:
        seq = Seq.Seq('MSEKEIWEKVLEIAQEKLSAVSYSTFLKDTELYTIKDGEAIVLSS'
                      ' IPFNANWLNQQYAEIIQAILFDVVGYEVKPHFITTEELANYSNNETATPKETTKPSTET'
                      'TEDNHVLGREQFNAHNTFDTFVIGPGNRFPHAASLAVAEAPAKAYNPLFIYGGVGLGKT'
                      'HLMHAIGHHVLDNNPDAKVIYTSSEKFTNEFIKSIRDNEGEAFRERYRNIDVLLIDDIQ'
                      'FIQNKVQTQEEFFYTFNELHQNNKQIVISSDRPPKEIAQLEDRLRSRFEWGLIVDITPP'
                      'DYETRMAILQKKIEEEKLDIPPEALNYIANQIQSNIRELEGALTRLLAYSQLLGKPITT'
                      'ELTAEALKDIIQAPKSKKITIQDIQKIVGQYYNVRIEDFSAKKRTKSIAYPRQIAMYLS'
                      'RELTDFSLPKIGEEFGGRDHTTVIHAHEKISKDLKEDPIFKQEVENLEKEIRNV', generic_protein)
        reference_dnaa = SeqRecord.SeqRecord(seq,
                                             id="ADC36215.1",
                                             name="DnaA",
                                             description="Chromosomal replication initiator protein DnaA")

    cmd = 'formatdb -i %s -p F' % contig_file
    out, err, code = shell_command.shell_command(cmd)
    # Same output as the former print statement, without depending on its syntax.
    sys.stdout.write('%s %s %s\n' % (out, err, code))
    if code != 0:
        sys.stdout.write('problem with command: \n %s' % cmd)

    # The query file must be closed (flushed) before tblastn reads it.
    with open('dnaa.temp', 'w') as handle:
        SeqIO.write(reference_dnaa, handle, 'fasta')

    tblastn_cline = NcbitblastnCommandline(query='dnaa.temp',
                                           db=contig_file,
                                           evalue=0.001,
                                           outfmt=5,
                                           out="dnaa_blast2.xml")
    stdout, stderr = tblastn_cline()

    with open("dnaa_blast2.xml", 'r') as result_handle:
        blast_records = list(NCBIXML.parse(result_handle))

    # Keep the exception type of the former bare indexing, but say what went wrong.
    if not blast_records or not blast_records[0].alignments:
        raise IndexError('no tblastn hit of the reference DnaA in %s' % contig_file)

    best_alignment = blast_records[0].alignments[0]
    best_hit = best_alignment.hsps[0]
    contig = best_alignment.hit_def

    # 100.0 forces a float division: with two integers, Python 2 truncated the
    # ratio and the identity was reported as 0 or 100.
    identity_percentage = 100.0 * best_hit.identities / best_hit.align_length

    sbjct_end = best_hit.sbjct_end
    sbjct_start = best_hit.sbjct_start
    if sbjct_start > sbjct_end:
        split_location = sbjct_start - base_add
    else:
        split_location = sbjct_end + base_add

    sys.stdout.write('contig: %s --> hit with evalue of %s and identity of %s\n' % (contig,
                                                                                     best_hit.expect,
                                                                                     identity_percentage))
    return (contig, split_location)
def circos_orthology(all_record_list, ref_record_and_location, target_record_and_location_list, location = "assets/circos"):
    '''
    Write the Circos input files for a set of records and run circos on them.

    Three files are created in ``location``: ``circos.kar`` (one ``chr`` line per
    record, with alternating colours), ``circos.link`` (one link from the reference
    location to each target location) and ``circos.config``. circos is then called
    with output file name ``circos_ortho`` and output directory ``location``.

    :param all_record_list: records providing ``description``, ``id`` and a length
    :param ref_record_and_location: indexable, items 0 to 2 are written as the first
        three fields of every link line
    :param target_record_and_location_list: iterable of indexables, items 0 to 2 are
        written as the last three fields of a link line
    :param location: directory receiving the Circos files and output
    :return: None
    '''
    import gbk2circos
    import os
    import re
    import shell_command

    # (pattern, replacement) pairs applied in this order to every record description
    description_rules = (
        (r", complete genome\.", ""),
        (r", complete genome", ""),
        (r", complete sequence\.", ""),
        (r"strain ", ""),
        (r"str\. ", ""),
        (r" complete genome sequence\.", ""),
        (r" complete genome\.", ""),
        (r" chromosome", ""),
        (r" DNA", "S."),
        (r"Merged record from ", ""),
        (r", wgs", ""),
        (r"Candidatus ", ""),
        (r".contig.0_1, whole genome shotgun sequence.", ""),
        (r" ", "-"),
        (r"Chlamydia_", "C_"),
        (r"Chlamydophila_", "C_"),
        (r"Simkania_", "S_"),
        (r"Parachlamydia_", "P_"),
        (r"Neochlamydia_", "N_"),
        (r"Protochlamydia_", "P_"),
        (r"Waddlia_", "W_"),
        (r"Estrella_", "E_"),
        (r"Methylacidiphilum_", "M_"),
        (r"Criblamydia_", "C_"),
        (r"Methylacidiphilum_", "M_"),
    )

    karyotype_path = os.path.join(location, "circos.kar")
    with open(karyotype_path, "w") as karyotype:
        for rank, record in enumerate(all_record_list):
            # colours alternate between consecutive records
            colour = 1 if rank % 2 == 0 else 2
            label = record.description
            for pattern, replacement in description_rules:
                label = re.sub(pattern, replacement, label)
            accession = record.id.split(".")[0]
            # chr - chr_name chr_label start end color
            karyotype.write('chr - %s %s %s %s spectral-5-div-%s\n' % (accession,
                                                                      label,
                                                                      0,
                                                                      len(record),
                                                                      colour))

    links_path = os.path.join(location, "circos.link")
    with open(links_path, "w") as links:
        for target in target_record_and_location_list:
            fields = (ref_record_and_location[0],
                      ref_record_and_location[1],
                      ref_record_and_location[2],
                      target[0],
                      target[1],
                      target[2])
            links.write('%s %s %s %s %s %s\n' % fields)

    config = gbk2circos.Circos_config("circos.kar",
                                      show_ticks="no",
                                      show_tick_labels="no",
                                      ideogram_spacing=100,
                                      label_radius=0.01,
                                      radius=0.45)
    config.add_link("circos.link", thickness=3)

    config_path = os.path.join(location, "circos.config")
    with open(config_path, "w") as config_handle:
        config_handle.write(config.get_file())

    cmd = "circos -outputfile %s -outputdir %s -conf %s" % ("circos_ortho", location, config_path)
    (stdout, stderr, return_code) = shell_command.shell_command(cmd)
parser.add_argument("-1", '--only_16s', action="store_true", help="get only 16s") parser.add_argument("-2", '--only_23s', action="store_true", help="get only 23s") args = parser.parse_args() if not args.title: title = args.input.split('.')[0] else: title = args.title stdout_str, stderr_str, runcode = shell_command.shell_command( "barrnap %s" % args.input) if 'not found' in stderr_str: raise (Exception('Barrnap was not found on path, please install it')) import sys sys.exit() rrna_16S, rrna_23S, rrna_5S = search_16S_rrna( parse_barrnap_output(stdout_str)) longest16 = find_longest_16S(rrna_16S) longest23 = find_longest_16S(rrna_23S) recs = [] if args.only_16s: