示例#1
0
 def execute_megablast(self, fasta1, fasta2):
     """Run a megablast search of ``fasta1`` against every file in ``fasta2``.

     Each subject file is indexed with ``formatdb -p F`` and then searched
     with ``blastn -task megablast``.  Tabular hits go to
     ``blast_result_<prefix>.tab``, where ``<prefix>`` is the subject path up
     to its first dot.  The results of the shell calls are not inspected.

     :param fasta1: path of the query FASTA file
     :param fasta2: iterable of subject FASTA file paths
     """
     import shell_command
     run = shell_command.shell_command
     for subject in fasta2:
         result_prefix = subject.split('.')[0]
         index_cmd = "formatdb -i %s -p F" % subject
         search_cmd = ('blastn -task megablast -query %s -db %s -evalue 1e-5 '
                       '-outfmt 6 -out blast_result_%s.tab'
                       % (fasta1, subject, result_prefix))
         for command in (index_cmd, search_cmd):
             # The helper's result is unpacked into three values, as before.
             _stdout, _stderr, _status = run(command)
示例#2
0
def blast_all_vs_all(input_fasta):
    """Run an all-vs-all BLASTP of a FASTA file against itself.

    The file is first indexed with ``formatdb`` under the database name
    ``seq.db``; ``blastp`` then searches ``input_fasta`` against that database
    and writes tabular (``-outfmt 6``) hits to ``blastall.out``.  The three
    values returned by each shell call are printed on one line.

    :param input_fasta: path of the FASTA file used as query and database
    :return: the name of the BLAST result file, ``"blastall.out"``
    """
    import shell_command

    commands = (
        'formatdb -i %s -n seq.db' % input_fasta,
        'blastp -db seq.db -query %s -outfmt 6 -num_threads 6 -out blastall.out' % input_fasta,
    )
    for cmd in commands:
        out, err, code = shell_command.shell_command(cmd)
        # Single-argument print call: same output on Python 2 and Python 3.
        print("%s %s %s" % (out, err, code))

    return "blastall.out"
示例#3
0
def detailed_job_status(job_id):
    """Return the lines of ``bjobs <job_id>`` output that mention ``EXIT``.

    :param job_id: job identifier, formatted into the command with ``%d``
    :return: list of output lines containing the substring ``"EXIT"``
    """
    bjobs_stdout = shell_command("bjobs %d" % job_id)[0]
    return [row for row in bjobs_stdout.split('\n') if "EXIT" in row]
示例#4
0
 def run_circos(self, config_file="circos.config", out_prefix="circos"):
     """Run circos and write the plot to ``<out_prefix>.svg``.

     :param config_file: circos configuration file passed with ``-conf``
     :param out_prefix: output file name without the ``.svg`` extension
     :raises CircosException: if the command exits with status 255
     """
     import shell_command
     cmd = 'circos -outputfile %s.svg -conf %s' % (out_prefix, config_file)
     out, err, code = shell_command.shell_command(cmd)
     # Single-argument print call: same output on Python 2 and Python 3.
     print("out %s err %s code %s" % (out, err, code))
     if code == 255:
         raise CircosException("Circos problem, check files... quitting")
示例#5
0
    def __init__(self, fasta_file, bam_file):
        """Prepare the circosviz directory layout and convert the BAM to SAM.

        Only the beginning of this constructor is visible in this listing, so
        this text describes the visible statements only.

        :param fasta_file: FASTA path, joined onto the current working directory
        :param bam_file: BAM path, joined onto the current working directory;
            the text before its first dot becomes ``self.prefix``
        """

        # All paths are anchored on the directory the process was started in.
        self.working_dir = os.getcwd()
        self.fasta_file = os.path.join(self.working_dir, fasta_file)
        self.bam_file = os.path.join(self.working_dir, bam_file)
        self.prefix = bam_file.split('.')[0]

        self.data_dir = os.path.join(self.working_dir, 'circosviz/data')
        self.etc_dir = os.path.join(self.working_dir, 'circosviz/etc')
        self.sam_file = os.path.join(self.data_dir, self.prefix + '.sam')

        # NOTE(review): the bare except reports *any* makedirs failure
        # (permissions included) as "already exits".
        try:
            os.makedirs(self.data_dir)
        except:
            print '%s already exits' % self.data_dir

        try:
            os.makedirs(self.etc_dir)
        except:
            print '%s already exits' % self.etc_dir

        # convert bam to sam
        print 'converting bam to sam'
        cmd = ' samtools view -h -o %s %s' % (self.sam_file, self.bam_file)
        print cmd
        out, err, code = shell_command.shell_command(cmd)
        # A failed conversion is only reported; execution continues.
        if code != 0:
            print out, err

        self.template_config = '''
示例#6
0
def blast_fasta(blast_flavor, input, blastdb, evalue, nb_hit, local):
    '''
    Run a BLAST search and write tabular results with taxonomic columns.

    The output file name is derived from ``input``: every ``<name>.<ext>``
    match is rewritten to ``blast<name>.tab``.
    Scientific names and kingdom will only be retrieved if the taxid database
    has been installed locally: ftp://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz

    :param blast_flavor: BLAST executable name; also passed as ``-task``
    :param input: path of the query file (parameter name kept for callers)
    :param blastdb: value passed to ``-db``
    :param evalue: value passed to ``-evalue``
    :param nb_hit: value passed to ``-max_target_seqs``
    :param local: if true run locally and echo the command, otherwise add
        ``-remote`` to the command line
    :return: None; results are written to the derived output file
    '''
    output_name = re.sub(r"([a-zA-Z_0-9]+)\.([a-zA-Z]+)", "blast\\1.tab", input)
    outfmt = ("'6 qgi qacc sgi sacc sscinames sskingdoms staxids evalue nident "
              "pident positive gaps length qstart qend qcovs sstart send "
              "sstrand stitle'")
    executable = blast_flavor if local else "%s -remote" % blast_flavor
    cmd = "%s -task %s -query %s -out %s -db %s -evalue %s -max_target_seqs %s -outfmt %s" % (
        executable, blast_flavor, input, output_name, blastdb, evalue, nb_hit,
        outfmt)
    if local:
        print(cmd)
    out, err, code = shell_command(cmd)
    if code != 0:
        # Failures used to be reported for local searches only; remote
        # searches now report them the same way.
        print(err)
示例#7
0
def run_hifix(fasta_file, silix_nodes, silix_net):
    """Run ``hifix`` and redirect its standard output to ``seq_HFX.fnodes``.

    The command line is ``hifix <fasta_file> <silix_net> <silix_nodes>``: the
    network file comes before the nodes file, which is the reverse of this
    function's parameter order.  The three values returned by the shell
    helper are printed on one line.

    :param fasta_file: FASTA file given to hifix
    :param silix_nodes: nodes file (third hifix argument)
    :param silix_net: network file (second hifix argument)
    """
    import shell_command

    cmd = 'hifix %s %s %s > seq_HFX.fnodes' % (fasta_file, silix_net,
                                               silix_nodes)
    out, err, code = shell_command.shell_command(cmd)
    # Single-argument print call: same output on Python 2 and Python 3.
    print("%s %s %s" % (out, err, code))
示例#8
0
 def generate_circos_data_files(self):
     """Run ``circosviz.pl`` inside ``self.data_dir``.

     The script receives ``self.sam_file`` and ``self.fasta_file``.  On a
     non-zero exit status its stdout and stderr are printed.  The process
     always returns to ``self.working_dir``, even if the call raises.
     """
     cmd = 'circosviz.pl -i %s -f %s -e 500 -m 3000 -a 125 -b 1000 -p 1000' % (
         self.sam_file, self.fasta_file)
     os.chdir(self.data_dir)
     try:
         out, err, code = shell_command.shell_command(cmd)
         if code != 0:
             # Single-argument print call: same output on Python 2 and 3.
             print("%s %s" % (out, err))
     finally:
         # Do not leave the process stranded in data_dir on an exception.
         os.chdir(self.working_dir)
示例#9
0
def run_silix(input_fasta, blastall_file, identity=0.9, overlap=0.8):
    """Cluster sequences with SiLiX and redirect its output to ``seq.fnodes``.

    :param input_fasta: FASTA file given to silix
    :param blastall_file: tabular BLAST result given to silix
    :param identity: minimum identity passed with ``-i``; must be in [0, 1]
    :param overlap: minimum overlap passed with ``-r``; must be in [0, 1]
    :raises ValueError: if ``identity`` or ``overlap`` is outside [0, 1]
    """
    # Validate before importing or running anything.  The previous
    # `raise ('...')` raised a plain string, which is itself a TypeError.
    for threshold in (identity, overlap):
        if not 0 <= threshold <= 1:
            raise ValueError('Identity and overlap should be between 0 and 1')

    import shell_command

    cmd = 'silix  %s %s -f FAM -i %s -r %s --net > seq.fnodes' % (
        input_fasta, blastall_file, identity, overlap)
    shell_command.shell_command(cmd)
示例#10
0
    def execute_blast(self, fasta, database):
        """Run ``blastp`` of ``fasta`` against ``database``.

        Tabular hits (``-outfmt 6``) are written to ``blast_result.tab``.
        The database is not formatted here: the ``formatdb -i <database> -p T``
        step was already disabled, so ``database`` is expected to be usable by
        blastp as given.

        :param fasta: query FASTA file
        :param database: value passed to ``-db``
        """
        import shell_command

        cmd = 'blastp -query %s -db %s -evalue 1e-5 -num_threads 8 -outfmt 6 -out blast_result.tab' % (
            fasta, database)
        out, err, code = shell_command.shell_command(cmd)
        # Single-argument print call: same output on Python 2 and Python 3.
        print("%s %s %s" % (out, err, code))
def aafasta2phylogeny(aa_fasta, phylo=False):
    """Align a protein FASTA file with mafft and optionally build a tree.

    The alignment is written to ``<prefix>_mafft.fa``, where ``<prefix>`` is
    ``aa_fasta`` up to its first dot.  With ``phylo`` set, two RAxML runs are
    chained (tree search, then ``-f J``), the bracketed support values of
    ``RAxML_fastTreeSH_Support.shtest_<prefix>`` are rewritten as NHX
    ``support`` annotations, and the tree is saved with a ``.nwk`` suffix.

    :param aa_fasta: path of the amino-acid FASTA file
    :param phylo: if true, also reconstruct the phylogeny
    :return: the ete3 ``Tree`` when ``phylo`` is true, otherwise None
    :raises RuntimeError: if mafft exits with a non-zero status
    :raises SystemExit: with status 1 if the RAxML command fails
    """
    import re
    import sys
    import shell_command

    output_prefix = aa_fasta.split('.')[0]
    align_name = output_prefix + '_mafft.fa'

    print('aligning with mafft...')
    cmd_mafft = 'mafft --anysymbol --amino --auto --maxiterate 1000 %s > %s' % (
        aa_fasta, align_name)
    out, err, code = shell_command.shell_command(cmd_mafft)
    if code != 0:
        # `raise (err)` raised a plain string, which is itself a TypeError.
        raise RuntimeError('mafft failed with exit status %s:\n%s' % (code, err))
    if not phylo:
        return None

    # ete3 is only needed for the tree, so import it only on this path.
    from ete3 import Tree

    print('reconstructing phylogeny with RAxML...')
    output_tree_name = 'RAxML_result.%s' % output_prefix
    output_shtree_name = 'shtest_%s' % output_prefix
    cmd_raxml = 'raxml -m PROTGAMMALG -p 12345 -s %s -n %s -c 4 -T 8;' \
          'raxml -f J -m PROTGAMMALG -s %s -p 12345 -t %s -n %s -T 8' % (
              align_name, output_prefix, align_name, output_tree_name,
              output_shtree_name)

    out, err, code = shell_command.shell_command(cmd_raxml)
    if code != 0:
        print(out, code)
        print(err)
        # A bare sys.exit() reported success (status 0) on failure.
        sys.exit(1)

    support_file = "RAxML_fastTreeSH_Support.shtest_%s" % output_prefix
    with open(support_file) as handle:
        raw_newick = handle.read()
    # ":<length>[<support>]" becomes ":<length>[&&NHX:support=<support>]".
    nw = re.sub(r":(\d+\.\d+)\[(\d+)\]", ":\\1[&&NHX:support=\\2]", raw_newick)

    t = Tree(nw, format=0)
    t.write(outfile="RAxML_fastTreeSH_Support.shtest_%s.nwk" % output_prefix,
            format=0)
    return t
def reorder_contigs_with_mauve(reference, contigs, output_folder="mauve_ordering"):
    """Order draft contigs against a reference with Mauve's ContigOrderer.

    ``Mauve.jar`` is located with ``which`` and resolved with ``readlink -f``
    before being put on the Java class path.  A failure is reported on
    standard output; no exception is raised.

    :param reference: reference sequence passed with ``-ref``
    :param contigs: draft contigs passed with ``-draft``
    :param output_folder: directory passed with ``-output``
    """
    # `sys` is imported locally so the block does not depend on a module-level
    # import that is not visible here.
    import sys
    import shell_command

    cmd = ('link=$(readlink -f `which Mauve.jar`); java -Xmx500m -cp $link '
           'org.gel.mauve.contigs.ContigOrderer -output %s -ref %s -draft %s'
           % (output_folder, reference, contigs))
    out, err, code = shell_command.shell_command(cmd)

    if code != 0:
        sys.stdout.write('problem with command: \n %s\n' % cmd)
        # Also show what the command reported, so the failure can be diagnosed.
        sys.stdout.write('%s\n' % err)
示例#13
0
    def execute_promer(self,
                       fasta1,
                       fasta2,
                       out_file="out.coords",
                       algo="nucmer"):
        """Align ``fasta1`` to each file in ``fasta2`` with nucmer or promer.

        After every alignment ``show-coords`` is run on ``out.delta`` and its
        output redirected to ``<prefix>.coords``, where ``<prefix>`` is the
        subject path up to its first dot.  Any other ``algo`` value runs
        nothing.  ``out_file`` is accepted but not used.

        :param fasta1: reference FASTA file
        :param fasta2: iterable of FASTA files to align against ``fasta1``
        :param out_file: unused
        :param algo: ``'nucmer'`` or ``'promer'``
        """
        import shell_command

        if algo == 'nucmer':
            align_template = 'nucmer -mum -b 200 -c 65 -g 90 -l 20 %s %s'
        elif algo == 'promer':
            align_template = 'promer --mum -l 5 %s %s'
        else:
            align_template = None
        coords_template = 'show-coords -T -r -c -L 100 -I 30 out.delta > %s.coords'

        for subject in fasta2:
            if align_template is None:
                continue
            align_cmd = align_template % (fasta1, subject)
            coords_cmd = coords_template % subject.split('.')[0]
            for command in (align_cmd, coords_cmd):
                _stdout, _stderr, _status = shell_command.shell_command(command)
示例#14
0
def download_seq_entrez(accession, multifasta, seq_type, append):
    """Download FASTA records from NCBI with ``Entrez.efetch``.

    * ``multifasta == ""``: one file per accession, named by rewriting each
      ``<name>.<ext>`` match in the accession to ``<name>.fasta``.
    * otherwise all accessions go into ``<multifasta>.fasta``.  If ``append``
      is not empty the download first goes to ``<multifasta>_only.fasta``,
      which is concatenated after the file ``append`` and then removed.

    :param accession: iterable of accessions (also passed whole as ``id``)
    :param multifasta: output prefix, or ``""`` for one file per accession
    :param seq_type: Entrez database name passed as ``db``
    :param append: existing file to prepend to the download, or ``""``
    """

    def _fetch_to_file(ids, path):
        # The with-block closes the output file even if the download fails.
        with open(path, "w") as output_handle:
            handle = Entrez.efetch(db=seq_type, id=ids, rettype="fasta")
            try:
                output_handle.write(handle.read())
            finally:
                handle.close()

    # if the user requests individual fasta file per accession
    if multifasta == "":
        for one_accession in accession:
            output_name = re.sub(r"([a-zA-Z_0-9]+)\.([a-zA-Z]+)", "\\1.fasta",
                                 one_accession)
            _fetch_to_file(one_accession, output_name)
        return

    # if the user requests a single multifasta file
    if append == "":
        _fetch_to_file(accession, "%s.fasta" % multifasta)
        return

    # the user asked to prepend an existing file to the download
    _fetch_to_file(accession, "%s_only.fasta" % multifasta)
    commands = (
        "cat %s %s_only.fasta > %s.fasta" % (append, multifasta, multifasta),
        "rm %s_only.fasta" % multifasta,
    )
    for cmd in commands:
        print(cmd)
        shell_command(cmd)
示例#15
0
def is_job_running(job_id):
    """Query ``bjobs`` for ``job_id`` and report on its status keywords.

    :param job_id: job identifier, formatted into the command with ``%d``
    :return: False if ``PEND`` appears in the ``bjobs`` output; True otherwise,
        including when the status could not be obtained
    :raises Exception: if ``EXIT`` appears in the ``bjobs`` output
    """
    try:
        bjobs_stdout = shell_command("bjobs %d" % job_id)[0]
        merged_text = " ".join(bjobs_stdout.split('\n'))
        stat = re.findall("EXIT|DONE|PEND|RUN", merged_text)
        print(stat)
        all_status = unique(stat)
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit and so could make a polling loop impossible to stop.
        print("unknown job ID")
        return True
    if "EXIT" in all_status:
        raise Exception('bsub command failed with status: EXIT')
    return "PEND" not in all_status
示例#16
0
    def execute_circos(self):
        """Write the circos configuration and run circos in ``self.etc_dir``.

        Sets ``self.config_path`` and ``self.plot_path``, writes
        ``self.template_config`` to the configuration file and runs circos
        with ``-noparanoid``.  On a non-zero exit status stdout and stderr are
        printed.  The process always returns to ``self.working_dir``.
        """
        os.chdir(self.etc_dir)
        try:
            self.config_path = 'circosviz.config'
            self.plot_path = 'circosviz_plot.svg'

            with open(self.config_path, 'w') as f:
                f.write(self.template_config)

            cmd = 'circos -noparanoid -conf %s -outputfile %s' % (
                self.config_path, self.plot_path)
            out, err, code = shell_command.shell_command(cmd)
            if code != 0:
                # Single-argument print call: same output on Python 2 and 3.
                print("%s %s" % (out, err))
        finally:
            # Do not leave the process stranded in etc_dir on an exception.
            os.chdir(self.working_dir)
def convert_leaf_labels(input_tree,
                        biodb_name,
                        accession2taxon=False,
                        taxon2accession=False,
                        sqlite=False):
    """Convert a newick tree to phyloXML and relabel its leaves.

    ``newick2phyloxml.pl`` is run on ``input_tree``.  The phyloXML file is
    then looked up in the current directory as ``<prefix>.phyloxml`` and the
    relabelled tree written to ``<prefix>_renamed.tree``, where ``<prefix>``
    is ``input_tree`` up to its first dot.

    :param input_tree: newick tree file
    :param biodb_name: database name forwarded to ``manipulate_trees``
    :param accession2taxon: use ``convert_tree_accession2taxon_id``
    :param taxon2accession: use ``convert_tree_taxon_id2accession`` (only
        considered when ``accession2taxon`` is false)
    :param sqlite: forwarded unchanged to the conversion function
    """

    import shell_command
    import manipulate_trees
    import os

    print(accession2taxon, taxon2accession)

    base_name = input_tree.split('.')[0]
    stdout, stderr, status = shell_command.shell_command(
        'newick2phyloxml.pl -i %s' % input_tree)
    print(stdout, stderr, status)

    here = os.getcwd()
    phyloxml_path = os.path.join(here, base_name + '.phyloxml')
    renamed_path = os.path.join(here, base_name + '_renamed.tree')
    print(phyloxml_path)
    print(renamed_path)

    # Pick the conversion routine; taxon2genome is the fallback.
    if accession2taxon:
        print("sqlite")
        relabel = manipulate_trees.convert_tree_accession2taxon_id
    elif taxon2accession:
        relabel = manipulate_trees.convert_tree_taxon_id2accession
    else:
        relabel = manipulate_trees.convert_tree_taxon2genome

    relabel(biodb_name, phyloxml_path, renamed_path, sqlite=sqlite)
示例#18
0
def get_job_status(job_id):
    """Summarise the status keywords that ``bjobs`` reports for ``job_id``.

    The distinct ``EXIT``/``DONE``/``PEND``/``RUN`` keywords found in the
    output are mapped as follows:

    * only ``EXIT``                      -> ``"EXIT"``
    * ``RUN`` together with ``EXIT``     -> ``"partial EXIT"``
    * ``RUN`` or ``PEND``                -> ``"RUN"``
    * ``EXIT`` with other keywords       -> ``"partial DONE"``
    * anything else (none found too)     -> ``"DONE"``

    :param job_id: job identifier, formatted into the command with ``%d``
    :return: one of the strings listed above
    :raises Exception: ``"unknown job ID"`` if the status cannot be obtained
    """
    try:
        bjobs_stdout = shell_command("bjobs %d" % job_id)[0]
        merged_text = " ".join(bjobs_stdout.split('\n'))
        statuses = set(re.findall("EXIT|DONE|PEND|RUN", merged_text))
    except Exception:
        # Was a bare `except:`, which also turned KeyboardInterrupt and
        # SystemExit into a generic "unknown job ID" error.
        raise Exception("unknown job ID")
    # return exit only if all jobs (in case of job array) were exited
    if statuses == {"EXIT"}:
        return "EXIT"
    if "RUN" in statuses and "EXIT" in statuses:
        return "partial EXIT"
    if "RUN" in statuses or "PEND" in statuses:
        return "RUN"
    if "EXIT" in statuses:
        return "partial DONE"
    return "DONE"
示例#19
0
def relaunch_vital_it_job(status, cmd_data):
    """Resubmit a failed bsub job according to its failure ``status``.

    * ``"MEMKILL"``: resubmit with the memory limit raised by 2 GB
    * ``"LIMITKILL"``: resubmit on the ``long`` queue
    * ``"SUCCESS"``: nothing to do
    * ``"UNKNOWN"``: print a notice

    For both relaunch cases the previous log and error files are removed
    after resubmission.

    :param status: one of the status strings above
    :param cmd_data: mapping with the keys ``"cmd"``, ``"-M"``, ``"-J"``,
        ``"-o"`` and ``"-e"``; ``"-M"`` is divided by 1000000 to get GB
    :raises IOError: for any other ``status``
    """
    if status == "SUCCESS":
        return
    if status == "UNKNOWN":
        print('status unknown for %s' % cmd_data)
        return
    if status not in ("MEMKILL", "LIMITKILL"):
        raise IOError("Unknown LSF error status %s\n" % status)

    # Project modules are only needed when a job is actually relaunched.
    import generate_bsub_file
    import shell_command
    import sys

    # Floor division keeps the value an integer on Python 3 as well; with `/`
    # it became a float there.
    mem = int(cmd_data["-M"]) // 1000000
    extra_options = {}
    if status == "MEMKILL":
        mem += 2
        message = ("Job %s relaunched with increased memory limit: %s GB\n" %
                   (cmd_data["-J"], mem))
    else:
        extra_options["queue"] = "long"
        message = "Job %s relaunched with queue long\n" % cmd_data["-J"]

    script = generate_bsub_file.BSUB_script(command=cmd_data["cmd"],
                                            mem_in_GB=mem,
                                            name=cmd_data["-J"],
                                            log_file=cmd_data["-o"],
                                            error_file=cmd_data["-e"],
                                            **extra_options)
    generate_bsub_file.run_job(script)
    sys.stdout.write(message)
    # Remove the previous run's log and error files.
    shell_command.shell_command("rm %s" % cmd_data["-o"])
    shell_command.shell_command("rm %s" % cmd_data["-e"])
示例#20
0
def insert_blast_table(input_file, table_name):
    """Create ``temp_tables.<table_name>`` and bulk-load a tabular BLAST file.

    The table has the twelve columns of BLAST tabular output (query id,
    subject id, % identity, alignment length, mismatches, gap opens, q. start,
    q. end, s. start, s. end, evalue, bit score).  The file is loaded through
    the ``mysql`` command-line client with ``LOAD DATA LOCAL INFILE``, using
    the password in the ``SQLPSW`` environment variable.

    NOTE(review): ``table_name`` is interpolated into SQL and into a shell
    command without escaping; only pass trusted values.

    :param input_file: BLAST result file, relative to the current directory
    :param table_name: name of the table created inside ``temp_tables``
    :raises KeyError: if ``SQLPSW`` is not set in the environment
    """

    from chlamdb.biosqldb import manipulate_biosqldb
    import shell_command
    import os

    server, db = manipulate_biosqldb.load_db('chlamydia_12_15')
    sql_profiles_table = 'CREATE TABLE IF NOT EXISTS temp_tables.%s (' \
                         ' query_id varchar(100), ' \
                         ' subject_id varchar(100), ' \
                         ' identity float, ' \
                         ' alignment_length int, ' \
                         ' mismatches int, ' \
                         ' gap_opens int, ' \
                         ' q_start int, ' \
                         ' q_end int, ' \
                         ' s_start int, ' \
                         ' s_end int, ' \
                         ' evalue float, ' \
                         ' bit_score float)' % (table_name)
    try:
        print(sql_profiles_table)
        server.adaptor.execute(sql_profiles_table)
    except Exception as error:
        # Still best-effort (the load below is attempted anyway), but the
        # cause is now shown and Ctrl-C is no longer swallowed.
        print('problem creating the sql table')
        print(error)

    path = os.path.join(os.getcwd(), input_file)

    sqlpsw = os.environ['SQLPSW']
    cmd_template = 'mysql -uroot -p%s biosqldb -e \'LOAD DATA LOCAL INFILE "%s" INTO TABLE temp_tables.%s;\''
    cmd = cmd_template % (sqlpsw, path, table_name)
    # Echo the command with the password masked: it used to be printed in
    # clear text.
    print(cmd_template % ('***', path, table_name))
    out, err, code = shell_command.shell_command(cmd)
    print(out)
    print(err)
    print(code)
示例#21
0
    def launch(self):
        """Write this object to a ``.sub`` file and submit it with ``bsub``.

        The file name is ``id_generator(24) + ".sub"`` and the content is
        ``str(self)``.  The file is left on disk after submission.

        :return: the job id, i.e. the first run of digits in bsub's output,
            as an int
        :raises Exception: if bsub exits with a non-zero status, or if its
            output contains no job id
        """
        file_name = id_generator(24) + ".sub"
        with open(file_name, "w") as f:
            f.write(str(self))

        command = 'bsub < ' + file_name
        print('command:', command)
        (stdout, stderr, return_code) = shell_command(command)
        print("out:", stdout, "err:", stderr, "code:", return_code)
        if return_code != 0:
            print(command)
            raise Exception(
                'bsub submission command failed with exit status: ' +
                str(return_code))

        print("Job submitted:", stdout)
        match = re.search(r"\d+", stdout)
        if match is None:
            # Used to crash with AttributeError on `None.group`.
            raise Exception('no job id found in bsub output: %r' % (stdout,))
        job_id = match.group(0)
        print(job_id)
        return int(job_id)
示例#22
0
    def __init__(self,
                 genbank,
                 database,
                 tabulated_blast_file=False,
                 samtools_depth=None,
                 gc=True,
                 accession2classification=False,
                 identity_cutoff=50,
                 query_coverage_cutoff=0.6,
                 evalue_cutoff=0.00005,
                 execute_blast=True,
                 blast_tab_file=False):
        """Build a circos configuration for a GenBank file and its BLAST hits.

        Only the beginning of this constructor is visible in this listing, so
        this text describes the visible statements only.

        :param genbank: GenBank file; its name up to the first dot is the
            prefix of the generated ``.faa`` and ``.fna`` files
        :param database: BLAST database, forwarded to ``self.execute_blast``
            and ``self.blast2barplot``
        :param tabulated_blast_file: not used in the visible code
        :param samtools_depth: depth file; if given, median contig depth is
            computed and a depth track is added
        :param gc: if true, GC variation and GC skew tracks are added
        :param accession2classification: forwarded to ``self.blast2barplot``
        :param identity_cutoff: forwarded to ``self.blast2barplot``
        :param query_coverage_cutoff: forwarded to ``self.blast2barplot``
        :param evalue_cutoff: forwarded to ``self.blast2barplot``
        :param execute_blast: if true, run BLASTP before building the tracks
        :param blast_tab_file: BLAST result file; defaults to
            ``"blast_result.tab"`` when false
        """

        import gbk2circos
        import shell_command
        from Bio import SeqIO

        # NOTE(review): circos_utils is not imported in this block; presumably
        # it is a module-level import - confirm.
        self.reference_records = [
            i for i in SeqIO.parse(open(genbank), 'genbank')
        ]
        self.contig_list = [record.name for record in self.reference_records]
        self.contig2gc_percent = circos_utils.contig2gc(self.reference_records)

        if samtools_depth:
            self.contig2median_depth = circos_utils.get_median_contig_coverage(
                self.contig_list, samtools_depth)
        else:
            self.contig2median_depth = False

        self.records2locus_tag2start_stop = circos_utils.records2locus_tag2start_stop(
            self.reference_records)
        # coordinates to get concatenated chromosome coord
        self.contigs_add = circos_utils.get_contigs_coords(
            self.reference_records)

        if not blast_tab_file:
            self.blast_tab_file = "blast_result.tab"
        else:
            self.blast_tab_file = blast_tab_file
        self.fasta_aa = genbank.split(".")[0] + '.faa'
        self.fasta_nucl = genbank.split(".")[0] + '.fna'
        print 'extractint aa and fna sequences...'
        # The results of both conversion scripts are not checked.
        a, b, c = shell_command.shell_command("gbk2fna.py -i %s -o %s" %
                                              (genbank, self.fasta_nucl))
        a, b, c = shell_command.shell_command("gbk2faa.py -i %s -f -o %s" %
                                              (genbank, self.fasta_aa))

        self.fasta_aa_records = [
            i for i in SeqIO.parse(open(self.fasta_aa), 'fasta')
        ]
        self.fasta_nucl_records = [
            i for i in SeqIO.parse(open(self.fasta_nucl), 'fasta')
        ]

        #else:
        #    self.blast_tab_file = tabulated_blast_file

        if execute_blast:
            print 'executing BLASTP...'
            self.execute_blast(self.fasta_aa, database)

        self.circos_reference = gbk2circos.Circos_config(
            "circos_contigs.txt",
            show_ideogram_labels="yes",
            radius=0.7,
            show_tick_labels="yes",
            show_ticks="yes")

        self.first_contig, self.last_contig = circos_utils.get_karyotype_from_gbk_or_fasta_single_ref(
            self.reference_records, out="circos_contigs.txt")

        class_colors = self.blast2barplot(
            self.fasta_aa_records,
            database,
            self.blast_tab_file,
            bar_file="circos.bar",
            accession2classification=accession2classification,
            identity_cutoff=identity_cutoff,
            query_coverage_cutoff=query_coverage_cutoff,
            evalue_cutoff=evalue_cutoff)

        countour_col = 'black'

        # Without class colours a single orange histogram is drawn; otherwise
        # an extra "best_hit.bar" track is added with the class colours.
        if not class_colors:
            histo_colors = 'orange'

            self.last_track = 0.7
        else:
            histo_colors = ''

            for class_col in class_colors:
                print class_col, class_colors[class_col]
                histo_colors += '%s,' % class_colors[class_col]
            # drop the trailing comma
            histo_colors = histo_colors[0:-1]
            self.circos_reference.add_plot("best_hit.bar",
                                           type="histogram",
                                           r0="0.65r",
                                           r1="0.67r",
                                           color=histo_colors,
                                           fill_color=histo_colors,
                                           thickness=0,
                                           min=0,
                                           max=50)
            self.last_track = 0.65

        add = '''
                <axes>
                <axis>
                spacing   = 0.1r
                color     = lgrey
                thickness = 2
                </axis>
                </axes>
                '''
        self.circos_reference.add_plot("circos.bar",
                                       type="histogram",
                                       r0="0.7r",
                                       r1="0.9r",
                                       color=countour_col,
                                       fill_color=histo_colors,
                                       thickness=1,
                                       min="0",
                                       max="50",
                                       rules=add)

        if samtools_depth:
            #for i, depth_file in enumerate(samtools_depth):
            all_contigs_median = circos_utils.samtools_depth2circos_data(
                samtools_depth, self.contigs_add, 1)
            # cutoffs are set to half and twice the overall median
            self.add_samtools_depth_track(
                'circos_samtools_depth_1.txt',
                lower_cutoff=int(all_contigs_median) / 2,
                top_cutoff=int(all_contigs_median) * 2)

        if gc:
            import GC
            from Bio import SeqIO

            fasta_records = list(SeqIO.parse(self.fasta_nucl, 'fasta'))

            out_var_file = ('circos_GC_var.txt')
            out_skew_file = ('circos_GC_skew.txt')
            f = open(out_var_file, 'w')
            g = open(out_skew_file, 'w')

            out_var = ''
            out_skew = ''
            for record in fasta_records:
                # this function handle scaffolds (split sequence when encountering NNNNN regions)
                out_var += GC.circos_gc_var(
                    record, 1000, shift=self.contigs_add[record.name][0])

                out_skew += GC.circos_gc_skew(
                    record, 1000, shift=self.contigs_add[record.name][0])
            #print out_skew
            f.write(out_var)
            g.write(out_skew)
            f.close()
            g.close()

            # colour rules for the skew track: red below zero, blue above
            rule = """<rule>
                    condition          = var(value) < 0
                    fill_color         = lred
                    color = red
                    </rule>

                    <rule>
                    condition          = var(value) > 0
                    fill_color         = lblue
                    color = blue
                    </rule>
            """

            # colour rules for the variation track: green below zero, blue above
            rule2 = """<rule>
                    condition          = var(value) < 0
                    fill_color         = lgreen
                    color = green
                    </rule>

                    <rule>
                    condition          = var(value) > 0
                    fill_color         = lblue
                    color = blue
                    </rule>
            """

            # Both GC tracks are placed inside the innermost track so far.
            conditions = self.circos_reference.template_rules % (rule)
            self.circos_reference.add_plot('circos_GC_skew.txt',
                                           fill_color="green",
                                           r1="%sr" % (self.last_track - 0.02),
                                           r0="%sr" % (self.last_track - 0.1),
                                           type="line",
                                           rules=conditions)
            conditions = self.circos_reference.template_rules % (rule2)
            self.circos_reference.add_plot('circos_GC_var.txt',
                                           fill_color="green",
                                           r1="%sr" % (self.last_track - 0.12),
                                           r0="%sr" % (self.last_track - 0.2),
                                           type="line",
                                           rules=conditions)
            self.last_track = self.last_track - 0.12

        self.config = self.circos_reference.get_file()

        self.brewer_conf = """
示例#23
0
def main(input_reference,
         input_queries_folder,
         blast_file,
         mlst_scheme,
         input_gbk,
         skip_parsnp=False,
         skip_blast=False,
         input_tree=None,
         skip_mlst=False,
         check_overlap=False,
         id_cutoff=80,
         blast_type='tblastn',
         reference_accession='-',
         blast_filter=False,
         show_identity_values=True,
         accession2description=False):
    """Build a phylogeny, type the genomes with mlst, BLAST and plot.

    Steps, in order:

    1. unless ``skip_parsnp`` is set or ``input_tree`` is given, run parsnp on
       the genome folder and write a cleaned copy of its tree to
       ``reference_parsnp_phylogeny/parsnp_edit.tree``;
    2. unless ``skip_mlst`` is set, run ``mlst`` on all genome FASTA files and
       write ``mlst_results/mlst.tab``;
    3. unless ``skip_blast`` is set, format every genome with ``formatdb``,
       run tblastn or blastn with ``blast_file`` as query, and keep the first
       hit line of each query in ``blastp_results/uniq_blast_<genome>.tab``;
    4. pass the tree, hit files, descriptions and mlst types to
       ``plot_blast_result``.

    ``mlst_results/mlst.tab`` is parsed even when ``skip_mlst`` is set, so it
    must already exist in that case.

    :param input_reference: reference genome passed to parsnp with ``-r``
    :param input_queries_folder: folder searched for ``*fna``, then ``*ffn``
    :param blast_file: FASTA file of query sequences; record order is kept
    :param mlst_scheme: scheme name passed to ``mlst --scheme``
    :param input_gbk: GenBank file used for descriptions when
        ``accession2description`` is not given
    :param skip_parsnp: do not run parsnp
    :param skip_blast: do not run formatdb/BLAST
    :param input_tree: tree used instead of the parsnp phylogeny
    :param skip_mlst: do not run mlst
    :param check_overlap: forwarded to ``plot_blast_result``
    :param id_cutoff: forwarded to ``plot_blast_result``
    :param blast_type: ``'tblastn'`` or ``'blastn'``
    :param reference_accession: forwarded to ``plot_blast_result``
    :param blast_filter: if set, passed to
        ``get_accession_filter_from_blast_list`` to build a hit filter
    :param show_identity_values: forwarded to ``plot_blast_result``
    :param accession2description: optional tab-separated file whose first two
        columns are an id and its description
    :raises IOError: if the folder contains no ``*fna`` or ``*ffn`` file
    :raises ValueError: if BLAST must run and ``blast_type`` is unsupported
    """

    import shell_command
    import os

    import sys
    import glob
    from Bio import SeqIO
    sys.stdout.write('Building tree using parsnp...\n')

    wd = os.getcwd()

    ordered_queries = [
        record.name for record in SeqIO.parse(blast_file, 'fasta')
    ]

    fasta_folder = os.path.abspath(input_queries_folder)
    reference_file = os.path.abspath(input_reference)

    if blast_filter:
        accession2hit_filter = get_accession_filter_from_blast_list(
            blast_filter)
    else:
        accession2hit_filter = False

    fasta_files = glob.glob(fasta_folder + '/*fna')
    if not fasta_files:
        fasta_files = glob.glob(fasta_folder + '/*ffn')
        if not fasta_files:
            # `raise ('...')` raised a plain string, itself a TypeError.
            raise IOError('could not find fasta files')

    # Fail before the long-running steps instead of inside the BLAST loop.
    if not skip_blast and blast_type not in ('tblastn', 'blastn'):
        raise ValueError('unsupported blast type')

    reference_phylogeny_folder = os.path.join(wd, 'reference_parsnp_phylogeny')
    reference_phylogeny = os.path.join(reference_phylogeny_folder,
                                       'parsnp_edit.tree')
    out_mlst = os.path.join(wd, 'mlst_results/mlst.tab')
    if not skip_parsnp and not input_tree:
        cmd = 'parsnp -r %s -d %s -p 6 -c -o parsnp_tree' % (reference_file,
                                                             fasta_folder)
        out, err, code = shell_command.shell_command(cmd)
        # Single-argument print calls: same output on Python 2 and Python 3.
        print("%s %s" % (code, err))
        print(out)

        if not os.path.exists(reference_phylogeny_folder):
            os.mkdir(reference_phylogeny_folder)
        sys.stdout.write('Editing parsnp phylogeny...\n')

        # Clean the leaf labels: drop ".fna", single quotes and ".ref".
        parsnp_raw_phylogeny = os.path.join(wd, 'parsnp_tree/parsnp.tree')
        cmd = 'cat %s | sed  "s/.fna//g"> %s' % (parsnp_raw_phylogeny,
                                                 reference_phylogeny)
        out, err, code = shell_command.shell_command(cmd)
        cmd = '''sed -i "s/\'//g" %s''' % reference_phylogeny
        out, err, code = shell_command.shell_command(cmd)
        cmd = '''sed -i "s/.ref//" %s''' % (reference_phylogeny)
        out, err, code = shell_command.shell_command(cmd)

    os.chdir(wd)
    if not skip_mlst:
        sys.stdout.write('Identifying mlst...\n')
        if not os.path.exists(os.path.join(wd, 'mlst_results')):
            os.mkdir(os.path.join(wd, 'mlst_results'))
        all_fasta = ' '.join(fasta_files)

        cmd = 'mlst --quiet --nopath --scheme %s %s > %s' % (
            mlst_scheme, all_fasta, out_mlst)
        out, err, code = shell_command.shell_command(cmd)
        cmd = 'sed -i "s/.fna//g" %s' % out_mlst
        out, err, code = shell_command.shell_command(cmd)
        cmd = 'sed -i "s/.ffn//g" %s' % out_mlst
        out, err, code = shell_command.shell_command(cmd)

    if input_tree:
        reference_phylogeny = input_tree

    sys.stdout.write('Blasting %s...\n' % blast_file)

    blast_file_fullpath = os.path.join(wd, os.path.basename(blast_file))

    blastp_folder = os.path.join(wd, 'blastp_results')
    if not os.path.exists(blastp_folder):
        os.mkdir(blastp_folder)

    fasta_file_name2fasta_header = {}
    blast_best_hit_results = []

    os.chdir(input_queries_folder)
    try:
        for genome_file in fasta_files:
            file_name = os.path.basename(genome_file).split('.')[0]
            header_name = SeqIO.read(genome_file, 'fasta').name
            fasta_file_name2fasta_header[file_name] = header_name

            out_file = 'blast_' + file_name
            outpath = os.path.join(blastp_folder, out_file + '.tab')
            best_hit_path = os.path.join(blastp_folder,
                                         'uniq_' + out_file + '.tab')
            blast_best_hit_results.append(best_hit_path)
            if skip_blast:
                continue

            cmd = 'formatdb -i %s -p F' % genome_file
            out, err, code = shell_command.shell_command(cmd)
            if blast_type == 'tblastn':
                from Bio.Blast.Applications import NcbitblastnCommandline

                # NOTE(review): presumably this runs in a child shell, in
                # which case the export cannot reach the BLAST call below -
                # confirm against shell_command.
                out, err, code = shell_command.shell_command(
                    'export BLASTDB=$BLASTDB:%s' % blastp_folder)

                blastp_cline = NcbitblastnCommandline(
                    query=blast_file_fullpath,
                    db=genome_file,
                    evalue=0.001,
                    outfmt=6,
                    out=outpath)

            elif blast_type == 'blastn':
                from Bio.Blast.Applications import NcbiblastnCommandline
                blastp_cline = NcbiblastnCommandline(query=blast_file_fullpath,
                                                     db=genome_file,
                                                     evalue=0.001,
                                                     outfmt=6,
                                                     out=outpath)

            else:
                raise ValueError('unsupported blast type')

            stdout, stderr = blastp_cline()

            # Keep only the first line reported for each query id.  Both
            # handles are closed by the with-block (the result handle used to
            # stay open), and a set replaces the O(n) list lookup.
            seen_queries = set()
            with open(outpath, 'r') as result_handle, \
                    open(best_hit_path, 'w') as best_hit_handle:
                for line in result_handle:
                    query_id = line.split('\t')[0]
                    if query_id in seen_queries:
                        continue
                    seen_queries.add(query_id)
                    best_hit_handle.write(line)
    finally:
        # Always return to the starting directory, even if BLAST fails.
        os.chdir(wd)

    print(fasta_file_name2fasta_header)

    if accession2description:
        id2description = {}
        with open(accession2description, 'r') as f:
            for row in f:
                data = row.rstrip().split('\t')
                id2description[data[0]] = data[1]
    else:
        id2description = gbk2accessiontodefinition.get_coressp(input_gbk)

    accession2st_type = parse_mlst_results(out_mlst)

    plot_blast_result(reference_phylogeny,
                      blast_best_hit_results,
                      id2description,
                      accession2st_type,
                      check_overlap,
                      ordered_queries,
                      fasta_file_name2fasta_header,
                      id_cutoff,
                      reference_accession=reference_accession,
                      accession2hit_filter=accession2hit_filter,
                      show_identity_values=show_identity_values)
示例#24
0
def _prokka_locus_tag(source, used_tags):
    '''
    Derive a locus tag from an organism name and make it unique.

    The tag is "P" followed by initials taken from the organism name:
    - three words or more: first letter of word 1, first two letters of
      word 2, first letter of word 3;
    - two words: first two letters of word 1, first letter of word 2;
    - one word: its first three letters.
    When the tag is already in *used_tags*, a counter starting at 2 is
    appended to the base tag until an unused tag is found. The returned tag
    is added to *used_tags*.

    :param source: organism name (single quotes are removed)
    :param used_tags: set of tags already handed out; updated in place
    :return: the new, unique locus tag
    :raises ValueError: if *source* yields no letters to build a tag from
    '''
    import re

    organism = re.sub('\'', '', source).split(' ')
    # slices instead of indexes so that an empty word cannot raise IndexError
    if len(organism) > 2:
        base_tag = "P%s%s%s" % (organism[0][0:1], organism[1][0:2],
                                organism[2][0:1])
    elif len(organism) == 2:
        base_tag = "P%s%s" % (organism[0][0:2], organism[1][0:1])
    else:
        base_tag = "P%s" % organism[0][0:3]
    if base_tag == "P":
        raise ValueError(
            'cannot build a locus tag from organism name %r' % source)

    # always append the counter to the base tag (PXyz2, PXyz3, ...) rather
    # than to the previous candidate
    locus_tag = base_tag
    count = 2
    while locus_tag in used_tags:
        locus_tag = '%s%s' % (base_tag, count)
        count += 1
    used_tags.add(locus_tag)
    return locus_tag


def _compare_cds(reference, reannotated):
    '''
    Count the CDS of two records and how many new CDS are unchanged.

    A CDS of *reannotated* is counted as identical when its ``translation``
    qualifier equals the ``translation`` qualifier of at least one CDS of
    *reference*. CDS without a ``translation`` qualifier are counted but
    never considered identical.

    :return: tuple (ref_locus_tag, ref_n_CDS, new_n_CDS, n_CDS_identical);
        ref_locus_tag is the part before the first "_" of the locus_tag of
        the first reference CDS ('' when unavailable)
    '''
    ref_locus_tag = ''
    ref_cds = 0
    ref_translations = set()
    for feature in reference.features:
        if feature.type != 'CDS':
            continue
        if ref_cds == 0:
            ref_locus_tag = feature.qualifiers.get(
                'locus_tag', [''])[0].split('_')[0]
        ref_cds += 1
        if 'translation' in feature.qualifiers:
            # qualifier values are lists: store them as hashable tuples
            ref_translations.add(tuple(feature.qualifiers['translation']))

    new_cds = 0
    identical_cds = 0
    for feature in reannotated.features:
        if feature.type != 'CDS':
            continue
        new_cds += 1
        translation = feature.qualifiers.get('translation')
        if translation is not None and tuple(translation) in ref_translations:
            identical_cds += 1
    return ref_locus_tag, ref_cds, new_cds, identical_cds


def prokka_reannotation(seq_record_list, compare=False):
    '''
    Reannotate seq record using prokka
    INPUT: list of seqrecord objects (one/sequence to reannotate)

    For every record:
    - a locus tag is derived from ``record.annotations['source']`` (unique
      within one call),
    - the sequence is written to ``temp_genome.fna`` in the current
      directory and annotated by running ``prokka`` through
      ``shell_command.shell_command`` (output directory = locus tag),
    - the GenBank file ``<locus_tag>/<locus_tag>_<mmddyyyy>.gbk`` is read
      back and the id, name, annotations, description, dbxrefs and first
      feature of the input record are copied onto it.
    Records whose sequence consists only of 'N' are skipped.

    Side effects: ``reannotation_prokka_log.txt`` is (re)created in the
    current directory, and a "reannotated" note is added to the ``comment``
    annotation of every input record (the annotation dictionary of the input
    record is modified in place and shared with the returned record).

    :param seq_record_list: list of Bio.SeqRecord.SeqRecord objects
    :param compare: when true, one row per record is written to the log
        with the CDS counts of the reference and of the new annotation and
        the number of new CDS with an unchanged translation
    :return: list of reannotated records, in input order
    :raises ValueError: when no locus tag can be derived from the organism
    :raises IOError: when prokka exits with a non-zero code
    '''

    import datetime
    from Bio.SeqRecord import SeqRecord
    from Bio import SeqIO
    import gbk_check
    import shell_command

    used_locus_tags = set()
    reanotated_gbk_list = []

    print('reannotation:')
    for i, record in enumerate(seq_record_list):
        print('%s %s' % (i, record))

    # the context manager guarantees the log is flushed and closed, even
    # when one of the records fails
    with open('reannotation_prokka_log.txt', 'w') as log:
        log.write(
            'reference_name\tnew_name\taccession\tref_locus_tag\tnew_locus_tag\tref_n_CDS\tnew_n_CDS\tn_CDS_identical\n'
        )

        for record in seq_record_list:
            print('####### one record #########')
            print(record)
            print('')
            print('####### features ###########')
            print(record.features)
            print('########## length ########## %s' % len(record))
            assert isinstance(record, SeqRecord), \
                'only SeqRecord objects can be reannotated'

            record_annotations = record.annotations
            try:
                record_annotations[
                    'comment'] += '\nGenome reannotated using PROKKA version 1.1'
            except KeyError:
                record_annotations[
                    'comment'] = 'Genome reannotated using PROKKA version 1.1'
            record_name = record.name
            record_id = record.id
            record_description = gbk_check.clean_description(
                record.description)
            record_dbxrefs = record.dbxrefs

            # create a unique locus tag based on genus and species name
            locus_tag = _prokka_locus_tag(record_annotations['source'],
                                          used_locus_tags)

            # write fasta and annotate it using prokka
            if str(record.seq) == len(record.seq) * 'N':
                print('No sequence in record %s' % record.id)
                continue
            with open('temp_genome.fna', 'w') as f:
                f.write('>temp_seq\n%s' % record.seq)

            cmd = 'prokka --force --kingdom Bacteria --compliant --centre CHUV --locustag %s --outdir %s -genus temp_genus -strain temp_strain temp_genome.fna' % (
                locus_tag, locus_tag)

            # expected name of the GenBank file produced by prokka
            today = datetime.date.today()
            date = today.strftime('%m%d%Y')
            prokka_genbank = '%s/%s_%s.gbk' % (locus_tag, locus_tag, date)

            out, err, code = shell_command.shell_command(cmd)
            print(out)
            if code != 0:
                # with --force an older output of the same day could still
                # be present: never read it after a failed run
                raise IOError(
                    'prokka failed with exit code %s for record %s:\n%s' %
                    (code, record_id, err))

            reanotated_gbk = SeqIO.read(prokka_genbank, "genbank")

            # restore the identity of the original record
            reanotated_gbk.id = record_id
            reanotated_gbk.name = record_name
            reanotated_gbk.annotations = record_annotations
            reanotated_gbk.description = record_description
            reanotated_gbk.dbxrefs = record_dbxrefs
            reanotated_gbk.features[0] = record.features[0]

            # count number of identical ORF
            if compare:
                ref_locus_tag, ref_CDS, new_CDS, identical_CDS = _compare_cds(
                    record, reanotated_gbk)
                accessions = record_annotations.get('accessions') or [record_id]
                # one value per column of the header written above
                log.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                          (record_name, prokka_genbank, accessions[0],
                           ref_locus_tag, locus_tag, ref_CDS, new_CDS,
                           identical_CDS))
            reanotated_gbk_list.append(reanotated_gbk)
    return reanotated_gbk_list
示例#25
0
    # Write the updated records to a temporary EMBL file named after the
    # input file (everything before its first '.').
    out_name = args.input.split(".")[0] + "_locus.tmp"
    handle2 = open(out_name, "w")
    SeqIO.write(updated_records, handle2, "embl")
    handle2.close()
    # Final EMBL output, filled line by line below.
    # NOTE(review): out_handle is not closed in the lines visible here -
    # confirm it is closed (or flushed) elsewhere.
    out_handle = open(args.input.split(".")[0] + "_locus.embl", "w")
    # Refuse to continue when type_list contains a feature type other than
    # CDS, tRNA, rRNA or source.
    for type in set(type_list):
        if type not in ["CDS", "tRNA", "rRNA", 'source']:
            raise IOError('Unexpected feature type: %s' % type)
    # replace specific rows
    # i counts the 'AC   XXX;' placeholders handled so far and indexes
    # contig_list.
    i = 0
    with open(args.input.split(".")[0] + "_locus.tmp") as f:
        # match is True while the previous line was an 'AC   XXX;' placeholder
        match = False
        for line in f:
            if match:
                # append an 'AC * _Contig_<n>_<name>' line and an 'XX'
                # separator right after the line that follows the placeholder
                line += 'AC * _Contig_%s_%s\nXX\n' % (i + 1, contig_list[i])
                i += 1
                match = False
            if 'ID   ' in line:
                # replace the whole ID line by a fixed placeholder
                line = 'ID   XXX; XXX; linear; XXX; XXX; XXX; XXX.\n'
            if 'AC   XXX;' in line:
                # blank the accession and remember to extend the next line
                line = 'AC   ;\n'
                match = True
            if 'RA   XXX;' in line:
                # add an empty RT line after the RA line
                line += 'RT   ;\n'
            if 'OS   .' in line or 'OC   .' in line:
                # lines containing 'OS   .' or 'OC   .' are dropped
                continue

            out_handle.write(line)
    # delete the temporary file through the shell
    import shell_command
    shell_command.shell_command('rm %s' %
                                (args.input.split(".")[0] + "_locus.tmp"))
示例#26
0
def gc_coverage_plot(samtool_depth_file,
                     contigs_file,
                     blast_file=False,
                     column1=1,
                     column2=2,
                     main=False,
                     highlight=False):
    '''
    Draw bubble plots of GC content versus sequencing depth, one bubble per
    contig, with R.

    Steps:
    - ``infoseq`` is run through ``shell_command.shell_command`` to write
      the name, length and GC percentage of every sequence of
      *contigs_file* to ``/tmp/gc.tab``;
    - R reads *samtool_depth_file* (gzipped or not) and takes, for every
      value of its first column, the median of its third column as the
      depth of the contig;
    - the merged table is written to ``gc_coverage_table.tab`` and plotted
      to ``gc_cov_buble.svg`` (all contigs) and ``gc_cov_buble_2m.svg``
      (contigs with a depth below 4 times the depth of the longest contig).
      All three files are written to the current directory. Bubble size is
      the contig length.

    The R code is executed with ``robjects.r``; ``robjects`` is not imported
    here and must exist at module level (presumably ``rpy2.robjects`` -
    confirm against the module imports).

    :param samtool_depth_file: depth table, plain or gzipped
    :param contigs_file: sequence file handed to infoseq
    :param blast_file: optional tab-separated file; its columns 2 and 6 are
        used as contig name and taxon, bubbles are coloured by taxon and
        contigs without a match get the taxon 'undefined'. When given,
        *highlight* is ignored.
    :param column1: unused
    :param column2: unused
    :param main: plot title; defaults to the base name of
        *samtool_depth_file*
    :param highlight: optional path of a table whose first column lists
        contig names to draw in blue and to label (only without
        *blast_file*)
    '''

    import os
    if not main:
        main = os.path.basename(samtool_depth_file)

    import shell_command

    out, err, code = shell_command.shell_command(
        "infoseq -auto -only -Name -length -pgc %s > /tmp/gc.tab" %
        contigs_file)

    print(out)
    print(err)
    print(code)

    # R snippets spliced into the main script below: the first one colours
    # the highlighted contigs of the full table, the second one selects the
    # highlighted contigs of the depth-filtered table
    if highlight:
        highlight_code = """
         gc_coverage_table$color <- rep(rgb(1, 0, 0,0.5), length(gc_coverage_table[,1]))
        highlight_table <- read.table("%s", header=FALSE)
        m <- match(highlight_table[,1], gc_coverage_table$Name)
        gc_coverage_subset <- gc_coverage_table[m,]
        print("subset")
        print(m)
        gc_coverage_table[m,]$color<-rgb(0, 0, 1,0.5)

        """ % highlight

        highlight_code2 = """

        m <- match(highlight_table[,1], gc_coverage_table_2m$Name)
        print("subset m2")
        print(m)
        gc_coverage_subset2 <- gc_coverage_table_2m[m,]

        """

    else:
        highlight_code = ''
        highlight_code2 = ''

    print('high', highlight)

    if not blast_file:
        # "%%" is a literal "%" for R once the Python formatting is applied
        robjects.r("""

        #library(Cairo)
        library(R.utils)




        if (isGzipped("%s")){
            print('Gzipped file')
            all_depth <- read.table(gzfile('%s'), header=FALSE)
        }else{
            print('Not Gzipped')
            all_depth <- read.table('%s', header=FALSE)
        }


        contigs_depth<- aggregate(all_depth["V3"],b=all_depth[c("V1")],FUN=median)
        contigs_gc <- read.table("/tmp/gc.tab", header=TRUE)

        gc_coverage_table <-cbind(contigs_gc,coverage=contigs_depth[match(contigs_gc$Name, contigs_depth$V1),2])
        #w<-which(gc_coverage_table$Length >=1000)
        #gc_coverage_table <- gc_coverage_table[w,]

        

        write.table(gc_coverage_table, 'gc_coverage_table.tab', sep="\t", row.names=F)


    %s

     svg("gc_cov_buble.svg", width = 12, height = 12)
         symbols(x=gc_coverage_table[,3], y= gc_coverage_table[,4], circles=gc_coverage_table[,2], inches=1/3, ann=T,
                 bg=rgb(1, 0, 0,0.5), fg=rgb(1, 0, 0,0.5), main="%s", xlab="GC(%%)", ylab="Sequencing depth")
         if (any("gc_coverage_subset" %%in%% ls())) {
             symbols(x=gc_coverage_table[,3], y= gc_coverage_table[,4], circles=gc_coverage_table[,2], inches=1/3,
                     ann=T, bg=gc_coverage_table$color, fg=gc_coverage_table$color, add = TRUE)
             l <- gsub('(^[^_]+_[^_]+)_(.*)$', '\\\\1', gc_coverage_subset$Name)
             text(x=gc_coverage_subset[,3], y=gc_coverage_subset[,4], labels = l)
         }else{
            print ('a')
         }

         dev.off()

         cov_biggest <- gc_coverage_table[which(gc_coverage_table$Length==max(gc_coverage_table$Length)),4]
         print('cov biggest:')
         print(cov_biggest)
         w <- which(gc_coverage_table[,4]< (4*cov_biggest))
         gc_coverage_table_2m <- gc_coverage_table[w,]


         %s

         svg("gc_cov_buble_2m.svg", width = 12, height = 12)
            symbols(x=gc_coverage_table_2m[,3], y= gc_coverage_table_2m[,4], circles=gc_coverage_table_2m[,2],
                    inches=1/3, ann=T, bg=rgb(1, 0, 0,0.5), fg=rgb(1, 0, 0,0.5), main="%s", xlab="GC(%%)", ylab="Sequencing depth")

            if (any("gc_coverage_subset" %%in%% ls())) {

                symbols(x=gc_coverage_table_2m[,3], y= gc_coverage_table_2m[,4], circles=gc_coverage_table_2m[,2],
                        inches=1/3, ann=T, bg=gc_coverage_table_2m$color, fg=gc_coverage_table_2m$color, add = TRUE)
                l <- gsub('(^[^_]+_[^_]+)_(.*)$', '\\\\1', gc_coverage_subset2$Name)
                text(x=gc_coverage_subset2[,3], y=gc_coverage_subset2[,4], labels = l)
            }else{
                print ('a')
            }

     dev.off()



                   """ %
                   (samtool_depth_file, samtool_depth_file, samtool_depth_file,
                    highlight_code, main, highlight_code2, main))
    else:

        # The second plot takes its colours from gc_coverage_table_2m, the
        # table that is actually plotted: colours taken from the unfiltered
        # table would not line up with the remaining rows.
        robjects.r("""

        #library(Cairo)
        library(R.utils)

        if (isGzipped("%s")){
            print('Gzipped file')
            all_depth <- read.table(gzfile('%s'), header=FALSE)
        }else{
            print('Not Gzipped')
            all_depth <- read.table('%s', header=FALSE)
        }

        blast_file <- read.table("%s", header=FALSE, sep="\t")[,c(2,6)]
        contigs_depth<- aggregate(all_depth["V3"],b=all_depth[c("V1")],FUN=median)
        contigs_gc <- read.table("/tmp/gc.tab", header=TRUE)

        gc_coverage_table <-cbind(contigs_gc,coverage=contigs_depth[match(contigs_gc$Name, contigs_depth$V1),2])
        #w<-which(gc_coverage_table$Length >=1000)
        #gc_coverage_table <- gc_coverage_table[w,]

        gc_coverage_table$taxon <- blast_file[,2][match(gc_coverage_table$Name, blast_file[,1])]
        print (is.na(gc_coverage_table$taxon))
        gc_coverage_table$taxon <- as.character(gc_coverage_table$taxon)
        gc_coverage_table$taxon[is.na(gc_coverage_table$taxon)] <- 'undefined'
        gc_coverage_table$taxon <- as.factor(gc_coverage_table$taxon)
        
        write.table(gc_coverage_table, 'gc_coverage_table.tab', sep="\t", row.names=F)

         svg("gc_cov_buble.svg", width = 12, height = 12,)
            symbols(x=gc_coverage_table[,3], y= gc_coverage_table[,4], circles=gc_coverage_table[,2], inches=1/3,
                    ann=F, bg=gc_coverage_table$taxon, fg=gc_coverage_table$taxon, main="%s", xlab="GC(%%)", ylab="Sequencing depth")
         dev.off()

         cov_biggest <- gc_coverage_table[which(gc_coverage_table$Length==max(gc_coverage_table$Length)),4]
         print('cov biggest:')
         print(cov_biggest)
         w <- which(gc_coverage_table[,4]< (4*cov_biggest))
         gc_coverage_table_2m <- gc_coverage_table[w,]

         svg("gc_cov_buble_2m.svg", width = 12, height = 12,)
            symbols(x=gc_coverage_table_2m[,3], y= gc_coverage_table_2m[,4], circles=gc_coverage_table_2m[,2],
                    inches=1/3, ann=F, bg=gc_coverage_table_2m$taxon, fg=gc_coverage_table_2m$taxon, main="%s", xlab="GC(%%)", ylab="Sequencing depth")
         dev.off()



                   """ % (samtool_depth_file, samtool_depth_file,
                          samtool_depth_file, blast_file, main, main))
示例#27
0
def _contig_taxon_labels(taxonomy_file):
    '''
    Summarise a contig/taxon table as one plotting label per contig.

    Every row of *taxonomy_file* is split on whitespace; the first field is
    the contig and the second the taxon. A contig seen with a single taxon
    is labelled with that taxon; a contig seen with several taxa gets the
    label "<taxon> (<count>) /<taxon> (<count>)...". Labels carried by two
    contigs or fewer are replaced by 'rare_taxon'. Rows with fewer than two
    fields are ignored.

    :return: list of [contig, label] pairs
    '''
    contigs2taxon2count = {}
    with open(taxonomy_file, 'r') as f:
        for row in f:
            data = row.rstrip().split()
            if len(data) < 2:
                # blank or incomplete row: nothing to count
                continue
            taxon2count = contigs2taxon2count.setdefault(data[0], {})
            taxon2count[data[1]] = taxon2count.get(data[1], 0) + 1

    contig2label = []
    label2freq = {}
    for contig, taxon2count in contigs2taxon2count.items():
        if len(taxon2count) > 1:
            # more than one taxon
            label = ' /'.join('%s (%s)' % (taxon, taxon2count[taxon])
                              for taxon in taxon2count)
        else:
            label = list(taxon2count.keys())[0]
        contig2label.append([contig, label])
        label2freq[label] = label2freq.get(label, 0) + 1

    # frequencies are complete at this point: pool the rare labels
    for entry in contig2label:
        if label2freq[entry[1]] <= 2:
            entry[1] = 'rare_taxon'
    return contig2label


def gc_coverage_plot(contigs_file,
                     contig_depth_table=False,
                     samtool_depth_file=False,
                     blast_file=False,
                     column1=1,
                     column2=2,
                     main=False,
                     highlight=False,
                     taxonomy_file=False,
                     output_prefix=False):
    '''
    Draw bubble plots of GC content versus sequencing depth, one bubble per
    contig, with R (through rpy2).

    ``infoseq`` is run through ``shell_command.shell_command`` to write the
    name, length and GC percentage of every sequence of *contigs_file* to
    ``/tmp/gc.tab``. The depth of each contig comes either from
    *contig_depth_table* or from the per-contig median of the third column
    of *samtool_depth_file*.

    Without *blast_file* the following files are produced:
    - ``<output_prefix>gc_cov_buble.svg``: all contigs;
    - ``<output_prefix>gc_cov_buble_2m.svg``: contigs with a depth below 4
      times the depth of the longest contig;
    - with *taxonomy_file* only, ``<output_prefix>gc_cov_buble_test.svg``
      (contigs of at least 1000 bp) and
      ``<output_prefix>gc_cov_buble_test_2m.svg`` (depth-filtered contigs),
      both coloured by taxon label;
    - ``gc_coverage_table.tab`` in the current directory.
    With *blast_file*, ``gc_coverage_table.tab``, ``gc_cov_buble.svg`` and
    ``gc_cov_buble_2m.svg`` are written to the current directory;
    *output_prefix*, *contig_depth_table*, *taxonomy_file* labels and
    *highlight* are not used by that branch and *samtool_depth_file* is
    required.

    :param contigs_file: sequence file handed to infoseq
    :param contig_depth_table: optional tab-separated table without header,
        read as the two columns contig and depth
    :param samtool_depth_file: depth table, plain or gzipped; only read when
        *contig_depth_table* is not given (or when *blast_file* is given)
    :param blast_file: optional tab-separated file; its columns 2 and 6 are
        used as contig name and taxon, contigs without a match get the
        taxon 'undefined'
    :param column1: unused
    :param column2: unused
    :param main: plot title; defaults to the base name of *contigs_file*
    :param highlight: optional path of a table whose first column lists
        contig names to draw in blue and to label
    :param taxonomy_file: optional whitespace-separated contig/taxon table
    :param output_prefix: output directory of the svg files; a trailing "/"
        is added when missing. When not given the svg files are written to
        the current directory.
    '''

    import os
    import shell_command
    import rpy2.robjects as robjects
    from pandas import DataFrame
    import pandas
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()

    # the default (False) and '' both mean "current directory"
    if not output_prefix:
        output_prefix = ''
    elif output_prefix[-1] != '/':
        output_prefix += '/'
    print("output_prefix", output_prefix)

    if not main:
        main = os.path.basename(contigs_file)

    out, err, code = shell_command.shell_command(
        "infoseq -auto -only -Name -length -pgc %s > /tmp/gc.tab" %
        contigs_file)

    # NOTE(review): pandas2ri.py2ri is the rpy2 2.x spelling - confirm the
    # rpy2 version this module is run with.
    if contig_depth_table:

        contig_depth = pandas.read_csv(contig_depth_table,
                                       sep='\t',
                                       names=["contig", "depth"])
        robjects.r.assign('contigs_depth', pandas2ri.py2ri(contig_depth))
    else:
        # The R script below only reads samtool_depth_file when no
        # contigs_depth object exists: drop the one a previous call may have
        # left in the R session, otherwise its depths would be reused.
        robjects.r('if (exists("contigs_depth", inherits=FALSE)) rm(contigs_depth)')

    if taxonomy_file:
        contig2label = _contig_taxon_labels(taxonomy_file)

        df = DataFrame(contig2label, columns=['contig', 'label'])
        print(type(df["contig"]))
        print(type(df))
        robjects.r.assign('contig_labels', pandas2ri.py2ri(df))
    else:
        robjects.r.assign('contig_labels', False)

    # R snippets spliced into the main script below: the first one colours
    # the highlighted contigs of the full table, the second one selects the
    # highlighted contigs of the depth-filtered table
    if highlight:
        highlight_code = """
        gc_coverage_table$color <- rep(rgb(1, 0, 0,0.5), length(gc_coverage_table[,1]))
        highlight_table <- read.table("%s", header=FALSE)
        m <- match(highlight_table[,1], gc_coverage_table$Name)
        gc_coverage_subset <- gc_coverage_table[m,]
        print("subset")
        print(m)
        gc_coverage_table[m,]$color<-rgb(0, 0, 1,0.5)

        """ % highlight

        highlight_code2 = """

        m <- match(highlight_table[,1], gc_coverage_table_2m$Name)
        #print("subset m2")
        #print(m)
        gc_coverage_subset2 <- gc_coverage_table_2m[m,]

        """

    else:
        highlight_code = ''
        highlight_code2 = ''

    if not blast_file:
        # Notes on the R script:
        # - "%%" is a literal "%" for R once the Python formatting is applied;
        # - contig_labels is either a data frame or FALSE, hence the
        #   is.data.frame() test (a comparison of a whole data frame with
        #   FALSE is not a valid if() condition);
        # - the labels of the depth-filtered table are matched against that
        #   table itself, and labelled contigs that were filtered out are
        #   skipped.
        robjects.r("""

        #library(Cairo)
        library(R.utils)
        library(ggplot2)




        if (exists("contigs_depth")==FALSE){

            if (isGzipped("%s")){
                #print('Gzipped file')
                all_depth <- read.table(gzfile('%s'), header=FALSE)
            }else{
                #print('Not Gzipped')
                all_depth <- read.table('%s', header=FALSE)
            }

            contigs_depth<- aggregate(all_depth["V3"],b=all_depth[c("V1")],FUN=median)
            colnames(contigs_depth) <- c('contig', 'depth')
        }
        #print(contigs_depth)
        #print(contig_labels)
        contigs_gc <- read.table("/tmp/gc.tab", header=TRUE)

        gc_coverage_table <-cbind(contigs_gc,coverage=contigs_depth[match(contigs_gc$Name, contigs_depth$contig),2])
        #w<-which(gc_coverage_table$Length >=1000)
        #gc_coverage_table <- gc_coverage_table[w,]

         cov_biggest <- gc_coverage_table[which(gc_coverage_table$Length==max(gc_coverage_table$Length)),4]
         #print('cov biggest:')
         #print(cov_biggest)
         w <- which(gc_coverage_table[,4]< (4*cov_biggest))
         gc_coverage_table_2m <- gc_coverage_table[w,]

        if (is.data.frame(contig_labels)) {
            library(RColorBrewer)
            color_palette <- c('red', 'blue','green', brewer.pal(12,"Paired"), brewer.pal(12,"Set3"))
            m <- match(contig_labels$contig, gc_coverage_table$Name)

            gc_coverage_table$color <- rep("Unclassified", length(gc_coverage_table[,1]))
            gc_coverage_table$contig_alpha <- rep(0.5, length(gc_coverage_table[,1]))

            gc_coverage_table$color[m] <- as.character(contig_labels$label)
            gc_coverage_table$contig_alpha[m] <- rep(1,length(contig_labels$label))

            w<-which(gc_coverage_table$Length >=1000)
            gc_coverage_table <- gc_coverage_table[w,]
            #w2 <- which(gc_coverage_table$color != "Chlamydiae")
            #gc_coverage_table$contig_alpha[w2] <- 0.7

            svg("%sgc_cov_buble_test.svg", width = 12, height = 12)
            p6 <- ggplot(gc_coverage_table, aes(x = X.GC, y = coverage, size = Length, fill = color, colour = color, alpha = contig_alpha)) +
                    geom_point(shape = 21) +
                    ggtitle("Scaffold GC vs Depth") +
                    labs(x = "GC (%%)", y = "Sequencing depth") +
                    scale_size(range = c(1, 10))
            p6 <- p6 + scale_fill_manual(values=color_palette[0:length(unique(gc_coverage_table$color))])+ guides(color = guide_legend(override.aes = list(size=5)))
            p6 <- p6 + scale_colour_manual(values=color_palette[0:length(unique(gc_coverage_table$color))])
            #print (max(gc_coverage_table$Length))
            p6 <- p6 + scale_alpha_continuous(range=c(0.1, 1), limits=c(0.1,1)) #+ scale_alpha_continuous(range=c(0, max(gc_coverage_table$Length)), limits=c(0,max(gc_coverage_table$Length)))

            print(p6 + theme_bw())
            dev.off()

            gc_coverage_table_2m$color <- rep("Unclassified", length(gc_coverage_table_2m[,1]))
            gc_coverage_table_2m$contig_alpha <- rep(0.5, length(gc_coverage_table_2m[,1]))

            m2 <- match(contig_labels$contig, gc_coverage_table_2m$Name)
            in_2m <- !is.na(m2)
            gc_coverage_table_2m$color[m2[in_2m]] <- as.character(contig_labels$label)[in_2m]
            gc_coverage_table_2m$contig_alpha[m2[in_2m]] <- 1

            svg("%sgc_cov_buble_test_2m.svg", width = 12, height = 12)
            p6 <- ggplot(gc_coverage_table_2m, aes(x = X.GC, y = coverage, size = Length, fill = color, colour = color, alpha = contig_alpha)) +
                    geom_point(shape = 21) +
                    ggtitle("Scaffold GC vs Depth") +
                    labs(x = "GC (%%)", y = "Sequencing depth") +
                    scale_size(range = c(1, 10))
            p6 <- p6 + scale_fill_manual(values=color_palette[0:length(unique(gc_coverage_table_2m$color))])+ guides(color = guide_legend(override.aes = list(size=5)))
            p6 <- p6 + scale_colour_manual(values=color_palette[0:length(unique(gc_coverage_table_2m$color))])
            #print (max(gc_coverage_table$Length))
            p6 <- p6 + scale_alpha_continuous(range=c(0.1, 1), limits=c(0.1,1)) #+ scale_alpha_continuous(range=c(0, max(gc_coverage_table$Length)), limits=c(0,max(gc_coverage_table$Length)))

            print(p6 + theme_bw())
            dev.off()



        }else{
            #print('NO contig_labels')

        }

        write.table(gc_coverage_table, 'gc_coverage_table.tab', sep="\t", row.names=F)

    %s

     svg("%sgc_cov_buble.svg", width = 12, height = 12)
         symbols(x=gc_coverage_table[,3], y= gc_coverage_table[,4], circles=gc_coverage_table[,2], inches=1/3, ann=T,
                 bg=rgb(1, 0, 0,0.5), fg=rgb(1, 0, 0,0.5), main="%s", xlab="GC(%%)", ylab="Sequencing depth")
         if (any("gc_coverage_subset" %%in%% ls())) {
             symbols(x=gc_coverage_table[,3], y= gc_coverage_table[,4], circles=gc_coverage_table[,2], inches=1/3,
                     ann=T, bg=gc_coverage_table$color, fg=gc_coverage_table$color, add = TRUE)
             l <- gsub('(^[^_]+_[^_]+)_(.*)$', '\\\\1', gc_coverage_subset$Name)
             text(x=gc_coverage_subset[,3], y=gc_coverage_subset[,4], labels = l)
         }else{
            print ('a')
         }

         dev.off()




         %s

         svg("%sgc_cov_buble_2m.svg", width = 12, height = 12)
            symbols(x=gc_coverage_table_2m[,3], y= gc_coverage_table_2m[,4], circles=gc_coverage_table_2m[,2],
                    inches=1/3, ann=T, bg=rgb(1, 0, 0,0.5), fg=rgb(1, 0, 0,0.5), main="%s", xlab="GC(%%)", ylab="Sequencing depth")

            if (any("gc_coverage_subset" %%in%% ls())) {

                symbols(x=gc_coverage_table_2m[,3], y= gc_coverage_table_2m[,4], circles=gc_coverage_table_2m[,2],
                        inches=1/3, ann=T, bg=gc_coverage_table_2m$color, fg=gc_coverage_table_2m$color, add = TRUE)
                l <- gsub('(^[^_]+_[^_]+)_(.*)$', '\\\\1', gc_coverage_subset2$Name)
                text(x=gc_coverage_subset2[,3], y=gc_coverage_subset2[,4], labels = l)
            }else{
                print ('a')
            }

     dev.off()



                   """ %
                   (samtool_depth_file, samtool_depth_file, samtool_depth_file,
                    output_prefix, output_prefix, highlight_code,
                    output_prefix, main, highlight_code2, output_prefix, main))
    else:

        # The second plot takes its colours from gc_coverage_table_2m, the
        # table that is actually plotted: colours taken from the unfiltered
        # table would not line up with the remaining rows.
        robjects.r("""

        #library(Cairo)
        library(R.utils)

        if (isGzipped("%s")){
            #print('Gzipped file')
            all_depth <- read.table(gzfile('%s'), header=FALSE)
        }else{
            #print('Not Gzipped')
            all_depth <- read.table('%s', header=FALSE)
        }

        blast_file <- read.table("%s", header=FALSE, sep="\t")[,c(2,6)]
        contigs_depth<- aggregate(all_depth["V3"],b=all_depth[c("V1")],FUN=median)
        contigs_gc <- read.table("/tmp/gc.tab", header=TRUE)

        gc_coverage_table <-cbind(contigs_gc,coverage=contigs_depth[match(contigs_gc$Name, contigs_depth$V1),2])
        #w<-which(gc_coverage_table$Length >=1000)
        #gc_coverage_table <- gc_coverage_table[w,]

        gc_coverage_table$taxon <- blast_file[,2][match(gc_coverage_table$Name, blast_file[,1])]
        #print (is.na(gc_coverage_table$taxon))
        gc_coverage_table$taxon <- as.character(gc_coverage_table$taxon)
        gc_coverage_table$taxon[is.na(gc_coverage_table$taxon)] <- 'undefined'
        gc_coverage_table$taxon <- as.factor(gc_coverage_table$taxon)

        write.table(gc_coverage_table, 'gc_coverage_table.tab', sep="\t", row.names=F)

         svg("gc_cov_buble.svg", width = 12, height = 12,)
            symbols(x=gc_coverage_table[,3], y= gc_coverage_table[,4], circles=gc_coverage_table[,2], inches=1/3,
                    ann=F, bg=gc_coverage_table$taxon, fg=gc_coverage_table$taxon, main="%s", xlab="GC(%%)", ylab="Sequencing depth")
         dev.off()

         cov_biggest <- gc_coverage_table[which(gc_coverage_table$Length==max(gc_coverage_table$Length)),4]
         #print('cov biggest:')
         #print(cov_biggest)
         w <- which(gc_coverage_table[,4]< (4*cov_biggest))
         gc_coverage_table_2m <- gc_coverage_table[w,]

         svg("gc_cov_buble_2m.svg", width = 12, height = 12,)
            symbols(x=gc_coverage_table_2m[,3], y= gc_coverage_table_2m[,4], circles=gc_coverage_table_2m[,2],
                    inches=1/3, ann=F, bg=gc_coverage_table_2m$taxon, fg=gc_coverage_table_2m$taxon, main="%s", xlab="GC(%%)", ylab="Sequencing depth")
         dev.off()



                   """ % (samtool_depth_file, samtool_depth_file,
                          samtool_depth_file, blast_file, main, main))
def locate_origin(contig_file, reference_dnaa=False, base_add=517):

    '''
    Perform a tblastn to identify the location of dnaA and return the
    contig name and the position where the sequence should be split.

    A nucleotide BLAST database is built from *contig_file* with
    ``formatdb``, the reference protein is written to ``dnaa.temp`` and
    searched with tblastn (e-value 0.001, XML output in
    ``dnaa_blast2.xml``). Both files are created in the current directory.
    Only the first HSP of the first alignment is used. A one-line summary
    of the hit is written to stdout.

    :param contig_file: nucleotide fasta file to search
    :param reference_dnaa: seqrecord object with the reference dnaA protein
        sequence; a built-in DnaA protein (ADC36215.1) is used when omitted
    :param base_add: offset between the hit and the split position: the
        split is ``sbjct_start - base_add`` when the subject start of the
        hit is larger than its end, ``sbjct_end + base_add`` otherwise. The
        result is not checked against the contig boundaries.
    :return: tuple (contig, split_location), contig being the definition
        line of the subject sequence
    :raises IndexError: when tblastn reports no hit
    '''

    import sys
    from Bio import SeqIO, Seq, SeqRecord
    from Bio.Blast.Applications import NcbitblastnCommandline
    import shell_command
    from Bio.Blast import NCBIXML

    if not reference_dnaa:

        dnaa_protein = ('MSEKEIWEKVLEIAQEKLSAVSYSTFLKDTELYTIKDGEAIVLSS'
                        'IPFNANWLNQQYAEIIQAILFDVVGYEVKPHFITTEELANYSNNETATPKETTKPSTET'
                        'TEDNHVLGREQFNAHNTFDTFVIGPGNRFPHAASLAVAEAPAKAYNPLFIYGGVGLGKT'
                        'HLMHAIGHHVLDNNPDAKVIYTSSEKFTNEFIKSIRDNEGEAFRERYRNIDVLLIDDIQ'
                        'FIQNKVQTQEEFFYTFNELHQNNKQIVISSDRPPKEIAQLEDRLRSRFEWGLIVDITPP'
                        'DYETRMAILQKKIEEEKLDIPPEALNYIANQIQSNIRELEGALTRLLAYSQLLGKPITT'
                        'ELTAEALKDIIQAPKSKKITIQDIQKIVGQYYNVRIEDFSAKKRTKSIAYPRQIAMYLS'
                        'RELTDFSLPKIGEEFGGRDHTTVIHAHEKISKDLKEDPIFKQEVENLEKEIRNV')

        # Bio.Alphabet only exists in older Biopython releases; without it
        # sequences are created without an alphabet
        try:
            from Bio.Alphabet import generic_protein
        except ImportError:
            seq = Seq.Seq(dnaa_protein)
        else:
            seq = Seq.Seq(dnaa_protein, generic_protein)

        reference_dnaa = SeqRecord.SeqRecord(seq,
                                 id="ADC36215.1",
                                 name="DnaA",
                                 description="Chromosomal replication initiator protein DnaA")

    # build the nucleotide blast database
    cmd = 'formatdb -i %s -p F' % contig_file
    out, err, code = shell_command.shell_command(cmd)
    print('%s %s %s' % (out, err, code))
    if code != 0:
        sys.stdout.write('problem with command: \n %s' % cmd)

    with open('dnaa.temp', 'w') as handle:
        SeqIO.write(reference_dnaa, handle, 'fasta')

    tblastn_cline = NcbitblastnCommandline(query='dnaa.temp',
                                         db=contig_file,
                                         evalue=0.001,
                                         outfmt=5,
                                         out="dnaa_blast2.xml")

    stdout, stderr = tblastn_cline()

    with open("dnaa_blast2.xml", 'r') as result_handle:
        blast_records = list(NCBIXML.parse(result_handle))

    if not blast_records or not blast_records[0].alignments:
        raise IndexError('no tblastn hit of the dnaA reference in %s' %
                         contig_file)

    best_alignment = blast_records[0].alignments[0]
    best_hit = best_alignment.hsps[0]

    contig = best_alignment.hit_def
    # float arithmetic: an integer division would truncate the ratio to 0
    # (or 1) under Python 2
    identity_percentage = 100.0 * best_hit.identities / best_hit.align_length

    sbjct_end = best_hit.sbjct_end
    sbjct_start = best_hit.sbjct_start

    if sbjct_start > sbjct_end:
        split_location = sbjct_start - base_add
    else:
        split_location = sbjct_end + base_add

    sys.stdout.write('contig: %s --> hit with evalue of %s and identity of %s\n' % (contig, best_hit.expect, identity_percentage))

    return (contig, split_location)
示例#29
0
def circos_orthology(all_record_list, ref_record_and_location, target_record_and_location_list, location = "assets/circos"):
    """Write Circos input files for an orthology link plot and run circos.

    Three files are created inside *location*:

    * ``circos.kar``    -- one ``chr`` line per record of *all_record_list*
    * ``circos.link``   -- one link line per entry of
      *target_record_and_location_list*
    * ``circos.config`` -- configuration generated by
      ``gbk2circos.Circos_config``

    ``circos`` is then invoked with ``-outputfile circos_ortho`` and
    ``-outputdir`` set to *location*.

    :param all_record_list: iterable of records; each record must provide
        ``description``, ``id`` and ``len()``.
    :param ref_record_and_location: indexable with at least three items;
        items 0-2 open every link line (presumably name, start, end --
        confirm against callers).
    :param target_record_and_location_list: iterable of indexables with at
        least three items each; items 0-2 close the corresponding link line.
    :param location: directory receiving the generated files and the circos
        output. It must already exist; it is not created here.
    :return: None. A non-zero circos exit status is reported on stderr but
        does not raise, so existing callers keep working.
    """
    import gbk2circos
    import os
    import re
    import shell_command
    import sys

    # Ordered (pattern, replacement) pairs used to shorten a record
    # description into a karyotype label. The order is significant: each
    # substitution is applied to the output of the previous one.
    # NOTE(review): spaces are turned into "-" before the genus abbreviations
    # run, and those patterns end in "_"; they therefore only fire when the
    # description already contains underscores -- confirm whether "-" or "_"
    # was the intended separator.
    # NOTE(review): the dots in the ".contig.0_1, ..." pattern are unescaped
    # and match any character; kept as-is to preserve the existing labels.
    label_substitutions = (
        (r", complete genome\.", ""),
        (r", complete genome", ""),
        (r", complete sequence\.", ""),
        (r"strain ", ""),
        (r"str\. ", ""),
        (r" complete genome sequence\.", ""),
        (r" complete genome\.", ""),
        (r" chromosome", ""),
        (r" DNA", "S."),
        (r"Merged record from ", ""),
        (r", wgs", ""),
        (r"Candidatus ", ""),
        (r".contig.0_1, whole genome shotgun sequence.", ""),
        (r" ", "-"),
        (r"Chlamydia_", "C_"),
        (r"Chlamydophila_", "C_"),
        (r"Simkania_", "S_"),
        (r"Parachlamydia_", "P_"),
        (r"Neochlamydia_", "N_"),
        (r"Protochlamydia_", "P_"),
        (r"Waddlia_", "W_"),
        (r"Estrella_", "E_"),
        (r"Methylacidiphilum_", "M_"),
        (r"Criblamydia_", "C_"),
    )

    with open(os.path.join(location, "circos.kar"), "w") as karyotype_file:
        for index, record in enumerate(all_record_list):
            # Alternate between the two palette entries so that neighbouring
            # ideograms get different colours.
            colour = 1 if index % 2 == 0 else 2

            label = record.description
            for pattern, replacement in label_substitutions:
                label = re.sub(pattern, replacement, label)

            # Drop the version suffix of the identifier ("X.1" -> "X").
            accession = record.id.split(".")[0]

            # Line layout: chr - <accession> <label> <start> <end> <colour>
            karyotype_file.write('chr - %s %s %s %s spectral-5-div-%s\n' % (accession, label, 0, len(record), colour))

    with open(os.path.join(location, "circos.link"), "w") as link_file:
        for link in target_record_and_location_list:
            link_file.write('%s %s %s %s %s %s\n' % (ref_record_and_location[0],
                                                     ref_record_and_location[1],
                                                     ref_record_and_location[2],
                                                     link[0], link[1], link[2]))

    # The karyotype and link files are referenced by bare file name here,
    # not by their path inside *location*.
    circos_conf = gbk2circos.Circos_config("circos.kar", show_ticks="no", show_tick_labels="no", ideogram_spacing=100, label_radius=0.01, radius=0.45)
    circos_conf.add_link("circos.link", thickness=3)

    config_path = os.path.join(location, "circos.config")
    with open(config_path, "w") as config_file:
        config_file.write(circos_conf.get_file())

    cmd = "circos -outputfile %s -outputdir %s -conf %s" % ("circos_ortho", location, config_path)
    circos_out, circos_err, return_code = shell_command.shell_command(cmd)
    if return_code != 0:
        # Previously a failed circos run went completely unnoticed; surface it
        # without raising so callers that ignore the outcome are unaffected.
        sys.stderr.write('problem with command: \n %s\n%s\n' % (cmd, circos_err))
示例#30
0
    parser.add_argument("-1",
                        '--only_16s',
                        action="store_true",
                        help="get only 16s")
    parser.add_argument("-2",
                        '--only_23s',
                        action="store_true",
                        help="get only 23s")

    args = parser.parse_args()

    if not args.title:
        title = args.input.split('.')[0]
    else:
        title = args.title
    stdout_str, stderr_str, runcode = shell_command.shell_command(
        "barrnap %s" % args.input)

    if 'not found' in stderr_str:
        raise (Exception('Barrnap was not found on path, please install it'))
        import sys
        sys.exit()

    rrna_16S, rrna_23S, rrna_5S = search_16S_rrna(
        parse_barrnap_output(stdout_str))

    longest16 = find_longest_16S(rrna_16S)
    longest23 = find_longest_16S(rrna_23S)

    recs = []

    if args.only_16s: