Exemplo n.º 1
0
    def __init__(self,
                 query,
                 database,
                 protein=False,
                 formatdb=False,
                 best_hit_only=True):
        """Record the query/database locations and the search options.

        ``query`` and ``database`` may each be given either as a path
        (``str``) or as sequence data (a ``SeqRecord`` or a list of records).
        Sequence data is written as FASTA to a temporary file that is kept on
        disk (``delete=False``); the path of that file is stored instead.

        Args:
            query: path to a FASTA file, a ``SeqRecord`` or a list of records.
            database: same accepted forms as ``query``.
            protein: stored as ``self.protein``.
            formatdb: stored as ``self.formatdb``.
            best_hit_only: stored as ``self.best_hit_only``.

        Raises:
            TypeError: if ``query`` or ``database`` is neither a string nor
                sequence record(s).
        """
        import os
        from tempfile import NamedTemporaryFile
        from Bio import SeqRecord, SeqIO

        def _as_fasta_path(sequences):
            """Return *sequences* as a file path, dumping records to FASTA."""
            if isinstance(sequences, (list, SeqRecord.SeqRecord)):
                # Leaving the ``with`` block flushes and closes the handle;
                # delete=False keeps the file available for later commands.
                with NamedTemporaryFile(delete=False, mode="w") as handle:
                    SeqIO.write(sequences, handle, 'fasta')
                return handle.name
            if isinstance(sequences, str):
                return sequences
            raise TypeError('wrong inut format: either SeqRecord or string')

        self.query = _as_fasta_path(query)
        self.database = _as_fasta_path(database)

        self.protein = protein
        self.best_hit_only = best_hit_only
        self.formatdb = formatdb
        self.working_dir = os.getcwd()
        self.blast_path_var = 'export BLASTDB=/tmp/'
        from chlamdb.biosqldb import shell_command
        # NOTE(review): if shell_command runs each command in its own
        # subshell, this export does not persist for later commands --
        # confirm against shell_command's implementation.
        shell_command.shell_command(self.blast_path_var)
Exemplo n.º 2
0
    def format_database(self):
        """Run ``formatdb`` on ``self.database`` and set ``self.database_path``.

        The ``-p`` flag is ``T`` when ``self.protein`` is truthy and ``F``
        otherwise. ``self.database_path`` is set to ``/tmp/<id>.temp`` where
        ``<id>`` comes from ``self.id_generator(8)``.
        """
        from chlamdb.biosqldb import shell_command

        db_label = self.id_generator(8)

        if self.protein:
            cmd_template = 'formatdb -i %s -p T'
        else:
            cmd_template = 'formatdb -i %s -p F'
        shell_command.shell_command(cmd_template % (self.database))

        # NOTE(review): the command above is not given an output name, so it
        # is unclear that anything is created at this path -- confirm.
        self.database_path = '/tmp/%s.temp' % db_label
Exemplo n.º 3
0
def run_prodigal(fasta_seq, output_name='temp.faa'):
    """Run Prodigal on *fasta_seq* and return the protein output path.

    Prodigal's stdout and stderr are printed. Afterwards every ``*``
    character is removed from the output file in place with ``sed``
    (presumably the stop-codon markers of the translations -- TODO confirm).

    Args:
        fasta_seq: path of the input FASTA file.
        output_name: path the protein translations are written to.

    Returns:
        ``output_name``, unchanged.
    """
    from chlamdb.biosqldb import shell_command

    # -q quiet
    # -a write protein translations to the selected file
    # -i specify input file
    # (-c, closed ends, is deliberately not activated)
    cmd = "prodigal -q -a %s -i %s" % (output_name, fasta_seq)

    sdt_out, sdt_err, _exit_code = shell_command.shell_command(cmd)
    print(sdt_out)
    print(sdt_err)
    shell_command.shell_command('sed -i "s/*//g" %s' % output_name)
    return output_name
Exemplo n.º 4
0
def _crosslink_first_qualifier(feature, key):
    """Return the first value of qualifier *key*, or None if missing/empty."""
    try:
        return feature.qualifiers[key][0]
    except (KeyError, IndexError, TypeError):
        return None


def _crosslink_orient_records(region_record_list):
    """Orient each record like the one before it (modifies the list in place).

    For each consecutive pair, CDS features sharing an orthogroup are
    compared; when more of them lie on different strands than on the same
    strand, the second record is replaced by its reverse complement. Every
    record's ``name`` is set to its ``description``.
    """
    region_record_list[0].name = region_record_list[0].description
    for x in range(len(region_record_list) - 1):
        reference = region_record_list[x]
        candidate = region_record_list[x + 1]
        candidate_cds = [f for f in candidate.features if f.type == "CDS"]
        same_strand_count = 0
        different_strand_count = 0
        for feature_1 in reference.features:
            if feature_1.type != "CDS":
                continue
            group1 = _crosslink_first_qualifier(feature_1, "orthogroup")
            if group1 is None:
                continue
            for feature_2 in candidate_cds:
                group2 = _crosslink_first_qualifier(feature_2, "orthogroup")
                if group2 != group1:
                    continue
                if feature_1.location.strand == feature_2.location.strand:
                    same_strand_count += 1
                else:
                    different_strand_count += 1

        if different_strand_count > same_strand_count:
            region_record_list[x + 1] = candidate.reverse_complement(
                id=candidate.id, name=candidate.description)
        else:
            candidate.name = candidate.description


def _crosslink_add_links(gd_diagram, feature_sets, region_record_list, cursor,
                         identity_scale):
    """Add a cross-link for every CDS pair of adjacent records sharing an orthogroup.

    The link colour encodes the identity returned by
    ``orthogroup_identity_db.check_identity``; when that lookup fails the
    identity falls back to 0 and a diagnostic line is printed.
    """
    from matplotlib.colors import rgb2hex

    for x in range(len(region_record_list) - 1):
        set_X = feature_sets[x]
        set_Y = feature_sets[x + 1]
        cds_Y = [
            f for f in region_record_list[x + 1].features if f.type == "CDS"
        ]
        for feature_1 in region_record_list[x].features:
            if feature_1.type != "CDS":
                continue
            group1 = _crosslink_first_qualifier(feature_1, "orthogroup")
            if group1 is None:
                # no orthogroup: the feature can never be linked
                continue
            for feature_2 in cds_Y:
                group2 = _crosslink_first_qualifier(feature_2, "orthogroup")
                if group2 != group1:
                    continue

                try:
                    identity = orthogroup_identity_db.check_identity(
                        cursor, group1, feature_1.qualifiers["locus_tag"][0],
                        feature_2.qualifiers["locus_tag"][0])
                except Exception:
                    identity = 0
                    print("problem with identity table %s and locus %s %s" %
                          (group1,
                           _crosslink_first_qualifier(feature_1, "locus_tag"),
                           _crosslink_first_qualifier(feature_2, "locus_tag")))

                link_hex = rgb2hex(identity_scale.to_rgba(float(identity)))

                # Strand-less grey boxes serve as anchors for the link.
                F_x = set_X.add_feature(
                    SeqFeature(
                        FeatureLocation(feature_1.location.start,
                                        feature_1.location.end,
                                        strand=0)),
                    color=colors.lightgrey,
                    border=colors.lightgrey,
                    set_id=feature_1.qualifiers["locus_tag"])
                F_y = set_Y.add_feature(
                    SeqFeature(
                        FeatureLocation(feature_2.location.start,
                                        feature_2.location.end,
                                        strand=0)),
                    color=colors.lightgrey,
                    border=colors.lightgrey)
                gd_diagram.cross_track_links.append(
                    CrossLink(F_x, F_y, colors.HexColor(link_hex),
                              colors.HexColor(link_hex)))


def _crosslink_draw_record(gd_feature_set, record, is_plasmid,
                           target_protein_list, color_locus_list,
                           color_orthogroup_list):
    """Draw the features of *record* into *gd_feature_set*.

    Returns the locus tags of the drawn rRNA, tRNA and CDS features, in
    drawing order.
    """
    if is_plasmid:
        color1 = colors.HexColor('#2837B7')
        color2 = colors.blue
    else:
        color1 = colors.HexColor('#40F13A')
        color2 = colors.HexColor('#0F600C')

    def add_labelled(feature, sigil, color, label_size):
        gd_feature_set.add_feature(feature,
                                   sigil=sigil,
                                   color=color,
                                   label=True,
                                   label_position="middle",
                                   label_strand=1,
                                   label_size=label_size,
                                   label_angle=40)

    one_row_locus = []
    for feature in record.features:
        if feature.type == "tblast_target":
            feature.name = 'match'
            gd_feature_set.add_feature(feature,
                                       sigil="BOX",
                                       color="#ff4a0c86",
                                       label=False,
                                       label_position="middle",
                                       label_size=25,
                                       label_angle=0)
        elif feature.type == "assembly_gap":
            feature.location.strand = None
            add_labelled(feature, "BOX", "red", 14)
        elif feature.type in ("rRNA", "tRNA"):
            add_labelled(feature, "ARROW", "orange", 10)
            locus_tag = _crosslink_first_qualifier(feature, "locus_tag")
            if locus_tag is None:
                print('no locus tag for:')
                print(feature)
            else:
                one_row_locus.append(locus_tag)
        elif feature.type == "repeat_region":
            add_labelled(feature, "BOX", "blue", 14)

        # Pseudogenes are drawn as octagons, whatever their feature type
        # (so in addition to any sigil already added above).
        if 'pseudo' in feature.qualifiers:
            add_labelled(feature, "OCTO", "#6E6E6E", 10)
            continue
        if feature.type != "CDS":
            continue

        locus_tag = _crosslink_first_qualifier(feature, "locus_tag")
        orthogroup = _crosslink_first_qualifier(feature, "orthogroup")
        if locus_tag is None or orthogroup is None:
            # CDS without locus tag or orthogroup (e.g. pseudogenes without
            # a protein ID) are not drawn
            continue

        # Alternate between two shades so that neighbouring arrows differ.
        even = len(gd_feature_set) % 2 == 0
        if locus_tag in color_locus_list or orthogroup in color_orthogroup_list:
            if even:
                color = colors.HexColor('#ca4700')
            else:
                color = colors.HexColor('#fd7a32')
        else:
            color = color1 if even else color2

        if any(target_protein in feature.qualifiers["locus_tag"]
               for target_protein in target_protein_list):
            color = colors.red

        add_labelled(feature, "ARROW", color, 10)
        one_row_locus.append(locus_tag)
    return one_row_locus


def plot_multiple_regions_crosslink(target_protein_list,
                                    region_record_list,
                                    plasmid_list,
                                    out_name,
                                    biodb_name="chlamydia_03_15",
                                    color_locus_list=None,
                                    flip_record_based_on_first=True,
                                    color_orthogroup_list=None):
    """Draw stacked genomic regions with orthogroup cross-links as an SVG.

    Each record gets one linear track. CDS features of adjacent records that
    share an orthogroup are joined by a cross-link whose colour encodes the
    identity looked up in the ``orth_<biodb_name>`` MySQL database (password
    read from the ``SQLPSW`` environment variable). The SVG is post-processed
    by ``edit_svg.edit_svg``, written to ``out_name`` and made read-only.

    Args:
        target_protein_list: locus tags whose CDS are drawn in red.
        region_record_list: records to draw; modified in place (names are
            replaced by descriptions and, when flipping is enabled, records
            may be replaced by their reverse complement).
        plasmid_list: one truthy/falsy flag per record selecting the plasmid
            colour scheme.
        out_name: path of the SVG file to write.
        biodb_name: suffix of the MySQL database name.
        color_locus_list: locus tags to highlight (default: none).
        flip_record_based_on_first: orient each record like its predecessor.
        color_orthogroup_list: orthogroups to highlight (default: none).

    Returns:
        The locus tags of the drawn rRNA/tRNA/CDS features; the tags of later
        records come before those of earlier records.
    """
    import io
    import os

    import matplotlib.cm as cm
    from matplotlib.colors import Normalize
    import MySQLdb
    from chlamdb.plots import edit_svg

    color_locus_list = [] if color_locus_list is None else color_locus_list
    color_orthogroup_list = ([] if color_orthogroup_list is None else
                             color_orthogroup_list)

    sqlpsw = os.environ['SQLPSW']

    # maps an identity value to a shade of blue
    identity_scale = cm.ScalarMappable(norm=Normalize(vmin=-30, vmax=100),
                                       cmap=cm.Blues)

    conn = MySQLdb.connect(
        host="127.0.0.1",  # your host, usually localhost
        user="******",  # your username
        passwd=sqlpsw,  # your password
        db="orth_%s" % biodb_name)  # name of the data base

    gd_diagram = GenomeDiagram.Diagram("geomic_region")
    n_records = len(region_record_list)
    feature_sets = []
    try:
        cursor = conn.cursor()

        if flip_record_based_on_first:
            _crosslink_orient_records(region_record_list)

        # One track per record; the first record ends up on the highest
        # track number.
        for i, record in enumerate(region_record_list):
            gd_track_for_features = gd_diagram.new_track(
                n_records - 1 - i,
                name=record.name,
                greytrack=True,
                height=0.4,
                start=0,
                end=len(record))
            feature_sets.append(gd_track_for_features.new_set())

        _crosslink_add_links(gd_diagram, feature_sets, region_record_list,
                             cursor, identity_scale)
    finally:
        # the database is only needed for the identity lookups
        conn.close()

    all_locus = []
    for n, record in enumerate(region_record_list):
        one_row_locus = _crosslink_draw_record(feature_sets[n], record,
                                               plasmid_list[n],
                                               target_protein_list,
                                               color_locus_list,
                                               color_orthogroup_list)
        all_locus = one_row_locus + all_locus

    max_len = max(len(record) for record in region_record_list)
    if n_records == 2:
        page_height = 300
    else:
        page_height = 150 * n_records
    page_width = max_len / 30
    if page_height > page_width:
        orientation = 'portrait'
    else:
        orientation = 'landscape'
    # NOTE(review): pagesize is passed as (height, width), as it always has
    # been here -- confirm this is the intended order.
    gd_diagram.draw(format="linear",
                    pagesize=(page_height, page_width),
                    orientation=orientation,
                    fragments=1,
                    start=0,
                    end=max_len)

    svg_diagram = io.StringIO()
    gd_diagram.write(svg_diagram, "SVG")

    with_links = edit_svg.edit_svg(svg_diagram.getvalue(), all_locus,
                                   biodb_name)
    with_links.write(out_name)

    # Best effort: a failure to change the mode must not lose the plot.
    try:
        os.chmod(out_name, 0o444)
    except OSError as err:
        print("could not make %s read-only: %s" % (out_name, err))

    return all_locus
Exemplo n.º 5
0
def setup_blastdb(biodb, static_dir_path):
    """Export every genome of *biodb* to flat files and build BLAST databases.

    Under ``<static_dir_path>/<biodb>/`` the sub-directories ``faa``,
    ``fna``, ``ffn``, ``gbk`` and ``tab`` are created (existing directories
    are reused) and one file per accession is written into each of them.
    The ``faa``, ``ffn`` and ``fna`` files are then concatenated into
    ``all.<ext>`` and ``makeblastdb`` is run on every file of those three
    directories.

    Args:
        biodb: name of the biosql database; also used as table-name suffix
            and as output directory name.
        static_dir_path: directory under which the output tree is created.
    """
    from chlamdb.biosqldb import manipulate_biosqldb
    from chlamdb.biosqldb import gbk2fna
    from chlamdb.biosqldb import gbk2faa
    from chlamdb.biosqldb import gbk2ffn
    from chlamdb.biosqldb import gbk2table
    import os
    from shlex import quote
    from Bio import SeqIO
    from chlamdb.biosqldb import shell_command

    server, db = manipulate_biosqldb.load_db(biodb)

    sql1 = 'select distinct accession from orthology_detail_%s' % biodb

    accession_list = [
        row[0] for row in server.adaptor.execute_and_fetchall(sql1, )
    ]

    db_static_path = os.path.join(static_dir_path, biodb)
    out_dirs = {
        ext: os.path.join(db_static_path, ext)
        for ext in ('faa', 'fna', 'ffn', 'gbk', 'tab')
    }
    print(out_dirs['faa'])
    for path in out_dirs.values():
        # exist_ok: a second run must not fail on directories left behind
        # by an earlier one
        os.makedirs(path, exist_ok=True)

    for n, accession in enumerate(accession_list):
        print(n, accession)
        record = db.lookup(accession=accession)

        out_name_faa = os.path.join(out_dirs['faa'], accession + '.faa')
        out_name_ffn = os.path.join(out_dirs['ffn'], accession + '.ffn')
        out_name_fna = os.path.join(out_dirs['fna'], accession + '.fna')
        out_name_tab = os.path.join(out_dirs['tab'], accession + '.tab')
        out_name_gbk = os.path.join(out_dirs['gbk'], accession + '.gbk')

        # faa
        gbk2faa.gbk2faa(record, lformat=True, outname=out_name_faa)
        # fna
        gbk2fna.gbk2fna(record, outname=out_name_fna)
        # ffn
        gbk2ffn.gbk2ffn(record, outname=out_name_ffn, locus_tag=True)
        # gbk
        with open(out_name_gbk, 'w') as f:
            SeqIO.write(record, f, 'genbank')
        # tab
        gbk2table.gbk2table(record, out_name_tab)

    blast_inputs = (('faa', 'prot'), ('ffn', 'nucl'), ('fna', 'nucl'))

    # merging faa, ffn and fna
    # "&&" instead of ";": never run the command in the wrong directory
    # when the cd fails.
    for ext, _dbtype in blast_inputs:
        shell_command.shell_command(
            "cd %s && cat *%s> all.%s" % (quote(out_dirs[ext]), ext, ext))

    # one BLAST database per file (including the merged all.<ext>)
    for ext, dbtype in blast_inputs:
        shell_command.shell_command(
            "cd %s && for i in `ls *%s`;do makeblastdb -in $i -dbtype %s; done"
            % (quote(out_dirs[ext]), ext, dbtype))
Exemplo n.º 6
0
def map2highlighted_map(map_id,
                        ko_list,
                        ko2freq,
                        biodb,
                        outpath='test.pdf',
                        taxon_id=False,
                        n_species=60):
    """Draw a KEGG pathway map with orthologs coloured by frequency.

    The KGML description of the map is downloaded from rest.kegg.jp. Every
    ortholog entry having at least one KO in ``ko2freq`` is coloured by the
    summed frequency of those KOs: on a green scale when one of the entry's
    KOs is in ``ko_list``, on an orange/red scale otherwise. The map is drawn
    to ``outpath``, converted to SVG with inkscape and the SVG is
    post-processed by ``edit_svg_map``.

    Args:
        map_id: KEGG map identifier; ``map`` is replaced by ``ko`` to build
            the download URL.
        ko_list: KOs selecting the green colour scale.
        ko2freq: mapping KO -> frequency (anything ``int()`` accepts).
        biodb: passed through to ``edit_svg_map``.
        outpath: path of the drawn map; the SVG path is derived from it by
            replacing every ``pdf`` with ``svg``.
        taxon_id: passed through to ``edit_svg_map``.
        n_species: upper bound of both colour scales.
    """
    import re
    import urllib.request
    from chlamdb.biosqldb import shell_command
    from Bio.Graphics.KGML_vis import KGMLCanvas
    from Bio.KEGG.KGML import KGML_parser
    import matplotlib.cm as cm
    from matplotlib.colors import Normalize
    from matplotlib.colors import rgb2hex

    norm = Normalize(vmin=0, vmax=n_species)
    unmatched_scale = cm.ScalarMappable(norm=norm, cmap=cm.OrRd)
    matched_scale = cm.ScalarMappable(norm=norm, cmap=cm.Greens)

    url_template = 'http://rest.kegg.jp/get/%s/kgml' % re.sub(
        'map', 'ko', map_id)
    print(url_template)
    # the context manager closes the HTTP response once it has been read
    with urllib.request.urlopen(url_template) as response:
        pathway = KGML_parser.read(response.read().decode('UTF-8'))

    kgml_map = KGMLCanvas(pathway, show_maps=True)

    known_kos = set(ko2freq.keys())
    for o in list(pathway.orthologs):
        ko_temp_list = set([i.rstrip() for i in o.name.split('ko:')])
        shared_kos = ko_temp_list.intersection(known_kos)
        if not shared_kos:
            continue

        ko_keep = [ko for ko in ko_temp_list if ko in ko2freq]
        match = any(ko in ko_list for ko in ko_temp_list)
        o.name = 'ko:' + ' ko:'.join(ko_keep)
        total = sum(int(ko2freq[ko]) for ko in shared_kos)

        scale = matched_scale if match else unmatched_scale
        bgcolor = rgb2hex(scale.to_rgba(float(total)))
        for g in o.graphics:
            g.bgcolor = bgcolor
        # NOTE(review): the name assigned above starts with 'ko:', so the
        # text before the first 'ko:' is empty and only the total survives
        # in the label -- confirm this is intended.
        o.name = "%s (%s)" % (o.name.split('ko:')[0], total)

    # Show the base image map with the ortholog boxes on top; the other KGML
    # elements are turned off.
    kgml_map.import_imagemap = True
    kgml_map.show_maps = True
    kgml_map.show_orthologs = True
    kgml_map.draw_relations = False
    kgml_map.show_compounds = False
    kgml_map.show_genes = False
    kgml_map.draw(outpath)

    svg_path = re.sub('pdf', 'svg', outpath)
    shell_command.shell_command(
        'inkscape %s --export-plain-svg=%s' %
        (outpath, svg_path))  # 'pdf2svg %s %s all'
    t = edit_svg_map(svg_path,
                     ko2freq.keys(),
                     biodb,
                     map_id,
                     taxon_id=taxon_id)
    t.write(svg_path)
Exemplo n.º 7
0
    def run_hmmer(self, profiles=False):
        """Run the hmmer command for every profile and collect the hits.

        For each profile a temporary output file is created, its name is
        appended to ``self.hmmer_output_list``, ``self.hmmer_cmd`` is run and
        the output is parsed with ``self._parse_hmmsearch``. If a command
        returns a non-zero code, its output is written to stdout and the
        process exits with status 1.

        Args:
            profiles: profiles to search with; falls back to
                ``self.hmm_profiles`` when falsy.

        Returns:
            A list of rows (lists of strings), one per reported hit, in the
            column order of ``header`` below. A parse result whose first
            item is not a dict yields a single placeholder row: that item
            followed by ``'-'`` in every other column. When
            ``self.database`` is a list, the searches are run but no rows
            are collected.
        """
        import sys
        from tempfile import NamedTemporaryFile
        from chlamdb.biosqldb import shell_command

        def _run_or_exit(cmd):
            """Run *cmd*; on failure dump its output and exit the process."""
            stout, sterr, code = shell_command.shell_command(cmd)
            if code != 0:
                sys.stdout.write("\n%s\n%s\n" % (stout, sterr))
                # non-zero status: the run failed
                sys.exit(1)

        if not profiles:
            profiles = self.hmm_profiles

        header = [
            "profile_id", "profile_length", "best_hit_id", "bias", "bitscore",
            "evalue", "query_start", "query_end", "query_coverage",
            "hit_start", "hit_end"
        ]
        results = []
        for profile in profiles:
            # NOTE(review): the temporary file is removed as soon as this
            # object is closed or garbage-collected, so the names kept in
            # hmmer_output_list may outlive the files -- confirm.
            temp_file = NamedTemporaryFile()
            self.hmmer_output_list.append(temp_file.name)
            if not isinstance(self.database, list):
                _run_or_exit(self.hmmer_cmd %
                             (temp_file.name, profile, self.database))

                parsed_data = self._parse_hmmsearch(temp_file.name)

                if len(parsed_data) == 0:
                    print(
                        'No domains respecting score threshold for %s, continue...'
                        % profile)
                    continue

                if not isinstance(parsed_data[0], dict):
                    # placeholder row, padded to the width of the header
                    results.append(['%s' % parsed_data[0]] + ['-'] *
                                   (len(header) - 1))
                else:
                    results.extend([str(hsp[column]) for column in header]
                                   for hsp in parsed_data)
            else:
                # multiple databases: performing bitscore filtering
                self.biodb2best_hits = {}
                for database in self.database:
                    # NOTE(review): this branch passes four values to
                    # hmmer_cmd where the single-database branch passes
                    # three -- confirm which form the template expects.
                    _run_or_exit(self.hmmer_cmd %
                                 (self.hmmer_score_cutoff, temp_file.name,
                                  profile, database))
                    # the parsed hits are currently not collected
                    self._parse_hmmsearch(temp_file.name)

        return results