Example #1
 def disapprove(self, PMID):
     id = self.get_id(PMID)
     if id == 0:
         util.warn_msg('PMID: %s is not in collection!' % PMID)
     else:
         self.db.from_sql("update gp.citation set PStatus='N' where PMID=?",
                          params=[PMID])
Example #2
def make_legend_pdf(s_pdf, S_png=[], l_clean_up=True):
    """Add an optional Watermark and legend png (or more), filenames stored in S_png, generate a Watermark.pdf"""
    from reportlab.pdfgen import canvas
    #if s_wm is None:
    #    s_wm=os.path.join(os.path.dirname(__file__), "watermark", "Watermark.pdf")
    if os.path.exists(s_pdf):
        os.remove(s_pdf)
    from reportlab.lib.pagesizes import letter
    c = canvas.Canvas(s_pdf, pagesize=letter)
    w, h = c._pagesize
    w = int(w)
    # place stuff in the center
    w = w / 2
    h = h / 2
    l_has_page = False
    for x in S_png:
        # Add to lower-right corner, bottom up
        #X=mahotas.imread(x)
        if not os.path.exists(x):
            util.warn_msg('File not found in make_legend_pdf: %s' % x)
            continue
        X = scipy.misc.imread(x)
        h2, w2, Z = X.shape
        c.drawImage(x, w - w2, h)
        h += h2
        l_has_page = True
        if l_clean_up:
            os.remove(x)
    if l_has_page:
        c.save()
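
The function above leans on reportlab's canvas API to stack images bottom-up from the page center; a minimal, self-contained sketch of that pattern (hypothetical filenames; assumes reportlab and Pillow are installed):

from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from PIL import Image

def stack_pngs_to_pdf(s_pdf, png_files):
    c = canvas.Canvas(s_pdf, pagesize=letter)
    w, h = letter
    x, y = w / 2, h / 2
    for png in png_files:
        img_w, img_h = Image.open(png).size   # pixel size, drawn as points
        c.drawImage(png, x - img_w, y)        # right edge anchored at page center
        y += img_h                            # next image goes above this one
    c.save()

# stack_pngs_to_pdf("Legend.pdf", ["legend1.png", "legend2.png"])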
Example #3
 def cynet_from_xgmml(self,
                      s_file,
                      name=None,
                      s_layout="force-directed",
                      l_bundle=True,
                      t_xy=None,
                      l_digraph=False):
     """If t_xy is provided, s_layout is ignored
     return CyNetwork"""
     import os
     if not os.path.exists(s_file):
         import util
         util.warn_msg('Missing file: %s' % s_file)
         return None
     if not l_digraph:
         net = xgmml.Network(s_file, name=name)
     else:
         import digraph
         net = digraph.Digraph(s_file, name=name)
     if t_xy is not None:
         net.set_XY(t_xy)
         s_layout = None
     S = net.T_node.header()
     #print "---------------->", S
     #S=[x for x in S if x not in ('Gene','Name','Symbol')]
     #net.T_node.drop(S, axis=1, inplace=True)
     ##############################
     return self.cynet_from_network(net,
                                    name=name,
                                    s_layout=s_layout,
                                    l_bundle=l_bundle)
Example #4
 def remove(self, PMID):
     id = self.get_id(PMID)
     if id == 0:
         util.warn_msg('PMID: %s is not in collection!' % PMID)
     else:
         self.db.from_sql("delete from gp.citation where PMID=?",
                          params=[PMID])
Example #5
 def graph(self, PMID, has_graph='Y'):
     id = self.get_id(PMID)
     if id == 0:
         util.warn_msg('PMID: %s is not in collection!' % PMID)
     else:
         self.db.from_sql("update gp.citation set Graph=? where PMID=?",
                          params=[has_graph])
Example #6
 def restore_distance(s_file, max_dist=1.0):
     'No longer needed, as we use cwc.new'
     df=pd.read_table(s_file, header=None)
     s_col=util.header(df)[-1]
     R=(-df[s_col]+1.0).abs()*max_dist
     if R.max()<=1.0:
         df[s_col]=util.rarray2sarray((-R+1.0).abs(), s_format='%.3f') # convert to similarity score and output
         df.to_csv(s_file, sep='\t', index=False, header=False)
     else:
         util.warn_msg('Cannot restore distance to similarity, as the max distance '+str(R.max())+' > 1.0, restore skipped!')
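
A minimal sketch of the conversion this helper performs, assuming distances are already normalized to [0, 1]: a distance d is mapped back to a similarity score |1 - d| and formatted to three decimals.

import pandas as pd

d = pd.Series([0.0, 0.25, 1.0])     # distances
s = (1.0 - d).abs()                 # similarities
print(['%.3f' % v for v in s])      # ['1.000', '0.750', '0.000']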
Example #7
 def insert(self, PMID, PStatus='N', Graph='N'):
     id = self.get_id(PMID)
     if id != 0:
         util.warn_msg('PMID: %s already in collection, id=%s!' %
                       (PMID, id))
         data = {}
     else:
         data = self.fetch_pmid(PMID)
         if len(data) == 0:
             util.warn_msg('PMID: %s cannot be fetched!' % (PMID))
         else:
             #print data
             self.insert_by_data(data, PStatus, Graph)
     return data
Example #8
 def get_date(self):
     t=self.db.from_sql('select distinct history from statistics order by history desc limit 0,2')
     print(t)
     self.d1=self.d2=None
     if len(t)==0:
         util.error_msg('No history data is found')
     elif len(t)==1:
         util.warn_msg('Only one history entry')
         self.d1=str(t.ix[0, 'history'])
     else:
         self.d1=str(t.ix[0, 'history'])
         self.d2=str(t.ix[1, 'history'])
     print(self.d1)
     print(self.d2)
Example #9
def add_watermark_pdf(S_pdf, s_out="out", s_wm=None, l_clean_up=True):
    #https://www.binpress.com/tutorial/manipulating-pdfs-with-python/167
    #import shutil
    #shutil.copyfile(s_out, s_out+".v2.pdf")
    s_out, s_ext = os.path.splitext(s_out)
    s_out = s_out + '.pdf'
    #if os.path.exists(s_out):
    #    os.remove(s_out)
    if type(S_pdf) is str:
        S_pdf = [S_pdf]
    if s_wm is None:
        s_wm = os.path.join(os.path.dirname(__file__), "watermark",
                            "Watermark.pdf")
    if not os.path.exists(s_wm):
        util.warn_msg('File not found in add_watermark_pdf: %s' % s_wm)
        return
    f_in = []
    f_wm = open(s_wm, 'rb')
    wpdf = PdfFileReader(f_wm)
    watermark = wpdf.getPage(0)
    f_in.append(f_wm)
    # there appears to be an issue with Python 2; we need to close f_wm after it has been used
    out = PdfFileWriter()
    for s_pdf in S_pdf:
        if not os.path.exists(s_pdf):
            util.warn_msg('File not found in add_watermark_pdf: %s' % s_pdf)
            continue
        f = open(s_pdf, 'rb')
        ipdf = PdfFileReader(f)
        for i in range(ipdf.getNumPages()):
            page = ipdf.getPage(i)
            out.addPage(page)
        f_in.append(f)
    out.addPage(watermark)
    # if s_out is one of S_pdf, it may trigger an error msg as s_pdf is still open
    with open(s_out + ".tmp", 'wb') as f:
        out.write(f)
    # in case s_out is the same as one of the input S_pdf
    for f in f_in:
        f.close()
    if l_clean_up:
        os.remove(s_wm)
        print("Deleting ... " + s_wm)
        for s_pdf in S_pdf:
            if os.path.exists(s_pdf):
                os.remove(s_pdf)
                print("Deleting ... " + s_pdf)
    shutil.move(s_out + ".tmp", s_out)
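
The function above appends the watermark as an extra page; for reference, a hedged sketch of the overlay alternative (stamping the watermark onto every page) using the same legacy PyPDF2 API as above, with hypothetical filenames:

from PyPDF2 import PdfFileReader, PdfFileWriter

def overlay_watermark(s_in, s_wm, s_out):
    out = PdfFileWriter()
    with open(s_in, 'rb') as f_in, open(s_wm, 'rb') as f_wm:
        wm_page = PdfFileReader(f_wm).getPage(0)
        ipdf = PdfFileReader(f_in)
        for i in range(ipdf.getNumPages()):
            page = ipdf.getPage(i)
            page.mergePage(wm_page)   # stamp watermark over the page content
            out.addPage(page)
        with open(s_out, 'wb') as f:
            out.write(f)              # write while source streams are still open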
Example #10
 def start(self, f=None, n_CPU=0):
     """Launch workers.
     f: method, task for the worker; default None. If None, the worker runs a wrapper task, which expects the first argument to be a registered method name
     n_CPU: number of workers to launch."""
     if self.has_started():
         util.warn_msg('Already started, stop first and start new!')
         self.stop()
     self.n_CPU = n_CPU if n_CPU > 0 else multiprocessing.cpu_count() - 1
     self.n_use = self.n_CPU  # use all CPUs by default, but user could ask to use fewer
     self.q_in = [multiprocessing.Queue(1) for i in range(self.n_CPU)]
     self.q_out = [multiprocessing.Queue() for i in range(self.n_CPU)]
     self.c_proc = {
     }  #bug, if c_proc is not inside manager, each process will see {}, {1proc}, {2procs} ..., as they are created
     self.f = f
     if self.f is None:
         self.f = self.wrapper
     # put work_status, n_running into manager, so visible by all processes, see the same copy
     self.mgr = multiprocessing.Manager()
     #self.c_proc=self.mgr.dict({})
     self.work_status = self.mgr.list([False for i in range(self.n_CPU)])
     # remember which process is working on my job
     #self.has_my_job=[False for i in range(self.n_CPU)]
     self.has_my_job = None
     # whether job is done, results ready for reading
     self.job_is_done = self.mgr.list([False for i in range(self.n_CPU)])
     self.n_running = self.mgr.list([0, 0])
     # n_running [0] is # of running process, [1] # of map() calls running
     # use a list, so that the fetch routine can modify it (if we did n_running=0, fetch could not modify it)
     self.lock = self.mgr.Lock()
     self.mail = multiprocessing.Condition(
     )  # notify all processes when a worker finishes
     procs = [
         multiprocessing.Process(target=self.spawn(self.f),
                                 args=(self.q_in[i], self.q_out[i],
                                       self.mail, self.job_is_done))
         for i in range(self.n_CPU)
     ]
     for p in procs:
         p.daemon = True
         p.start()
         self.c_proc[p.pid] = p
         self.n_running[0] += 1
         if self.DEBUG:
             print("DEBUG>>> %s  %d %d" %
                   (str(p), p.pid, self.n_running[0]))
     if self.DEBUG: print("DEBUG> self.c_proc: ", self.c_proc)
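
For orientation, a bare-bones sketch of the queue-per-worker pattern start() builds on: one input queue and one output queue per daemon process. This is not the class above, just the underlying idea, with a placeholder task.

import multiprocessing

def _worker(q_in, q_out):
    while True:
        i, x = q_in.get()
        if i is None:            # poison pill: stop the worker
            break
        q_out.put((i, x * x))    # placeholder task

if __name__ == '__main__':
    q_in, q_out = multiprocessing.Queue(1), multiprocessing.Queue()
    p = multiprocessing.Process(target=_worker, args=(q_in, q_out), daemon=True)
    p.start()
    q_in.put((0, 7))
    print(q_out.get())           # (0, 49)
    q_in.put((None, None))
    p.join()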
Example #11
 def color(self, nodes, l_name_to_id=True, cm=None):
     """Color tree nodes
     nodes: dict or list of node lists
     nodes and the colormap cm are combined and passed to Tree.color_map (see documentation)
     l_name_to_id: bool, default True, use leaf names rather than node ids
         Warning: this method colors nodes, so if leaf names are provided and two leaves
         under the same node have different colors, only one color is used.
     """
     df = Tree.read_tree_file(self.tree_file)
     # ['node','left','right','similarity', 'color']
     df['color'] = '#000000'
     c_id2node = {}
     c_name2id = {}
     if l_name_to_id:
         for k, v in self.c_name.items():
             c_name2id[v] = k
         for i in df.index:
             if not df.ix[i, 'left'].startswith('NODE'):
                 c_id2node[df.ix[i, 'left']] = df.ix[i, 'node']
             if not df.ix[i, 'right'].startswith('NODE'):
                 c_id2node[df.ix[i, 'right']] = df.ix[i, 'node']
     t = df.ix[:, ['node', 'color']]
     t.set_index('node', inplace=True)
     c = Tree.color_map(nodes, cm)
     for k, v in c.items():
         if type(v) is tuple:
             v = [min(max(int(x * 255), 0), 255) for x in v]
             v = '#%02x%02x%02x' % tuple(v[:3])
         if l_name_to_id:
             k = c_name2id.get(k, k)
             k = c_id2node.get(k, k)
         if k in t.index:
             t.ix[k, 'color'] = v
         else:
             util.warn_msg('Node not in tree: ' + k)
     df['color'] = list(t.color)
     df.columns = ['NODEID', 'LEFT', 'RIGHT', 'CORRELATION', 'NODECOLOR']
     util.df2sdf(df, s_format="%.4f").to_csv(self.tree_file,
                                             index=False,
                                             sep="\t")
Example #12
    def cynet_from_network(self,
                           net,
                           name=None,
                           s_layout="force-directed",
                           l_bundle=True):
        """Input net: xgmml.Network object
        return CyNetwork"""
        #print "====>>>>>>>>", net.T_node.header()
        data = net.to_json()
        if name is None:
            name = data['data']['name']
        #cynet=self.cy.create(suid=net)
        cynet = None
        for trial in range(3):  # sometimes Cytoscape errors; try up to three times
            try:
                if cynet is None:
                    cynet = self.cy.network.create(name=name, data=data)
            except:
                cynet = None
                print("Fail at trail %d" % (trial + 1))
                print("JSON data>", data)
        if cynet is None:
            import sys
            util.warn_msg('Error during cynet_from_network(): %s' %
                          sys.exc_info()[0])
            print(traceback.format_exc())
            return None

        id = cynet.get_id()

        if s_layout is not None:
            #print "apply layout %s" % s_layout
            self.cy.layout.apply(name=s_layout, network=cynet)
        if l_bundle:
            self.cy.edgebundling.apply(cynet)
            #print "apply bundle %s" % s_layout
        self.cy.layout.fit(cynet)
        return cynet
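
A stripped-down sketch of the retry pattern used above: attempt a flaky call up to three times and keep the first successful result. create_network is a hypothetical stand-in for the Cytoscape call.

def retry(create_network, n_trials=3):
    result = None
    for trial in range(n_trials):
        try:
            if result is None:
                result = create_network()
        except Exception as e:
            result = None
            print("Fail at trial %d: %s" % (trial + 1, e))
    return result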
Example #13
    def OmniPath(self):
        fn_source = os.path.join(SyncDB.DOWNLOAD_DIR(), "interactions")
        if not os.path.exists(fn_source):
            urllib.urlretrieve("http://omnipathdb.org/interactions", fn_source)
        t = pd.read_table(fn_source)
        #source  target  is_directed     is_stimulation  is_inhibition   dip_url
        #Q13873  Q969L4  1       0       0
        c_gid, c_tax = self.uniprot()
        t['source_gene_id'] = t.source.apply(lambda x: c_gid.get(x, 0))
        t['tax_id_A'] = t.source.apply(lambda x: c_tax.get(x, 0))
        t['target_gene_id'] = t.target.apply(lambda x: c_gid.get(x, 0))
        t['tax_id_B'] = t.target.apply(lambda x: c_tax.get(x, 0))

        #print len(t)
        t = t[(t.source_gene_id > 0) & (t.target_gene_id > 0) &
              (t.tax_id_A == t.tax_id_B) &
              (t.source_gene_id != t.target_gene_id)].copy()
        c_cnt = util.unique_count(t.tax_id_A)
        print(c_cnt)
        if len(c_cnt) > 1:
            util.warn_msg('Found non-human genes')
        t = t[t.tax_id_A == 9606]
        t['interaction_type_id'] = 3
        t['interaction_categy'] = 'Signaling Pathway'
        t['interaction_type'] = 'Literature'
        t['score'] = '-'
        t['ds'] = 'OmniPath'
        t.rename2({
            'source_gene_id': 'gid_A',
            'target_gene_id': 'gid_B',
            'dip_url': 'support'
        })
        t = t[[
            'gid_A', 'gid_B', 'tax_id_A', 'tax_id_B', 'interaction_type_id',
            'interaction_categy', 'interaction_type', 'score', 'support', 'ds'
        ]].copy()
        self.omni = t
        return self.omni
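
A minimal sketch of the ID-mapping step above: accession columns are translated to gene IDs via a lookup dict, defaulting to 0, and rows that fail to map are dropped. The mapping dict here is hypothetical.

import pandas as pd

c_gid = {'Q13873': 101, 'Q969L4': 202}   # hypothetical accession -> gene id
t = pd.DataFrame({'source': ['Q13873', 'P00000'], 'target': ['Q969L4', 'Q13873']})
t['source_gene_id'] = t.source.apply(lambda x: c_gid.get(x, 0))
t['target_gene_id'] = t.target.apply(lambda x: c_gid.get(x, 0))
t = t[(t.source_gene_id > 0) & (t.target_gene_id > 0)].copy()
print(t)   # only the row where both ends mapped survives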
Example #14
def add_watermark_png(s_png,
                      s_out="out",
                      s_wm=None,
                      s_legend=None,
                      l_clean_up=True):
    s_out, s_ext = os.path.splitext(s_out)
    s_out = s_out + '.png'
    if os.path.exists(s_out):
        os.remove(s_out)
    #M=mahotas.imread(s_png)
    if not os.path.exists(s_png):
        util.warn_msg('File not found in add_watermark_png: %s' % s_png)
        return

    M = scipy.misc.imread(s_png)
    if s_wm is None:
        s_wm = os.path.join(os.path.dirname(__file__), "watermark",
                            "Watermark.png")
    #W=mahotas.imread(s_wm)
    if not os.path.exists(s_wm):
        util.warn_msg('File not found in add_watermark_png: %s' % s_wm)
        return
    W = scipy.misc.imread(s_wm)
    if l_clean_up:
        print("Deleting ... " + s_wm)
        os.remove(s_wm)
    h1, w1, x1 = M.shape
    h2, w2, x2 = W.shape
    if x1 > x2:  # missing alpha channel
        W = np.dstack([W, np.ones([h2, w2])])
    M[-h2:, -w2:] = W
    if s_legend is not None:
        # add legend
        #L=mahotas.imread(s_legend)
        if not os.path.exists(s_legend):
            util.warn_msg('File not found in add_watermark_png: %s' %
                          s_legend)
        else:
            L = scipy.misc.imread(s_legend)
            h3, w3, x3 = L.shape
            if x1 > x3:
                L = np.dstack([L, np.ones([h3, w3])])
            elif x1 < x3:
                L = L[:, :, :x1]
            M[-h2 - h3:-h2, -w3:] = L
            if l_clean_up:
                print("Deleting ... " + s_legend)
                os.remove(s_legend)
    #mahotas.imsave(s_out, M)
    scipy.misc.imsave(s_out, M)
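
A small numpy sketch of the corner-stamping done above, including padding a missing alpha channel before the paste; the array sizes and the opaque alpha value are illustrative.

import numpy as np

M = np.ones((100, 200, 4), dtype=np.uint8) * 255   # base image, RGBA
W = np.zeros((20, 50, 3), dtype=np.uint8)          # watermark, RGB only
if M.shape[2] > W.shape[2]:                        # add an opaque alpha channel
    W = np.dstack([W, np.full(W.shape[:2], 255, dtype=np.uint8)])
M[-W.shape[0]:, -W.shape[1]:] = W                  # paste into lower-right corner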
Example #15
def make_legend_png(s_png, S_png=[], l_clean_up=True):
    #s_out, s_ext=os.path.splitext(s_png)
    #s_out=s_out+'.png'
    #if os.path.exists(s_out):
    #    os.remove(s_out)
    #M=mahotas.imread(s_png)
    if not os.path.exists(s_png):
        util.warn_msg('File not found in make_legend_png: %s' % s_png)
        return
    try:
        M = scipy.misc.imread(s_png)
        h1, w1, x1 = M.shape
    except:
        util.warn_msg('Cannot load image: %s ' % s_png)
        return
    h = 0
    S_legend = []
    S_png2 = []
    for x in S_png:
        if not os.path.exists(x):
            util.warn_msg('File not found in make_legend_png: %s' % x)
            continue
        # Add to lower-right corner, bottom up
        X = scipy.misc.imread(x)
        S_legend.append(X)
        S_png2.append(x)
    room = 10  # give extra 10px blank space to the left of the legend
    if len(S_legend):
        w = 0
        for Q in S_legend:
            w = max(w, Q.shape[1]) + room
        if w1 < w:  # initial image is too small
            M2 = np.ones([h1, w, x1]) * 255
            M2[:, 0:w1] = M
            M = M2
            h1, w1, x1 = M.shape
        # find out the extra width required to expand the image, so that legend does not cover up existing pixels
        width = 2  # minimum 2 pixel margin
        h = 0
        for Q in S_legend:
            h2, w2, z = Q.shape
            w2 += room
            if h1 - h < 0: break
            height = min(h1 - h, h2)
            r = M[max(h1 - h - h2, 0):h1 - h, max(w1 - w2, 0):w1, 0]
            g = M[max(h1 - h - h2, 0):h1 - h, max(w1 - w2, 0):w1, 1]
            b = M[max(h1 - h - h2, 0):h1 - h, max(w1 - w2, 0):w1, 2]
            bg = (r == 255) & (g == 255) & (b == 255)  # white background pixels
            xb = 0
            X = np.sum(bg, axis=0)
            for i in range(r.shape[1] - 1, -1, -1):
                if X[i] < height:
                    xb = i
                    break
            width = max(width, xb)
            h += h2

        # Expand to make sure legend will never overlap the image content
        M2 = np.ones([h1, w1 + width, x1]) * 255
        M2[:, 0:w1] = M
        h1, w1, x1 = M2.shape
        h = 0
        for X in S_legend:
            h2, w2, Z = X.shape
            if x1 > Z:
                X = np.dstack([X, np.ones([h2, w2])])
            elif x1 < Z:
                X = X[:, :, :x1]
            if h1 - h <= 0: break
            if h1 - h - h2 < 0:  # happens when there are too many gene list, legend is too long
                #if h1-h<=0: break
                M2[0:h1 - h, w1 - w2:] = X[h2 - (h1 - h):h2, :]
                break
            else:
                M2[h1 - h - h2:h1 - h, w1 - w2:] = X
            h += h2

        scipy.misc.imsave(s_png, M2)
        if l_clean_up:
            for x in S_png2:
                os.remove(x)
Example #16
    def plot(self, karyotype="", symbol=None, links=None, hits=None, outputdir=None, outputfile="CircosPlot"):
        #sw=util.StopWatch()
        outputdir=outputdir if outputdir is not None else '/tmp'
        for ext in [".png" , ".svg"]:
            s_file=os.path.join(outputdir, outputfile+ext)
            if os.path.exists(s_file):
                os.remove(s_file)
        if links is None:
            util.warn_msg('No link to plot, simply ignore')
            #return
        tmp=tempfile.NamedTemporaryFile(dir=outputdir, delete=False, prefix="CIRCOS_", suffix=".txt")
        conf_file=tmp.name
        S_tmp_file=[conf_file]

        s_conf=util.read_string(self.TEMPLATE)
        kary_file=re.sub('CIRCOS_', 'CIRCOS_KARYOTYPE_', conf_file)
        util.save_string(kary_file, karyotype)
        S_tmp_file.append(kary_file)
        s_conf=re.sub(r'@KARYOTYPE@', kary_file, s_conf)

        r0=0.90
        s_plot=""
        if hits is None:
            hits=[]
        elif type(hits) is not list:
            hits=[hits]
        for i,s_hit in enumerate(hits):
            hit_file=re.sub('CIRCOS_', 'CIRCOS_HIT_%d_' % i, conf_file)
            util.save_list(hit_file, s_hit)
            S_tmp_file.append(hit_file)
            s_plot+="<plot>\n"
            s_plot+="file = "+hit_file+"\n"
            s_plot+="r0 = "+('%.3f' % r0)+"r\n"
            s_plot+="r1 = "+('%.3f' % r0)+"r+70p\n"
            s_plot+="stroke_thickness = 0\n"
            s_plot+="min = 0\n"
            s_plot+="max = 2\n"
            s_plot+="color = oranges-3-seq\n"
            s_plot+="</plot>\n\n"
            r0-=0.05

        s_conf=re.sub(r'@PLOTS@', s_plot, s_conf)
        #t_chr=pd.read_csv(os.path.join(Circos.HOME, "karyotype_"+pid+".tmp"), sep=r'\s+', header=None)
        #s_conf=re.sub(r'@CHROMOSOMES@', ";".join(t_chr[2]), s_conf)
        #avoid using Pandas, so that this script can be used in CGI on ldweb server, where numpy is not installed correctly
        S=karyotype.split("\n")
        S_chr=[]
        for s in S:
            if s.strip()=='': break
            S_chr.append(re.split(Circos.DELIMITER, s)[2])
        s_conf=re.sub(r'@CHROMOSOMES@', ";".join(S_chr), s_conf)
        s_symbol=""
        if symbol is not None:
            symbol_file=re.sub('CIRCOS_', 'CIRCOS_SYMBOL_', conf_file)
            util.save_string(symbol_file, symbol)
            S_tmp_file.append(symbol_file)
            s_symbol+="<plot>\n"
            s_symbol+="type = text\n"
            s_symbol+="color = black\n"
            s_symbol+="file = "+symbol_file+"\n"
            s_symbol+="r0=1.02r\n"
            s_symbol+="r1=1.2r\n"
            s_symbol+="label_size = 12p\n"
            s_symbol+="label_font = condensed\n"
            s_symbol+="padding = 0p\n"
            s_symbol+="rpadding = 0p\n"
            s_symbol+="</plot>\n"
        s_conf=re.sub(r'@SYMBOL@', s_symbol, s_conf)

        S_color=['107,174,214', '116,196,118', '106,81,163']
        s_link=""
        MAX_EDGES=10000 # Circos does not seem to work well with too many edges, it will not draw edges after maybe 20000-ish
        if links is not None:
            if type(links) is str: links=[links]
            for i in range(len(links)-1, -1, -1):
                link_file=re.sub('CIRCOS_', 'CIRCOS_LINK%02d_' % (i+1), conf_file)
                S_tmp_file.append(link_file)
                S_edges=links[i].strip().split("\n")
                n_edge=len(S_edges)/2
                if n_edge>MAX_EDGES:
                    # randomly sample a subset

                    IDX=np.repeat(np.random.permutation(list(range(0,len(S_edges),2)))[:MAX_EDGES], 2)
                    IDX[list(range(1,len(IDX),2))]+=1
                    S_in=[x for x in IDX if  x >=len(S_edges) ]
                    S_edges=pd.Series(S_edges)[IDX].astype(str)
                    links[i]="\n".join(S_edges)
                util.save_string(link_file, links[i])
                s_link+="<link link"+str(i+1)+">\n"
                s_link+="show = yes\n"
                s_link+="color = "+S_color[(i+len(S_color)-1)%len(S_color)]+"\n"
                s_link+="file = "+link_file+"\n"
                s_link+="</link>\n\n"
        s_conf=re.sub(r'@LINKS@', s_link, s_conf)
        #print s_conf
        util.save_string(conf_file, s_conf)
        #print s_conf
        ## run Circos
        #s_cmd = "cd "+os.path.join(os.path.dirname(__file__), "circos")+"; "
        s_cmd=self.BIN+" -conf "+conf_file
        s_cmd+=' -outputdir '+outputdir
        s_cmd+=' -outputfile '+outputfile
        #sw.check('prepare conf file')
        print(s_cmd)
        util.unix(s_cmd, l_print=False, l_error=False)
        l_remove_temp=True
        if l_remove_temp:
            for f in S_tmp_file:
                os.remove(f)
        s_file=os.path.join(outputdir, outputfile+".png")
        #sw.check('make circos image')
        if os.path.exists(s_file):
            return s_file
        else:
            return None
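
A minimal sketch of the placeholder substitution used above to assemble the Circos conf file: placeholders in a template string are replaced with generated fragments via re.sub. The template text here is hypothetical.

import re

s_conf = "karyotype = @KARYOTYPE@\nchromosomes = @CHROMOSOMES@\n"
s_conf = re.sub(r'@KARYOTYPE@', '/tmp/CIRCOS_KARYOTYPE_x.txt', s_conf)
s_conf = re.sub(r'@CHROMOSOMES@', ';'.join(['1', '2', '3']), s_conf)
print(s_conf)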
Example #17
def sql_in(s_sql_left,
           s_sql_right,
           S_id,
           num=1000,
           con=None,
           params_before=None,
           params_after=None):
    if con is None:
        util.error_msg('Database connection is not provided!')
    S_id = list(S_id)
    n = len(S_id)
    # in case it's multi-line SQL statement
    s_sql_left = re.sub('[\r\n]', ' ', s_sql_left)
    s_sql_right = re.sub('[\r\n]', ' ', s_sql_right)
    s_sql_right = re.sub(r'^\s*\)', '', s_sql_right)
    pat = re.compile(r'\s+(?P<col>[\w.]+)(?P<not>\s+NOT)?\s+IN\s*\(\s*$', re.I)
    m = re.search(pat, s_sql_left)
    if m is None:
        util.error_msg('Left SQL does not end with an IN clause: %s' %
                       s_sql_left)
    s_sql_left = s_sql_left[:m.start()] + " "
    s_col = m.group('col')
    l_NOT = m.group('not') is not None
    # sometimes the SQL contains its own parameters; we need to provide parameters before/after
    # the S_id parameters
    params_before = params_before or []
    params_after = params_after or []
    # If oracle, either use ToTable() or run multiple SQL queries
    # If multiple SQL are run, results do not support GROUP BY, ORDER BY, DISTINCT
    # as they are applied to individual SQL runs
    # For other SQL servers, results will be exact, via multiple OR statements
    if db_type(con) == 'ORACLE':
        # Custom function used where TOTABLE is defined under MY_SCHEMA
        MY_SCHEMA = setting.db['MY_SCHEMA']

        ### The definition of user function
        #CREATE OR REPLACE FUNCTION MY_SCHEMA.ToTable (
        #   P_STR     IN CLOB,
        #   P_DELIM   IN VARCHAR2 DEFAULT ',' ,
        #   P_LIKE    IN INT DEFAULT 0
        #)
        #   RETURN t_NTtype
        #AS
        #  L_DATA t_NTtype := t_NTtype ();
        #  L_STR  CLOB := P_STR || P_DELIM;
        #  L_SUBSTR VARCHAR2(4000);
        #  L_STEP PLS_INTEGER := 0;
        #  L_THIS INT := 1;
        #  L_PREV INT := 0;
        #  L_END CHAR := CASE P_LIKE WHEN 0 THEN NULL ELSE '%' END;
        #BEGIN
        #  LOOP
        #    L_STEP := L_STEP + 1;
        #    L_THIS := DBMS_LOB.INSTR(L_STR, P_DELIM, L_PREV + 1, 1);
        #    EXIT WHEN L_THIS = 0;
        #    L_SUBSTR :=
        #    TRIM(
        #      DBMS_LOB.SUBSTR(
        #        L_STR,
        #        L_THIS - L_PREV - 1,
        #        L_PREV + 1
        #      )
        #    );
        #    L_PREV := L_THIS;
        #    L_DATA.EXTEND();
        #    L_DATA(L_STEP) := L_SUBSTR || L_END;
        #  END LOOP;
        #  RETURN L_DATA;
        #END;
        ###

        t = from_sql(
            con, "select * from SYS.User_Objects where object_name='TOTABLE'")
        if len(t):
            S_id = [str(id) for id in S_id]
            S_id_filtered = [id for id in S_id if len(id) > 20
                             ]  # 20 is entry length limit within ToTable()
            S_id = [','.join([id for id in S_id if len(id) <= 20])]
            if len(S_id_filtered) > 0:
                util.warn_msg(
                    'The following parameters were filtered out from the sql due to the length constraint: '
                    + str(S_id_filtered))
            if l_NOT:
                s_id = '(NOT exists (select 1 from Table(' + MY_SCHEMA + '.ToTable(?)) tmp_table where tmp_table.COLUMN_VALUE = ' + s_col + '))'
            else:
                s_id = '(exists (select 1 from Table(' + MY_SCHEMA + '.ToTable(?)) tmp_table where tmp_table.COLUMN_VALUE = ' + s_col + '))'
            return from_sql(con,
                            s_sql_left + s_id + s_sql_right,
                            params=params_before + S_id + params_after)
        #else:
        #    t=[]
        #    for i in xrange(0, n, num):
        #        j=min(n,i+num)
        #        s_id=",".join(["?"]*(j-i))
        #        t2=from_sql(con, s_sql_left+" "+s_col+" IN ("+s_id+") "+s_sql_right, params=params_before+S_id[i:j]+params_after)
        #        t.append(t2)
        #    return pd.concat(t, axis=0, ignore_index=True)
    S = []
    for i in range(0, n, num):
        j = min(n, i + num)
        S.append(",".join(["?"] * (j - i)))
    if l_NOT:
        s_id = "(" + s_col + " NOT IN (" + (") AND " + s_col +
                                            " NOT IN (").join(S) + "))"
    else:
        s_id = "(" + s_col + " IN (" + (") OR " + s_col +
                                        " IN (").join(S) + "))"
    if db_type(con) == 'MYSQL' and not S:
        if l_NOT:
            s_id = "(" + s_col + " NOT IN (''))"
        else:
            s_id = "(" + s_col + " IN (''))"
    t = from_sql(con,
                 s_sql_left + s_id + s_sql_right,
                 params=params_before + S_id + params_after)
    return t
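
A self-contained sketch of the chunked-IN-clause idea the non-Oracle branch uses: split a long ID list into chunks of `num` placeholders and OR the IN clauses together, shown here against an in-memory sqlite3 database.

import sqlite3

def in_clause(col, ids, num=1000):
    chunks = [ids[i:i + num] for i in range(0, len(ids), num)]
    sql = " OR ".join("%s IN (%s)" % (col, ",".join("?" * len(c))) for c in chunks)
    return "(" + sql + ")", [x for c in chunks for x in c]

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE gene (gid INTEGER)")
con.executemany("INSERT INTO gene VALUES (?)", [(i,) for i in range(5)])
clause, params = in_clause("gid", [1, 3, 4], num=2)
print(con.execute("SELECT gid FROM gene WHERE " + clause, params).fetchall())
# [(1,), (3,), (4,)]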
Example #18
    def load(tax_id=9606, l_use_GPDB=True, S_DB=None, entrez=None):
        """tax_id is None, defaults to 9606, if 0, means load all supported species,
        entrez is only used in local mode to accelerate Symbol retrieval"""
        S_DB=S_DB or Cache.get_DB(l_use_GPDB)
        if tax_id is None:
            util.error_msg('tax_id must be an int, or 0 means all supported species')
        tax_id=abs(tax_id)
        s_key=Cache.key(l_use_GPDB)
        eg=entrez
        S_tax_id=[]
        if not l_use_GPDB:
            if tax_id not in (0,9606):
                util.error_msg('Local database only supports human!')
            tax_id=9606
            if tax_id in Cache.ppi_data[s_key]:
                S_DB=[x for x in S_DB if x not in Cache.ppi_data[s_key][tax_id]]
                if len(S_DB)==0: return
            S_tax_id=[tax_id]
            T=[]
            for filename in S_DB:
                print("loading PPI database: "+filename+" ...")
                if os.path.isfile(filename):
                    t=pd.read_csv(filename)
                    t['ds']=filename
                    T.append(t)
                elif os.path.isfile(Cache.DATA_DIR+filename+".csv"):
                    t=pd.read_csv(Cache.DATA_DIR+filename+".csv")
                    t['ds']=filename
                    T.append(t)
                else:
                    util.warn_msg('PPI database ' + filename + ' not found.')
            if len(T)>1:
                t=pd.concat(T, axis=0, ignore_index=True)
            else:
                t=T[0]
            t=t[(t.Gene_A!=t.Gene_B) & (t.Score>=0.5)].copy()
            if eg is None:
                eg=ez.EntrezGene(tax_id=tax_id)
            else:
                eg.load_organism(tax_id=tax_id)
            c_seen={}
            t.index=list(range(len(t)))
            t['Gene_A']=t.Gene_A.astype(str)
            t['Gene_B']=t.Gene_B.astype(str)
            S_gene_A=t.Gene_A.tolist()
            S_gene_B=t.Gene_B.tolist()
            for i in range(len(t)):
                gene_A=S_gene_A[i]
                gene_B=S_gene_B[i]
                if gene_A not in c_seen:
                    c_seen[gene_A]=eg.fix_gene_id(gene_A)
                S_gene_A[i]=c_seen[gene_A]
                if S_gene_A[i] is None: continue
                if gene_B not in c_seen:
                    c_seen[gene_B]=eg.fix_gene_id(gene_B)
                S_gene_B[i]=c_seen[gene_B]
            t['Gene_A']=S_gene_A
            t['Gene_B']=S_gene_B
            t=t[~(t.Gene_A.isnull() | t.Gene_B.isnull())].copy()
            t.index=list(range(len(t)))
            t['tax_id']=tax_id
        else:
            mydb=db.DB('METASCAPE')
            if tax_id>0 and tax_id in Cache.ppi_data[s_key]:
                S_DB=[x for x in S_DB if x not in Cache.ppi_data[s_key][tax_id]]
                if len(S_DB)==0: return
            if tax_id>0:
                print("loading PPI database from database for tax_id: %d ..." % tax_id)
                t=mydb.sql_in("SELECT gid_A Gene_A,gid_B Gene_B,0 Score,tax_id_A tax_id,ds from interaction where interaction_category!='genetic' and gid_A!=gid_B and tax_id_A=tax_id_B and tax_id_A=? and ds in (", ")", S_DB, params_before=[tax_id])
                S_tax_id=[tax_id]
            else:
                #ZZZ modify in the future, to obtain the list of all supported tax_id
                t=mydb.from_sql('SELECT DISTINCT tax_id FROM gid2source_id')
                S_tax_id=[x for x in t.tax_id.astype(int).tolist() if x not in Cache.ppi_data[s_key]]
                if len(S_tax_id):
                    s_tax_id=",".join(util.iarray2sarray(S_tax_id))
                    print("loading PPI database for tax_id: %s ..." % s_tax_id)
                    t=mydb.sql_in("SELECT gid_A Gene_A,gid_B Gene_B,0 Score,tax_id_A tax_id,ds from interaction where interaction_category!='genetic' and gid_A!=gid_B and tax_id_A=tax_id_B and ds in (", ")", S_DB)
                    #t=mydb.sql_in("SELECT gid_A Gene_A,gid_B Gene_B,0 Score,tax_id_A tax_id,ds from interaction where interaction_category!='genetic' and gid_A!=gid_B and tax_id_A=tax_id_B and tax_id_A in ("+s_tax_id+") and ds in (", ")", S_DB)
                else:
                    t=pd.DataFrame()
            if len(t):
                t['Gene_A']=t.Gene_A.astype(str)
                t['Gene_B']=t.Gene_B.astype(str)

        for x in S_tax_id:
            #print ">>>>>>>>>>>>>>>>>>>>>>>", x
            if x not in Cache.ppi_data[s_key]:
                Cache.ppi_data[s_key][x]={}
                Cache.ppi_node[s_key][x]={}
            for y in S_DB:
                Cache.ppi_data[s_key][x][y]={}
                Cache.ppi_node[s_key][x][y]=pd.DataFrame()

        if len(t)==0: return
        for k,t_v in t.groupby(['tax_id','ds']):
            #print ">>>", k, len(t_v)
            #t_v=t_v.copy()
            if k[0] not in S_tax_id: continue
            data={}
            t_node=None
            #t_v=t_v.copy()
            #t_v.index=list(range(len(t_v)))
            #for i in t_v.index:
                #if i%1000==0: print i
            for row in t_v.itertuples():
                gene_A=row.Gene_A #t_v.ix[i,'Gene_A']
                gene_B=row.Gene_B #t_v.ix[i,'Gene_B']
                score=row.Score #t_v.ix[i,'Score']
                if gene_A not in data:
                    data[gene_A]={gene_B:score}
                else:
                    data[gene_A][gene_B]=max(score, data[gene_A].get(gene_B,0))
                if gene_B not in data:
                    data[gene_B]={gene_A:score}
                else:
                    data[gene_B][gene_A]=max(score, data[gene_B].get(gene_A,0))
            Cache.ppi_data[s_key][k[0]][k[1]]=data
            S_gene=list(data.keys())
            if l_use_GPDB:
                t_node=mydb.sql_in("SELECT gid Gene,source_id Symbol from gid2source_id t where gid in (", ") and t.id_type_id=1", util.rarray2iarray(S_gene))
                t_node['Gene']=t_node.Gene.astype(str)
                if len(S_gene)!=len(t_node):
                    util.warn_msg("Strange, gene ID has no symbol?")
                    t=pd.DataFrame({'Gene':S_gene})
                    t_node=t.merge(t_node, left_on='Gene', right_on='Gene', how='left')
                    X=t_node.Symbol.isnull()
                    print(t_node.ix[X][:10])
                    if X.any():
                        t_node.ix[X,'Symbol']=t_node.ix[X,'Gene']
            else:
                t_node=eg.gene_sarray_to_table(S_gene, l_description=False)
            Cache.ppi_node[s_key][k[0]][k[1]]=t_node
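
A minimal sketch of the symmetric adjacency structure built in the loop above: a dict-of-dicts keyed by gene, keeping the maximum score seen for each pair.

edges = [('A', 'B', 0.7), ('B', 'A', 0.9), ('A', 'C', 0.5)]
data = {}
for gene_A, gene_B, score in edges:
    data.setdefault(gene_A, {})[gene_B] = max(score, data.get(gene_A, {}).get(gene_B, 0))
    data.setdefault(gene_B, {})[gene_A] = max(score, data.get(gene_B, {}).get(gene_A, 0))
print(data)  # {'A': {'B': 0.9, 'C': 0.5}, 'B': {'A': 0.9}, 'C': {'A': 0.5}}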
Example #19
    def plot_membership(self, t_membership, t_go=None, outputdir=None, outputfile="CircosPlot", s_symbol="Symbol", S_label=None, S_color=None):
        """t_mem is a table, where index is Gene ID, contains an optional column for Symbol,
        a list of columns starts with _MEMBER_, which is 1/0 indicating membership in certain hit list.  This method is for membership table generated within biolist.py.
        rename is a dict that maps existing list name (without _MEMBER_ prefix) into new name. For Circos, should not contain space.
        SIDE EFFECT: if t_go is specified, t_membership will be modified, 0.5 added to where GO-indirect membership is found!!!
        """
        #sw=util.StopWatch()
        MAX_NOF_LISTS=100
        D=Circos.DELIMITER
        t_mem=t_membership.copy()
        S_col=[x for x in t_mem.header() if x.startswith('_MEMBER_')]
        if len(S_col)>MAX_NOF_LISTS:
            S_col=S_col[:MAX_NOF_LISTS]
            util.warn_msg('Too many lists, only show the first %d in Circos plot!' % MAX_NOF_LISTS)
        if len(S_col)<=1:
            util.warn_msg('Need at least two lists to plot Circos plot, only %d found!' % len(S_col))
            return self.plot(karyotype="", symbol="", links=None, hits="", outputdir=outputdir, outputfile=outputfile)
        if S_label is None:
            S_label=[re.sub(r'[.;]', ' ', x[8:]) for x in S_col]
        S_chr=[str(i+1) for i in range(len(S_col))]
        c_rename={ x: S_chr[i] for i,x in enumerate(S_col) }
        t_mem.rename2(c_rename)
        n=len(S_chr)
        l_symbol=s_symbol is not None and (s_symbol in t_mem.header())
        if not l_symbol:
            t_mem['Symbol']=t_mem.index.astype(str)
        t_mem['_SUM_']=t_mem.ix[:, S_chr].sum(axis=1)
        if S_color is None:
            S_color=Circos.get_qualitative_colors(n)
        else:
            # convert color from hex to RGB
            S_color=[Circos.hex_to_rgb(x) for x in S_color]
        c_coord={}
        s_karyotype=""
        s_symbol="" if l_symbol else None
        s_hits=""
        #sw.check('Prepare Membership')

        for i,x in enumerate(S_chr):
            s_karyotype+="chr"+D+"-"+D+S_chr[i]+D+S_label[i]+D+"0"+D+str(int(t_mem.ix[:,x].sum()))+D+S_color[i%len(S_color)]+"\n"
            tmp=t_mem[t_mem.ix[:, x]>0].copy()
            #tmp["_PATTERN_"]=tmp["_PATTERN_"].apply(lambda x: x[i:]+x[:i])
            tmp.sort_values(['_SUM_', '_PATTERN_', 'Symbol'], ascending=[False, False, True], inplace=True)
            j=0
            for idx in tmp.index:
                j+=1
                if l_symbol:
                    s_symbol+=S_chr[i]+D+str(j-1)+D+str(j)+D+t_mem.ix[idx, 'Symbol']+"\n"
                if idx not in c_coord:
                    c_coord[idx]={}
                c_coord[idx][S_chr[i]]=j
                s_hits+=S_chr[i]+D+str(j-1)+D+str(j)+D+str(int(t_mem.ix[idx, '_SUM_']))+"\n"
        #sw.check('prepare Hit')
        tmp=t_mem[t_mem._SUM_ > 1].copy()
        tmp.sort_values('_SUM_', inplace=True)
        link_cnt=0
        out=[]
        for s_gene,v in c_coord.items():
            S_on_chr=list(v.keys())
            for i in range(len(S_on_chr)):
                chr1=S_on_chr[i]
                i_from=c_coord[s_gene][chr1]
                for j in range(i+1, len(S_on_chr)):
                    chr2=S_on_chr[j]
                    i_to=c_coord[s_gene][chr2]
                    out.append("e"+str(link_cnt)+D+chr1+D+str(i_from-1)+D+str(i_from))
                    out.append("e"+str(link_cnt)+D+chr2+D+str(i_to-1)+D+str(i_to))
                    #s_link+=str(link_cnt)+D+chr1+D+str(i_from-1)+D+str(i_from)+D+str(link_cnt)+D+chr2+D+str(i_to-1)+D+str(i_to)+"\n"
                    link_cnt+=1
        s_link="\n".join(out)
        S_link=[s_link]
        #sw.check('prepare link')

        def go_links(t_go, l_sample=False):
            link_cnt=0
            c_plot={}
            out=[]
            MAX_EDGES=10000
            MAX_GENES=20
            s_link=None
            for _,r in t_go.iterrows():
                s=r['GeneID']
                #print '>>>>', r['GO']
                #if r['GO']!='GO:0007156': continue
                S=s.split("|")
                if link_cnt>MAX_EDGES: # we start to get aggressive
                    if not l_sample: return None # the caller should call us again with l_sample=True
                    S=np.random.choice(S, MAX_GENES)
                for i in range(len(S)):
                    g1=S[i]
                    for j in range(i+1, len(S)):
                        g2=S[j]
                        #if r['GO']=='GO:0007156':
                        #    print g1+":"+g2, g1+":"+g2 in c_plot, g2+":"+g1 in c_plot
                        if g1+":"+g2 in c_plot or g2+":"+g1 in c_plot: continue
                        c_plot[g1+":"+g2]=True
                        for chr1,i_from in c_coord[g1].items():
                            #if r['GO']=='GO:0007156': print ">>> "+str(chr1)+" "+str(i_from)
                            for chr2,i_to in c_coord[g2].items():
                                #if r['GO']=='GO:0007156': print "   <<< "+str(chr2)+" "+str(i_to)
                                if chr1==chr2: continue # same chromosome
                                out.append("e"+str(link_cnt)+D+chr1+D+str(i_from-1)+D+str(i_from))
                                out.append("e"+str(link_cnt)+D+chr2+D+str(i_to-1)+D+str(i_to))
                                #str(link_cnt)+D+chr1+D+str(i_from-1)+D+str(i_from)+D+str(link_cnt)+D+chr2+D+str(i_to-1)+D+str(i_to))
                                link_cnt+=1
                                # use a value below 0.5 (0.333) to avoid degeneracy in Euclidean distance calculation
                                if t_membership.ix[g1, S_col[int(chr2)-1]]==0: t_membership.ix[g1, S_col[int(chr2)-1]]=0.333
                                if t_membership.ix[g2, S_col[int(chr1)-1]]==0: t_membership.ix[g2, S_col[int(chr1)-1]]=0.333
            s_link="\n".join(out)
            return s_link

        if t_go is not None:
            for x in S_col:
                t_membership[x]=t_membership[x].astype(float)
            s_link=go_links(t_go, False)
            if s_link is None:
                s_link=go_links(t_go, True)
            S_link.append(s_link)
            #sw.check('GO links')
        return self.plot(karyotype=s_karyotype, symbol=s_symbol, links=S_link, hits=s_hits, outputdir=outputdir, outputfile=outputfile)
Example #20
    def map(self, X, n_CPU=0, l_quit=None):
        """X: list[input tuple], list of input parameters. If workers were started with f=None, each element in X in passed to the wrapper task. In that case, we expect X to be a tuple (or list) and the first element of X must be either the method pointer or its registered name. However, many methods, such as instance method or func within a func, cannot be pickled, therefore the method cannot be send over the pipe. We should pre-register such methods and call them by name. Need example later.
        l_quit: boolean, default None, if specified, controls whether workers quit or not after tasks are processed. If not, workers wait for future tasks.
        return list"""
        # very similar to the idea in https://stackoverflow.com/questions/3288595/multiprocessing-how-to-use-pool-map-on-a-function-defined-in-a-class, author klaus se
        l_quit = l_quit if l_quit is not None else self.QUIT
        #if self.is_busy():
        #    util.error_msg('Works are still busy!')
        if n_CPU == 0: n_CPU = self.n_use  # defaults to n_use
        n_CPU = min(
            n_CPU, self.n_CPU
        )  # one could start 8 CPUs, but only use 4 for mapping, if the work takes lots of memory
        res = []
        n_input = len(X)
        if n_input == 0 and not l_quit: return res
        #print '=============', self.c_proc
        if not self.has_started() and n_input > 0:
            util.warn_msg(
                'Please start processes first, no worker is running!')
            util.warn_msg('However, we will process the task with ONE cpu!!!')
            return [self.wrapper(x) for x in X]

        if n_input > 0 and n_CPU == 0:
            return [self.wrapper(x) for x in X]

        s_pid = str(multiprocessing.current_process().pid)
        has_my_job = [False for i in range(self.n_CPU)]

        def engine():
            print(
                "================================================================="
            )
            print("PID: ", str(multiprocessing.current_process().pid))
            print("WORK STATUS: ", self.work_status)
            print("HAS MY JOB: ", has_my_job)
            print("JOB IS DONE: ", self.job_is_done)
            print("N_RUNNING: (%d, %d) " %
                  (self.n_running[0], self.n_running[1]))
            print(
                "================================================================="
            )

        def is_busy():
            return sum(has_my_job) > 0

        def process_out(out):
            i, x = out
            if i is None:
                self.n_running[0] -= 1
                # I modify the original code, so that we can join the process and release it as soon as possible
                if type(x) is str:
                    print("Exception> " + x)
                    exit()
                else:
                    if self.DEBUG:
                        print("Progress: %d processes remaining. Stopping %d" %
                              (self.n_running[0], x))
                    #print self.c_proc.keys()
                    self.c_proc[x].join()
                    del self.c_proc[x]
                    if self.DEBUG: print("Progress: process %d stopped." % x)
            else:
                res.append(out)
                if self.DEBUG:
                    print("Progress: %d of %d item calculated." %
                          (len(res), n_input))

        def fetch(l_lock=False):
            while is_busy():
                l_fetch_something = False
                for i_worker in range(self.n_CPU):
                    if has_my_job[i_worker] and self.job_is_done[i_worker]:
                        try:
                            (i, x) = self.q_out[i_worker].get()
                            process_out((i, x))
                            self.n_running[1] -= 1
                            self.work_status[i_worker] = False
                            has_my_job[i_worker] = False
                            self.job_is_done[i_worker] = False
                            l_fetch_something = True
                            if self.DEBUG:
                                print(">>>A1")
                                engine()
                            with self.mail:
                                self.mail.notify_all()
                        except Exception as e:
                            print(
                                "ERROR> Fail to fetch results from worker: %d"
                                % i_worker)
                            print(traceback.format_exc())
                            return
                if not l_fetch_something:
                    if l_lock:
                        with self.mail:
                            self.mail.wait(timeout=8.5 + random.random() * 3)
                    else:
                        return

        if self.DEBUG: print("DEBUG> self.n_running: ", self.n_running)
        if self.DEBUG:
            print("DEBUG> self.c_proc.keys(): ", list(self.c_proc.keys()))
        ###ZHOU FEB16,2016
        #self.n_running[1]+=1
        ###
        i = 0
        while (i < n_input):
            x = X[i]
            if self.DEBUG: print("fetch job entry %d " % i)
            #print self.work_status
            self.lock.acquire()
            j = util.index(False, self.work_status)  # find an idle worker
            l_put_something = False
            if j >= 0 and sum(
                    has_my_job
            ) < n_CPU:  #j>=0 and j<n_CPU: # we only use up to n_CPU, even if there are more workers
                #print "assing job to %d" % j
                self.work_status[j] = True  # flag it as busy
                has_my_job[j] = True
                if self.DEBUG:
                    print("DEBUG> self.c_proc.keys(): ",
                          list(self.c_proc.keys()))
                ###ZHOU FEB16,2016
                self.n_running[1] += 1
                ###
                self.lock.release()
                self.q_in[j].put((i, j, x))  # assign task
                i += 1
                if self.DEBUG:
                    print("Progress: send input %d of %d items." % (i, len(X)))
                l_put_something = True
                if self.DEBUG:
                    print(">>>A2")
                    engine()
            else:
                self.lock.release()
            # we constantly remove items from the output queue, so that the processes can release some memory
            fetch()
            if not l_put_something:
                with self.mail:
                    self.mail.wait(timeout=8.5 + random.random() * 3)
                    fetch()

        while (is_busy()):
            fetch(True)
        if self.DEBUG:
            print(">>>A3")
            engine()
        self.lock.acquire()
        ###ZHOU FEB16,2016
        #self.n_running[1]-=1
        ###
        if self.DEBUG:
            print(">>>QUIT=" + ("True" if l_quit else "False"))
            print(">>>n_running[1]=%d" % self.n_running[1])
        if l_quit and self.n_running[1] == 0:  # I am the last one running map()
            if self.DEBUG:
                print(">>>A4")
                engine()
            for i in range(self.n_CPU):
                self.q_in[i].put((None, i, None))
                self.work_status[i] = True
                self.n_running[1] += 1
                has_my_job[i] = True
            self.lock.release()
            while (is_busy()):
                fetch(True)
                if self.DEBUG:
                    engine()
        else:
            self.lock.release()
        if self.DEBUG:
            for i, x in enumerate(res):
                print('>>>A4 ', i, type(x), type(x[0]), type(x[1]))
        res.sort(key=lambda x: x[0])
        return [x for i, x in res]
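
For comparison, a sketch of the standard-library analogue of the map() above (without the registered-name dispatch or the keep-workers-alive behavior): multiprocessing.Pool distributes a picklable function over a list and returns results in order.

import multiprocessing

def square(x):
    return x * x

if __name__ == '__main__':
    with multiprocessing.Pool(processes=4) as pool:
        print(pool.map(square, [1, 2, 3, 4]))  # [1, 4, 9, 16]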