def disapprove(self, PMID):
    """Mark the citation identified by PMID as not approved (PStatus='N').

    Emits a warning and does nothing when the PMID is not in the collection.
    """
    record_id = self.get_id(PMID)
    if record_id:
        self.db.from_sql(
            "update gp.citation set PStatus='N' where PMID=?",
            params=[PMID])
    else:
        util.warn_msg('PMID: %s is not in collection!' % PMID)
def make_legend_pdf(s_pdf, S_png=[], l_clean_up=True):
    """Stack the legend PNGs listed in S_png onto one letter-sized PDF page.

    Images are drawn starting at the page center and moving upward, each
    right-aligned against the horizontal center line. Source PNGs are
    optionally deleted after use; the PDF is saved only if at least one
    image was actually placed.
    """
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import letter
    #if s_wm is None:
    #    s_wm=os.path.join(os.path.dirname(__file__), "watermark", "Watermark.pdf")
    if os.path.exists(s_pdf):
        os.remove(s_pdf)
    cvs = canvas.Canvas(s_pdf, pagesize=letter)
    page_w, page_h = cvs._pagesize
    # start placement at the page center
    x_right = int(page_w) / 2
    y = page_h / 2
    drew_any = False
    for fname in S_png:
        # Add to lower-right corner, bottom up
        #X=mahotas.imread(fname)
        if not os.path.exists(fname):
            util.warn_msg('File not found in make_legend_pdf: %s' % fname)
            continue
        img = scipy.misc.imread(fname)
        img_h, img_w, _ = img.shape
        cvs.drawImage(fname, x_right - img_w, y)
        y += img_h  # the next image goes above the previous one
        drew_any = True
        if l_clean_up:
            os.remove(fname)
    if drew_any:
        cvs.save()
def cynet_from_xgmml(self, s_file, name=None, s_layout="force-directed", l_bundle=True, t_xy=None, l_digraph=False):
    """Build a Cytoscape network from an XGMML file.

    If t_xy (explicit node coordinates) is provided, s_layout is ignored.
    Returns a CyNetwork, or None when s_file does not exist.
    """
    import os
    if not os.path.exists(s_file):
        import util
        util.warn_msg('Missing file: %s' % s_file)
        return None
    if l_digraph:
        import digraph
        net = digraph.Digraph(s_file, name=name)
    else:
        net = xgmml.Network(s_file, name=name)
    if t_xy is not None:
        # explicit coordinates override any automatic layout
        net.set_XY(t_xy)
        s_layout = None
    S = net.T_node.header()
    #print "---------------->", S
    #S=[x for x in S if x not in ('Gene','Name','Symbol')]
    #net.T_node.drop(S, axis=1, inplace=True)
    ##############################
    return self.cynet_from_network(net, name=name, s_layout=s_layout, l_bundle=l_bundle)
def remove(self, PMID):
    """Delete the citation identified by PMID from the collection.

    Emits a warning and does nothing when the PMID is not present.
    """
    # BUG FIX: 'self' was missing from the signature even though the body
    # uses self.get_id and self.db, so every call raised a TypeError (or
    # bound PMID to the instance). Sibling methods all take self.
    id = self.get_id(PMID)
    if id == 0:
        util.warn_msg('PMID: %s is not in collection!' % PMID)
    else:
        self.db.from_sql("delete from gp.citation where PMID=?", params=[PMID])
def graph(self, PMID, has_graph='Y'):
    """Set the Graph flag ('Y'/'N') on the citation identified by PMID.

    Emits a warning and does nothing when the PMID is not in the collection.
    """
    id = self.get_id(PMID)
    if id == 0:
        util.warn_msg('PMID: %s is not in collection!' % PMID)
    else:
        # BUG FIX: the statement has two placeholders (Graph value and
        # PMID) but only has_graph was bound; the PMID parameter was
        # missing, so the UPDATE could not target the intended row.
        self.db.from_sql("update gp.citation set Graph=? where PMID=?", params=[has_graph, PMID])
def restore_distance(s_file, max_dist=1.0):
    """Rescale the last column of s_file back into a similarity score.

    No longer needed, as we use cwc.new. The file is rewritten in place
    (tab-separated, no header) only when every restored distance is <= 1.0;
    otherwise the restore is skipped with a warning.
    """
    df = pd.read_table(s_file, header=None)
    last_col = util.header(df)[-1]
    # distance = |1 - similarity| * max_dist
    dist = (-df[last_col] + 1.0).abs() * max_dist
    if dist.max() > 1.0:
        util.warn_msg('Cannot restore distance to similarity, as the max distance ' + str(dist.max()) + ' > 1.0, restore skipped!')
    else:
        # convert back to similarity score and overwrite the file
        df[last_col] = util.rarray2sarray((-dist + 1.0).abs(), s_format='%.3f')
        df.to_csv(s_file, sep='\t', index=False, header=False)
def insert(self, PMID, PStatus='N', Graph='N'):
    """Fetch the record for PMID and insert it into the citation collection.

    PStatus and Graph are the initial status flags stored with the record.
    Returns the fetched record dict; {} when the PMID is already present
    (fetch failure also yields the empty dict returned by fetch_pmid).
    """
    id = self.get_id(PMID)
    if id != 0:
        # already stored — warn and skip the fetch entirely
        util.warn_msg('PMID: %s already in collection, id=%s!' % (PMID, id))
        data = {}
    else:
        data = self.fetch_pmid(PMID)
        if len(data) == 0:
            util.warn_msg('PMID: %s cannot be fetched!' % (PMID))
        else:
            #print data
            self.insert_by_data(data, PStatus, Graph)
    return data
def get_date(self):
    """Load the two most recent 'history' dates from the statistics table.

    Sets self.d1 (latest) and self.d2 (second latest; None when only one
    history entry exists). Calls util.error_msg when no history is found.
    """
    t = self.db.from_sql('select distinct history from statistics order by history desc limit 0,2')
    # Consistency fix: the rest of the file uses the print() function;
    # these were Python-2 print statements.
    print(t)
    self.d1 = self.d2 = None
    if len(t) == 0:
        util.error_msg('No history data is found')
    elif len(t) == 1:
        util.warn_msg('Only one history entry')
        self.d1 = str(t.ix[0, 'history'])  # NOTE(review): .ix is removed in modern pandas — confirm pandas version
    else:
        self.d1 = str(t.ix[0, 'history'])
        self.d2 = str(t.ix[1, 'history'])
    print(self.d1)
    print(self.d2)
def add_watermark_pdf(S_pdf, s_out="out", s_wm=None, l_clean_up=True):
    """Concatenate the PDFs in S_pdf into s_out, appending a watermark page.

    s_wm defaults to watermark/Watermark.pdf next to this module. Output is
    written to s_out+'.tmp' first, then moved into place, so s_out may be one
    of the inputs. When l_clean_up is true, the watermark file and all input
    PDFs are deleted afterwards.
    """
    #https://www.binpress.com/tutorial/manipulating-pdfs-with-python/167
    #import shutil
    #shutil.copyfile(s_out, s_out+".v2.pdf")
    s_out, s_ext = os.path.splitext(s_out)
    s_out = s_out + '.pdf'  # normalize the output extension
    #if os.path.exists(s_out):
    #    os.remove(s_out)
    if type(S_pdf) is str:
        S_pdf = [S_pdf]  # accept a single path as well as a list
    if s_wm is None:
        s_wm = os.path.join(os.path.dirname(__file__), "watermark", "Watermark.pdf")
    if not os.path.exists(s_wm):
        util.warn_msg('File not found in make_watermark_pdf: %s' % s_wm)
        return
    f_in = []
    f_wm = open(s_wm, 'rb')
    wpdf = PdfFileReader(f_wm)
    watermark = wpdf.getPage(0)
    f_in.append(f_wm)
    # there appears to be issue with Python2, we need to close f_wm after it's been used
    out = PdfFileWriter()
    for s_pdf in S_pdf:
        if not os.path.exists(s_pdf):
            util.warn_msg('File not found in make_watermark_pdf: %s' % s_pdf)
            continue
        f = open(s_pdf, 'rb')
        ipdf = PdfFileReader(f)
        for i in range(ipdf.getNumPages()):
            page = ipdf.getPage(i)
            out.addPage(page)
        f_in.append(f)
        # NOTE(review): watermark page appended after each input document —
        # confirm this per-document placement is intended (vs. once at end).
        out.addPage(watermark)
    # if s_out is one of S_pdf, it may trigger an error msg as s_pdf is still open
    with open(s_out + ".tmp", 'wb') as f:
        out.write(f)
    # incase s_out is the same as one of input S_pdf
    for f in f_in:
        f.close()
    if l_clean_up:
        os.remove(s_wm)
        print("Deleting ... " + s_wm)
        for s_pdf in S_pdf:
            if os.path.exists(s_pdf):
                os.remove(s_pdf)
                print("Deleting ... " + s_pdf)
    shutil.move(s_out + ".tmp", s_out)
def start(self, f=None, n_CPU=0):
    """Launch worker processes.

    f: method, task for worker, default None. If None, the worker runs a
       wrapper task, which expects the first argument to be a registered
       method name.
    n_CPU: number of workers to launch; 0 means cpu_count()-1.

    Restarts (stop + relaunch) if workers are already running. Shared state
    (work_status, job_is_done, n_running, lock) lives in a Manager so all
    processes see one copy.
    """
    if self.has_started():
        util.warn_msg('Already started, stop first and start new!')
        self.stop()
    self.n_CPU = n_CPU if n_CPU > 0 else multiprocessing.cpu_count() - 1
    self.n_use = self.n_CPU  # use all CPUs by default, but user could ask to use fewer
    # per-worker queues: q_in capped at 1 pending task, q_out unbounded
    self.q_in = [multiprocessing.Queue(1) for i in range(self.n_CPU)]
    self.q_out = [multiprocessing.Queue() for i in range(self.n_CPU)]
    self.c_proc = {}
    #bug, if c_proc is not inside manager, each process will see {}, {1proc}, {2procs} ..., as they are created
    self.f = f
    if self.f is None:
        self.f = self.wrapper
    # put work_status, n_running into manager, so visible by all processes, see the same copy
    self.mgr = multiprocessing.Manager()
    #self.c_proc=self.mgr.dict({})
    self.work_status = self.mgr.list([False for i in range(self.n_CPU)])
    # remember which process is working on my job
    #self.has_my_job=[False for i in range(self.n_CPU)]
    self.has_my_job = None
    # whether job is done, results ready for reading
    self.job_is_done = self.mgr.list([False for i in range(self.n_CPU)])
    self.n_running = self.mgr.list([0, 0])
    # n_running [0] is # of running process, [1] # of map() calls running
    # use a list, so that fetch routine can modify it (if we do n_running=0, fetch cannot m
    self.lock = self.mgr.Lock()
    self.mail = multiprocessing.Condition()  # notify all processes when a worker finishes
    procs = [
        multiprocessing.Process(target=self.spawn(self.f),
                                args=(self.q_in[i], self.q_out[i], self.mail, self.job_is_done))
        for i in range(self.n_CPU)
    ]
    for p in procs:
        p.daemon = True
        p.start()
        self.c_proc[p.pid] = p
        self.n_running[0] += 1
        if self.DEBUG:
            print("DEBUG>>> %s %d %d" % (str(p), p.pid, self.n_running[0]))
    if self.DEBUG:
        print("DEBUG> self.c_proc: ", self.c_proc)
def color(self, nodes, l_name_to_id=True, cm=None):
    """Color tree nodes
    nodes: dict or list of nodes lists
        nodes and colormap cm are combined to passed to Tree.color_map (see document)
    l_name_to_id: bool, default True, use leave name or node_id
    Warning: this method color nodes, so if leave name is provided and two
    leaves under the same node has different colors, only one color is used.

    Rewrites self.tree_file in place with a NODECOLOR column.
    """
    df = Tree.read_tree_file(self.tree_file)
    # ['node','left','right','similarity', 'color']
    df['color'] = '#000000'  # default: black
    c_id2node = {}
    c_name2id = {}
    if l_name_to_id:
        # invert the id->name map so leaf names can be resolved to ids
        for k, v in self.c_name.items():
            c_name2id[v] = k
    for i in df.index:
        # leaf entries (not starting with 'NODE') map to their parent node
        if not df.ix[i, 'left'].startswith('NODE'):
            c_id2node[df.ix[i, 'left']] = df.ix[i, 'node']
        if not df.ix[i, 'right'].startswith('NODE'):
            c_id2node[df.ix[i, 'right']] = df.ix[i, 'node']
    t = df.ix[:, ['node', 'color']]
    t.set_index('node', inplace=True)
    c = Tree.color_map(nodes, cm)
    for k, v in c.items():
        if type(v) is tuple:
            # RGB tuple in [0,1] -> '#rrggbb' hex string
            v = [min(max(int(x * 255), 0), 255) for x in v]
            v = '#%02x%02x%02x' % tuple(v[:3])
        if l_name_to_id:
            k = c_name2id.get(k, k)
        k = c_id2node.get(k, k)
        if k in t.index:
            t.ix[k, 'color'] = v
        else:
            util.warn_msg('Node not in tree: ' + k)
    df['color'] = list(t.color)
    df.columns = ['NODEID', 'LEFT', 'RIGHT', 'CORRELATION', 'NODECOLOR']
    util.df2sdf(df, s_format="%.4f").to_csv(self.tree_file, index=False, sep="\t")
def cynet_from_network(self, net, name=None, s_layout="force-directed", l_bundle=True): """Input net: xgmml.Network object return CyNetwork""" #print "====>>>>>>>>", net.T_node.header() data = net.to_json() if name is None: name = data['data']['name'] #cynet=self.cy.create(suid=net) cynet = None for trial in range( 3): # something Cytoscape errors, try up to three times try: if cynet is None: cynet = self.cy.network.create(name=name, data=data) except: cynet = None print("Fail at trail %d" % (trial + 1)) print("JSON data>", data) if cynet is None: import sys util.warn_msg('Error during plot_go_network(): %s' % sys.exc_info()[0]) print(traceback.format_exc()) id = cynet.get_id() if s_layout is not None: #print "apply layout %s" % s_layout self.cy.layout.apply(name=s_layout, network=cynet) if l_bundle: self.cy.edgebundling.apply(cynet) #print "apply bundle %s" % s_layout self.cy.layout.fit(cynet) return cynet
def OmniPath(self):
    """Download and normalize the OmniPath interaction table.

    Maps UniProt accessions to gene IDs/tax IDs via self.uniprot(), keeps
    only same-species, non-self pairs, restricts to human (9606) when other
    species slip through, and stores the standardized interaction table on
    self.omni (also returned).
    """
    fn_soure = os.path.join(SyncDB.DOWNLOAD_DIR(), "interactions")
    if not os.path.exists(fn_soure):
        # NOTE(review): urllib.urlretrieve is Python-2 only; under Python 3
        # this would be urllib.request.urlretrieve — confirm runtime.
        urllib.urlretrieve("http://omnipathdb.org/interactions", fn_soure)
    t = pd.read_table(fn_soure)
    #source target is_directed is_stimulation is_inhibition dip_url
    #Q13873 Q969L4 1 0 0
    c_gid, c_tax = self.uniprot()
    # resolve UniProt accessions; 0 marks an unmapped accession
    t['source_gene_id'] = t.source.apply(lambda x: c_gid.get(x, 0))
    t['tax_id_A'] = t.source.apply(lambda x: c_tax.get(x, 0))
    t['target_gene_id'] = t.target.apply(lambda x: c_gid.get(x, 0))
    t['tax_id_B'] = t.target.apply(lambda x: c_tax.get(x, 0))
    #print len(t)
    # keep mapped, same-species, non-self pairs only
    t = t[(t.source_gene_id > 0) & (t.target_gene_id > 0) &
          (t.tax_id_A == t.tax_id_B) &
          (t.source_gene_id != t.target_gene_id)].copy()
    c_cnt = util.unique_count(t.tax_id_A)
    # Consistency fix: use print() like the rest of the file (was a
    # Python-2 print statement).
    print(c_cnt)
    if len(c_cnt) > 1:
        util.warn_msg('Found non-human genes')
        t = t[t.tax_id_A == 9606]
    t['interaction_type_id'] = 3
    t['interaction_categy'] = 'Signaling Pathway'
    t['interaction_type'] = 'Literature'
    t['score'] = '-'
    t['ds'] = 'OmniPath'
    t.rename2({
        'source_gene_id': 'gid_A',
        'target_gene_id': 'gid_B',
        'dip_url': 'support'
    })
    t = t[[
        'gid_A', 'gid_B', 'tax_id_A', 'tax_id_B', 'interaction_type_id',
        'interaction_categy', 'interaction_type', 'score', 'support', 'ds'
    ]].copy()
    self.omni = t
    return self.omni
def add_watermark_png(s_png, s_out="out", s_wm=None, s_legend=None, l_clean_up=True):
    """Paste a watermark (and optional legend) into the corner of a PNG.

    The watermark is placed in the lower-right corner of s_png's pixels and
    the legend (if any) directly above it; the result is written to
    s_out+'.png'. Channel counts are reconciled by adding/stripping an
    alpha channel. With l_clean_up, watermark/legend files are deleted.
    """
    s_out, s_ext = os.path.splitext(s_out)
    s_out = s_out + '.png'  # normalize the output extension
    if os.path.exists(s_out):
        os.remove(s_out)
    #M=mahotas.imread(s_png)
    if not os.path.exists(s_png):
        util.warn_msg('File not found in make_watermark_png: %s' % s_png)
        return
    M = scipy.misc.imread(s_png)
    if s_wm is None:
        s_wm = os.path.join(os.path.dirname(__file__), "watermark", "Watermark.png")
    #W=mahotas.imread(s_wm)
    if not os.path.exists(s_wm):
        util.warn_msg('File not found in make_watermark_png: %s' % s_wm)
        return
    W = scipy.misc.imread(s_wm)
    if l_clean_up:
        print("Deleting ... " + s_wm)
        os.remove(s_wm)
    h1, w1, x1 = M.shape
    h2, w2, x2 = W.shape
    if x1 > x2:
        # missing alpha channel — pad the watermark to match the base image
        W = np.dstack([W, np.ones([h2, w2])])
    # overwrite the lower-right corner with the watermark
    M[-h2:, -w2:] = W
    if s_legend is not None:
        # add legend just above the watermark, right-aligned
        #L=mahotas.imread(s_legend)
        if not os.path.exists(s_legend):
            util.warn_msg('File not found in make_watermark_png: %s' % s_legend)
        else:
            L = scipy.misc.imread(s_legend)
            h3, w3, x3 = L.shape
            if x1 > x3:
                L = np.dstack([L, np.ones([h3, w3])])
            elif x1 < x3:
                # legend has extra channels — drop them
                L = L[:, :, :x1]
            M[-h2 - h3:-h2, -w3:] = L
            if l_clean_up:
                print("Deleting ... " + s_legend)
                os.remove(s_legend)
    #mahotas.imsave(s_out, M)
    scipy.misc.imsave(s_out, M)
def make_legend_png(s_png, S_png=[], l_clean_up=True):
    """Attach the legend images in S_png to the lower-right of s_png, in place.

    The base image is widened (white fill) as needed so legends never cover
    existing non-white pixels, then each legend is stacked bottom-up along
    the right edge. s_png is overwritten; used legend files are removed
    when l_clean_up is true.
    """
    #s_out, s_ext=os.path.splitext(s_png)
    #s_out=s_out+'.png'
    #if os.path.exists(s_out):
    #    os.remove(s_out)
    #M=mahotas.imread(s_png)
    if not os.path.exists(s_png):
        util.warn_msg('File not found in make_legend_png: %s' % s_png)
        return
    try:
        M = scipy.misc.imread(s_png)
        h1, w1, x1 = M.shape
    except:
        util.warn_msg('Cannot load image: %s ' % s_png)
        return
    h = 0
    S_legend = []
    S_png2 = []
    for x in S_png:
        if not os.path.exists(x):
            util.warn_msg('File not found in make_legend_png: %s' % x)
            continue
        # Add to lower-right corner, bottom up
        X = scipy.misc.imread(x)
        S_legend.append(X)
        S_png2.append(x)
    room = 10  # give extra 10px blank space to the left of the legend
    if len(S_legend):
        w = 0
        for Q in S_legend:
            w = max(w, Q.shape[1]) + room
        if w1 < w:
            # initial image is too small — pad it with white on the right
            M2 = np.ones([h1, w, x1]) * 255
            M2[:, 0:w1] = M
            M = M2
            h1, w1, x1 = M.shape
        # find out the extra width required to expand the image, so that legend does not cover up existing pixels
        width = 2  # minimum 2 pixel margin
        h = 0
        for Q in S_legend:
            h2, w2, z = Q.shape
            w2 += room
            if h1 - h < 0:
                break
            height = min(h1 - h, h2)
            r = M[max(h1 - h - h2, 0):h1 - h, max(w1 - w2, 0):w1, 0]
            g = M[max(h1 - h - h2, 0):h1 - h, max(w1 - w2, 0):w1, 1]
            b = M[max(h1 - h - h2, 0):h1 - h, max(w1 - w2, 0):w1, 2]
            # BUG FIX: np.logical_and(a, b, c) passes c as the *out*
            # parameter, silently ignoring the blue-channel test. Chain
            # elementwise & instead so a pixel is background only when
            # all three channels are 255 (white).
            bg = (r == 255) & (g == 255) & (b == 255)
            xb = 0
            X = np.sum(bg, axis=0)
            # scan right-to-left for the first column containing content
            for i in range(r.shape[1] - 1, -1, -1):
                if X[i] < height:
                    xb = i
                    break
            width = max(width, xb)
            h += h2
        # Expand to make sure legend will never overlap the image content
        M2 = np.ones([h1, w1 + width, x1]) * 255
        M2[:, 0:w1] = M
        h1, w1, x1 = M2.shape
        h = 0
        for X in S_legend:
            h2, w2, Z = X.shape
            if x1 > Z:
                X = np.dstack([X, np.ones([h2, w2])])
            elif x1 < Z:
                X = X[:, :, :x1]
            if h1 - h <= 0:
                break
            if h1 - h - h2 < 0:
                # happens when there are too many gene list, legend is too long
                #if h1-h<=0: break
                M2[0:h1 - h, w1 - w2:] = X[h2 - (h1 - h):h2, :]
                break
            else:
                M2[h1 - h - h2:h1 - h, w1 - w2:] = X
                h += h2
        scipy.misc.imsave(s_png, M2)
        if l_clean_up:
            for x in S_png2:
                os.remove(x)
def plot(self, karyotype="", symbol=None, links=None, hits=None, outputdir=None, outputfile="CircosPlot"):
    """Render a Circos plot from pre-formatted karyotype/symbol/link/hit text.

    karyotype/symbol/links/hits are strings (or lists of strings for
    links/hits) already in Circos data format. Temporary data files and a
    conf file are written next to outputdir, Circos is invoked via
    util.unix, and the temp files are removed. Returns the path of the
    produced PNG, or None when Circos failed to create it.
    """
    #sw=util.StopWatch()
    outputdir = outputdir if outputdir is not None else '/tmp'
    # remove stale outputs so a failed run is detectable
    for ext in [".png", ".svg"]:
        s_file = os.path.join(outputdir, outputfile + ext)
        if os.path.exists(s_file):
            os.remove(s_file)
    if links is None:
        util.warn_msg('No link to plot, simply ignore')
        #return
    tmp = tempfile.NamedTemporaryFile(dir=outputdir, delete=False, prefix="CIRCOS_", suffix=".txt")
    conf_file = tmp.name
    S_tmp_file = [conf_file]
    s_conf = util.read_string(self.TEMPLATE)
    # karyotype data file, substituted into the conf template
    kary_file = re.sub('CIRCOS_', 'CIRCOS_KARYOTYPE_', conf_file)
    util.save_string(kary_file, karyotype)
    S_tmp_file.append(kary_file)
    s_conf = re.sub(r'@KARYOTYPE@', kary_file, s_conf)
    r0 = 0.90  # radius of the outermost hit track; each track moves inward
    s_plot = ""
    if hits is None:
        hits = []
    elif type(hits) is not list:
        hits = [hits]
    for i, s_hit in enumerate(hits):
        hit_file = re.sub('CIRCOS_', 'CIRCOS_HIT_%d_' % i, conf_file)
        util.save_list(hit_file, s_hit)
        S_tmp_file.append(hit_file)
        s_plot += "<plot>\n"
        s_plot += "file = " + hit_file + "\n"
        s_plot += "r0 = " + ('%.3f' % r0) + "r\n"
        s_plot += "r1 = " + ('%.3f' % r0) + "r+70p\n"
        s_plot += "stroke_thickness = 0\n"
        s_plot += "min = 0\n"
        s_plot += "max = 2\n"
        s_plot += "color = oranges-3-seq\n"
        s_plot += "</plot>\n\n"
        r0 -= 0.05
    s_conf = re.sub(r'@PLOTS@', s_plot, s_conf)
    #t_chr=pd.read_csv(os.path.join(Circos.HOME, "karyotype_"+pid+".tmp"), sep=r'\s+', header=None)
    #s_conf=re.sub(r'@CHROMOSOMES@', ";".join(t_chr[2]), s_conf)
    #avoid using Pandas, so that this script can be used in CGI on ldweb server, where numpy is not installed correctly
    S = karyotype.split("\n")
    S_chr = []
    for s in S:
        if s.strip() == '':
            break
        # third field of each karyotype line is the chromosome name
        S_chr.append(re.split(Circos.DELIMITER, s)[2])
    s_conf = re.sub(r'@CHROMOSOMES@', ";".join(S_chr), s_conf)
    s_symbol = ""
    if symbol is not None:
        symbol_file = re.sub('CIRCOS_', 'CIRCOS_SYMBOL_', conf_file)
        util.save_string(symbol_file, symbol)
        S_tmp_file.append(symbol_file)
        # text track just outside the ideogram for gene symbols
        s_symbol += "<plot>\n"
        s_symbol += "type = text\n"
        s_symbol += "color = black\n"
        s_symbol += "file = " + symbol_file + "\n"
        s_symbol += "r0=1.02r\n"
        s_symbol += "r1=1.2r\n"
        s_symbol += "label_size = 12p\n"
        s_symbol += "label_font = condensed\n"
        s_symbol += "padding = 0p\n"
        s_symbol += "rpadding = 0p\n"
        s_symbol += "</plot>\n"
    s_conf = re.sub(r'@SYMBOL@', s_symbol, s_conf)
    S_color = ['107,174,214', '116,196,118', '106,81,163']
    s_link = ""
    MAX_EDGES = 10000  # Circos does not seem to work well with too many edges, it will not draw edges after maybe 20000-ish
    if links is not None:
        if type(links) is str:
            links = [links]
        for i in range(len(links) - 1, -1, -1):
            link_file = re.sub('CIRCOS_', 'CIRCOS_LINK%02d_' % (i + 1), conf_file)
            S_tmp_file.append(link_file)
            S_edges = links[i].strip().split("\n")
            # edges are pairs of lines, so the edge count is half the lines
            n_edge = len(S_edges) / 2
            if n_edge > MAX_EDGES:
                # randomly sample a subset; keep line pairs together
                IDX = np.repeat(np.random.permutation(list(range(0, len(S_edges), 2)))[:MAX_EDGES], 2)
                IDX[list(range(1, len(IDX), 2))] += 1
                S_in = [x for x in IDX if x >= len(S_edges)]
                S_edges = pd.Series(S_edges)[IDX].astype(str)
                links[i] = "\n".join(S_edges)
            util.save_string(link_file, links[i])
            s_link += "<link link" + str(i + 1) + ">\n"
            s_link += "show = yes\n"
            s_link += "color = " + S_color[(i + len(S_color) - 1) % len(S_color)] + "\n"
            s_link += "file = " + link_file + "\n"
            s_link += "</link>\n\n"
    s_conf = re.sub(r'@LINKS@', s_link, s_conf)
    #print s_conf
    util.save_string(conf_file, s_conf)
    #print s_conf
    ## run Circos
    #s_cmd = "cd "+os.path.join(os.path.dirname(__file__), "circos")+"; "
    s_cmd = self.BIN + " -conf " + conf_file
    s_cmd += ' -outputdir ' + outputdir
    s_cmd += ' -outputfile ' + outputfile
    #sw.check('prepare conf file')
    print(s_cmd)
    util.unix(s_cmd, l_print=False, l_error=False)
    l_remove_temp = True
    if l_remove_temp:
        for f in S_tmp_file:
            os.remove(f)
    s_file = os.path.join(outputdir, outputfile + ".png")
    #sw.check('make circos image')
    if os.path.exists(s_file):
        return s_file
    else:
        return None
def sql_in(s_sql_left, s_sql_right, S_id, num=1000, con=None, params_before=None, params_after=None):
    """Run a SQL query with a large IN (...) list of parameters.

    s_sql_left must end with '<col> [NOT] IN (' and s_sql_right is the tail
    after the closing parenthesis. S_id supplies the IN values; num is the
    chunk size for splitting them into OR'd IN clauses. For Oracle with the
    custom ToTable() function available, a single table-function query is
    used instead. Returns the query result via from_sql.
    """
    if con is None:
        util.error_msg('Database connection is not provided!')
    S_id = list(S_id)
    n = len(S_id)
    # in case it's multi-line SQL statement
    s_sql_left = re.sub('[\r\n]', ' ', s_sql_left)
    s_sql_right = re.sub('[\r\n]', ' ', s_sql_right)
    # strip the caller's closing ')' — we rebuild the IN clause ourselves
    s_sql_right = re.sub(r'^\s*\)', '', s_sql_right)
    pat = re.compile(r'\s+(?P<col>[\w.]+)(?P<not>\s+NOT)?\s+IN\s*\(\s*$', re.I)
    m = re.search(pat, s_sql_left)
    if m is None:
        util.error_msg('Left SQL does not ends with IN statement: %s' % s_sql_left)
    s_sql_left = s_sql_left[:m.start()] + " "
    s_col = m.group('col')
    l_NOT = m.group('not') is not None
    # somethings SQL contains its own parameters, we need to provide parameter before/after the
    # S_id parameters
    params_before = params_before or []
    params_after = params_after or []
    # If oracle, either use ToTable() or run multiple SQL queries
    # If multiple SQL are run, results do not support GROUP BY, ORDER BY, DISTINCT
    # as they are applied to individual SQL runs
    # For other SQL servers, results will be exact, via multiple OR statements
    if db_type(con) == 'ORACLE':
        # Custom function used where TOTABLE is defined under MY_SCHEMA
        MY_SCHEMA = setting.db['MY_SCHEMA']
        ### The definition of user function
        #CREATE OR REPLACE FUNCTION MY_SCHEMA.ToTable (
        #    P_STR IN CLOB,
        #    P_DELIM IN VARCHAR2 DEFAULT ',' ,
        #    P_LIKE IN INT DEFAULT 0
        #)
        #    RETURN t_NTtype
        #AS
        #    L_DATA t_NTtype := t_NTtype ();
        #    L_STR CLOB := P_STR || P_DELIM;
        #    L_SUBSTR VARCHAR2(4000);
        #    L_STEP PLS_INTEGER := 0;
        #    L_THIS INT := 1;
        #    L_PREV INT := 0;
        #    L_END CHAR := CASE P_LIKE WHEN 0 THEN NULL ELSE '%' END;
        #BEGIN
        #    LOOP
        #        L_STEP := L_STEP + 1;
        #        L_THIS := DBMS_LOB.INSTR(L_STR, P_DELIM, L_PREV + 1, 1);
        #        EXIT WHEN L_THIS = 0;
        #        L_SUBSTR :=
        #            TRIM(
        #                DBMS_LOB.SUBSTR(
        #                    L_STR,
        #                    L_THIS - L_PREV - 1,
        #                    L_PREV + 1
        #                )
        #            );
        #        L_PREV := L_THIS;
        #        L_DATA.EXTEND();
        #        L_DATA(L_STEP) := L_SUBSTR || L_END;
        #    END LOOP;
        #    RETURN L_DATA;
        #END;
        ###
        t = from_sql(con, "select * from SYS.User_Objects where object_name='TOTABLE'")
        if len(t):
            S_id = [str(id) for id in S_id]
            S_id_filtered = [id for id in S_id if len(id) > 20]  # 20 is entry length limit within ToTable()
            # pass all ids as one comma-joined CLOB parameter
            S_id = [','.join([id for id in S_id if len(id) <= 20])]
            if len(S_id_filtered) > 0:
                util.warn_msg('The following parameters were filtered out from the sql due to the length constraint: ' + str(S_id_filtered))
            if l_NOT:
                s_id = '(NOT exists (select 1 from Table(' + MY_SCHEMA + '.ToTable(?)) tmp_table where tmp_table.COLUMN_VALUE = ' + s_col + '))'
            else:
                s_id = '(exists (select 1 from Table(' + MY_SCHEMA + '.ToTable(?)) tmp_table where tmp_table.COLUMN_VALUE = ' + s_col + '))'
            return from_sql(con, s_sql_left + s_id + s_sql_right, params=params_before + S_id + params_after)
        #else:
        #    t=[]
        #    for i in xrange(0, n, num):
        #        j=min(n,i+num)
        #        s_id=",".join(["?"]*(j-i))
        #        t2=from_sql(con, s_sql_left+" "+s_col+" IN ("+s_id+") "+s_sql_right, params=params_before+S_id[i:j]+params_after)
        #        t.append(t2)
        #    return pd.concat(t, axis=0, ignore_index=True)
    # generic path: split ids into chunks of `num` placeholders, OR'd together
    S = []
    for i in range(0, n, num):
        j = min(n, i + num)
        S.append(",".join(["?"] * (j - i)))
    if l_NOT:
        s_id = "(" + s_col + " NOT IN (" + (") AND " + s_col + " NOT IN (").join(S) + "))"
    else:
        s_id = "(" + s_col + " IN (" + (") OR " + s_col + " IN (").join(S) + "))"
    if db_type(con) == 'MYSQL' and not S:
        # empty id list: keep the SQL valid with an IN ('') that matches nothing
        if l_NOT:
            s_id = "(" + s_col + " NOT IN (''))"
        else:
            s_id = "(" + s_col + " IN (''))"
    t = from_sql(con, s_sql_left + s_id + s_sql_right, params=params_before + S_id + params_after)
    return t
def load(tax_id=9606, l_use_GPDB=True, S_DB=None, entrez=None):
    """tax_id is None, defaults to 9606, if 0, means load all supported species,
    entrez is only used in local mode to accelerate Symbol retrieval

    Populates Cache.ppi_data[s_key][tax_id][ds] (adjacency dict of
    gene -> {gene: score}) and Cache.ppi_node (node/symbol tables), loading
    either from local CSV files (l_use_GPDB False) or the METASCAPE database.
    Already-cached (tax_id, ds) combinations are skipped.
    """
    S_DB = S_DB or Cache.get_DB(l_use_GPDB)
    if tax_id is None:
        util.error_msg('tax_id must be an int, or 0 means all supported species')
    tax_id = abs(tax_id)
    s_key = Cache.key(l_use_GPDB)
    eg = entrez
    S_tax_id = []
    if not l_use_GPDB:
        # ---- local CSV mode (human only) ----
        if tax_id not in (0, 9606):
            util.error_msg('Local database only supports human!')
        tax_id = 9606
        if tax_id in Cache.ppi_data[s_key]:
            # skip data sources already cached for this species
            S_DB = [x for x in S_DB if x not in Cache.ppi_data[s_key][tax_id]]
            if len(S_DB) == 0:
                return
        S_tax_id = [tax_id]
        T = []
        for filename in S_DB:
            print("loading PPI database: " + filename + " ...")
            if os.path.isfile(filename):
                t = pd.read_csv(filename)
                t['ds'] = filename
                T.append(t)
            elif os.path.isfile(Cache.DATA_DIR + filename + ".csv"):
                t = pd.read_csv(Cache.DATA_DIR + filename + ".csv")
                t['ds'] = filename
                T.append(t)
            else:
                util.warn_msg('PPI database ' + filename + ' not found.')
        if len(T) > 1:
            t = pd.concat(T, axis=0, ignore_index=True)
        else:
            t = T[0]
        # drop self-interactions and low-confidence edges
        t = t[(t.Gene_A != t.Gene_B) & (t.Score >= 0.5)].copy()
        if eg is None:
            eg = ez.EntrezGene(tax_id=tax_id)
        else:
            eg.load_organism(tax_id=tax_id)
        c_seen = {}  # memoize fix_gene_id lookups
        t.index = list(range(len(t)))
        t['Gene_A'] = t.Gene_A.astype(str)
        t['Gene_B'] = t.Gene_B.astype(str)
        S_gene_A = t.Gene_A.tolist()
        S_gene_B = t.Gene_B.tolist()
        for i in range(len(t)):
            gene_A = S_gene_A[i]
            gene_B = S_gene_B[i]
            if gene_A not in c_seen:
                c_seen[gene_A] = eg.fix_gene_id(gene_A)
            S_gene_A[i] = c_seen[gene_A]
            if S_gene_A[i] is None:
                continue
            if gene_B not in c_seen:
                c_seen[gene_B] = eg.fix_gene_id(gene_B)
            S_gene_B[i] = c_seen[gene_B]
        t['Gene_A'] = S_gene_A
        t['Gene_B'] = S_gene_B
        # drop rows where either gene could not be mapped
        t = t[~(t.Gene_A.isnull() | t.Gene_B.isnull())].copy()
        t.index = list(range(len(t)))
        t['tax_id'] = tax_id
    else:
        # ---- GPDB (METASCAPE database) mode ----
        mydb = db.DB('METASCAPE')
        if tax_id > 0 and tax_id in Cache.ppi_data[s_key]:
            S_DB = [x for x in S_DB if x not in Cache.ppi_data[s_key][tax_id]]
            if len(S_DB) == 0:
                return
        if tax_id > 0:
            print("loading PPI database from database for tax_id: %d ..." % tax_id)
            t = mydb.sql_in("SELECT gid_A Gene_A,gid_B Gene_B,0 Score,tax_id_A tax_id,ds from interaction where interaction_category!='genetic' and gid_A!=gid_B and tax_id_A=tax_id_B and tax_id_A=? and ds in (", ")", S_DB, params_before=[tax_id])
            S_tax_id = [tax_id]
        else:
            #ZZZ modify in the future, to obtain the list of all supported tax_id
            t = mydb.from_sql('SELECT DISTINCT tax_id FROM gid2source_id')
            S_tax_id = [x for x in t.tax_id.astype(int).tolist() if x not in Cache.ppi_data[s_key]]
            if len(S_tax_id):
                s_tax_id = ",".join(util.iarray2sarray(S_tax_id))
                print("loading PPI database for tax_id: %s ..." % s_tax_id)
                t = mydb.sql_in("SELECT gid_A Gene_A,gid_B Gene_B,0 Score,tax_id_A tax_id,ds from interaction where interaction_category!='genetic' and gid_A!=gid_B and tax_id_A=tax_id_B and ds in (", ")", S_DB)
                #t=mydb.sql_in("SELECT gid_A Gene_A,gid_B Gene_B,0 Score,tax_id_A tax_id,ds from interaction where interaction_category!='genetic' and gid_A!=gid_B and tax_id_A=tax_id_B and tax_id_A in ("+s_tax_id+") and ds in (", ")", S_DB)
            else:
                t = pd.DataFrame()
        if len(t):
            t['Gene_A'] = t.Gene_A.astype(str)
            t['Gene_B'] = t.Gene_B.astype(str)
    # pre-create empty cache slots for each species/data source
    for x in S_tax_id:
        #print ">>>>>>>>>>>>>>>>>>>>>>>", x
        if x not in Cache.ppi_data[s_key]:
            Cache.ppi_data[s_key][x] = {}
            Cache.ppi_node[s_key][x] = {}
            for y in S_DB:
                Cache.ppi_data[s_key][x][y] = {}
                Cache.ppi_node[s_key][x][y] = pd.DataFrame()
    if len(t) == 0:
        return
    for k, t_v in t.groupby(['tax_id', 'ds']):
        #print ">>>", k, len(t_v)
        #t_v=t_v.copy()
        if k[0] not in S_tax_id:
            continue
        data = {}
        t_node = None
        #t_v=t_v.copy()
        #t_v.index=list(range(len(t_v)))
        #for i in t_v.index:
        #if i%1000==0: print i
        # build a symmetric adjacency dict, keeping the max score per pair
        for row in t_v.itertuples():
            gene_A = row.Gene_A  #t_v.ix[i,'Gene_A']
            gene_B = row.Gene_B  #t_v.ix[i,'Gene_B']
            score = row.Score  #t_v.ix[i,'Score']
            if gene_A not in data:
                data[gene_A] = {gene_B: score}
            else:
                data[gene_A][gene_B] = max(score, data[gene_A].get(gene_B, 0))
            if gene_B not in data:
                data[gene_B] = {gene_A: score}
            else:
                data[gene_B][gene_A] = max(score, data[gene_B].get(gene_A, 0))
        Cache.ppi_data[s_key][k[0]][k[1]] = data
        S_gene = list(data.keys())
        if l_use_GPDB:
            t_node = mydb.sql_in("SELECT gid Gene,source_id Symbol from gid2source_id t where gid in (", ") and t.id_type_id=1", util.rarray2iarray(S_gene))
            t_node['Gene'] = t_node.Gene.astype(str)
            if len(S_gene) != len(t_node):
                util.warn_msg("Strange, gene ID has no symbol?")
                # left-join so unmapped genes keep a row; fall back to the
                # gene id itself as the symbol
                t = pd.DataFrame({'Gene': S_gene})
                t_node = t.merge(t_node, left_on='Gene', right_on='Gene', how='left')
                X = t_node.Symbol.isnull()
                print(t_node.ix[X][:10])
                if X.any():
                    t_node.ix[X, 'Symbol'] = t_node.ix[X, 'Gene']
        else:
            t_node = eg.gene_sarray_to_table(S_gene, l_description=False)
        Cache.ppi_node[s_key][k[0]][k[1]] = t_node
def plot_membership(self, t_membership, t_go=None, outputdir=None, outputfile="CircosPlot", s_symbol="Symbol", S_label=None, S_color=None):
    """t_mem is a table, where index is Gene ID, contains an optional column for Symbol,
    a list of columns starts with _MEMBER_, which is 1/0 indicating membership in certain
    hit list. This method is for membership table generated within biolist.py.
    rename is a dict that maps existing list name (without _MEMBER_ prefix) into new name.
    For Circos, should not contain space.
    SIDE EFFECT: if t_go is specified, t_membership will be modified, 0.5 added to where
    GO-indirect membership is found!!!
    """
    #sw=util.StopWatch()
    MAX_NOF_LISTS = 100
    D = Circos.DELIMITER
    t_mem = t_membership.copy()
    S_col = [x for x in t_mem.header() if x.startswith('_MEMBER_')]
    if len(S_col) > MAX_NOF_LISTS:
        S_col = S_col[:MAX_NOF_LISTS]
        util.warn_msg('Too many lists, only show the first %d in Circos plot!' % MAX_NOF_LISTS)
    if len(S_col) <= 1:
        util.warn_msg('Need at least two lists to plot Circos plot, only %d found!' % len(S_col))
        return self.plot(karyotype="", symbol="", links=None, hits="", outputdir=outputdir, outputfile=outputfile)
    if S_label is None:
        # default labels: list names with '.'/';' replaced by spaces
        S_label = [re.sub(r'[.;]', ' ', x[8:]) for x in S_col]
    # each membership list becomes one Circos "chromosome", named 1..n
    S_chr = [str(i + 1) for i in range(len(S_col))]
    c_rename = {x: S_chr[i] for i, x in enumerate(S_col)}
    t_mem.rename2(c_rename)
    n = len(S_chr)
    l_symbol = s_symbol is not None and (s_symbol in t_mem.header())
    if not l_symbol:
        t_mem['Symbol'] = t_mem.index.astype(str)
    # number of lists each gene belongs to
    t_mem['_SUM_'] = t_mem.ix[:, S_chr].sum(axis=1)
    if S_color is None:
        S_color = Circos.get_qualitative_colors(n)
    else:
        # convert color from hex to RGB
        S_color = [Circos.hex_to_rgb(x) for x in S_color]
    c_coord = {}  # gene -> {chromosome: 1-based position on that chromosome}
    s_karyotype = ""
    s_symbol = "" if l_symbol else None
    s_hits = ""
    #sw.check('Prepare Membership')
    for i, x in enumerate(S_chr):
        s_karyotype += "chr" + D + "-" + D + S_chr[i] + D + S_label[i] + D + "0" + D + str(int(t_mem.ix[:, x].sum())) + D + S_color[i % len(S_color)] + "\n"
        tmp = t_mem[t_mem.ix[:, x] > 0].copy()
        #tmp["_PATTERN_"]=tmp["_PATTERN_"].apply(lambda x: x[i:]+x[:i])
        tmp.sort_values(['_SUM_', '_PATTERN_', 'Symbol'], ascending=[False, False, True], inplace=True)
        j = 0
        for idx in tmp.index:
            j += 1
            if l_symbol:
                s_symbol += S_chr[i] + D + str(j - 1) + D + str(j) + D + t_mem.ix[idx, 'Symbol'] + "\n"
            if idx not in c_coord:
                c_coord[idx] = {}
            c_coord[idx][S_chr[i]] = j
            s_hits += S_chr[i] + D + str(j - 1) + D + str(j) + D + str(int(t_mem.ix[idx, '_SUM_'])) + "\n"
    #sw.check('prepare Hit')
    tmp = t_mem[t_mem._SUM_ > 1].copy()
    tmp.sort_values('_SUM_', inplace=True)
    # direct links: connect every pair of chromosomes a shared gene sits on
    link_cnt = 0
    out = []
    for s_gene, v in c_coord.items():
        S_on_chr = list(v.keys())
        for i in range(len(S_on_chr)):
            chr1 = S_on_chr[i]
            i_from = c_coord[s_gene][chr1]
            for j in range(i + 1, len(S_on_chr)):
                chr2 = S_on_chr[j]
                i_to = c_coord[s_gene][chr2]
                out.append("e" + str(link_cnt) + D + chr1 + D + str(i_from - 1) + D + str(i_from))
                out.append("e" + str(link_cnt) + D + chr2 + D + str(i_to - 1) + D + str(i_to))
                #s_link+=str(link_cnt)+D+chr1+D+str(i_from-1)+D+str(i_from)+D+str(link_cnt)+D+chr2+D+str(i_to-1)+D+str(i_to)+"\n"
                link_cnt += 1
    s_link = "\n".join(out)
    S_link = [s_link]
    #sw.check('prepare link')

    def go_links(t_go, l_sample=False):
        # Build indirect links between genes sharing a GO term; returns the
        # link text, or None when the edge budget is blown and l_sample is
        # False (caller retries with sampling enabled).
        link_cnt = 0
        c_plot = {}
        out = []
        MAX_EDGES = 10000
        MAX_GENES = 20
        s_link = None
        for _, r in t_go.iterrows():
            s = r['GeneID']
            #print '>>>>', r['GO']
            #if r['GO']!='GO:0007156': continue
            S = s.split("|")
            if link_cnt > MAX_EDGES:
                # we start to get aggressive
                if not l_sample:
                    return None  # the caller should call us again with l_sample=True
                S = np.random.choice(S, MAX_GENES)
            for i in range(len(S)):
                g1 = S[i]
                for j in range(i + 1, len(S)):
                    g2 = S[j]
                    #if r['GO']=='GO:0007156':
                    #    print g1+":"+g2, g1+":"+g2 in c_plot, g2+":"+g1 in c_plot
                    if g1 + ":" + g2 in c_plot or g2 + ":" + g1 in c_plot:
                        continue
                    c_plot[g1 + ":" + g2] = True
                    for chr1, i_from in c_coord[g1].items():
                        #if r['GO']=='GO:0007156': print ">>> "+str(chr1)+" "+str(i_from)
                        for chr2, i_to in c_coord[g2].items():
                            #if r['GO']=='GO:0007156': print " <<< "+str(chr2)+" "+str(i_to)
                            if chr1 == chr2:
                                continue  # same chromosome
                            out.append("e" + str(link_cnt) + D + chr1 + D + str(i_from - 1) + D + str(i_from))
                            out.append("e" + str(link_cnt) + D + chr2 + D + str(i_to - 1) + D + str(i_to))
                            #str(link_cnt)+D+chr1+D+str(i_from-1)+D+str(i_from)+D+str(link_cnt)+D+chr2+D+str(i_to-1)+D+str(i_to))
                            link_cnt += 1
                            # use 0.4 instead 0.5 to avoid degeneracy in Euclidean distance calculation
                            # SIDE EFFECT: marks GO-indirect membership on the caller's table
                            if t_membership.ix[g1, S_col[int(chr2) - 1]] == 0:
                                t_membership.ix[g1, S_col[int(chr2) - 1]] = 0.333
                            if t_membership.ix[g2, S_col[int(chr1) - 1]] == 0:
                                t_membership.ix[g2, S_col[int(chr1) - 1]] = 0.333
        s_link = "\n".join(out)
        return s_link

    if t_go is not None:
        for x in S_col:
            t_membership[x] = t_membership[x].astype(float)
        s_link = go_links(t_go, False)
        if s_link is None:
            # too many edges — retry with per-term gene sampling
            s_link = go_links(t_go, True)
        S_link.append(s_link)
    #sw.check('GO links')
    return self.plot(karyotype=s_karyotype, symbol=s_symbol, links=S_link, hits=s_hits, outputdir=outputdir, outputfile=outputfile)
def map(self, X, n_CPU=0, l_quit=None):
    """X: list[input tuple], list of input parameters. If workers were started
    with f=None, each element in X in passed to the wrapper task. In that case,
    we expect X to be a tuple (or list) and the first element of X must be either
    the method pointer or its registered name. However, many methods, such as
    instance method or func within a func, cannot be pickled, therefore the method
    cannot be send over the pipe. We should pre-register such methods and call
    them by name. Need example later.
    l_quit: boolean, default None, if specified, controls whether workers quit
    or not after tasks are processed. If not, workers wait for future tasks.
    return list"""
    # very similar to the idea in https://stackoverflow.com/questions/3288595/multiprocessing-how-to-use-pool-map-on-a-function-defined-in-a-class, author klaus se
    l_quit = l_quit if l_quit is not None else self.QUIT
    #if self.is_busy():
    #    util.error_msg('Works are still busy!')
    if n_CPU == 0:
        n_CPU = self.n_use  # defaults to n_use
    n_CPU = min(n_CPU, self.n_CPU)
    # one could start 8 CPUs, but only use 4 for mapping, if the work takes lots of memory
    res = []
    n_input = len(X)
    if n_input == 0 and not l_quit:
        return res
    #print '=============', self.c_proc
    if not self.has_started() and n_input > 0:
        # no worker pool — degrade to serial execution in this process
        util.warn_msg('Please start processes first, no worker is running!')
        util.warn_msg('However, we will process the task with ONE cpu!!!')
        return [self.wrapper(x) for x in X]
    if n_input > 0 and n_CPU == 0:
        return [self.wrapper(x) for x in X]
    s_pid = str(multiprocessing.current_process().pid)
    # which workers currently hold one of *my* tasks (local to this map call)
    has_my_job = [False for i in range(self.n_CPU)]

    def engine():
        # debug dump of the scheduler state
        print("=================================================================")
        print("PID: ", str(multiprocessing.current_process().pid))
        print("WORK STATUS: ", self.work_status)
        print("HAS MY JOB: ", has_my_job)
        print("JOB IS DONE: ", self.job_is_done)
        print("N_RUNNING: (%d, %d) " % (self.n_running[0], self.n_running[1]))
        print("=================================================================")

    def is_busy():
        return sum(has_my_job) > 0

    def process_out(out):
        # handle one (index, payload) result; index None means a worker exited
        i, x = out
        if i is None:
            self.n_running[0] -= 1
            # I modify the original code, so that we can join the process and release it as soon as possible
            if type(x) is str:
                print("Exception> " + x)
                exit()
            else:
                if self.DEBUG:
                    print("Progress: %d processes remaining. Stopping %d" % (self.n_running[0], x))
                #print self.c_proc.keys()
                self.c_proc[x].join()
                del self.c_proc[x]
                if self.DEBUG:
                    print("Progress: process %d stopped." % x)
        else:
            res.append(out)
            if self.DEBUG:
                print("Progress: %d of %d item calculated." % (len(res), n_input))

    def fetch(l_lock=False):
        # drain finished results; with l_lock, wait on the condition variable
        # instead of returning when nothing is ready
        while is_busy():
            l_fetch_something = False
            for i_worker in range(self.n_CPU):
                if has_my_job[i_worker] and self.job_is_done[i_worker]:
                    try:
                        (i, x) = self.q_out[i_worker].get()
                        process_out((i, x))
                        self.n_running[1] -= 1
                        self.work_status[i_worker] = False
                        has_my_job[i_worker] = False
                        self.job_is_done[i_worker] = False
                        l_fetch_something = True
                        if self.DEBUG:
                            print(">>>A1")
                            engine()
                        with self.mail:
                            self.mail.notify_all()
                    except Exception as e:
                        print("ERROR> Fail to fetch results from worker: %d" % i_worker)
                        print(traceback.format_exc())
                        return
            if not l_fetch_something:
                if l_lock:
                    with self.mail:
                        self.mail.wait(timeout=8.5 + random.random() * 3)
                else:
                    return

    if self.DEBUG:
        print("DEBUG> self.n_running: ", self.n_running)
    if self.DEBUG:
        print("DEBUG> self.c_proc.keys(): ", list(self.c_proc.keys()))
    ###ZHOU FEB16,2016
    #self.n_running[1]+=1
    ###
    i = 0
    while (i < n_input):
        x = X[i]
        if self.DEBUG:
            print("fetch job entry %d " % i)
        #print self.work_status
        self.lock.acquire()
        j = util.index(False, self.work_status)  # find an idle worker
        l_put_something = False
        if j >= 0 and sum(has_my_job) < n_CPU:
            #j>=0 and j<n_CPU: # we only use up to n_CPU, even if there are more workers
            #print "assing job to %d" % j
            self.work_status[j] = True  # flag it as busy
            has_my_job[j] = True
            if self.DEBUG:
                print("DEBUG> self.c_proc.keys(): ", list(self.c_proc.keys()))
            ###ZHOU FEB16,2016
            self.n_running[1] += 1
            ###
            self.lock.release()
            self.q_in[j].put((i, j, x))  # assign task
            i += 1
            if self.DEBUG:
                print("Progress: send input %d of %d items." % (i, len(X)))
            l_put_something = True
            if self.DEBUG:
                print(">>>A2")
                engine()
        else:
            self.lock.release()
        # we constantly removing items from the output queue, so that the process can release some memory
        fetch()
        if not l_put_something:
            # no idle worker — sleep until a worker signals completion
            with self.mail:
                self.mail.wait(timeout=8.5 + random.random() * 3)
            fetch()
    # all tasks submitted; wait for remaining results
    while (is_busy()):
        fetch(True)
    if self.DEBUG:
        print(">>>A3")
        engine()
    self.lock.acquire()
    ###ZHOU FEB16,2016
    #self.n_running[1]-=1
    ###
    if self.DEBUG:
        print(">>>QUIT=" + ("True" if l_quit else "False"))
        print(">>>n_running[1]=%d" % self.n_running[1])
    if l_quit and self.n_running[1] == 0:
        # I am the last one running map() — send each worker a poison pill
        if self.DEBUG:
            print(">>>A4")
            engine()
        for i in range(self.n_CPU):
            self.q_in[i].put((None, i, None))
            self.work_status[i] = True
            self.n_running[1] += 1
            has_my_job[i] = True
        self.lock.release()
        while (is_busy()):
            fetch(True)
        if self.DEBUG:
            engine()
    else:
        self.lock.release()
    if self.DEBUG:
        for i, x in enumerate(res):
            print('>>>A4 ', i, type(x), type(x[0]), type(x[1]))
    # restore input order before stripping the indices
    res.sort(key=lambda x: x[0])
    return [x for i, x in res]