# Store: time-series persistence for sensor readings, backed by PyGreSQL.
from datetime import datetime
from os import environ

from pg import DB, ProgrammingError


class Store:
    def __init__(self):
        self.connection = DB(dbname=environ.get('POSTGRESQL_ADDON_DB'),
                             host=environ.get('POSTGRESQL_ADDON_HOST'),
                             port=int(environ.get('POSTGRESQL_ADDON_PORT')),
                             user=environ.get('POSTGRESQL_ADDON_USER'),
                             passwd=environ.get('POSTGRESQL_ADDON_PASSWORD'))
        try:
            self.connection.query("SELECT * FROM stats")
        except ProgrammingError:
            print("Create Table")
            self.connection.query("""CREATE TABLE stats
                                     (timestamp TIMESTAMP PRIMARY KEY,
                                      value NUMERIC(5,2))""")
        # self.connection.query("truncate stats")

    def add(self, value):
        # get_current_time() is defined elsewhere in the original module.
        if float(value) > 0.1:
            self.connection.insert('stats',
                                   timestamp=get_current_time(),
                                   value=value)

    def display(self):
        self.connection.query("SELECT * FROM stats")

    def get_values(self, value=None):
        # Default to today's date at call time; a default argument computed
        # from datetime.now() would be evaluated only once, at import time.
        if value is None:
            value = datetime.now().strftime('%Y-%m-%d')
        result = self.connection.query(
            "SELECT extract(DOW FROM timestamp)::INT AS dow, timestamp, value"
            " FROM stats WHERE timestamp >= '" + value + "'"
            " AND timestamp < ('" + value + "'::DATE + '1 day'::INTERVAL)"
            " ORDER BY 2")
        return result

    def get_average_values(self, value=None):
        if value is None:
            value = datetime.now().strftime('%Y-%m-%d')
        result = self.connection.query(
            "SELECT extract(DOW FROM timestamp)::INT AS dow, "
            "to_timestamp('" + value + " ' || "
            "LPAD(extract(HOUR FROM timestamp)::TEXT,2,'0') || ':' ||"
            " LPAD(extract(MINUTE FROM timestamp)::TEXT,2,'0') ||"
            " ':00', 'YYYY-MM-DD HH24:MI:SS') AS timestamp,"
            " round(avg(value)::NUMERIC,2) AS value FROM stats"
            " GROUP BY 1,2 ORDER BY 1,2")
        return result
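# A minimal sketch (not part of the original Store class): PyGreSQL's
# DB.query() also accepts positional parameters, which avoids splicing
# `value` into the SQL string the way get_values() does above. Assumes the
# same `stats` table; `db` is an open pg.DB handle and `day` is a
# 'YYYY-MM-DD' string.
def get_values_for_day(db, day):
    return db.query(
        "SELECT extract(DOW FROM timestamp)::INT AS dow, timestamp, value"
        " FROM stats WHERE timestamp >= $1::DATE"
        " AND timestamp < ($1::DATE + '1 day'::INTERVAL) ORDER BY 2",
        (day,))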
# Repository-style adapter around PyGreSQL; _normalize() and NoRecordFound
# are defined elsewhere in the original module.
from pg import DB, DatabaseError


class PostgresqlAdapter(object):
    def __init__(self, dbname, user, passwd, host="localhost", port=5432):
        self._db = DB(dbname=dbname, host=host, port=port,
                      user=user, passwd=passwd)

    def create(self, inhabitant):
        record = self._db.insert('inhabitants', data=inhabitant)
        return _normalize(record)

    def find_all(self):
        all_inhabitants = self._db.query(
            'select * from inhabitants').dictresult()
        return [_normalize(x) for x in all_inhabitants]

    def find_by_id(self, id):
        inhabitant = self._find_by_id(id)
        return _normalize(inhabitant)

    def find_by_name(self, name):
        all_inhabitants = self._db.query(
            "SELECT * FROM inhabitants "
            "WHERE data ->> 'FirstName' ILIKE '%' || $1 || '%' OR "
            " data ->> 'LastName' ILIKE '%' || $1 || '%'",
            (name, )).dictresult()
        return [_normalize(x) for x in all_inhabitants]

    def delete(self, id):
        try:
            self._db.delete('inhabitants', id=int(id))
        except ValueError:
            pass

    def update(self, inhabitant):
        id = inhabitant["Id"]
        self._find_by_id(id)
        del inhabitant["Id"]
        stored = self._db.update('inhabitants', id=id, data=inhabitant)
        return _normalize(stored)

    def _find_by_id(self, id):
        try:
            return self._db.get('inhabitants', int(id))
        except (ValueError, DatabaseError):
            raise NoRecordFound()
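# Hedged usage sketch for PostgresqlAdapter. The connection values are
# placeholders, and _normalize()/NoRecordFound come from the original
# module; the 'Id' key follows the convention update() relies on above.
if __name__ == '__main__':
    adapter = PostgresqlAdapter(dbname='registry', user='postgres', passwd='secret')
    created = adapter.create({'FirstName': 'Ada', 'LastName': 'Lovelace'})
    print(adapter.find_by_name('ada'))
    adapter.delete(created['Id'])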
# insert_testing_patient.py
import datetime

from pg import DB

"""
Feb 27 2018

Run this script to enter some test patients into the 'patients' table
of the 'test_patients' database.

DATE format: YMD (year-month-day) is the preferred format and follows
the ISO 8601 standard.
"""

db = DB("test_patients")

db.insert('patients',
          first_name="NICHOLAS ANDREW",
          last_name="MARSDEN",
          phn="9067 062 924",
          dob=datetime.date(1946, 10, 6))

db.insert('patients',
          first_name="JOHNNY ALAN GEORGE",
          last_name="LUCK",
          phn="9136 527 693",
          dob=datetime.date(1941, 5, 12))

db.insert('patients',
          first_name="ELIZABETH M",
          last_name="CARLEY",
          phn="9051 748 373",
          dob=datetime.date(1939, 3, 25))
# Fragment of an SSH-honeypot log importer: db, honeypotName, ip, port,
# sessionOld and sessionDatabaseID are initialized earlier in the original
# script; `input` is assumed to be fileinput.input.
for line in input(fileName):
    if len(line) < 20:
        continue
    if "Session" in line:
        session = line.split()[2][8:]
        if sessionOld == -1:  # was `is -1`: identity test on an int is unreliable
            sessionOld = session
        if session != sessionOld:  # was `not in`: a substring test, not equality
            sessionDatabaseID = sessionDatabaseID + 1
            print("sessionDatabaseID: " + str(sessionDatabaseID))
            sessionOld = session
    if "NOTIFY: login attempt" in line:
        sessionStartTime = DB.query(db, "select to_timestamp('" + line[0:22] + "', 'DD-Mon-YYYY (HH24:MI:SS)')").getresult()[0][0]
        DB.insert(db, 'connections', honeypot=honeypotName, session=sessionDatabaseID,
                  ip=ip, port=port,
                  authenticated='authentication method: SSH_AUTH_METHOD_PASSWORD',
                  start_time=sessionStartTime, finish_time=sessionStartTime)
        print("login attempt from ip: " + ip)
    if "NOTIFY: client session ended" in line:
        sessionEndTime = DB.query(db, "select to_timestamp('" + line[0:22] + "', 'DD-Mon-YYYY (HH24:MI:SS)')").getresult()[0][0]
        DB.insert(db, 'connections', honeypot=honeypotName, session=sessionDatabaseID,
                  authenticated='session ended',
                  start_time=sessionEndTime, finish_time=sessionEndTime)
        print("session ended")
    if "NOTIFY: client IP:" in line:
        ip = line.split()[6]
        sessionStartTime = DB.query(db, "select to_timestamp('" + line[0:22] + "', 'DD-Mon-YYYY (HH24:MI:SS)')").getresult()[0][0]
        DB.insert(db, 'connections', honeypot=honeypotName, session=sessionDatabaseID,
                  ip=ip, port=port,
                  authenticated='new client session',
                  start_time=sessionStartTime, finish_time=sessionStartTime)
        print("session from ip: " + ip)
        print("fileName " + fileName)
    if "NOTIFY: user:" in line:
        pass  # handling continues in the original script (truncated here)
import datetime
import os

import numpy as np
from pg import DB

db = DB(dbname='Pingtung', host='localhost', port=5432,
        user='******', passwd='husky')

for filename in os.listdir('Pingtung/Well'):
    site = filename[0:8]
    year = filename[9:13]
    print(filename)
    sites = db.query('select site, starttime, endtime from "Well_Stations"'
                     ' where site like \'{site}\''.format(site=site)).getresult()
    if sites != []:
        for line in open('Pingtung/Well/{FN}'.format(FN=filename), 'r'):
            # Check that the timestamp matches the file name
            data = line.split()
            if len(data[1]) == 8:
                if (data[0][0:4] != year) and (data[0][5:10] != '01-01'):
                    print('Bad file: wrong date')
                    break
            else:
                print('Bad file: wrong time')
                break
            db.insert('Well', site=site,
                      date='{date} {time}'.format(date=data[0], time=data[1]),
                      value=data[2])
    else:
        print('Station not found')
# Fragment of a Yelp academic dataset loader: the body of a readline loop
# over the reviews file (db, f, i, and the json import are set up earlier).
json_object = json.loads(line)
user_id = json_object['user_id']
review_id = json_object['review_id']
business_id = json_object['business_id']
stars = json_object['stars']
date = json_object['date']
text = json_object['text']
useful = json_object['useful']
funny = json_object['funny']
cool = json_object['cool']
db.insert('review', user_id=user_id, review_id=review_id,
          business_id=business_id, stars=stars, date=date, text=text,
          useful=useful, funny=funny, cool=cool)
i = i + 1
if (i % 10000) == 0:
    print(str(int(i / 10000) * 10000) + ' lines processed...')
line = f.readline()

f.close()
print('finished processing...')

f = open('yelp_academic_dataset_tip.json')
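# Hedged aside: for a dump this large, the per-row DB.insert() round-trips
# dominate the runtime. PyGreSQL's DB.inserttable() (used elsewhere in this
# collection) streams rows via COPY instead; a sketch, assuming the column
# order of the `review` table matches the tuple order below.
batch = []
for raw in open('yelp_academic_dataset_review.json'):
    o = json.loads(raw)
    batch.append((o['user_id'], o['review_id'], o['business_id'], o['stars'],
                  o['date'], o['text'], o['useful'], o['funny'], o['cool']))
db.inserttable('review', batch)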
# Copy the VPXV_STAT_COUNTERS view from a vCenter MSSQL database into
# PostgreSQL; DBManage is a local MSSQL helper module.
import DBManage
from pg import DB

MS = DBManage.DBManage(host="192.168.1.200", port="1433", user="******",
                       password="******", database="VIM_VCDB", charset="UTF-8")
PG = DB(dbname='VM', host='localhost', port=5432, user='******', passwd='123456')
MS.getMSSConnetion()
sql1 = ("SELECT [ID] ,[STAT_ROLLUP] ,[NAME] ,[GROUP_NAME] ,[TYPE] ,[UNIT] ,"
        "[ASSOCIATE_IDS] ,[STATS_LEVEL] ,[FIXED_COLLECTION_INTERVAL] "
        "FROM [VIM_VCDB].[dbo].[VPXV_STAT_COUNTERS]")
cursor1 = MS.QuerySql(sql1)
row = cursor1.fetchone()
while row:
    PG.insert('stat_counters', id=row[0], stat_rollup=row[1], name=row[2],
              group_name=row[3], type=row[4], unit=row[5],
              associate_ids=row[6], stats_level=row[7],
              fixed_collection_interval=row[8])
    row = cursor1.fetchone()
print("Success!")
MS.closeConn()
PG.close()
#!/usr/bin/env python
import sys
import yaml
import json
from pg import DB

bosh_manifest_path = sys.argv[1]
with open(bosh_manifest_path, 'r') as f:
    bosh_manifest = yaml.safe_load(f.read())  # safe_load: the manifest needs no arbitrary-object loading

settings_path = sys.argv[2]
with open(settings_path, 'r') as f:
    cf_ip = json.loads(f.read())['cf-ip']

postgres_properties = bosh_manifest['jobs'][0]['properties']['postgres']
dbname = postgres_properties.get('database')
host = postgres_properties.get('host')
port = postgres_properties.get('port', 5432)
user = postgres_properties.get('user')
passwd = postgres_properties.get('password')

db = DB(dbname=dbname, host=host, port=port, user=user, passwd=passwd)

domain_id = db.insert('domains', name='xip.io', type='NATIVE')['id']
db.insert('records', domain_id=domain_id, name='{0}.xip.io'.format(cf_ip),
          content='localhost [email protected] 1', type='SOA', ttl=86400, prio=None)
db.insert('records', domain_id=domain_id, name='{0}.xip.io'.format(cf_ip),
          content='dns-us1.powerdns.net', type='NS', ttl=86400, prio=None)
db.insert('records', domain_id=domain_id, name='*.{0}.xip.io'.format(cf_ip),
          content=cf_ip, type='A', ttl=120, prio=None)
db.close()
# Telegram-bot persistence layer over PyGreSQL; connection settings come from
# a postgres:// URL (e.g. Heroku's DATABASE_URL). The %-interpolated queries
# are kept as in the original; see the parameterized sketch after the class.
import datetime
import time
import urlparse

from pg import DB


class BotDB:
    def __init__(self, db_url):
        urlparse.uses_netloc.append("postgres")
        self.__db_url = db_url
        url = urlparse.urlparse(db_url)
        self.__db = DB(
            dbname=url.path[1:],
            user=url.username,
            passwd=url.password,
            host=url.hostname,
            port=url.port
        )

    def insertThesis(self, init_id, chat_id, user_id, body):
        ts = time.time()
        timestamp = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
        print("inserting thesis")
        print(init_id, chat_id, user_id, body, timestamp)
        self.__db.insert("theses", row={"init_id": init_id, "chat_id": chat_id,
                                        "user_id": user_id, "body": body})
        print("done")
        self.__db.commit()

    def getThesisByIds(self, init_id, chat_id):
        query = self.__db.query("SELECT * FROM theses WHERE init_id = %d AND chat_id = %d;" % (init_id, chat_id))
        dict_res = query.dictresult()
        if len(dict_res) == 0:
            return False
        else:
            return dict_res[0]

    def getThesisByBody(self, body):
        query = self.__db.query("SELECT * FROM theses WHERE body = '%s';" % body)
        dict_res = query.dictresult()
        if len(dict_res) == 0:
            return False
        else:
            return dict_res[0]

    def getLastThesesByTime(self, chat_id, interval):
        query = self.__db.query("SELECT * FROM theses WHERE chat_id = %s AND creation_time > current_timestamp - interval '%s';" % (chat_id, interval))
        dict_res = query.dictresult()
        if len(dict_res) == 0:
            return False
        else:
            return dict_res

    def getTodayTheses(self, chat_id):
        query = self.__db.query("SELECT * FROM theses WHERE chat_id = %s AND creation_time > current_date;" % chat_id)
        dict_res = query.dictresult()
        if len(dict_res) == 0:
            return False
        else:
            return dict_res

    def insertUser(self, user_id, username, first_name, last_name):
        row = {"user_id": user_id}
        if username:
            row["username"] = username
        if first_name:
            row["first_name"] = first_name
        if last_name:
            row["last_name"] = last_name
        self.__db.insert('users', row=row)
        self.__db.commit()

    def getUserById(self, user_id):
        query = self.__db.query("SELECT * FROM users WHERE user_id = %d;" % user_id)
        dict_res = query.dictresult()
        if len(dict_res) == 0:
            return False
        else:
            return dict_res[0]

    def insertBotMessage(self, chat_id, message_id, owner_id):
        row = {"chat_id": chat_id, "message_id": message_id, "owner_id": owner_id}
        self.__db.insert('bot_messages', row=row)
        self.__db.commit()

    def getBotMessage(self, chat_id, message_id):
        query = self.__db.query("SELECT * FROM bot_messages WHERE chat_id = %d AND message_id = %d;" % (chat_id, message_id))
        dict_res = query.dictresult()
        if len(dict_res) == 0:
            return False
        else:
            return dict_res[0]

    def close(self):
        self.__db.close()
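# Hedged sketch (not part of the original BotDB class): the %-formatted
# queries above are injectable through free-text values such as `body`;
# PyGreSQL's query() takes positional parameters instead.
def get_thesis_by_body_safe(db, body):
    rows = db.query("SELECT * FROM theses WHERE body = $1", (body,)).dictresult()
    return rows[0] if rows else False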
# PGSQL_DB: PyGreSQL-backed storage for a small Twitter clone. DB_Abstract,
# User and Tweet are defined elsewhere in the project; HTTP-style status
# codes (200/400) are returned to the caller.
from pg import DB


class PGSQL_DB(DB_Abstract):
    conn = None

    def __init__(self, dbname, dbhost, dbport, dbuser, dbpass):
        try:
            self.conn = DB(dbname=dbname, host=dbhost, port=dbport,
                           user=dbuser, passwd=dbpass)
        except Exception as ex:
            print(ex)
            self.conn = None
        if self.conn is not None:
            tables = self.conn.get_tables()
            if "public.users" not in tables:
                try:
                    self.conn.query("CREATE TABLE users(uid bigint primary key, name text, bio text)")
                except Exception as ex:
                    print(ex)
                    self.conn = None
            if "public.tweets" not in tables:
                try:
                    self.conn.query("CREATE TABLE tweets(tid bigint primary key, author_id bigint, parent_id bigint, timestamp bigint, text text)")
                except Exception as ex:
                    print(ex)
                    self.conn = None

    def insert_user(self, user):
        if type(user) is not User:
            print("type isn't User")
            return 400
        if user.user_id == "" or user.name == "":
            print("empty user")
            return 400
        if self.conn is None:
            print("no connection")
            return 400
        try:
            self.conn.insert('users', {'uid': user.user_id,
                                       'name': user.name,
                                       'bio': user.bio})
        except Exception as ex:
            print(ex)
            return 400
        return 200

    def insert_tweet(self, tweet):
        if type(tweet) is not Tweet:
            print("type isn't Tweet")
            return 400
        if (tweet.tweet_id == "" or tweet.author_id == "" or tweet.parent_id == ""
                or tweet.timestamp == "" or tweet.text == ""):
            print("empty tweet")
            return 400
        if self.conn is None:
            print("no connection")
            return 400
        try:
            self.conn.insert('tweets', {'tid': tweet.tweet_id,
                                        'author_id': tweet.author_id,
                                        'parent_id': tweet.parent_id,
                                        'timestamp': tweet.timestamp,
                                        'text': tweet.text})
        except Exception as ex:
            print(ex)
            return 400
        return 200

    def get_user_by_id(self, user_id):
        if user_id == "":
            print("empty user_id")
            return None
        if self.conn is None:
            print("no connection")
            return None
        try:
            u = self.conn.get('users', {'uid': user_id})
        except Exception as ex:
            print(ex)
            return None
        user = User()
        user.user_id = u['uid']
        user.name = u['name']
        user.bio = u['bio']
        return user

    def get_user_tweets(self, uid):
        if uid == "":
            print("empty uid")
            return None
        if self.conn is None:
            print("no connection")
            return None
        try:
            t = self.conn.get_as_list('tweets', where="author_id = {0}".format(uid))
        except Exception as ex:
            print(ex)
            return None
        tweets = []
        for tweet in t:
            new_tweet = Tweet()
            new_tweet.tweet_id = tweet[0]
            new_tweet.author_id = tweet[1]
            new_tweet.parent_id = tweet[2]
            new_tweet.timestamp = tweet[3]
            new_tweet.text = tweet[4]
            tweets.append(new_tweet)
        if len(tweets) == 0:
            return None
        return tweets

    def get_tweet_by_id(self, tweet_id):
        if self.conn is None:
            print("no connection")
            return None
        try:
            t = self.conn.get('tweets', {'tid': tweet_id})
        except Exception as ex:
            print(ex)
            return None
        tweet = Tweet()
        tweet.tweet_id = t['tid']
        tweet.author_id = t['author_id']
        tweet.parent_id = t['parent_id']
        tweet.timestamp = t['timestamp']
        tweet.text = t['text']
        return tweet

    def get_replies_by_id(self, tweet_id):
        if self.conn is None:
            print("no connection")
            return None
        try:
            t = self.conn.get_as_list('tweets', where="parent_id = " + tweet_id)
        except Exception as ex:
            print(ex)
            return None
        tweets = []
        for tweet in t:
            if tweet[0] != tweet[2]:  # skip rows that reference themselves
                new_tweet = Tweet()
                new_tweet.tweet_id = tweet[0]
                new_tweet.author_id = tweet[1]
                new_tweet.parent_id = tweet[2]
                new_tweet.timestamp = tweet[3]
                new_tweet.text = tweet[4]
                tweets.append(new_tweet)
        if len(tweets) == 0:
            return None
        return tweets
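# Hedged usage sketch for PGSQL_DB; User is assumed to be a plain attribute
# holder with a no-argument constructor, as the methods above suggest, and
# the connection values are placeholders.
db = PGSQL_DB('twitterdb', 'localhost', 5432, 'postgres', 'secret')
u = User()
u.user_id, u.name, u.bio = 1, 'alice', 'first user'
assert db.insert_user(u) == 200
print(db.get_user_by_id(1).name)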
# Copy daily stats joined with counter metadata into the vm_state table.
from pg import DB

PG = DB(dbname='VM', host='192.168.1.112', port=5432, user='******', passwd='123456')
sql = ("select A.sample_time,A.stat_name,B.stat_rollup,B.unit,A.stat_group,"
       "A.entity,A.stat_value from hist_stat_daily A ,stat_counters B "
       "where A.stat_id=B.id ")
q = PG.query(sql)
rows = q.getresult()
for row in rows:
    PG.insert("vm_state", time=row[0], stat_name=row[1], stat_rollup_type=row[2],
              unit=row[3], stat_group=row[4], entity=row[5], stat_value=row[6])
print("Success")
PG.close()
# Load pages from a Russian Wikipedia XML dump into the wiki table.
import xml.etree.ElementTree as ET

from pg import DB

if __name__ == '__main__':
    file = open("ruwiki.xml", encoding="utf-8")
    tree = ET.iterparse(file)
    db = DB(host="localhost", user="******", passwd="1234", dbname="wiki")
    db.query("TRUNCATE TABLE wiki")
    for i, line in enumerate(tree):
        event, element = line
        if element.tag == "page":
            pageid = element.find("id").text
            title = element.find("title").text
            timestamp = element.find("revision").find("timestamp").text.replace("T", " ")
            username = element.find("revision").find("contributor").find("username")
            if username is not None:
                author = username.text
            else:
                author = element.find("revision").find("contributor").find("ip").text
            text = element.find("revision").find("text").text
            db.insert("wiki", id=pageid, title=title, timestamp=timestamp,
                      author=author, text=text)
            if i % 10**4 == 0:
                # rough progress estimate, assuming ~14.8M elements in the dump
                print(round(i / 10**6 / 14.8 * 100, 2), "%", pageid, title,
                      timestamp, author)
    print(db.query("SELECT id, title, timestamp, author FROM wiki"))
print ("Table City - multiple rows at once") cities = 'Seattle Spokane Tacoma Vancouver'.split() data = list(enumerate(cities, start=1)) db.inserttable('city', data) print ("Table Feature - multiple rows at once") features = 'Library Park Museum Theatre'.split() data = list(enumerate(features, start=1)) db.inserttable('feature', data) print ("Table cityfeature - single row at once") db.insert('cityfeatures',featureid=1, cityid=1) db.insert('cityfeatures',featureid=2, cityid=1) db.insert('cityfeatures',featureid=3, cityid=1) db.insert('cityfeatures',featureid=4, cityid=1) db.insert('cityfeatures',featureid=3, cityid=2) db.insert('cityfeatures',featureid=2, cityid=3) db.insert('cityfeatures',featureid=4, cityid=4) print ("Table location - single row at once") db.insert('location', locationid =1, featureid =1, cityid=1 , commonname='Dorothy Stimson Bullitt Library', address='100 University St', website ='seattleartmuseum.org', lng=47.6057, lat=-122.3371 ) db.insert('location', locationid =2, featureid =1, cityid=1 , commonname='Region 10 Library', address='1200 6th Ave #900', website ='epa.gov', lng=47.6090, lat=-122.3320 )
db = DB(host="localhost", user="******", passwd="1234", dbname="wiki") db.query("TRUNCATE TABLE wiki") for i, line in enumerate(tree): event, element = line if element.tag == "page": pageid = element.find("id").text title = element.find("title").text timestamp = element.find("revision").find( "timestamp").text.replace("T", " ") username = element.find("revision").find("contributor").find( "username") if not username is None: author = username.text else: author = element.find("revision").find("contributor").find( "ip").text text = element.find("revision").find("text").text db.insert("wiki", id=pageid, title=title, timestamp=timestamp, author=author, text=text) if i % 10**4 == 0: print(round(i / 10**6 / 14.8 * 100, 2), "%", pageid, title, timestamp, author) print(db.query("SELECT id, title, timestamp, author FROM wiki"))
# Fetch compute-resource statistics from the vCenter MSSQL database and
# copy them into PostgreSQL.
import DBManage
from pg import DB

MS = DBManage.DBManage(host="192.168.1.200", port="1433", user="******",
                       password="******", database="VIM_VCDB", charset="UTF-8")
PG = DB(dbname='VM', host='192.168.1.112', port=5432, user='******', passwd='123456')
MS.getMSSConnetion()
#sql1="SELECT [SAMPLE_TIME] ,[STAT_ID] ,[STAT_NAME] ,[STAT_GROUP] ,[STAT_ROLLUP_TYPE] ,[COUNTER_ID] ,[ENTITY] ,[DEVICE_NAME] ,[DEVICE_TYPE_NAME] ,[STAT_VALUE] FROM [VIM_VCDB].[dbo].[VPXV_HIST_STAT_DAILY]"
sql = ("select A.[SAMPLE_TIME],A.[STAT_NAME],B.[STAT_ROLLUP],B.[UNIT],"
       "A.[STAT_GROUP],A.[ENTITY],A.[STAT_VALUE] "
       "from [VIM_VCDB].[dbo].[VPXV_HIST_STAT_DAILY] A ,"
       "[VIM_VCDB].[dbo].[VPXV_STAT_COUNTERS] B where A.[STAT_ID]=B.[ID] ")
cursor1 = MS.QuerySql(sql)
row = cursor1.fetchone()
while row:
    PG.insert("vm_state", time=row[0], stat_name=row[1], stat_rollup_type=row[2],
              unit=row[3], stat_group=row[4], entity=row[5], stat_value=row[6])
    #PG.insert('hist_stat_daily',sample_time=row[0],stat_id=row[1],stat_name=row[2],stat_group=row[3],stat_rollup_type=row[4],counter_id=row[5],entity=row[6],device_name=row[7],device_type_name=row[8],stat_value=row[9])
    row = cursor1.fetchone()
print("Success!")
MS.closeConn()
PG.close()
# data_utils: PostgreSQL access layer for a biomedical relation-extraction
# demo. This is Python 2 code (print statements, iteritems, xrange); the
# imports below are assumed from the calls the methods make.
import ast
import json
import marshal
import random
import time
from collections import defaultdict

from pg import DB


class data_utils(object):
    """docstring for db_utils"""

    def __init__(self, arg):
        #super(db_utils, self).__init__()
        self.arg = arg
        self.db = DB(dbname='structnet_complete', user='******',
                     passwd='structnet', host='localhost')
        #self.identity = ''.join(random.choice(string.ascii_lowercase) for _ in range(8))
        #print self.identity
        print "Connected!"
        self.start_time = time.time()

    def load_pmids(self):
        f = open(self.arg['corpus_map'], 'rb')
        self.pmid_dict = marshal.load(f)
        f.close()
        print "PMID loaded!"

    def insert_prediction(self):
        cnt = 0  # 527.8M lines
        with open(self.arg['data_file'], 'r') as IN:
            for line in IN:
                if cnt % 50000 == 0:
                    print self.arg['data_file'], "process ", cnt, " lines"
                cnt += 1
                tmp = json.loads(line)
                self.db.insert(self.arg['prediction_table'],
                               entity_a=tmp['em1Text'], entity_b=tmp['em2Text'],
                               relation_type=tmp['label'], score=tmp['score'])

    def insert_query(self):
        with open(self.arg['data_file']) as IN:
            line_num = 0
            for line in IN:
                print line_num
                tmp = line.split('\t')
                target_type = ast.literal_eval(tmp[0])[0]
                if '[' in tmp[3]:
                    output_types = ast.literal_eval(tmp[3])
                else:
                    output_types = [tmp[3]]
                relation_type = tmp[2]
                self.db.insert(self.arg['query_table'], target_type=target_type,
                               output_types=output_types, index=line_num,
                               relation_type=relation_type)
                line_num += 1

    def insert_caseolap(self, index_number):
        with open(self.arg['data_file'] + str(index_number) + '_c.txt') as IN:
            for line in IN:
                tmp = line.split('\t')
                score_list = tmp[1].strip().lstrip('[').rstrip(']').split(',')
                for ele in score_list:
                    temp = ele.strip().split('|')
                    if len(temp) != 2:
                        continue
                    self.db.insert(self.arg['caseolap_table'], doc_id=index_number,
                                   sub_type=tmp[0], entity=temp[0],
                                   score=float(temp[1]))

    def query_prediction(self, name_a, name_b, relation_type):
        self.db.query("set statement_timeout TO 0")
        query_string = ("SELECT score FROM " + self.arg['prediction_table'] +
                        " WHERE entity_a='" + name_a + "' AND entity_b='" +
                        name_b + "' AND relation_type='" + relation_type + "'")
        #query_em_a = "SELECT sent_id FROM "+self.arg['entity_table']+" WHERE entity_name='"+name_a+"' GROUP BY article_id"
        q = self.db.query(query_string)
        if len(q.dictresult()) == 0:
            return 0
        else:
            return q.dictresult()[0]['score']

    def query_prediction_v2(self, name_a, name_b, relation_type):
        query_string = ("SELECT score FROM " + self.arg['prediction_table'] +
                        " WHERE entity_a='" + name_a + "' AND entity_b='" +
                        name_b + "' AND relation_type='" + relation_type + "'")
        query_ems = ("SELECT L.article_title, L.pmid, L.sent FROM entity_table as L"
                     " INNER JOIN (select sent_id FROM entity_table where entity_name='" +
                     name_a + "') AS R ON L.sent_id=R.sent_id where entity_name='" +
                     name_b + "' LIMIT 1")
        #set_a = set(map(lambda x:x['sent_id'], self.db.query(query_em_a).dictresult()))
        ##query_em_b = "SELECT distinct sent_id FROM entity_table WHERE entity_name='"+name_b+"'"
        #set_b = set(map(lambda x:x['sent_id'], self.db.query(query_em_b).dictresult()))
        #print set_a.intersection(set_b)
        self.db.query("set statement_timeout TO 100")
        try:
            result = self.db.query(query_ems).dictresult()[0]
        except:
            result = {}
        q = self.db.query(query_string)
        if len(q.dictresult()) == 0:
            result['score'] = 0
        else:
            result['score'] = q.dictresult()[0]['score']
        return result
        #print query_string

    def query_distinctive(self, target_type, output_types, relation_type, sub_types, num_records=8):
        self.db.query("set statement_timeout TO 0")
        sub_types = ast.literal_eval(sub_types)
        result = []
        query_string = ("SELECT index FROM query_table WHERE target_type @@ '" +
                        target_type + "' AND output_types@>'" + output_types +
                        "' and relation_type='" + relation_type + "'")
        #print query_string
        idx = self.db.query(query_string).dictresult()[0]['index']
        for sub_type in sub_types:
            query_string = ("SELECT entity,score FROM " + self.arg['caseolap_table'] +
                            " WHERE doc_id=" + str(idx) + " AND sub_type @@ '" +
                            sub_type + "' ORDER BY score LIMIT " + str(num_records))
            q = self.db.query(query_string)
            result.append(q.dictresult())
        return result

    def query_distinctive_v2(self, target_type, output_types, relation_type, sub_types, num_records=8):
        self.db.query("set statement_timeout TO 0")
        sub_types = ast.literal_eval(sub_types)
        result = []
        pmid_result = []
        query_string = ("SELECT index FROM query_table WHERE target_type @@ '" +
                        target_type + "' AND output_types@>'" + output_types +
                        "' and relation_type='" + relation_type + "'")
        #print query_string
        if 'MeSH' in target_type:
            type_b_name = 'type_b_mesh'
        else:
            type_b_name = 'type_b_umls'
        if 'MeSH' in output_types:
            type_a_name = 'type_a_mesh'
        else:
            type_a_name = 'type_a_umls'
        idx = self.db.query(query_string).dictresult()[0]['index']
        for sub_type in sub_types:
            query_string = ("SELECT entity,score FROM " + self.arg['caseolap_table'] +
                            " WHERE doc_id=" + str(idx) + " AND sub_type @@ '" +
                            sub_type + "' ORDER BY score LIMIT " + str(num_records))
            q = self.db.query(query_string)
            type_target = sub_type.split('::')[0]
            #print type_target
            #entity_list = q.dictresult()
            pmids = []
            for em in q.dictresult():
                qq = ("select pmid from entity_table where sent_id = "
                      "(select sent_id FROM relation_table WHERE entity_a='" +
                      em['entity'] + "' AND relation_type = '" + relation_type +
                      "' LIMIT 1) LIMIT 1")
                #"' AND relation_type = '"+relation_type+"' AND "+type_b_name+"@>'{"+type_target+"}' LIMIT 1"
                em_result = self.db.query(qq).dictresult()
                if len(em_result) > 0:
                    em_result[0]['entity'] = em['entity']
                    pmids.append(em_result[0])
            result.append(q.dictresult())
            pmid_result.append(pmids)
        return result, pmid_result

    def insert_record(self):
        cnt = 0  # 527.8M lines
        with open(self.arg['data_file'], 'r') as IN:
            for line in IN:
                #try:
                if cnt % 5000 == 0:
                    print self.arg['data_file'], "process ", cnt, " lines"
                cnt += 1
                tmp = json.loads(line)
                type_dict = {}
                for e in tmp['entityMentions']:
                    aid = str(tmp['articleId'])
                    mesh = []
                    umls = []
                    go = []
                    sub_type = defaultdict(list)
                    for l in e['label']:
                        if 'MeSH' in l:
                            mesh.append(l.replace('MeSH:::', ''))
                        elif 'GO' in l:
                            go.append(l.replace('GO:::', ''))
                        else:
                            umls.append(l)
                    if len(go) > 0:
                        sub_type['go'] = True
                    else:
                        sub_type['go'] = False
                    if len(mesh) > 0:
                        sub_type['mesh'] = mesh[0].split('::')
                    sub_type['umls'] = umls
                    type_dict[e['start']] = sub_type
                    assert (len(self.pmid_dict[aid]) < 9)
                    self.db.insert(self.arg['entity_table'], entity_name=e['text'],
                                   sent_id=tmp['sentId'], article_id=tmp['articleId'],
                                   pmid=self.pmid_dict[aid][0],
                                   article_title=self.pmid_dict[aid][1],
                                   sent=tmp['sentText'])
                for r in tmp['relationMentions']:
                    self.db.insert(
                        self.arg['relation_table'],
                        entity_a=r['em1Text'], entity_b=r['em2Text'],
                        relation_type=r['label'],
                        a_is_gene=type_dict[r['em1Start']]['go'],
                        b_is_gene=type_dict[r['em2Start']]['go'],
                        type_a_umls=type_dict[r['em1Start']]['umls'],
                        type_b_umls=type_dict[r['em2Start']]['umls'],
                        type_a_mesh=type_dict[r['em1Start']]['mesh'],
                        type_b_mesh=type_dict[r['em2Start']]['mesh'],
                        sent_id=tmp['sentId'], article_id=tmp['articleId'])
                #except:
                #    print cnt

    def generate_random_walks(self, edge_pairs, num_walks=3, num_steps=3):
        print len(edge_pairs)
        outgoing_edges = defaultdict(list)
        ingoing_edges = defaultdict(list)
        map(lambda x: outgoing_edges[x['entity_a']].append(x['entity_b']), edge_pairs)
        map(lambda x: ingoing_edges[x['entity_b']].append(x['entity_a']), edge_pairs)
        for k, v in outgoing_edges.iteritems():
            if len(v) > 0:
                v.append(k)
        #print outgoing_edges
        #print ingoing_edges
        temp = max(outgoing_edges.values(), key=len)
        origin = temp[-1]
        walk_nodes = [origin]
        total_nodes = set([origin])
        edges = set()
        for i in xrange(num_steps):
            #print "++++++++++++++++++++", i
            next_step = []
            while len(walk_nodes) > 0:
                node = walk_nodes.pop()
                if i % 2 == 0:
                    node_list = outgoing_edges[node][:-1]
                    assert (len(node_list) > 0)
                else:
                    node_list = ingoing_edges[node]
                    assert (len(node_list) > 0)
                #print node
                if len(node_list) <= num_walks:
                    next_step.extend(node_list)
                    edges = edges.union(list(map(lambda x: (node, x), node_list)))
                else:
                    #print "random_sample", node_list
                    random_nodes = random.sample(node_list, num_walks)
                    next_step.extend(random_nodes)
                    edges = edges.union(list(map(lambda x: (node, x), random_nodes)))
            assert (len(walk_nodes) == 0)
            walk_nodes = list(set(next_step))
            total_nodes = total_nodes.union(set(next_step))
        #print total_nodes
        print len(edges)
        return edges

    def query_links_with_walk(self, type_a, type_b, relation_type, num_edges=5, num_pps=1):
        try:
            type_a = ast.literal_eval(type_a)
            type_b = ast.literal_eval(type_b)
        except:
            pass
        #print num_edges
        #query_string = "SELECT * FROM (SELECT DISTINCT ON(entity_a,entity_b) entity_a,entity_b,sent_id FROM "+self.arg['relation_table']+" WHERE type_a_"+type_a['name']+"@>'"+type_a['type']+"' AND type_b_"+type_b['name']+"@>'"+type_b['type']+"' AND relation_type='"+relation_type+"') x ORDER BY RANDOM() LIMIT 500"
        query_string = ("SELECT * FROM (SELECT entity_a,entity_b,"
                        "(array_agg('[' || article_id || ',' || sent_id || ']'))[1:" +
                        str(num_pps) + "] FROM " + self.arg['relation_table'] +
                        " WHERE type_a_" + type_a['name'] + "@>'" + type_a['type'] +
                        "' AND type_b_" + type_b['name'] + "@>'" + type_b['type'] +
                        "' AND relation_type='" + relation_type +
                        "' GROUP BY entity_a,entity_b) x ORDER BY RANDOM() LIMIT " +
                        str(num_edges))
        q = self.db.query(query_string)
        result = {'node_a': {}, 'node_b': {}, 'edges': []}
        for p in list(self.generate_random_walks(q.dictresult())):
            result['node_a'][p[0]] = []
            result['node_b'][p[1]] = []
            result['edges'].append({'source': p[0], 'target': p[1]})
        #print result
        return result

    def get_category_entities(self):
        outputfile = open("category_entities.json", 'w')
        category_entity = dict()
        with open("mesh_type_hierarchy-2016.txt") as fp:
            for line in fp:
                mesh_id = line.split('\t', 1)[0]
                if len(mesh_id.split('.')) == 2:
                    mesh = line.split('\t', 1)[1]
                    query_string = ("SELECT entity_name from entity_table_slim "
                                    "where entity_mesh @> '{" +
                                    mesh.replace("'", "''") + "}'")
                    #print query_string
                    entities = self.db.query(query_string)
                    entities = entities.dictresult()
                    if len(entities) == 0:
                        continue
                    category_entity[mesh[:-1]] = []
                    for entity in entities:
                        category_entity[mesh[:-1]].append(entity['entity_name'])
        category_json = json.dumps(category_entity)
        outputfile.write(str(category_json))
        outputfile.close()

    def get_relations(self, type_a, type_b, entities_left=[], entities_right=[]):
        try:
            entities_left = ast.literal_eval(entities_left)
            entities_right = ast.literal_eval(entities_right)
        except:
            pass
        query_string = ("SELECT distinct relation_type, entity_a, entity_b "
                        "FROM relation_table_slim WHERE type_a_mesh @> '{" +
                        type_a.replace("'", "''") + "}' AND type_b_mesh @> '{" +
                        type_b.replace("'", "''") + "}'")
        q = self.db.query(query_string)
        relations = q.dictresult()
        result = set()
        for relation in relations:
            if (not entities_left or (relation['entity_a'] in entities_left)) and \
                    ((not entities_right) or (relation['entity_b'] in entities_right)):
                result.add(relation['relation_type'])
        return {'relations': list(result)}

    def query_links_by_two_sides_entities(self, entities_left, entities_right, relation_type, num_edges=5, num_pps=1):
        query_a = []
        query_b = []
        query_edges = []
        for entity_left in entities_left:
            for entity_right in entities_right:
                query_string = ("SELECT (array_agg('[' || article_id || ',' || sent_id || ']'))[1:" +
                                str(num_pps + 2) + "] as sents FROM relation_table WHERE "
                                "entity_a='" + entity_left + "' and entity_b='" +
                                entity_right + "' and relation_type='" + relation_type + "'")
                q = self.db.query(query_string)
                result = q.dictresult()
                article = dict()
                if result[0]['sents'] != None:
                    for x in result[0]['sents']:
                        x = ast.literal_eval(x)
                        article[x[0]] = str(x[1])
                    print "article: ", article
                    query_a.append(entity_left)
                    query_b.append(entity_right)
                    query_edges.append({'source': entity_left,
                                        'target': entity_right,
                                        'sids': article.values(),
                                        'sents': []})
        red_node = dict()
        for v in query_a:
            tmp = self.db.query("SELECT article_titles, pmids, sents FROM entity_table_compressed WHERE entity_name= '" + v + "' LIMIT " + str(1))
            dictresult = tmp.dictresult()[0]
            red_node[v] = []
            random_nums = random.sample(range(0, len(dictresult['article_titles'])),
                                        min(len(dictresult['article_titles']), num_pps))
            for i in range(0, len(random_nums)):
                red_node[v].append((dictresult['article_titles'][random_nums[i]],
                                    dictresult['sents'][random_nums[i]],
                                    dictresult['pmids'][random_nums[i]]))
        blue_node = dict()
        for v in query_b:
            tmp = self.db.query("SELECT article_titles, pmids, sents FROM entity_table_compressed WHERE entity_name= '" + v + "' LIMIT " + str(1))
            dictresult = tmp.dictresult()[0]
            blue_node[v] = []
            random_nums = random.sample(range(0, len(dictresult['article_titles'])),
                                        min(len(dictresult['article_titles']), num_pps))
            for i in range(0, len(random_nums)):
                blue_node[v].append((dictresult['article_titles'][random_nums[i]],
                                     dictresult['sents'][random_nums[i]],
                                     dictresult['pmids'][random_nums[i]]))
        for edge in query_edges:
            for s_id in edge['sids'][:num_pps]:
                tmp = self.db.query("SELECT article_title, pmid, sent FROM entity_table WHERE sent_id = '" + s_id + "' LIMIT 1")
                edge['sents'].append(tmp.dictresult()[0])
        return {'node_a': red_node, 'node_b': blue_node, 'edge': query_edges}

    def query_links_by_left_entities(self, entities_left, type_b, relation_type, num_edges, num_pps):
        self.db.query("set statement_timeout TO 0")
        query_string = ("SELECT * FROM (SELECT entity_a,entity_b,"
                        "(array_agg('[' || article_id || ',' || sent_id || ']'))[1:" +
                        str(num_pps + 2) + "] as sents FROM " + self.arg['relation_table'] +
                        " WHERE type_b_" + type_b['name'] + "@>'" + type_b['type'] +
                        "' AND relation_type='" + relation_type +
                        "' GROUP BY entity_a,entity_b) x")
        q = self.db.query(query_string)
        result = q.dictresult()
        query_a = []
        query_b = []
        query_edges = []
        count = 0
        for r in result:
            if r['entity_a'] in entities_left:
                if count < int(num_edges):
                    count = count + 1
                else:
                    break
                query_a.append(r['entity_a'])
                query_b.append(r['entity_b'])
                article = dict()
                for x in r['sents']:
                    x = ast.literal_eval(x)
                    article[x[0]] = str(x[1])
                query_edges.append({'source': r['entity_a'],
                                    'target': r['entity_b'],
                                    'sids': article.values(),
                                    'sents': []})
        red_node = dict()
        for v in query_a:
            tmp = self.db.query("SELECT article_titles, pmids, sents FROM entity_table_compressed WHERE entity_name= '" + v + "' LIMIT " + str(1))
            dictresult = tmp.dictresult()[0]
            red_node[v] = []
            random_nums = random.sample(range(0, len(dictresult['article_titles'])),
                                        min(len(dictresult['article_titles']), num_pps))
            for i in range(0, len(random_nums)):
                red_node[v].append((dictresult['article_titles'][random_nums[i]],
                                    dictresult['sents'][random_nums[i]],
                                    dictresult['pmids'][random_nums[i]]))
        blue_node = dict()
        for v in query_b:
            tmp = self.db.query("SELECT article_titles, pmids, sents FROM entity_table_compressed WHERE entity_name= '" + v + "' LIMIT " + str(1))
            dictresult = tmp.dictresult()[0]
            blue_node[v] = []
            random_nums = random.sample(range(0, len(dictresult['article_titles'])),
                                        min(len(dictresult['article_titles']), num_pps))
            for i in range(0, len(random_nums)):
                blue_node[v].append((dictresult['article_titles'][random_nums[i]],
                                     dictresult['sents'][random_nums[i]],
                                     dictresult['pmids'][random_nums[i]]))
        for edge in query_edges:
            for s_id in edge['sids'][:num_pps]:
                tmp = self.db.query("SELECT article_title, pmid, sent FROM entity_table WHERE sent_id = '" + s_id + "' LIMIT 1")
                edge['sents'].append(tmp.dictresult()[0])
        return {'node_a': red_node, 'node_b': blue_node, 'edge': query_edges}

    def query_links_by_right_entities(self, type_a, entities_right, relation_type, num_edges, num_pps):
        self.db.query("set statement_timeout TO 0")
        query_string = ("SELECT * FROM (SELECT entity_a,entity_b,"
                        "(array_agg('[' || article_id || ',' || sent_id || ']'))[1:" +
                        str(num_pps + 2) + "] as sents FROM " + self.arg['relation_table'] +
                        " WHERE type_a_" + type_a['name'] + "@>'" + type_a['type'] +
                        "' AND relation_type='" + relation_type +
                        "' GROUP BY entity_a,entity_b) x")
        q = self.db.query(query_string)
        print "right entities"
        result = q.dictresult()
        query_a = []
        query_b = []
        query_edges = []
        count = 0
        for r in result:
            if r['entity_b'] in entities_right:
                if count < int(num_edges):
                    count = count + 1
                else:
                    break
                print 'entity_a: ', r['entity_a']
                print 'entity_b: ', r['entity_b']
                query_a.append(r['entity_a'])
                query_b.append(r['entity_b'])
                article = dict()
                for x in r['sents']:
                    x = ast.literal_eval(x)
                    article[x[0]] = str(x[1])
                query_edges.append({'source': r['entity_a'],
                                    'target': r['entity_b'],
                                    'sids': article.values(),
                                    'sents': []})
        print 'query_a: ', query_a
        print 'query_b: ', query_b
        print 'query_edges: ', query_edges
        red_node = dict()
        for v in query_a:
            tmp = self.db.query("SELECT article_titles, pmids, sents FROM entity_table_compressed WHERE entity_name= '" + v + "' LIMIT " + str(1))
            dictresult = tmp.dictresult()[0]
            red_node[v] = []
            random_nums = random.sample(range(0, len(dictresult['article_titles'])),
                                        min(len(dictresult['article_titles']), num_pps))
            for i in range(0, len(random_nums)):
                red_node[v].append((dictresult['article_titles'][random_nums[i]],
                                    dictresult['sents'][random_nums[i]],
                                    dictresult['pmids'][random_nums[i]]))
        blue_node = dict()
        for v in query_b:
            tmp = self.db.query("SELECT article_titles, pmids, sents FROM entity_table_compressed WHERE entity_name= '" + v + "' LIMIT " + str(1))
            dictresult = tmp.dictresult()[0]
            blue_node[v] = []
            random_nums = random.sample(range(0, len(dictresult['article_titles'])),
                                        min(len(dictresult['article_titles']), num_pps))
            for i in range(0, len(random_nums)):
                blue_node[v].append((dictresult['article_titles'][random_nums[i]],
                                     dictresult['sents'][random_nums[i]],
                                     dictresult['pmids'][random_nums[i]]))
        for edge in query_edges:
            for s_id in edge['sids'][:num_pps]:
                tmp = self.db.query("SELECT article_title, pmid, sent FROM entity_table WHERE sent_id = '" + s_id + "' LIMIT 1")
                edge['sents'].append(tmp.dictresult()[0])
        return {'node_a': red_node, 'node_b': blue_node, 'edge': query_edges}

    def query_links_by_categories(self, type_a, type_b, relation_type, num_edges, num_pps):
        self.db.query("set statement_timeout TO 0")
        query_string_v2 = ("SELECT * FROM (SELECT entity_a,entity_b,"
                           "(array_agg('[' || article_id || ',' || sent_id || ']'))[1:" +
                           str(num_pps + 2) + "] as sents FROM " + self.arg['relation_table'] +
                           " WHERE type_a_" + type_a['name'] + "@>'" + type_a['type'] +
                           "' AND type_b_" + type_b['name'] + "@>'" + type_b['type'] +
                           "' AND relation_type='" + relation_type +
                           "' GROUP BY entity_a,entity_b) x ORDER BY RANDOM() LIMIT " +
                           str(num_edges))
        print query_string_v2
        q = self.db.query(query_string_v2)
        temp_time = time.time() - self.start_time
        self.start_time = time.time()
        print "------query time cost = ", temp_time
        result = q.dictresult()
        query_a = []
        query_b = []
        query_edges = []
        for r in result:
            print 'entity_a: ', r['entity_a']
            print 'entity_b: ', r['entity_b']
            query_a.append(r['entity_a'])
            query_b.append(r['entity_b'])
            article = dict()
            for x in r['sents']:
                x = ast.literal_eval(x)
                article[x[0]] = str(x[1])
            query_edges.append({'source': r['entity_a'],
                                'target': r['entity_b'],
                                'sids': article.values(),
                                'sents': []})
        print "query_a: ", query_a
        print "query_b: ", query_b
        red_node = dict()
        for v in query_a:
            #tmp = self.db.query("select article_title, pmid, sent from entity_table where entity_name= '" + v + "' LIMIT " + str(num_pps))
            tmp = self.db.query("SELECT article_titles, pmids, sents FROM entity_table_compressed WHERE entity_name= '" + v + "' LIMIT " + str(1))
            dictresult = tmp.dictresult()[0]
            red_node[v] = []
            random_nums = random.sample(range(0, len(dictresult['article_titles'])),
                                        min(len(dictresult['article_titles']), num_pps))
            for i in range(0, len(random_nums)):
                red_node[v].append((dictresult['article_titles'][random_nums[i]],
                                    dictresult['sents'][random_nums[i]],
                                    dictresult['pmids'][random_nums[i]]))
            #red_node[v] = map(lambda x: (x['article_titles'], x['sents'], x['pmids']), tmp.dictresult())
        blue_node = dict()
        for v in query_b:
            tmp = self.db.query("SELECT article_titles, pmids, sents FROM entity_table_compressed WHERE entity_name= '" + v + "' LIMIT " + str(1))
            dictresult = tmp.dictresult()[0]
            blue_node[v] = []
            random_nums = random.sample(range(0, len(dictresult['article_titles'])),
                                        min(len(dictresult['article_titles']), num_pps))
            for i in range(0, len(random_nums)):
                blue_node[v].append((dictresult['article_titles'][random_nums[i]],
                                     dictresult['sents'][random_nums[i]],
                                     dictresult['pmids'][random_nums[i]]))
        temp_time = time.time() - self.start_time
        self.start_time = time.time()
        print "------query entity_a and entity_b time cost = ", temp_time
        for edge in query_edges:
            for s_id in edge['sids'][:num_pps]:
                tmp = self.db.query("select article_title, pmid, sent from entity_table where sent_id= '" + s_id + "' LIMIT 1")
                edge['sents'].append(tmp.dictresult()[0])
        temp_time = time.time() - self.start_time
        self.start_time = time.time()
        print "red_node number = ", len(red_node), " blue_node number = ", len(blue_node), " edges number = ", len(query_edges)
        print "------query edges time cost = ", temp_time
        return {'node_a': red_node, 'node_b': blue_node, 'edge': query_edges}

    def query_links_v2(self, type_a, type_b, relation_type, entities_left=[], entities_right=[], num_edges=5, num_pps=1):
        try:
            type_a = ast.literal_eval(type_a)
            type_b = ast.literal_eval(type_b)
            entities_left = ast.literal_eval(entities_left)
            entities_right = ast.literal_eval(entities_right)
        except:
            pass
        if len(entities_left) > 0 and len(entities_right) > 0:
            print "two sides"
            result = self.query_links_by_two_sides_entities(
                entities_left, entities_right, relation_type, num_edges, num_pps)
        elif len(entities_left) == 0 and len(entities_right) == 0:
            print "categories"
            result = self.query_links_by_categories(type_a, type_b, relation_type,
                                                    num_edges, num_pps)
        elif len(entities_left) == 0:
            print "right"
            result = self.query_links_by_right_entities(
                type_a, entities_right, relation_type, num_edges, num_pps)
        elif len(entities_right) == 0:
            print "left"
            result = self.query_links_by_left_entities(entities_left, type_b,
                                                       relation_type, num_edges,
                                                       num_pps)
        nodes = []
        if 'node_a' in result:
            for node_name in result['node_a']:
                node = dict()
                node['name'] = node_name
                node['sents'] = []
                for article in result['node_a'][node_name]:
                    temp = dict()
                    temp['artitle_title'] = article[0]
                    temp['sent'] = article[1]
                    temp['pmid'] = article[2]
                    node['sents'].append(temp)
                node['group'] = 1
                nodes.append(node)
        if 'node_b' in result:
            for node_name in result['node_b']:
                node = dict()
                node['name'] = node_name
                node['sents'] = []
                for article in result['node_b'][node_name]:
                    temp = dict()
                    temp['artitle_title'] = article[0]
                    temp['sent'] = article[1]
                    temp['pmid'] = article[2]
                    node['sents'].append(temp)
                node['group'] = 2
                nodes.append(node)
        #print json.dumps({'edges': result['edge'], 'nodes': nodes})
        return {'edges': result['edge'], 'nodes': nodes}

    def query_links(self, type_a, type_b, relation_type, num_edges=5, num_pps=1):
        #type_a={'mesh':0, 'name':"Chemicals_and_Drugs"}
        try:
            type_a = ast.literal_eval(type_a)
            type_b = ast.literal_eval(type_b)
        except:
            pass
        print "type_a = ", type_a
        print "type_b = ", type_b
        print "relation_type = ", relation_type
        self.db.query("set statement_timeout TO 0")
        #query_string = "SELECT * INTO "+self.identity+" FROM (SELECT DISTINCT ON(entity_a,entity_b) entity_a,entity_b,sent_id FROM "+self.arg['relation_table']+" WHERE type_a_"+type_a['name']+"@>'"+type_a['type']+"' AND type_b_"+type_b['name']+"@>'"+type_b['type']+"' AND relation_type='"+relation_type+"') x ORDER BY RANDOM() LIMIT "+str(num_edges)
        query_string_v2 = ("SELECT * FROM (SELECT entity_a,entity_b,"
                           "(array_agg('[' || article_id || ',' || sent_id || ']'))[1:" +
                           str(num_pps + 2) + "] as sents FROM " + self.arg['relation_table'] +
                           " WHERE type_a_" + type_a['name'] + "@>'" + type_a['type'] +
                           "' AND type_b_" + type_b['name'] + "@>'" + type_b['type'] +
                           "' AND relation_type='" + relation_type +
                           "' GROUP BY entity_a,entity_b) x ORDER BY RANDOM() LIMIT " +
                           str(num_edges))
        print query_string_v2
        q = self.db.query(query_string_v2)
        temp_time = time.time() - self.start_time
        self.start_time = time.time()
        #print "------query time cost = ", temp_time
        result = q.dictresult()
        #print "result length = ", len(result)
        query_a = []
        query_b = []
        query_edges = []
        for r in result:
            query_a.append(r['entity_a'])
            query_b.append(r['entity_b'])
            article = dict()
            for x in r['sents']:
                x = ast.literal_eval(x)
                article[x[0]] = str(x[1])
            query_edges.append({'source': r['entity_a'],
                                'target': r['entity_b'],
                                'sids': article.values(),
                                'sents': []})
        red_node = dict()
        #print "len query_a = ", len(query_a), "; len query_b = ", len(query_b), "; len query_edges", len(query_edges)
        for v in query_a:
            tmp = self.db.query("select article_title, pmid, sent from entity_table where entity_name= '" + v + "' LIMIT " + str(num_pps))
            #tmp = self.db.query("select distinct on (article_id) article_title, pmid, sent from entity_table where entity_name= '" + v + "' LIMIT " + str(num_pps))
            red_node[v] = map(lambda x: (x['article_title'], x['sent'], x['pmid']),
                              tmp.dictresult())
        #query_b = self.db.query("select entity_b from "+self.identity)
        blue_node = dict()
        for v in query_b:
            tmp = self.db.query("select article_title, pmid, sent from entity_table where entity_name= '" + v + "' LIMIT " + str(num_pps))
            blue_node[v] = map(lambda x: (x['article_title'], x['sent'], x['pmid']),
                               tmp.dictresult())
        temp_time = time.time() - self.start_time
        self.start_time = time.time()
        #print "------query entity_a and entity_b time cost = ", temp_time
        #print red_node, blue_node
        for edge in query_edges:
            for s_id in edge['sids'][:num_pps]:
                tmp = self.db.query("select article_title, pmid, sent from entity_table where sent_id= '" + s_id + "' LIMIT 1")
                edge['sents'].append(tmp.dictresult()[0])
        temp_time = time.time() - self.start_time
        self.start_time = time.time()
        #print "red_node number = ", len(red_node), " blue_node number = ", len(blue_node), " edges number = ", len(query_edges)
        #print "------query edges time cost = ", temp_time
        #query_edge = "SELECT DISTINCT T.entity_a as source, T.entity_b as target, E.pmid, E.article_title, E.sent FROM " + self.arg['entity_table'] + " AS E INNER JOIN " + self.identity + " T ON E.sent_id = T.sent_id"
        #query_edges = "SELECT distinct on (R.article_id) R.article_id, R.sent_id from " + self.arg['relation_table'] + " AS R inner join " + self.identity + " T on R.entity_a = T.entity_a and R.entity_b = T.entity_b and R.relation_type = '" + relation_type + "' LIMIT " + str(num_pps)
        #q = self.db.query(query_edge)
        #self.db.query("drop table " + self.identity)
        return {'node_a': red_node, 'node_b': blue_node, 'edge': query_edges}
# PostGreDBConnector: schema management and CRUD helpers for an RDF-parsing
# pipeline built on PyGreSQL; add_quotes() is a helper defined elsewhere in
# the original module.
from pg import DB


class PostGreDBConnector:
    """PostGreDBConnector opens a PostGre DB connection. Different functions
    allow you to add, delete or update documents in PostGre DB."""

    def __init__(self):
        """Connect to the local PostGre DB (default host and port
        [localhost, 5432]) and reset the needed database and tables."""
        try:
            print("Connecting to PostGre DB...")
            self.__db = DB(dbname='testdb', host='localhost', port=5432,
                           user='******', passwd='superuser')
            print("PostGre DB connection successfully built.")
        except Exception:  # pg raises its own error classes, not ConnectionError
            print("PostGre DB connection could not be built.")
        self.delete_all_data()
        self.drop_all_tables()

    def close_connection(self):
        self.__db.close()

    def create_schema(self, schema_name):
        self.__db.query("CREATE SCHEMA " + schema_name)
        self.__create_tables(schema_name)
        self.__create_functions(schema_name)

    def __create_tables(self, schema):
        """Create needed tables for RDF parsing."""
        schema += "."
        self._add_table("CREATE TABLE " + schema + "texts (id serial primary key, title text)")
        self._add_table("CREATE TABLE " + schema + "bscale (id serial primary key, bscale text, nominal bool, ordinal bool, interval bool)")
        self._add_table("CREATE TABLE " + schema + "bsort (id serial primary key, bsort text)")
        self._add_table("CREATE TABLE " + schema + "pattern (id serial primary key, pattern text)")
        self._add_table("CREATE TABLE " + schema + "single_pattern (id serial primary key, single_pattern text)")
        self._add_table("CREATE TABLE " + schema + "snippets (id serial primary key, snippet text)")
        # relations
        self._add_table("CREATE TABLE " + schema + "has_attribute (bsort_id int, bscale_id integer[], aggregation int)")
        self._add_table("CREATE TABLE " + schema + "has_object (bscale_id int, pattern_id integer[], aggregation int)")
        self._add_table("CREATE TABLE " + schema + "pattern_single_pattern (pattern_id int, single_pattern_id integer[], aggregation int)")
        self._add_table("CREATE TABLE " + schema + "texts_snippets (text_id int primary key, snippet_id integer[], aggregation int)")
        self._add_table("CREATE TABLE " + schema + "snippet_offsets (id serial primary key,"
                        " single_pattern_id int, snippet_id int, offsets integer[][], aggregation int)")
        # adjective and verb extractions
        self._add_table("CREATE TABLE " + schema + "subject_occ (id serial primary key, subject text, count int)")
        self._add_table("CREATE TABLE " + schema + "adjective_occ (id serial primary key, adjective text, count int)")
        self._add_table("CREATE TABLE " + schema + "verb_occ (id serial primary key, verb text, count int)")
        self._add_table("CREATE TABLE " + schema + "object_occ (id serial primary key, object text, count int)")
        self._add_table("CREATE TABLE " + schema + "subject_adjective_occ (id serial primary key, subject int, adjective int, count int, pmi float)")
        self._add_table("CREATE TABLE " + schema + "subject_object_occ (id serial primary key, subject int, object int, count int, pmi float)")
        self._add_table("CREATE TABLE " + schema + "object_verb_occ (id serial primary key, object int, verb int, count int, pmi float)")
        self._add_table("CREATE TABLE " + schema + "subject_verb_occ (id serial primary key, subject int, verb int, count int, pmi float)")
        # correlating pattern
        self._add_table("CREATE TABLE " + schema + "bscale_single_pattern (id serial primary key, bscale_id int, single_pattern_id int, single_pattern text, count int)")
        self._add_table("CREATE TABLE " + schema + "correlating_pattern (id serial primary key, pattern_a int, pattern_b int, count int, pmi float)")

    def __create_functions(self, schema):
        """Create all necessary functions to aggregate the results saved in the database."""
        schema += "."
        self.add_function(schema + "aggregate_texts_snippets",
                          "SELECT text_id, array_length(snippet_id, 1) FROM " + schema + "texts_snippets")
        self.add_function(schema + "aggregate_snippet_offsets",
                          "SELECT id, array_length(offsets, 1) FROM " + schema + "snippet_offsets")

    def add_function(self, name, function):
        """Create a new function in the db."""
        create_function = "CREATE FUNCTION "
        returns = "() RETURNS SETOF RECORD AS "
        lang = " LANGUAGE SQL"
        query = create_function + name + returns + add_quotes(function) + lang
        self.__db.query(query)

    def _add_table(self, query):
        """Create a new table with a query."""
        self.__db.query(query)

    def add_table(self, schema, name, rows):
        """Create a new table with a name and rows given in query form."""
        self.__db.query("CREATE TABLE " + schema + "." + name + rows)

    def insert(self, schema, table, row):
        """Insert a new row element into a specified table."""
        return self.__db.insert(schema + "." + table, row)

    def is_in_table(self, schema, table, where_clause):
        """Return whether a row already exists in a table or not."""
        q = "SELECT * FROM " + schema + "." + table + " WHERE " + where_clause
        result = self.__db.query(q).dictresult()
        return len(result) > 0

    def update(self, schema, table, values, where_clause):
        """Update an entry in a specified table."""
        query = ("UPDATE " + schema + "." + table + " SET " + values +
                 " WHERE " + where_clause)
        self.query(query)

    def get(self, schema, table, where_clause, key):
        """Return the key of a specific item in a table."""
        q = ("SELECT " + key + " FROM " + schema + "." + table +
             " WHERE " + where_clause)
        result = self.__db.query(q).dictresult()
        if len(result) > 0:
            return result[0][key]
        else:
            return None

    def get_data_from_table(self, schema, table):
        """Get all data available in a specific table."""
        return self.__db.query("SELECT * FROM " + schema + "." + table).dictresult()

    def get_id(self, schema, table, where_clause):
        """Return the id of an item in a table, or None if it is not found."""
        q = "SELECT id FROM " + schema + "." + table + " WHERE " + where_clause
        result = self.__db.query(q).dictresult()
        if len(result) > 0:
            return result[0]['id']
        else:
            return None

    def delete_from_table(self, schema, table, row):
        """Delete a row element from a specific table."""
        return self.__db.delete(schema + "." + table, row)

    def delete_data_in_table(self, schema, table):
        """Delete all data in a specific table."""
        self.__db.truncate(schema + "." + table, restart=True, cascade=True, only=False)

    def delete_all_data(self):
        """Delete all data from all existing tables."""
        tables = self.get_tables()
        for table in tables:
            table_name = str(table)
            self.__db.truncate(table_name, restart=True, cascade=True, only=False)

    def get_tables(self):
        """Get all available tables in the database."""
        return self.__db.get_tables()

    def get_attributes(self, schema, table):
        """Get all attributes of a specified table."""
        return self.__db.get_attnames(schema + "." + table)

    def drop_table(self, schema, table):
        """Drop a specified table."""
        self.__db.query("DROP TABLE " + schema + "." + table)

    def drop_all_tables(self):
        """Drop all existing tables."""
        tables = self.get_tables()
        if len(tables) > 0:
            self.__db.query("DROP TABLE " + ", ".join(str(table) for table in tables))
        else:
            print("Nothing to delete.")

    def get_all(self, schema, table, attribute):
        """Get one or more attributes of all entries from a specified table."""
        query = "SELECT " + attribute + " FROM " + schema + "." + table
        return self.__db.query(query).dictresult()

    def query(self, query):
        """Send a query to the database."""
        result = self.__db.query(query)
        if result is not None:
            if not isinstance(result, str):
                return result.dictresult()
            else:
                return result
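# Hedged usage sketch for PostGreDBConnector: create a schema, insert a row,
# and read it back. The schema name is a placeholder; the table names come
# from __create_tables() above. Note that __init__ wipes existing tables.
connector = PostGreDBConnector()
connector.create_schema('run1')
connector.insert('run1', 'texts', {'title': 'sample document'})
print(connector.get_data_from_table('run1', 'texts'))
connector.close_connection()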
# Fragment of a venue-deduplication loop: db, exist, curId, venue, row,
# geocode() and FormatAddress() are defined earlier in the original script.
if exist == []:
    # insert a new venue
    curId = curId + 1
    venue["id"] = curId
    venue["venue_name"] = row["venue_name"]
    venue["venue_add_comp"] = row["venue_add_comp"]
    if row["venue_add_comp"] is not None:
        (add, lat, lon) = geocode(FormatAddress(row["venue_add_comp"]))
    else:
        add = None
        lat = None
        lon = None
    venue["venue_address"] = add
    venue["lat"] = lat
    venue["lon"] = lon
    db.insert('city.venues', venue)
    print('INSERT VENUE', row["venue_name"])
    row["venue_id"] = curId
elif None in exist[0][0:4]:
    # update an existing venue that is missing fields
    venue["id"] = exist[0][2]
    row["venue_id"] = exist[0][2]
    venue["venue_name"] = row["venue_name"]
    venue["venue_add_comp"] = row["venue_add_comp"]
    if row["venue_add_comp"] is not None:
        (add, lat, lon) = geocode(FormatAddress(row["venue_add_comp"]))
    else:
        add = None
        lat = None
        lon = None
    venue["venue_address"] = add
    # the update branch continues in the original source (truncated here)