def post(self):
    """Receive a crawl-history check-in from a client and store it.

    Reads ``latest_time``, ``latest_timestamp`` and ``container_id`` from
    the request, acknowledges the client immediately, then writes one row
    into the MySQL ``cache_history`` table stamped with the server-side
    check-in time.
    """
    # Pull the expected arguments off the request; answer the client first.
    try:
        latest_time = self.get_argument('latest_time')
        latest_timestamp = self.get_argument('latest_timestamp')
        container_id = self.get_argument('container_id')
        self.write('success')
        self.finish()
        print('Success: to get data from web')
    except Exception as e:
        self.write('fail to return user history')
        self.finish()
        print('Error:server-HistoryReturn:'
              'Unable to get value from http package,Reason:')
        print(e)
        return
    dbi = MySQL_Interface()
    checkin_timestamp = int(time.time())
    col_info = dbi.get_col_name('cache_history')
    data = dict(
        latest_time=latest_time,
        latest_timestamp=latest_timestamp,
        container_id=container_id,
        checkin_timestamp=checkin_timestamp,
    )
    # Map the dict onto the table's column order; missing columns become NULL.
    keys = data.keys()
    insert_data = [[data[item] if item in keys else None for item in col_info]]
    dbi.insert_asList('cache_history', insert_data)
def run(self):
    """Recover crawl jobs that have been stuck for more than 12 hours.

    Once a minute: delete their cache_history rows, drop their partial
    packets from MongoDB's assemble_factory, clear the isGettingBlog flag
    in user_info_table so the users can be scheduled again, and finally
    purge cache_history rows whose is_dealing stamp is equally stale.
    """
    while True:
        self.dbi = MySQL_Interface()  # fresh connection every cycle
        time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time() - 12 * 60 * 60))
        # Drop the stuck users' cache_history rows.
        query = 'delete from cache_history where container_id in (select container_id from user_info_table where isGettingBlog<\'{time}\' and update_time is null)'.format(time=time_stick)
        self.dbi.update_asQuery(query)
        # Remove their half-assembled packets from the mongo assemble factory.
        select_query = 'select container_id from user_info_table where isGettingBlog<\'{time}\' and update_time is null'.format(time=time_stick)
        stuck_ids = [row[0] for row in self.dbi.select_asQuery(select_query)]
        client = MongoClient('localhost', 27017)
        db = client['microblog_spider']
        assemble_table = db.assemble_factory
        assemble_table.remove({'container_id': {'$in': stuck_ids}})
        # Reset the flag so these users become schedulable again.
        query = "update user_info_table set isGettingBlog=null where isGettingBlog<\'{time}\' and update_time is null".format(time=time_stick)
        self.dbi.update_asQuery(query)
        # Clear leftover cache_history rows whose processing timed out.
        query = "delete from cache_history where is_dealing<\'{time}\' ;".format(time=time_stick)
        self.dbi.update_asQuery(query)
        time.sleep(60)
class deal_isGettingBLog_user(threading.Thread):
    """Background worker that frees users stuck in the history-fetch state.

    Every minute it finds user_info_table rows whose isGettingBlog stamp
    is older than 12 hours (and whose update_time is still null), deletes
    their cache_history rows and half-assembled MongoDB packets, then
    clears the flag so the users can be scheduled again.
    """

    def __init__(self):
        threading.Thread.__init__(self)
        self.dbi = MySQL_Interface()

    def run(self):
        while True:
            self.dbi = MySQL_Interface()  # fresh connection every cycle
            deadline = time.strftime('%Y-%m-%d %H:%M:%S',
                                     time.localtime(time.time() - 12 * 60 * 60))
            # Drop the stuck users' cache_history rows.
            query = 'delete from cache_history where container_id in (select container_id from user_info_table where isGettingBlog<\'{time}\' and update_time is null)'.format(time=deadline)
            self.dbi.update_asQuery(query)
            # Remove their partial packets from the mongo assemble factory.
            select_query = 'select container_id from user_info_table where isGettingBlog<\'{time}\' and update_time is null'.format(time=deadline)
            stuck = [row[0] for row in self.dbi.select_asQuery(select_query)]
            client = MongoClient('localhost', 27017)
            db = client['microblog_spider']
            assemble_table = db.assemble_factory
            assemble_table.remove({'container_id': {'$in': stuck}})
            # Finally clear the isGettingBlog flag.
            query = "update user_info_table set isGettingBlog=null where isGettingBlog<\'{time}\' and update_time is null".format(time=deadline)
            self.dbi.update_asQuery(query)
            time.sleep(60)
def run(self):
    """Expire update-missions that have been running for over six hours.

    Takes one expired mission at a time from MongoDB's update_mission
    collection, clears isGettingBlog for its users in MySQL, removes the
    related assemble_factory data, and deletes the mission document.
    Sleeps a minute whenever nothing is expired.
    """
    while True:
        client = MongoClient('localhost', 27017)
        db = client['microblog_spider']
        mission_mongo = db.update_mission
        assemble_mongo = db.assemble_factory
        target_time = int(time.time()) - 60 * 60 * 6
        # At most one mission that started more than 6 hours ago.
        expired = list(mission_mongo.find({'mission_start': {'$lt': target_time}}).limit(1))
        if not expired:
            time.sleep(60)
            continue
        mission = expired[0]
        mission_id = mission['mission_id']
        user_list = [entry['container_id'] for entry in mission['user_list']]
        # Clear isGettingBlog for every user of the expired mission.
        quoted_ids = ','.join('\'' + str(cid) + '\'' for cid in user_list)
        dbi = MySQL_Interface()
        query = 'update user_info_table set isGettingBlog=null where container_id in ({user_list});' \
            .format(user_list=quoted_ids)
        dbi.update_asQuery(query)
        # NOTE(review): this removes assemble rows keyed by the *mission id*,
        # while assemble_factory is elsewhere keyed by container_id — confirm
        # whether {'$in': user_list} was intended here.
        assemble_mongo.remove({'container_id': mission_id})
        # Drop the mission itself from the mission table.
        mission_mongo.remove({'mission_id': mission_id})
def run(self):
    """Drain cache_attends into ready_to_get in batches.

    Rows whose uid is already present in the user_info_table Bloom filter
    are skipped; the rest are buffered and bulk-inserted once the buffer
    exceeds the batch limit.  Every processed row is deleted from
    cache_attends.
    """
    pending_rows = []   # rows awaiting a bulk insert into ready_to_get
    pending_uids = []   # uids matching pending_rows
    batch_limit = 1000  # flush threshold for the buffer
    ready_to_get_col = self.dbi.get_col_name('ready_to_get')
    cache_attends_col = self.dbi.get_col_name('cache_attends')
    while True:
        res = self.dbi.select_asQuery('select * from cache_attends limit 5000')
        if len(res) == 0:
            if len(pending_rows) > 0:
                # Flush the partial batch before idling.
                self.dbi.insert_asList('ready_to_get', pending_rows, unique=True)
                pending_rows = []
                pending_uids = []
            time.sleep(1)
            self.dbi = MySQL_Interface()  # refresh the connection
            continue
        print('thread cache attends is working')
        for row in res:
            uid = row[cache_attends_col.index('uid')]
            # Skip users already known to user_info_table.
            if not self.bf.isContains(uid, 'user_info_table'):
                pending_rows.append([row[cache_attends_col.index(col)]
                                     if col in cache_attends_col else None
                                     for col in ready_to_get_col])
                pending_uids.append(uid)
                if len(pending_rows) > batch_limit:
                    self.dbi.insert_asList('ready_to_get', pending_rows, unique=True)
                    print('insert once')
                    pending_rows = []
                    pending_uids = []
            self.dbi.delete_line('cache_attends', 'uid', uid)
def post(self):
    """Accept a history check-in from a client and queue it in MySQL.

    The client posts latest_time / latest_timestamp / container_id; the
    handler acknowledges immediately, then records the check-in together
    with the server-side timestamp in the cache_history table.
    """
    try:
        latest_time = self.get_argument('latest_time')
        latest_timestamp = self.get_argument('latest_timestamp')
        container_id = self.get_argument('container_id')
        self.write('success')
        self.finish()
        print('Success: to get data from web')
    except Exception as e:
        # Missing or malformed arguments: report failure and stop here.
        self.write('fail to return user history')
        self.finish()
        print('Error:server-HistoryReturn:'
              'Unable to get value from http package,Reason:')
        print(e)
        return
    dbi = MySQL_Interface()
    payload = {
        'latest_time': latest_time,
        'latest_timestamp': latest_timestamp,
        'container_id': container_id,
        'checkin_timestamp': int(time.time()),
    }
    # Align the payload with cache_history's column order (None for gaps).
    row = [payload.get(col) for col in dbi.get_col_name('cache_history')]
    dbi.insert_asList('cache_history', [row])
def run(self):
    """Once a second, release ready_to_get rows whose fetch began >1h ago."""
    while True:
        self.dbi = MySQL_Interface()
        t = time.time()
        # Anything marked is_fetching more than an hour ago counts as stale.
        time_stick = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(t - 3600))
        query = "update ready_to_get set is_fetching=null where is_fetching < \'{time}\' ;".format(time=time_stick)
        self.dbi.update_asQuery(query)
        time.sleep(1)
class deal_cache_user_info(threading.Thread):
    """Move rows from cache_user_info into user_info_table, then purge them.

    Each cycle reads the whole cache table, reshapes every row onto the
    user_info_table schema (stamping insert_time, nulling the bookkeeping
    columns), inserts them, registers the uids in the Bloom filter and
    deletes the processed uids from cache_user_info and ready_to_get.
    """

    def __init__(self):
        threading.Thread.__init__(self)
        self.dbi = MySQL_Interface()
        self.bf = BloomFilter()

    def run(self):
        while True:
            if self.dbi.is_empty('cache_user_info'):
                time.sleep(2)
                self.dbi = MySQL_Interface()  # refresh connection while idle
                continue
            [res, cache_user_info_col] = self.dbi.select_all('cache_user_info')
            time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(time.time()))
            user_info_table_col = self.dbi.get_col_name('user_info_table')

            def _cell(line, col):
                # Copy matching columns, stamp insert_time, NULL the
                # bookkeeping columns, blank everything else.
                if col in cache_user_info_col:
                    return line[cache_user_info_col.index(col)]
                if col == 'insert_time':
                    return time_stick
                if col in ('update_time', 'latest_blog', 'isGettingBlog'):
                    return None
                return ''

            data = [[_cell(line, col) for col in user_info_table_col]
                    for line in res]
            uid_list = [line[user_info_table_col.index('uid')] for line in data]
            self.dbi.insert_asList('user_info_table', data, unique=True)
            self.bf.insert_asList(uid_list, 'user_info_table')
            print('insert {num} users into user info table'.format(num=data.__len__()))
            # Delete the processed uids from both staging tables.
            uid_list = [line[cache_user_info_col.index('uid')] for line in res]
            q1 = "delete from {table_name} where uid in ( {id_str_list} ) ;"
            id_str_list = ','.join('\'' + str(i) + '\'' for i in uid_list)
            query = q1.format(id_str_list=id_str_list, table_name='cache_user_info')
            self.dbi.cur.execute(query)
            self.dbi.conn.commit()
            query = q1.format(id_str_list=id_str_list, table_name='ready_to_get')
            self.dbi.cur.execute(query)
            self.dbi.conn.commit()
def run(self):
    """Keep ready_to_get bounded in size.

    When the table exceeds 150k rows, find the fans_num of its 50000-th
    smallest entry and delete every row below that threshold; otherwise
    sleep ten minutes.
    """
    while True:
        self.dbi = MySQL_Interface()
        row_count = self.dbi.get_line_num('ready_to_get')
        if row_count > 150 * 1000:
            query = 'select m.fans_num from (' \
                    'select fans_num from ready_to_get ' \
                    'ORDER BY fans_num limit 50000' \
                    ') as m order by fans_num desc limit 1'
            threshold = self.dbi.select_asQuery(query)[0][0]
            query = 'delete from ready_to_get where fans_num<{num}'.format(num=threshold)
            self.dbi.update_asQuery(query)
        else:
            time.sleep(600)
class deal_fetching_user(threading.Thread):
    """Periodically release users whose social-web fetch takes too long."""

    def __init__(self):
        threading.Thread.__init__(self)
        self.dbi = MySQL_Interface()

    def run(self):
        while True:
            self.dbi = MySQL_Interface()
            # is_fetching stamps older than one hour are considered stale.
            stale_before = time.strftime('%Y-%m-%d %H:%M:%S',
                                         time.localtime(time.time() - 3600))
            query = "update ready_to_get set is_fetching=null where is_fetching < \'{time}\' ;".format(time=stale_before)
            self.dbi.update_asQuery(query)
            time.sleep(1)
class deal_cache_user_info(threading.Thread):
    """Flush cache_user_info into the permanent user_info_table."""

    def __init__(self):
        threading.Thread.__init__(self)
        self.dbi = MySQL_Interface()
        self.bf = BloomFilter()

    def run(self):
        while True:
            if self.dbi.is_empty('cache_user_info'):
                time.sleep(2)
                self.dbi = MySQL_Interface()  # refresh the idle connection
                continue
            [res, cache_user_info_col] = self.dbi.select_all('cache_user_info')
            time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(time.time()))
            user_info_table_col = self.dbi.get_col_name('user_info_table')
            # Re-shape every cached row onto user_info_table's columns:
            # matching columns are copied, insert_time gets the current
            # stamp, bookkeeping columns become NULL, the rest blank.
            data = [[line[cache_user_info_col.index(col)] if col in cache_user_info_col
                     else time_stick if col == 'insert_time'
                     else None if col == 'update_time'
                     else None if col == 'latest_blog'
                     else None if col == 'isGettingBlog'
                     else ''
                     for col in user_info_table_col]
                    for line in res]
            uid_list = [line[user_info_table_col.index('uid')] for line in data]
            self.dbi.insert_asList('user_info_table', data, unique=True)
            self.bf.insert_asList(uid_list, 'user_info_table')
            print('insert {num} users into user info table'.format(num=data.__len__()))
            # Remove the processed uids from both staging tables.
            uid_list = [line[cache_user_info_col.index('uid')] for line in res]
            q1 = "delete from {table_name} where uid in ( {id_str_list} ) ;"
            id_str_list = ''
            for i in uid_list:
                id_str_list = id_str_list + '\'' + str(i) + '\'' + ','
            id_str_list = id_str_list[:-1]
            query = q1.format(id_str_list=id_str_list, table_name='cache_user_info')
            self.dbi.cur.execute(query)
            self.dbi.conn.commit()
            query = q1.format(id_str_list=id_str_list, table_name='ready_to_get')
            self.dbi.cur.execute(query)
            self.dbi.conn.commit()
class state_persistance(threading.Thread):
    """Persist proxy-pool health metrics.

    Periodically samples the pool size plus the measured input/output
    speeds reported by the pool and appends one row to MySQL's
    proxy_table.
    """

    def __init__(self, proxy_pool):
        threading.Thread.__init__(self)
        self.proxy_pool = proxy_pool
        self.dbi = MySQL_Interface()

    def run(self):
        while True:
            stamp = time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(time.time()))
            pool_size = self.proxy_pool.size()
            [in_speed, out_speed] = self.proxy_pool.update_proxy_state()
            self.dbi.insert_asList('proxy_table',
                                   [[pool_size, stamp, in_speed, out_speed]],
                                   unique=True)
            time.sleep(server_config.PROXY_MONITOR_GAP)
class control_ready_table(threading.Thread):
    """Trim the ready_to_get queue so it never grows past ~150k rows."""

    def __init__(self):
        threading.Thread.__init__(self)
        self.dbi = MySQL_Interface()

    def run(self):
        while True:
            self.dbi = MySQL_Interface()
            num = self.dbi.get_line_num('ready_to_get')
            if num <= 150 * 1000:
                # Under the cap: nothing to trim for a while.
                time.sleep(600)
                continue
            # fans_num of the 50000-th smallest row = deletion threshold.
            query = 'select m.fans_num from (' \
                    'select fans_num from ready_to_get ' \
                    'ORDER BY fans_num limit 50000' \
                    ') as m order by fans_num desc limit 1'
            res = self.dbi.select_asQuery(query)[0][0]
            query = 'delete from ready_to_get where fans_num<{num}'.format(num=res)
            self.dbi.update_asQuery(query)
def start_selfcheck():
    """Start-up self check.

    Verifies MySQL and MongoDB connectivity and then runs auto_index()
    before the server begins serving.
    """
    print('\n\n********* start to selfcheck *********\n')
    db_iface = MySQL_Interface()
    if db_iface.cur:  # a live cursor means MySQL connected successfully
        print('mysql is connected')
    mongo_client = MongoClient('localhost', 27017)
    print('mongodb is connected')
    mongo_client.close()
    auto_index()
    print('\n********* selfcheck success *********\n')
class state_persistance(threading.Thread):
    """Monitor the proxy pool and log its state to MySQL.

    Each cycle records the current pool size and the input/output speeds
    into proxy_table, then sleeps for the configured monitoring gap.
    """

    def __init__(self, proxy_pool):
        threading.Thread.__init__(self)
        self.proxy_pool = proxy_pool
        self.dbi = MySQL_Interface()

    def run(self):
        while True:
            time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(time.time()))
            current_size = self.proxy_pool.size()
            [input_speed, output_speed] = self.proxy_pool.update_proxy_state()
            row = [current_size, time_stick, input_speed, output_speed]
            self.dbi.insert_asList('proxy_table', [row], unique=True)
            time.sleep(server_config.PROXY_MONITOR_GAP)
class deal_cache_attends(threading.Thread):
    """Drain cache_attends into ready_to_get.

    Rows whose uid is already known to the user_info_table Bloom filter
    are skipped; the rest are buffered and bulk-inserted in batches.
    Every processed row is removed from cache_attends.
    """

    def __init__(self):
        threading.Thread.__init__(self)
        self.dbi = MySQL_Interface()
        self.bf = BloomFilter()

    def run(self):
        pending = []        # rows waiting for a bulk insert into ready_to_get
        pending_uids = []   # uids matching `pending`
        batch_limit = 1000  # flush threshold
        ready_to_get_col = self.dbi.get_col_name('ready_to_get')
        cache_attends_col = self.dbi.get_col_name('cache_attends')
        while True:
            res = self.dbi.select_asQuery('select * from cache_attends limit 5000')
            if len(res) == 0:
                if len(pending) > 0:
                    # Flush the partial batch before going idle.
                    self.dbi.insert_asList('ready_to_get', pending, unique=True)
                    pending = []
                    pending_uids = []
                time.sleep(1)
                self.dbi = MySQL_Interface()  # refresh the connection
                continue
            print('thread cache attends is working')
            for row in res:
                uid = row[cache_attends_col.index('uid')]
                if not self.bf.isContains(uid, 'user_info_table'):  # skip known users
                    pending.append([row[cache_attends_col.index(col)]
                                    if col in cache_attends_col else None
                                    for col in ready_to_get_col])
                    pending_uids.append(uid)
                    if len(pending) > batch_limit:
                        self.dbi.insert_asList('ready_to_get', pending, unique=True)
                        print('insert once')
                        pending = []
                        pending_uids = []
                self.dbi.delete_line('cache_attends', 'uid', uid)

    def isInUserInfo(self, in_uid):
        """Return True when ``in_uid`` already has a row in user_info_table."""
        col_user_info = self.dbi.get_col_name('user_info_table')  # kept: parity with original
        query = 'select * from user_info_table where uid={uid}'.format(uid=in_uid)
        return len(self.dbi.select_asQuery(query)) != 0
class deal_cache_attends(threading.Thread):
    """Worker that promotes cache_attends rows into the ready_to_get queue.

    Uses the Bloom filter to skip uids already stored in user_info_table
    and batches inserts for throughput; processed rows are deleted from
    cache_attends one by one.
    """

    def __init__(self):
        threading.Thread.__init__(self)
        self.dbi = MySQL_Interface()
        self.bf = BloomFilter()

    def run(self):
        buffer_rows = []
        buffer_ids = []    # uids of the buffered rows
        flush_at = 1000    # bulk-insert once the buffer grows past this
        ready_to_get_col = self.dbi.get_col_name('ready_to_get')
        cache_attends_col = self.dbi.get_col_name('cache_attends')
        uid_pos = cache_attends_col.index('uid')
        while True:
            query = 'select * from cache_attends limit 5000'
            res = self.dbi.select_asQuery(query)
            if res.__len__() == 0:
                if buffer_rows.__len__() > 0:
                    self.dbi.insert_asList('ready_to_get', buffer_rows, unique=True)
                    buffer_rows = []
                    buffer_ids = []
                time.sleep(1)
                self.dbi = MySQL_Interface()  # renew the DB connection
                continue
            print('thread cache attends is working')
            for line in res:
                raw_id = line[uid_pos]
                in_user_info = self.bf.isContains(raw_id, 'user_info_table')
                if not in_user_info:
                    row = [line[cache_attends_col.index(col)]
                           if col in cache_attends_col else None
                           for col in ready_to_get_col]
                    buffer_rows.append(row)
                    buffer_ids.append(raw_id)
                    if buffer_rows.__len__() > flush_at:
                        self.dbi.insert_asList('ready_to_get', buffer_rows, unique=True)
                        print('insert once')
                        buffer_rows = []
                        buffer_ids = []
                self.dbi.delete_line('cache_attends', 'uid', raw_id)

    def isInUserInfo(self, in_uid):
        """True if ``in_uid`` is already present in user_info_table."""
        col_user_info = self.dbi.get_col_name('user_info_table')
        query = 'select * from user_info_table where uid={uid}'.format(uid=in_uid)
        res = self.dbi.select_asQuery(query)
        if res.__len__() == 0:
            return False
        else:
            return True
def __init__(self, proxy_pool):
    """Bind the shared proxy pool and open a MySQL connection."""
    threading.Thread.__init__(self)
    self.proxy_pool = proxy_pool
    self.dbi = MySQL_Interface()
def __init__(self):
    """Initialise the thread and open a MySQL connection."""
    threading.Thread.__init__(self)
    self.dbi = MySQL_Interface()
def post(self):
    """Receive a crawled user's basic info and attends list from a client.

    Stores attends rows (fans_num > 1000) into cache_attends, the basic
    info into cache_user_info, and the follow-graph edges into
    cache_atten_web.  Any piece that cannot be stored is pickled to disk
    so it can be replayed later.
    """
    try:
        user_basic_info = self.get_argument('user_basic_info')
        attends = self.get_argument('user_attends')
        # SECURITY NOTE: eval() on client-supplied payloads executes
        # arbitrary code; ast.literal_eval would be the safe equivalent.
        user_basic_info = eval(user_basic_info)
        attends = eval(attends)
        self.write('success to return user info')
        self.finish()
    except Exception:  # was a bare except: same behavior, narrower clause
        self.write('fail to return user info')
        self.finish()
        return
    try:
        dbi = MySQL_Interface()
    except Exception:
        print('unable to connect to MySql DB')
    # --- store attends ----------------------------------------------------
    try:
        if attends.__len__() > 0:
            table_name = 'cache_attends'
            attends_col_info = dbi.get_col_name(table_name)
            keys = attends[0].keys()
            attends = [[line[i] if i in keys else '' for i in attends_col_info]
                       for line in attends]
            fans_col_pos = attends_col_info.index('fans_num')
            # Only keep followed accounts with more than 1000 fans.
            insert_attends = [line for line in attends if line[fans_col_pos] > 1000]
            dbi.insert_asList(table_name, insert_attends, unique=True)
            print('Success : attends of {uid} is stored in {tname}'.format(
                uid=user_basic_info['uid'], tname=table_name))
    except Exception as e:
        print(e)
        path = "temp" + os.sep + "{uid}_attends.pkl".format(uid=user_basic_info['uid'])
        print('unable to store attends of {uid}, it will be stored '.format(
            uid=user_basic_info['uid']))
        FI.save_pickle(attends, path)
    # --- store basic info -------------------------------------------------
    try:
        atten_num_real = user_basic_info['attends_num']  # declared follow count (kept for reference)
        atten_num_get = attends.__len__()
        user_basic_info['accuracy'] = atten_num_get  # number of attends actually fetched
        col_info = dbi.get_col_name('cache_user_info')
        keys = user_basic_info.keys()
        data = [user_basic_info[i] if i in keys else '' for i in col_info]
        dbi.insert_asList('cache_user_info', [data], unique=True)
        print('Success : basic info of {uid} is stored in cache_user_info'.format(
            uid=user_basic_info['uid']))
    except Exception as e:
        print(e)
        path = 'temp' + os.sep + '{uid}_basic_info.pkl'.format(uid=user_basic_info['uid'])
        print('unable to store basic info of {uid} , it will be stored'.format(
            uid=user_basic_info['uid']))
        FI.save_pickle(user_basic_info, path)
    # --- store the follow-graph edges ------------------------------------
    web_rows = None
    try:
        if attends.__len__() > 0:
            from_uid = user_basic_info['uid']
            from_fans_num = user_basic_info['fans_num']
            from_blog_num = user_basic_info['blog_num']
            web_rows = [[from_uid, from_fans_num, from_blog_num,
                         str(x[attends_col_info.index('uid')]),
                         str(x[attends_col_info.index('fans_num')]),
                         str(x[attends_col_info.index('blog_num')])]
                        for x in attends]
            dbi.insert_asList('cache_atten_web', web_rows)
            print('Success : conn web of {uid} is stored in cache_atten_web'.format(
                uid=user_basic_info['uid']))
    except Exception as e:
        print(e)
        path = '{uid}_atten_web.pkl'.format(uid=user_basic_info['uid'])
        print('unable to store atten web of {uid} , it will be stored'.format(
            uid=user_basic_info['uid']))
        # BUG FIX: the original pickled `data`, which at this point is the
        # basic-info row (or undefined -> NameError); persist the edge rows
        # we were building, falling back to the raw attends list.
        FI.save_pickle(web_rows if web_rows is not None else attends, path)
self.bit_size=1<<15 self.seeds=[5,7,11,13,31,37,61] self.r=redis.StrictRedis(host='127.0.0.1',port=6379,db=0) self.hashFunc=[] for i in range(self.seeds.__len__()): self.hashFunc.append(SimpleHash(self.bit_size,self.seeds[i])) def isContains(self,str_input,name): if str_input==None: return False if str_input.__len__()==0: return False ret=True for f in self.hashFunc: loc=f.hash(str_input) ret=ret & self.r.getbit(name,loc) return ret def insert(self,str_input,name): for f in self.hashFunc: loc=f.hash(str_input) self.r.setbit(name,loc,1) dbi=MySQL_Interface(dbname='microblog_spider') r=redis.StrictRedis(host='127.0.0.1',port=6379,db=0) query='select uid from user_info_table ;' uid=dbi.select_asQuery(query) uid=[x[0] for x in uid] bf=BloomFilter() for id in uid: bf.insert(id,'user_info_table')
def __init__(self, proxy_pool):
    """Remember the proxy pool to monitor and connect to MySQL."""
    threading.Thread.__init__(self)
    self.proxy_pool = proxy_pool
    self.dbi = MySQL_Interface()
def __init__(self):
    """Open the MySQL connection and the Bloom filter used for dedup."""
    threading.Thread.__init__(self)
    self.dbi = MySQL_Interface()
    self.bf = BloomFilter()
def __init__(self):
    """Set up the worker: MySQL connection plus dedup Bloom filter."""
    threading.Thread.__init__(self)
    self.dbi = MySQL_Interface()
    self.bf = BloomFilter()
__author__ = 'multiangle'

from DB_Interface import MySQL_Interface
import json
import networkx as nx

# Export the mutual-follow graph of the sampled users as a GEXF file.
dbi = MySQL_Interface()
# create table (select * from user_info_table order by fans_num limit 1000)
[web_info, col_info] = dbi.select_all('temp_table2')

# PERF FIX: the original tested `(atte[1], atte[0]) in web_info` against the
# raw list — O(n) per row, O(n^2) overall.  A set of the same rows gives
# identical membership semantics at O(1) per lookup.
web_info_set = set(web_info)

select_web = []
select_user = {}
for atte in web_info:
    # Keep only edges whose reverse edge also exists (mutual follows).
    if (atte[1], atte[0]) in web_info_set:
        select_web.append(list(atte))
        select_user[atte[1]] = 1
        select_user[atte[0]] = 1
select_user = select_user.keys()

G = nx.Graph()
G.add_nodes_from(select_user)
G.add_edges_from(select_web)
nx.write_gexf(G, 'weibo_node1000.gexf')
def get(self):
    """Hand out a crawl task to the client identified by ``uuid``.

    Task ids: 1 = social web (attends) task, 2/3 = history-blog tasks for
    small/large accounts (split at config.HISTORY_TASK_VALVE), 4/5/100 =
    batched update missions.  The chosen rows are marked as in-progress in
    MySQL; update missions are additionally recorded in MongoDB.
    """
    global proxy
    uuid = str(self.get_argument('uuid'))
    task_id = self.task_assign(uuid)
    if proxy.get_ave_proxy_size() < 30:  # too few proxies to serve any task
        self.write('no task')
        self.finish()
        return
    if task_id == -1:  # unknown client uuid
        self.write('no task')
        self.finish()
        return
    if task_id == 1:  # social-web task: richest un-fetched user first
        dbi = MySQL_Interface()
        query = 'select * from ready_to_get where is_fetching is null order by fans_num desc limit 1;'
        res = dbi.select_asQuery(query)
        if res.__len__() == 0:
            self.write('no task')
            self.finish()
            return
        res = res[0]
        col_info = dbi.get_col_name('ready_to_get')
        uid = res[col_info.index('uid')]
        self.write('{uid},connect'.format(uid=uid))
        self.finish()
        time_stick = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        query = "update ready_to_get set is_fetching=\'{t_time}\' where uid={uid} ;" \
            .format(t_time=time_stick, uid=uid)
        dbi.update_asQuery(query)
    if task_id == 2:  # history task for accounts below the blog-count valve
        dbi = MySQL_Interface()
        query = 'select container_id,blog_num from user_info_table ' \
                'where (isGettingBlog is null and update_time is null and blog_num<{valve} and blog_num>100)' \
                'order by fans_num desc limit 1 ;'.format(valve=config.HISTORY_TASK_VALVE)
        res = dbi.select_asQuery(query)
        if res.__len__() == 0:
            self.write('no task')
            self.finish()
            return
        [container_id, blog_num] = res[0]
        self.write('{c_id};{blog},history'.format(c_id=container_id, blog=blog_num))
        self.finish()
        time_stick = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        query = "update user_info_table set isGettingBlog=\'{t_time}\' where container_id={cid} ;" \
            .format(t_time=time_stick, cid=container_id)
        dbi.update_asQuery(query)
    if task_id == 3:  # history task for large accounts (>= valve)
        dbi = MySQL_Interface()
        query = 'select container_id,blog_num from user_info_table ' \
                'where (isGettingBlog is null and update_time is null and blog_num>={valve} and blog_num>100)' \
                'order by fans_num desc limit 1 ;'.format(valve=config.HISTORY_TASK_VALVE)
        res = dbi.select_asQuery(query)
        # BUG FIX: the original indexed [0] without checking for an empty
        # result, raising IndexError when no user qualified; mirror the
        # guard used by the task_id == 2 branch instead.
        if res.__len__() == 0:
            self.write('no task')
            self.finish()
            return
        [container_id, blog_num] = res[0]
        self.write('{c_id};{blog},history'.format(c_id=container_id, blog=blog_num))
        self.finish()
        time_stick = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        query = "update user_info_table set isGettingBlog=\'{t_time}\' where container_id={cid} ;" \
            .format(t_time=time_stick, cid=container_id)
        dbi.update_asQuery(query)
    if task_id == 4 or task_id == 5 or task_id == 100:  # batched update missions
        dbi = MySQL_Interface()
        # Users not updated within the last day are due for a refresh.
        target_time_stick = time.strftime(
            '%Y-%m-%d %H:%M:%S',
            time.localtime(time.time() - 60 * 60 * 24 * 1))
        if task_id == 4:
            batch_size = 100
        elif task_id == 5:
            batch_size = 200
        else:
            batch_size = 10
        query = 'select container_id,update_time,latest_blog from user_info_table ' \
                'where update_time<\'{target_time}\' and isGettingBlog is null and blog_num>10 order by fans_num desc limit {batch}' \
                .format(target_time=target_time_stick, batch=batch_size)
        print(query)
        res = dbi.select_asQuery(query)
        # Attach the unix timestamps the client needs.
        res = [[line[0],
                int(time.mktime(line[1].timetuple())),
                int(time.mktime(line[2].timetuple()))] for line in res]
        res_cp = res
        if res_cp.__len__() == 0:
            print('*** warning: no avaliable update mission ***')
            self.write('no task')
            self.finish()
            return
        # Order format: ContainerId-UpdateTime-LatestBlog;...;mission_id,update
        res = [line[0] + '-' + str(line[1]) + '-' + str(line[2]) for line in res]
        inn = ';'.join(res)
        mission_id = random_str(15)
        commend = '{list};{task_id},update'.format(list=inn, task_id=mission_id)
        self.write(commend)
        self.finish()
        # Record mission membership and start time in MongoDB so expired
        # missions can be rolled back later.
        u_list = [dict(container_id=x[0], update_time=x[1], latest_blog=x[2])
                  for x in res_cp]
        data_toMongo = dict(mission_id=mission_id, user_list=u_list,
                            mission_start=int(time.time()))
        client = MongoClient('localhost', 27017)
        db = client['microblog_spider']
        collec = db.update_mission
        collec.insert(data_toMongo)
        # Flag the chosen users as being fetched in MySQL.
        user_list_str = ','.join('\'{cid}\''.format(cid=line[0]) for line in res_cp)
        time_stick = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        query = 'update user_info_table set isGettingBlog=\'{time}\' where container_id in ({ulist})' \
            .format(time=time_stick, ulist=user_list_str)
        dbi.update_asQuery(query)
__author__ = 'multiangle' import networkx as nx import matplotlib.pyplot as plt import numpy as np from DB_Interface import MySQL_Interface dbi = MySQL_Interface() [select_user, select_user_col] = dbi.select_all('select_user') user_list = [line[select_user_col.index('name')] for line in select_user] user_id = [line[select_user_col.index('uid')] for line in select_user] [atten_web, atten_web_col] = dbi.select_all('select_atten') atten_list = [[ line[atten_web_col.index('from_uid')], line[atten_web_col.index('to_uid')] ] for line in atten_web] # temp_atten_list=[] # for line in atten_list: # try: # temp=[user_list[user_id.index(line[0])],user_list[user_id.index(line[1])]] # temp_atten_list.append(temp) # except: # pass # atten_list=temp_atten_list print(atten_list.__len__()) sig_list = [line[0] + line[1] for line in atten_list] select_atten_list = [] for line in atten_list: temp_sig_a = line[0] + line[1] temp_sig_b = line[1] + line[0]
def run(self):
    """Worker loop: claim one pending row from cache_history, assemble that
    user's blog pieces out of MongoDB's assemble_factory, persist the result,
    update bookkeeping in MySQL, then clean up the cache row.

    Runs forever; sleeps 1s when there is no pending work.
    """
    while True:
        start_time = time.time()
        dbi = MySQL_Interface()
        col_info = dbi.get_col_name('cache_history')
        # oldest not-yet-claimed cache_history row, if any
        query = 'select * from cache_history where is_dealing is null order by checkin_timestamp limit 1'
        mysql_res = dbi.select_asQuery(query)
        if mysql_res.__len__() == 0:
            # cache_history is empty: sleep 1 second and skip this iteration
            time.sleep(1)
            continue
        mysql_res = mysql_res[0]
        # todo for delete-----
        print('debug->start to deal with a new task')
        print('debug->mysql_res: ')
        print(mysql_res)
        #------------------------
        container_id = mysql_res[col_info.index('container_id')]
        print('debug->container_id: {cid}'.format(cid=container_id))
        latest_time = mysql_res[col_info.index('latest_time')]
        latest_timestamp = mysql_res[col_info.index('latest_timestamp')]
        # claim the row by stamping is_dealing with the current time
        time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
        query = 'update cache_history set is_dealing=\'{time}\' where container_id={cid}'.format(time=time_stick,
                                                                                                 cid = container_id)
        # todo for delete-----
        print('debug->query1 : {q}'.format(q=query))
        # ------------------------
        dbi.update_asQuery(query)
        client = MongoClient('localhost', 27017)
        db = client['microblog_spider']
        assemble_table = db.assemble_factory
        # gather the piece ids already delivered plus the expected piece count
        res = assemble_table.find({'container_id': container_id},
                                  {'current_id': 1, 'total_num': 1})
        id_list = [x['current_id'] for x in res]
        num = int([x['total_num'] for x in
                   assemble_table.find({'container_id': container_id}).limit(1)][0])
        ## todo for delete-----
        print('debug->id_list_len: {len}'.format(len=id_list.__len__()))
        print('debug->num: {n}'.format(n=num))
        # ------------------------
        # check whether all packages have arrived
        check_state = True
        if id_list.__len__() < num:
            print('server->HistoryReport:The package is not complete, retry to catch data')
            check_state = False
        if check_state:
            # All sub-packages collected: move the data into the formal MongoDB
            # store, delete it from the assembly area, and in MySQL update
            # update_time / latest_blog and clear isGettingBlog.
            # First fetch this user's row from MySQL.
            try:
                query = 'select * from user_info_table where container_id=\'{cid}\'' \
                    .format(cid=container_id)
                user_info = dbi.select_asQuery(query)[0]
                # todo fro debug-------------
                print('task {cid} :debug->query2: {q}'.format(q=query, cid=container_id))
                print('task {cid} debug->user_info:'.format(cid = container_id))
                print(user_info)
                # --------------------------------
                col_name = dbi.get_col_name('user_info_table')
            except Exception as e:
                # NOTE(review): user_info/col_name stay unbound after this,
                # so the accuracy computation below would raise — confirm intent.
                print('task {cid} :Error:server-HistoryReturn:'
                      'No such user in MySQL.user_info_table,Reason:'.format(cid = container_id))
                print(e)
            # pull the data pieces out of the assemble factory
            try:
                data_list = assemble_table.find({'container_id':container_id},
                                                {'data': 1 , 'current_id': 1})
                data_list_ori = [x for x in data_list]
                data_list = [x['data'] for x in data_list_ori]
                id_list = [x['current_id'] for x in data_list_ori]
                data_list_ori = None
                # todo fro debug-------------
                print('task {cid} debug->datalist: {len}'.format(len = data_list.__len__(), cid=container_id))
                # --------------------------------
            except Exception as e:
                print('Error:server-HistoryReturn:'
                      'Unable to get data from MongoDB, assemble factory,Reason:')
                print(e)
            # more pieces than expected means duplicates: de-duplicate by id
            if id_list.__len__() > num :
                unique_data_list = []
                check_dict = {}
                for i in range(id_list.__len__()) :
                    try:
                        # dict-membership used as a hash-set for de-duplication
                        check_dict[str(id_list[i])]
                        continue
                    except:
                        check_dict[str(id_list[i])] = True
                        unique_data_list.append(data_list[i])
                # print('data_list.len :{len}'.format(len=data_list.__len__()))
                # print('id_list.len :{len}'.format(len=id_list.__len__()))
                # print(i)
                data_list = unique_data_list
            # concatenate the pieces into one flat list
            try:
                data_final = []
                for i in data_list:
                    data_final = data_final+i
                # todo fro debug-------------
                print('task {cid} debug->数据拼接完毕,len {len}'.format(len=data_final.__len__(), cid=container_id))
                # --------------------------------
            except Exception as e:
                print('Error:server-HistoryReport:'
                      'Unable to contact the pieces of information,Reason:')
                print(e)
            # record fetch accuracy (got / wanted) into accuracy_table
            blog_len = data_final.__len__()
            wanted_blog_len = user_info[col_name.index('blog_num')]
            blog_accuracy = blog_len/wanted_blog_len
            time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(time.time()))
            query = 'insert into accuracy_table values ({acc},\'{t_s}\',{num}) ;' \
                .format(acc=blog_accuracy, t_s=time_stick, num=wanted_blog_len)
            dbi.insert_asQuery(query)
            # store into MongoDB, update MySQL, then clear assemble data below
            try:
                if not user_info[col_name.index('update_time')]:
                    # first-time user: persist blogs into the formal collections
                    save_data_seperately(data_final)
                    print('task {cid} Success: Data has saved in Mongodb, size is {size}'
                          .format(size=sys.getsizeof(data_final), cid=container_id))
                    # record key columns back into MySQL
                    query = 'update user_info_table set ' \
                            'update_time=\'{up_time}\',' \
                            'latest_blog=\'{latest_blog}\',' \
                            'isGettingBlog=null ' \
                            'where container_id=\'{cid}\';'\
                        .format(up_time=time_stick, latest_blog=latest_time, cid=container_id)
                    # TODO (original note): a variant of this query without the
                    # isGettingBlog reset was used for statistics; the reset must
                    # be in place for production runs.
                    dbi.update_asQuery(query)
                    print('task {cid} Success: insert user into MongoDB, the num of data is {len}'
                          .format(len=blog_len, cid=container_id))
                else:
                    # already-known user: only release the isGettingBlog lock
                    query = 'update user_info_table set isGettingBlog=null where container_id=\'{cid}\'' \
                        .format(cid=container_id)
                    dbi.update_asQuery(query)
            except Exception as e:
                print('task {cid} Error:server->HistoryReport:'
                      'Reason:'.format(cid=container_id))
                print(e)
        else:
            # packages incomplete: release the isGettingBlog lock and let the
            # assembly-area data be dropped below so the task can be retried
            print('task {cid} :Error: the package is not complete ,{a} of {b}'
                  .format(a=id_list.__len__(), b=num, cid=container_id))
            query = 'update user_info_table set isGettingBlog=null where container_id=\'{cid}\'' \
                .format(cid=container_id)
            dbi.update_asQuery(query)
        # remove this user's pieces from the assemble factory
        assemble_table.remove({'container_id':container_id})
        print('task {cid} Success: Data has been removed from assemble factory'
              .format(cid=container_id))
        # drop the cache_history row: this transaction is finished
        query = 'delete from cache_history where container_id=\'{cid}\'' \
            .format(cid=container_id)
        dbi.update_asQuery(query)
        end_time = time.time()
        deal_time = end_time - start_time
        print('task {cid} :Success : the user {cid} is completed, length is {len}, use {t} seconds'
              .format(cid = container_id, len = data_final.__len__(), t = deal_time))
self.seeds = [5, 7, 11, 13, 31, 37, 61] self.r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0) self.hashFunc = [] for i in range(self.seeds.__len__()): self.hashFunc.append(SimpleHash(self.bit_size, self.seeds[i])) def isContains(self, str_input, name): if str_input == None: return False if str_input.__len__() == 0: return False ret = True for f in self.hashFunc: loc = f.hash(str_input) ret = ret & self.r.getbit(name, loc) return ret def insert(self, str_input, name): for f in self.hashFunc: loc = f.hash(str_input) self.r.setbit(name, loc, 1) dbi = MySQL_Interface(dbname='microblog_spider') r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0) query = 'select uid from user_info_table ;' uid = dbi.select_asQuery(query) uid = [x[0] for x in uid] bf = BloomFilter() for id in uid: bf.insert(id, 'user_info_table')
__author__ = 'multiangle'

from DB_Interface import MySQL_Interface
import json
import networkx as nx

# Export an undirected graph of *mutual* follows to GEXF.
# temp_table2 was created as: select * from user_info_table order by fans_num limit 1000
dbi = MySQL_Interface()
[web_info, col_info] = dbi.select_all('temp_table2')

# Rows are assumed to be (from_uid, to_uid) pairs — TODO confirm schema.
# Pre-building a set gives O(1) reverse-edge membership tests instead of the
# original O(n) list scan per row (O(n^2) overall for ~10^3+ edges).
edge_set = {tuple(atte) for atte in web_info}

select_web = []        # edges whose reverse edge also exists
select_user = {}       # dict used as an ordered set of involved user ids
for atte in web_info:
    if (atte[1], atte[0]) in edge_set:
        select_web.append(list(atte))
        select_user[atte[1]] = 1
        select_user[atte[0]] = 1
select_user = select_user.keys()

G = nx.Graph()
G.add_nodes_from(select_user)
G.add_edges_from(select_web)   # Graph() ignores the duplicate reversed edges
nx.write_gexf(G, 'weibo_node1000.gexf')
def run(self):
    """Worker loop for completed 'update' missions.

    Polls MongoDB's update_mission collection for missions that clients have
    reported but no dealer has claimed, reassembles the delivered data pieces
    from assemble_factory, bulk-writes them into latest_history and the
    per-month collections, then updates bookkeeping in MySQL and clears the
    mission. Runs forever; sleeps 1s when idle.
    """
    client = MongoClient('localhost', 27017)
    while True:
        db = client['microblog_spider']
        mission_mongo = db.update_mission
        # missions that need handling but nobody is handling yet
        res = mission_mongo.find({
            'isReported': {
                '$ne': None
            },
            'isDealing': None
        }).limit(1)
        res = [x for x in res]
        # nothing pending: sleep one second and poll again
        if res.__len__() == 0:
            time.sleep(1)
            continue
        # extract the mission to handle
        task = res[0]
        task.pop('_id')
        mission_id = task['mission_id']
        user_content = task['user_list']
        # claim the mission by stamping isDealing with the current time
        mission_mongo.update({'mission_id': mission_id},
                             {'$set': {
                                 'isDealing': int(time.time())
                             }})
        print('Update Mission :{mi} set isDealing as {t}'.format(
            mi=mission_id, t=int(time.time())))
        # fetch delivered piece ids and the expected total piece count
        assemble_table = db.assemble_factory
        res = assemble_table.find({'container_id': mission_id}, {
            'current_id': 1,
            'total_num': 1
        })
        id_list = [x['current_id'] for x in res]
        check_state = True
        try:
            num = int([
                x['total_num'] for x in assemble_table.find({
                    'container_id': mission_id
                }).limit(1)
            ][0])
        except:
            print(
                'deal_update_mission :{mi} can not get num info from mongo'
                .format(mi=mission_id))
            num = 100000000   # sentinel: forces the incomplete path below
            check_state = False
        # check whether all packages have arrived
        if id_list.__len__() < num:
            print(
                'Update Mission :{mi} The package is not complete, retry to catch data'
                .format(mi=mission_id))
            check_state = False
        if check_state:
            # All sub-packages arrived: push the data into the formal MongoDB
            # monthly collections and the recent-history collection.
            # Pull the data pieces out of the assemble factory.
            try:
                data_list = assemble_table.find(
                    {'container_id': mission_id}, {
                        'data': 1,
                        'current_id': 1
                    })
                data_list_ori = [x for x in data_list]
                data_list = [x['data'] for x in data_list_ori]
                id_list = [x['current_id'] for x in data_list_ori]
                data_list_ori = None
                print(
                    'Update Mission :{mi} success->datalist: {len}'.format(
                        len=data_list.__len__(), mi=mission_id))
            except Exception as e:
                print(
                    'Update Mission :{mi} Error:server_database-deal_update_mission:'
                    'Unable to get data from MongoDB, assemble factory,Reason:'
                    .format(mi=mission_id))
                print(e)
            # more pieces than expected means duplicates: de-duplicate by id
            if id_list.__len__() > num:
                unique_data_list = []
                check_dict = {}
                for i in range(id_list.__len__()):
                    try:
                        # dict-membership used as a hash-set for de-duplication
                        check_dict[str(id_list[i])]
                        continue
                    except:
                        check_dict[str(id_list[i])] = True
                        unique_data_list.append(data_list[i])
                data_list = unique_data_list
            # concatenate the pieces into one flat list
            try:
                data_final = []
                for i in data_list:
                    data_final = data_final + i
                print('Update Mission :{mi} success->数据拼接完毕,len {len}'.
                      format(len=data_final.__len__(), mi=mission_id))
            except Exception as e:
                print(
                    'Update Mission :{mi} Error:server-HistoryReport:'
                    'Unable to contact the pieces of information,Reason:'.
                    format(mi=mission_id))
                print(e)
            # build a quoted id list for later MySQL IN (...) clauses
            user_list = [x['container_id'] for x in user_content]
            user_list_str = ''
            for item in user_list:
                user_list_str += '\'' + str(item) + '\','
            user_list_str = user_list_str[:-1]

            def temp_add_trace(line):
                # Attach the current repost/attitude/comment counts under a
                # timestamped status_trace key and wrap the blog as an upsert.
                # NOTE: mutates `line` in place.
                msg_id = line['id']
                current_status = dict(
                    comments_count=line['comments_count'],
                    attitudes_count=line['attitudes_count'],
                    reposts_count=line['reposts_count'])
                t = int(time.time())
                t_str = str(t)
                line['status_trace.{date}'.format(
                    date=t_str)] = current_status
                update_item = UpdateMany({'id': msg_id}, {'$set': line},
                                         upsert=True)
                return update_item

            requests = [temp_add_trace(x) for x in data_final]
            latest_mongo = db.latest_history
            latest_mongo.bulk_write(requests)
            print(
                'Update Mission :{mi} Success: server_database:UpdateMany列表生成,'
                '写入latest_history表成功,{len}'.format(len=requests.__len__(),
                                                 mi=mission_id))
            # bucket the write requests into per-month collections
            # (table name derived from each blog's created_at prefix)
            table_list = []
            request_updateMonth = []
            for i in range(data_final.__len__()):
                temp_time = data_final[i]['created_at']
                temp_table_name = 'user_{year}_{month}'.format(
                    year=temp_time[0:4], month=temp_time[5:7])
                if temp_table_name in table_list:
                    request_updateMonth[table_list.index(
                        temp_table_name)].append(requests[i])
                else:
                    table_list.append(temp_table_name)
                    request_updateMonth.append([requests[i]])
            print('the number of ori table is {len}'.format(
                len=request_updateMonth.__len__()))
            print(table_list)
            # keep only the 5 most recent month tables (names sort by date)
            selected_num = 5
            if table_list.__len__() > selected_num:
                packed = [[table_list[i], request_updateMonth[i]]
                          for i in range(table_list.__len__())]
                packed = sorted(packed, key=lambda x: x[0], reverse=True)
                packed = packed[:selected_num]
                table_list = [x[0] for x in packed]
                request_updateMonth = [x[1] for x in packed]
            print('the number of dealed table is {len}'.format(
                len=request_updateMonth.__len__()))
            print(table_list)
            if request_updateMonth.__len__() >= 3:
                print('{a}-{b}-{c}'.format(
                    a=request_updateMonth[0].__len__(),
                    b=request_updateMonth[1].__len__(),
                    c=request_updateMonth[2].__len__()))
            for i in range(table_list.__len__()):
                # NOTE(review): eval() of a constructed name; table_list values
                # are derived from blog dates, so injection risk is low but
                # getattr(db, name) would be safer.
                collection = eval('db.{name}'.format(name=table_list[i]))
                # todo for debug----------------------------
                print('table {x} is started'.format(x=table_list[i]))
                #---------------------------------------------------
                if request_updateMonth[i].__len__() > 0:
                    try:
                        collection.bulk_write(request_updateMonth[i])
                    except Exception as e:
                        print(
                            'Update Mission :{mi} fail to update table {t}'
                            .format(mi=mission_id, t=table_list[i]))
            print(
                'Update Mission :{mi} Success:server_database:所获的数组已经写入按月分类聚合中'
                .format(mi=mission_id))
            # clean up MySQL: refresh update_time and latest_blog columns
            time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(time.time()))
            # find each user's most recent blog timestamp in this batch
            latest_list = [0] * user_list.__len__()
            for line in data_final:
                this_timestick = int(line['created_timestamp'])
                this_container = '100505' + str(line['user_id'])
                try:
                    index = user_list.index(this_container)
                    if latest_list[index] < this_timestick:
                        latest_list[index] = this_timestick
                except:
                    print('error:server_database->deal_update_mission:'
                          'container {id} is not in user_list'.format(
                              id=this_container))
            # freeze the per-user latest timestamps into a SQL CASE expression
            case_list = ''
            updated_user_list = ''
            for i in range(latest_list.__len__()):
                if latest_list[i] > user_content[i]['latest_blog']:
                    time_stick_inner = time.strftime(
                        '%Y-%m-%d %H:%M:%S', time.localtime(latest_list[i]))
                    case_list += ' when \'{cid}\' then \'{tstick}\' '.format(
                        cid=user_list[i],
                        tstick=time_stick_inner)
                    updated_user_list += '\'{cid}\','.format(
                        cid=user_list[i])
            updated_user_list = updated_user_list[:-1]
            # build and run the MySQL update statements
            query1='update user_info_table set update_time=\'{time}\' where container_id in ( {user_list} ) ;'\
                .format(time=time_stick,user_list=user_list_str)
            query2='update user_info_table set latest_blog= case container_id {case_list} end where container_id in ( {ulist2} ) ;'\
                .format(case_list=case_list,ulist2=updated_user_list)
            dbi = MySQL_Interface()
            dbi.update_asQuery(query2)
            dbi.update_asQuery(query1)
            print(
                'Update Mission :{mi} Success:server_database: UpdateTime和LatestBlog选项已更新'
                .format(mi=mission_id))
            if user_list_str.__len__() > 0:
                query='update user_info_table set isGettingBlog=null where container_id in ({user_list});' \
                    .format(user_list=user_list_str)
                dbi.update_asQuery(query)
                print(
                    'Update Mission :{mi} Success:erver_database: isGettingBlog选项已清除'
                    .format(mi=mission_id))
        else:
            # NOTE(review): user_list_str is only assigned inside the
            # check_state branch above — this path raises NameError when the
            # mission is incomplete. Confirm and fix upstream.
            if user_list_str.__len__() > 0:
                query='update user_info_table set isGettingBlog=null where container_id in ({user_list});'\
                    .format(user_list=user_list_str)
                dbi = MySQL_Interface()
                dbi.update_asQuery(query)
        # drop this mission's pieces from assemble_factory
        assemble_table.remove({'container_id': mission_id})
        print(
            'Update Mission :{mi} Success:server_database: assemble_factory in Mongo is cleared'
            .format(mi=mission_id))
        # drop the mission itself from the mission list
        mission_mongo.remove({'mission_id': mission_id})
        print(
            'Update Mission :{mi} Success:server_database: this mission is cleared'
            .format(mi=mission_id))
def run(self):
    """Dealer loop for reported 'update' missions (compact variant).

    Claims one reported-but-unclaimed mission from update_mission, rebuilds
    the batch data from assemble_factory, bulk-upserts into latest_history
    and the monthly user_YYYY_MM collections, refreshes update_time /
    latest_blog / isGettingBlog in MySQL, then removes the mission.
    Runs forever; sleeps 1s when there is nothing to do.
    """
    client=MongoClient('localhost',27017)
    while True:
        db=client['microblog_spider']
        mission_mongo=db.update_mission
        # missions that need handling but are not being handled yet
        res=mission_mongo.find({'isReported':{'$ne':None},'isDealing':None}).limit(1)
        res=[x for x in res]
        # nothing pending: sleep one second and poll again
        if res.__len__()==0:
            time.sleep(1)
            continue
        # extract the mission to handle
        task=res[0]
        task.pop('_id')
        mission_id=task['mission_id']
        user_content=task['user_list']
        # claim the mission: stamp isDealing with the current epoch time
        mission_mongo.update({'mission_id':mission_id},{'$set':{'isDealing':int(time.time())}})
        print('Update Mission :{mi} set isDealing as {t}'.format(mi=mission_id,t=int(time.time())))
        # fetch delivered piece ids and expected total piece count
        assemble_table=db.assemble_factory
        res=assemble_table.find({'container_id':mission_id},{'current_id':1,'total_num':1})
        id_list=[x['current_id'] for x in res]
        check_state=True
        try:
            num=int([x['total_num'] for x in
                     assemble_table.find({'container_id':mission_id}).limit(1)][0])
        except:
            print('deal_update_mission :{mi} can not get num info from mongo'
                  .format(mi=mission_id))
            num = 100000000   # sentinel: forces the incomplete path below
            check_state = False
        # check whether all packages have arrived
        if id_list.__len__()<num:
            print('Update Mission :{mi} The package is not complete, retry to catch data'
                  .format(mi=mission_id))
            check_state=False
        if check_state:
            # All sub-packages arrived: move the batch into the formal
            # MongoDB collections. First pull the pieces from assemble_factory.
            try:
                data_list = assemble_table.find({'container_id':mission_id},
                                                {'data': 1 , 'current_id': 1})
                data_list_ori = [x for x in data_list]
                data_list = [x['data'] for x in data_list_ori]
                id_list = [x['current_id'] for x in data_list_ori]
                data_list_ori = None
                print('Update Mission :{mi} success->datalist: {len}'.format(len=data_list.__len__(),mi=mission_id))
            except Exception as e:
                print('Update Mission :{mi} Error:server_database-deal_update_mission:'
                      'Unable to get data from MongoDB, assemble factory,Reason:'.format(mi=mission_id))
                print(e)
            # more pieces than expected means duplicates: de-duplicate by id
            if id_list.__len__() > num :
                unique_data_list = []
                check_dict = {}
                for i in range(id_list.__len__()) :
                    try:
                        # dict-membership used as a hash-set for de-duplication
                        check_dict[str(id_list[i])]
                        continue
                    except:
                        check_dict[str(id_list[i])] = True
                        unique_data_list.append(data_list[i])
                data_list = unique_data_list
            # concatenate the pieces into one flat blog list
            try:
                data_final=[]
                for i in data_list:
                    data_final=data_final+i
                print('Update Mission :{mi} success->数据拼接完毕,len {len}'
                      .format(len=data_final.__len__(),mi=mission_id))
            except Exception as e:
                print('Update Mission :{mi} Error:server-HistoryReport:'
                      'Unable to contact the pieces of information,Reason:'.format(mi=mission_id))
                print(e)
            # quoted id list reused by the MySQL IN (...) clauses below
            user_list=[x['container_id'] for x in user_content]
            user_list_str=''
            for item in user_list:
                user_list_str+='\''+str(item)+'\','
            user_list_str=user_list_str[:-1]

            def temp_add_trace(line):
                # Record current repost/attitude/comment counts under a
                # timestamped status_trace key; wrap as an upsert UpdateMany.
                # NOTE: mutates `line` in place.
                msg_id=line['id']
                current_status=dict(
                    comments_count=line['comments_count'],
                    attitudes_count=line['attitudes_count'],
                    reposts_count=line['reposts_count']
                )
                t=int(time.time())
                t_str=str(t)
                line['status_trace.{date}'.format(date=t_str)]=current_status
                update_item=UpdateMany({'id':msg_id},{'$set':line},upsert=True)
                return update_item

            requests=[temp_add_trace(x) for x in data_final]
            latest_mongo=db.latest_history
            latest_mongo.bulk_write(requests)
            print('Update Mission :{mi} Success: server_database:UpdateMany列表生成,'
                  '写入latest_history表成功,{len}'.format(len=requests.__len__(),mi=mission_id))
            # bucket the upserts into per-month collections named from
            # each blog's created_at prefix (user_YYYY_MM)
            table_list=[]
            request_updateMonth=[]
            for i in range(data_final.__len__()):
                temp_time=data_final[i]['created_at']
                temp_table_name='user_{year}_{month}'.format(year=temp_time[0:4],month=temp_time[5:7])
                if temp_table_name in table_list:
                    request_updateMonth[table_list.index(temp_table_name)].append(requests[i])
                else:
                    table_list.append(temp_table_name)
                    request_updateMonth.append([requests[i]])
            print('the number of ori table is {len}'.format(len=request_updateMonth.__len__()))
            print(table_list)
            # keep only the 5 most recent month tables (names sort by date)
            selected_num = 5
            if table_list.__len__()>selected_num:
                packed = [[table_list[i],request_updateMonth[i]] for i in range(table_list.__len__())]
                packed = sorted(packed, key=lambda x:x[0], reverse=True)
                packed = packed[:selected_num]
                table_list = [x[0] for x in packed]
                request_updateMonth = [x[1] for x in packed]
            print('the number of dealed table is {len}'.format(len=request_updateMonth.__len__()))
            print(table_list)
            if request_updateMonth.__len__()>=3:
                print('{a}-{b}-{c}'.format(
                    a=request_updateMonth[0].__len__(),
                    b=request_updateMonth[1].__len__(),
                    c=request_updateMonth[2].__len__()
                ))
            for i in range(table_list.__len__()):
                # NOTE(review): eval() of a constructed attribute name;
                # getattr(db, name) would be safer.
                collection=eval('db.{name}'.format(name=table_list[i]))
                # todo for debug----------------------------
                print('table {x} is started'.format(x=table_list[i]))
                #---------------------------------------------------
                if request_updateMonth[i].__len__()>0:
                    try:
                        collection.bulk_write(request_updateMonth[i])
                    except Exception as e:
                        print('Update Mission :{mi} fail to update table {t}'
                              .format(mi=mission_id,t=table_list[i]))
            print('Update Mission :{mi} Success:server_database:所获的数组已经写入按月分类聚合中'
                  .format(mi=mission_id))
            # refresh update_time and latest_blog in MySQL
            time_stick=time.strftime('%Y-%m-%d %H:%M:%S',
                                     time.localtime(time.time()))
            # find each user's most recent blog timestamp in this batch
            latest_list=[0]*user_list.__len__()
            for line in data_final:
                this_timestick=int(line['created_timestamp'])
                this_container='100505'+str(line['user_id'])
                try:
                    index=user_list.index(this_container)
                    if latest_list[index]<this_timestick:
                        latest_list[index]=this_timestick
                except:
                    print('error:server_database->deal_update_mission:'
                          'container {id} is not in user_list'.format(id=this_container))
            # freeze the per-user latest timestamps into a SQL CASE expression
            case_list=''
            updated_user_list=''
            for i in range(latest_list.__len__()):
                if latest_list[i]>user_content[i]['latest_blog'] :
                    time_stick_inner=time.strftime('%Y-%m-%d %H:%M:%S',
                                                   time.localtime(latest_list[i]))
                    case_list+=' when \'{cid}\' then \'{tstick}\' '.format(cid=user_list[i],tstick=time_stick_inner)
                    updated_user_list+='\'{cid}\','.format(cid=user_list[i])
            updated_user_list=updated_user_list[:-1]
            # build and run the MySQL update statements
            query1='update user_info_table set update_time=\'{time}\' where container_id in ( {user_list} ) ;'\
                .format(time=time_stick,user_list=user_list_str)
            query2='update user_info_table set latest_blog= case container_id {case_list} end where container_id in ( {ulist2} ) ;'\
                .format(case_list=case_list,ulist2=updated_user_list)
            dbi=MySQL_Interface()
            dbi.update_asQuery(query2)
            dbi.update_asQuery(query1)
            print('Update Mission :{mi} Success:server_database: UpdateTime和LatestBlog选项已更新'
                  .format(mi=mission_id))
            if user_list_str.__len__()>0:
                query='update user_info_table set isGettingBlog=null where container_id in ({user_list});' \
                    .format(user_list=user_list_str)
                dbi.update_asQuery(query)
                print('Update Mission :{mi} Success:erver_database: isGettingBlog选项已清除'.format(mi=mission_id))
        else:
            # NOTE(review): user_list_str is only assigned inside the
            # check_state branch above — this path raises NameError when the
            # mission is incomplete. Confirm and fix upstream.
            if user_list_str.__len__()>0:
                query='update user_info_table set isGettingBlog=null where container_id in ({user_list});'\
                    .format(user_list=user_list_str)
                dbi=MySQL_Interface()
                dbi.update_asQuery(query)
        # drop this mission's pieces from assemble_factory
        assemble_table.remove({'container_id':mission_id})
        print('Update Mission :{mi} Success:server_database: assemble_factory in Mongo is cleared'
              .format(mi=mission_id))
        # drop the mission itself from the mission list
        mission_mongo.remove({'mission_id':mission_id})
        print('Update Mission :{mi} Success:server_database: this mission is cleared'
              .format(mi=mission_id))
def test2():
    """Smoke test: fetch everything from the ready_to_get table and print the rows."""
    from DB_Interface import MySQL_Interface
    database = MySQL_Interface()
    rows, _columns = database.select_all("ready_to_get")
    print(rows)
def get(self):
    """Hand a crawling task to the requesting client, chosen by its uuid.

    task_id meanings (from task_assign): -1 invalid client; 1 social-graph
    fetch; 2 history fetch for small accounts (< HISTORY_TASK_VALVE blogs);
    3 history fetch for large accounts; 4/5/100 batched 'update' missions of
    differing batch sizes. Writes the task order to the client and marks the
    chosen rows as in-progress in MySQL (and, for update missions, MongoDB).
    """
    global proxy
    uuid=str(self.get_argument('uuid'))
    task_id=self.task_assign(uuid)
    if proxy.get_ave_proxy_size()<30:
        # proxy pool too small to support crawling right now
        self.write('no task')
        self.finish()
        return
    if task_id==-1:
        # check if this uuid is valid
        self.write('no task')
        self.finish()
        return
    if task_id==1:
        # get the social web of a certain user
        dbi=MySQL_Interface()
        query='select * from ready_to_get where is_fetching is null order by fans_num desc limit 1;'
        res=dbi.select_asQuery(query)
        if res.__len__()==0:
            self.write('no task')
            self.finish()
            return
        res=res[0]
        col_info=dbi.get_col_name('ready_to_get')
        uid=res[col_info.index('uid')]
        self.write('{uid},connect'.format(uid=uid))
        self.finish()
        # mark the row as being fetched
        time_stick=time.strftime('%Y-%m-%d %H:%M:%S',
                                 time.localtime(time.time()))
        query="update ready_to_get set is_fetching=\'{t_time}\' where uid={uid} ;"\
            .format(t_time=time_stick,uid=uid)
        dbi.update_asQuery(query)
    if task_id==2:
        # get the history microblog of a certain (small-volume) user
        dbi=MySQL_Interface()
        query='select container_id,blog_num from user_info_table ' \
              'where (isGettingBlog is null and update_time is null and blog_num<{valve} and blog_num>100)' \
              'order by fans_num desc limit 1 ;'.format(valve=config.HISTORY_TASK_VALVE)
        # query='select container_id,blog_num from user_info_table ' \
        #       'order by rand() limit 1 ;'
        res=dbi.select_asQuery(query)
        if res.__len__()==0:
            self.write('no task')
            self.finish()
            return
        [container_id,blog_num]=res[0]
        self.write('{c_id};{blog},history'
                   .format(c_id=container_id,blog=blog_num))
        self.finish()
        # mark the user as having its blog fetched
        time_stick=time.strftime('%Y-%m-%d %H:%M:%S',
                                 time.localtime(time.time()))
        query="update user_info_table set isGettingBlog=\'{t_time}\' where container_id={cid} ;"\
            .format(t_time=time_stick,cid=container_id)
        dbi.update_asQuery(query)
    if task_id==3:
        # get the history microblog of a certain (large-volume) user
        dbi=MySQL_Interface()
        query='select container_id,blog_num from user_info_table ' \
              'where (isGettingBlog is null and update_time is null and blog_num>={valve} and blog_num>100)' \
              'order by fans_num desc limit 1 ;'.format(valve=config.HISTORY_TASK_VALVE)
        # query='select container_id,blog_num from user_info_table ' \
        #       'order by rand() limit 1 ;'
        # NOTE(review): unlike task 1/2 there is no empty-result guard here;
        # [0] raises IndexError when no row qualifies.
        [container_id,blog_num]=dbi.select_asQuery(query)[0]
        self.write('{c_id};{blog},history'
                   .format(c_id=container_id,blog=blog_num))
        self.finish()
        time_stick=time.strftime('%Y-%m-%d %H:%M:%S',
                                 time.localtime(time.time()))
        query="update user_info_table set isGettingBlog=\'{t_time}\' where container_id={cid} ;" \
            .format(t_time=time_stick,cid=container_id)
        dbi.update_asQuery(query)
    if task_id==4 or task_id==5 or task_id==100:
        # batched 'update' mission — this part is in test
        dbi=MySQL_Interface()
        current_time_stick=time.strftime('%Y-%m-%d %H:%M:%S',
                                         time.localtime(time.time()))
        # users not refreshed within the last day (original note said
        # "5 days earlier" but the window here is 1 day)
        target_time_stick=time.strftime('%Y-%m-%d %H:%M:%S',
                                        time.localtime(time.time()-60*60*24*1))
        if task_id==4:
            batch_size = 100
        elif task_id==5:
            batch_size = 200
        else:
            batch_size = 10
        query='select container_id,update_time,latest_blog from user_info_table ' \
              'where update_time<\'{target_time}\' and isGettingBlog is null and blog_num>10 order by fans_num desc limit {batch}' \
              .format(target_time=target_time_stick,batch=batch_size)
        print(query)
        res=dbi.select_asQuery(query)
        # attach unix timestamps to the user rows before shipping to the client
        res=[[line[0],int(time.mktime(line[1].timetuple())),int(time.mktime(line[2].timetuple()))] for line in res]
        res_cp=res
        if res_cp.__len__()==0:
            # if no task ,then return "no task"
            print('*** warning: no avaliable update mission ***')
            self.write('no task')
            self.finish()
            return
        # print('debug from task handler')
        # pprint(res_cp)
        res=[line[0]+'-'+str(line[1])+'-'+str(line[2]) for line in res]
        inn=''
        for item in res:
            inn+=item+';'
        inn=inn[0:-1]
        # uid-stamp;uid-timestamp;...;,update (the formation of order)
        mission_id=random_str(15)
        commend='{list};{task_id},update'.format(list=inn,task_id=mission_id)
        # order format sent to the client:
        # ContainerId-UpdateTime-LatestBlog;...;...;...,update
        self.write(commend)
        self.finish()
        # persist user list, mission id and start time into mongodb
        u_list=[dict(container_id=x[0],update_time=x[1],latest_blog=x[2]) for x in res_cp]
        data_toMongo=dict(
            mission_id = mission_id,
            user_list = u_list,
            mission_start= int(time.time())
        )
        client=MongoClient('localhost',27017)
        db=client['microblog_spider']
        collec=db.update_mission
        collec.insert(data_toMongo)
        # mark the selected users as isGettingBlog in mysql
        user_list_str=''
        for line in res_cp:
            user_list_str+='\'{cid}\','.format(cid=line[0])
        user_list_str=user_list_str[:-1]
        time_stick=time.strftime('%Y-%m-%d %H:%M:%S',
                                 time.localtime(time.time()))
        query='update user_info_table set isGettingBlog=\'{time}\' where container_id in ({ulist})'\
            .format(time=time_stick,ulist=user_list_str)
        dbi.update_asQuery(query)
def __init__(self):
    """Initialise the worker thread and open its own MySQL interface."""
    super().__init__()
    self.dbi = MySQL_Interface()
def post(self):
    """Receive a user's crawled blog history from a client and persist it.

    Reads the history payload and timing arguments from the request, acks the
    client, stores first-time users' blogs via save_data_inMongo, updates the
    bookkeeping columns in MySQL.user_info_table, and records fetch accuracy.
    All failures are logged to stdout; nothing is raised to the caller.
    """
    # get the payload from the client
    try:
        # NOTE(review): eval() on client-supplied data is unsafe;
        # prefer json.loads if the clients can send JSON.
        user_history=self.get_argument('user_history')
        latest_time=self.get_argument('latest_time')
        latest_timestamp=self.get_argument('latest_timestamp')
        container_id=self.get_argument('container_id')
        isDivided=self.get_argument('isDivided')
        user_history=eval(user_history)
        if isDivided==1 or isDivided=='1' :
            block_num=self.get_argument('block_num')
            current_block=self.get_argument('current_block')
        self.write('success to return user history')
        self.finish()
        print('Success: to get data from web')
    except Exception as e:
        self.write('fail to return user history')
        self.finish()
        print('Error:server-HistoryReturn:'
              'Unable to get value from http package,Reason:')
        print(e)
        return
    # connect to MySQL
    try:
        dbi=MySQL_Interface()
    except:
        print('Error:server-HistoryReturn:'
              'Unable to connect to MySQL')
    # fetch this user's row from MySQL
    try:
        query='select * from user_info_table where container_id=\'{cid}\''\
            .format(cid=container_id)
        user_info=dbi.select_asQuery(query)[0]
        col_name=dbi.get_col_name('user_info_table')
    except Exception as e:
        print('Error:server-HistoryReturn:'
              'No such user in MySQL.user_info_table,Reason:')
        print(e)
    # store the data in Mongodb, then update MySQL bookkeeping
    try:
        blog_len=user_history.__len__()
        wanted_blog_len=user_info[col_name.index('blog_num')]
        blog_accuracy=blog_len/wanted_blog_len
        time_stick=time.strftime('%Y-%m-%d %H:%M:%S',
                                 time.localtime(time.time()))
        if not user_info[col_name.index('update_time')]:
            # first-time user: persist the blogs
            save_data_inMongo(user_history)
            # BUG FIX: the concatenated fragments previously joined as
            # "...latest_blog='X'where..." (no space before `where`),
            # producing invalid SQL; a space is now included.
            query='update user_info_table set ' \
                  'update_time=\'{up_time}\',' \
                  'latest_blog=\'{latest_blog}\' ' \
                  'where container_id=\'{cid}\';' \
                .format(up_time=time_stick,latest_blog=latest_time,cid=container_id)
            dbi.update_asQuery(query)
        else:
            # already-known user: only release the isGettingBlog lock
            query='update user_info_table set isGettingBlog=null where container_id=\'{cid}\''\
                .format(cid=container_id)
            dbi.update_asQuery(query)
        # record fetch accuracy for later analysis
        query='insert into accuracy_table values ({acc},\'{t_s}\') ;'\
            .format(acc=blog_accuracy,t_s=time_stick)
        dbi.insert_asQuery(query)
        print('Success: insert user into MongoDB, the num of data is {len}'
              .format(len=blog_len))
    except Exception as e:
        print('Error:server-HistoryReturn:'
              'Unable to update data in MySQL.user_info_tabe,Reason:')
        print(e)
def post(self):
    """Receive a user's basic info and attention (following) list from a
    client and cache both into MySQL; on any storage failure the payload is
    pickled to disk so it can be recovered later.
    """
    try:
        # NOTE(review): eval() on request data is unsafe — confirm clients
        # are trusted / switch to json.loads if possible.
        user_basic_info=self.get_argument('user_basic_info')
        attends=self.get_argument('user_attends')
        user_basic_info=eval(user_basic_info)
        attends=eval(attends)
        self.write('success to return user info')
        self.finish()
    except:
        self.write('fail to return user info')
        self.finish()
        return
    try:
        dbi=MySQL_Interface()
    except:
        print('unable to connect to MySql DB')
    try:
        if attends.__len__()>0:
            #store attends info
            table_name='cache_attends'
            attends_col_info=dbi.get_col_name(table_name)
            keys=attends[0].keys()
            # reorder each attend dict into cache_attends column order,
            # blank-filling columns the dict does not provide
            attends= [[line[i] if i in keys else '' for i in attends_col_info] for line in attends]
            fans_col_pos=attends_col_info.index('fans_num')
            insert_attends=[]
            for line in attends:
                # keep only accounts with more than 1000 fans
                # (assumes fans_num is numeric here — TODO confirm)
                if line[fans_col_pos]>1000:
                    insert_attends.append(line)
            dbi.insert_asList(table_name,insert_attends,unique=True)
            print('Success : attends of {uid} is stored in {tname}'
                  .format(uid=user_basic_info['uid'],tname=table_name))
        else:
            pass
    except Exception as e:
        print(e)
        path="temp\\{uid}_attends.pkl".format(uid=user_basic_info['uid'])
        print('unable to store attends of {uid}, it will be stored '
              .format(uid=user_basic_info['uid']))
        FI.save_pickle(attends,path)
    try:
        atten_num_real=user_basic_info['attends_num']
        atten_num_get=attends.__len__()
        user_basic_info['accuracy']=atten_num_get  # number of attends actually fetched
        col_info=dbi.get_col_name('cache_user_info')
        # store user basic info, blank-filling missing columns
        keys=user_basic_info.keys()
        data=[user_basic_info[i] if i in keys else '' for i in col_info]
        dbi.insert_asList('cache_user_info',[data],unique=True)
        print('Success : basic info of {uid} is stored in cache_user_info'
              .format(uid=user_basic_info['uid']))
    except Exception as e:
        print(e)
        path='temp\\{uid}_basic_info.pkl'.format(uid=user_basic_info['uid'])
        print('unable to store basic info of {uid} , it will be stored'
              .format(uid=user_basic_info['uid']))
        FI.save_pickle(user_basic_info,path)
    try:
        if attends.__len__()>0:
            # store the attention-connection edges (follower -> followee)
            # together with both sides' fan/blog counts
            from_uid=user_basic_info['uid']
            from_fans_num=user_basic_info['fans_num']
            from_blog_num=user_basic_info['blog_num']
            data=[[from_uid,from_fans_num,from_blog_num,str(x[attends_col_info.index('uid')]),str(x[attends_col_info.index('fans_num')]),str(x[attends_col_info.index('blog_num')])]for x in attends]
            dbi.insert_asList('cache_atten_web',data)
            print('Success : conn web of {uid} is stored in cache_atten_web'
                  .format(uid=user_basic_info['uid']))
        else:
            pass
    except Exception as e:
        print(e)
        path='{uid}_atten_web.pkl'.format(uid=user_basic_info['uid'])
        print('unable to store atten web of {uid} , it will be stored'
              .format(uid=user_basic_info['uid']))
        # NOTE(review): `data` is unbound here when the try failed before the
        # comprehension (or attends was empty) — this pickle call can raise.
        FI.save_pickle(data,path)
def run(self):
    """Worker loop: assemble finished history-crawl packages.

    Repeatedly takes the oldest undealt row from MySQL ``cache_history``,
    collects the matching data fragments from the MongoDB
    ``assemble_factory`` collection, and -- if all fragments arrived --
    deduplicates, concatenates and stores them, then updates
    ``user_info_table`` and clears the cache row.  Incomplete packages are
    discarded and the user is re-queued by clearing ``isGettingBlog``.
    """
    while True:
        start_time = time.time()
        dbi = MySQL_Interface()
        col_info = dbi.get_col_name('cache_history')
        # oldest pending task first (is_dealing is null => not yet claimed)
        query = 'select * from cache_history where is_dealing is null order by checkin_timestamp limit 1'
        mysql_res = dbi.select_asQuery(query)
        if mysql_res.__len__() == 0:
            # cache_history is empty: sleep 1 second and skip this round
            time.sleep(1)
            continue
        mysql_res = mysql_res[0]
        # todo for delete-----
        print('debug->start to deal with a new task')
        print('debug->mysql_res: ')
        print(mysql_res)
        #------------------------
        container_id = mysql_res[col_info.index('container_id')]
        print('debug->container_id: {cid}'.format(cid=container_id))
        latest_time = mysql_res[col_info.index('latest_time')]
        latest_timestamp = mysql_res[col_info.index('latest_timestamp')]
        # claim the task by stamping is_dealing with the current time
        time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
        query = 'update cache_history set is_dealing=\'{time}\' where container_id={cid}'.format(
            time=time_stick, cid=container_id)
        # todo for delete-----
        print('debug->query1 : {q}'.format(q=query))
        # ------------------------
        dbi.update_asQuery(query)
        client = MongoClient('localhost', 27017)
        db = client['microblog_spider']
        assemble_table = db.assemble_factory
        # fragment ids that already arrived for this container
        res = assemble_table.find({'container_id': container_id}, {
            'current_id': 1,
            'total_num': 1
        })
        id_list = [x['current_id'] for x in res]
        # expected number of fragments (total_num is the same on every
        # fragment, so reading it from any one document is enough)
        num = int([
            x['total_num'] for x in assemble_table.find({
                'container_id': container_id
            }).limit(1)
        ][0])
        ## todo for delete-----
        print('debug->id_list_len: {len}'.format(len=id_list.__len__()))
        print('debug->num: {n}'.format(n=num))
        # ------------------------
        # check whether every fragment of the package has arrived
        check_state = True
        if id_list.__len__() < num:
            print(
                'server->HistoryReport:The package is not complete, retry to catch data'
            )
            check_state = False
        if check_state:
            # All sub-packages collected: move the data into the formal
            # MongoDB store, drop the assembly fragments, and update
            # update_time / latest_blog while clearing isGettingBlog in MySQL.
            # First fetch this user's row from MySQL.
            try:
                query = 'select * from user_info_table where container_id=\'{cid}\'' \
                    .format(cid=container_id)
                user_info = dbi.select_asQuery(query)[0]
                # todo fro debug-------------
                print('task {cid} :debug->query2: {q}'.format(
                    q=query, cid=container_id))
                print('task {cid} debug->user_info:'.format(
                    cid=container_id))
                print(user_info)
                # --------------------------------
                col_name = dbi.get_col_name('user_info_table')
            except Exception as e:
                # NOTE(review): execution continues after this, so
                # user_info / col_name may be unbound further down -- the
                # later try/except blocks absorb the resulting NameError.
                print(
                    'task {cid} :Error:server-HistoryReturn:'
                    'No such user in MySQL.user_info_table,Reason:'.format(
                        cid=container_id))
                print(e)
            # pull the data fragments out of the assemble factory
            try:
                data_list = assemble_table.find(
                    {'container_id': container_id}, {
                        'data': 1,
                        'current_id': 1
                    })
                data_list_ori = [x for x in data_list]
                data_list = [x['data'] for x in data_list_ori]
                id_list = [x['current_id'] for x in data_list_ori]
                data_list_ori = None  # release the raw documents early
                # todo fro debug-------------
                print('task {cid} debug->datalist: {len}'.format(
                    len=data_list.__len__(), cid=container_id))
                # --------------------------------
            except Exception as e:
                print(
                    'Error:server-HistoryReturn:'
                    'Unable to get data from MongoDB, assemble factory,Reason:'
                )
                print(e)
            # More fragments than expected means duplicates: deduplicate,
            # keeping the first occurrence of each current_id.
            if id_list.__len__() > num:
                unique_data_list = []
                check_dict = {}
                for i in range(id_list.__len__()):
                    try:
                        # dict used as a seen-set (hash lookup)
                        check_dict[str(id_list[i])]
                        continue
                    except:
                        check_dict[str(id_list[i])] = True
                        unique_data_list.append(data_list[i])
                    # print('data_list.len :{len}'.format(len=data_list.__len__()))
                    # print('id_list.len :{len}'.format(len=id_list.__len__()))
                    # print(i)
                data_list = unique_data_list
            # concatenate the fragments into one flat list
            try:
                data_final = []
                for i in data_list:
                    data_final = data_final + i
                # todo fro debug-------------
                print('task {cid} debug->数据拼接完毕,len {len}'.format(
                    len=data_final.__len__(), cid=container_id))
                # --------------------------------
            except Exception as e:
                print(
                    'Error:server-HistoryReport:'
                    'Unable to contact the pieces of information,Reason:')
                print(e)
            # record fetch accuracy in accuracy_table for later analysis
            # NOTE(review): divides by blog_num -- a user row with
            # blog_num == 0 would raise ZeroDivisionError here; confirm
            # upstream guarantees blog_num > 0.
            blog_len = data_final.__len__()
            wanted_blog_len = user_info[col_name.index('blog_num')]
            blog_accuracy = blog_len / wanted_blog_len
            time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(time.time()))
            query = 'insert into accuracy_table values ({acc},\'{t_s}\',{num}) ;' \
                .format(acc=blog_accuracy, t_s=time_stick, num=wanted_blog_len)
            dbi.insert_asQuery(query)
            # store into MongoDB, update MySQL, then (below) clear assemble data
            try:
                if not user_info[col_name.index('update_time')]:
                    # first-time completion: save into the formal MongoDB collection
                    save_data_seperately(data_final)
                    print(
                        'task {cid} Success: Data has saved in Mongodb, size is {size}'
                        .format(size=sys.getsizeof(data_final),
                                cid=container_id))
                    # record completion info in MySQL
                    query = 'update user_info_table set ' \
                            'update_time=\'{up_time}\',' \
                            'latest_blog=\'{latest_blog}\',' \
                            'isGettingBlog=null ' \
                            'where container_id=\'{cid}\';'\
                        .format(up_time=time_stick,latest_blog=latest_time,cid=container_id)
                    # query='update user_info_table set ' \
                    #       'update_time=\'{up_time}\',' \
                    #       'latest_blog=\'{latest_blog}\'' \
                    #       'where container_id=\'{cid}\';' \
                    #     .format(up_time=time_stick,latest_blog=latest_time,cid=container_id)
                    # TODO: the variant above skipped clearing isGettingBlog
                    # to simplify statistics; the active query must clear it
                    # in production.
                    dbi.update_asQuery(query)
                    print(
                        'task {cid} Success: insert user into MongoDB, the num of data is {len}'
                        .format(len=blog_len, cid=container_id))
                else:
                    # user was already updated before: just release the lock
                    query='update user_info_table set isGettingBlog=null where container_id=\'{cid}\'' \
                        .format(cid=container_id)
                    dbi.update_asQuery(query)
            except Exception as e:
                print('task {cid} Error:server->HistoryReport:'
                      'Reason:'.format(cid=container_id))
                print(e)
        else:
            # package incomplete: clear isGettingBlog so the user is
            # re-queued, and drop the partial fragments (below)
            print(
                'task {cid} :Error: the package is not complete ,{a} of {b}'
                .format(a=id_list.__len__(), b=num, cid=container_id))
            query='update user_info_table set isGettingBlog=null where container_id=\'{cid}\'' \
                .format(cid=container_id)
            dbi.update_asQuery(query)
        # remove this container's fragments from the assemble factory
        assemble_table.remove({'container_id': container_id})
        print(
            'task {cid} Success: Data has been removed from assemble factory'
            .format(cid=container_id))
        # delete the cache_history row: this task is fully handled
        query='delete from cache_history where container_id=\'{cid}\'' \
            .format(cid=container_id)
        dbi.update_asQuery(query)
        end_time = time.time()
        deal_time = end_time - start_time
        # NOTE(review): data_final is only bound on the complete-package
        # path; on the incomplete path this print raises NameError (as in
        # the original code).
        print(
            'task {cid} :Success : the user {cid} is completed, length is {len}, use {t} seconds'
            .format(cid=container_id, len=data_final.__len__(), t=deal_time))
def get(self):
    """Hand out a crawl task to the client identified by ``uuid``.

    The task type comes from ``self.task_assign(uuid)``:
      * -1 -> uuid unknown/invalid, answer 'no task'
      *  1 -> crawl the social (attend) web of one user
      *  2 -> crawl the blog history of one user
      *  3 -> (experimental) refresh users whose data is older than a day

    Also answers 'no task' while the proxy pool is too small to crawl.
    """
    global proxy
    uuid = str(self.get_argument('uuid'))
    task_id = self.task_assign(uuid)
    if proxy.get_ave_proxy_size() < 30:
        # proxy pool too small -- refuse to hand out work
        self.write('no task')
        self.finish()
        return
    if task_id == -1:
        # uuid not recognised
        self.write('no task')
        self.finish()
        return
    if task_id == 1:
        # hand out the not-yet-fetched user with the most fans
        dbi = MySQL_Interface()
        query = 'select * from ready_to_get where is_fetching is null order by fans_num desc limit 1;'
        res = dbi.select_asQuery(query)
        if len(res) == 0:
            self.write('no task')
            self.finish()
            return
        res = res[0]
        col_info = dbi.get_col_name('ready_to_get')
        uid = res[col_info.index('uid')]
        self.write('{uid},connect'.format(uid=uid))
        self.finish()
        # mark the row so no other client receives the same user
        time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
        query = "update ready_to_get set is_fetching=\'{t_time}\' where uid={uid} ;"\
            .format(t_time=time_stick, uid=uid)
        dbi.update_asQuery(query)
    if task_id == 2:
        # hand out the highest-fans user whose history is not being fetched
        dbi = MySQL_Interface()
        query = 'select container_id,blog_num from user_info_table ' \
                'where (isGettingBlog is null and update_time is null) ' \
                'order by fans_num desc limit 1 ;'
        res = dbi.select_asQuery(query)
        if len(res) == 0:
            # FIX: original indexed res[0] unconditionally and raised
            # IndexError when no candidate user existed (task 1 already
            # guarded this case).
            self.write('no task')
            self.finish()
            return
        [container_id, blog_num] = res[0]
        self.write('{c_id};{blog},history'
                   .format(c_id=container_id, blog=blog_num))
        self.finish()
        # lock the user by stamping isGettingBlog with the current time
        time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
        query = "update user_info_table set isGettingBlog=\'{t_time}\' where container_id={cid} ;"\
            .format(t_time=time_stick, cid=container_id)
        dbi.update_asQuery(query)
    if task_id == 3:
        # experimental: batch-refresh users whose data is over a day old
        dbi = MySQL_Interface()
        current_time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                           time.localtime(time.time()))
        target_time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                          time.localtime(time.time() - 60 * 60 * 24))  # one day earlier
        query = 'select uid,update_time from user_info_table ' \
                'where update_time<\'{target_time}\' and isGettingBlog is null limit {batch}'\
            .format(target_time=target_time_stick, batch=10)
        res = dbi.select_asQuery(query)
        # convert each row's datetime to a unix timestamp
        res = [[line[0], int(time.mktime(line[1].timetuple()))] for line in res]
        res = [line[0] + '-' + str(line[1]) for line in res]
        # command format: uid-timestamp;uid-timestamp;...;,update
        inn = ';'.join(res)
        commend = '{list},update'.format(list=inn)
        self.write(commend)
        self.finish()
__author__ = 'multiangle'

import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

from DB_Interface import MySQL_Interface

dbi = MySQL_Interface()

# load the selected users (names + uids) from MySQL
[select_user, select_user_col] = dbi.select_all('select_user')
user_list = [line[select_user_col.index('name')] for line in select_user]
user_id = [line[select_user_col.index('uid')] for line in select_user]

# load the directed attention edges (from_uid -> to_uid)
[atten_web, atten_web_col] = dbi.select_all('select_atten')
atten_list = [[line[atten_web_col.index('from_uid')],
               line[atten_web_col.index('to_uid')]]
              for line in atten_web]

print(atten_list.__len__())

# Keep only edges whose reverse edge also exists (mutual attention).
# FIX: the original probed membership in a *list* (O(n) per probe, O(n^2)
# total) and also tested the edge's own signature, which is in the list by
# construction and therefore always true.  A set makes each probe O(1) and
# only the reverse signature needs checking; the selected edges are the same.
# NOTE(review): signatures are built by plain string concatenation
# (uid_a + uid_b), which can collide for different uid pairs -- kept as-is
# to preserve behaviour, but a tuple key would be safer.
sig_set = {line[0] + line[1] for line in atten_list}
select_atten_list = [line for line in atten_list
                     if line[1] + line[0] in sig_set]