class Multi_Song_Info(Cloud_Music):
    ## Cannot use multiple inheritance here: the get_response() name would clash.
    def __init__(self, song_ids=518725853, refer=2147483647):
        ## 2147483647 is the maximum 32-bit int, i.e. 2^31 - 1
        super(Multi_Song_Info, self).__init__(ids=song_ids, category='url')
        self.refer = refer
        self.mysql = Cloud_Music_MySQL()

    def get_info(self, ):
        s = Song(ids=self.ids)
        info = s.get_song_info()
        self.song = info['song']
        self.singer = info['singer']
        logging.debug(u'id=%s song is:%s,singer is:%s' % (self.ids, self.song, self.singer))

    def get_url(self, ):
        response = self.get_response().json()
        self.url = response['data'][0]['url']
        logging.debug(u'the url of song id=%s is:%s' % (self.ids, self.url))

    def insert_song(self, ):
        t1 = time.time()
        self.get_info()
        # self.get_url()  ## the song url expires quickly, so it is no longer fetched
        self.url = ''
        try:
            self.mysql.insert_table_song(song=self.song, singer=self.singer, url=self.url,
                                         ids=self.ids, refer=self.refer)
            ## with try/except in place, every thread is guaranteed to finish
        except:
            traceback.print_exc()
        finally:
            self.mysql.close_connect()
            t2 = time.time()
            logging.info(u'try to close mysql connect to avoid too many connections')
            print u'thread finished! total cost %s seconds, closing database connection' % (t2 - t1)
            logging.debug(u'finish insert_song threading,total cost time is:%s' % (t2 - t1))
            ## The connection must be closed, otherwise threads cannot finish promptly and too many
            ## database connections pile up. With a connection pool close_connect is effectively a
            ## no-op, so "mysql server has gone away" does not occur.
        return t2 - t1
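## A minimal usage sketch for Multi_Song_Info, assuming a concurrent.futures ThreadPoolExecutor is
## available as elsewhere in this project; the pool size and the shape of song_dicts are
## illustrative, not taken from the original code.
from concurrent.futures import ThreadPoolExecutor

def example_insert_songs(song_dicts):
    ## song_dicts is expected to look like [{'ids': 518725853, 'refer': 2190625773}, ...]
    example_pool = ThreadPoolExecutor(30)
    futures = []
    for d in song_dicts:
        msi = Multi_Song_Info(song_ids=d['ids'], refer=d['refer'])
        futures.append(example_pool.submit(msi.insert_song))  ## each task opens and closes its own mysql connection
    for f in futures:
        logging.debug(u'insert_song cost %s seconds' % f.result())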
    def __init__(self, ):
        self.max_connect = 11
        self.run_connect = Queue()
        for i in range(self.max_connect):
            self.run_connect.put(Cloud_Music_MySQL())
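## A hedged sketch of how the Queue-based pool above could hand out connections: a caller takes one
## of the max_connect Cloud_Music_MySQL objects out of run_connect, uses it, and puts it back.
## borrow_connect/return_connect are illustrative helper names, not taken from the original code.
def borrow_connect(pool):
    ## blocks until one of the pooled connections is free
    return pool.run_connect.get()

def return_connect(pool, conn):
    pool.run_connect.put(conn)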
def put_song_comment_queue():
    mysql = Cloud_Music_MySQL()
    song_comment_queue = manager.get_song_comment_queue()  ## fetched in the outer scope so the while loop below can see it

    def put_queue():
        data = mysql.check_table_song()
        mysql.close_connect()
        for i in data[1]:
            ids = i['ids']
            song_comment_queue.put(ids)
            logging.info(u'put %s into song_comment_queue' % ids)

    put_queue()
    while not song_comment_queue.empty():
        print u'[%s] current queue size:%s' % (time.asctime(), song_comment_queue.qsize())
        logging.info(
            u'check whether queue is empty after 10 second,current qsize is:%s' % (song_comment_queue.qsize()))
        time.sleep(10)
    logging.info(u'Queue empty , generate new queue')
def put_queue_model(query, queue, queue_name):
    mysql = Cloud_Music_MySQL()
    data = getattr(mysql, query)(limit=10000)
    mysql.close_connect()
    for d in data[1]:
        queue.put(d)
        logging.info(u'put %s into %s' % (d, queue_name))
    while not queue.empty():
        print u'[%s] current %s size:%s' % (time.asctime(), queue_name, queue.qsize())
        logging.info(u'check whether %s is empty after 10 second,current qsize is:%s' % (queue_name, queue.qsize()))
        time.sleep(10)
    logging.info(u'Queue empty , generate new queue')
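## Example call for put_queue_model, assuming Cloud_Music_MySQL exposes a check_table_song(limit=...)
## query method as used elsewhere in this module; the queue name string is only a label for the logs.
def example_fill_song_comment_queue():
    queue = manager.get_song_comment_queue()
    put_queue_model(query='check_table_song', queue=queue, queue_name='song_comment_queue')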
def put_song_info_queue_by_model():
    mysql = Cloud_Music_MySQL()
    logging.info(u'updating the status column of playlist')
    mysql.auto_update_playlist_status()
    logging.info(u'fetching playlist rows whose status is below 75')
    data = mysql.check_table_playlist_status()
    mysql.close_connect()
    queue = manager.get_song_info_queue()
    queue_name = 'song_info_queue'

    def parse():
        for i in data[1]:
            song_ids = i['song_ids'].split(',')
            refer = i['ids']
            for ids in song_ids:
                d = dict(ids=ids, refer=refer)
                yield d

    put_queue_model_special(parse=parse, queue=queue, queue_name=queue_name)
def put_song_comment_queue_by_model():
    data = Cloud_Music_MySQL().check_table_song()
    queue = manager.get_song_comment_queue()

    def parse_data():
        for i in data[1]:
            ids = i['ids']
            yield ids

    ## parse_data already yields ready-made ids, so use the *_special variant that accepts a parse
    ## generator (put_queue_model above expects a mysql query method name instead)
    put_queue_model_special(parse=parse_data, queue=queue, queue_name='song_comment_queue')
    def __init__(self, ):
        super(Discover_Playlist, self).__init__()
        self.params = dict(
            order='hot',
            cat='全部',
            limit=35,
            offset=35,
        )
        self.url = 'http://music.163.com/discover/playlist'
        self.soup = self.get_soup()
        self.mysql = Cloud_Music_MySQL()
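## A sketch of paging through the discover listing by stepping the offset in increments of limit;
## max_offset and the idea that get_soup() re-reads self.params on each call are assumptions, and
## the parsing/insert step is left as a placeholder.
def crawl_discover_pages(dp, max_offset=1295):
    ## dp is a Discover_Playlist instance built by the __init__ above
    offset = 0
    while offset <= max_offset:
        dp.params['offset'] = offset
        dp.soup = dp.get_soup()
        ## ...parse dp.soup here and write playlist rows via dp.mysql...
        offset += dp.params['limit']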
    def __init__(self, ):
        # self.mysql=Cloud_Music_MySQL()  ## sharing one mysql connection across all objects causes insert errors
        self.playlist_queue = Queue()
        self.table_playlist_info_queue = Queue()
        self.table_playlist_comment_queue = Queue()
        self.song_info_queue = Queue()
        self.song_comment_queue = Queue()
        self.mysql = Cloud_Music_MySQL()
        self.playlist_thread_pool = ThreadPoolExecutor(100)  ## pool dedicated to playlist comment tasks, because comment tasks contain wait timeouts
        self.song_thread_pool = ThreadPoolExecutor(100)  ## pool dedicated to song update tasks
        self.thread_pool = ThreadPoolExecutor(100)  ## pool for every other task
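## A hedged sketch of how the queues above could be exposed to other processes via
## multiprocessing.managers.BaseManager, which would explain calls like
## manager.get_song_info_queue() elsewhere in this module; the Queue_Manager name and the
## address/authkey values are assumptions, not taken from the original code.
from multiprocessing.managers import BaseManager

class Queue_Manager(BaseManager):
    pass

def serve_queues(holder):
    ## holder is an instance of the class whose __init__ is shown above
    Queue_Manager.register('get_song_info_queue', callable=lambda: holder.song_info_queue)
    Queue_Manager.register('get_song_comment_queue', callable=lambda: holder.song_comment_queue)
    m = Queue_Manager(address=('127.0.0.1', 50000), authkey='cloud_music')
    m.get_server().serve_forever()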
def put_song_info_queue():
    mysql = Cloud_Music_MySQL()
    song_info_queue = manager.get_song_info_queue()  ## fetched in the outer scope so the while loop below can see it

    def put_queue():
        # logging.info(u'update playlist set status=1 once 85% or more of the songs in a playlist have been crawled')
        # mysql.auto_update_playlist_status()
        logging.info(u'query playlist rows whose status is below 75')
        data = mysql.check_table_playlist_status()
        mysql.close_connect()
        for i in data[1]:
            refer = i['ids']
            song_ids = i['song_ids'].split(',')
            for ids in song_ids:
                d = dict(ids=ids, refer=refer)
                song_info_queue.put(d)
                logging.info(u'put dict into song_info_queue %s' % d)

    put_queue()
    while not song_info_queue.empty():
        print u'[%s] current queue size:%s' % (time.asctime(), song_info_queue.qsize())
        logging.info(
            u'check whether queue is empty after 10 second,current qsize is:%s' % (song_info_queue.qsize()))
        time.sleep(10)
    logging.info(u'Queue empty , generate new queue')
def model_run(queue, func):
    ## takes two arguments: queue is a queue handed out by the manager, func is the thread task to
    ## submit and is called with one argument
    '''main loop'''
    avg_time = model_test(queue=queue, func=func)  ## average time one thread takes
    logging.info(u'from function test get avg_time=%s' % avg_time)
    print u'from function test get avg_time=%s' % avg_time
    count = 0
    start_time = time.time()
    multiple = g_multiple
    while True:
        try:
            logging.info(u'max wait 100 second try to get element from queue')
            d = queue.get(timeout=100)
            logging.info(u'get %s from queue' % d)
            # msi=Multi_Song_Info(song_ids=d['ids'],refer=d['refer'])
            # pool.submit(msi.insert_song)
            pool.submit(func, d)
            logging.info(
                u'%s start new threading, get song ids=%s,threading name is:%s,pid is:%s' % (
                    sys._getframe().f_code.co_name, d['ids'], threading.current_thread().name, os.getpid()))
            count += 1
            if count >= max_pool * multiple:
                sleeptime = multiple * avg_time
                print u'generate %s threading,so sleep %s second! current active threading num=%s' % (
                    count, sleeptime, threading.active_count())
                time.sleep(sleeptime)
                count = 0
            end_time = time.time()
            if end_time - start_time >= 600:
                ## every 10 minutes check that the number of database connections stays roughly
                ## between 30 and 150, to keep the program stable
                mysql = Cloud_Music_MySQL()
                Threads_connected = mysql.show_Threads_connected()
                mysql.close_connect()
                if Threads_connected <= 30:
                    ## the program is under-utilised: raise multiple a bit, or lower avg_time
                    avg_time = model_test(queue=queue, func=func)
                    multiple = multiple + 1
                    info = u'program too idle,current Threads_connected=%s,reset avg_time=%s,multiple=%s' % (
                        Threads_connected, avg_time, multiple)
                    print info
                    logging.info(info)
                elif Threads_connected >= 150:
                    ## the program is overloaded: lower multiple a bit, or raise avg_time
                    avg_time = model_test(queue=queue, func=func)
                    multiple = max(multiple - 1, 2)  ## multiple never drops below 2
                    info = u'program overloaded,current Threads_connected=%s,reset avg_time=%s,multiple=%s' % (
                        Threads_connected, avg_time, multiple)
                    print info
                    logging.info(info)
                else:
                    info = u'program running well,current Threads_connected=%s,keep avg_time=%s,multiple=%s' % (
                        Threads_connected, avg_time, multiple)
                    print info
                    logging.info(info)
                start_time = time.time()  ## reset start_time
        except Exception, e:
            if str(e):
                e = str(e)
            else:
                ## Queue.Empty raises with an empty message, so str(e) is ''
                e = 'queue empty'
            logging.warn(
                u' function %s raise error cause by %s,traceback info is:%s ' % (
                    sys._getframe().f_code.co_name, e, traceback.format_exc()))
            print u'error info is:%s' % e
            if 'many connections' in e:
                ## a JoinableQueue would be better; after the 600-second health check above,
                ## "too many connections" is rarely raised any more
                print u'current too many connections,sleep 3 second wait runing connections close'
                # song_info_queue.put(d)
                queue.put(d)
                print u'catch too many connections error ,so put d=%s back into queue' % d
                logging.info(u'catch too many connections error ,so put d=%s back into queue' % d)
                ## the exception comes from the database operation, so d is still available; putting
                ## it back into the queue is what makes a JoinableQueue unnecessary
                mysql = Cloud_Music_MySQL()
                Threads_connected = mysql.show_Threads_connected()
                while Threads_connected >= 100:
                    info = u'current Threads_connected is:%s,also too much,so sleep 3 second!' % Threads_connected
                    print info
                    logging.debug(info)
                    time.sleep(3)
                    Threads_connected = mysql.show_Threads_connected()
                mysql.close_connect()
                continue
            elif 'empty' in e:
                print u'empty queue,break loop!'
                print u'wait 20 second ensure runing threading done'
                time.sleep(20)
                break
            else:
                info = u'unexcept error,here is traceback info:%s' % (traceback.format_exc())
                print info
                logging.error(info)
                # song_info_queue.put(d)
                queue.put(d)
                print u'catch unexcept error ,so put d=%s back into queue' % d
                break
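## Example wiring for model_run, assuming the globals it relies on (pool, max_pool, g_multiple and
## the model_test helper) are defined in this module; the insert wrapper below mirrors the
## commented-out Multi_Song_Info lines inside model_run.
def insert_song_task(d):
    msi = Multi_Song_Info(song_ids=d['ids'], refer=d['refer'])
    return msi.insert_song()

def example_run_song_info():
    queue = manager.get_song_info_queue()
    model_run(queue=queue, func=insert_song_task)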
    def __init__(self, ids=2190625773, mysql=None):
        super(Multi_Playlist_Info, self).__init__(ids=ids)
        ## PersistentDB-backed connection; create a fresh one when none is passed in, rather than
        ## sharing a single default instance built at import time
        self.mysql = mysql if mysql is not None else Cloud_Music_MySQL()
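## A minimal sketch of the PersistentDB idea the comment above refers to, assuming Cloud_Music_MySQL
## is built on MySQLdb; the DBUtils usage below is illustrative and the connection parameters are
## placeholders, not values from the original code.
import MySQLdb
from DBUtils.PersistentDB import PersistentDB

_persist = PersistentDB(creator=MySQLdb, host='127.0.0.1', user='root',
                        passwd='password', db='cloud_music', charset='utf8')

def get_thread_local_connection():
    ## each thread gets its own dedicated, reusable connection; close() returns it to the pool
    return _persist.connection()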
class Multi_Comment(Cloud_Music):
    '''uses both a process pool and a thread pool'''

    def __init__(self, ids=518725853, comment='song', ):
        super(Multi_Comment, self).__init__(ids=ids, category='comment', comment=comment)
        self.comments = []
        self.mysql = Cloud_Music_MySQL()
        self.comment = comment

    def parse_comments(self, response):
        comment_json = ''
        content = response.json()
        comments = content['comments']
        for comment in comments:
            d = dict(
                content=comment['content'].strip(),
                nickname=comment['user']['nickname'],
                userid=comment['user']['userId'],
                likedcount=comment['likedCount'],
                time=comment['time'],
            )
            j = json.dumps(d, ensure_ascii=False)
            comment_json += j + '\n'
        return comment_json

    def get_first_comment(self, ):
        '''get_page() binds attributes as a side effect, making self.page, self.total and self.first_comment available'''
        self.page = self.get_page()
        response = self.first_comment
        logging.info(u'song with ids=%s has %s pages of comments in total' % (self.ids, self.page))
        comment = self.parse_comments(response)
        self.comments.append(comment)

    def get_other_comment(self, page=2):
        response = self.get_response(page=page)
        comment = self.parse_comments(response)
        logging.info(u'crawling ids=%s comments, page %s' % (self.ids, page))
        self.comments.append(comment)

    def get_all_comment(self, max_page=5):
        self.min_page = min(max_page, self.page)  ## cap on pages crawled, to avoid fetching too much
        for page in range(2, self.min_page + 1):
            self.get_other_comment(page=page)
            logging.info(u'got ids=%s %s comments, page %s' % (self.ids, self.comment, page))

    def unique_comment(self, ):
        info = u'out of %s pages of comments, got %s pages; about to merge them' % (self.min_page, len(self.comments))
        print info
        logging.info(info)
        return '\n'.join(self.comments)

    def update_song_comment(self, comments):
        # mysql=Cloud_Music_MySQL()  ## could be backed by a connection pool
        self.mysql.update_table_song(comments=comments, comment_count=self.total, ids=self.ids)
        self.mysql.close_connect()

    def update_playlist_comment(self, comments):
        # mysql=Cloud_Music_MySQL()  ## could be backed by a connection pool
        self.mysql.update_table_playlist_comment(comments=comments, comment_count=self.total, ids=self.ids)
        self.mysql.close_connect()
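## End-to-end usage sketch for Multi_Comment: fetch the first page, fetch up to max_page pages in
## total, merge them and write the result back. The ids value is the class default and the flow
## mirrors the methods above; error handling is omitted for brevity.
def example_crawl_song_comments(ids=518725853):
    mc = Multi_Comment(ids=ids, comment='song')
    mc.get_first_comment()            ## binds mc.page / mc.total and stores page 1
    mc.get_all_comment(max_page=5)    ## pages 2..min(5, mc.page)
    comments = mc.unique_comment()    ## newline-joined json lines
    mc.update_song_comment(comments)  ## writes comments and comment_count into the song table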