def __init__(self):
    cm = ConfigManager()
    db_config = cm.get_dbconfig()
    # Open the MySQL connection using the settings from the config file
    self.conn_r = MySQLdb.connect(host=db_config['host'],
                                  user=db_config['user'],
                                  passwd=db_config['passwd'],
                                  db=db_config['db'],
                                  charset=db_config['charset'])
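# For reference, a minimal sketch of the dict that get_dbconfig() is
# assumed to return, keyed by the five fields used above; the values
# shown here are placeholders, not the project's real settings.
#
# db_config = {
#     'host': '127.0.0.1',
#     'user': 'root',
#     'passwd': 'secret',
#     'db': 'music',
#     'charset': 'utf8',
# }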
class Baidu(object):
    def __init__(self):
        self.cm = ConfigManager()
        self.db = DBManager()

    def __del__(self):
        self.db.close()

    # Query by singer name.
    # Search the database first; if records are found, show them and
    # ask the user whether to go ahead with the download. Otherwise,
    # search online and store the new data in the database.
    def searchBySinger(self, author):
        table = '%sresult' % self.cm.get_config('table')[0]['song']['prefix']
        # NOTE: string interpolation here is open to SQL injection;
        # a parameterized query would be safer if DBManager supports it.
        sql_search = ("SELECT sname, durl FROM %s " % table) + "WHERE author = '%s'" % author
        data = self.db.query(sql_search)
        size = len(data)
        print('The database currently holds %d songs' % size)
        print('They are:')
        for l in data:
            print(l[0])
        print('Start download? (y/n)')
        choice = raw_input()
        if choice == 'y':
            base_dir = self.cm.get_config('dir')['path']
            download.download_with_singer(data, base_dir, author, size)
        else:
            print('Download cancelled')

    def searchBySname(self, sname, singer=None):
        """
        Public interface that takes a song name as its argument.
        :param sname: song name
        :param singer: singer name (optional)
        :return:
        """
        table = '%sresult' % self.cm.get_config('table')[0]['song']['prefix']
        sql_search = ("SELECT author, durl FROM %s " % table) + "WHERE sname = '%s'" % sname
        data = self.db.query(sql_search)
        size = len(data)
        print('The database currently holds %d songs' % size)
        print('They are:')
        for l in data:
            print(l[0])
        print('Start download? (y/n)')
        choice = raw_input()
        if choice == 'y':
            base_dir = self.cm.get_config('dir')['path']
            download.download_with_sname(sname, data, base_dir)
        else:
            print('Download cancelled')
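# A minimal usage sketch; the singer/song names are placeholders and
# both calls prompt for 'y/n' on stdin before downloading anything.
if __name__ == '__main__':
    baidu = Baidu()
    baidu.searchBySinger('Jay Chou')
    baidu.searchBySname('Blue and White Porcelain')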
# Third-party/stdlib imports used below; ConfigManager and DBManager
# come from the project's own modules.
import ast
import redis


class DataProcedure(object):
    def __init__(self):
        self.cm = ConfigManager()
        self.db = DBManager()

    def __del__(self):
        self.db.close()

    def run(self):
        sql = self.get_sql()
        queue_data = self.cm.get_config('taskqueue')['data']
        queue_back = self.cm.get_config('taskqueue')['backup']
        r = redis.Redis(host=self.cm.get_config('redis')['host'],
                        port=self.cm.get_config('redis')['port'])
        # redis.Redis() connects lazily, so the object is always truthy;
        # ping() is the reliable way to check that the server is up.
        try:
            r.ping()
            print('Redis service is up')
        except redis.ConnectionError:
            print('Redis service is not running')
            return

        # Process tasks one by one, keeping the current task in a backup
        # queue until the database commit succeeds (reliable-queue pattern).
        cur_task = r.rpoplpush(queue_data, queue_back)
        while cur_task is not None:
            # ast.literal_eval safely parses the list literal that was
            # pushed into redis (a drop-in replacement for eval here).
            is_success, rows = self.db.save(sql, ast.literal_eval(cur_task))
            if is_success:
                # Commit succeeded; clear the backup queue
                r.delete(queue_back)
            cur_task = r.rpoplpush(queue_data, queue_back)
        print('No tasks left in the queue')

    def get_sql(self):
        table_name = '%sresult' % self.cm.get_config('table')[0]['song']['prefix']
        sql_data_save = ("INSERT INTO %s " % table_name) + \
            "(`sid`, `author`, `sname`, `counts`, `durl`) " \
            "VALUES (%s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE " \
            "counts=counts+1;"
        return sql_data_save
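# For context, a minimal sketch of the producer side this consumer
# expects: one task is a batch of row lists matching the INSERT columns
# (sid, author, sname, counts, durl). Host, port, and the queue name
# below are placeholders; the real values come from ConfigManager.
#
# import redis
# r = redis.Redis(host='127.0.0.1', port=6379)
# task = [[265986, 'some author', 'some song', 1, 'http://example.com/a.mp3']]
# r.lpush('queue:data', str(task))  # str(task) round-trips via ast.literal_eval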
# Third-party imports used below; project-internal names
# (ConfigManager, DataSerialization, PageDetector, user_agent, pattern)
# are defined elsewhere in the package.
import pycurl
import redis
import bs4
from bs4 import BeautifulSoup
from StringIO import StringIO


class Crawler(object):
    def __init__(self):
        self.serialization = DataSerialization()
        self.cm = ConfigManager()

    def get_url_by_singer(self, author):
        """
        Takes an author argument, fetches every song Baidu Music lists
        for that singer, and pushes ['sid', 'author', 'sname',
        'download_counts'] records into redis for the background task
        program to process.
        :param author: singer name
        :return:
        """
        s_total, s_size = PageDetector.detect(author)
        queue_data = self.cm.get_config('taskqueue')['data']
        r = redis.Redis(
            host=self.cm.get_config('redis')['host'],
            port=self.cm.get_config('redis')['port']
        )
        # redis.Redis() connects lazily; ping() actually checks the server
        try:
            r.ping()
            print('Redis service is up')
        except redis.ConnectionError:
            print('Redis service is not running')
            return

        pages = (s_total + s_size - 1) / s_size  # ceiling division
        list1 = []
        sids = '{'
        # Maintain a counter; when it reaches 5 pages, trigger the URL
        # probe, then reset it along with sids and list1.
        counts = 0
        for i in xrange(pages):
            start = i * s_size
            print('Page %d' % (i + 1))
            if counts == 5:
                sids += '}'
                # step 3: resolve the real download URLs for the batch
                download_url = 'http://play.baidu.com/data/music/songlink?songIds=%s' % sids
                curl = pycurl.Curl()
                buffers = StringIO()
                curl.setopt(curl.URL, download_url)
                curl.setopt(curl.USERAGENT, user_agent)
                curl.setopt(curl.WRITEDATA, buffers)
                curl.perform()
                curl.close()
                body = buffers.getvalue()
                soup = BeautifulSoup(body)
                song_lists = self.serialization.json_to_data(soup.text)['data']['songList']
                soup.decompose()
                counter = 0
                # step 4: attach the download URL to each song record
                for item in list1:
                    links = song_lists[counter]['songLink']
                    if not links:
                        url = 'zzz'  # placeholder for songs without a link
                    else:
                        url = pattern.sub('', links)
                    item.append(url)
                    counter += 1
                # step 5: push the batch into redis
                print('Submitting data')
                r.lpush(queue_data, list1)
                print('Data pushed into redis')
                list1 = []
                sids = '{'
                counts = 0
            # step 1: search the singer and collect song ids
            buffers = StringIO()
            curl = pycurl.Curl()
            # Forge the referer
            refer_url = 'http://music.baidu.com/search?key=%s' % author
            search_url = 'http://music.baidu.com/search/song?s=1&key=%s&start=%d&size=%d' % (author, start, s_size)
            curl.setopt(curl.URL, search_url)
            curl.setopt(curl.USERAGENT, user_agent)
            curl.setopt(pycurl.REFERER, refer_url)
            curl.setopt(curl.WRITEDATA, buffers)
            curl.perform()
            curl.close()
            body = buffers.getvalue()
            soup = BeautifulSoup(body)
            # findAll returns a list of song <li> tags
            content = soup.find('div', {'class': 'search-song-list song-list song-list-hook'}).findAll(
                'li', {'class': 'bb-dotimg clearfix song-item-hook '})
            soup.decompose()
            # step 2: extract the basic song info
            counts += 1
            if not content:
                continue
            for tag in content:
                songitem = self.serialization.json_to_data(tag['data-songitem'])['songItem']
                sid = songitem['sid']
                if not isinstance(sid, int):
                    continue
                sids += str(sid) + ','
                sname = tag.find('div', {'class': 'song-item'}).find('span', {'class': 'song-title'}).find('a').text
                list1.append([sid, author, sname, 1])

    def get_url_by_sname(self, sname, author=None):
        """
        Takes a song name and searches its result list. Only the first
        page of results is extracted here; later pages add little value.
        :param sname: song name
        :param author: singer name; only set when the user wants a
                       specific singer's version of the song. Defaults to None
        :return:
        """
        search_url = 'http://music.baidu.com/search?key=%s' % sname
        refer_url = 'http://music.baidu.com/'
        buffers = StringIO()
        curl = pycurl.Curl()
        curl.setopt(pycurl.URL, search_url)
        curl.setopt(pycurl.REFERER, refer_url)
        curl.setopt(pycurl.USERAGENT, user_agent)
        curl.setopt(pycurl.WRITEDATA, buffers)
        curl.perform()
        curl.close()
        body = buffers.getvalue()
        soup = BeautifulSoup(body)
        content = soup.find('div', {'id': 'result_container'}).find('ul')
        soup.decompose()
        song_list = []
        sids = '{'
        for tag in content:
            # Skip HTML comment nodes
            if isinstance(tag, bs4.element.Comment):
                continue
            item = self.serialization.json_to_data(tag.get('data-songitem'))['songItem']
            sid = item['sid']
            author = item['author']
            sids += str(sid) + ','
            song_list.append([sid, author, sname, 1])
        sids += '}'
        urls = self.get_url_by_sid(sids)
        counter = 0
        for item in song_list:
            item.append(urls[counter])
            counter += 1
        queue_data = self.cm.get_config('taskqueue')['data']
        r = redis.Redis(
            host=self.cm.get_config('redis')['host'],
            port=self.cm.get_config('redis')['port']
        )
        try:
            r.ping()
            print('Redis service is up')
            print('Submitting data')
            r.lpush(queue_data, song_list)
            print('Data pushed into redis')
        except redis.ConnectionError:
            print('Redis service is not running')

    def get_url_by_sid(self, sids):
        """
        Takes a string of song ids.
        :param sids: song id string
        :return: the songs' real download URLs
        """
        search_url = 'http://play.baidu.com/data/music/songlink?songIds=%s' % sids
        buffers = StringIO()
        curl = pycurl.Curl()
        curl.setopt(pycurl.URL, search_url)
        curl.setopt(pycurl.USERAGENT, user_agent)
        curl.setopt(pycurl.WRITEDATA, buffers)
        curl.perform()
        curl.close()
        body = buffers.getvalue()
        soup = BeautifulSoup(body)
        song_lists = self.serialization.json_to_data(soup.text)['data']['songList']
        soup.decompose()
        urls = []
        for l in song_lists:
            link = l['songLink']
            if not link:
                url = 'zzz'  # placeholder for songs without a link
            else:
                url = pattern.sub('', link)
            urls.append(url)
        return urls
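# A minimal usage sketch; names are placeholders. Both methods push
# their results into the redis task queue (they return nothing), where
# DataProcedure.run() later persists them to MySQL.
if __name__ == '__main__':
    crawler = Crawler()
    crawler.get_url_by_singer('Jay Chou')
    crawler.get_url_by_sname('Nocturne')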
# coding: utf-8
from crawler.conf.cm import ConfigManager
import redis

cm = ConfigManager()


def cleanup():
    r = redis.Redis(host=cm.get_config('redis')['host'],
                    port=cm.get_config('redis')['port'])
    # redis.Redis() connects lazily; ping() actually checks the server
    try:
        r.ping()
        print('Redis service is up')
    except redis.ConnectionError:
        print('Redis service is not running')
        return
    # Flush stale task data out of redis
    print('Cleaning up stale redis data')
    queue_data = cm.get_config('taskqueue')['data']
    queue_back = cm.get_config('taskqueue')['backup']
    r.delete(queue_data)
    r.delete(queue_back)
    print('Cleanup finished')
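# A __main__ guard (not in the original) is one plausible way to run
# this module as a standalone cleanup script.
if __name__ == '__main__':
    cleanup()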