Example #1
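# Depends on the project-internal ConfigManager, DBManager and download
# modules; their imports are omitted in the original listing.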
class Baidu():
    def __init__(self):
        self.cm = ConfigManager()
        self.db = DBManager()

    def __del__(self):
        self.db.close()

    # Search by singer name.
    # Check the database first; if matches are found, return them directly
    # and prompt the user whether to go ahead with the download.
    # Otherwise, search online and store the new data in the database.
    def searchBySinger(self, author):
        table = '%sresult' % self.cm.get_config('table')[0]['song']['prefix']
        # NOTE: string-interpolated SQL like this is injection-prone.
        sql_search = "SELECT sname, durl FROM %s WHERE author = '%s'" % (table, author)
        data = self.db.query(sql_search)
        size = len(data)
        print('The database currently holds %d songs' % size)
        print('They are:')
        for l in data:
            print(l[0])
        print('Start download? (y/n)')
        choice = raw_input()
        if choice == 'y':
            base_dir = self.cm.get_config('dir')['path']
            download.download_with_singer(data, base_dir, author, size)
        else:
            print('Download cancelled')

    def searchBySname(self, sname, singer=None):
        """
        Public entry point: search by song title.
        :param sname: song title
        :param singer: singer name (optional)
        :return:
        """
        table = '%sresult' % self.cm.get_config('table')[0]['song']['prefix']
        sql_search = "SELECT author, durl FROM %s WHERE sname = '%s'" % (table, sname)
        data = self.db.query(sql_search)
        size = len(data)
        print('The database currently holds %d songs' % size)
        print('They are:')
        for l in data:
            print(l[0])
        print('Start download? (y/n)')
        choice = raw_input()
        if choice == 'y':
            base_dir = self.cm.get_config('dir')['path']
            download.download_with_sname(sname, data, base_dir)
        else:
            print('Download cancelled')
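Both methods share one flow: query the configured '<prefix>result' table, list the matches, and hand the rows to the download helper on a 'y'. A minimal driver sketch; the module layout and the names passed in are hypothetical, and ConfigManager, DBManager and download are project-internal:

    # Hypothetical usage; assumes the project-internal modules are importable.
    if __name__ == '__main__':
        client = Baidu()
        client.searchBySinger('some singer')   # list DB hits for a singer, then prompt
        client.searchBySname('some song')      # list DB hits for a title, then prompt

If DBManager.query forwards parameters to a DB-API cursor (its signature is not shown in the original), a parameterized clause such as "WHERE author = %s" with (author,) would close the injection hole noted above.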
Example #2
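# Depends on redis plus the project-internal ConfigManager and DBManager;
# imports are omitted in the original listing.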
class DataProcedure():
    def __init__(self):
        self.cm = ConfigManager()
        self.db = DBManager()

    def __del__(self):
        self.db.close()

    def run(self):
        sql = self.get_sql()
        queue_data = self.cm.get_config('taskqueue')['data']
        queue_back = self.cm.get_config('taskqueue')['backup']
        r = redis.Redis(host=self.cm.get_config('redis')['host'],
                        port=self.cm.get_config('redis')['port'])
        try:
            # redis.Redis() never fails on its own; ping() actually checks the connection
            r.ping()
            print('Redis service is up')
        except redis.ConnectionError:
            print('Redis service is not running')
            return
        # process the current task: atomically move it to the backup queue first
        cur_task = r.rpoplpush(queue_data, queue_back)
        while cur_task is not None:
            # WARNING: eval() on queue payloads executes arbitrary expressions;
            # json.loads() would be the safer deserialization.
            is_success, rows = self.db.save(sql, eval(cur_task))
            if is_success:
                # commit succeeded, clear the backup queue
                r.delete(queue_back)
            cur_task = r.rpoplpush(queue_data, queue_back)
        print('No tasks left in the queue')

    def get_sql(self):
        table_name = '%sresult' % self.cm.get_config(
            'table')[0]['song']['prefix']
        # The table name is interpolated here; the five %s in VALUES are DB-API
        # placeholders filled in later by self.db.save().
        sql_data_save = ("INSERT INTO %s " % table_name) + \
                        "(`sid`, `author`, `sname`, `counts`, `durl`) " \
                        "VALUES (%s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE " \
                        "counts=counts+1;"
        return sql_data_save
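run() is the consumer half of redis's RPOPLPUSH reliable-queue pattern: a task is moved atomically into the backup queue before the MySQL write, so a crash between pop and save cannot lose it. A minimal producer-side sketch; the host, port, queue name and row values are assumptions, and the repr/eval round-trip mirrors the original (JSON would be safer):

    import redis

    r = redis.Redis(host='localhost', port=6379)   # host/port are assumptions
    # One task is a list of rows matching the five placeholders in get_sql():
    # (sid, author, sname, counts, durl)
    task = [(123456, 'some author', 'some song', 1, 'http://example.com/a.mp3')]
    r.lpush('queue:data', repr(task))              # queue name is hypothetical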
Example #3
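# Relies on pycurl, StringIO, BeautifulSoup/bs4 and redis, on the
# project-internal DataSerialization, ConfigManager and PageDetector classes,
# and on module-level user_agent (a browser UA string) and pattern (a compiled
# regex that cleans the returned links) globals; imports are omitted in the
# original listing.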
class Crawler():
    def __init__(self):
        self.serialization = DataSerialization()
        self.cm = ConfigManager()

    def get_url_by_singer(self, author):
        """
        Take a singer name, fetch every song Baidu Music lists for that singer,
        and push ['sid', 'author', 'sname', 'download_counts'] tuples into redis
        for the background task program to process.
        :param author: singer name
        :return:
        """
        s_total, s_size = PageDetector.detect(author)
        queue_data = self.cm.get_config('taskqueue')['data']
        r = redis.Redis(
            host=self.cm.get_config('redis')['host'],
            port=self.cm.get_config('redis')['port']
        )
        try:
            r.ping()   # redis.Redis() never fails on its own; ping() checks the connection
            print('Redis service is up')
        except redis.ConnectionError:
            print('Redis service is not running')
            return
        pages = (s_total + s_size - 1) / s_size   # ceiling division: number of result pages
        list1 = []
        sids = '{'
        # Keep a page counter; every 5 pages, resolve the collected ids to
        # download urls, push the batch to redis, then reset sids and list1.
        counts = 0
        for i in xrange(pages):
            start = i * s_size
            print('Page %d' % (i + 1))
            if counts == 5:
                sids += '}'
                # step 3: resolve the real download url for each collected song id
                download_url = 'http://play.baidu.com/data/music/songlink?songIds=%s' % sids
                curl = pycurl.Curl()
                buffers = StringIO()
                curl.setopt(curl.URL, download_url)
                curl.setopt(curl.USERAGENT, user_agent)
                curl.setopt(curl.WRITEDATA, buffers)
                curl.perform()
                curl.close()
                body = buffers.getvalue()
                soup = BeautifulSoup(body)
                song_lists = self.serialization.json_to_data(soup.text)['data']['songList']
                soup.decompose()
                counter = 0
                # step 4: attach the download url to each song tuple
                for item in list1:
                    links = song_lists[counter]['songLink']
                    if not links:
                        url = 'zzz'   # placeholder for songs without a link
                    else:
                        url = pattern.sub('', links)
                    item.append(url)
                    counter += 1
                # step 5: push the batch into redis
                print('Submitting data')
                r.lpush(queue_data, list1)
                print('Data pushed to redis')
                list1 = []
                sids = '{'
                counts = 0
                # NOTE: as in the original, a final batch of fewer than 5
                # pages is never flushed once the loop ends.

            # step 1: search the singer and collect song ids
            buffers = StringIO()
            curl = pycurl.Curl()
            # spoof the Referer header
            refer_url = 'http://music.baidu.com/search?key=%s' % author
            search_url = 'http://music.baidu.com/search/song?s=1&key=%s&start=%d&size=%d' % (author, start, s_size)
            curl.setopt(curl.URL, search_url)
            curl.setopt(curl.USERAGENT, user_agent)
            curl.setopt(pycurl.REFERER, refer_url)
            curl.setopt(curl.WRITEDATA, buffers)
            curl.perform()
            curl.close()
            body = buffers.getvalue()
            soup = BeautifulSoup(body)
            # findAll returns the list of song <li> entries on this page
            content = soup.find('div', {'class': 'search-song-list song-list song-list-hook'}).findAll(
                'li', {'class': 'bb-dotimg clearfix song-item-hook  '})
            soup.decompose()
            # step 2: extract the basic song information
            counts += 1
            if not content:
                continue
            for tag in content:
                songitem = self.serialization.json_to_data(tag['data-songitem'])['songItem']
                sid = songitem['sid']
                if not isinstance(sid, int):
                    continue
                sids += str(sid) + ','
                sname = tag.find('div', {'class': 'song-item'}).find('span', {'class': 'song-title'}).find('a').text
                list1.append([sid, author, sname, 1])

    def get_url_by_sname(self, sname, author=None):
        """
        Take a song title and scrape its search-result list. Only the first page
        of results is taken; later pages add little value.
        :param sname: song title
        :param author: singer name, set only when the user asks for one specific
                       singer's version of the song; defaults to None
        :return:
        """
        search_url = 'http://music.baidu.com/search?key=%s' % sname
        refer_url = 'http://music.baidu.com/'
        buffers = StringIO()
        curl = pycurl.Curl()
        curl.setopt(pycurl.URL, search_url)
        curl.setopt(pycurl.REFERER, refer_url)
        curl.setopt(pycurl.USERAGENT, user_agent)
        curl.setopt(pycurl.WRITEDATA, buffers)
        curl.perform()
        curl.close()

        body = buffers.getvalue()
        soup = BeautifulSoup(body)

        content = soup.find('div', {'id': 'result_container'}).find('ul')
        soup.decompose()
        song_list = []
        sids = '{'
        for tag in content:
            # skip HTML comment nodes
            if isinstance(tag, bs4.element.Comment):
                continue
            item = self.serialization.json_to_data(tag.get('data-songitem'))['songItem']
            sid = item['sid']
            author = item['author']
            sids += str(sid) + ','
            song_list.append([sid, author, sname, 1])
        sids += '}'
        urls = self.get_url_by_sid(sids)
        counter = 0
        for item in song_list:
            item.append(urls[counter])
            counter += 1
        queue_data = self.cm.get_config('taskqueue')['data']
        r = redis.Redis(
            host=self.cm.get_config('redis')['host'],
            port=self.cm.get_config('redis')['port']
        )
        try:
            r.ping()   # redis.Redis() never fails on its own; ping() checks the connection
            print('Redis service is up')
            print('Submitting data')
            r.lpush(queue_data, song_list)
            print('Data pushed to redis')
        except redis.ConnectionError:
            print('Redis service is not running')

    def get_url_by_sid(self, sids):
        """
        接受歌曲id构成的字符串
        :param sids: 歌曲id串
        :return: 歌曲真实下载url
        """
        search_url = 'http://play.baidu.com/data/music/songlink?songIds=%s' % sids
        buffers = StringIO()
        curl = pycurl.Curl()
        curl.setopt(pycurl.URL, search_url)
        curl.setopt(pycurl.USERAGENT, user_agent)
        curl.setopt(pycurl.WRITEDATA, buffers)
        curl.perform()
        curl.close()

        body = buffers.getvalue()
        soup = BeautifulSoup(body)

        song_lists = self.serialization.json_to_data(soup.text)['data']['songList']
        soup.decompose()
        urls = []
        for l in song_lists:
            link = l['songLink']
            if not link:
                url = 'zzz'   # placeholder used when no download link is returned
            else:
                url = pattern.sub('', link)
            urls.append(url)
        return urls
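A minimal driver sketch; the names passed in are hypothetical, and the module-level user_agent and pattern globals noted above must exist:

    # Hypothetical usage; enqueues work for the DataProcedure consumer.
    crawler = Crawler()
    crawler.get_url_by_singer('some singer')   # batch-crawl every song by a singer
    crawler.get_url_by_sname('some song')      # crawl the first result page for a title

get_url_by_sid expects the id string in the literal form '{id1,id2,}' that the loops above build, trailing comma included.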