Example #1
 def save_sql(self):
     mysql_command = MySQLCommand()
     mysql_command.connectdb()
     while True:
         ids = self.id_queue.get()
         print('Data:\n', ids)
         mysql_command.update_list(ids)
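
Each of these save_* consumers follows the same pattern: a dedicated thread blocks on Queue.get() forever and writes each item to MySQL. A minimal sketch of how such a consumer is typically wired up and shut down cleanly; the None sentinel and the print stand-in are illustrative assumptions, not part of the original code:

from queue import Queue
from threading import Thread

q = Queue()

def consumer():
    while True:
        item = q.get()
        if item is None:  # sentinel: the producer signals there is no more work
            break
        print('saving', item)  # stand-in for mysql_command.update_list(item)

t = Thread(target=consumer)
t.start()
q.put(['42'])  # a producer thread would put crawled IDs here
q.put(None)    # tell the consumer to stop
t.join()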
Example #2
 def save_user_info(self):
     mysql_command = MySQLCommand()
     mysql_command.connectdb()
     while True:
         result = self.user_queue.get()
         # print('Crawled user result: ', result)
         mysql_command.insert_user(result)
Example #3
 def save_music_list(self):
     mysql_command = MySQLCommand()
     mysql_command.connectdb()
     while True:
         result = self.list_queue.get()
         # print('Crawled playlist result: ', result)
         mysql_command.insert_list(result)
Example #4
 def get_user_id(self):
     mysql_command = MySQLCommand()
     mysql_command.connectdb()
     mysql_command.cursor.execute("select userId, playlistCount from user")
     user_list = mysql_command.cursor.fetchall()
     for userinfo in user_list:
         user_id = userinfo['userId']
         playlistCount = userinfo['playlistCount']
         if playlistCount is not None:
             playlistCount = playlistCount.strip() or 0  # guard: int('') would raise below
         else:
             playlistCount = 0
         if len(user_id) > 0:
             self.music_task.put(user_id)
             self.list_task.put([user_id, int(playlistCount)])
             time.sleep(2)
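
Indexing rows by column name (userinfo['userId']) only works with a dict-returning cursor; the conn_data helper in Example #9 shows this project connecting with pymysql's DictCursor. A minimal sketch of the kind of connection MySQLCommand.connectdb presumably establishes (host and credentials are placeholders):

import pymysql

conn = pymysql.connect(
    host='localhost',  # placeholder connection details
    user='user',
    passwd='password',
    port=3306,
    db='wyy_spider',
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor,  # rows come back as dicts, enabling row['userId']
)
with conn.cursor() as cursor:
    cursor.execute("select userId, playlistCount from user")
    for row in cursor.fetchall():
        print(row['userId'], row['playlistCount'])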
Example #5
 def save_sql(self):
     mysql_command = MySQLCommand()
     mysql_command.connectdb()
     while True:
         ids = self.id_queue.get()
         mysql_command.insert_list(ids)
Example #6
    def list_id(self):
        mysql_command = MySQLCommand()
        mysql_command.connectdb()
        mysql_command.cursor.execute("select id, userLikeId from music_list")
        list_ids = mysql_command.cursor.fetchall()

        for row in list_ids:
            userids = row.get('userLikeId')
            list_id = row.get('id')

            if userids is not None and len(userids) > 10:
                print('Playlist %s has already been updated' % list_id)
                continue
            if len(list_id) > 0:
                replace = 1
                print('Crawling playlist with ID %s' % list_id)
                list_id = list_id.strip()
                url = 'https://music.163.com/playlist?id=%s' % list_id
                print('Playlist URL to crawl: %s' % url)
                while replace < 10:
                    msg = 0
                    try:
                        headers = self.headers
                        res = requests.get(url, headers=headers)
                        time.sleep(replace + 5)
                        soup = BeautifulSoup(res.text, 'html5lib')
                        try:
                            music = soup.find('ul', attrs={'class': 'f-hide'})
                            music_id = music.find_all('li')
                        except Exception as e:
                            music_id = []
                            msg += 1
                            print('Playlist %s has no songs! Reason: %s' % (list_id, e))
                        try:
                            user = soup.find('ul',
                                             attrs={'class': 'm-piclist f-cb'})
                            user_id = user.find_all('li')
                        except Exception as e:
                            user_id = []
                            msg += 1
                            print('Playlist %s has no subscribed users! Reason: %s' % (list_id, e))
                        try:
                            simple_list = soup.findAll('div',
                                                       attrs={'class': 'info'})
                        except Exception as e:
                            simple_list = []
                            msg += 1
                            print('Playlist %s has no related or hot playlists! Reason: %s' % (list_id, e))
                        if msg < 2:
                            try:
                                self.extract_id(list_id, music_id, user_id,
                                                simple_list)
                                break
                            except Exception as e:
                                print('Failed! Reason: %r' % e)
                                replace += 1
                        else:
                            replace += 2
                    except Exception as e:
                        print('Retrying! %r' % e)
                        replace += 1
                        time.sleep(2)
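
For reference, a stripped-down version of the fetch-and-parse step above: a playlist's song IDs live in a hidden ul.f-hide list on the playlist page. The playlist ID and the abbreviated User-Agent are illustrative:

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}  # the spider itself sends a full browser header set
res = requests.get('https://music.163.com/playlist?id=24381616', headers=headers)
soup = BeautifulSoup(res.text, 'html5lib')
music = soup.find('ul', attrs={'class': 'f-hide'})
for li in music.find_all('li'):
    a = li.find('a')
    print(a['href'].replace('/song?id=', ''), a.text)  # song ID and title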
Example #7
class MusicSpider(object):
    def __init__(self):
        self.file_path = '../data/'  # location where user info is saved
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Cookie': 'JSESSIONID-WYYY=ByCd%2F1zHaA6%5CBqA%2BY6sxOkSFXycajAx3XuQyySu2buAYehwzXeZkRb1wscB8vUIg83pUvkMHO1SmtGIO3pKyySb%5CoxUpy9CUWWEo0hjRRszV%2FkqPsH%2B5PykExoVq9zQCZuwyQz4tQqCrvotiqb%5CO%5CA8cpWAqAQraI5NsvM5VY5KenvqS%3A1578052539036; _iuqxldmzr_=32; _ntes_nnid=6773350955c533de38f1625624ebe4f4,1578050739108; _ntes_nuid=6773350955c533de38f1625624ebe4f4; WM_NI=3NHJAjwsUDaG8r2TMyn128jA6fBbyickbyK%2FnunpTznOsK4Xk5AhevMS3EvW6tQsbNoSelxCjgnNNqWFyUEP%2B1e8SaaQ51OcjIxmvagcdyPMlC%2B%2BTwteRAImrcPzeEINM0U%3D; WM_NIKE=9ca17ae2e6ffcda170e2e6eed2d14a9596ae94f067a88e8ba2d14a929a9aaabb21bab2aba9c240b19bfdb7db2af0fea7c3b92af19288abc462b5ad9ba5e44dfcaefeb5d073aeeffed9e94bf6ba8e83fc63a1b5ae9aca25aeaba291d772ae91bdacb754a9eb8f89e87e8f8dfda6f55df6ac9f94e146ad8dab8dfb49aab9a2afcd7b959ab7b6c85ce9efabd9d26ba38ffbd2ce69aa97b88ef56ba5bdac9ad347b09de5ccd77db8bb9ea2cc67b2bda09be84f8b9283d1d837e2a3; WM_TID=mCNsKkYK71tBAQBBRFNtqjmPHS4pFjUG',
            'Host': 'music.163.com',
            'Referer': 'http://music.163.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/66.0.3359.181 Safari/537.36'
        }
        self.data_list = pd.read_csv('../data/csv_file/music_spider.csv')
        self.num = 0  # start crawling from song 0
        self.music = False
        self.cookie_path = '../data/cookie.txt'

    @staticmethod
    def _generate_random_strs(length):
        string = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
        i = 0  # loop counter
        # initialize the random string
        random_strs = ""
        while i < length:
            e = random.random() * len(string)
            # round down to get a valid index
            e = math.floor(e)
            random_strs = random_strs + string[e]
            i = i + 1
        return random_strs

    # AES encryption
    @staticmethod
    def _aes_encrypt(msg, key):
        padding = 16 - len(msg) % 16  # pad to a multiple of 16 bytes (PKCS#7)
        msg = msg + padding * chr(padding)  # pad with the char whose code equals the padding length
        iv = '0102030405060708'  # initialization vector for CBC mode (must be 16 bytes)
        # note: pycryptodome expects bytes for key, IV and plaintext
        cipher = AES.new(key.encode('utf-8'), AES.MODE_CBC, iv.encode('utf-8'))
        encrypted_bytes = cipher.encrypt(msg.encode('utf-8'))  # encryption yields bytes
        encode_strs = base64.b64encode(encrypted_bytes)  # Base64-encode, returns a byte string
        enc_text = encode_strs.decode('utf-8')  # decode the byte string as utf-8
        return enc_text

    # RSA encryption
    @staticmethod
    def _rsa_encrypt(random_strs, key, f):
        # reverse the random string
        string = random_strs[::-1]
        # convert the random string to bytes
        text = bytes(string, 'utf-8')
        seckey = int(codecs.encode(text, encoding='hex'), 16) ** int(key, 16) % int(f, 16)
        return format(seckey, 'x').zfill(256)

    # Build the encrypted request parameters
    def get_params(self, id_msg, comment):
        # msg can also be written as msg = {"offset": "(page - 1) * 20", "limit": "20"}; offset and limit are required by the site's JS
        # limit maxes out at 100; when set to 100, page 2 assumes the previous page held 20 comments, so page 2 has 80 new comments plus the 20 already shown on page 1
        # msg = '{"rid":"R_SO_4_1302938992","offset":"0","total":"True","limit":"100","csrf_token":""}'
        # offset = (page-1) * 20
        # msg = '{"offset":' + str(offset) + ',"limit":"20"}'
        # msg = '{"rid":"R_SO_4_1302938992","offset":' + str(offset) + ',"total":"True","limit":"20","csrf_token":""}'
        key = '0CoJUm6Qyw8W8jud'
        if comment:
            offset = (id_msg - 1) * 20
            # offset and limit are required; the other parameters are optional and do not affect the generated data
            msg = '{"offset":' + str(offset) + ',"total":"True","limit":"20","csrf_token":""}'
        else:
            msg = '{id: ' + id_msg + ', lv: -1, tv: -1}'
        f = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
        e = '010001'
        enc_text = self._aes_encrypt(msg, key)
        # generate a random string of length 16
        i = self._generate_random_strs(16)

        # two rounds of AES encryption yield the params value
        enc_text = self._aes_encrypt(enc_text, i)
        # RSA encryption yields the encSecKey value
        enc_seckey = self._rsa_encrypt(i, e, f)
        return enc_text, enc_seckey

    # Sanitize text with a regex
    def re_value(self, value):
        value = re.sub(r'\r|\n|\\|\'|\{|\}|\"', ' ', value)
        return value

    def check_headers(self):
        cookie_list = []
        with open(self.cookie_path, 'r') as fp:
            for i in fp.readlines():
                i = json.loads(i)
                cookie_list.append(i)
        self.headers['Cookie'] = random.choice(cookie_list)['cookie']

    # Get the total number of comments
    def page_spider(self, music_id):
        url = 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_' + music_id + '?csrf_token='
        page = 1
        params, encSecKey = self.get_params(page, True)
        data = {'params': params, 'encSecKey': encSecKey}
        self.headers['Referer'] = 'https://music.163.com/song?id=%s' % music_id
        repeat = 0
        while repeat < 8:
            try:
                if repeat > 5:
                    self.check_headers()
                r = requests.post(url, headers=self.headers, data=data)
                time.sleep(repeat)
                r.encoding = "utf-8"
                if r.status_code == 200:
                    # the response body is JSON
                    result = r.json()
                    if 'total' in result.keys():
                        total = result['total']
                        return total
                    else:
                        return 0
                else:
                    repeat += 1
            except Exception as e:
                print('Failed to get the comment count for song %s, reason: %s' % (music_id, e))
                repeat += 1

    def get_lynic(self, song_id):
        # params here is 108 characters long; don't test with values from the browser dev tools, where params is 128 characters and won't match
        params, encSecKey = self.get_params(song_id, False)
        data = {'params': params, 'encSecKey': encSecKey}
        url = 'https://music.163.com/weapi/song/lyric?csrf_token='
        repeat = 1
        while repeat < 16:
            try:
                if repeat > 8:
                    self.check_headers()
                r = requests.post(url, headers=self.headers, data=data)
                time.sleep(repeat)
                song = r.json()
                if 'uncollected' in song.keys() or 'lrc' in song.keys() or 'nolyric' in song.keys():
                    break
                else:
                    if 'sgc' in song.keys():
                        if song['sgc']:
                            break
                    repeat += 1
                    print('Attempt %d to fetch lyrics for song %s failed, requests too fast!' % (repeat, song_id))
            except Exception as e:
                print('Attempt %d to fetch lyrics for song %s failed, reason: %s' % (repeat, song_id, e))
                repeat += 1
        try:
            song_lynic = song['lrc']['lyric']
            song_lynic = re.sub(r'\[(.*?)\]', '', song_lynic)
            song_lynic = re.sub(r'\n', ',', song_lynic)
            song_lynic = self.re_value(song_lynic)
        except Exception:
            # print(song_id)
            # print(song)
            song_lynic = ''
        try:
            id = song['lyricUser']['userid']
            uptime = song['lyricUser']['uptime']
            lynic_user = json.dumps({'user_id': id, 'uptime': uptime})
        except Exception:
            lynic_user = json.dumps({'user_id': '', 'uptime': ''})

        result = {'song_lynic': song_lynic, 'lynic_user': lynic_user}
        # print(result)
        return result

    # Get song details
    def get_music_info(self, music):
        music_dict = {}
        # lynic_result = self.get_lynic(music)
        lynic_result = {'song_lynic': '', 'lynic_user': ''}
        m_id = music
        music_dict['music_id'] = m_id
        simple_music = []
        contain_list = []
        url = 'https://music.163.com/song?id=%s' % m_id
        repeat = 0
        while repeat < 5:
            try:
                response = requests.get(url, headers=self.headers)
                time.sleep(repeat)
                response = response.text
                soup = BeautifulSoup(response, 'html5lib')
                try:
                    title = soup.find_all('div', attrs={'class': 'tit'})[0]
                    title = title.find('em').text
                    music_dict['music_name'] = title
                    break
                except Exception as e:
                    music_dict['music_name'] = ''
                    print('Song name not found for song %s, reason: %s' % (m_id, e))
                    repeat += 1
            except Exception as e:
                print('Attempt %d to fetch details for song %s failed! Reason: %s ' % (repeat, m_id, e))
                repeat += 1

        try:
            for index, info in enumerate(soup.find_all('p', attrs={'class': 'des s-fc4'})):
                try:
                    singer_id = info.find_all('span')[0].find_all('a')[0]['href'].replace('/artist?id=', '').strip()
                    music_dict['singer_id'] = singer_id
                except Exception:
                    try:
                        album_id = info.find_all('a')[0]['href'].replace('/album?id=', '').strip()
                        music_dict['album_id'] = album_id
                    except:
                        if index == 0:
                            music_dict['singer_id'] = ''
                        else:
                            music_dict['album_id'] = ''
        except Exception as e:
            music_dict['singer_id'] = ''
            music_dict['album_id'] = ''
            print('Failed to get singer and album info for song %s, using empty defaults! Reason: %s' % (m_id, e))
        try:
            music_list = soup.find_all('ul', attrs={'class': 'm-rctlist f-cb'})[0]
            for info in music_list.find_all('li'):
                try:
                    playlist = re.findall(r'playlist\?id=(.*?)" title', str(info))[0]
                    creator_id = re.findall(r'/user/home\?id=(.*?)" title', str(info))[0]
                    contain_list.append({'list': playlist, 'creator': creator_id})
                except:
                    print('Error extracting playlist ID and creator ID! Offending item:', str(info))
        except Exception as e:
            print('Failed to get the playlists containing this song! Song: %s, reason: %s' % (m_id, e))
        music_dict['contain_list'] = json.dumps(contain_list)
        try:
            simple_m = soup.find_all('ul', attrs={'class': 'm-sglist f-cb'})[0]
            for music in simple_m.find_all('li', attrs={'class': 'f-cb'}):
                try:
                    song_id = re.findall(r'/song\?id=(.*?)" title', str(music))[0]
                    try:
                        singer_id = re.findall(r'/artist\?id=(.*?)">', str(music))[0]
                    except:
                        try:
                            singer_id = re.findall(r'title="(.*?)"><span', str(music))[0]
                        except:
                            singer_id = ''
                    simple_music.append({'song': song_id, 'singer': singer_id})
                except:
                    print('Error extracting song ID and singer ID! Offending item:', str(music))
        except Exception as e:
            print('Failed to get songs similar to this one! Song: %s, reason: %s' % (m_id, e))
        comment_num = self.page_spider(m_id)
        music_dict['comment_num'] = str(comment_num)
        music_dict['simple_music'] = json.dumps(simple_music)
        music_dict['song_lynic'] = lynic_result['song_lynic']
        music_dict['lynic_user'] = lynic_result['lynic_user']
        return music_dict

    # Reconnect to the database
    def conn_music(self):
        self.mysqlMusic = MySQLCommand()
        self.mysqlMusic.connectdb()
        self.music = True

    # Save song info
    def save_music(self, num, result):
        try:
            self.mysqlMusic.insert_music(result)
            print(result)
            print('---> Song #%d crawled <---' % num)

        except:
            self.music = False
            print('Database error, reconnecting...')

    # Dispatch tasks from the csv file
    def get_list_id(self):
        if self.music is False:
            self.conn_music()
            time.sleep(1)
        data = self.data_list['music_id']
        num = 0
        for task in data.values:
            task = str(task)
            print('Crawling song with ID %s ...' % task)
            if num >= self.num:
                result = self.get_music_info(task)
                while True:
                    if self.music is False:
                        self.conn_music()
                        time.sleep(1)
                    else:
                        self.save_music(num, result)
                        break
            num += 1
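
The manual padding in _aes_encrypt is PKCS#7: pad up to a 16-byte boundary with the character whose code equals the padding length. A quick sketch to verify the equivalence, assuming pycryptodome's Crypto.Util.Padding helper is available:

from Crypto.Util.Padding import pad

msg = '{"offset":0,"total":"True","limit":"20","csrf_token":""}'
padding = 16 - len(msg) % 16
manual = (msg + padding * chr(padding)).encode('utf-8')  # the padding done by hand above
assert manual == pad(msg.encode('utf-8'), 16)  # pad() defaults to PKCS#7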
Example #8
 def save_user_info(self):
     mysql_command = MySQLCommand()
     mysql_command.connectdb()
     while True:
         result = self.user_result_queue.get()
         mysql_command.insert_user(result)
Example #9
class ListCommSpider(object):
    def __init__(self):
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Cookie': 'WM_TID=36fj4OhQ7NdU9DhsEbdKFbVmy9tNk1KM; _iuqxldmzr_=32; _ntes_nnid=26fc3120577a92f179a3743269d8d0d9,1536048184013; _ntes_nuid=26fc3120577a92f179a3743269d8d0d9; __utmc=94650624; __utmz=94650624.1536199016.26.8.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); WM_NI=2Uy%2FbtqzhAuF6WR544z5u96yPa%2BfNHlrtTBCGhkg7oAHeZje7SJiXAoA5YNCbyP6gcJ5NYTs5IAJHQBjiFt561sfsS5Xg%2BvZx1OW9mPzJ49pU7Voono9gXq9H0RpP5HTclE%3D; WM_NIKE=9ca17ae2e6ffcda170e2e6eed5cb8085b2ab83ee7b87ac8c87cb60f78da2dac5439b9ca4b1d621f3e900b4b82af0fea7c3b92af28bb7d0e180b3a6a8a2f84ef6899ed6b740baebbbdab57394bfe587cd44b0aebcb5c14985b8a588b6658398abbbe96ff58d868adb4bad9ffbbacd49a2a7a0d7e6698aeb82bad779f7978fabcb5b82b6a7a7f73ff6efbd87f259f788a9ccf552bcef81b8bc6794a686d5bc7c97e99a90ee66ade7a9b9f4338cf09e91d33f8c8cad8dc837e2a3; JSESSIONID-WYYY=G%5CSvabx1X1F0JTg8HK5Z%2BIATVQdgwh77oo%2BDOXuG2CpwvoKPnNTKOGH91AkCHVdm0t6XKQEEnAFP%2BQ35cF49Y%2BAviwQKVN04%2B6ZbeKc2tNOeeC5vfTZ4Cme%2BwZVk7zGkwHJbfjgp1J9Y30o1fMKHOE5rxyhwQw%2B%5CDH6Md%5CpJZAAh2xkZ%3A1536204296617; __utma=94650624.1052021654.1536048185.1536199016.1536203113.27; __utmb=94650624.12.10.1536203113',
            'Host': 'music.163.com',
            'Referer': 'http://music.163.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/66.0.3359.181 Safari/537.36'}
        self.host_path = '../data/host.txt'
        self.cookie_path = '../data/cookie.txt'
        self.ip_queue = Queue()
        self.save_queue = Queue()  # results queue
        self.task_queue = Queue()  # task queue
        self.save_user_queue = Queue()  # commenter queue
        self.ip_pool = []  # proxy IP pool; get_comments_json reads it before ip_proxies ever runs
        self.conn_task = False
        self.conn_result = False
        self.conn_user = False
        self.prosiex_start = True  # whether the proxy-IP crawling thread should run
        self.num = 0  # start crawling from playlist 0
        self.listid = pd.read_csv('/Users/apple/PycharmProjects/WYY_sprider/demo/musicList.csv', dtype={'list_id': str, 'user_id': str})

    # Reconnect to the database
    def task_conn(self):
        self.mysqlCommand = MySQLCommand()
        self.mysqlCommand.connectdb()
        self.conn_task = True

    def result_conn(self):
        self.mysqlResult = MySQLCommand()
        self.mysqlResult.connectdb()
        self.conn_result = True

    def user_conn(self):
        self.mysqlUser = MySQLCommand()
        self.mysqlUser.connectdb()
        self.conn_user = True

    # Generate 16 random characters
    def generate_random_strs(self, length):
        string = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
        # loop counter
        i = 0
        # initialize the random string
        random_strs = ""
        while i < length:
            e = random.random() * len(string)
            # round down to get a valid index
            e = math.floor(e)
            random_strs = random_strs + string[e]
            i = i + 1
        return random_strs

    # AES encryption
    def AESencrypt(self, msg, key):
        # pad to a multiple of 16 bytes (PKCS#7)
        padding = 16 - len(msg) % 16
        # pad with the char whose code equals the padding length
        msg = msg + padding * chr(padding)
        # initialization vector for CBC mode (must be 16 bytes)
        iv = '0102030405060708'

        # note: pycryptodome expects bytes for key, IV and plaintext
        cipher = AES.new(key.encode('utf-8'), AES.MODE_CBC, iv.encode('utf-8'))

        # encryption yields bytes
        encryptedbytes = cipher.encrypt(msg.encode('utf-8'))
        # Base64-encode, returns a byte string
        encodestrs = base64.b64encode(encryptedbytes)
        # decode the byte string as utf-8
        enctext = encodestrs.decode('utf-8')

        return enctext

    # RSA encryption
    def RSAencrypt(self, randomstrs, key, f):
        # reverse the random string
        string = randomstrs[::-1]
        # convert the random string to bytes
        text = bytes(string, 'utf-8')
        seckey = int(codecs.encode(text, encoding='hex'), 16) ** int(key, 16) % int(f, 16)
        return format(seckey, 'x').zfill(256)

    # Build the encrypted request parameters
    def get_params(self, page):
        # msg can also be written as msg = {"offset": "(page - 1) * 20", "limit": "20"}; offset and limit are required by the site's JS
        # limit maxes out at 100; when set to 100, page 2 assumes the previous page held 20 comments, so page 2 has 80 new comments plus the 20 already shown on page 1
        # msg = '{"rid":"R_SO_4_1302938992","offset":"0","total":"True","limit":"100","csrf_token":""}'
        # page offset
        offset = (page - 1) * 20
        # offset and limit are required; the other parameters are optional and do not affect the generated data
        msg = '{"offset":' + str(offset) + ',"total":"True","limit":"20","csrf_token":""}'
        key = '0CoJUm6Qyw8W8jud'
        f = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
        e = '010001'
        enctext = self.AESencrypt(msg, key)
        # generate a random string of length 16
        i = self.generate_random_strs(16)

        # two rounds of AES encryption yield the params value
        encText = self.AESencrypt(enctext, i)
        # RSA encryption yields the encSecKey value
        encSecKey = self.RSAencrypt(i, e, f)
        return encText, encSecKey

    def check_headers(self):
        cookie_list = []
        with open(self.cookie_path, 'r') as fp:
            for i in fp.readlines():
                i = json.loads(i)
                cookie_list.append(i)
        self.headers['Cookie'] = random.choice(cookie_list)['cookie']

    # Check whether a proxy IP works
    def check_ip(self, proxies):
        try:
            header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                    'AppleWebKit/537.36 (KHTML, like Gecko) '
                                    'Chrome/64.0.3282.186 Safari/537.36'}
            ip = '://' + proxies['ip'] + ':' + proxies['port']
            proxies = {'https': 'https' + ip}
            url = 'https://www.ipip.net/'
            r = requests.get(url, headers=header, proxies=proxies, timeout=5)
            r.raise_for_status()
        except:
            return False
        else:
            print(proxies, 'check passed!')
            return True

    # Harvest proxy IPs
    def ip_proxies(self):
        api = 'http://www.xicidaili.com/wn/{}'
        header = {
            'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJTZlOTVjNGQ1MmUxMDlmNzhlNjkwMDU3MDUxMTQ4YTUwBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUpRcU9ySVRNcmlOTytuNm9ZWm53RUFDYzhzTnZCbGlNa0ZIaHJzancvZEU9BjsARg%3D%3D--742b1937a06cc747483cd594752ef2ae80fc4d91; Hm_lvt_0cf76c77469e965d2957f0553e6ecf59=1577952296; Hm_lpvt_0cf76c77469e965d2957f0553e6ecf59=1578016572',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/'
                          '537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
            'Host': 'www.xicidaili.com',
            'Connection': 'keep-alive',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Cache-Control': 'no-cache'}

        fp = open(self.host_path, 'a+', encoding='utf-8')
        self.ip_pool = []
        for i in range(20):
            # page through the proxy listing; the original formatted page 1 every time
            url = api.format(i + 1)
            response = requests.get(url=url, headers=header)
            time.sleep(3)
            soup = BeautifulSoup(response.text, 'html.parser')
            container = soup.find_all(name='tr', attrs={'class': 'odd'})
            for tag in container:
                try:
                    con_soup = BeautifulSoup(str(tag), 'html.parser')
                    td_list = con_soup.find_all('td')
                    ip = str(td_list[1])[4:-5]
                    port = str(td_list[2])[4:-5]
                    _type = td_list[5].text
                    IPport = {'ip': ip, 'port': port, 'type': _type.lower()}
                    if self.check_ip(IPport):
                        IPport = json.dumps(IPport)
                        self.ip_pool.append(IPport)
                        fp.write(IPport)
                        fp.write('\n')
                        self.ip_queue.put(IPport)
                except Exception as e:
                    print('No IP!')
            if self.prosiex_start is False:
                break
        fp.close()

    # Read proxies from host.txt
    def ip_txt(self):
        print('Not enough crawled proxy IPs, adding from host.txt...')
        with open(self.host_path, 'r') as fp:
            ip_port = fp.readlines()
            for i in ip_port:
                self.ip_pool.append(i)
                self.ip_queue.put(i)

    def get_comments_json(self, url, data):
        repeat = 0
        while repeat < 4:
            try:
                r = requests.post(url, headers=self.headers, data=data)
                time.sleep(repeat+2)
                r.encoding = "utf-8"
                if r.status_code == 200:
                    # the response body is JSON
                    result = r.json()
                    if 'total' in result.keys():
                        total = result['total']
                        repeat = 0
                        self.ip_pool = []
                        return result, total
                    elif 'code' in result.keys():
                        if result['code'] == -460:
                            if repeat < 3:
                                self.check_headers()
                            else:
                                if len(self.ip_pool) < 10:
                                    Thread(target=self.ip_proxies, args=()).start()
                                if len(self.ip_pool) < 10:
                                    self.ip_txt()
                                result, total = self.ip_spider(url, data)
                                if result is None:
                                    self.prosiex_start = False
                                    for i in range(90000):
                                        print('\r IP may be banned and no proxy works! Waiting ' + str(90000 - i) + ' seconds...', sep=' ', end='', flush=True)
                                        time.sleep(1)
                                    self.prosiex_start = True
                                else:
                                    self.prosiex_start = True
                                    return result, total
                            repeat += 1
                else:
                    repeat += 1  # non-200 response: count it as a retry instead of looping forever

            except:
                time.sleep(1)
                repeat += 1
                print("第%d次爬取url为%s 的页面失败!正重新尝试..." % (repeat, url))
        return None, None

    # Crawl through a proxy
    def ip_spider(self, url, data):
        repeat = 0
        while repeat < 50:
            proxies = self.ip_queue.get()
            proxies = json.loads(proxies)
            ip = '://' + proxies['ip'] + ':' + proxies['port']
            proxies = {'https': 'https' + ip}
            print('Using proxy:', proxies)
            try:
                r = requests.post(url, headers=self.headers, data=data, proxies=proxies)
                time.sleep(2)
                try:
                    r.encoding = 'utf-8'
                    result = r.json()
                except Exception as e:
                    print('Error:', e)
                    return r, None
                if 'code' in result.keys():
                    if result['code'] == -460:
                        repeat += 1
                        print('Proxy %r unusable, failed to fetch %s! Response: %s, retry #%d' % (proxies, url, result, repeat + 1))
                if 'total' in result.keys():
                    total = result['total']
                    print('result: ', result)
                    return result, total
            except Exception as e:
                print('Proxy %r failed to fetch %s! Reason: %s, retry #%d' % (proxies, url, e, repeat + 1))
                repeat += 1
        print('Returning None')
        return None, None

    # Sanitize text with a regex
    def re_value(self, value):
        value = re.sub(r'\r|\n|\\|\'|\{|\}|\"', ' ', value)
        return value

    # Get hot comments
    def hot_comments(self, html, list_id, pages, total, creater_id):
        try:
            print("正在获取歌单{}的热门评论,总共有{}页{}条评论!".format(list_id, pages, total))
            if 'hotComments' in html:
                for item in html['hotComments']:
                    # user who posted the hot comment
                    user = item['user']
                    if item['content'] is not None:
                        comment = self.re_value(item['content'])
                    else:
                        comment = ''
                    # assemble the comment record
                    hot_comment = {'hot_comment': '1', 'user_id': str(user['userId']), 'comment': comment,
                                   'likedCount': str(item['likedCount']), 'time': str(item['time']), 'list_id': list_id,
                                   'creater_id': creater_id}
                    self.save_user_queue.put(str(user['userId']))
                    # replies to this comment
                    reply_comment = []
                    if len(item['beReplied']) != 0:
                        for reply in item['beReplied']:
                            # user who posted the reply
                            reply_user = reply['user']
                            if reply['content'] is not None:
                                content = self.re_value(reply['content'])
                            else:
                                content = ''
                            reply_comment.append({'user_id': reply_user['userId'], 'content': content})
                            self.save_user_queue.put(str(reply_user['userId']))
                    hot_comment['reply'] = str(reply_comment)
                    self.save_queue.put(hot_comment)
        except Exception as e:
            print('Failed to get comments for playlist {}, reason: {}'.format(list_id, e))
            return False

    # Get regular comments
    def comments(self, html, list_id, i, pages, total, creater_id):
        # try:
        print("Fetching comments for playlist {}, page {}: {} pages, {} comments in total!".format(list_id, i, pages, total))
        # all the comments
        for item in html['comments']:
            # user who posted the comment
            user = item['user']
            if item['content'] is not None:
                comment = self.re_value(item['content'])
            else:
                comment = ''
            comment = {'hot_comment': '0', 'user_id': str(user['userId']), 'comment': comment,
                       'likedCount': str(item['likedCount']), 'time': str(item['time']), 'list_id': list_id,
                       'creater_id': creater_id}
            self.save_user_queue.put(str(user['userId']))
            # replies to this comment
            reply_comment = []
            if len(item['beReplied']) != 0:
                for reply in item['beReplied']:
                    # user who posted the reply
                    reply_user = reply['user']
                    if reply['content'] is not None:
                        content = self.re_value(reply['content'])
                    else:
                        content = ''
                    reply_comment.append({'user_id': reply_user['userId'], 'content': content})
                    self.save_user_queue.put(str(reply_user['userId']))
            comment['reply'] = str(reply_comment)
            self.save_queue.put(comment)
        return True
        # except Exception as e:
        #     print('Failed to get comments for playlist {}, page {}, reason: {}'.format(list_id, i, e))
        #     return False

    def page_spider(self):
        while True:
            list_id, creater_id = self.task_queue.get()
            print('Start crawling all comments for playlist %s!!!!!' % list_id)
            url1 = 'https://music.163.com/playlist?id=' + list_id
            url = 'https://music.163.com/weapi/v1/resource/comments/A_PL_0_' + list_id + '?csrf_token='
            page = 1
            params, encSecKey = self.get_params(page)
            data = {'params': params, 'encSecKey': encSecKey}
            # fetch the first page of comments
            try:
                html, total = self.get_comments_json(url, data)
                # total number of comments
                if html is None:
                    continue
                if 'comments' in html.keys():
                    if html['comments'] is None:
                        try:
                            requests.get(url1, headers=self.headers)
                        except:
                            pass
                        html, total = self.get_comments_json(url, data)
                        if html is None:
                            continue
            except Exception as e:
                print('Playlist %s: comment crawl failed! Reason: %s' % (list_id, e))
                continue
            # total number of pages
            pages = math.ceil(total / 20)
            try:
                self.hot_comments(html, list_id, pages, total, creater_id)
            except Exception as e:
                print('Playlist %s: hot-comment crawl failed! Reason: %s' % (list_id, e))
            try:
                self.comments(html, list_id, page, pages, total, creater_id)
            except Exception as e:
                print('Playlist %s: first page of regular comments failed! Reason: %s' % (list_id, e))

            # start fetching the rest of the comment pages
            page = 2
            reverse = False  # if a requested page comes back empty, crawl backwards from the last page
            while True:
                if page == 0:
                    break
                params, encSecKey = self.get_params(page)
                data = {'params': params, 'encSecKey': encSecKey}
                html, total = self.get_comments_json(url, data)
                # crawling backwards has exhausted all requestable pages; stop
                if reverse is True and len(html['comments']) == 0:
                    break

                if len(html['comments']) == 0:
                    reverse = True
                    page = pages
                    print('Switching to reverse-order crawling!')
                    continue
                # fetch comments from page 2 onwards
                try:
                    self.comments(html, list_id, page, pages, total, creater_id)
                except Exception as e:
                    print('Playlist %s: page %d of regular comments failed! Reason: %s' % (list_id, page, e))
                    print('Crawling it again!')
                    if 'total' in str(e):
                        for i in range(90000):
                            print('\r IP may be banned, waiting ' + str(90000 - i) + ' seconds...', sep=' ', end='', flush=True)
                            time.sleep(1)
                    elif 'comments' in str(e):
                        for i in range(10000):
                            print('\r IP may be banned, waiting ' + str(10000 - i) + ' seconds...', sep=' ', end='', flush=True)
                            time.sleep(1)
                    else:
                        continue
                if reverse is False:
                    page += 1
                else:
                    page -= 1
                # all pages crawled; stop
                if page > pages:
                    break

    # Connect to the wyy_spider database
    def conn_data(self):
        while True:
            print('Connecting to the MySQL server...')
            try:
                conn = pymysql.connect(
                    host='localhost',
                    user='******',
                    passwd='0321',
                    port=3306,
                    db='wyy_spider',
                    charset='utf8mb4',
                    cursorclass=pymysql.cursors.DictCursor
                )
                cursor = conn.cursor()
                print('Connected to wyy_spider!')
                return conn, cursor
            except:
                print('Failed to connect to wyy_spider!')
                time.sleep(2)

    # Get tasks from musicList.csv
    def sql_task(self):
        conn, cursor = self.conn_data()
        data = self.listid.loc[:, ['list_id', 'user_id']]
        num = 0
        for listId, userId in zip(data['list_id'], data['user_id']):
            sql = "select user_id from list_comment where list_id=%s limit 1" % listId
            cursor.execute(sql)
            music_ids = cursor.fetchall()
            if len(music_ids) == 0:
                print('Start crawling comments for playlist %s ...' % listId)
                num += 1
            else:
                num += 1
                print('===' * 10, 'playlist %s done, #%d' % (listId, num), '===' * 10)
                continue
            if num >= self.num:
                list_id = listId.strip()
                creater_id = userId.strip()
                self.task_queue.put([list_id, creater_id])
                time.sleep(15)

            print('===' * 10, 'playlist %s done, #%d' % (listId, num), '===' * 10)

    # Save comments to the database
    def save_result(self):
        while True:
            comment = self.save_queue.get()
            if self.conn_result is False:
                self.result_conn()
            try:
                self.mysqlResult.insert_list_comm(comment)
            except:
                self.conn_result = False

    # Save commenters to the database
    def save_user(self):
        while True:
            comment_user = self.save_user_queue.get()
            if self.conn_user is False:
                self.user_conn()
            try:
                self.mysqlUser.insert_co_user(comment_user)
            except:
                self.conn_user = False

    def spider_main(self):
        # Thread(target=self.page_spider, args=()).start()
        # Thread(target=self.page_spider, args=()).start()
        # Thread(target=self.page_spider, args=()).start()
        Thread(target=self.page_spider, args=()).start()
        Thread(target=self.save_result, args=()).start()
        Thread(target=self.save_user, args=()).start()
        self.sql_task()
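
A note on the proxies mapping that check_ip and ip_spider build: requests takes a dict keyed by URL scheme. A minimal sketch with an illustrative proxy address:

import requests

proxies = {'https': 'https://127.0.0.1:8888'}  # illustrative proxy address
r = requests.get('https://www.ipip.net/', headers={'User-Agent': 'Mozilla/5.0'},
                 proxies=proxies, timeout=5)
r.raise_for_status()  # raises requests.HTTPError on a 4xx/5xx response, which check_ip treats as unusable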
Example #10
class DataToCSV(object):
    def __init__(self):
        self.music_queue = mp.Queue()
        self.user_queue = mp.Queue()
        self.list_queue = mp.Queue()
        self.singer_queue = mp.Queue()
        self.comment_queue = mp.Queue()
        self.music = False
        self.user = False
        self.list = False
        self.comm = False
        self.singer = False
        self.list_comm = False

    # Reconnect to the database
    def conn_music(self):
        self.mysqlMusic = MySQLCommand()
        self.mysqlMusic.connectdb()
        self.music = True

    def conn_list(self):
        self.mysqlList = MySQLCommand()
        self.mysqlList.connectdb()
        self.list = True

    def conn_user(self):
        self.mysqlUser = MySQLCommand()
        self.mysqlUser.connectdb()
        self.user = True

    def conn_comm(self):
        self.mysqlComment = MySQLCommand()
        self.mysqlComment.connectdb()
        self.comm = True

    def conn_list_comm(self):
        self.mysqlLcomm = MySQLCommand()
        self.mysqlLcomm.connectdb()
        self.list_comm = True

    def conn_singer(self):
        self.mysqlSinger = MySQLCommand()
        self.mysqlSinger.connectdb()
        self.singer = True

    # def sql_music(self):
    #     while True:
    #         if self.music is False:
    #             self.conn_music()
    #             self.music = True
    #         try:
    #             self.mysqlMusic.cursor.execute("select * from music")
    #             music_ids = self.mysqlMusic.cursor.fetchall()
    #             break
    #         except:
    #             self.music = False
    #     with open("music.csv", "w") as csvfile:
    #         writer = csv.writer(csvfile)
    #         # write the header row first
    #         writer.writerow(["song_id", "song_name", "singer_id", "album_id", "containing_playlist_ids", "similar_song_ids", "lyrics", "lyrics_contributor_id", "comment_count"])
    #         # use writerows to write many rows at once
    #         for id in music_ids:
    #
    #             music_id = id.get('music_id')
    #             singer_id = id.get('singer_id')
    #             music_name = id.get('music_name')
    #             album_id = id.get('album_id')
    #             contain_list = id.get('contain_list')
    #             simple_music = id.get('simple_music')
    #             song_lynic = id.get('song_lynic')
    #             lynic_user = id.get('lynic_user')
    #             if song_lynic == '' or song_lynic is None:
    #                 if random.randint(1, 10) >= 3:
    #                     continue
    #             comment_num = id.get('comment_num')
    #             writer.writerow([music_id, music_name, singer_id, album_id, contain_list, simple_music, song_lynic, lynic_user, comment_num])

    def sql_list(self):
        if self.list is False:
            self.conn_list()
            self.list = True
        self.mysqlList.cursor.execute("select * from music_list")
        list_ids = self.mysqlList.cursor.fetchall()
        num = 0
        with open("music_list.csv", "w", newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            # write the header row first
            writer.writerow([
                "list_id", "list_name", "list_user_id", "tags", "e_tags",
                "create_time", "update_time", "authority", "music_count",
                "play_count", "special_category", "subscription_count",
                "cloud_count", "music_count_update_time", "music_update_time",
                "is_quality", "list_describe", "like_user", "music_ids",
                "hot_list"
            ])
            # use writerows to write many rows at once
            for row in list_ids:
                if row is None:
                    continue
                list_id = row.get('id')
                musicId = row.get('musicId')
                if musicId is None:
                    continue
                list_name = row.get('name')
                user_id = row.get('userId')
                createTime = row.get('createTime')
                updateTime = row.get('updateTime')
                description = row.get('description')
                trackCount = row.get('trackCount')
                authority = row.get('authority')
                playCount = row.get('playCount')
                specialType = row.get('specialType')
                expertTags = row.get('expertTags')
                tags = row.get('tags')
                if len(str(tags).strip()) < 5:
                    continue
                num += 1
                subscribedCount = row.get('subscribedCount')
                cloudTrackCount = row.get('cloudTrackCount')
                trackUpdateTime = row.get('trackUpdateTime')
                trackNumberUpdateTime = row.get('trackNumverUpdateTime')  # key spelling kept; it presumably matches the DB column
                highQuality = row.get('highQuality')
                userLikeId = row.get('userLikeId')
                hotlist = row.get('hotlist')
                writer.writerow([
                    list_id, list_name, user_id, tags, expertTags, createTime,
                    updateTime, authority, trackCount, playCount, specialType,
                    subscribedCount, cloudTrackCount, trackNumberUpdateTime,
                    trackUpdateTime, highQuality, description, userLikeId,
                    musicId, hotlist
                ])
        print(num)

    #
    # def sql_singer(self):
    #     if self.singer is False:
    #         self.conn_singer()
    #         self.singer = True
    #     self.mysqlSinger.cursor.execute("select * from singer")
    #     singer_ids = self.mysqlSinger.cursor.fetchall()
    #     with open("singer.csv", "w") as csvfile:
    #         writer = csv.writer(csvfile)
    #         # write the header row first
    #         writer.writerow(["singer_id", "singer_name", "homepage_id", "homepage_top50_songs"])
    #         # use writerows to write many rows at once
    #         for id in singer_ids:
    #             artist_id = id.get('artist_id')
    #             artist_name = id.get('artist_name')
    #             homepage_id = id.get('homepage_id')
    #             top50 = id.get('top50_song_dict')
    #             if top50 == '' or top50 is None:
    #                 if random.randint(1, 10) >= 3:
    #                     continue
    #             writer.writerow([artist_id, artist_name, homepage_id, top50])
    #
    # def sql_user(self):
    #     if self.user is False:
    #         self.conn_user()
    #         self.user = True
    #     self.mysqlUser.cursor.execute("select * from user limit 0, 600000")
    #     user_ids = self.mysqlUser.cursor.fetchall()
    #     with open("user.csv", "w") as csvfile:
    #         writer = csv.writer(csvfile)
    #         # write the header row first
    #         writer.writerow(['user_id', 'nickname', 'gender', 'province', 'city', 'birthday', 'description', 'detailed_description', 'expert_tags', 'signature',
    #                          'user_type', 'vip_type', 'follow_count', 'follower_count', 'event_count', 'playlist_count', 'created_time', 'playlist_ids', 'listened_this_week', 'listened_before', 'listened_song_count'])
    #         # use writerows to write many rows at once
    #         for id in user_ids:
    #             user_id = id.get('userId')
    #             nickname = id.get('nickname')
    #             province = id.get('province')
    #             city = id.get('city')
    #             birthday = id.get('birthday')
    #             detailDescription = id.get('detailDescription')
    #             description = id.get('description')
    #             expertTags = id.get('expertTags')
    #             signature = id.get('signature')
    #             userType = id.get('userType')
    #             vipType = id.get('vipType')
    #             list_id = id.get('list_id')
    #             eventCount = id.get('eventCount')
    #             followeds = id.get('followeds')
    #             follows = id.get('follows')
    #             gender = id.get('gender')
    #             playlistCount = id.get('playlistCount')
    #             time = id.get('time')
    #             week_music = id.get('week_music')
    #             all_music = id.get('all_music')
    #             listen_num = id.get('listen_num')
    #             if province == '' or province is None:
    #                 if random.randint(1, 10) >= 3:
    #                     continue
    #             if week_music == '' or week_music is None:
    #                 if random.randint(1, 10) >= 9:
    #                     continue
    #             writer.writerow([user_id, nickname, gender, province, city, birthday, description, detailDescription, expertTags,
    #                                    signature, userType, vipType, follows, followeds, eventCount, playlistCount, time, list_id, week_music, all_music, listen_num])
    #
    # def sql_comments(self):
    #     if self.comm is False:
    #         self.conn_comm()
    #         self.comm = True
    #     self.mysqlComment.cursor.execute("select * from comments limit 100000, 600000")
    #     comment_ids = self.mysqlComment.cursor.fetchall()
    #     with open("comment.csv", "w") as csvfile:
    #         writer = csv.writer(csvfile)
    #         # write the header row first
    #         writer.writerow(['song_id', 'user_id', 'singer_id', 'comment_time', 'is_hot_comment', 'liked_count', 'comment', 'replies'])
    #         # use writerows to write many rows at once
    #         for id in comment_ids:
    #
    #             music_id = id.get('music_id')
    #             user_id = id.get('user_id')
    #             hot_comment = id.get('hot_comment')
    #             comment = id.get('comment')
    #             likedCount = id.get('likedCount')
    #             time = id.get('time')
    #             singer_id = id.get('singer_id')
    #             reply = id.get('reply')
    #             writer.writerow([music_id, user_id, singer_id, time, hot_comment, likedCount, comment, reply])
    #
    # def sql_list_comments(self):
    #     if self.list_comm is False:
    #         self.conn_list_comm()
    #         self.list_comm = True
    #     self.mysqlLcomm.cursor.execute("select * from list_comment limit 0, 600000")
    #     comment_ids = self.mysqlLcomm.cursor.fetchall()
    #     with open("list_comment.csv", 'w') as csvfile:
    #         writer = csv.writer(csvfile)
    #         writer.writerow(['playlist_id', 'creator_id', 'commenter_id', 'comment_time', 'is_hot_comment', 'liked_count', 'comment', 'replies'])
    #         for id in comment_ids:
    #             list_id = id.get('list_id')
    #             user_id = id.get('user_id')
    #             hot_comment = id.get('hot_comment')
    #             likedCount = id.get('likedCount')
    #             time = id.get('time')
    #             comment = id.get('comment')
    #             creater_id = id.get('creater_id')
    #             reply = id.get('reply')
    #             writer.writerow([list_id, creater_id, user_id, time, hot_comment, likedCount, comment, reply])

    def execute_main(self):
        # Thread(target=self.sql_singer, args=()).start()
        # Thread(target=self.sql_comments, args=()).start()
        Thread(target=self.sql_list, args=()).start()
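
csv.writer expects its file opened with newline='' so that the writer controls line endings (without it, Windows inserts blank rows between records). A minimal sketch with an illustrative file name:

import csv

with open('demo.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['list_id', 'list_name'])   # header row
    writer.writerows([['1', 'a'], ['2', 'b']])  # writerows writes many rows at once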
Example #11
class SingerSpider(object):
    def __init__(self):
        self.list1 = [
            1001, 1002, 1003, 2001, 2002, 2003, 6001, 6002, 6003, 7001, 7002,
            7003, 4001, 4002, 4003
        ]
        self.list2 = [
            -1, 0, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
            80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90
        ]  # values for the "initial" URL parameter
        self.headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Connection':
            'keep-alive',
            'Cookie':
            '_iuqxldmzr_=32; _ntes_nnid=0e6e1606eb78758c48c3fc823c6c57dd,1527314455632; '
            '_ntes_nuid=0e6e1606eb78758c48c3fc823c6c57dd; __utmc=94650624; __utmz=94650624.1527314456.1.1.'
            'utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); WM_TID=blBrSVohtue8%2B6VgDkxOkJ2G0VyAgyOY;'
            ' JSESSIONID-WYYY=Du06y%5Csx0ddxxx8n6G6Dwk97Dhy2vuMzYDhQY8D%2BmW3vlbshKsMRxS%2BJYEnvCCh%5CKY'
            'x2hJ5xhmAy8W%5CT%2BKqwjWnTDaOzhlQj19AuJwMttOIh5T%5C05uByqO%2FWM%2F1ZS9sqjslE2AC8YD7h7Tt0Shufi'
            '2d077U9tlBepCx048eEImRkXDkr%3A1527321477141; __utma=94650624.1687343966.1527314456.1527314456'
            '.1527319890.2; __utmb=94650624.3.10.1527319890',
            'Host':
            'music.163.com',
            'Referer':
            'http://music.163.com/',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/66.0.3359.181 Safari/537.36'
        }
        self.mysqlCommand = MySQLCommand()
        self.mysqlCommand.connectdb()

    # Get singer info
    def get_singer_info(self, artist_id):
        song_dict = dict()  # the singer's hot songs
        try:
            url = 'https://music.163.com/artist?id=' + artist_id
            r = requests.get(url, headers=self.headers)
            soup = BeautifulSoup(r.text, 'html5lib')
            try:
                singer_homepage = soup.find('a',
                                            attrs={'class': 'btn-rz f-tid'})
                singer_homepage = singer_homepage['href'].replace(
                    '/user/home?id=', '').strip()
            except:
                singer_homepage = ''
            try:
                song_list = str(soup.find_all('ul', attrs={'class': 'f-hide'}))
                song_list = BeautifulSoup(song_list, 'html5lib')
                song_list = song_list.find_all('a')
            except:
                song_list = []
            for song in song_list:
                song_name = song.string
                song_id = song['href'].replace('/song?id=', '').strip()
                song_dict[song_id] = song_name
            # serialize the hot-song dict as JSON directly; str() first would double-encode it
            song_dict = json.dumps(song_dict, ensure_ascii=False)
            return singer_homepage, song_dict
        except:
            return '', json.dumps({})

    # Get all singers
    def get_all_singer(self, url):
        r = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(r.text, 'html5lib')

        for artist in soup.find_all('a',
                                    attrs={'class':
                                           'nm nm-icn f-thide s-fc0'}):

            artist_name = artist.string
            artist_id = artist['href'].replace('/artist?id=', '').strip()
            singer_homepage, song_dict = self.get_singer_info(artist_id)
            print(artist_id, artist_name, singer_homepage)
            try:
                self.mysqlCommand.insert_singer(artist_id, artist_name,
                                                singer_homepage, song_dict)
            except Exception as msg:
                print(msg)

    # spider main entry point
    def spider_main(self):
        print('Start crawling singer info...')
        for i in self.list1:
            for j in self.list2:
                url = 'http://music.163.com/discover/artist/cat?id=' + str(
                    i) + '&initial=' + str(j)
                self.get_all_singer(url)
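
list1 holds the artist category IDs and list2 the 'initial' filters, so spider_main visits one listing URL per (id, initial) pair; 65 through 90 are the ASCII codes for 'A' through 'Z', and -1 and 0 appear to be the site's catch-all buckets. A small sketch previewing a few of the URLs:

list1 = [1001, 1002, 1003]  # a subset of the category IDs above
list2 = [-1, 0, 65]         # a subset of the 'initial' values above
for i in list1:
    for j in list2:
        print('http://music.163.com/discover/artist/cat?id=%d&initial=%d' % (i, j))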
Example #12
class CommSpider(object):
    def __init__(self):
        self.headers = {
            'Accept':
            '*/*',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Connection':
            'keep-alive',
            'Host':
            'music.163.com',
            'Origin':
            'http://music.163.com',
            'Cookie':
            '_iuqxldmzr_=32; _ntes_nnid=0e6e1606eb78758c48c3fc823c6c57dd,1527314455632; '
            '_ntes_nuid=0e6e1606eb78758c48c3fc823c6c57dd; __utmc=94650624; __utmz=94650624.1527314456.1.1.'
            'utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); WM_TID=blBrSVohtue8%2B6VgDkxOkJ2G0VyAgyOY;'
            ' JSESSIONID-WYYY=Du06y%5Csx0ddxxx8n6G6Dwk97Dhy2vuMzYDhQY8D%2BmW3vlbshKsMRxS%2BJYEnvCCh%5CKY'
            'x2hJ5xhmAy8W%5CT%2BKqwjWnTDaOzhlQj19AuJwMttOIh5T%5C05uByqO%2FWM%2F1ZS9sqjslE2AC8YD7h7Tt0Shufi'
            '2d077U9tlBepCx048eEImRkXDkr%3A1527321477141; __utma=94650624.1687343966.1527314456.1527314456'
            '.1527319890.2; __utmb=94650624.3.10.1527319890',
            'Referer':
            'http://music.163.com/',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
            'Content-Type':
            'application/x-www-form-urlencoded'
        }
        self.host_path = '../data/host.txt'
        self.cookie_path = '../data/cookie.txt'
        self.ip_queue = Queue()
        self.save_queue = Queue()  # results queue
        self.task_queue = Queue()  # task queue
        self.save_user_queue = Queue()  # commenter queue
        self.ip_pool = []  # proxy IP pool
        self.conn_task = False
        self.conn_result = False
        self.conn_user = False
        self.prosiex_start = True  # whether the proxy-IP crawling thread should run

    # Reconnect to the database
    def task_conn(self):
        self.mysqlCommand = MySQLCommand()
        self.mysqlCommand.connectdb()
        self.conn_task = True
        time.sleep(1)

    def result_conn(self):
        self.mysqlResult = MySQLCommand()
        self.mysqlResult.connectdb()
        self.conn_result = True
        time.sleep(1)

    def user_conn(self):
        self.mysqlUser = MySQLCommand()
        self.mysqlUser.connectdb()
        self.conn_user = True
        time.sleep(1)

    def check_headers(self):
        cookie_list = []
        with open(self.cookie_path, 'r') as fp:
            for i in fp.readlines():
                i = json.loads(i)
                cookie_list.append(i)
        self.headers['Cookie'] = random.choice(cookie_list)['cookie']

    # Check whether a proxy IP works
    def check_ip(self, proxies):
        try:
            header = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/64.0.3282.186 Safari/537.36'
            }
            ip = '://' + proxies['ip'] + ':' + proxies['port']
            proxies = {'https': 'https' + ip}
            url = 'https://www.ipip.net/'
            r = requests.get(url, headers=header, proxies=proxies, timeout=5)
            r.raise_for_status()
        except Exception:
            return False
        else:
            print(proxies, 'passed the check!')
            return True

    # Harvest proxy IPs from xicidaili
    def ip_proxies(self):
        api = 'http://www.xicidaili.com/wn/{}'
        header = {
            'Cookie':
            '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJTZlOTVjNGQ1MmUxMDlmNzhlNjkwMDU3MDUxMTQ4YTUwBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUpRcU9ySVRNcmlOTytuNm9ZWm53RUFDYzhzTnZCbGlNa0ZIaHJzancvZEU9BjsARg%3D%3D--742b1937a06cc747483cd594752ef2ae80fc4d91; Hm_lvt_0cf76c77469e965d2957f0553e6ecf59=1577952296; Hm_lpvt_0cf76c77469e965d2957f0553e6ecf59=1578016572',
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/'
            '537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
            'Host':
            'www.xicidaili.com',
            'Connection':
            'keep-alive',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Cache-Control':
            'no-cache'
        }

        fp = open(self.host_path, 'a+', encoding='utf-8')
        self.ip_pool = []
        for i in range(20):
            page_url = api.format(i + 1)  # advance the page index each pass
            response = requests.get(url=page_url, headers=header)
            time.sleep(3)
            soup = BeautifulSoup(response.text, 'html.parser')
            container = soup.find_all(name='tr', attrs={'class': 'odd'})
            for tag in container:
                try:
                    td_list = tag.find_all('td')  # tag is already a parsed element
                    ip = td_list[1].text
                    port = td_list[2].text
                    _type = td_list[5].text
                    IPport = {'ip': ip, 'port': port, 'type': _type.lower()}
                    if self.check_ip(IPport):
                        IPport = json.dumps(IPport)
                        self.ip_pool.append(IPport)
                        fp.write(IPport)
                        fp.write('\n')
                        self.ip_queue.put(IPport)
                except Exception as e:
                    print('Failed to parse a proxy row:', e)
            if self.proxies_start is False:
                break
        fp.close()

    # Load proxies from host.txt
    def ip_txt(self):
        print('Not enough crawled proxies; topping up from host.txt...')
        with open(self.host_path, 'r') as fp:
            ip_port = fp.readlines()
            for i in ip_port:
                self.ip_pool.append(i)
                self.ip_queue.put(i)
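        # host.txt holds one JSON proxy record per line, exactly what
        # ip_proxies() writes, e.g. {"ip": "1.2.3.4", "port": "8080", "type": "https"}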

    # Generate a random string of the given length
    def generate_random_strs(self, length):
        string = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
        # loop counter
        i = 0
        # accumulate the random string
        random_strs = ""
        while i < length:
            e = random.random() * len(string)
            # round down to an index
            e = math.floor(e)
            random_strs = random_strs + string[e]
            i = i + 1
        return random_strs
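        # Equivalent one-liner:
        #   ''.join(random.choice(string) for _ in range(length))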

    # AES encryption
    def AESencrypt(self, msg, key):
        # pad msg up to a multiple of 16 bytes (fine here: the payloads are ASCII)
        padding = 16 - len(msg) % 16
        # each pad character's value equals the pad length
        msg = msg + padding * chr(padding)
        # initialization vector for CBC mode (must be 16 bytes)
        iv = '0102030405060708'

        # pycryptodome expects bytes for the key, IV and plaintext
        cipher = AES.new(key.encode('utf-8'), AES.MODE_CBC, iv.encode('utf-8'))

        # encryption yields bytes
        encryptedbytes = cipher.encrypt(msg.encode('utf-8'))
        # Base64-encode them, which returns a byte string
        encodestrs = base64.b64encode(encryptedbytes)
        # decode the byte string as utf-8
        enctext = encodestrs.decode('utf-8')

        return enctext
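        # Padding check: a 26-byte msg gets padding = 16 - 26 % 16 = 6, i.e.
        # six chr(6) characters appended -- the PKCS#7 convention.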

    # RSA encryption
    def RSAencrypt(self, randomstrs, key, f):
        # reverse the random string
        string = randomstrs[::-1]
        # convert it to bytes
        text = bytes(string, 'utf-8')
        seckey = pow(int(codecs.encode(text, encoding='hex'), 16),
                     int(key, 16), int(f, 16))
        return format(seckey, 'x').zfill(256)
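        # This is textbook RSA (m ** e mod n) on the reversed random key, with
        # no OAEP/PKCS#1 padding, mirroring what the site's JS does in the
        # browser; three-argument pow() does the modular exponentiation
        # without materializing the huge intermediate power.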

    # Build the encrypted request parameters
    def get_params(self, page):
        # msg can also be written as {"offset": "(page - 1) * 20", "limit": "20"};
        # offset and limit are the two required fields (see the site's JS)
        # limit maxes out at 100; with limit=100, page 2 assumes the previous page
        # held 20 comments, so it returns 80 new ones plus 20 already shown on page 1
        # msg = '{"rid":"R_SO_4_1302938992","offset":"0","total":"True","limit":"100","csrf_token":""}'
        # page offset
        offset = (page - 1) * 20
        # offset and limit are required; the other fields are optional and do
        # not affect the generated data
        msg = '{"offset":' + str(
            offset) + ',"total":"True","limit":"20","csrf_token":""}'
        key = '0CoJUm6Qyw8W8jud'
        f = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
        e = '010001'
        enctext = self.AESencrypt(msg, key)
        # generate a 16-character random string
        i = self.generate_random_strs(16)

        # two rounds of AES encryption produce the params value
        encText = self.AESencrypt(enctext, i)
        # RSA encryption produces the encSecKey value
        encSecKey = self.RSAencrypt(i, e, f)
        return encText, encSecKey
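        # Usage sketch (this is exactly how page_spider below consumes the pair):
        #   params, encSecKey = self.get_params(1)
        #   data = {'params': params, 'encSecKey': encSecKey}
        #   requests.post(url, headers=self.headers, data=data)
        # where url is the weapi R_SO_4_<song_id> comments endpoint built in
        # page_spider.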

    # Crawl through a proxy
    def ip_spider(self, url, data):
        repeat = 0
        while repeat < 50:
            proxies = self.ip_queue.get()
            proxies = json.loads(proxies)
            ip = '://' + proxies['ip'] + ':' + proxies['port']
            proxies = {'https': 'https' + ip}
            print('Using proxy:', proxies)
            try:
                r = requests.post(url,
                                  headers=self.headers,
                                  data=data,
                                  proxies=proxies)
                time.sleep(2)
                try:
                    r.encoding = 'utf-8'
                    result = r.json()
                except Exception as e:
                    print('Error:', e)
                    return r, None
                if 'total' in result.keys():
                    total = result['total']
                    print('result: ', result)
                    return result, total
                if 'code' in result.keys() and result['code'] == -460:
                    print('Proxy %r is blocked; request to %s failed with %s, retry #%d' %
                          (proxies, url, result, repeat + 1))
                else:
                    print('Unexpected payload from %s: %s' % (url, result))
                repeat += 1
            except Exception as e:
                repeat += 1
                print('Proxy %r: request to %s failed (%s), retry #%d' %
                      (proxies, url, e, repeat))
        print('All proxy retries failed; returning None')
        return None, None

    def get_comments_json(self, url, data):
        repeat = 0
        while repeat < 4:
            try:
                r = requests.post(url, headers=self.headers, data=data)
                time.sleep(repeat + 2)
                r.encoding = "utf-8"
                if r.status_code == 200:
                    # the endpoint returns JSON
                    result = r.json()
                    if 'total' in result.keys():
                        total = result['total']
                        repeat = 0
                        self.ip_pool = []
                        return result, total
                    elif 'code' in result.keys():
                        if result['code'] == -460:
                            if repeat < 3:
                                self.check_headers()
                            else:
                                if len(self.ip_pool) < 10:
                                    Thread(target=self.ip_proxies,
                                           args=()).start()
                                if len(self.ip_pool) < 10:
                                    self.ip_txt()
                                result, total = self.ip_spider(url, data)
                                if result is None:
                                    self.proxies_start = False
                                    for i in range(90000):
                                        print('\r IP may be banned and no proxy works; waiting ' +
                                              str(90000 - i) + ' s...',
                                              sep=' ',
                                              end='',
                                              flush=True)
                                        time.sleep(1)
                                    self.proxies_start = True
                                else:
                                    self.proxies_start = True
                                    return result, total
                            repeat += 1
                else:
                    repeat += 1  # non-200 response: still count the attempt

            except Exception:
                time.sleep(1)
                repeat += 1
                print('Attempt %d to fetch %s failed; retrying...' % (repeat, url))
        return None, None

    # Scrub characters that would break the downstream string handling
    def re_value(self, value):
        value = re.sub(r'[\r\n\\\'{}"]', ' ', value)
        return value
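        # e.g. re_value('say\n"hi"') -> 'say  hi '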

    # Fetch the hot comments
    def hot_comments(self, html, song_id, pages, total, singer_id):
        print('Fetching hot comments for song {}: {} pages and {} comments in total!'.format(song_id, pages, total))
        if 'hotComments' in html:
            for item in html['hotComments']:
                # user who posted the hot comment
                user = item['user']
                if item['content'] is not None:
                    comment = self.re_value(item['content'])
                else:
                    comment = ''
                # assemble the record for the save queue
                hot_comment = {
                    'hot_comment': '1',
                    'user_id': str(user['userId']).strip(),
                    'comment': comment,
                    'likedCount': str(item['likedCount']),
                    'time': str(item['time']),
                    'music_id': song_id,
                    'singer_id': singer_id
                }
                self.save_user_queue.put(str(user['userId']).strip())
                # replies to this comment
                reply_comment = []
                if len(item['beReplied']) != 0:
                    for reply in item['beReplied']:
                        # user who posted the reply
                        reply_user = reply['user']
                        if reply['content'] is not None:
                            content = self.re_value(reply['content'])
                        else:
                            content = ''
                        reply_comment.append({
                            'user_id':
                            str(reply_user['userId']).strip(),
                            'content':
                            content
                        })
                        self.save_user_queue.put(
                            str(reply_user['userId']).strip())
                hot_comment['reply'] = str(reply_comment)
                self.save_queue.put(hot_comment)

    # Fetch the regular comments
    def comments(self, html, song_id, i, pages, total, singer_id):
        print('Fetching comments for song {}, page {}: {} pages and {} comments in total!'.format(song_id, i, pages, total))
        # all comments on this page
        for item in html['comments']:
            # user who posted the comment
            user = item['user']
            if item['content'] is not None:
                comment = self.re_value(item['content'])
            else:
                comment = ''
            comment = {
                'hot_comment': '0',
                'user_id': str(user['userId']).strip(),
                'comment': comment,
                'likedCount': str(item['likedCount']),
                'time': str(item['time']),
                'music_id': song_id,
                'singer_id': singer_id
            }
            self.save_user_queue.put(str(user['userId']).strip())
            # replies to this comment
            reply_comment = []
            if len(item['beReplied']) != 0:
                for reply in item['beReplied']:
                    # user who posted the reply
                    reply_user = reply['user']
                    if reply['content'] is not None:
                        content = self.re_value(reply['content'])
                    else:
                        content = ''
                    reply_comment.append({
                        'user_id':
                        str(reply_user['userId']).strip(),
                        'content':
                        content
                    })
                    self.save_user_queue.put(str(reply_user['userId']).strip())
            comment['reply'] = str(reply_comment)
            self.save_queue.put(comment)
        return True

    def page_spider(self):
        while True:
            songid, singer_id = self.task_queue.get()
            print('Start crawling all comments for song ID %s!' % songid)
            url1 = 'https://music.163.com/song?id=' + songid
            url = 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_' + songid + '?csrf_token='
            page = 1
            params, encSecKey = self.get_params(page)
            data = {'params': params, 'encSecKey': encSecKey}
            self.headers['Referer'] = 'https://music.163.com/song?id=%s' % songid
            # fetch the first page of comments
            try:
                html, total = self.get_comments_json(url, data)
                # total number of comments
                if html is None:
                    continue
                if 'comments' in html.keys():
                    if html['comments'] is None:
                        try:
                            requests.get(url1, headers=self.headers)
                            time.sleep(2)
                        except:
                            pass
                        html, total = self.get_comments_json(url, data)
                        if html is None:
                            continue
            except Exception as e:
                print('Song %s: fetching comments failed! Reason: %s' % (songid, e))
                if 'total' in str(e):
                    for i in range(90000):
                        print('\r IP may be banned; waiting ' + str(90000 - i) + ' s...',
                              sep=' ',
                              end='',
                              flush=True)
                        time.sleep(1)
                continue
            # total number of pages
            pages = math.ceil(total / 20)
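            # e.g. total = 230 comments -> math.ceil(230 / 20) = 12 pages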
            try:
                self.hot_comments(html, songid, pages, total, singer_id)
            except Exception as e:
                print('Song %s: fetching hot comments failed! Reason: %s' % (songid, e))
            try:
                self.comments(html, songid, page, pages, total, singer_id)
            except Exception as e:
                print('Song %s: fetching page 1 of comments failed! Reason: %s' % (songid, e))

            # fetch the rest of the comments
            page = 2
            reverse = False  # if a page comes back empty, crawl from the last page backwards
            while True:
                if page == 0:
                    break
                params, encSecKey = self.get_params(page)
                data = {'params': params, 'encSecKey': encSecKey}
                html, total = self.get_comments_json(url, data)
                if html is None:
                    # every retry failed; give up on this song
                    break
                # crawling backwards has exhausted all reachable pages: stop
                if reverse is True and len(html['comments']) == 0:
                    break

                # the forward crawl hit an empty page: restart from the last
                # page and crawl backwards
                if len(html['comments']) == 0:
                    reverse = True
                    page = pages
                    continue
                try:
                    self.comments(html, songid, page, pages, total, singer_id)
                except Exception as e:
                    print('Song %s: fetching page %d of comments failed! Reason: %s' % (songid, page, e))
                    print('Retrying!')
                    if 'total' in str(e):
                        for i in range(90000):
                            print('\r IP may be banned; waiting ' + str(90000 - i) + ' s...',
                                  sep=' ',
                                  end='',
                                  flush=True)
                            time.sleep(1)
                    elif 'comments' in str(e):
                        for i in range(10000):
                            print('\r IP may be banned; waiting ' + str(10000 - i) + ' s...',
                                  sep=' ',
                                  end='',
                                  flush=True)
                            time.sleep(1)
                    else:
                        continue
                if reverse is False:
                    page += 1
                else:
                    page -= 1
                # all pages fetched: leave the loop
                if page > pages:
                    break
            print('==' * 20, 'song %s crawled completely' % songid,
                  '==' * 20)

    # Connect to the wyy_spider database
    def conn_data(self):
        while True:
            print('Connecting to the MySQL server...')
            try:
                conn = pymysql.connect(host='localhost',
                                       user='******',
                                       passwd='0321',
                                       port=3306,
                                       db='wyy_spider',
                                       charset='utf8mb4',
                                       cursorclass=pymysql.cursors.DictCursor)
                cursor = conn.cursor()
                print('wyy_spider connected!')
                return conn, cursor
            except Exception:
                print('Failed to connect to wyy_spider!')
                time.sleep(2)

    # Pull tasks from the database
    def sql_task(self):
        conn, cursor = self.conn_data()
        cursor.execute("select music_id, singer_id from music limit 20,100")
        music_ids = cursor.fetchall()

        for row in music_ids:
            if row is None:
                continue
            try:
                music_id = row.get('music_id').strip()
                singer_id = row.get('singer_id').strip()
            except:
                continue
            self.task_queue.put([music_id, singer_id])

    # Persist comments to the database
    def save_result(self):
        while True:
            comment = self.save_queue.get()
            if self.conn_result is False:
                self.result_conn()
            try:
                self.mysqlResult.insert_comments(comment)
            except Exception:
                self.conn_result = False

    # Persist commenting users to the database
    def save_user(self):
        while True:
            comment_user = self.save_user_queue.get()
            if self.conn_user is False:
                self.user_conn()
            try:
                self.mysqlUser.insert_co_user(comment_user)
            except Exception:
                self.conn_user = False

    def spider_main(self):
        # additional page_spider threads can be started here to crawl in parallel
        Thread(target=self.page_spider, args=()).start()
        Thread(target=self.save_result, args=()).start()
        Thread(target=self.save_user, args=()).start()
        self.sql_task()
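
# A minimal launch sketch (assumes MySQLCommand is importable and that
# ../data/cookie.txt and ../data/host.txt exist):
if __name__ == '__main__':
    spider = CommSpider()
    spider.spider_main()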