class wangyiyun():
    """Crawl the song list of every singer stored in ``table_singer``.

    Each singer page is rendered in headless Firefox, the song titles, song
    links and album titles are read from the ``contentFrame`` iframe, and one
    row per song is written through ``MySQLCommand.insert_musicData``.
    """

    _BASE_URL = 'https://music.163.com'

    def __init__(self):
        """Start a headless Firefox driver and open the MySQL connection."""
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(options=options)
        # Connect to the database.
        self.mysqlCommand = MySQLCommand()
        self.mysqlCommand.connectdb()
        # Not used by any method of this class; kept so existing readers of
        # the attribute keep working.
        self.music = {}

    @staticmethod
    def _clean_text(text):
        """Return *text* with non-breaking spaces replaced by plain spaces."""
        # The previous regex r'\\xa0' matched a literal backslash followed by
        # "xa0", so real U+00A0 characters were never replaced.
        return text.replace('\xa0', ' ')

    def run(self):
        """Visit the singer pages and insert every song found on them.

        Rows come from ``select url, singer_name from table_singer``; the
        cursor is expected to return dict-like rows (``row.get`` is used).
        Only rows at even positions with a non-null ``url`` are processed.
        NOTE(review): presumably the odd rows are handled by another worker —
        confirm.
        """
        self.mysqlCommand.cursor.execute(
            "select url ,singer_name from table_singer")
        rows = self.mysqlCommand.cursor.fetchall()
        for position, row in enumerate(rows):
            page_url = row.get('url')
            if page_url is None or position % 2 != 0:
                continue

            self.driver.get(page_url)
            time.sleep(4)  # fixed wait for the page to render
            # The song table lives inside the "contentFrame" iframe.
            self.driver.switch_to.frame(
                self.driver.find_element_by_name('contentFrame'))
            time.sleep(1)
            html = etree.HTML(self.driver.page_source)
            time.sleep(1)

            song_names = html.xpath(
                "//div[@class='j-flag']//div[@class='ttc']"
                "/span[@class='txt']/a/b/@title")
            song_urls = html.xpath(
                "//div[@class='j-flag']//div[@class='ttc']"
                "/span[@class='txt']/a/@href")
            album_titles = html.xpath("//div[@class='text']/a/@title")
            singer = row.get('singer_name')

            # zip() stops at the shortest list, so a page where the three
            # XPath results differ in length no longer raises IndexError and
            # aborts the whole crawl.
            for raw_name, href, raw_album in zip(song_names, song_urls,
                                                 album_titles):
                song_n = self._clean_text(raw_name)
                song_u = self._BASE_URL + href
                albums = self._clean_text(raw_album)
                print(song_n, '+', song_u, '+', albums, '+', singer)
                try:
                    self.mysqlCommand.insert_musicData(
                        song_n, song_u, albums, singer)
                    print('==' * 20)
                except Exception as exc:
                    # Keep crawling after a failed insert, but report it
                    # instead of swallowing it silently.
                    print('插入歌曲数据失败', str(exc))
            print('==' * 20)
class wangyiyun():
    """Crawl the playlists stored in ``song_list`` and save songs and singers.

    Each playlist page is rendered in headless Firefox; song titles and links
    are taken from the raw page source with regexes, album and singer data
    from the parsed tree. Results are written through
    ``MySQLCommand.insert_musicData`` and ``MySQLCommand.insert_singer``.
    """

    _BASE_URL = 'https://music.163.com'

    def __init__(self):
        """Start a headless Firefox driver and open the MySQL connection."""
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(options=options)
        # Connect to the database.
        self.mysqlCommand = MySQLCommand()
        self.mysqlCommand.connectdb()
        # Not used by any method of this class; kept so existing readers of
        # the attribute keep working.
        self.music = {}

    @staticmethod
    def _clean_title(title):
        """Return *title* with non-breaking spaces turned into plain spaces.

        Titles are cut out of the serialized page source, where a
        non-breaking space may appear either as the ``&nbsp;`` entity or as
        the raw U+00A0 character, so both forms are handled.
        """
        return title.replace('&nbsp;', ' ').replace('\xa0', ' ')

    def run(self):
        """Visit the playlist pages and insert every song and its singer.

        Rows come from ``select list_url from song_list``; the cursor is
        expected to return dict-like rows (``row.get`` is used). Only rows
        whose position is a multiple of 3 and whose ``list_url`` is not null
        are processed.
        NOTE(review): presumably the other rows are handled by other
        workers — confirm.
        """
        self.mysqlCommand.cursor.execute("select list_url from song_list")
        playlists = self.mysqlCommand.cursor.fetchall()
        for position, row in enumerate(playlists):
            list_url = row.get('list_url')
            if list_url is None or position % 3 != 0:
                continue

            self.driver.get(list_url)
            time.sleep(4)  # fixed wait for the page to render
            # The song table lives inside the "contentFrame" iframe.
            self.driver.switch_to.frame(
                self.driver.find_element_by_name('contentFrame'))
            time.sleep(1)
            source = self.driver.page_source
            html = etree.HTML(source)
            time.sleep(1)

            song_names = re.findall(r'"><b title="(.*?)">', source, re.DOTALL)
            song_urls = re.findall(
                r'<div class="ttc"><span class="txt"><a href="(.*?)"><b',
                source, re.DOTALL)
            albums = html.xpath("//div[@class='text']/a/@title")
            # Read the singer name and link from the same <div> so they stay
            # paired: a flat list of every link drifts out of step as soon as
            # one cell holds several links or none.
            singer_cells = html.xpath("//div[@class='text'][@title]")

            # zip() stops at the shortest list instead of raising IndexError
            # when the scraped lists differ in length.
            for raw_name, href, album, cell in zip(song_names, song_urls,
                                                   albums, singer_cells):
                song_n = self._clean_title(raw_name)
                song_u = self._BASE_URL + href
                singer = cell.get('title')
                singer_links = cell.xpath("./span/a/@href")
                singerurl = (self._BASE_URL + singer_links[0]
                             if singer_links else None)
                print(singer, singerurl)
                try:
                    self.mysqlCommand.insert_musicData(
                        song_n, song_u, album, singer)
                    if singerurl is not None:
                        self.mysqlCommand.insert_singer(singer, singerurl)
                    print('==' * 20)
                except Exception as e:
                    # Report the failure and continue with the next song.
                    print(e)
class wangyiyun():
    """Crawl a fixed set of top-list pages and save the lists and songs.

    For every URL in ``self.url`` the page is rendered in headless Firefox,
    the list itself is stored with ``MySQLCommand.insert_list`` and each song
    on it with ``MySQLCommand.insert_musicData``.
    """

    _BASE_URL = 'https://music.163.com'

    def __init__(self):
        """Start a headless Firefox driver and open the MySQL connection."""
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(options=options)
        # Top-list pages to crawl.
        self.url = [
            'https://music.163.com/#/discover/toplist?id=19723756',
            'https://music.163.com/#/discover/toplist?id=3779629',
            'https://music.163.com/#/discover/toplist?id=2884035',
            'https://music.163.com/#/discover/toplist?id=3778678',
            'https://music.163.com/#/discover/toplist?id=991319590',
            'https://music.163.com/#/discover/toplist?id=71384707',
            'https://music.163.com/#/discover/toplist?id=1978921795',
            'https://music.163.com/#/discover/toplist?id=2250011882',
            'https://music.163.com/#/discover/toplist?id=2617766278',
            'https://music.163.com/#/discover/toplist?id=71385702',
            'https://music.163.com/#/discover/toplist?id=745956260',
            'https://music.163.com/#/discover/toplist?id=10520166',
            'https://music.163.com/#/discover/toplist?id=2023401535',
            'https://music.163.com/#/discover/toplist?id=2006508653',
            'https://music.163.com/#/discover/toplist?id=180106',
            'https://music.163.com/#/discover/toplist?id=60198',
            'https://music.163.com/#/discover/toplist?id=3812895',
            'https://music.163.com/#/discover/toplist?id=27135204',
            'https://music.163.com/#/discover/toplist?id=21845217',
            'https://music.163.com/#/discover/toplist?id=11641012',
            'https://music.163.com/#/discover/toplist?id=60131',
            'https://music.163.com/#/discover/toplist?id=120001',
            'https://music.163.com/#/discover/toplist?id=112463',
            'https://music.163.com/#/discover/toplist?id=10169002',
            'https://music.163.com/#/discover/toplist?id=2809513713',
            'https://music.163.com/#/discover/toplist?id=2809577409',
        ]
        # Connect to the database.
        self.mysqlCommand = MySQLCommand()
        self.mysqlCommand.connectdb()

    @staticmethod
    def _clean_title(title):
        """Return *title* with non-breaking spaces turned into plain spaces.

        Titles are cut out of the serialized page source, where a
        non-breaking space may appear either as the ``&nbsp;`` entity or as
        the raw U+00A0 character, so both forms are handled.
        """
        return title.replace('&nbsp;', ' ').replace('\xa0', ' ')

    @staticmethod
    def _strip_quote_chars(name):
        """Return *name* without ``'`` and ``)`` characters.

        Used for the second insert attempt after the first one failed.
        NOTE(review): presumably these characters break the SQL built by
        ``MySQLCommand.insert_musicData`` — confirm.
        """
        return "".join(ch for ch in name if ch not in "')")

    def run(self):
        """Crawl every top list in ``self.url`` and close the DB afterwards.

        The MySQL connection is closed in a ``finally`` clause so it is
        released even when a page fails to load.
        """
        try:
            for s_url in self.url:
                self._crawl_toplist(s_url)
        finally:
            self.mysqlCommand.closeMysql()

    def _crawl_toplist(self, s_url):
        """Store one top list and all songs listed on its page."""
        self.driver.get(s_url)
        time.sleep(4)  # fixed wait for the page to render
        # The list content lives inside the "contentFrame" iframe.
        self.driver.switch_to.frame(
            self.driver.find_element_by_name("contentFrame"))
        source = self.driver.page_source
        html = etree.HTML(source)

        # xpath() returns a list of text nodes; join them so a plain string
        # is stored and so the error message below can be concatenated.
        list_name = "".join(html.xpath("//div[@class='hd f-cb']/h2/text()"))
        play_num = "".join(html.xpath(
            "//div[@class='more s-fc3']/strong[@class='s-fc6']/text()"))
        creator = '网易云'
        creator_url = '无'
        try:
            self.mysqlCommand.insert_list(s_url, list_name, creator,
                                          creator_url, play_num)
        except Exception as exc:
            print('列表错误' + list_name, play_num, creator_url, creator)
            print(exc)

        song_urls = re.findall(r'<span class="txt"><a href="(.*?)"><b',
                               source, re.DOTALL)
        song_names = re.findall(r'><b title="(.*?)">', source, re.DOTALL)
        singers = re.findall(r'div class="text" title="(.*?)"><span',
                             source, re.DOTALL)
        album = '网易云排行榜'

        # zip() stops at the shortest list instead of raising IndexError
        # when the three regexes match a different number of times.
        for href, raw_name, singeri in zip(song_urls, song_names, singers):
            urli = self._BASE_URL + href
            song_namei = self._clean_title(raw_name)
            try:
                self.mysqlCommand.insert_musicData(song_namei, urli, album,
                                                   singeri)
                continue
            except Exception:
                # Fall through to a second attempt with a sanitised title.
                pass

            song_nameii = self._strip_quote_chars(song_namei)
            try:
                self.mysqlCommand.insert_musicData(song_nameii, urli, album,
                                                   singeri)
            except Exception as e:
                print('歌曲错误 ' + song_nameii, '歌手 ' + singeri,
                      '原名 ' + song_namei)
                print(e)
class wangyiyun():
    """Crawl the profile page of every commenter stored in ``message``.

    Each profile URL is rendered in headless Firefox, parsed, and the result
    is written through ``MySQLCommand.insert_userData``.
    """

    _BASE_URL = 'https://music.163.com'
    _NO_INFO = '无信息'

    def __init__(self):
        """Start a headless Firefox driver and open the MySQL connection."""
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(options=options)
        # Connect to the database.
        self.mysqlCommand = MySQLCommand()
        self.mysqlCommand.connectdb()
        # ``message`` is not used by this class; kept for compatibility.
        self.message = {}
        # Reused as the record handed to insert_userData for each profile.
        self.user = {}

    def run(self):
        """Visit the commenter profile pages and store each profile.

        Rows come from ``select url, song_name from message``; the cursor is
        expected to return dict-like rows (``row.get`` is used). Only rows
        with ``position % 4 == 1`` and a non-null ``url`` are processed.
        NOTE(review): presumably the other rows are handled by other
        workers — confirm.
        """
        self.mysqlCommand.cursor.execute("select url,song_name from message")
        name_url = self.mysqlCommand.cursor.fetchall()
        for position, row in enumerate(name_url):
            preson_url = row.get('url')
            if preson_url is None or position % 4 != 1:
                continue
            self.driver.get(preson_url)
            time.sleep(4)  # fixed wait for the page to render
            song_name = row.get('song_name')
            # The profile content lives inside the "contentFrame" iframe.
            self.driver.switch_to.frame(
                self.driver.find_element_by_name("contentFrame"))
            source = self.driver.page_source
            self.parse_preson_page(source, song_name, preson_url)

    @staticmethod
    def _clean_introduction(fragments):
        """Build the introduction text from the scraped text nodes.

        Returns the placeholder '无信息' when *fragments* is empty; otherwise
        joins the nodes, drops the "个人介绍" label (ASCII or full-width
        colon) and replaces newlines with spaces.
        """
        text = "".join(fragments) if fragments else wangyiyun._NO_INFO
        text = re.sub(r'个人介绍[::]', '', text)
        return text.replace('\n', ' ')

    @staticmethod
    def _extract_district(source):
        """Return the region shown on the profile, or '无信息' if absent.

        The label is matched with either an ASCII or a full-width colon.
        """
        found = re.findall(
            r'<div class="inf s-fc3".*?所在地区[::](.*?)</span>',
            source, re.DOTALL)
        return "".join(found) or wangyiyun._NO_INFO

    # Parse a commenter's profile page.
    def parse_preson_page(self, source, song_name, preson_url):
        """Parse one profile page and insert the user record.

        Args:
            source: HTML of the profile frame.
            song_name: Unused; kept so existing callers keep working.
            preson_url: URL of the profile, stored with the record.

        A page that does not expose the three expected links (stored as
        ``dynamic``, ``focus`` and ``fans``) is reported and skipped. It used
        to call ``exit()`` on an empty list, which ended the whole crawl, and
        raised IndexError on a partial one.
        """
        html = etree.HTML(source)
        person_name = "".join(
            html.xpath("//span[@class='tit f-ff2 s-fc0 f-thide']/text()"))
        links = html.xpath("//ul[@class='data s-fc3 f-cb']/li/a/@href")
        ids = [self._BASE_URL + link for link in links]
        if len(ids) < 3:
            print(ids, preson_url)
            return

        introduce = self._clean_introduction(
            html.xpath("//div[@class='inf s-fc3 f-brk']/text()"))
        district = self._extract_district(source)
        age = "".join(html.xpath("//span[@class='sep']/span/text()"))
        if not age:
            age = self._NO_INFO

        self.user['name'] = person_name
        self.user['introduction'] = introduce
        self.user['region'] = district
        self.user['age'] = age
        self.user['dynamic'] = ids[0]
        self.user['focus'] = ids[1]
        self.user['fans'] = ids[2]
        self.user['url'] = preson_url
        try:
            # Insert the record.
            self.mysqlCommand.insert_userData(self.user)
        except Exception as e:
            # Print the error raised by the failed insert.
            print("插入用户数据失败", str(e))
class wangyiyun():
    """Crawl the comments of every song stored in ``table_music``.

    Each song page is rendered in headless Firefox and its comment pages are
    walked with the pager's last link. Every comment is written through
    ``MySQLCommand.insert_messageData`` and the comment total through
    ``MySQLCommand.insert_musicnum``.
    """

    _BASE_URL = 'https://music.163.com'
    # Scrolls the frame down so the pager at the bottom is rendered.
    _SCROLL_JS = "var q=document.documentElement.scrollTop=10000"
    _REPLY_MARK = '|回复'

    def __init__(self):
        """Start a headless Firefox driver and open the MySQL connection."""
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(options=options)
        # Connect to the database.
        self.mysqlCommand = MySQLCommand()
        self.mysqlCommand.connectdb()
        # Reused as the record handed to insert_messageData for each comment.
        self.message = {}
        # Not used by this class; kept for compatibility.
        self.user = {}

    def run(self):
        """Visit the song pages and store all of their comments.

        Rows come from ``select url from table_music``; the cursor is
        expected to return dict-like rows (``row.get`` is used). Only rows
        with ``position % 4 == 1`` and a non-null ``url`` are processed.
        NOTE(review): presumably the other rows are handled by other
        workers — confirm.
        """
        self.mysqlCommand.cursor.execute("select url from table_music")
        music_url = self.mysqlCommand.cursor.fetchall()
        for position, row in enumerate(music_url):
            song_url = row.get('url')
            if song_url is None or position % 4 != 1:
                continue
            self.driver.get(song_url)
            time.sleep(4)  # fixed wait for the page to render
            # The comments live inside the "contentFrame" iframe.
            self.driver.switch_to.frame(
                self.driver.find_element_by_name('contentFrame'))
            # Scroll to the bottom of the page.
            self.driver.execute_script(self._SCROLL_JS)
            time.sleep(1)
            source = self.driver.page_source
            # The pager <div> carries a generated "auto-<id>" class; capture
            # the id so the pager can be addressed later.
            j_flag = "".join(re.findall(
                r'<div class="auto-(.*?) u-page">', source, re.DOTALL))
            self._crawl_comment_pages(j_flag)

    def _crawl_comment_pages(self, j_flag):
        """Parse the current comment page and follow the pager to the end.

        Stops when the last pager link is disabled, or on the first failure.
        The loop used to have no exit on failure, so a song without a pager
        re-parsed and re-inserted the same page forever.
        """
        pager_xpath = "//div[@class='auto-" + j_flag + " u-page']/a[last()]"
        pager_markup = '<div class="auto-' + j_flag + ' u-page">'
        page = 1
        while True:
            self.parse_detail_page(self.driver.page_source)
            try:
                time.sleep(5)
                self.driver.execute_script(self._SCROLL_JS)
                WebDriverWait(driver=self.driver, timeout=10).until(
                    EC.presence_of_element_located((By.XPATH, pager_xpath)))
                next_btn = self.driver.find_element_by_xpath(pager_xpath)
                print('爬取第%d页成功!' % page)
                if "js-disabled" in next_btn.get_attribute("class"):
                    print('本首歌爬取完成!')
                    break
                next_btn.click()
                page += 1
                time.sleep(2)
            except Exception:
                # str.find() returns -1 (truthy) when nothing is found, so
                # the old check always printed; test for the markup instead.
                if pager_markup in self.driver.page_source:
                    print('有btn')
                print('爬取第%d页失败!' % page)
                print("==" * 20)
                print(j_flag)
                break

    @staticmethod
    def _like_count(fragment):
        """Turn the markup captured after the like icon into a count string.

        ``'</i> (12)'`` becomes ``'(12)'``; a bare ``'</i>'`` (no likes
        shown) becomes ``'0'``.
        """
        # The separator may be serialized as an entity or a raw U+00A0.
        text = fragment.replace('&nbsp;', ' ').replace('\xa0', ' ')
        text = text.replace('</i> ', '')
        return text.replace('</i>', '0')

    @staticmethod
    def _build_comment(parts):
        """Build the stored comment text from the two captured groups.

        *parts* is ``(comment_html, trailing_html)``. Tags are stripped from
        both. When the second group is anything other than the bare reply
        marker it is appended with a "评论回复" prefix. A trailing reply
        marker is removed and the result is wrapped in doubled double quotes.
        """
        pieces = []
        # enumerate() gives the real position; tuple.index() returned 0 for
        # the second group whenever both groups held the same text.
        for position, fragment in enumerate(parts):
            text = re.sub('<br />', ' ', fragment)
            text = re.sub('<(.*?)>', '', text)
            if position == 1 and text != wangyiyun._REPLY_MARK:
                text = '\n 评论回复' + text
            pieces.append(text)
        comment = ''.join(pieces)
        # Remove the marker as a suffix. rstrip('|回复') strips any run of
        # those three characters and therefore ate real comment text.
        if comment.endswith(wangyiyun._REPLY_MARK):
            comment = comment[:-len(wangyiyun._REPLY_MARK)]
        return '""' + comment + '""'

    # Crawl the comments of one song page.
    def parse_detail_page(self, source):
        """Parse one comment page and insert every comment on it.

        Args:
            source: HTML of the song's comment frame.

        Errors raised by ``insert_messageData`` propagate to the caller; a
        failing ``insert_musicnum`` is reported and ignored.
        """
        html = etree.HTML(source)
        preson_id = html.xpath(
            "//div[@class='cnt f-brk']/a[@class='s-fc7']/@href")
        song_name = "".join(
            html.xpath("//div[@class='tit']/em[@class='f-ff2']/text()"))
        # Like counts.
        points_tags = re.findall(
            r'<i class="zan u-icn2 u-icn2-12">(.*?)</a>', source, re.DOTALL)
        point = [self._like_count(tag) for tag in points_tags]
        name = html.xpath("//div[@class='cnt f-brk']/a[1]/text()")
        comment_tags = re.findall(
            r'<div class="cnt f-brk">.*?</a>(.*?)</div>.*?</a>(.*?)</div>',
            source, re.DOTALL)
        comments = [self._build_comment(parts) for parts in comment_tags]
        # Spaces (plain or non-breaking) are removed from the timestamps.
        posted_at = [
            stamp.replace(' ', '').replace('\xa0', '')
            for stamp in html.xpath("//div[@class='time s-fc4']/text()")
        ]

        # zip() stops at the shortest list instead of raising IndexError
        # when the scraped lists differ in length.
        for author, comment, stamp, likes, profile in zip(
                name, comments, posted_at, point, preson_id):
            self.message['song_name'] = song_name
            self.message['name'] = author
            self.message['comments'] = comment
            self.message['time'] = stamp
            self.message['point'] = likes
            self.message['url'] = self._BASE_URL + profile
            self.mysqlCommand.insert_messageData(self.message)

        comment_sum = ''.join(re.findall(
            r'<span class="j-flag">(.*?)</span>', source, re.DOTALL))
        try:
            self.mysqlCommand.insert_musicnum(song_name, comment_sum)
        except Exception as exc:
            print(song_name, comment_sum, '评论数插入失败!')
            print(exc)
class wangyiyun():
    """Crawl the playlist index pages and save every playlist found.

    The index is rendered in headless Firefox and walked page by page with
    the pager's last link; each playlist is written through
    ``MySQLCommand.insert_list``.
    """

    _BASE_URL = 'https://music.163.com'
    # Scrolls the frame down so the pager at the bottom is rendered.
    _SCROLL_JS = "var q=document.documentElement.scrollTop=10000"

    def __init__(self):
        """Start a headless Firefox driver and open the MySQL connection."""
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(options=options)
        # Playlist index pages to crawl.
        self.url = [
            'https://music.163.com/#/discover/playlist/?order=hot&cat=%E5%85%A8%E9%83%A8',
        ]
        # Connect to the database.
        self.mysqlCommand = MySQLCommand()
        self.mysqlCommand.connectdb()

    @staticmethod
    def _absolute_url(path):
        """Return the site-absolute URL for a site-relative *path*."""
        return wangyiyun._BASE_URL + path

    def run(self):
        """Open every index URL in ``self.url`` and crawl its pages.

        Note that ``parse_list_page`` closes the MySQL connection when it
        finishes or fails.
        """
        for s_url in self.url:
            self.driver.get(s_url)
            time.sleep(5)  # fixed wait for the page to render
            # The playlist grid lives inside the "contentFrame" iframe.
            self.driver.switch_to.frame(
                self.driver.find_element_by_name("contentFrame"))
            self.parse_list_page()

    def _store_playlists(self, list_url, list_name, creator, creator_url,
                         play_num):
        """Insert one row per playlist scraped from the current page.

        Each row is guarded separately so one failing insert no longer drops
        the remaining playlists of the page. zip() stops at the shortest
        list when the scraped lists differ in length.
        """
        for href, title, author, author_href, plays in zip(
                list_url, list_name, creator, creator_url, play_num):
            playlist_url = self._absolute_url(href)
            print(author_href)
            creator_s = self._absolute_url(author_href)
            print(playlist_url, title, author, creator_s, plays)
            try:
                # Insert the record.
                self.mysqlCommand.insert_list(playlist_url, title, author,
                                              creator_s, plays)
            except Exception as e:
                # Print the error raised by the failed insert.
                print("插入歌单数据失败", str(e))

    # Collect the playlist URLs.
    def parse_list_page(self):
        """Walk the paginated playlist index and store every playlist.

        When the pager's last link is disabled for the first time, the last
        link inside the 'u-btn f-fr u-btn-hot d-flag' container is clicked
        and paging continues; the second time the connection is closed and
        the crawl ends.
        NOTE(review): that link presumably switches to another listing —
        confirm.

        On any failure the connection is closed and the loop is left. The
        loop used to continue forever on a closed connection. The rows of
        the last page were also inserted a second time; that duplicate
        insert has been removed.
        """
        k = 1  # page number, used only for log output
        h = 0  # how many times the end of the pager has been reached
        while True:
            try:
                source = self.driver.page_source
                time.sleep(2)
                html = etree.HTML(source)
                list_url = html.xpath(
                    "//li/div[@class='u-cover u-cover-1']/a/@href")
                creator_url = html.xpath("//li//p[last()]/a/@href")
                play_num = re.findall(r'<span class="nb">(.*?)</span>',
                                      source, re.DOTALL)
                creator = html.xpath("//li/p[last()]/a/text()")
                list_name = html.xpath("//li/p[@class='dec']/a/text()")
                self.driver.execute_script(self._SCROLL_JS)

                self._store_playlists(list_url, list_name, creator,
                                      creator_url, play_num)

                next_btn = self.driver.find_element_by_xpath(
                    "//div[@class='u-page']/a[last()]")
                print('爬取第%d页成功!' % k)
                if "zbtn znxt js-disabled" in next_btn.get_attribute("class"):
                    h += 1
                    print('本首歌爬取完成!')
                    if h == 2:
                        self.mysqlCommand.closeMysql()
                        break
                    next_list = self.driver.find_element_by_xpath(
                        "//div[@class='u-btn f-fr u-btn-hot d-flag']/a[last()]"
                    )
                    next_list.click()
                else:
                    next_btn.click()
                    time.sleep(2)
                    k += 1
                # Give the newly requested page time to load.
                time.sleep(2)
            except Exception as exc:
                print('爬取第%d页失败!' % k)
                print(exc)
                self.mysqlCommand.closeMysql()
                print("==" * 20)
                break