def getOnePageSongList(self, page):
    """Scrape one page of the playlist index and store its playlists.

    Requests ``self.__play_url`` with ``page * 35`` appended, parses the
    ``ul.m-cvrlst`` cover list and batch-inserts one
    ``(title, link, play_num)`` row per cover into ``rap_playlist163``.

    Args:
        page: Page index; multiplied by 35 to build the value appended to
            the URL (presumably 35 playlists per page -- confirm).

    Returns:
        None. Any exception is caught, its traceback is printed and an
        "ERROR 104" entry is written through ``c.Log``.
    """
    session = requests.session()
    play_url = self.__play_url + str(page * 35)
    try:
        response = session.get(play_url, headers=self.__headers)
        soup = BeautifulSoup(response.content, "lxml")
        cover_list = soup.find('ul', {'class': 'm-cvrlst f-cb'})
        # Insert multiple records with one parameterized statement.
        sql = ("insert into rap_playlist163 (`title`, `link`, `play_num`)"
               "values(%s,%s,%s)")
        values = []
        for play in cover_list.find_all('div',
                                        {'class': 'u-cover u-cover-1'}):
            anchor = play.find('a', {'class': 'msk'})
            # The values are bound as query parameters, so the driver
            # escapes them itself. Escaping them here as well would store
            # literal backslashes in front of quotes.
            title = anchor['title'].encode('utf-8')
            link = anchor['href'].encode('utf-8')
            play_num = play.find('span', {'class': 'nb'}).text.encode('utf-8')
            values.append((title, link, play_num))
        self.__db.batchInsertSQL(sql, values)
        print('page:%s页 歌单入库' % page)
    except Exception:
        # Print the exception stack trace and log the failing URL.
        print(traceback.format_exc())
        c.Log('{} : {} {}'.format("ERROR 104 ", "URL", play_url))
def insertSQL(self, sql):
    """Execute a single SQL statement on the instance cursor and commit.

    Args:
        sql: Complete SQL statement to execute.

    Returns:
        None. On any exception the transaction is rolled back, the stack
        trace is printed and the statement is logged through ``c.Log``
        with code 909; the error is not re-raised.
    """
    try:
        self.__cursor.execute(sql)
        self.__db.commit()
    except Exception:
        # Undo the failed statement before reporting it.
        self.__db.rollback()
        # Print the exception stack trace.
        print(traceback.format_exc())
        c.Log("ERROR 909 : SQL " + sql)
def batchInsertSQL(self, sql, values):
    """Execute one parameterized statement for many rows and commit.

    Args:
        sql: Statement with ``%s`` placeholders, passed to
            ``cursor.executemany``.
        values: Sequence of parameter tuples, one per row.

    Returns:
        None. If ``executemany`` or the commit fails, the transaction is
        rolled back, the stack trace is printed and the statement is
        logged through ``c.Log`` with code 909.

    Raises:
        Whatever ``SET NAMES utf8mb4`` raises; that error is not handled
        here.
    """
    cursor = self.__db.cursor()
    try:
        # A failure here propagates to the caller; the enclosing
        # ``finally`` still closes the cursor so it is never leaked.
        cursor.execute("SET NAMES utf8mb4")
        try:
            cursor.executemany(sql, values)
            self.__db.commit()
        except Exception:
            self.__db.rollback()
            # Print the exception stack trace.
            print(traceback.format_exc())
            c.Log("ERROR 909 : SQL " + sql)
    finally:
        cursor.close()
def getOneSong(self, lyric, id):
    """Segment one lyric and add its most frequent words to redis.

    The song row is moved from status 1 to status 2 before processing and
    from status 2 to status 3 afterwards. The lyric is cut with jieba,
    tokens shorter than two characters are dropped, and the 20 most common
    tokens have their counts added to the sorted set
    ``self.sortedSetKey``.

    Args:
        lyric: Lyric text to segment.
        id: Value matched against the ``id`` column of ``rap_music163``.

    Returns:
        None. Any exception is caught, its traceback is printed and an
        "Error 901" entry is written through ``c.Log``.
    """
    try:
        from collections import Counter

        row_id = str(id)
        # Move the row to "generating" first so it is locked.
        self.dbManager.execute(
            "update rap_music163 set status = 2 where status = 1 and id = '"
            + row_id + "'")

        # jieba word segmentation, with parallel mode switched off.
        print(len(lyric))
        jieba.disable_parallel()
        words = [token for token in jieba.cut(lyric) if len(token) >= 2]
        jieba.disable_parallel()

        top_words = Counter(words).most_common(20)
        print(top_words)
        for word, number in top_words:
            # Increase the score stored for this word in the sorted set.
            self.r.zincrby(self.sortedSetKey, word, number)
        print(self.r.zcard(self.sortedSetKey))

        # Word counts are kept in redis rather than checking the database
        # for insert-versus-update on every word.
        self.dbManager.execute(
            "update rap_music163 set status = 3 where status = 2 and id = '"
            + row_id + "'")
    except Exception as err:
        # Print the exception stack trace.
        print(traceback.format_exc())
        c.Log('{} : {}'.format("Error 901", err))
def getOneSongList(self, songListlink):
    """Fetch one playlist page and store its songs together with lyrics.

    Every request goes through one randomly chosen proxy from
    ``self.allValidIp``. The playlist row in ``rap_playlist163`` is moved
    through these statuses (meanings taken from the original comments):
    9 = fetching songs failed, 1 = songs being generated (lock),
    2 = generation finished. For each song in the ``ul.f-hide`` list the
    lyric is fetched from ``self.lyricAPI``, bracketed tags are stripped,
    and non-duplicate songs are batch-inserted into ``rap_music163``.

    Args:
        songListlink: Playlist link; appended to ``self.__url`` and matched
            against the ``link`` column.

    Returns:
        None. If a lyric request fails the method returns without
        inserting anything and leaves the playlist in status 1. Any other
        exception inside the main ``try`` is caught, printed and logged
        through ``c.Log`` as "Error 901".
    """
    # NOTE(review): these module-level names look unnecessary, but they are
    # kept so that module state stays unchanged -- confirm and drop.
    global content, lyric
    session = requests.session()
    link = str(songListlink)
    url = self.__url + link
    proxy = random.sample(self.allValidIp, 1)[0]
    # NOTE(review): the status updates are built by string concatenation
    # because the signature of dbManager.execute is not visible here;
    # switch to bound parameters if it supports them.
    failed_sql = ("update rap_playlist163 set status = 9 where link = '"
                  + link + "'")
    try:
        try:
            content = session.get(url, headers=self.__headers,
                                  proxies=proxy).content
        except Exception:
            # Mark the playlist as "song fetch failed".
            self.dbManager.execute(failed_sql)
            return

        page = BeautifulSoup(content, "lxml")
        musics = page.find('ul', {'class': 'f-hide'})
        if musics is None:
            # No song list in the response: mark the playlist as failed.
            self.dbManager.execute(failed_sql)
            print('%s return 503' % songListlink)
            return

        # Move the playlist to "generating songs" first so it is locked.
        self.dbManager.execute(
            "update rap_playlist163 set status = 1 where status = 0 and link = '"
            + link + "'")

        # Insert multiple records with one parameterized statement.
        sql = ("insert into rap_music163 (`song_id`, `name`, `link`, `lyric`, `status`)"
               "values(%s,%s,%s,%s,%s)")
        status = 1
        values = []
        # Compiled once: strips bracketed tags such as "[00:12.34]".
        tag_pattern = re.compile(r'\[.*\]')
        for music in musics:
            # These values are bound as query parameters, so the driver
            # escapes them itself; pre-escaping would store backslashes.
            songLink = music.find('a')['href'].encode('utf-8')
            name = music.text.encode('utf-8')
            matched = re.match(r'.*id=(.*)', songLink, re.M | re.I)
            song_id = int(matched.group(1))

            # Fetch the lyric for this song.
            lrc_url = self.lyricAPI + str(song_id)
            try:
                lyric = session.get(lrc_url, headers=self.__headers,
                                    proxies=proxy)
            except Exception:
                # Give up on the whole playlist; its status stays at 1.
                return
            time.sleep(0.5)  # Sleep 0.5 s between lyric requests.

            j = json.loads(lyric.text)
            if j['code'] == 200 and 'lrc' in j and 'lyric' in j['lrc']:
                lrc = tag_pattern.sub("", j['lrc']['lyric']).strip()
                # isDuplicate is defined elsewhere; it keeps receiving the
                # escaped link -- TODO confirm how it builds its query.
                if not self.isDuplicate(MySQLdb.escape_string(songLink)):
                    values.append((song_id, name, songLink, lrc, status))
                    print(name)

        # Lyrics are included, so the generated SQL can become very long.
        self.dbManager.batchInsertSQL(sql, values)
        print(values)
        # Finally mark the playlist as "song generation finished".
        self.dbManager.execute(
            "update rap_playlist163 set status = 2 where status =1 and link = '"
            + link + "'")
    except Exception as err:
        # Print the exception stack trace.
        print(traceback.format_exc())
        c.Log('{} : {}'.format("Error 901", err))