def requestUrl(url, restart=0): proxy={'http': 'http://39.134.93.13:80'} proxy_support = urllib2.ProxyHandler(proxy) opener = urllib2.build_opener(proxy_support) urllib2.install_opener(opener) filename = url[FILE_OFFSET:] try: if restart: if DB.get(filename): return DB.get(filename) else: print url time.sleep(TIME_SLEEP) req = urllib2.Request(url, headers=M_Headers) data = urllib2.urlopen(req).read() DB.set(filename, data) return data else: print url time.sleep(TIME_SLEEP) req = urllib2.Request(url, headers=M_Headers) data = urllib2.urlopen(req).read() # DB.set(filename, data) return data except Exception, e: print e
def getSrc(num): for i in range(1, num): url = SOURCE_URL + '/geci/' + str(i) + '.htm' try: data = requestUrl(url) soup = BeautifulSoup(data, 'html.parser') tag_src = soup.find('div', class_="geciInfo") src = tag_src.text if (src != ''): tag_info = soup.find('div', class_="geciText").find_all('li') song_name = tag_info[0].text[4:] singer = tag_info[1].text[4:] DB.hset(singer, song_name, src) print song_name except urllib2.HTTPError, e: print e if hasattr(e, 'code'): if (e.code == 404): continue elif (e.code == 502): # 更换IP再试一次 continue # return会导致回到现在来 elif (e.code == 403): continue except urllib2.URLError, e: print e continue
def get_playlist_followuser(playlist_id):
    """Collect the follower/subscriber user ids shown on a playlist page
    and add each to the DB set 'users'.

    Fix: iterating tag_ul yields NavigableStrings (whose str.find()
    returns -1 when absent) *and* Tags (whose .find() returns None when
    absent).  The original `tag_a != -1` test let None through and then
    crashed on tag_a.get(); both sentinels are now guarded.
    """
    data = requestUrl('http://music.163.com/playlist?id=' + str(playlist_id))
    soup = BeautifulSoup(data, 'html.parser')
    tag_ul = soup.find('ul', class_="m-piclist f-cb")
    for li in tag_ul:
        tag_a = li.find('a')
        if tag_a is not None and tag_a != -1:
            user_id = tag_a.get('href')[14:]  # strip '/user/home?id='
            DB.sadd('users', str(user_id))
def get_user_top(): user_id = 264257752 with open(TEMP_FILE, 'r') as f: j = json.loads(f.read()) arr = j['weekData'] + j['allData'] for song in arr: song_score = song['score'] song_id = song['song']['id'] song_name = song['song']['name'] print song_name DB.rpush('user:'******':top_song', song_id)
def get_indexpage_playlist(data): soup = BeautifulSoup(data, 'html.parser') tag_divs = soup.find_all('div', class_="u-cover u-cover-1") try: for tag_div in tag_divs: print tag_div tag_a=tag_div.find('a') playlist_id = tag_a.get('href')[13:] DB.sadd('playlist', playlist_id) except Exception, e: print e
def get_relation_user(ori_user): users_list = DB.lrange('user', 0, 26) top_songs = DB.lrange('user:'******':top_song', 0, 250) for top in top_songs: for user in users_list: hname = 'combine:' + str(user) tname = 'user:'******':like_song' songs = DB.lrange(tname, 0, 500) for s in songs: if top == s: print top DB.rpush(hname, str(s))
def get_playlist_playlistandfollower(playlist_id): data=requestUrl('http://music.163.com/playlist?id=' + str(playlist_id)) soup = BeautifulSoup(data, 'html.parser') tag_divs = soup.find_all('a', class_="sname f-fs1 s-fc0") get_playlist_followuser(playlist_id) try: for tag_div in tag_divs: playlist_id = tag_div.get('href')[13:] DB.sadd('playlist', playlist_id) except Exception, e: print e
def get_playlist_like(playlist, data): soup = BeautifulSoup(data, 'html.parser') tag_ul = soup.find('ul', class_="m-piclist") print tag_ul try: for li in tag_ul: for l in li: user_id = l.get('data-res-id') user_name = l.get('title') DB.hset('playlist:' + str(playlist) + ':like_user_id:' + str(user_id), 'user_id', user_id) except Exception, e: print e
def getUserIdList(data): soup = BeautifulSoup(data, 'html.parser') tag_ul = soup.find('ul', class_="f-hide") try: for li in tag_ul: for l in li: song_id = l.get('href')[9:] song_name = l.text playlist_name = 'playlist:' + + ":" + song_id DB.hset(playlist_name, 'song_id', song_id) DB.hset(playlist_name, 'song_name', song_name) except Exception, e: print e
def postUrl(url, keywords, restart=0):
    """POST a search query (`keywords`, type 1000 = playlists) to *url*
    and cache the raw response text in DB under the url's tail.
    restart=0 returns the cached copy instead of hitting the network."""
    filename = url[FILE_OFFSET:]
    if not restart:
        return DB.get(filename)
    payload = {
        's': keywords,
        'offset': 0,
        'limit': 30,
        'type': 1000,
    }
    body = requests.post(url, payload).text
    DB.set(filename, body)
    return body
def get_song_lyric(song_id): url = 'http://music.163.com/api/song/lyric?os=pc&id=' + str(song_id) + '&lv=-1&kv=-1&tv=-1' res = requestUrl(url) try: data = json.loads(res) song_info = data['songs'][0] dic = {} dic['name'] = song_info['name'] dic['id'] = song_info['id'] dic['position'] = song_info['position'] dic['artist'] = song_info['artists'][0]['name'] dic['score'] = song_info['score'] dic['popularity'] = song_info['popularity'] DB.hmset('songs:' + str(song_id), dic) except Exception, e: print e
def get_song_info(song_id): url = 'http://music.163.com/api/song/detail/?id=' + str(song_id) + '&ids=%5B' + str(song_id) + '%5D' res = requestUrl(url) try: data = json.loads(res) song_info = data['songs'][0] dic = {} print song_info['name'] dic['name'] = song_info['name'] dic['id'] = song_info['id'] dic['position'] = song_info['position'] dic['artist'] = song_info['artists'][0]['name'] dic['score'] = song_info['score'] dic['popularity'] = song_info['popularity'] DB.hmset('songs:' + str(song_id), dic) except Exception, e: print e
def get_playlist_id(): play_id_list = [] keys = DB.keys() for key in keys: if 'playlist' in key: play_id_list.append(key[9:]) print key[9:] return play_id_list
def get_user_info(user_id): url = 'http://music.163.com/user/home?id=' + str(user_id) data = requestUrl(url) try: soup = BeautifulSoup(data, 'html.parser') dic = {} city= soup.find('div', attrs={'class': 'inf'}).find('span') name = soup.find('span', attrs={'class': 'tit f-ff2 s-fc0 f-thide'}) if name: dic['name']=name.text if city: dic['city']=city.text[5:] dic['img'] = soup.find('dt', attrs={'id': 'ava'}).find('img').get('src') DB.hmset('user:' + str(user_id), dic) except Exception, e: print e
def requestUrl(url, restart=0): filename = url[FILE_OFFSET:] try: if not restart: if DB.get(filename): return DB.get(filename) else: print url time.sleep(TIME_SLEEP) req = urllib2.Request(url, headers=M_Headers) data = urllib2.urlopen(req).read() DB.set(filename, data) return data else: print url time.sleep(TIME_SLEEP) req = urllib2.Request(url, headers=M_Headers) data = urllib2.urlopen(req).read() # DB.set(filename, data) return data except Exception, e: print e
def get_user_playlists(user_id, data): # 去掉playcount==0 res = json.loads(data) play_id_list = [] try: for playlist in res['playlist']: playlist_playcount = playlist['playCount'] if not playlist_playcount == '0': playlist_id = playlist['id'] playlist_name = 'user:'******':playlist:' + str(playlist_id) DB.hset(playlist_name, 'playlist_id', playlist_id) DB.hset(playlist_name, 'playlist_name', playlist['name']) DB.hset(playlist_name, 'playlist_playcount', playlist_playcount) DB.hset(playlist_name, 'user_id', user_id) if playlist_id: play_id_list.append(playlist_id) except Exception, e: print e
def requestByFixfox(url, restart=0):
    """Render *url* in a real browser (Chrome via chromedriver) and
    return the page source; restart=0 returns the cached copy from DB
    instead of launching a browser.

    Fixes: the original only called driver.quit() inside the success
    branch, so a page whose readyState was not 'complete' (or a raise
    from page_source) leaked a chromedriver process; quit now runs in a
    finally.  The unused Options()/WebDriverWait locals were removed
    (the Firefox variant they served is commented out upstream).
    """
    filename = url[FILE_OFFSET:]
    if not restart:
        return DB.get(filename)
    driver = Chrome(executable_path='chromedriver.exe')
    try:
        driver.get(url)
        if driver.execute_script("return document.readyState") == "complete":
            return driver.page_source
        return None  # page not ready — same None result as the original
    finally:
        driver.quit()
def get_alluser_id():
    """Return the first 101 entries (indexes 0..100) of the Redis list
    'user'."""
    user_ids = DB.lrange('user', 0, 100)
    return user_ids
def del_cache():
    """Delete every DB key that starts with a digit — presumably the raw
    url-tail cache entries written by requestUrl — leaving the named
    user:/playlist:/songs: keys alone."""
    cache_key = re.compile(r'^\d+.+')
    for key in DB.keys():
        match = cache_key.search(key)
        if match:
            DB.delete(match.group(0))
if __name__ == '__main__':
    # Scratchpad entry point: the commented-out loops below are earlier
    # crawl drivers kept for reference (playlist expansion, follower
    # harvesting, index-page seeding).  Only the sismember check runs.
    # 279254081
    # while True:
    #     playlist_idlist=[]
    #     for i in range(50):
    #         playlist_idlist.append()
    #     startthread(get_playlist_playlist(playlist_idlist),playlist_idlist)
    # fetch users with multiple threads
    # while True:
    #     playlist_idlist = []
    #     for i in range(50):
    #         pop = DB.spop('playlist')
    #         DB.sadd('ori_playlist', pop)
    #         playlist_idlist.append(pop)
    #         time.sleep(TIME_SLEEP)
    #     startthread(get_playlist_playlistandfollower,playlist_idlist)
    print DB.sismember('users','')  # quick check: was an empty user id ever stored?
    # url='http://music.163.com/discover/playlist/?order=hot&cat=%E5%85%A8%E9%83%A8&limit=35&offset='
    # for i in range(0,701,35):
    #     get_indexpage_playlist(requestUrl(url+str(i)))
    # while True:
    #     playlist_idlist=[]
    #     for i in range(50):
    #         playlist_idlist.append(DB.spop('playlist'))
    #     startthread(get_playlist_followuser,playlist_idlist)