def parse_sogou(self, response): hxs = pageParser(response) title = response.meta['title'] artist = response.meta['artist'] data = '' try: data = hxs.select('//a[@action="down"]/@onclick').extract()[0] except: print '[ ' + title + ' NOT FOUND]' if data != '': download_page = 'http://mp3.sogou.com' + data.split("'")[1] yield Request(download_page, callback = self.parse_download, meta = {'title': title, 'artist': artist})
def parse(self, response): hxs = pageParser(response) titles = hxs.select('//td[@class="title"]/a/text()').extract() artists = hxs.select('//a[@uigs="consume=singer_new"]/text()').extract() download_pages = hxs.select('//span[@class="cd3"]/a/@onclick').extract() #dates = hxs.select('//div[@class="item-date"]/text()').extract() for i in range(0, len(titles)): title = titles[i].replace('\t', '').replace('\n', '').replace('\r', '') #+ ' ' + artists[i] artist = artists[i].replace('\t', '').replace('\n', '').replace('\r', '') query = title query = urllib.urlencode({'query':query}).split('=')[1] sogou_url = 'http://mp3.sogou.com/music.so?query=' + query + '&class=1&st=&ac=1&pf=mp3&_asf=mp3.sogou.com&_ast=1355460624&p=&w=&interV=&w=02009900' yield Request(sogou_url, callback=self.parse_sogou, meta={'title': self.str_replace(title), 'artist': self.str_replace(artist)})
def parse_download(self, response): hxs = pageParser(response) title = response.meta['title'] artist = response.meta['artist'] download_link = '' links = hxs.select('//a/@href').extract() for link in links: if '.mp3' in link: download_link = link break if download_link != '': item = BillboardItem() item['title'] = title item['artist'] = artist item['download_link'] = download_link yield item
def parse(self, response): hxs = pageParser(response) #titles = hxs.select('//div[@class="item-title"]/text()').extract() #artists = hxs.select('//div[@class="item-artist"]/text()').extract() #dates = hxs.select('//div[@class="item-date"]/text()').extract() nodes = hxs.select('//article[contains(@class, "song_review")]') for node in nodes: title = node.select('./header/h1/text()').extract()[0] if_artist = node.select('./header/p[@class="chart_info"]/a/text()').extract() if len(if_artist) > 0: artist = if_artist[0] else: artist = 'unknown' query = title query = urllib.urlencode({'query':query}).split('=')[1] sogou_url = 'http://mp3.sogou.com/music.so?query=' + query + '&class=1&st=&ac=1&pf=mp3&_asf=mp3.sogou.com&_ast=1355460624&p=&w=&interV=&w=02009900' yield Request(sogou_url, callback=self.parse_sogou, meta={'title': self.str_replace(title), 'artist': self.str_replace(artist)})