def process_item(self, item, spider):
    if not item['id'] or len(item['id'].strip()) == 0:
        raise DropItem("Missing id: %s" % item)
    if not item['source'] or len(item['source'].strip()) == 0:
        raise DropItem("Missing source: %s" % item)

    source = item['source']
    # This is a little hack: for now we only get new releases from iTunes,
    # and they are identified by the extra "newrls" field.
    try:
        if item["newrls"] is True:
            self.storage = chartCache.newreleases
        else:
            self.storage = chartCache.storage
    except KeyError:
        self.storage = chartCache.storage

    chart_id = source + item['id']
    log.msg("Saving %s - %s" % (source, item['id']))

    chart_list = self.storage.get(source, {})
    # metadata is the chart item minus the actual list, plus a size
    metadata_keys = filter(lambda k: k != 'list', item.keys())
    metadata = {key: item[key] for key in metadata_keys}
    metadata['size'] = len(item['list'])

    chart_list[chart_id] = metadata
    self.storage[source] = chart_list
    self.storage[chart_id] = dict(item)
    self.storage[source + "cacheControl"] = dict(
        chartCache.setCacheControl({'seconds': item["maxage"]}))
    return item
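# A minimal sketch of the shape chartCache.setCacheControl is assumed to
# return, inferred from how the spiders below consume it ("Date-Modified",
# "Date-Expires", "Max-Age"); the real helper lives in the chartCache module
# and may differ.
from datetime import datetime, timedelta

def set_cache_control_sketch(expires):
    # 'expires' is either a timedelta or a {'seconds': n} mapping, matching
    # the two call styles seen in this file.
    if isinstance(expires, dict):
        delta = timedelta(seconds=expires['seconds'])
    else:
        delta = expires
    now = datetime.utcnow()
    return {"Date-Modified": now.isoformat(),
            "Date-Expires": (now + delta).isoformat(),
            "Max-Age": int(delta.total_seconds())}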
class DjShopSpider(CrawlSpider):
    name = "djshop.de"
    allowed_domains = ["djshop.de"]
    baseUrl = "http://www.djshop.de/"
    baseCharts = [
        "%sDownload-Charts/ex/s~mp3,u~charts/xe/Download-Charts.html" % baseUrl,
        "%sVinyl-Charts/ex/s~charts/xe/charts.html" % baseUrl
    ]
    chartTypes = [
        {"unpretty": "MP3 Downloads Charts", "pretty": "Digital Charts"},
        {"unpretty": "Charts Style Charts", "pretty": "Vinyl Charts"},
        {"unpretty": "Charts Top 100", "pretty": "Top 100"},
        {"unpretty": "Charts International Charts", "pretty": "International Charts"}
    ]

    # Expires in 2 days
    expires = chartCache.timedeltaUntilDays(2)
    cacheControl = chartCache.setCacheControl(expires)
    source_id = "djshop.de"
    source_name = "djShop.de"
    description = "Updated daily with what's currently hot on the electronic scene."
    have_extra = True
    details = DetailItem(Detail(
        id=source_id,
        description=description,
        name=source_name,
        have_extra=have_extra
    ))

    def __init__(self, name=None, **kwargs):
        super(DjShopSpider, self).__init__()
        chartCache.shoveDetails(self.details)
        self.get_chart_urls()

    def get_chart_urls(self):
        for chart in self.baseCharts:
            req = urllib2.Request(chart)
            hxs = HtmlXPathSelector(text=urllib2.urlopen(req).read())
            try:
                navBox = hxs.select('//div[@id="leftColumn"]')
                navList = navBox.select('//ul[@class="navUL"]/li')
                for index, link in enumerate(navList):
                    if "Label Charts" not in link.select('a/text()').extract()[0].strip():
                        self.start_urls.append(
                            "http://www.djshop.de"
                            + link.select('a/@href').extract()[0].strip())
            except Exception, e:
                print e
def __getCacheControl(self):
    self.cacheControl = chartCache.setCacheControl(self.expires)
class BillboardSpider(CrawlSpider):
    name = "billboard.com"
    allowed_domains = ["billboard.com"]
    start_urls = [
        # this is the list of all the charts
        "http://www.billboard.com/charts"
    ]

    # xpath to retrieve the urls to specific charts
    chart_xpath = '//span[@class="field-content"]/a'
    # the xpath to the pagination links
    next_page_xpath = '//div[@class="chart_pager_bottom"]/div/ul/li[@class="pager-item"]/a/@href'

    # we only need one rule, and that is to follow
    # the links from the charts list page
    rules = [
        Rule(SgmlLinkExtractor(allow=['/charts/\w+'],
                               restrict_xpaths=chart_xpath),
             callback='parse_chart', follow=True)
    ]

    expires = chartCache.timedeltaUntilWeekday(EXPIRES_DAY, EXPIRES_HOUR)
    cacheControl = chartCache.setCacheControl(expires)
    source_id = "billboard"
    description = ("The week's top-selling and most played albums and tracks "
                   "across all genres, ranked by sales data and radio airtime "
                   "as compiled by Nielsen.")
    details = DetailItem(Detail(
        id=source_id,
        description=description,
    ))

    def __init__(self, name=None, **kwargs):
        super(BillboardSpider, self).__init__()
        chartCache.shoveDetails(self.details)

    def parse_chart(self, response):
        hxs = HtmlXPathSelector(response)
        chart_name = hxs.select('//h1[@id="page-title"]/text()').extract()[0].strip()
        #chart_type = hxs.select('//*[@id="chart-list"]/div[@id="chart-type-fb"]/text()').extract()[0].strip()

        # get a list of pages, drop javascript links, and turn them into a queue
        next_pages = hxs.select(self.next_page_xpath).extract()
        next_pages = deque(filter(lambda e: not 'javascript' in e, next_pages))

        # Correct the grammar to fit our expectations
        if chart_name == 'Germany Songs':
            chart_name = 'German Tracks'

        chart = ChartItem()
        chart['name'] = chart_name
        chart['display_name'] = chart_name if chart_name else "Top Overall"
        chart['origin'] = response.url
        chart['source'] = 'billboard'
        chart['id'] = slugify(chart_name)
        chart['list'] = []
        chart['date'] = self.cacheControl.get("Date-Modified")
        chart['expires'] = self.cacheControl.get("Date-Expires")
        chart['maxage'] = self.cacheControl.get("Max-Age")

        # let's figure out the content type
        lower_name = chart_name.lower()
        if 'songs' in lower_name:
            chart['type'] = 'Track'
            typeItem = SingleTrackItem()
        elif 'albums' in lower_name \
                or any(s in lower_name for s in ['soundtracks', 'billboard 200', 'tastemakers']):
            chart['type'] = 'Album'
            typeItem = SingleAlbumItem()
        elif any(s in lower_name for s in ['social 50', 'uncharted']):
            chart['type'] = 'Artist'
            typeItem = SingleArtistItem()
        else:
            chart['type'] = 'Track'
            typeItem = SingleTrackItem()

        if (chart['id'] == settings["BILLBOARD_DEFAULT_ALBUMCHART"]
                or chart['id'] == settings["BILLBOARD_DEFAULT_TRACKCHART"]):
            chart['default'] = 1

        chart = self.parse_items(hxs, chart, typeItem)

        # ok, we've prepped the chart container, let's start getting the pages
        if len(next_pages) > 0:
            next_page = next_pages.popleft()
            request = Request('http://www.billboard.com' + next_page,
                              callback=lambda r: self.parse_page(r, chart, next_pages, typeItem))
            yield request

    def parse_items(self, hxs, chart, typeItem):
        # parse every chart entry
        chart_list = []
        for item in hxs.select('//div[contains(@class,"chart_listing")]/article'):
            loader = XPathItemLoader(typeItem, selector=item)
            loader.add_xpath('rank', 'header/span[contains(@class, "chart_position")]/text()')
            # the title field matches the chart type, so fill in whichever
            # field the chart type names
            if 'artist' in chart['type'].lower():
                loader.add_xpath('artist', 'header/p[@class="chart_info"]/a/text()')
            else:
                loader.add_xpath(chart['type'].lower(), 'header/h1/text()')
                loader.add_xpath('artist', 'header/p[@class="chart_info"]/a/text()')
                loader.add_xpath('album', 'header/p[@class="chart_info"]/text()')
            single = loader.load_item()
            chart_list.append(dict(single))
        chart['list'] += chart_list
        return chart

    def parse_page(self, response, chart, next_pages, typeItem):
        hxs = HtmlXPathSelector(response)
        chart = self.parse_items(hxs, chart, typeItem)

        if len(next_pages) == 0:
            log.msg("Done with %s" % (chart['name']))
            yield chart
        else:
            next_page = next_pages.popleft()
            log.msg("Starting next page (%s) of %s - %s left"
                    % (next_page, chart['name'], len(next_pages)))
            request = Request('http://www.billboard.com' + next_page,
                              callback=lambda r: self.parse_page(r, chart, next_pages, typeItem))
            yield request
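# Minimal sketch of the pagination pattern used in parse_chart/parse_page
# above: the deque of remaining page urls and the partially built chart are
# captured in each Request callback's closure, so items accumulate until the
# queue drains. Names here are illustrative, not part of the spider.
from collections import deque
from scrapy.http import Request

def follow_pages(base_url, chart, pages, parse_page):
    if len(pages) > 0:
        next_page = pages.popleft()
        # the lambda threads the shared state through to the next response
        return Request(base_url + next_page,
                       callback=lambda r: parse_page(r, chart, pages))
    return chart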
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    chart_name = "Top 100"
    try:
        chart_type = hxs.select('//*[@class="tab-right-active"]/text()').extract()[0].strip()
    except IndexError:
        chart_type = hxs.select('//*[@class="tab-left-active"]/text()').extract()[0].strip()

    extra = ""
    if "upcoming" in response.url:
        extra = "Upcoming"
    if "mainstream" in response.url:
        extra = "Mainstream"
    if "alltime" in response.url:
        chart_name += " " + extra
        extra = "Alltime"

    chart_id = chart_name + extra + chart_type

    chart = ChartItem()
    chart['name'] = chart_name + " " + chart_type
    chart['display_name'] = chart["name"] if chart["name"] else "Top Overall"
    chart['origin'] = response.url
    chart['source'] = 'hotnewhiphop'
    chart['id'] = slugify(chart_id)
    chart['list'] = []
    chart['extra'] = extra

    expires = chartCache.timedeltaUntilDays(1)
    cacheControl = chartCache.setCacheControl(expires)
    chart['date'] = cacheControl.get("Date-Modified")
    chart['expires'] = cacheControl.get("Date-Expires")
    chart['maxage'] = cacheControl.get("Max-Age")

    if "mixtape" in response.url:
        if extra == "Upcoming":
            chart['default'] = 1
        chart['type'] = "Album"
        urlKey = "url"
        url = "http://www.hotnewhiphop.com/ajax/api/getMixtape/"
    elif "song" in response.url:
        chart['type'] = "Track"
        # Later on, if we have a hnhh resolver, this url could be used to
        # get a valid mp3 stream.
        url = "hnhh://www.hotnewhiphop.com/ajax/api/getSong/"
        urlKey = "stream_url"
    else:
        log.msg("Error with %s" % (chart['name']))
        return

    chart_list = []
    rank = 0
    for item in hxs.select('//div[@class="newCell newCell2"]'):
        if chart['type'] == "Album":
            loader = XPathItemLoader(SingleUrlAlbumItem(), selector=item)
        if chart['type'] == "Track":
            loader = XPathItemLoader(SingleUrlTrackItem(), selector=item)
        loader.add_xpath(chart['type'].lower(), 'div[@class="centerBlock"]/h3/a/text()')
        loader.add_xpath('artist', 'div[@class="centerBlock"]/a/i/text()')
        loader.add_xpath(urlKey, 'div[@class="centerBlock"]/a/@href')
        single = loader.load_item()
        single[urlKey] = url + urlparse(single[urlKey]).path.split(".")[1]
        rank += 1
        single['rank'] = rank
        chart_list.append(dict(single))

    log.msg("Done with %s" % (chart['name']))
    chart['list'] += chart_list
    return chart
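# Worked example of the api-url rewrite above; the page href shape
# ("/<slug>.<numeric id>.html") is an assumption for illustration.
from urlparse import urlparse

href = "http://www.hotnewhiphop.com/some-mixtape.12345.html"
suffix = urlparse(href).path.split(".")[1]  # -> '12345'
api_url = "http://www.hotnewhiphop.com/ajax/api/getMixtape/" + suffix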
def parse_atom(self, feed):
    ns = {'ns': 'http://www.w3.org/2005/Atom',
          'im': 'http://itunes.apple.com/rss'}
    try:
        _id = feed.xpath('/ns:feed/ns:id', namespaces=ns)[0].text
        _type = feed.xpath('/ns:feed/ns:entry/im:contentType/im:contentType',
                           namespaces=ns)[0].attrib['term']
    except IndexError:
        return

    if _type != "Album" and _type != "Track":
        return  # skip playlists

    entries = feed.xpath('/ns:feed/ns:entry', namespaces=ns)
    chart_list = []
    rank = 0
    for entry in entries:
        title = entry.xpath('im:name', namespaces=ns)[0].text
        artist = entry.xpath('im:artist', namespaces=ns)[0].text
        if _type == "Album":
            album = title
            item = SingleAlbumItem()
        elif _type == "Track":
            album = None
            collectionNames = entry.xpath('im:collection/im:name', namespaces=ns)
            if len(collectionNames) > 0:
                album = collectionNames[0].text
            item = SingleTrackItem()
            item['track'] = title
        rank += 1
        item['artist'] = artist
        item['album'] = album
        item['rank'] = rank
        chart_list.append(dict(item))

    title = feed.xpath('ns:title', namespaces=ns)[0].text

    geo = None
    rGeo = re.search("cc=([a-zA-Z]+)", _id)
    if rGeo is not None:
        geo = rGeo.groups()[0]

    genre = None
    rGenre = re.search("genre=(\d+)/", _id)
    if rGenre is not None:
        genre = rGenre.groups()[0]
    if genre is not None:
        genre = get_genre(genre)

    origin = _id
    md5 = hashlib.md5()
    md5.update(_id)
    _id = md5.hexdigest()

    if geo is None:
        geo = origin.split("/")

    chart = ChartItem()
    # iTunes expires tomorrow at 00:00
    chart['id'] = _id
    chart['display_name'] = genre if genre else "Top Overall"
    chart['origin'] = origin
    chart['genre'] = genre
    chart['geo'] = geo
    chart['name'] = title
    chart['type'] = _type
    chart['list'] = chart_list
    chart['source'] = 'itunes'

    # maxage is the last item scraped
    expires = chartCache.timedeltaUntilDays(1)
    cacheControl = chartCache.setCacheControl(expires)
    chart['date'] = cacheControl.get("Date-Modified")
    chart['expires'] = cacheControl.get("Date-Expires")
    chart['maxage'] = cacheControl.get("Max-Age")

    if (_id == settings["ITUNES_DEFAULT_ALBUMCHART"]
            or _id == settings["ITUNES_DEFAULT_TRACKCHART"]):
        print "Found default " + _id
        chart['default'] = 1
    return chart
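# Worked example of the geo/genre extraction above, run against a feed id of
# the shape these regexes expect (the exact url is illustrative):
import re

feed_id = "https://itunes.apple.com/webobjects/viewtop?cc=us&genre=21/"
geo = re.search("cc=([a-zA-Z]+)", feed_id).groups()[0]     # -> 'us'
genre_id = re.search("genre=(\d+)/", feed_id).groups()[0]  # -> '21'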
def parse_rss(self, feed, url):
    genre_name = None
    feed_extra = None
    feed_type = "Album"
    geo = None

    genre = filter(lambda k: 'genre' in k, urlparser(url).path.split("/"))
    try:
        genre_name = get_genre(genre[0].split("=")[1])
        # geo in xpath is a different ISO code than in the url; we want the
        # cc= parameter, not the xpath value.
        # geo = feed.xpath('.//channel/language')[0].text
        rGeo = re.search("cc=(.*)(?=\/)", url)
        if rGeo is not None:
            geo = rGeo.groups()[0]
    except IndexError:
        return

    if 'newreleases' in url:
        feed_extra = "New Album Releases"
    if 'justadded' in url:
        feed_extra = "Just Added Albums"
    if 'featuredalbums' in url:
        feed_extra = "Featured Albums"

    if feed_extra is None or genre_name is None or geo is None:
        return

    ns = {'itms': 'http://phobos.apple.com/rss/1.0/modules/itms/'}
    entries = feed.xpath('.//channel/item')
    rank = 0
    chart_list = []
    for entry in entries:
        artist = entry.xpath('itms:artist', namespaces=ns)[0].text
        album = entry.xpath('itms:album', namespaces=ns)[0].text
        rank += 1
        item = SingleAlbumItem()
        item['artist'] = artist
        item['album'] = album
        item['rank'] = rank
        chart_list.append(dict(item))

    chart = ChartItem()
    # Unique id: the md5 hex digest of the feed url
    _id = url
    md5 = hashlib.md5()
    md5.update(_id)
    _id = md5.hexdigest()

    chart['id'] = _id
    chart['origin'] = url
    chart['genre'] = genre_name
    chart['geo'] = geo.lower()
    chart['name'] = genre_name
    chart['extra'] = feed_extra
    chart["newrls"] = True
    chart['type'] = feed_type
    chart['list'] = chart_list
    chart['source'] = 'itunes'

    # Expires in 1 day
    expires = chartCache.timedeltaUntilDays(1)
    cacheControl = chartCache.setCacheControl(expires)
    chart['date'] = cacheControl.get("Date-Modified")
    chart['expires'] = cacheControl.get("Date-Expires")
    chart['maxage'] = cacheControl.get("Max-Age")

    if _id == settings["ITUNES_DEFAULT_NRCHART"]:
        chart['default'] = 1
    return chart
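# The chart id is simply the md5 hex digest of the feed url, e.g. (the url
# here is illustrative):
import hashlib

url = "http://itunes.apple.com/us/rss/newreleases/limit=100/genre=21/xml"
chart_id = hashlib.md5(url).hexdigest()  # stable 32-char hex id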
class MetacriticSpider(CrawlSpider):
    name = "metacritic.com"
    allowed_domains = ["metacritic.com"]
    baseUrl = "http://www.metacritic.com"

    genre_nav_xpath = './/ul[@class="genre_nav"]/li'
    types_xpath = './/ul[contains(@class, "tabs")]/li'
    first_nav_xpath = './/ul[contains(@class, "nav_items")]/li'
    current_page_name_xpath = './/ul[contains(@class, "tabs")]/li/span[@class="active"]/span/text()'
    list_xpath = './/ol[contains(@class,"list_product_condensed")]/li'
    next_page_xpath = './/ul[@class="pages"]/li/a/@href'
    coming_soon_table_xpath = './/table[@class="musicTable"]/tr'
    coming_soon_artist_xpath = './/td[@class="artistName"]'
    coming_soon_album_xpath = './/td[@class="albumTitle"]/text()'

    start_urls = ["http://www.metacritic.com/music"]

    rules = [
        Rule(SgmlLinkExtractor(allow=("albums/genre/\w+",),
                               deny=("music", "name",),
                               restrict_xpaths=(genre_nav_xpath,)),
             callback='parse_page', follow=True),
        Rule(SgmlLinkExtractor(deny=("albums/genre/\w+", "name", "music",
                                     "coming-soon/(metascore|userscore|name|date)",
                                     "new-releases/name"),
                               restrict_xpaths=(types_xpath,)),
             callback='parse_new_releases', follow=True),
        Rule(SgmlLinkExtractor(allow=("albums/release-date", "albums/score",),
                               deny=("feature", "artist", "/\w+/people",),
                               restrict_xpaths=(first_nav_xpath,)),
             callback='parse_new_releases', follow=True)
    ]

    # Expires in 1 day
    expires = chartCache.timedeltaUntilDays(1)
    cacheControl = chartCache.setCacheControl(expires)
    source_id = "metacritic"
    source_name = "Metacritic"
    description = "Critically acclaimed and noteworthy music."
    have_extra = True
    details = DetailItem(Detail(
        id=source_id,
        description=description,
        name=source_name,
        have_extra=have_extra
    ))

    def __init__(self, name=None, **kwargs):
        super(MetacriticSpider, self).__init__()
        chartCache.shoveDetails(self.details)
        chartCache.shoveDetails(self.details, False)

    def get_current_genre(self, hxs):
        navList = hxs.select(self.genre_nav_xpath)
        for index, item in enumerate(navList):
            if item.select('.//span'):
                return item.select('.//span/text()').extract()[0].strip()
        return None

    def get_current(self, hxs, chart):
        try:
            active = hxs.select(self.current_page_name_xpath).extract()
            chart["extra"] = active[0].strip()
            chart["name"] = active[1].strip()
            chart["display_name"] = chart["name"]
            chart["id"] = slugify(chart["name"] + chart["extra"])
        except Exception, e:
            if "coming-soon" in chart["origin"]:
                chart["extra"] = "Coming Soon"
                chart["name"] = "By Date"
                chart["display_name"] = chart["name"]
                chart["id"] = slugify(chart["name"] + chart["extra"])