Example #1
    def process_item(self, item, spider):

        if not item['id'] or not item['id'].strip():
            raise DropItem("Missing id: %s" % item)

        if not item['source'] or not item['source'].strip():
            raise DropItem("Missing source: %s" % item)

        source = item['source']
        # This is a little hack: for now we only get new releases from
        # iTunes, and they are identified by an extra field.
        if item.get("newrls"):
            self.storage = chartCache.newreleases
        else:
            self.storage = chartCache.storage

        chart_id = source + item['id']
        log.msg("Saving %s - %s" % (source, item['id']))

        chart_list = self.storage.get(source, {})

        # metadata is the chart item minus the actual list plus a size
        metadata_keys = filter(lambda k: k != 'list', item.keys())
        metadata = {key: item[key] for key in metadata_keys}
        metadata['size'] = len(item['list'])
        chart_list[chart_id] = metadata
        self.storage[source] = chart_list
        self.storage[chart_id] = dict(item)
        self.storage[source + "cacheControl"] = dict(
            chartCache.setCacheControl({'seconds': item["maxage"]}))
        return item
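For reference, process_item leaves three kinds of keys behind per chart: a per-source index of metadata, the full chart under its own id, and the source's cache headers. A minimal, runnable sketch of that layout, using a plain dict in place of chartCache.storage (which these examples only treat as dict-like; the sample item is illustrative):

    # Sketch of the layout written by process_item above; the plain dict
    # stands in for chartCache.storage and the item values are made up.
    storage = {}
    source = "billboard"
    item = {"id": "hot-100", "source": source, "maxage": 3600,
            "list": [{"rank": 1, "artist": "Some Artist"}]}
    chart_id = source + item["id"]

    # 1) per-source index: chart_id -> metadata (the item minus its list,
    #    plus a size field)
    metadata = dict((k, v) for k, v in item.items() if k != "list")
    metadata["size"] = len(item["list"])
    index = storage.get(source, {})
    index[chart_id] = metadata
    storage[source] = index

    # 2) the full chart under its own key
    storage[chart_id] = dict(item)

    # 3) cache headers for the source; setCacheControl is assumed to return
    #    a dict with Date-Modified / Date-Expires / Max-Age keys, since the
    #    spiders below read those back
    storage[source + "cacheControl"] = {"Max-Age": item["maxage"]}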
Example #2
    def process_item(self, item, spider):

        if not item['id'] or not item['id'].strip():
            raise DropItem("Missing id: %s" % item)

        if not item['source'] or not item['source'].strip():
            raise DropItem("Missing source: %s" % item)

        source = item['source']
        # This is a little hack: for now we only get new releases from
        # iTunes, and they are identified by an extra field.
        if source == "itunes" and item.get('extra'):
            self.storage = chartCache.newreleases
        else:
            self.storage = chartCache.storage

        chart_id = source + item['id']
        log.msg("Saving %s - %s" % (source, item['id']))

        chart_list = self.storage.get(source, {})

        # metadata is the chart item minus the actual list plus a size
        metadata_keys = filter(lambda k: k != 'list', item.keys())
        metadata = {key: item[key] for key in metadata_keys}
        metadata['size'] = len(item['list'])
        chart_list[chart_id] = metadata
        self.storage[source] = chart_list
        self.storage[chart_id] = dict(item)
        self.storage[source+"cacheControl"] = dict(chartCache.setCacheControl({'seconds' : item["maxage"]}))
        return item
Example #3
class DjShopSpider(CrawlSpider):
    name = "djshop.de"
    allowed_domains = ["djshop.de"]
    baseUrl = "http://www.djshop.de/"
    baseCharts = [
        "%sDownload-Charts/ex/s~mp3,u~charts/xe/Download-Charts.html" %
        baseUrl,
        "%sVinyl-Charts/ex/s~charts/xe/charts.html" % baseUrl
    ]

    chartTypes = [
        {"unpretty": "MP3 Downloads Charts", "pretty": "Digital Charts"},
        {"unpretty": "Charts Style Charts", "pretty": "Vinyl Charts"},
        {"unpretty": "Charts Top 100", "pretty": "Top 100"},
        {"unpretty": "Charts International Charts", "pretty": "International Charts"},
    ]

    # Expires in 2 days
    expires = chartCache.timedeltaUntilDays(2)
    cacheControl = chartCache.setCacheControl(expires)

    source_id = "djshop.de"
    source_name = "djShop.de"
    description = "Updated daily with what's currently hot on the electronic scene."
    have_extra = True
    details = DetailItem(
        Detail(id=source_id,
               description=description,
               name=source_name,
               have_extra=have_extra))

    def __init__(self, name=None, **kwargs):
        super(DjShopSpider, self).__init__()
        chartCache.shoveDetails(self.details)
        self.get_chart_urls()

    def get_chart_urls(self):
        for chart in self.baseCharts:
            req = urllib2.Request(chart)
            hxs = HtmlXPathSelector(text=urllib2.urlopen(req).read())
            try:
                navBox = hxs.select('//div[@id="leftColumn"]')
                navList = navBox.select('//ul[@class="navUL"]/li')
                for index, link in enumerate(navList):
                    if not "Label Charts" in link.select(
                            'a/text()').extract()[0].strip():
                        self.start_urls.append(
                            "http://www.djshop.de" +
                            link.select('a/@href').extract()[0].strip())
            except Exception, e:
                print e
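Fetching the nav pages with blocking urllib2 calls inside __init__ happens before the Scrapy engine starts. A sketch of the same discovery done inside the crawl via start_requests (a standard Scrapy entry point); parse_nav is a hypothetical helper reusing the xpaths from get_chart_urls, with imports omitted as in the examples here:

    def start_requests(self):
        # non-blocking alternative to building start_urls in __init__
        for chart in self.baseCharts:
            yield Request(chart, callback=self.parse_nav)

    def parse_nav(self, response):
        hxs = HtmlXPathSelector(response)
        for link in hxs.select('//div[@id="leftColumn"]//ul[@class="navUL"]/li'):
            text = link.select('a/text()').extract()[0].strip()
            if "Label Charts" not in text:
                url = ("http://www.djshop.de" +
                       link.select('a/@href').extract()[0].strip())
                yield Request(url)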
Example #4
 def __getCacheControl(self):
     self.cacheControl = chartCache.setCacheControl(self.expires)
Example #5
class BillboardSpider(CrawlSpider):
    name = "billboard.com"
    allowed_domains = ["billboard.com"]
    start_urls = [
        # this is the list of all the charts
        "http://www.billboard.com/charts"
    ]

    # xpath to retrieve the urls to specific charts
    chart_xpath = '//span[@class="field-content"]/a'
    # the xpath to the pagination links
    next_page_xpath = '//div[@class="chart_pager_bottom"]/div/ul/li[@class="pager-item"]/a/@href'
    # we only need one rule, and that is to follow
    # the links from the charts list page
    rules = [
        Rule(SgmlLinkExtractor(allow=['/charts/\w+'],
                               restrict_xpaths=chart_xpath),
             callback='parse_chart',
             follow=True)
    ]

    expires = chartCache.timedeltaUntilWeekday(EXPIRES_DAY, EXPIRES_HOUR)
    cacheControl = chartCache.setCacheControl(expires)

    source_id = "billboard"
    description = "The week's top-selling and most played albums and tracks across all genres, ranked by sales data and radio airtime as compiled by Nielsen."
    details = DetailItem(Detail(
        id=source_id,
        description=description,
    ))

    def __init__(self, name=None, **kwargs):
        super(BillboardSpider, self).__init__()
        chartCache.shoveDetails(self.details)

    def parse_chart(self, response):
        hxs = HtmlXPathSelector(response)

        chart_name = hxs.select(
            '//h1[@id="page-title"]/text()').extract()[0].strip()
        #chart_type = hxs.select('//*[@id="chart-list"]/div[@id="chart-type-fb"]/text()').extract()[0].strip()

        # get a list of pages
        next_pages = hxs.select(self.next_page_xpath).extract()
        # remove javascript links and turn the rest into a queue; we also
        # want to exclude the "next chart" link (!)
        next_pages = deque(filter(lambda e: 'javascript' not in e, next_pages))

        # Correct the grammar to fit our expectations
        if chart_name == 'Germany Songs':
            chart_name = 'German Tracks'

        chart = ChartItem()
        chart['name'] = chart_name
        chart['display_name'] = chart_name if chart_name else "Top Overall"
        chart['origin'] = response.url
        chart['source'] = 'billboard'
        chart['id'] = slugify(chart_name)
        chart['list'] = []

        chart['date'] = self.cacheControl.get("Date-Modified")
        chart['expires'] = self.cacheControl.get("Date-Expires")
        chart['maxage'] = self.cacheControl.get("Max-Age")

        # lets figure out the content type
        lower_name = chart_name.lower()
        if 'songs' in lower_name:
            chart['type'] = 'Track'
            typeItem = SingleTrackItem()
        elif 'albums' in lower_name \
                or any(s in lower_name for s in ['soundtracks', 'billboard 200', 'tastemakers']):
            chart['type'] = 'Album'
            typeItem = SingleAlbumItem()
        elif any(s in lower_name for s in ['social 50', 'uncharted']):
            chart['type'] = 'Artist'
            typeItem = SingleArtistItem()
        else:
            chart['type'] = 'Track'
            typeItem = SingleTrackItem()

        if (chart['id'] == settings["BILLBOARD_DEFAULT_ALBUMCHART"]
                or chart['id'] == settings["BILLBOARD_DEFAULT_TRACKCHART"]):
            chart['default'] = 1

        chart = self.parse_items(hxs, chart, typeItem)
        # ok, we've prepped the chart container, lets start getting the pages
        if next_pages:
            next_page = next_pages.popleft()
            request = Request('http://www.billboard.com' + next_page,
                              callback=lambda r: self.parse_page(
                                  r, chart, next_pages, typeItem))
            yield request

    def parse_items(self, hxs, chart, typeItem):
        # parse every chart entry
        chart_list = []
        for item in hxs.select(
                '//div[contains(@class,"chart_listing")]/article'):
            loader = XPathItemLoader(typeItem, selector=item)
            loader.add_xpath(
                'rank',
                'header/span[contains(@class, "chart_position")]/text()')
            # ptitle yields the title for the type, so just set the title to whatever the chartype is.
            if 'artist' in chart['type'].lower():
                loader.add_xpath('artist',
                                 'header/p[@class="chart_info"]/a/text()')
            else:
                loader.add_xpath(chart['type'].lower(), 'header/h1/text()')
                loader.add_xpath('artist',
                                 'header/p[@class="chart_info"]/a/text()')
                loader.add_xpath('album',
                                 'header/p[@class="chart_info"]/text()')

            single = loader.load_item()
            chart_list.append(dict(single))

        chart['list'] += chart_list

        return chart

    def parse_page(self, response, chart, next_pages, typeItem):

        hxs = HtmlXPathSelector(response)
        chart = self.parse_items(hxs, chart, typeItem)

        if not next_pages:
            log.msg("Done with %s" % (chart['name']))
            yield chart
        else:
            next_page = next_pages.popleft()
            log.msg("Starting nextpage (%s) of %s - %s left" %
                    (next_page, chart['name'], len(next_pages)))
            request = Request('http://www.billboard.com' + next_page,
                              callback=lambda r: self.parse_page(
                                  r, chart, next_pages, typeItem))
            yield request
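The chart accumulator (chart, next_pages, typeItem) is threaded through the pagination with lambda callbacks. Scrapy's Request.meta dict (standard API) is a common alternative that keeps callbacks plain methods; a sketch of both ends of that hand-off, mirroring parse_chart/parse_page above, with the elided parts unchanged:

    # In parse_chart, state rides along in meta instead of a closure:
    request = Request('http://www.billboard.com' + next_page,
                      callback=self.parse_page,
                      meta={'chart': chart, 'next_pages': next_pages,
                            'typeItem': typeItem})

    # ...and parse_page recovers it from the response:
    def parse_page(self, response):
        chart = response.meta['chart']
        next_pages = response.meta['next_pages']
        typeItem = response.meta['typeItem']
        # ...then parse_items and re-yield, as above...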
Example #7
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        chart_name = "Top 100"
        try:
            chart_type = hxs.select(
                '//*[@class="tab-right-active"]/text()').extract()[0].strip()
        except IndexError:
            chart_type = hxs.select(
                '//*[@class="tab-left-active"]/text()').extract()[0].strip()

        if "upcoming" in response.url:
            extra = "Upcoming"
        if "mainstream" in response.url:
            extra = "Mainstream"
        if "alltime" in response.url:
            chart_name += " " + extra
            extra = "Alltime"

        chart_id = chart_name + extra + chart_type
        chart = ChartItem()
        chart['name'] = chart_name + " " + chart_type
        chart['display_name'] = chart["name"] if chart["name"] else "Top Overall"
        chart['origin'] = response.url
        chart['source'] = 'hotnewhiphop'
        chart['id'] = slugify(chart_id)
        chart['list'] = []
        chart['extra'] = extra

        expires = chartCache.timedeltaUntilDays(1)
        cacheControl = chartCache.setCacheControl(expires)
        chart['date'] = cacheControl.get("Date-Modified")
        chart['expires'] = cacheControl.get("Date-Expires")
        chart['maxage'] = cacheControl.get("Max-Age")

        if "mixtape" in response.url:
            if extra == "Upcoming":
                chart['default'] = 1
            chart['type'] = "Album"
            loader = SingleUrlAlbumItem()
            urlKey = "url"
            url = "http://www.hotnewhiphop.com/ajax/api/getMixtape/"
        elif "song" in response.url:
            chart['type'] = "Track"
            loader = SingleUrlTrackItem()
            # Later on, if we have a hnhh resolver, this url could be used to
            # get a valid mp3 stream.
            url = "hnhh://www.hotnewhiphop.com/ajax/api/getSong/"
            urlKey = "stream_url"
        else:
            log.msg("Error with %s" % (chart['name']))
            return

        chart_list = []
        rank = 0
        for item in hxs.select('//div[@class="newCell newCell2"]'):
            if chart['type'] == "Album":
                loader = XPathItemLoader(SingleUrlAlbumItem(), selector=item)
            if chart['type'] == "Track":
                loader = XPathItemLoader(SingleUrlTrackItem(), selector=item)
            loader.add_xpath(chart['type'].lower(),
                             'div[@class="centerBlock"]/h3/a/text()')
            loader.add_xpath('artist', 'div[@class="centerBlock"]/a/i/text()')
            loader.add_xpath(urlKey, 'div[@class="centerBlock"]/a/@href')
            single = loader.load_item()
            single[urlKey] = url + urlparse(single[urlKey]).path.split(".")[1]
            rank += 1
            single['rank'] = rank
            chart_list.append(dict(single))

        log.msg("Done with %s" % (chart['name']))
        chart['list'] += chart_list
        return chart
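The urlparse line above swaps each scraped href for an API endpoint by keeping only the dot-separated id segment of the path. A small illustration with a hypothetical href (the real hotnewhiphop.com path format is an assumption):

    from urlparse import urlparse  # Python 2; urllib.parse.urlparse on Python 3

    href = "http://www.hotnewhiphop.com/some-mixtape.12345.html"  # made up
    api = "http://www.hotnewhiphop.com/ajax/api/getMixtape/"
    # path "/some-mixtape.12345.html" splits into ["/some-mixtape", "12345", "html"]
    print(api + urlparse(href).path.split(".")[1])
    # -> http://www.hotnewhiphop.com/ajax/api/getMixtape/12345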
Example #8
    def parse_atom(self, feed):
        ns = {'ns': 'http://www.w3.org/2005/Atom',
              'im': 'http://itunes.apple.com/rss'}
        try:
            _id = feed.xpath('/ns:feed/ns:id', namespaces=ns)[0].text
            _type = feed.xpath('/ns:feed/ns:entry/im:contentType/im:contentType',
                               namespaces=ns)[0].attrib['term']
        except IndexError:
            return

        if _type != "Album" and _type != "Track":
            return # skip playlists

        entries = feed.xpath('/ns:feed/ns:entry', namespaces=ns)
        chart_list = []
        rank = 0
        for entry in entries:
            title = entry.xpath('im:name', namespaces=ns)[0].text
            artist = entry.xpath('im:artist', namespaces=ns)[0].text
            if _type == "Album":
                album = title
                item = SingleAlbumItem()
            elif _type == "Track":
                album = None
                collectionNames = entry.xpath('im:collection/im:name', namespaces=ns)
                if collectionNames:
                    album = collectionNames[0].text
                item = SingleTrackItem()
                item['track'] = title
            
            rank += 1
            item['artist'] = artist
            item['album'] = album
            item['rank'] = rank
            chart_list.append(dict(item))

        title = feed.xpath('ns:title', namespaces=ns)[0].text

        geo = None
        rGeo = re.search(r"cc=([a-zA-Z]+)", _id)
        if rGeo is not None:
            geo = rGeo.group(1)

        genre = None
        rGenre = re.search(r"genre=(\d+)/", _id)
        if rGenre is not None:
            genre = rGenre.group(1)

        if genre is not None:
            genre = get_genre(genre)

        origin = _id
        md5 = hashlib.md5()
        md5.update(_id)
        _id = md5.hexdigest()

        if geo is None:
            geo_s = origin.split("/")
            geo = geo_s

        chart = ChartItem()
        # Itunes expires tomorrow at 00am
        chart['id'] = _id
        chart['display_name'] = genre if genre else "Top Overall"
        chart['origin'] = origin
        chart['genre'] = genre
        chart['geo'] = geo
        chart['name'] = title
        chart['type'] = _type
        chart['list'] = chart_list
        chart['source'] = 'itunes'

        # cache headers: iTunes charts expire in one day
        expires = chartCache.timedeltaUntilDays(1)
        cacheControl = chartCache.setCacheControl(expires)
        chart['date'] = cacheControl.get("Date-Modified")
        chart['expires'] = cacheControl.get("Date-Expires")
        chart['maxage'] = cacheControl.get("Max-Age")

        if _id in (settings["ITUNES_DEFAULT_ALBUMCHART"],
                   settings["ITUNES_DEFAULT_TRACKCHART"]):
            print "Found default " + _id
            chart['default'] = 1

        return chart
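A portability note on the id hashing: these examples are Python 2, where md5.update() accepts str. Under Python 3, hashlib requires bytes, so the feed id needs an encode step first; a minimal sketch:

    import hashlib

    def chart_id_from_feed_id(feed_id):
        # hashlib wants bytes on Python 3; on Python 2, str is already bytes
        md5 = hashlib.md5()
        md5.update(feed_id.encode("utf-8"))
        return md5.hexdigest()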
Example #9
 def parse_rss(self, feed, url):
     genre_name = None
     feed_extra = None
     feed_type = "Album"
     geo = None
     genre = filter(lambda k: 'genre' in k, urlparser(url).path.split("/"))
     try:
         genre_name = get_genre(genre[0].split("=")[1])
         # the cc= code in the URL differs from the ISO code in the feed
         # xpath; we want the URL's cc=, not the xpath language
         # geo = feed.xpath('.//channel/language')[0].text
         rGeo = re.search(r"cc=(.*)(?=/)", url)
         if rGeo is not None:
             geo = rGeo.group(1)
     except IndexError:
         return
     
     if 'newreleases' in url:
         feed_extra = "New Album Releases"
     if 'justadded' in url:
         feed_extra = "Just Added Albums"
     if 'featuredalbums' in url:
         feed_extra = "Featured Albums"

     if feed_extra is None or genre_name is None or geo is None:
         return
     
     ns = {'itms': 'http://phobos.apple.com/rss/1.0/modules/itms/'}
     entries = feed.xpath('.//channel/item')
     rank = 0
     chart_list = []
     for entry in entries:
         artist = entry.xpath('itms:artist', namespaces=ns)[0].text
         album = entry.xpath('itms:album', namespaces=ns)[0].text
         rank += 1
         item = SingleAlbumItem()
         item['artist'] = artist
         item['album'] = album
         item['rank'] = rank
         chart_list.append(dict(item))
     
     chart = ChartItem()
     # Unique ids
     _id = url
     md5 = hashlib.md5()
     md5.update(_id)
     _id = md5.hexdigest()
     
     chart['id'] = _id
     chart['origin'] = url
     chart['genre'] = genre_name
     chart['geo'] = geo.lower()
     chart['name'] = genre_name
     chart['extra'] = feed_extra
     chart["newrls"] = True
     chart['type'] = feed_type
     chart['list'] = chart_list
     chart['source'] = 'itunes'
     # cache headers: the feed expires in 1 day
     expires = chartCache.timedeltaUntilDays(1)
     cacheControl = chartCache.setCacheControl(expires)
     chart['date'] = cacheControl.get("Date-Modified")
     chart['expires'] = cacheControl.get("Date-Expires")
     chart['maxage'] = cacheControl.get("Max-Age")
     
     if _id == settings["ITUNES_DEFAULT_NRCHART"]:
         chart['default'] = 1
     
     return chart
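One subtlety in the cc= regex above: the greedy capture cc=(.*)(?=/) grabs everything up to the last slash of the URL, so it only yields a bare country code when cc= is the last segment before the filename; the character-class pattern used by parse_atom avoids this. For illustration (the feed URL is hypothetical):

    import re

    url = "http://itunes.apple.com/rss/newreleases/cc=us/limit=100/rss.xml"
    print(re.search(r"cc=(.*)(?=/)", url).group(1))    # -> us/limit=100
    print(re.search(r"cc=([a-zA-Z]+)", url).group(1))  # -> us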
Example #11
class MetacriticSpider(CrawlSpider):
    name = "metacritic.com"
    allowed_domains = ["metacritic.com"]
    baseUrl = "http://www.metacritic.com"

    genre_nav_xpath = './/ul[@class="genre_nav"]/li'
    types_xpath = './/ul[contains(@class, "tabs")]/li'
    first_nav_xpath = './/ul[contains(@class, "nav_items")]/li'
    current_page_name_xpath = './/ul[contains(@class, "tabs")]/li/span[@class="active"]/span/text()'
    list_xpath = './/ol[contains(@class,"list_product_condensed")]/li'
    next_page_xpath = './/ul[@class="pages"]/li/a/@href'
    coming_soon_table_xpath = './/table[@class="musicTable"]/tr'
    coming_soon_artist_xpath = './/td[@class="artistName"]'
    coming_soon_album_xpath = './/td[@class="albumTitle"]/text()'

    start_urls = ["http://www.metacritic.com/music"]

    rules = [
        Rule(SgmlLinkExtractor(allow=("albums/genre/\w+",),
                               deny=("music", "name"),
                               restrict_xpaths=(genre_nav_xpath,)),
             callback='parse_page',
             follow=True),
        Rule(SgmlLinkExtractor(deny=("albums/genre/\w+", "name", "music",
                                     "coming-soon/(metascore|userscore|name|date)",
                                     "new-releases/name"),
                               restrict_xpaths=(types_xpath,)),
             callback='parse_new_releases',
             follow=True),
        Rule(SgmlLinkExtractor(allow=("albums/release-date", "albums/score"),
                               deny=("feature", "artist", "/\w+/people"),
                               restrict_xpaths=(first_nav_xpath,)),
             callback='parse_new_releases',
             follow=True)
    ]

    # Expires in 1 day
    expires = chartCache.timedeltaUntilDays(1)
    cacheControl = chartCache.setCacheControl(expires)
    source_id = "metacritic"
    source_name = "Metacritic"
    description = "Critically acclaimed and noteworthy music."
    have_extra = True

    details = DetailItem(
        Detail(id=source_id,
               description=description,
               name=source_name,
               have_extra=have_extra))

    def __init__(self, name=None, **kwargs):
        super(MetacriticSpider, self).__init__()
        chartCache.shoveDetails(self.details)
        chartCache.shoveDetails(self.details, False)

    def get_current_genre(self, hxs):
        navList = hxs.select(self.genre_nav_xpath)
        for item in navList:
            if item.select('.//span'):
                return item.select('.//span/text()').extract()[0].strip()
        return None

    def get_current(self, hxs, chart):
        try:
            active = hxs.select(self.current_page_name_xpath).extract()
            chart["extra"] = active[0].strip()
            chart["name"] = active[1].strip()
            chart["display_name"] = chart["name"]
            chart["id"] = slugify(chart["name"] + chart["extra"])
        except Exception:
            if "coming-soon" in chart["origin"]:
                chart["extra"] = "Coming Soon"
                chart["name"] = "By Date"
                chart["display_name"] = chart["name"]
                chart["id"] = slugify(chart["name"] + chart["extra"])
Example #12
    def parse_atom(self, feed):
        ns = {'ns': 'http://www.w3.org/2005/Atom',
              'im': 'http://itunes.apple.com/rss'}
        try:
            _id = feed.xpath('/ns:feed/ns:id', namespaces=ns)[0].text
            _type = feed.xpath('/ns:feed/ns:entry/im:contentType/im:contentType',
                               namespaces=ns)[0].attrib['term']
        except IndexError:
            return

        if _type != "Album" and _type != "Track":
            return # skip playlists

        entries = feed.xpath('/ns:feed/ns:entry', namespaces=ns)
        chart_list = []
        rank = 0
        for entry in entries:
            title = entry.xpath('im:name', namespaces=ns)[0].text
            artist = entry.xpath('im:artist', namespaces=ns)[0].text
            if _type == "Album":
                album = title
                item = SingleAlbumItem()
            elif _type == "Track":
                album = entry.xpath('im:collection/im:name', namespaces=ns)[0].text
                item = SingleTrackItem()
                item['track'] = title
            
            rank += 1
            item['artist'] = artist
            item['album'] = album
            item['rank'] = rank
            chart_list.append(dict(item))

        title = feed.xpath('ns:title', namespaces=ns)[0].text

        geo = None
        rGeo = re.search(r"cc=([a-zA-Z]+)", _id)
        if rGeo is not None:
            geo = rGeo.group(1)

        genre = None
        rGenre = re.search(r"genre=(\d+)/", _id)
        if rGenre is not None:
            genre = rGenre.group(1)

        if genre is not None:
            genre = get_genre(genre)

        origin = _id
        md5 = hashlib.md5()
        md5.update(_id)
        _id = md5.hexdigest()

        if geo is None:
            geo_s = origin.split("/")
            geo = geo_s

        chart = ChartItem()
        # Itunes expires tomorrow at 00am
        chart['id'] = _id
        chart['display_name'] = genre if genre else "Top Overall"
        chart['origin'] = origin
        chart['genre'] = genre
        chart['geo'] = geo
        chart['name'] = title
        chart['type'] = _type
        chart['list'] = chart_list
        chart['source'] = 'itunes'

        # cache headers: iTunes charts expire in one day
        expires = chartCache.timedeltaUntilDays(1)
        cacheControl = chartCache.setCacheControl(expires)
        chart['date'] = cacheControl.get("Date-Modified")
        chart['expires'] = cacheControl.get("Date-Expires")
        chart['maxage'] = cacheControl.get("Max-Age")

        if _id in (settings["ITUNES_DEFAULT_ALBUMCHART"],
                   settings["ITUNES_DEFAULT_TRACKCHART"]):
            print "Found default " + _id
            chart['default'] = 1

        return chart