Exemplos de slugify em Python, exemplos de scrapers.items.slugify em Python

Exemplo n.º 1

0

Exibir arquivo

 def get_current(self, hxs, chart):
     """Fill in the naming fields of ``chart`` from the current page.

     The strings extracted with ``self.current_page_name_xpath`` supply
     ``extra`` (first match) and ``name`` (second match); ``display_name``
     mirrors ``name`` and ``id`` is the slug of ``name + extra``.

     If any step of that extraction raises and ``chart["origin"]`` contains
     "coming-soon", fixed "By Date" / "Coming Soon" values are used
     instead.  For every other origin the error is ignored and ``chart``
     keeps whatever had been written before the failure.

     :param hxs: selector exposing ``select(xpath).extract()``.
     :param chart: mutable mapping; ``origin`` is read when extraction fails.
     """
     try:
         active = hxs.select(self.current_page_name_xpath).extract()
         chart["extra"] = active[0].strip()
         chart["name"] = active[1].strip()
         chart["display_name"] = chart["name"]
         chart["id"] = slugify(chart["name"] + chart["extra"])
     except Exception:
         # Deliberately broad, best-effort scraping.  The exception object
         # was never used, so it is no longer bound (the old
         # ``except Exception, e`` form is also Python-2-only syntax).
         if "coming-soon" in chart["origin"]:
             chart["extra"] = "Coming Soon"
             chart["name"] = "By Date"
             chart["display_name"] = chart["name"]
             chart["id"] = slugify(chart["name"] + chart["extra"])

Exemplo n.º 2

0

Exibir arquivo

Arquivo: metacritic_spider.py Projeto: alexdavis-bf-toys/tomahawk-contrib

 def get_current(self, hxs, chart):
     """Derive ``extra``, ``name``, ``display_name`` and ``id`` for ``chart``.

     The first two strings extracted via ``self.current_page_name_xpath``
     become ``extra`` and ``name`` (both stripped); ``display_name`` copies
     ``name`` and ``id`` is ``slugify(name + extra)``.

     When the extraction raises, pages whose ``chart["origin"]`` contains
     "coming-soon" receive the fixed values "Coming Soon" / "By Date";
     any other page is left as it was at the moment of the failure and
     the error is swallowed.

     :param hxs: selector exposing ``select(xpath).extract()``.
     :param chart: mutable mapping; ``origin`` is read in the fallback path.
     """
     try:
         active = hxs.select(self.current_page_name_xpath).extract()
         chart["extra"] = active[0].strip()
         chart["name"] = active[1].strip()
         chart["display_name"] = chart["name"]
         chart["id"] = slugify(chart["name"] + chart["extra"])
     except Exception:
         # Broad on purpose (best-effort).  The unused ``e`` binding and the
         # Python-2-only comma syntax were dropped, as were the stray
         # trailing semicolons.
         if "coming-soon" in chart["origin"]:
             chart["extra"] = "Coming Soon"
             chart["name"] = "By Date"
             chart["display_name"] = chart["name"]
             chart["id"] = slugify(chart["name"] + chart["extra"])

Exemplo n.º 3

0

Exibir arquivo

Arquivo: rdio.py Projeto: bencevans/tomahawk-contrib

    def parseUrl(self, type, region):
        """Fetch the top chart of ``type`` for ``region`` and store it.

        A ``getTopCharts`` request is POSTed to ``self.baseUrl``.  If the
        response status is not the string '200', the status is printed and
        nothing is stored.  Otherwise the chart origin, type, id and geo are
        set and one dict per entry of the JSON ``result`` list is handed to
        ``self.storeChartItem``.

        :param type: chart type; "Artist" entries only carry an artist,
            other types also carry the entry name under ``type.lower()``.
        :param region: region code sent as ``_region`` and used as chart geo.
        """
        payload = urllib.urlencode({
            'method': 'getTopCharts',
            'type': type,
            '_region': region,
        })
        response, contents = self.client.request(self.baseUrl, 'POST', payload)

        if response['status'] != '200':
            print("Error " + response['status'])
            return

        chart_id = slugify("%s %s %s" % (self.chart_name, type, region))
        self.setChartOrigin(self.baseUrl)
        self.setChartType(type)
        self.setChartId(chart_id)
        self.setChartGeo(region)

        jsonContent = self.getJsonFromResponse(contents)

        entries = []
        for position, raw in enumerate(jsonContent['result']):
            if type == "Artist":
                entry = {"artist": raw.pop("name")}
            else:
                entry = {'artist': raw.pop("artist")}
                entry[type.lower()] = raw.pop("name")
            # Rank is the zero-based position in the result list.
            entry["rank"] = position
            entries.append(entry)
        self.storeChartItem(entries)

Exemplo n.º 4

0

Exibir arquivo

Arquivo: chart.py Projeto: alexdavis-bf-toys/tomahawk-contrib

    def __createChartItem(self):
        """Assemble a ``ChartItem`` from the chart state held on ``self``.

        The cache headers come from ``self.cacheControl``.  A missing
        attribute is reported on stdout and the ``AttributeError`` is
        re-raised.  When ``self.have_extra`` is truthy, the optional
        ``geo``, ``genre`` and ``extra`` values are copied onto the item
        if they are not ``None``.

        :returns: the populated ``ChartItem``.
        :raises AttributeError: if a required attribute is absent.
        """
        try:
            fields = dict(
                id=slugify(self.chart_id),
                name=self.chart_name,
                display_name=self.display_name,
                origin=self.origin,
                type=self.chart_type,
                default=self.default,
                source=self.source_id,
                date=self.cacheControl.get("Date-Modified"),
                expires=self.cacheControl.get("Date-Expires"),
                maxage=self.cacheControl.get("Max-Age"),
                list=self.chart_list,
            )
            chart = ChartItem(**fields)
        except AttributeError:
            print("ChartItem is missing required attributes!")
            raise

        if not self.have_extra:
            return chart

        # Optional fields are only written when they carry a value.
        for key in ('geo', 'genre', 'extra'):
            value = getattr(self, key)
            if value is not None:
                chart[key] = value

        return chart

Exemplo n.º 5

0

Exibir arquivo

Arquivo: chart.py Projeto: bencevans/tomahawk-contrib

    def __createChartItem(self):
        """Create the ``ChartItem`` describing the chart stored on ``self``.

        Required values are read from instance attributes and from
        ``self.cacheControl``; if one of them is missing a message is
        printed and the ``AttributeError`` propagates.  With
        ``self.have_extra`` set, non-``None`` ``geo`` / ``genre`` /
        ``extra`` attributes are added to the item as well.

        :returns: the new ``ChartItem``.
        :raises AttributeError: when a required attribute does not exist.
        """
        try:
            cache = self.cacheControl
            chart = ChartItem(
                id=slugify(self.chart_id),
                name=self.chart_name,
                display_name=self.display_name,
                origin=self.origin,
                type=self.chart_type,
                default=self.default,
                source=self.source_id,
                date=cache.get("Date-Modified"),
                expires=cache.get("Date-Expires"),
                maxage=cache.get("Max-Age"),
                list=self.chart_list,
            )
        except AttributeError:
            print("ChartItem is missing required attributes!")
            raise

        if self.have_extra:
            # Copy each optional attribute that actually has a value.
            for optional in ('geo', 'genre', 'extra'):
                current = getattr(self, optional)
                if current is None:
                    continue
                chart[optional] = current

        return chart

Exemplo n.º 6

0

Exibir arquivo

Arquivo: billboard_spider.py Projeto: bencevans/tomahawk-contrib

    def parse_chart(self, response):
        """Prepare a Billboard chart container and request its next page.

        The chart name is read from the ``h1#page-title`` element, the
        content type (Track / Album / Artist) is guessed from that name, and
        the items of the current page are added through
        ``self.parse_items``.  If further page links exist, a request for
        the first one is yielded with ``self.parse_page`` as its callback.

        :param response: the response of the chart's first page.
        """
        hxs = HtmlXPathSelector(response)

        chart_name = hxs.select(
            '//h1[@id="page-title"]/text()').extract()[0].strip()

        # get a list of pages
        next_pages = hxs.select(self.next_page_xpath).extract()
        # remove javascript links and turn it into a queue, also, we want to
        # exclude next chart (!)
        next_pages = deque(
            link for link in next_pages if 'javascript' not in link)

        # Correct the grammar to fit our expectations
        if chart_name == 'Germany Songs':
            chart_name = 'German Tracks'

        chart = ChartItem()
        chart['name'] = chart_name
        chart['display_name'] = chart_name if chart_name else "Top Overall"
        chart['origin'] = response.url
        chart['source'] = 'billboard'
        chart['id'] = slugify(chart_name)
        chart['list'] = []

        chart['date'] = self.cacheControl.get("Date-Modified")
        chart['expires'] = self.cacheControl.get("Date-Expires")
        chart['maxage'] = self.cacheControl.get("Max-Age")

        # lets figure out the content type
        lower_name = chart_name.lower()
        album_markers = ('soundtracks', 'billboard 200', 'tastemakers')
        artist_markers = ('social 50', 'uncharted')
        # The markers are searched for INSIDE the chart name.  The previous
        # test was the other way round (``lower_name in marker``), which
        # classified an empty name as an album chart and missed names that
        # merely contain a marker.
        if 'songs' in lower_name:
            chart['type'] = 'Track'
            typeItem = SingleTrackItem()
        elif 'albums' in lower_name \
                or any(marker in lower_name for marker in album_markers):
            chart['type'] = 'Album'
            typeItem = SingleAlbumItem()
        elif any(marker in lower_name for marker in artist_markers):
            chart['type'] = 'Artist'
            typeItem = SingleArtistItem()
        else:
            chart['type'] = 'Track'
            typeItem = SingleTrackItem()

        if (chart['id'] == settings["BILLBOARD_DEFAULT_ALBUMCHART"]
                or chart['id'] == settings["BILLBOARD_DEFAULT_TRACKCHART"]):
            chart['default'] = 1

        chart = self.parse_items(hxs, chart, typeItem)
        # ok, we've prepped the chart container, lets start getting the pages
        # NOTE(review): when there are no further pages nothing is yielded
        # from here, so the chart built above is not emitted by this
        # method - confirm that this is intended.
        if len(next_pages) > 0:
            next_page = next_pages.popleft()
            request = Request('http://www.billboard.com' + next_page,
                              callback=lambda r: self.parse_page(
                                  r, chart, next_pages, typeItem))
            yield request

Exemplo n.º 7

0

Exibir arquivo

Arquivo: billboard_spider.py Projeto: alexdavis-bf-toys/tomahawk-contrib

    def parse_chart(self, response):
        """Build the chart container for a Billboard page and follow paging.

        Reads the chart name from ``h1#page-title``, infers the content type
        (Track / Album / Artist) from the name, collects the items of this
        page via ``self.parse_items`` and, if more page links are present,
        yields a request for the first of them handled by
        ``self.parse_page``.

        :param response: the response of the chart's first page.
        """
        hxs = HtmlXPathSelector(response)

        chart_name = hxs.select('//h1[@id="page-title"]/text()').extract()[0].strip()

        # get a list of pages
        next_pages = hxs.select(self.next_page_xpath).extract()
        # remove javascript links and turn it into a queue, also, we want to exclude next chart (!)
        next_pages = deque(filter(lambda e: 'javascript' not in e, next_pages))

        # Correct the grammar to fit our expectations
        if chart_name == 'Germany Songs':
            chart_name = 'German Tracks'

        chart = ChartItem()
        chart['name'] = chart_name
        chart['display_name'] = chart_name if chart_name else "Top Overall"
        chart['origin'] = response.url
        chart['source'] = 'billboard'
        chart['id'] = slugify(chart_name)
        chart['list'] = []

        chart['date'] = self.cacheControl.get("Date-Modified")
        chart['expires'] = self.cacheControl.get("Date-Expires")
        chart['maxage'] = self.cacheControl.get("Max-Age")

        # lets figure out the content type
        lower_name = chart_name.lower()
        # Each keyword is looked up inside the chart name.  The original
        # check was inverted (``lower_name in keyword``): it treated an
        # empty name as an album chart and ignored longer names containing
        # a keyword.
        album_keywords = ['soundtracks', 'billboard 200', 'tastemakers']
        artist_keywords = ['social 50', 'uncharted']
        if 'songs' in lower_name:
            chart['type'] = 'Track'
            typeItem = SingleTrackItem()
        elif 'albums' in lower_name \
                or any(keyword in lower_name for keyword in album_keywords):
            chart['type'] = 'Album'
            typeItem = SingleAlbumItem()
        elif any(keyword in lower_name for keyword in artist_keywords):
            chart['type'] = 'Artist'
            typeItem = SingleArtistItem()
        else:
            chart['type'] = 'Track'
            typeItem = SingleTrackItem()

        if (chart['id'] == settings["BILLBOARD_DEFAULT_ALBUMCHART"]
                or chart['id'] == settings["BILLBOARD_DEFAULT_TRACKCHART"]):
            chart['default'] = 1

        chart = self.parse_items(hxs, chart, typeItem)
        # ok, we've prepped the chart container, lets start getting the pages
        # NOTE(review): a chart without further pages is not yielded from
        # this method - confirm that this is intended.
        if len(next_pages) > 0:
            next_page = next_pages.popleft()
            request = Request('http://www.billboard.com' + next_page,
                              callback=lambda r: self.parse_page(r, chart, next_pages, typeItem))
            yield request

Exemplo n.º 8

0

Exibir arquivo

Arquivo: soundcloudwall.py Projeto: bencevans/tomahawk-contrib

    def parseUrl(self):
        """Download ``self.url`` and store at most the first 100 tracks.

        Each JSON entry contributes a dict with ``track``, ``artist``,
        ``rank`` (zero-based position) and ``stream_url``.  Track titles are
        split into artist and track on the first separator, tried in this
        order: " - ", " -", ": ", ":" and an em dash.  If no separator is
        present, the entry's ``username`` is used as the artist.

        An entry whose fields raise ``AttributeError`` while being processed
        is still appended, with whatever keys had been filled in by then.
        """
        print("%s %s" % (self.chart_name, self.url))
        self.setChartId(slugify(self.chart_name))
        self.setChartDisplayName(self.chart_name)
        self.setChartOrigin(self.url)

        # The em dash is a unicode literal: as a plain Python 2 byte string
        # "\u2014" is the six characters backslash-u-2-0-1-4 and could never
        # match a real dash.
        # NOTE(review): assumes titles are unicode strings as produced by a
        # JSON decoder - confirm against getJsonContent.
        separators = (" - ", " -", ": ", ":", u"\u2014")

        chart_list = []
        jsonContent = self.getJsonContent(self.url)

        for rank, items in enumerate(jsonContent):
            # We only take the first 100
            if rank >= 100:
                break
            item = {}
            # Soundcloud metadata is hard
            try:
                title = items.pop("title").strip()
                for separator in separators:
                    try:
                        cut = title.index(separator)
                    except ValueError:
                        # Separator absent (or not comparable): try the next.
                        continue
                    item["artist"] = title[:cut]
                    item["track"] = title[cut + len(separator):]
                    break
                else:
                    # No separator at all: the uploader stands in as artist.
                    item["track"] = title
                    item["artist"] = items.pop("username").strip()

                item["rank"] = rank
                item['stream_url'] = "http://api.soundcloud.com/tracks/" + str(items.pop("id")) + "/stream.json?client_id=%s" % (self.apiKey)
            except AttributeError:
                # Best-effort: keep the partially filled entry.
                pass
            chart_list.append(item)
        # Stores this chart
        self.storeChartItem(chart_list)
Exemplo n.º 9

0

Exibir arquivo

    def create_chart(self, response, name=None, type=None):
        """Return a new album ``ChartItem`` for ``response``.

        The item starts with an empty list, the cache headers from
        ``self.cacheControl`` and ``newrls`` set when the URL contains
        "new-releases" or "coming-soon".  The naming fields (``name``,
        ``display_name``, ``id``, ``extra``) are only filled in when both
        ``name`` and ``type`` are given.

        :param response: response whose ``url`` becomes the chart origin.
        :param name: optional chart name.
        :param type: optional qualifier stored as ``extra`` and appended to
            the name when building the id.
        """
        page_url = response.url
        is_new_release = ("new-releases" in page_url
                          or "coming-soon" in page_url)
        cache = self.cacheControl
        chart = ChartItem(origin=page_url,
                          source=self.source_id,
                          list=[],
                          date=cache.get("Date-Modified"),
                          expires=cache.get("Date-Expires"),
                          maxage=cache.get("Max-Age"),
                          type="Album",
                          newrls=is_new_release)

        if name is None or type is None:
            return chart

        chart["name"] = name
        chart["display_name"] = name
        chart["id"] = slugify(name + type)
        chart["extra"] = type
        return chart

Exemplo n.º 10

0

Exibir arquivo

Arquivo: metacritic_spider.py Projeto: alexdavis-bf-toys/tomahawk-contrib

    def create_chart(self, response, name = None, type = None):
        """Create an album ``ChartItem`` whose origin is ``response.url``.

        ``newrls`` is true for URLs containing "new-releases" or
        "coming-soon".  ``name``, ``display_name``, ``id`` and ``extra`` are
        set only if neither ``name`` nor ``type`` is ``None``; the id is the
        slug of ``name + type``.

        :param response: response object providing ``url``.
        :param name: optional chart name.
        :param type: optional qualifier, stored as ``extra``.
        """
        url = response.url
        newrls = "new-releases" in url or "coming-soon" in url
        base_fields = dict(
            origin=url,
            source=self.source_id,
            list=[],
            date=self.cacheControl.get("Date-Modified"),
            expires=self.cacheControl.get("Date-Expires"),
            maxage=self.cacheControl.get("Max-Age"),
            type="Album",
            newrls=newrls,
        )
        chart = ChartItem(**base_fields)

        named = name is not None and type is not None
        if named:
            chart["name"] = name
            chart["display_name"] = name
            chart["id"] = slugify(name + type)
            chart["extra"] = type
        return chart

Exemplo n.º 11

0

Exibir arquivo

Arquivo: rcharts.py Projeto: alexdavis-bf-toys/tomahawk-contrib

    def parse(self):
        """Fetch and store one chart per entry of ``self.sections``.

        For each section the JSON document ``<baseUrl>/r/<section>.json`` is
        loaded, the chart name / display name / id are derived from the
        capitalised section name, and every element of the document's
        ``tracks`` list is stored with its zero-based rank, artist and
        title.
        """
        for section in self.sections:
            section_url = '{baseUrl}/r/{section}.json'.format(
                baseUrl=self.baseUrl, section=section)
            response = self.getJsonContent(section_url)

            self.setChartOrigin(self.baseUrl)
            self.setChartName(section.capitalize())
            self.setChartDisplayName(self.chart_name)
            self.setChartId(slugify(self.chart_name))

            result_list = [
                {
                    'rank': position,
                    'artist': track['artist'],
                    'track': track['title'],
                }
                for position, track in enumerate(response[u'tracks'])
            ]

            self.storeChartItem(result_list)

Exemplo n.º 12

0

Exibir arquivo

    def parse(self):
        """Store a chart for every section listed in ``self.sections``.

        Each section's JSON is read from ``<baseUrl>/r/<section>.json``.
        The capitalised section name becomes the chart name, which in turn
        provides the display name and (slugified) the id.  The ``tracks``
        entries are converted to ``rank`` / ``artist`` / ``track`` dicts,
        with ``rank`` being the zero-based position.
        """
        url_template = '{baseUrl}/r/{section}.json'
        for section in self.sections:
            response = self.getJsonContent(
                url_template.format(baseUrl=self.baseUrl, section=section))

            self.setChartOrigin(self.baseUrl)
            self.setChartName(section.capitalize())
            self.setChartDisplayName(self.chart_name)
            self.setChartId(slugify(self.chart_name))

            entries = []
            for index, track in enumerate(response[u'tracks']):
                entries.append(dict(rank=index,
                                    artist=track['artist'],
                                    track=track['title']))

            self.storeChartItem(entries)

Exemplo n.º 13

0

Exibir arquivo

Arquivo: billboard_spider.py Projeto: Ramblurr/tomahawk-contrib

    def parse_chart(self, response):
        """Prepare a Billboard chart container and request its first page.

        The chart name and the chart-type label are read from the page, the
        content type (Album or Track) is derived from them, and a request
        for the first non-javascript page link is yielded with
        ``self.parse_page`` as its callback.

        :param response: the response of the chart overview page.
        """
        hxs = HtmlXPathSelector(response)

        # get a list of pages
        next_pages = hxs.select(self.next_page_xpath).extract()
        # remove javascript links and turn it into a queue
        next_pages = deque(
            link for link in next_pages if 'javascript' not in link)

        chart_name = hxs.select('//*[@class="printable-chart-header"]/h1/b/text()').extract()[0].strip()
        chart_type = hxs.select('//*[@id="chart-list"]/div[@id="chart-type-fb"]/text()').extract()[0].strip()

        chart = ChartItem()
        chart['name'] = chart_name
        chart['origin'] = response.url
        chart['source'] = 'billboard'
        chart['id'] = slugify(chart_name)
        chart['list'] = []

        # lets figure out the content type
        lower_name = chart_name.lower()
        if chart_type == 'Albums':
            chart['type'] = 'Album'
        elif chart_type == 'Singles':
            chart['type'] = 'Track'
        elif 'albums' in lower_name or 'soundtrack' in lower_name:
            chart['type'] = 'Album'
        else:
            chart['type'] = 'Track'

        # ok, we've prepped the chart container, lets start getting the pages
        if not next_pages:
            # Without any page link popleft() would raise IndexError; there
            # is simply nothing to request for this chart.
            return

        next_page = next_pages.popleft()

        request = Request('http://www.billboard.com' + next_page,
                          callback=lambda r: self.parse_page(r, chart, next_pages))

        yield request

Exemplo n.º 14

0

Exibir arquivo

Arquivo: exfm.py Projeto: alexdavis-bf-toys/tomahawk-contrib

    def parseUrl(self, url, extra = None):
        """Load the song list at ``url`` and store it as a chart.

        The chart name is "<ExfmType> <ChartType>s", with ``extra`` appended
        when given; the display name is the title-cased ``extra`` or, if
        there is none, the title-cased ``self.exfmType``.  Every entry of
        the JSON ``songs`` list yields a dict with ``artist``, ``track`` and
        zero-based ``rank``.  An entry that raises ``AttributeError`` is
        still appended with the keys filled in so far.

        :param url: address of the JSON document; also the chart origin.
        :param extra: optional qualifier for the chart name.
        """
        type_title = self.exfmType.title()
        self.setChartName("%s %ss" % (type_title, self.chart_type.title()))
        if extra:
            self.setChartDisplayName(extra.title())
        else:
            self.setChartDisplayName(type_title)
        self.setChartOrigin(url)

        if extra:
            self.setChartName("%s %s" % (self.chart_name, extra))
        self.setChartId(slugify(self.chart_name))

        jsonContent = self.getJsonContent(url)

        chart_list = []
        for position, song in enumerate(jsonContent['songs']):
            entry = {}
            try:
                entry["artist"] = song.pop("artist").strip()
                entry["track"] = song.pop("title").strip()
                entry["rank"] = position
            except AttributeError:
                # Best-effort: a malformed song keeps its partial entry.
                pass
            chart_list.append(entry)
        self.storeChartItem(chart_list)

Exemplo n.º 15

0

Exibir arquivo

Arquivo: exfm.py Projeto: bencevans/tomahawk-contrib

    def parseUrl(self, url, extra=None):
        """Fetch the songs at ``url`` and store them as one chart.

        Chart name: "<ExfmType> <ChartType>s", followed by ``extra`` if one
        is supplied.  Display name: ``extra`` in title case, otherwise
        ``self.exfmType`` in title case.  Each element of the JSON ``songs``
        list becomes an ``artist`` / ``track`` / ``rank`` dict (rank is the
        zero-based index); an element raising ``AttributeError`` is appended
        with only the keys set before the error.

        :param url: address of the JSON document; also the chart origin.
        :param extra: optional qualifier for the chart name.
        """
        exfm_title = self.exfmType.title()
        base_name = "%s %ss" % (exfm_title, self.chart_type.title())
        self.setChartName(base_name)
        display_name = extra.title() if extra else exfm_title
        self.setChartDisplayName(display_name)
        self.setChartOrigin(url)

        if extra:
            self.setChartName("%s %s" % (self.chart_name, extra))
        self.setChartId(slugify(self.chart_name))

        songs = self.getJsonContent(url)['songs']

        collected = []
        for index, song in enumerate(songs):
            row = {}
            try:
                row["artist"] = song.pop("artist").strip()
                row["track"] = song.pop("title").strip()
                row["rank"] = index
            except AttributeError:
                # Keep going; the partially filled row is still stored.
                pass
            collected.append(row)
        self.storeChartItem(collected)

Exemplo n.º 16

0

Exibir arquivo

Arquivo: rovi.py Projeto: alexdavis-bf-toys/tomahawk-contrib

    def parse_albums(self, name, albums, isEditorial ):
        """Convert an album list into a stored album chart.

        Each element of ``albums`` is expected to hold an ``album`` mapping
        with ``title``, ``primaryArtists``, ``originalReleaseDate`` and
        ``rating``; elements that fail to parse are skipped.  ``roviLink``
        markup is stripped from the headline review text when a review is
        present.  Albums with a release date are sorted by date; albums
        without one are kept in input order.  Both groups are cut down to
        their last ``self.maxAlbums`` entries and stored with the undated
        albums first.

        :param name: chart name and display name.
        :param albums: iterable of album wrappers, or ``None`` on failure.
        :param isEditorial: selects the chart id format and the chart extra.
        """
        if albums is None:
            # something went wrong
            return

        self.setChartName(name)
        self.setChartDisplayName(name)
        self.setChartType("Album")
        # NOTE(review): the "editorial" id prefix is applied when isEditorial
        # is NOT True, which looks inverted.  Left unchanged because existing
        # chart ids may be relied upon - confirm.
        if isEditorial is True:
            chart_id = "%s%s" % (self.source_id, name)
        else:
            chart_id = "%seditorial %s" % (self.source_id, name)
        self.setChartId(slugify(chart_id))
        self.setChartExtra("Editorial Choices" if isEditorial else None)

        chart_list = []
        nullList = []
        for album in albums:
            try:
                album = album['album']
                title = album['title']
                artist = " ".join(
                    [credit['name'] for credit in album['primaryArtists']])
                try:
                    review = album['headlineReview']
                    try:
                        review['text'] = re.sub(r'((\[roviLink=.+])(.*?)(\[/roviLink]))', r'\3', review['text'])
                    except Exception as e:
                        print(e)
                except Exception:
                    review = None

                entry = {
                    'album': title,
                    'artist': artist,
                    'date': album['originalReleaseDate'],
                    'rating': album['rating'],
                    'review': review,
                }
            except Exception:
                # Skip malformed albums.  A bare ``except:`` would also have
                # swallowed KeyboardInterrupt and SystemExit.
                continue

            # instead of filter out by releasedate, we search the api by releaseyear
            # the result seems to be more appealing
            # Note: some albums have Null releaseDate, this doesnt necessarily mean
            # that the release date isnt within our range. We include some of them as well
            if entry['date'] is not None:
                chart_list.append(entry)
            else:
                nullList.append(entry)

        if len(nullList) > self.maxAlbums:
            print("Slicing NUllList from %s to %s" %(len(nullList), self.maxAlbums))
            nullList = nullList[-self.maxAlbums:]

        chart_list = sorted(chart_list, key=itemgetter('date'))
        if len(chart_list) > self.maxAlbums:
            print("Slicing list from %s to %s" %(len(chart_list), self.maxAlbums))
            chart_list = chart_list[-self.maxAlbums:]

        self.storeChartItem(nullList + chart_list)

Exemplo n.º 17

0

Exibir arquivo

Arquivo: chart.py Projeto: alexdavis-bf-toys/tomahawk-contrib

 def __message(self, ok = False):
     """Print a one-line status report for the current chart.

     The line starts with ``self.__outputMsgOk`` when ``ok`` is truthy and
     with ``self.__outputMsgError`` otherwise, followed by the source id,
     chart type, slugified chart id and display name.
     """
     if ok:
         prefix = self.__outputMsgOk
     else:
         prefix = self.__outputMsgError
     details = (prefix, self.source_id, self.chart_type,
                slugify(self.chart_id), self.display_name)
     print("%s %s - %s (%s) : %s" % details)

Exemplo n.º 18

0

Exibir arquivo

Arquivo: hnhh_spider.py Projeto: alexdavis-bf-toys/tomahawk-contrib

    def parse(self, response):
        """Scrape one "Top 100" page into a ``ChartItem``.

        The chart type label is read from the active tab (right tab first,
        left tab as fallback).  The URL decides the ``extra`` label
        ("Upcoming", "Mainstream", "Alltime") and whether the page is a
        mixtape (Album) or song (Track) chart.  Every result cell becomes
        one ranked entry, with ranks starting at 1.

        :param response: the page response.
        :returns: the populated ``ChartItem``, or ``None`` when the URL is
            neither a mixtape nor a song page.
        """
        hxs = HtmlXPathSelector(response)
        chart_name = "Top 100"
        try:
            chart_type = hxs.select('//*[@class="tab-right-active"]/text()').extract()[0].strip()
        except IndexError:
            chart_type = hxs.select('//*[@class="tab-left-active"]/text()').extract()[0].strip()

        # Default so that a URL without "upcoming"/"mainstream" no longer
        # raises UnboundLocalError further down.
        extra = ""
        if "upcoming" in response.url:
            extra = "Upcoming"
        if "mainstream" in response.url:
            extra = "Mainstream"
        if "alltime" in response.url:
            if extra:
                chart_name += " " + extra
            extra = "Alltime"

        chart_id = chart_name + extra + chart_type
        chart = ChartItem()
        chart['name'] = chart_name + " " + chart_type
        chart['display_name'] = chart["name"] if chart["name"] else "Top Overall"
        chart['origin'] = response.url
        chart['source'] = 'hotnewhiphop'
        chart['id'] = slugify(chart_id)
        chart['list'] = []
        chart['extra'] = extra

        expires = chartCache.timedeltaUntilDays(1)
        cacheControl = chartCache.setCacheControl(expires)
        chart['date'] = cacheControl.get("Date-Modified")
        chart['expires'] = cacheControl.get("Date-Expires")
        chart['maxage'] = cacheControl.get("Max-Age")

        if "mixtape" in response.url:
            if extra == "Upcoming":
                chart['default'] = 1
            chart['type'] = "Album"
            item_class = SingleUrlAlbumItem
            urlKey = "url"
            url = "http://www.hotnewhiphop.com/ajax/api/getMixtape/"
        elif "song" in response.url:
            chart['type'] = "Track"
            item_class = SingleUrlTrackItem
            # Later on, if we have a hnhh resolver, this url could be used to
            # get a valid mp3 stream.
            url = "hnhh://www.hotnewhiphop.com/ajax/api/getSong/"
            urlKey = "stream_url"
        else:
            log.msg("Error with %s" %(chart['name']))
            return

        title_field = chart['type'].lower()
        chart_list = []
        for rank, item in enumerate(hxs.select('//div[@class="newCell newCell2"]'), 1):
            # A fresh item per cell, of the class chosen above.
            loader = XPathItemLoader(item_class(), selector=item)
            loader.add_xpath(title_field, 'div[@class="centerBlock"]/h3/a/text()')
            loader.add_xpath('artist', 'div[@class="centerBlock"]/a/i/text()')
            loader.add_xpath(urlKey, 'div[@class="centerBlock"]/a/@href')
            single = loader.load_item()
            # Second dot-separated piece of the link path, appended to the
            # API prefix.
            single[urlKey] = url + urlparse(single[urlKey]).path.split(".")[1]
            single['rank'] = rank
            chart_list.append(dict(single))

        log.msg("Done with %s" %(chart['name']))
        chart['list'] += chart_list
        return chart

Exemplo n.º 19

0

Exibir arquivo

Arquivo: chart.py Projeto: gordielachance/tomahawk-contrib

 def storeChartItem(self, chart_list):
     """Remember ``chart_list`` and write the resulting chart to the cache.

     A "Saving chart" line is printed first; then the chart item is
     created, its metadata derived, and both are passed to
     ``self.__updateCache``.

     :param chart_list: the chart entries to store.
     """
     summary = (self.source_id, self.chart_type,
                slugify(self.chart_id), self.display_name)
     print("Saving chart: %s - %s (%s) : %s" % summary)
     self.chart_list = chart_list
     chart = self.__createChartItem()
     metadata = self.__createMetadata(chart)
     self.__updateCache(metadata, chart)

Exemplo n.º 20

0

Exibir arquivo

Arquivo: djshop_spider.py Projeto: bencevans/tomahawk-contrib

    def parse(self, response):
        """Scrape a djshop.de chart page and yield it as an album chart.

        The page title determines the chart's ``name`` and ``extra``: first
        through the "MP3 Downloads ... Charts" pattern and, if that fails,
        by looking for each ``unpretty`` label of ``self.chartTypes`` in the
        title.  If no name could be derived nothing is yielded.  Otherwise
        every ``div.column1`` element becomes one entry of the chart list.

        :param response: the page response.
        """
        log.msg("Parsing: %s" % (response.url), loglevel=log.INFO)
        hxs = HtmlXPathSelector(response)
        chart = ChartItem()
        title = hxs.select("//title/text()").extract()[0].strip()
        # Raw string: same pattern, without relying on "\s" being an
        # unrecognised string escape.
        test = re.compile(r'^(MP3 Downloads(\sCharts|\s))(.*?)(\sCharts)',
                          re.IGNORECASE)

        try:
            cTitle = test.match(title).group(3)
            if cTitle is not None:
                extra = self.chartTypes[2]["pretty"] + " "
                if "vinyl" in response.url.lower():
                    extra += self.chartTypes[1]["pretty"]
                else:
                    extra += self.chartTypes[0]["pretty"]
                chart["extra"] = extra
                chart["name"] = cTitle.replace(self.chartTypes[2]["pretty"],
                                               "")

        except Exception:
            # Title did not match the pattern (or the lookup failed): fall
            # back to scanning for the known labels.  Later matches
            # overwrite earlier ones.
            for chart_type in self.chartTypes:
                if chart_type["unpretty"] in title:
                    chart["extra"] = chart_type["pretty"]
                    cTitle = title.replace(chart_type["unpretty"], "")
                    if len(cTitle) == 0:
                        chart["name"] = response.url.split('/')[-1].replace(
                            ".html", "").title().replace("-", " ")
                    else:
                        chart["name"] = cTitle
                    if "Top 100" in chart["extra"]:
                        chart["extra"] += " "
                        if "vinyl" in response.url.lower():
                            chart["extra"] += self.chartTypes[1]["pretty"]
                        else:
                            chart["extra"] += self.chartTypes[0]["pretty"]
                        chart["name"] = chart["name"].replace("Charts", "")

        if "name" in chart:
            chart["name"] = chart["name"].rstrip("-").strip()
            chart['display_name'] = chart["name"] if chart[
                "name"] else "Top Overall"
            chart['origin'] = response.url
            chart['source'] = 'djshop.de'
            chart['id'] = slugify(chart["extra"] + chart["name"])
            chart["type"] = "Album"
            chart['date'] = self.cacheControl.get("Date-Modified")
            chart['expires'] = self.cacheControl.get("Date-Expires")
            chart['maxage'] = self.cacheControl.get("Max-Age")
            chart['list'] = []
            # This could be transformed into a track chart.
            # However, there are so many various-artist releases and
            # compilations, and I don't think Tomahawk would parse them
            # well.  Also, it's actually a Vinyl chart, so there's no
            # "track" ranking involved.

            cols = hxs.select('//div[@class="column1"]')
            chart_list = []
            for index, col in enumerate(cols):
                # A fresh item per row: sharing one item between all loaders
                # lets a field missing in one row keep the previous row's
                # value.
                loader = XPathItemLoader(SingleAlbumItem(), selector=col)
                # NOTE(review): the rank is passed as an XPath expression
                # rather than as a plain value - confirm this yields the
                # intended rank.
                loader.add_xpath('rank', str(index + 1))
                loader.add_xpath('artist', "h2/a/text()")
                loader.add_xpath('album', "h3/text()")
                single = loader.load_item()
                chart_list.append(dict(single))

            chart['list'] += chart_list
            yield chart

Exemplo n.º 21

0

Exibir arquivo

Arquivo: chart.py Projeto: bencevans/tomahawk-contrib

 def __message(self, ok=False):
     """Print the status line for the chart currently held on ``self``.

     ``ok`` selects the leading text: ``self.__outputMsgOk`` when truthy,
     ``self.__outputMsgError`` otherwise.  The rest of the line shows the
     source id, chart type, slugified chart id and display name.
     """
     status = self.__outputMsgOk if ok else self.__outputMsgError
     chart_slug = slugify(self.chart_id)
     line = "%s %s - %s (%s) : %s" % (
         status, self.source_id, self.chart_type, chart_slug,
         self.display_name)
     print(line)

Exemplo n.º 22

0

Exibir arquivo

Arquivo: hnhh_spider.py Projeto: bencevans/tomahawk-contrib

    def parse(self, response):
        """Turn one "Top 100" page into a ``ChartItem``.

        The active tab (right one first, left one as fallback) supplies the
        chart type label.  Keywords in the URL select the ``extra`` label
        ("Upcoming", "Mainstream", "Alltime") and decide between a mixtape
        (Album) and a song (Track) chart.  Each result cell is loaded into
        one entry; ranks start at 1.

        :param response: the page response.
        :returns: the populated ``ChartItem``, or ``None`` when the URL is
            neither a mixtape nor a song page.
        """
        hxs = HtmlXPathSelector(response)
        chart_name = "Top 100"
        try:
            chart_type = hxs.select(
                '//*[@class="tab-right-active"]/text()').extract()[0].strip()
        except IndexError:
            chart_type = hxs.select(
                '//*[@class="tab-left-active"]/text()').extract()[0].strip()

        # Start from an empty label: previously a URL containing neither
        # "upcoming" nor "mainstream" left ``extra`` unbound and raised
        # UnboundLocalError below.
        extra = ""
        if "upcoming" in response.url:
            extra = "Upcoming"
        if "mainstream" in response.url:
            extra = "Mainstream"
        if "alltime" in response.url:
            if extra:
                chart_name += " " + extra
            extra = "Alltime"

        chart_id = chart_name + extra + chart_type
        chart = ChartItem()
        chart['name'] = chart_name + " " + chart_type
        chart[
            'display_name'] = chart["name"] if chart["name"] else "Top Overall"
        chart['origin'] = response.url
        chart['source'] = 'hotnewhiphop'
        chart['id'] = slugify(chart_id)
        chart['list'] = []
        chart['extra'] = extra

        expires = chartCache.timedeltaUntilDays(1)
        cacheControl = chartCache.setCacheControl(expires)
        chart['date'] = cacheControl.get("Date-Modified")
        chart['expires'] = cacheControl.get("Date-Expires")
        chart['maxage'] = cacheControl.get("Max-Age")

        if "mixtape" in response.url:
            if extra == "Upcoming":
                chart['default'] = 1
            chart['type'] = "Album"
            item_class = SingleUrlAlbumItem
            urlKey = "url"
            url = "http://www.hotnewhiphop.com/ajax/api/getMixtape/"
        elif "song" in response.url:
            chart['type'] = "Track"
            item_class = SingleUrlTrackItem
            # Later on, if we have a hnhh resolver, this url could be used to
            # get a valid mp3 stream.
            url = "hnhh://www.hotnewhiphop.com/ajax/api/getSong/"
            urlKey = "stream_url"
        else:
            log.msg("Error with %s" % (chart['name']))
            return

        title_field = chart['type'].lower()
        cells = hxs.select('//div[@class="newCell newCell2"]')
        chart_list = []
        for rank, item in enumerate(cells, 1):
            # One new item per cell, of the class selected above.
            loader = XPathItemLoader(item_class(), selector=item)
            loader.add_xpath(title_field,
                             'div[@class="centerBlock"]/h3/a/text()')
            loader.add_xpath('artist', 'div[@class="centerBlock"]/a/i/text()')
            loader.add_xpath(urlKey, 'div[@class="centerBlock"]/a/@href')
            single = loader.load_item()
            # Second dot-separated piece of the link path, appended to the
            # API prefix.
            single[urlKey] = url + urlparse(single[urlKey]).path.split(".")[1]
            single['rank'] = rank
            chart_list.append(dict(single))

        log.msg("Done with %s" % (chart['name']))
        chart['list'] += chart_list
        return chart

Exemplo n.º 23

0

Exibir arquivo

Arquivo: rovi.py Projeto: bencevans/tomahawk-contrib

    def parse_albums(self, name, albums, isEditorial):
        """Build an album chart from ``albums`` and store it.

        Each element of ``albums`` is read as a mapping with an ``'album'``
        key whose value provides ``title``, ``primaryArtists``,
        ``originalReleaseDate``, ``rating`` and, optionally,
        ``headlineReview``.  Entries that cannot be read are skipped.

        Albums with a release date are sorted by that date; albums without
        one keep their input order and are placed in front of the dated ones.
        Each group is trimmed to its last ``self.maxAlbums`` entries before
        the combined list is handed to ``self.storeChartItem``.

        :param name: chart name, also used as the display name.
        :param albums: iterable of album entries; ``None`` means something
            went wrong upstream and nothing is stored.
        :param isEditorial: flag selecting the chart id format and whether
            the chart extra is set to "Editorial Choices".
        """
        if albums is None:
            # something went wrong
            return

        self.setChartName(name)
        self.setChartDisplayName(name)
        self.setChartType("Album")

        # NOTE(review): the plain "<source_id><name>" id is used when
        # isEditorial is exactly True and the "editorial" id otherwise, which
        # looks inverted -- kept as is so already published chart ids do not
        # change. Confirm the intent before flipping it.
        if isEditorial is True:
            chart_id = "%s%s" % (self.source_id, name)
        else:
            chart_id = "%seditorial %s" % (self.source_id, name)
        self.setChartId(slugify(chart_id))
        self.setChartExtra("Editorial Choices" if isEditorial else None)

        # Replaces [roviLink=...]text[/roviLink] markup by its inner text.
        # The attribute part must not run past the first closing bracket,
        # otherwise a review holding several links is collapsed into the
        # text of the last one.
        link_markup = re.compile(r'\[roviLink=[^\]]+\](.*?)\[/roviLink\]')

        dated = []
        undated = []
        for entry in albums:
            try:
                album = entry['album']
                title = album['title']
                artist = " ".join(
                    [credit['name'] for credit in album['primaryArtists']])
                try:
                    review = album['headlineReview']
                    try:
                        review['text'] = link_markup.sub(r'\1',
                                                         review['text'])
                    except Exception as e:
                        # Review without usable text: keep it untouched.
                        print(e)
                except Exception:
                    review = None

                release_date = album['originalReleaseDate']
                rating = album['rating']
            except Exception:
                # Best effort: a malformed entry must not abort the chart.
                continue

            item = {
                'album': title,
                'artist': artist,
                'date': release_date,
                'rating': rating,
                'review': review
            }
            # instead of filter out by releasedate, we search the api by releaseyear
            # the result seems to be more appealing
            # Note: some albums have Null releaseDate, this doesnt necessarily mean
            # that the release date isnt within our range. We include some of them as well
            if release_date is not None:
                dated.append(item)
            else:
                undated.append(item)

        if len(undated) > self.maxAlbums:
            print("Slicing NUllList from %s to %s" %
                  (len(undated), self.maxAlbums))
            # Explicit start index: a negative-zero slice would keep all.
            undated = undated[len(undated) - self.maxAlbums:]

        dated = sorted(dated, key=itemgetter('date'))
        if len(dated) > self.maxAlbums:
            print("Slicing list from %s to %s" %
                  (len(dated), self.maxAlbums))
            dated = dated[len(dated) - self.maxAlbums:]

        self.storeChartItem(undated + dated)

Exemplo n.º 24

0

Exibir arquivo

Arquivo: djshop_spider.py Projeto: alexdavis-bf-toys/tomahawk-contrib

    def parse(self, response):
        """Parse a djshop.de chart page and yield one album ``ChartItem``.

        The chart name and its "extra" label are derived from the page
        ``<title>``: first through a "MP3 Downloads ... Charts" pattern and,
        when that fails, by looking for one of the ``unpretty`` markers of
        ``self.chartTypes`` in the title.  Nothing is yielded when no chart
        name could be determined.

        :param response: the response of the chart page request.
        """
        log.msg("Parsing: %s"%(response.url), loglevel=log.INFO)
        hxs = HtmlXPathSelector(response)
        chart = ChartItem()
        title = hxs.select("//title/text()").extract()[0].strip()
        # Raw string so that \s is a regex class, not a string escape.
        title_pattern = re.compile(
            r'^(MP3 Downloads(\sCharts|\s))(.*?)(\sCharts)', re.IGNORECASE)
        is_vinyl = "vinyl" in response.url.lower()

        try:
            # match() returns None for a non-matching title; the resulting
            # AttributeError sends us to the fallback below.
            cTitle = title_pattern.match(title).group(3)
            if cTitle is not None:
                label = self.chartTypes[2]["pretty"] + " "
                label += self.chartTypes[1 if is_vinyl else 0]["pretty"]
                chart["extra"] = label
                chart["name"] = cTitle.replace(
                    self.chartTypes[2]["pretty"], "")

        except Exception:
            # Fallback: every chart type whose marker occurs in the title is
            # applied in turn, so the last matching one wins.
            for chart_type in self.chartTypes:
                if chart_type["unpretty"] not in title:
                    continue
                chart["extra"] = chart_type["pretty"]
                cTitle = title.replace(chart_type["unpretty"], "")
                if len(cTitle) == 0:
                    # Nothing left of the title: derive the name from the
                    # last url segment instead.
                    page = response.url.split('/')[-1]
                    chart["name"] = page.replace(
                        ".html", "").title().replace("-", " ")
                else:
                    chart["name"] = cTitle
                if "Top 100" in chart["extra"]:
                    chart["extra"] += " "
                    chart["extra"] += self.chartTypes[
                        1 if is_vinyl else 0]["pretty"]
                    chart["name"] = chart["name"].replace("Charts", "")

        if "name" not in chart:
            return

        chart["name"] = chart["name"].rstrip("-").strip()
        chart['display_name'] = chart["name"] if chart["name"] else "Top Overall"
        chart['origin'] = response.url
        chart['source'] = 'djshop.de'
        chart['id'] = slugify(chart["extra"] + chart["name"])
        chart["type"] = "Album"
        chart['date'] = self.cacheControl.get("Date-Modified")
        chart['expires'] = self.cacheControl.get("Date-Expires")
        chart['maxage'] = self.cacheControl.get("Max-Age")

        # This could be transformed into a track chart.
        # However, there are so many various-artists releases and
        # compilations that Tomahawk would probably not parse them well.
        # Also, it is actually a vinyl chart, so there is no "track"
        # ranking involved.
        chart_list = []
        for index, col in enumerate(hxs.select('//div[@class="column1"]')):
            # A fresh item per row: a shared instance would carry field
            # values over from one row's loader to the next.
            loader = XPathItemLoader(SingleAlbumItem(), selector=col)
            # NOTE(review): the rank is passed as an XPath numeric literal;
            # add_value() looks like the intended call -- confirm the
            # extracted format before changing it.
            loader.add_xpath('rank', str(index+1))
            loader.add_xpath('artist', "h2/a/text()")
            loader.add_xpath('album', "h3/text()")
            chart_list.append(dict(loader.load_item()))

        chart['list'] = chart_list
        yield chart