Пример #1
0
def update_playlist(playlist, logger=None):
    """
    Refresh the songs of a playlist with release data looked up on Discogs.

    Every song yielded by ``read_playlist(playlist)`` is handled as follows:

    * it is skipped when the history reports its location as recently seen;
    * it is skipped (and not recorded in the history) when reading the
      artist or title raises ``AttributeError``;
    * otherwise the first matching release is requested and, when one is
      found, the song is updated and saved.

    The location of every song that reaches the lookup is stored in the
    history, whether or not a release was found.

    playlist -- passed unchanged to ``read_playlist``
    logger   -- optional logger handed to the ``Discogs`` client
    """
    history = History('history.p')
    discogs = Discogs(logger=logger)

    for track in read_playlist(playlist):
        if hist_seen(history, track):
            continue

        try:
            artist, title = track.meta.tag.artist, track.meta.tag.title
        except AttributeError:
            # No usable tag information on this song; nothing to look up.
            continue

        found = discogs.get_first_release(artist, title)
        if found:
            track.update_info(build_update(title, found))
            track.save()

        # Recorded even when no release was found.
        history.store(track.loc)


def hist_seen(history, track):
    """Return the history's verdict on whether ``track.loc`` is recent."""
    return history.check_recent(track.loc)
Пример #2
0
class Gears(object):
    """Gears for scrapers"""
    def __init__(self, logger=None, hist_file='history.p'):
        """
        Set up the logger and the request history.

        logger    -- external logger to use; when falsy, the logger named
                     'temp.log' is fetched from the logging module instead
        hist_file -- passed to ``History`` as its first argument
        """
        self.logger = logger if logger else logging.getLogger('temp.log')
        # The history shares whichever logger was chosen above.
        self.history = History(hist_file, logger=self.logger)

    def get(self, url, referer=None, agent=None, delay=True, check_hist=True):
        """
        Issue a GET request for ``url`` and return the response.

        url        -- address to request
        referer    -- value sent in the 'referer' header
        agent      -- value for the 'User-Agent' header; when falsy,
                      ``ragent()`` supplies one
        delay      -- when truthy, ``rdelay()`` is called before the request
        check_hist -- when truthy, the request is skipped unless
                      ``self.history.check_recent(url)`` is exactly ``False``

        Returns the response object (also for non-200 status codes, which
        are only logged), or ``None`` when the history check skipped the
        request or a ``ConnectionError`` occurred.
        """
        if not agent:
            agent = ragent()
        if delay:
            rdelay()
        if check_hist and self.history.check_recent(url) is not False:
            return None

        request_headers = {'User-Agent': agent, 'referer': referer}
        self.logger.debug(
            'Making request to %s\nwith headers:%s', url, request_headers
        )

        try:
            reply = requests.get(url, headers=request_headers)
        except requests.exceptions.ConnectionError:
            self.logger.error('ConnectionError', exc_info=True)
            return None

        if reply.status_code != 200:  # pragma: no cover
            # Log the failure but still hand the response to the caller.
            self.logger.error(
                'Request != 200: status_code = %s', reply.status_code
            )
            self.logger.error(reply.text)
        return reply

    def parse_page(self, url, xpath=None, text=None, suffix='">',
                   referer=None, delay=True, check_hist=True):
        """
        Fetch a page and extract data from it by XPath or by string search.

        url        -- page to fetch through ``self.get``
        xpath      -- XPath expression; takes precedence over ``text``
        text       -- prefix handed to ``self.find_string`` when no xpath
                      is given
        suffix     -- suffix handed to ``self.find_string``
        referer, delay, check_hist -- forwarded to ``self.get``

        Returns an empty list when the response is falsy or the XPath query
        yields nothing truthy, the XPath result when it is truthy, the
        result of ``self.find_string`` in text mode, and ``None`` when
        neither ``xpath`` nor ``text`` is given.
        """
        self.logger.debug('Parsing page %s', url)
        page = self.get(
            url, referer=referer, delay=delay, check_hist=check_hist
        )
        if not page:
            return []

        if xpath:
            self.logger.debug('with xpath=%s', xpath)
            matches = html.fromstring(page.text).xpath(xpath)
            if not matches:  # pragma: no cover
                self.logger.debug('Found nothing')
                return []
            else:  # pragma: no cover
                self.logger.debug('Found %d elements', len(matches))
                return matches

        if text:  # pragma: no cover
            return self.find_string(page.text, text, suffix)

        # Neither extraction mode was requested.
        return None

    def find_string(self, raw_text, prefix, suffix='">'):
        """Finds a string from raw HTML text"""
        self.logger.debug('Finding string between %s\nand\n%s', prefix, suffix)
        try:
            idx = raw_text.index(prefix)+len(prefix)
            found = raw_text[idx:].split(suffix)[0]
        except (ValueError, IndexError, AttributeError), err:
            self.logger.error(
                'String not found due to error: %s', err, exc_info=True
                )
            self.logger.debug('Raw text: %s', raw_text)
            return None
        else: