def fetch(self, url, etag=None, modified=None, autodiscovery=True, **kwargs):
    """Fetch the feed at *url* and hand it to parse_feed().

    Extra keyword arguments are passed through to parse_feed() so that
    Fetcher subclasses can receive additional data.

    Returns a Result for NOT_MODIFIED / NEW_LOCATION outcomes, otherwise
    whatever parse_feed() returns for an updated feed.
    """
    # handle local file first: strip the scheme and parse the file directly
    if url.startswith('file://'):
        url = url[len('file://'):]
        stream = open(url)
        return self.parse_feed(url, stream, {}, UPDATED_FEED, **kwargs)

    # remote feed: send conditional-GET validators when we have them
    headers = {}
    if modified is not None:
        headers['If-Modified-Since'] = modified
    if etag is not None:
        headers['If-None-Match'] = etag

    stream = util.urlopen(url, headers)

    # Report a permanent redirect anywhere in the redirect chain as a
    # new feed location so the caller can update its stored URL.
    responses = stream.history + [stream]
    for i, resp in enumerate(responses):
        if resp.is_permanent_redirect:
            # there should always be a next response when a redirect is encountered
            # If max redirects is reached, TooManyRedirects is raised
            # TODO: since we've got the end contents anyway, modify model.py to accept contents on NEW_LOCATION
            return Result(NEW_LOCATION, responses[i + 1].url)

    res = self._check_statuscode(stream.status_code, stream.url)
    if res == NOT_MODIFIED:
        return Result(NOT_MODIFIED, stream.url)

    if autodiscovery and stream.headers.get('content-type', '').startswith('text/html'):
        # The server sent HTML instead of a feed: try to autodiscover
        # the actual feed URL from <link> tags in the page.
        ad = FeedAutodiscovery(url)
        # response_text() will assume utf-8 if no charset specified
        ad.feed(util.response_text(stream))
        if ad._resolved_url and ad._resolved_url != url:
            try:
                # Verify the discovered URL actually fetches before
                # reporting it as the feed's new location.
                self.fetch(ad._resolved_url, etag=None, modified=None,
                           autodiscovery=False, **kwargs)
                return Result(NEW_LOCATION, ad._resolved_url)
            except Exception:
                # logger.warn() is deprecated since Python 3.3 — use warning()
                logger.warning('Feed autodiscovery failed', exc_info=True)

            # Second, try to resolve the URL
            new_url = self._resolve_url(url)
            if new_url and new_url != url:
                return Result(NEW_LOCATION, new_url)

    # xml documents specify the encoding inline so better pass encoded body.
    # Especially since requests will use ISO-8859-1 for content-type 'text/xml'
    # if the server doesn't specify a charset.
    return self.parse_feed(url, BytesIO(stream.content), stream.headers,
                           UPDATED_FEED, **kwargs)
def get_channel_desc(url, feed_data=None):
    """Return the description of the YouTube channel behind *url*.

    Scrapes the channel page and extracts the content of the
    <meta name="description"> tag.  Returns a localized placeholder when
    the channel provides no description; returns None (after logging)
    when *url* is not a YouTube URL or when scraping fails.
    """
    if 'youtube.com' in url:

        class YouTubeHTMLDesc(HTMLParser):
            """This custom html parser searches for the YouTube channel description."""

            def __init__(self):
                super().__init__()
                self.description = ''

            def handle_starttag(self, tag, attributes):
                # HTMLParser hands us a list of (name, value) pairs.
                attribute_dict = dict(attributes)

                # Get YouTube channel description.
                if tag == 'meta' \
                        and attribute_dict.get('name') == "description":
                    self.description = attribute_dict['content']

        try:
            channel_url = get_channel_id_url(url, feed_data)
            r = util.urlopen(channel_url)
            if not r.ok:
                raise YouTubeError('Youtube "%s": %d %s'
                                   % (url, r.status_code, r.reason))
            html_data = util.response_text(r)
            parser = YouTubeHTMLDesc()
            parser.feed(html_data)
            if parser.description:
                logger.debug('YouTube description for %s is: %s',
                             url, parser.description)
                return parser.description
            else:
                logger.debug('YouTube description for %s is not provided.', url)
                return _('No description available')
        except Exception:
            # Lazy %-style args: the message is only formatted when the
            # record is actually emitted (was eagerly built with '%').
            logger.warning('Could not retrieve YouTube channel description for %s.',
                           url, exc_info=True)
def get_cover(url, feed_data=None):
    """Return the cover-art URL for the YouTube channel behind *url*.

    Scrapes the channel page, preferring the 900x900px 'image_src' link
    and falling back to the (possibly 100x100px) header profile image.
    Returns None (after logging) when *url* is not a YouTube URL, no
    image is found, or scraping fails.
    """
    if 'youtube.com' in url:

        class YouTubeHTMLCoverParser(HTMLParser):
            """This custom html parser searches for the youtube channel thumbnail/avatar"""

            def __init__(self):
                super().__init__()
                self.url = []

            def handle_starttag(self, tag, attributes):
                # HTMLParser hands us a list of (name, value) pairs.
                attribute_dict = dict(attributes)

                # Look for 900x900px image first.
                if tag == 'link' \
                        and attribute_dict.get('rel') == 'image_src':
                    self.url.append(attribute_dict['href'])

                # Fallback to image that may only be 100x100px.
                elif tag == 'img' \
                        and attribute_dict.get('class') == "channel-header-profile-image":
                    self.url.append(attribute_dict['src'])

        try:
            channel_url = get_channel_id_url(url, feed_data)
            r = util.urlopen(channel_url)
            if not r.ok:
                raise YouTubeError('Youtube "%s": %d %s'
                                   % (url, r.status_code, r.reason))
            html_data = util.response_text(r)
            parser = YouTubeHTMLCoverParser()
            parser.feed(html_data)
            if parser.url:
                # Lazy %-args instead of eager str.format(): only
                # formatted when debug logging is actually enabled.
                logger.debug('Youtube cover art for %s is: %s', url, parser.url)
                return parser.url[0]
        except Exception:
            logger.warning('Could not retrieve cover art', exc_info=True)