def get_video_data(self, item):
    if item.get('published_parsed'):
        best_date = struct_time_to_datetime(item['published_parsed'])
    elif item.get('updated_parsed'):
        best_date = struct_time_to_datetime(item['updated_parsed'])
    else:
        best_date = None

    link = item.get('link')
    if 'links' in item:
        for possible_link in item.links:
            if possible_link.get('rel') == 'via':
                # original URL
                link = possible_link['href']
                break

    if ('content' in item and item['content'] and
            item['content'][0]['value']):
        # Atom
        description = item['content'][0]['value']
    else:
        description = item.get('summary', '')

    files = [VideoFile(url=enclosure.get('url'),
                       mime_type=enclosure.get('type'),
                       length=(enclosure.get('filesize') or
                               enclosure.get('length')))
             for enclosure in get_accepted_enclosures(item)]

    embed_code = None
    if 'media_player' in item:
        player = item['media_player']
        if player.get('content'):
            embed_code = convert_entities(player['content'])
        elif 'url' in player:
            files.append(VideoFile(url=player['url'],
                                   mime_type=player.get('type')))

    if not files:
        files = None

    if 'media_license' in item:
        license = item['media_license']['href']
    else:
        license = item.get('license')

    return {
        'link': link,
        'title': convert_entities(item.get('title', '')),
        'description': description,
        'thumbnail_url': get_entry_thumbnail_url(item),
        'files': files,
        'publish_datetime': best_date,
        'guid': item.get('id'),
        'embed_code': embed_code,
        'tags': [tag['term'] for tag in item['tags']
                 if tag['scheme'] is None] if 'tags' in item else None,
        'license': license,
    }
def get_feed_last_modified(self, feed, feed_response):
    """
    Returns the last modification date for the ``feed_response`` as a
    python datetime, or ``None`` if no date can be determined. By
    default, assumes that the response is a :mod:`feedparser` structure
    and returns a value based on that.

    """
    if 'updated_parsed' in feed_response.feed:
        return struct_time_to_datetime(feed_response.feed.updated_parsed)
    if 'published_parsed' in feed_response.feed:
        return struct_time_to_datetime(feed_response.feed.published_parsed)
    return None
def parse_feed_entry(self, entry):
    enclosure = get_first_accepted_enclosure(entry)

    if "published_parsed" in entry:
        best_date = struct_time_to_datetime(entry["published_parsed"])
    elif "updated_parsed" in entry:
        best_date = struct_time_to_datetime(entry["updated_parsed"])
    else:
        best_date = None

    link = entry.get("link")
    if "links" in entry:
        for possible_link in entry.links:
            if possible_link.get("rel") == "via":
                # original URL
                link = possible_link["href"]
                break

    if "content" in entry and entry["content"] and entry["content"][0]["value"]:
        # Atom
        description = entry["content"][0]["value"]
    else:
        description = entry.get("summary", "")

    embed_code = None
    if "media_player" in entry:
        player = entry["media_player"]
        if player.get("content"):
            embed_code = convert_entities(player["content"])
        elif "url" in player:
            embed_code = make_embed_code(player["url"], "")

    if "media_license" in entry:
        license = entry["media_license"]["href"]
    else:
        license = entry.get("license")

    return {
        "link": link,
        "title": convert_entities(entry["title"]),
        "description": description,
        "thumbnail_url": get_entry_thumbnail_url(entry),
        "file_url": enclosure.get("url") if enclosure else None,
        "file_url_mimetype": enclosure.get("type") if enclosure else None,
        "file_url_length": ((enclosure.get("filesize") or
                             enclosure.get("length"))
                            if enclosure else None),
        "publish_datetime": best_date,
        "guid": entry.get("id"),
        "embed_code": embed_code,
        "tags": [tag["term"] for tag in entry["tags"]
                 if tag["scheme"] is None] if "tags" in entry else None,
        "license": license,
    }
def parse_feed_entry(self, entry):
    enclosure = get_first_accepted_enclosure(entry)

    if 'published_parsed' in entry:
        best_date = struct_time_to_datetime(entry['published_parsed'])
    elif 'updated_parsed' in entry:
        best_date = struct_time_to_datetime(entry['updated_parsed'])
    else:
        best_date = None

    link = entry.get('link')
    if 'links' in entry:
        for possible_link in entry.links:
            if possible_link.get('rel') == 'via':
                # original URL
                link = possible_link['href']
                break

    if ('content' in entry and entry['content'] and
            entry['content'][0]['value']):
        # Atom
        description = entry['content'][0]['value']
    else:
        # use .get() so entries without a summary don't raise KeyError
        description = entry.get('summary', '')

    embed_code = None
    if 'media_player' in entry:
        player = entry['media_player']
        if player.get('content'):
            embed_code = convert_entities(player['content'])
        elif 'url' in player:
            embed_code = make_embed_code(player['url'], '')

    return {
        'link': link,
        'title': convert_entities(entry['title']),
        'description': description,
        'thumbnail_url': get_entry_thumbnail_url(entry),
        'file_url': enclosure.get('url') if enclosure else None,
        'file_url_mimetype': enclosure.get('type') if enclosure else None,
        'file_url_length': ((enclosure.get('filesize') or
                             enclosure.get('length'))
                            if enclosure else None),
        'publish_datetime': best_date,
        'guid': entry.get('id'),
        'embed_code': embed_code,
        'tags': [tag['term'] for tag in entry['tags']
                 if tag['scheme'] is None] if 'tags' in entry else None,
    }
def get_feed_last_modified(self, feed, feed_response):
    """
    Returns the last modification date for the ``feed_response`` as a
    python datetime, or ``None`` if no date can be determined. By
    default, assumes that the response is a :mod:`feedparser` structure
    and returns a value based on that.

    """
    struct_time = feed_response.feed.get('updated_parsed')
    return (struct_time_to_datetime(struct_time)
            if struct_time is not None else None)
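# --- Illustrative sketch (not part of the suite code) -----------------------
# struct_time_to_datetime() is called throughout these parsers but is not
# shown in this excerpt. Assuming it does nothing more than turn feedparser's
# time.struct_time values (published_parsed / updated_parsed) into naive
# datetimes, a minimal version could look like this; the real helper may
# handle timezones or missing fields differently.
import datetime
import time


def struct_time_to_datetime(struct_time):
    # The first six fields of a struct_time are (year, month, day, hour,
    # minute, second), which is exactly what datetime() expects.
    return datetime.datetime(*struct_time[:6])


# time.gmtime() returns a struct_time, just like feedparser's *_parsed fields.
assert struct_time_to_datetime(time.gmtime(0)) == datetime.datetime(1970, 1, 1)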
def parse_feed_entry(self, entry):
    """
    Reusable method to parse a feedparser entry from a youtube rss feed.
    Returns a dictionary mapping :class:`.Video` fields to values.

    """
    user = entry['author']
    if 'published_parsed' in entry:
        best_date = struct_time_to_datetime(entry['published_parsed'])
    else:
        best_date = struct_time_to_datetime(entry['updated_parsed'])

    if ('summary_detail' in entry and
            entry['summary_detail']['type'] == 'text/html'):
        # HTML-ified description in RSS feeds
        soup = BeautifulSoup(entry['summary']).findAll('span')[0]
        description = unicode(soup.string)
    else:
        description = entry['summary']

    data = {
        'link': entry['links'][0]['href'].split('&', 1)[0],
        'title': entry['title'],
        'description': description,
        'thumbnail_url': get_entry_thumbnail_url(entry),
        'publish_datetime': best_date,
        'tags': [t['term'] for t in entry['tags']
                 if not t['term'].startswith('http')],
        'user': user,
        'user_url': u'http://www.youtube.com/user/%s' % user,
        'guid': entry['id'],
    }
    if entry.id.startswith('tag:youtube.com'):
        data['guid'] = 'http://gdata.youtube.com/feeds/api/videos/%s' % (
            entry.id.split(':')[-1],)
    if 'media_player' in entry:
        # only in search feeds/API?
        data['flash_enclosure_url'] = entry['media_player']['url']
    if data['thumbnail_url'].endswith('/default.jpg'):
        # got a crummy version; increase the resolution
        data['thumbnail_url'] = data['thumbnail_url'].replace(
            '/default.jpg', '/hqdefault.jpg')
    return data
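# --- Illustrative usage (not from the original suite) -----------------------
# parse_feed_entry() above expects a feedparser entry, so a caller might look
# roughly like this. 'YouTubeSuite' and the feed URL are placeholders, not
# names confirmed by this excerpt.
import feedparser

suite = YouTubeSuite()
parsed = feedparser.parse(
    'http://gdata.youtube.com/feeds/api/users/someuser/uploads')
videos = [suite.parse_feed_entry(entry) for entry in parsed.entries]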
def get_video_data(self, response):
    if response.status_code == 402:
        # 402: Payment required.
        # A note in the previous code said this could happen when too many
        # requests were made (per second?) Unclear why, though, or why this
        # is only caught here.
        return {}

    params = urlparse.parse_qs(response.text.encode('utf-8'))

    if params['status'][0] == 'fail':
        if params['errorcode'][0] == '150':
            # unembeddable
            return {'is_embeddable': False}
        return {}

    data = {
        'title': params['title'][0].decode('utf8'),
        'thumbnail_url': params['thumbnail_url'][0],
    }

    if 'keywords' in params:
        data['tags'] = params['keywords'][0].decode('utf8').split(',')

    if data['thumbnail_url'].endswith('/default.jpg'):
        # got a crummy version; increase the resolution
        data['thumbnail_url'] = data['thumbnail_url'].replace(
            '/default.jpg', '/hqdefault.jpg')

    url_querystrings = params["url_encoded_fmt_stream_map"][0].split(",")
    url_data = [urlparse.parse_qs(qs) for qs in url_querystrings]
    url_data_map = dict(
        (ud['itag'][0], ud) for ud in url_data if 'itag' in ud)

    data['files'] = []
    for code, mime_type, width, height in self.formats:
        if code in url_data_map:
            file_data = url_data_map[code]
            parsed_file_url = urlparse.urlsplit(file_data['url'][0])
            parsed_file_url_qs = dict(
                urlparse.parse_qsl(parsed_file_url.query))
            expires = struct_time_to_datetime(
                time.gmtime(int(parsed_file_url_qs['expire'])))
            parsed_file_url_qs['signature'] = file_data['sig'][0]
            url = urlparse.urlunsplit(parsed_file_url[:3] + (
                urllib.urlencode(parsed_file_url_qs),) + parsed_file_url[4:])
            data['files'].append(
                VideoFile(url=url,
                          expires=expires,
                          mime_type=mime_type,
                          width=width,
                          height=height))
    return data
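# --- Illustrative sketch (not from the original suite) ----------------------
# self.formats above is assumed to be an ordered sequence of
# (itag code, mime type, width, height) tuples, most preferred first. The
# codes are strings because the keys of url_data_map come from
# urlparse.parse_qs(). The exact entries here are examples only.
formats = (
    ('22', u'video/mp4', 1280, 720),
    ('18', u'video/mp4', 640, 360),
    ('5', u'video/x-flv', 400, 240),
)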
def get_video_data(self, item):
    files = [VideoFile(url=enclosure.get('url'),
                       mime_type=enclosure.get('type'),
                       length=(enclosure.get('filesize') or
                               enclosure.get('length')))
             for enclosure in get_accepted_enclosures(item)]

    data = {
        'title': item.title,
        'description': item.description,
        'thumbnail_url': item.media_thumbnail[0]['url'],
        'publish_datetime': struct_time_to_datetime(item.published_parsed),
        'user': item['kaltura_userscreenname'],
        'files': files or None,
    }
    return data
def parse_scrape_response(self, response_text):
    params = urlparse.parse_qs(response_text)

    if params['status'][0] == 'fail':
        if params['errorcode'][0] == '150':
            # unembeddable
            return {'is_embeddable': False}
        return {}

    data = {
        'title': params['title'][0].decode('utf8'),
        'user': params['author'][0].decode('utf8'),
        'user_url': u'http://www.youtube.com/user/%s' % (
            params['author'][0].decode('utf8')),
        'thumbnail_url': params['thumbnail_url'][0],
    }

    if 'keywords' in params:
        data['tags'] = params['keywords'][0].decode('utf8').split(',')

    if data['thumbnail_url'].endswith('/default.jpg'):
        # got a crummy version; increase the resolution
        data['thumbnail_url'] = data['thumbnail_url'].replace(
            '/default.jpg', '/hqdefault.jpg')

    # build the format codes.
    fmt_list = [int(x.split('/')[0])
                for x in params['fmt_list'][0].split(',')]
    # build the list of available urls.
    fmt_url_map = params["url_encoded_fmt_stream_map"][0].split(",")
    # strip url= from url=xxxxxx, strip trailer.
    fmt_url_map = [urllib.unquote_plus(x[4:]).split(';')[0]
                   for x in fmt_url_map]
    # now build the actual fmt_url_map ...
    fmt_url_map = dict(zip(fmt_list, fmt_url_map))

    for fmt, mimetype in self.preferred_fmt_types:
        if fmt in fmt_url_map:
            data['file_url'] = file_url = fmt_url_map[fmt]
            data['file_url_mimetype'] = mimetype
            parsed_url = urlparse.urlparse(file_url)
            file_url_qs = urlparse.parse_qs(parsed_url.query)
            data['file_url_expires'] = struct_time_to_datetime(
                time.gmtime(int(file_url_qs['expire'][0])))
            # keep the most preferred format available
            break
    return data
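# --- Illustrative sketch (not from the original suite) ----------------------
# A toy walk-through of the fmt_list / url_encoded_fmt_stream_map parsing
# above, using made-up values: fmt_list pairs each format code with a
# resolution descriptor, and url_encoded_fmt_stream_map is a comma-separated
# list whose 'url=' entries line up with those codes.
import urllib

params = {
    'fmt_list': ['22/1280x720/9/0/115,18/640x360/9/0/115'],
    'url_encoded_fmt_stream_map': [
        'url=' + urllib.quote_plus('http://example.com/hd.mp4?expire=0') +
        ',url=' + urllib.quote_plus('http://example.com/sd.mp4?expire=0')],
}

fmt_list = [int(x.split('/')[0]) for x in params['fmt_list'][0].split(',')]
fmt_url_map = params['url_encoded_fmt_stream_map'][0].split(',')
fmt_url_map = [urllib.unquote_plus(x[4:]).split(';')[0] for x in fmt_url_map]
fmt_url_map = dict(zip(fmt_list, fmt_url_map))
assert fmt_url_map[22] == 'http://example.com/hd.mp4?expire=0'
assert fmt_url_map[18] == 'http://example.com/sd.mp4?expire=0'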
def parse_scrape_response(self, response_text):
    doc = minidom.parseString(response_text)
    error_id = doc.getElementsByTagName('error_id').item(0)
    if (error_id is not None and
            error_id.firstChild.data == 'embed_blocked'):
        return {'is_embeddable': False}

    xml_data = {}
    for key in ('url', 'caption', 'thumbnail', 'uploader_url',
                'uploader_display_name', 'isHD', 'embed_code',
                'request_signature', 'request_signature_expires',
                'nodeId'):
        item = doc.getElementsByTagName(key).item(0)
        str_data = item.firstChild.data
        if isinstance(str_data, unicode):
            xml_data[key] = str_data  # actually Unicode
        else:
            xml_data[key] = str_data.decode('utf8')

    data = {
        'link': xml_data['url'],
        'user': xml_data['uploader_display_name'],
        'user_url': xml_data['uploader_url'],
        'title': xml_data['caption'],
        'thumbnail_url': xml_data['thumbnail'],
        'embed_code': xml_data['embed_code'],
        'file_url_expires': struct_time_to_datetime(time.gmtime(
            int(xml_data['request_signature_expires']))),
        'file_url_mimetype': u'video/x-flv',
    }
    base_file_url = (
        'http://www.vimeo.com/moogaloop/play/clip:%(nodeId)s/'
        '%(request_signature)s/%(request_signature_expires)s'
        '/?q=' % xml_data)
    if xml_data['isHD'] == '1':
        data['file_url'] = base_file_url + 'hd'
    else:
        data['file_url'] = base_file_url + 'sd'
    return data
def parse_scrape_response(self, response_text):
    doc = minidom.parseString(response_text)
    xml_data = {}
    for key in (
        "url",
        "caption",
        "thumbnail",
        "uploader_url",
        "uploader_display_name",
        "isHD",
        "embed_code",
        "request_signature",
        "request_signature_expires",
        "nodeId",
    ):
        xml_data[key] = (
            doc.getElementsByTagName(key).item(0).firstChild.data.decode("utf8")
        )
    data = {
        "link": xml_data["url"],
        "user": xml_data["uploader_display_name"],
        "user_url": xml_data["uploader_url"],
        "title": xml_data["caption"],
        "thumbnail_url": xml_data["thumbnail"],
        "embed_code": xml_data["embed_code"],
        "file_url_expires": struct_time_to_datetime(
            time.gmtime(int(xml_data["request_signature_expires"]))),
        "file_url_mimetype": u"video/x-flv",
    }
    base_file_url = (
        "http://www.vimeo.com/moogaloop/play/clip:%(nodeId)s/"
        "%(request_signature)s/%(request_signature_expires)s"
        "/?q=" % xml_data
    )
    if xml_data["isHD"] == "1":
        data["file_url"] = base_file_url + "hd"
    else:
        data["file_url"] = base_file_url + "sd"
    return data
def data_from_response(self, response):
    feed = response.feed
    data = {
        'title': feed.get('title'),
        'description': feed.get('subtitle'),
        'webpage': feed.get('link'),
        'guid': feed.get('id'),
        'etag': response.get('etag'),
    }
    try:
        data['thumbnail_url'] = get_item_thumbnail_url(feed)
    except KeyError:
        pass

    # Should this be using response.modified?
    parsed = feed.get('updated_parsed') or feed.get('published_parsed')
    if parsed:
        data['last_modified'] = struct_time_to_datetime(parsed)

    # If there are more entries than page length, don't guess.
    if self.per_page is None or len(response.entries) < self.per_page:
        data['video_count'] = len(response.entries)
    return data