Example #1
    def get_video_data(self, item):
        if item.get('published_parsed'):
            best_date = struct_time_to_datetime(item['published_parsed'])
        elif item.get('updated_parsed'):
            best_date = struct_time_to_datetime(item['updated_parsed'])
        else:
            best_date = None

        link = item.get('link')
        if 'links' in item:
            for possible_link in item.links:
                if possible_link.get('rel') == 'via':
                    # original URL
                    link = possible_link['href']
                    break
        if ('content' in item and item['content'] and
            item['content'][0]['value']): # Atom
            description = item['content'][0]['value']
        else:
            description = item.get('summary', '')

        files = [VideoFile(url=enclosure.get('url'),
                           mime_type=enclosure.get('type'),
                           length=(enclosure.get('filesize') or
                                   enclosure.get('length')))
                 for enclosure in get_accepted_enclosures(item)]

        embed_code = None
        if 'media_player' in item:
            player = item['media_player']
            if player.get('content'):
                embed_code = convert_entities(player['content'])
            elif 'url' in player:
                files.append(VideoFile(
                                     url=player['url'],
                                     mime_type=player.get('type')))
        if not files:
            files = None
        if 'media_license' in item:
            license = item['media_license']['href']
        else:
            license = item.get('license')
        return {
            'link': link,
            'title': convert_entities(item.get('title', '')),
            'description': description,
            'thumbnail_url': get_entry_thumbnail_url(item),
            'files': files,
            'publish_datetime': best_date,
            'guid': item.get('id'),
            'embed_code': embed_code,
            'tags': [tag['term'] for tag in item['tags']
                     if tag['scheme'] is None] if 'tags' in item else None,
            'license': license
        }
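
Every example on this page funnels time.struct_time values (feedparser's *_parsed fields, or time.gmtime() results) through struct_time_to_datetime. A minimal sketch of what such a helper presumably does, assuming naive datetimes built from the first six fields (vidscraper's actual implementation may differ):

    import datetime

    def struct_time_to_datetime(struct_time):
        # Build a naive datetime from the first six fields of a time.struct_time:
        # (year, month, day, hour, minute, second).
        return datetime.datetime(*struct_time[:6])
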
Example #2
File: base.py Project: msabramo/vidscraper
    def get_feed_last_modified(self, feed, feed_response):
        """
        Returns the last modification date for the ``feed_response`` as a
        python datetime, or ``None`` if no date can be determined. By default,
        assumes that the response is a :mod:`feedparser` structure and returns
        a value based on that.

        """
        if 'updated_parsed' in feed_response.feed:
            return struct_time_to_datetime(feed_response.feed.updated_parsed)
        if 'published_parsed' in feed_response.feed:
            return struct_time_to_datetime(feed_response.feed.published_parsed)
        return None
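
The feed and feed_response arguments here are feedparser structures. A minimal sketch of how such a response is obtained (the feed URL is hypothetical):

    import feedparser

    # feed_response.feed carries feed-level metadata such as 'updated_parsed'
    # and 'published_parsed'; feed_response.entries holds the individual items.
    feed_response = feedparser.parse('http://example.com/videos.rss')
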
Example #3
    def parse_feed_entry(self, entry):
        enclosure = get_first_accepted_enclosure(entry)
        if "published_parsed" in entry:
            best_date = struct_time_to_datetime(entry["published_parsed"])
        elif "updated_parsed" in entry:
            best_date = struct_time_to_datetime(entry["updated_parsed"])
        else:
            best_date = None

        link = entry.get("link")
        if "links" in entry:
            for possible_link in entry.links:
                if possible_link.get("rel") == "via":
                    # original URL
                    link = possible_link["href"]
                    break
        if "content" in entry and entry["content"] and entry["content"][0]["value"]:  # Atom
            description = entry["content"][0]["value"]
        else:
            description = entry.get("summary", "")

        embed_code = None
        if "media_player" in entry:
            player = entry["media_player"]
            if player.get("content"):
                embed_code = convert_entities(player["content"])
            elif "url" in player:
                embed_code = make_embed_code(player["url"], "")
        if "media_license" in entry:
            license = entry["media_license"]["href"]
        else:
            license = entry.get("license")
        return {
            "link": link,
            "title": convert_entities(entry["title"]),
            "description": description,
            "thumbnail_url": get_entry_thumbnail_url(entry),
            "file_url": enclosure.get("url") if enclosure else None,
            "file_url_mimetype": enclosure.get("type") if enclosure else None,
            "file_url_length": ((enclosure.get("filesize") or enclosure.get("length")) if enclosure else None),
            "publish_datetime": best_date,
            "guid": entry.get("id"),
            "embed_code": embed_code,
            "tags": [tag["term"] for tag in entry["tags"] if tag["scheme"] is None] if "tags" in entry else None,
            "license": license,
        }
Example #4
    def parse_feed_entry(self, entry):
        enclosure = get_first_accepted_enclosure(entry)
        if 'published_parsed' in entry:
            best_date = struct_time_to_datetime(entry['published_parsed'])
        elif 'updated_parsed' in entry:
            best_date = struct_time_to_datetime(entry['updated_parsed'])
        else:
            best_date = None

        link = entry.get('link')
        if 'links' in entry:
            for possible_link in entry.links:
                if possible_link.get('rel') == 'via':
                    # original URL
                    link = possible_link['href']
                    break
        if ('content' in entry and entry['content'] and
            entry['content'][0]['value']): # Atom
            description = entry['content'][0]['value']
        else:
            description = entry['summary'] or ''

        embed_code = None
        if 'media_player' in entry:
            player = entry['media_player']
            if player.get('content'):
                embed_code = convert_entities(player['content'])
            elif 'url' in player:
                embed_code = make_embed_code(player['url'], '')

        return {
            'link': link,
            'title': convert_entities(entry['title']),
            'description': description,
            'thumbnail_url': get_entry_thumbnail_url(entry),
            'file_url': enclosure.get('url') if enclosure else None,
            'file_url_mimetype': enclosure.get('type') if enclosure else None,
            'file_url_length': ((enclosure.get('filesize') or
                                enclosure.get('length'))
                                if enclosure else None),
            'publish_datetime': best_date,
            'guid': entry.get('id'),
            'embed_code': embed_code,
            'tags': [tag['term'] for tag in entry['tags']
                     if tag['scheme'] is None] if 'tags' in entry else None
            }
Example #5
    def get_feed_last_modified(self, feed, feed_response):
        """
        Returns the last modification date for the ``feed_response`` as a
        python datetime, or ``None`` if no date can be determined. By default,
        assumes that the response is a :mod:`feedparser` structure and returns
        a value based on that.

        """
        struct_time = feed_response.feed.get('updated_parsed')
        return (struct_time_to_datetime(struct_time)
                if struct_time is not None else None)
Example #6
    def parse_feed_entry(self, entry):
        """
        Reusable method to parse a feedparser entry from a youtube rss feed.
        Returns a dictionary mapping :class:`.Video` fields to values.

        """
        user = entry['author']
        if 'published_parsed' in entry:
            best_date = struct_time_to_datetime(entry['published_parsed'])
        else:
            best_date = struct_time_to_datetime(entry['updated_parsed'])
        if ('summary_detail' in entry and
            entry['summary_detail']['type'] == 'text/html'):
            # HTML-ified description in RSS feeds
            soup = BeautifulSoup(entry['summary']).findAll('span')[0]
            description = unicode(soup.string)
        else:
            description = entry['summary']
        data = {
            'link': entry['links'][0]['href'].split('&', 1)[0],
            'title': entry['title'],
            'description': description,
            'thumbnail_url': get_entry_thumbnail_url(entry),
            'publish_datetime': best_date,
            'tags': [t['term'] for t in entry['tags']
                    if not t['term'].startswith('http')],
            'user': user,
            'user_url': u'http://www.youtube.com/user/%s' % user,
            'guid' : entry['id'],
        }
        if entry.id.startswith('tag:youtube.com'):
            data['guid'] = 'http://gdata.youtube.com/feeds/api/videos/%s' % (
                entry.id.split(':')[-1],)
        if 'media_player' in entry: # only in search feeds/API?
            data['flash_enclosure_url'] = entry['media_player']['url']
        if data['thumbnail_url'].endswith('/default.jpg'):
            # got a crummy version; increase the resolution
            data['thumbnail_url'] = data['thumbnail_url'].replace(
                '/default.jpg', '/hqdefault.jpg')
        return data
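
As a worked illustration of the GUID normalization at the end of this method, using a made-up video id:

    # Made-up Atom-style entry id, purely for illustration.
    entry_id = 'tag:youtube.com,2008:video:abc123'
    guid = 'http://gdata.youtube.com/feeds/api/videos/%s' % (entry_id.split(':')[-1],)
    # guid == 'http://gdata.youtube.com/feeds/api/videos/abc123'
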
Example #7
    def get_video_data(self, response):
        if response.status_code == 402:
            # 402: Payment required.
            # A note in the previous code said this could happen when too many
            # requests were made (per second?) Unclear why, though, or why
            # this is only caught here.
            return {}
        params = urlparse.parse_qs(response.text.encode('utf-8'))
        if params['status'][0] == 'fail':
            if params['errorcode'][0] == '150':  # unembedable
                return {'is_embeddable': False}
            return {}
        data = {
            'title': params['title'][0].decode('utf8'),
            'thumbnail_url': params['thumbnail_url'][0],
        }
        if 'keywords' in params:
            data['tags'] = params['keywords'][0].decode('utf8').split(',')
        if data['thumbnail_url'].endswith('/default.jpg'):
            # got a crummy version; increase the resolution
            data['thumbnail_url'] = data['thumbnail_url'].replace(
                '/default.jpg', '/hqdefault.jpg')

        url_querystrings = params["url_encoded_fmt_stream_map"][0].split(",")
        url_data = [urlparse.parse_qs(qs) for qs in url_querystrings]
        url_data_map = dict(
            (ud['itag'][0], ud) for ud in url_data if 'itag' in ud)

        data['files'] = []
        for code, mime_type, width, height in self.formats:
            if code in url_data_map:
                file_data = url_data_map[code]
                parsed_file_url = urlparse.urlsplit(file_data['url'][0])
                parsed_file_url_qs = dict(
                    urlparse.parse_qsl(parsed_file_url.query))
                expires = struct_time_to_datetime(
                    time.gmtime(int(parsed_file_url_qs['expire'])))
                parsed_file_url_qs['signature'] = file_data['sig'][0]
                url = urlparse.urlunsplit(parsed_file_url[:3] + (
                    urllib.urlencode(parsed_file_url_qs), ) +
                                          parsed_file_url[4:])
                data['files'].append(
                    VideoFile(url=url,
                              expires=expires,
                              mime_type=mime_type,
                              width=width,
                              height=height))
        return data
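
A self-contained sketch of the signature and expiry handling above, using made-up values (Python 2, like the examples):

    import time
    import urllib
    import urlparse

    # Made-up stream URL and signature, purely for illustration.
    file_url = 'http://video.example.com/videoplayback?itag=22&expire=1300000000'
    sig = 'ABCDEF'

    parsed = urlparse.urlsplit(file_url)
    qs = dict(urlparse.parse_qsl(parsed.query))
    qs['signature'] = sig
    url = urlparse.urlunsplit(parsed[:3] + (urllib.urlencode(qs),) + parsed[4:])
    expires = struct_time_to_datetime(time.gmtime(int(qs['expire'])))
    # expires == datetime.datetime(2011, 3, 13, 7, 6, 40)  (UTC)
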
Example #8
File: kaltura.py Project: lowks/vidscraper
    def get_video_data(self, item):
        files = [VideoFile(url=enclosure.get('url'),
                           mime_type=enclosure.get('type'),
                           length=(enclosure.get('filesize') or
                                   enclosure.get('length')))
                 for enclosure in get_accepted_enclosures(item)]

        data = {
            'title': item.title,
            'description': item.description,
            'thumbnail_url': item.media_thumbnail[0]['url'],
            'publish_datetime': struct_time_to_datetime(item.published_parsed),
            'user': item['kaltura_userscreenname'],
            'files': files or None,
        }
        return data
Example #9
    def get_video_data(self, response):
        if response.status_code == 402:
            # 402: Payment required.
            # A note in the previous code said this could happen when too many
            # requests were made (per second?) Unclear why, though, or why
            # this is only caught here.
            return {}
        params = urlparse.parse_qs(response.text.encode('utf-8'))
        if params['status'][0] == 'fail':
            if params['errorcode'][0] == '150': # unembedable
                return {'is_embeddable': False}
            return {}
        data = {
            'title': params['title'][0].decode('utf8'),
            'thumbnail_url': params['thumbnail_url'][0],
            }
        if 'keywords' in params:
            data['tags'] = params['keywords'][0].decode('utf8').split(',')
        if data['thumbnail_url'].endswith('/default.jpg'):
            # got a crummy version; increase the resolution
            data['thumbnail_url'] = data['thumbnail_url'].replace(
                '/default.jpg', '/hqdefault.jpg')

        url_querystrings = params["url_encoded_fmt_stream_map"][0].split(",")
        url_data = [urlparse.parse_qs(qs) for qs in url_querystrings]
        url_data_map = dict((ud['itag'][0], ud) for ud in url_data if 'itag' in ud)

        data['files'] = []
        for code, mime_type, width, height in self.formats:
            if code in url_data_map:
                file_data = url_data_map[code]
                parsed_file_url = urlparse.urlsplit(file_data['url'][0])
                parsed_file_url_qs = dict(urlparse.parse_qsl(parsed_file_url.query))
                expires = struct_time_to_datetime(time.gmtime(int(parsed_file_url_qs['expire'])))
                parsed_file_url_qs['signature'] = file_data['sig'][0]
                url = urlparse.urlunsplit(parsed_file_url[:3] +
                                          (urllib.urlencode(parsed_file_url_qs),) +
                                          parsed_file_url[4:])
                data['files'].append(VideoFile(url=url,
                                               expires=expires,
                                               mime_type=mime_type,
                                               width=width,
                                               height=height))
        return data
Example #10
    def parse_scrape_response(self, response_text):
        params = urlparse.parse_qs(response_text)
        if params['status'][0] == 'fail':
            if params['errorcode'][0] == '150': # unembedable
                return {'is_embeddable': False}
            return {}
        data = {
            'title': params['title'][0].decode('utf8'),
            'user': params['author'][0].decode('utf8'),
            'user_url': u'http://www.youtube.com/user/%s' % (
                params['author'][0].decode('utf8')),
            'thumbnail_url': params['thumbnail_url'][0],
            }
        if 'keywords' in params:
            data['tags'] = params['keywords'][0].decode('utf8').split(',')
        if data['thumbnail_url'].endswith('/default.jpg'):
            # got a crummy version; increase the resolution
            data['thumbnail_url'] = data['thumbnail_url'].replace(
                '/default.jpg', '/hqdefault.jpg')

        # fmt_url_map is a comma separated list of pipe separated
        # pairs of fmt, url
        # build the format codes.
        fmt_list = [int(x.split('/')[0])
                    for x in params['fmt_list'][0].split(',')]
        # build the list of available urls.
        fmt_url_map = params["url_encoded_fmt_stream_map"][0].split(",")
        # strip url= from url=xxxxxx, strip trailer.
        fmt_url_map = [urllib.unquote_plus(x[4:]).split(';')[0]
                       for x in fmt_url_map]
        # now build the actual fmt_url_map ...
        fmt_url_map = dict(zip(fmt_list, fmt_url_map))
        for fmt, mimetype in self.preferred_fmt_types:
            if fmt in fmt_url_map:
                data['file_url'] = file_url = fmt_url_map[fmt]
                data['file_url_mimetype'] = mimetype
                parsed_url = urlparse.urlparse(file_url)
                file_url_qs = urlparse.parse_qs(parsed_url.query)
                data['file_url_expires'] = struct_time_to_datetime(
                    time.gmtime(int(file_url_qs['expire'][0])))
        return data
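
The fmt_list / url_encoded_fmt_stream_map parsing above can be illustrated with made-up parameter values:

    import urllib

    # Hypothetical parameter values, purely for illustration.
    fmt_list_param = '22/1280x720,34/640x360'
    stream_map_param = ('url=http%3A%2F%2Fexample.com%2F22.mp4,'
                        'url=http%3A%2F%2Fexample.com%2F34.flv')

    fmt_list = [int(x.split('/')[0]) for x in fmt_list_param.split(',')]
    urls = [urllib.unquote_plus(x[4:]).split(';')[0]
            for x in stream_map_param.split(',')]
    fmt_url_map = dict(zip(fmt_list, urls))
    # fmt_url_map == {22: 'http://example.com/22.mp4', 34: 'http://example.com/34.flv'}
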
Example #11
    def parse_scrape_response(self, response_text):
        doc = minidom.parseString(response_text)
        error_id = doc.getElementsByTagName('error_id').item(0)
        if (error_id is not None and
            error_id.firstChild.data == 'embed_blocked'):
            return {
                'is_embeddable': False
                }
        xml_data = {}
        for key in ('url', 'caption', 'thumbnail', 'uploader_url',
                    'uploader_display_name', 'isHD', 'embed_code',
                    'request_signature', 'request_signature_expires',
                    'nodeId'):
            item = doc.getElementsByTagName(key).item(0)
            str_data = item.firstChild.data
            if isinstance(str_data, unicode):
                xml_data[key] = str_data # actually Unicode
            else:
                xml_data[key] = str_data.decode('utf8')

        data = {
            'link': xml_data['url'],
            'user': xml_data['uploader_display_name'],
            'user_url': xml_data['uploader_url'],
            'title': xml_data['caption'],
            'thumbnail_url': xml_data['thumbnail'],
            'embed_code': xml_data['embed_code'],
            'file_url_expires': struct_time_to_datetime(time.gmtime(
                    int(xml_data['request_signature_expires']))),
            'file_url_mimetype': u'video/x-flv',
            }
        base_file_url = (
            'http://www.vimeo.com/moogaloop/play/clip:%(nodeId)s/'
            '%(request_signature)s/%(request_signature_expires)s'
            '/?q=' % xml_data)
        if xml_data['isHD'] == '1':
            data['file_url'] = base_file_url + 'hd'
        else:
            data['file_url'] = base_file_url + 'sd'

        return data
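
With made-up values, the moogaloop URL built above would look like this:

    # Made-up values, purely to illustrate the constructed URL.
    xml_data = {'nodeId': '12345',
                'request_signature': 'abcdef',
                'request_signature_expires': '1300000000',
                'isHD': '1'}
    base_file_url = ('http://www.vimeo.com/moogaloop/play/clip:%(nodeId)s/'
                     '%(request_signature)s/%(request_signature_expires)s'
                     '/?q=' % xml_data)
    file_url = base_file_url + ('hd' if xml_data['isHD'] == '1' else 'sd')
    # file_url == 'http://www.vimeo.com/moogaloop/play/clip:12345/abcdef/1300000000/?q=hd'
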
Example #12
    def parse_scrape_response(self, response_text):
        doc = minidom.parseString(response_text)
        xml_data = {}
        for key in (
            "url",
            "caption",
            "thumbnail",
            "uploader_url",
            "uploader_display_name",
            "isHD",
            "embed_code",
            "request_signature",
            "request_signature_expires",
            "nodeId",
        ):
            xml_data[key] = doc.getElementsByTagName(key).item(0).firstChild.data.decode("utf8")

        data = {
            "link": xml_data["url"],
            "user": xml_data["uploader_display_name"],
            "user_url": xml_data["uploader_url"],
            "title": xml_data["caption"],
            "thumbnail_url": xml_data["thumbnail"],
            "embed_code": xml_data["embed_code"],
            "file_url_expires": struct_time_to_datetime(time.gmtime(int(xml_data["request_signature_expires"]))),
            "file_url_mimetype": u"video/x-flv",
        }
        base_file_url = (
            "http://www.vimeo.com/moogaloop/play/clip:%(nodeId)s/"
            "%(request_signature)s/%(request_signature_expires)s"
            "/?q=" % xml_data
        )
        if xml_data["isHD"] == "1":
            data["file_url"] = base_file_url + "hd"
        else:
            data["file_url"] = base_file_url + "sd"

        return data
Example #13
    def data_from_response(self, response):
        feed = response.feed
        data = {
            'title': feed.get('title'),
            'description': feed.get('subtitle'),
            'webpage': feed.get('link'),
            'guid': feed.get('id'),
            'etag': response.get('etag'),
        }
        try:
            data['thumbnail_url'] = get_item_thumbnail_url(feed)
        except KeyError:
            pass

        # Should this be using response.modified?
        parsed = feed.get('updated_parsed') or feed.get('published_parsed')
        if parsed:
            data['last_modified'] = struct_time_to_datetime(parsed)

        # If there are more entries than page length, don't guess.
        if self.per_page is None or len(response.entries) < self.per_page:
            data['video_count'] = len(response.entries)

        return data
Example #15
    def get_video_data(self, item):
        if item.get('published_parsed'):
            best_date = struct_time_to_datetime(item['published_parsed'])
        elif item.get('updated_parsed'):
            best_date = struct_time_to_datetime(item['updated_parsed'])
        else:
            best_date = None

        link = item.get('link')
        if 'links' in item:
            for possible_link in item.links:
                if possible_link.get('rel') == 'via':
                    # original URL
                    link = possible_link['href']
                    break
        if ('content' in item and item['content']
                and item['content'][0]['value']):  # Atom
            description = item['content'][0]['value']
        else:
            description = item.get('summary', '')

        files = [
            VideoFile(url=enclosure.get('url'),
                      mime_type=enclosure.get('type'),
                      length=(enclosure.get('filesize')
                              or enclosure.get('length')))
            for enclosure in get_accepted_enclosures(item)
        ]

        embed_code = None
        if 'media_player' in item:
            player = item['media_player']
            if player.get('content'):
                embed_code = convert_entities(player['content'])
            elif 'url' in player:
                files.append(
                    VideoFile(url=player['url'], mime_type=player.get('type')))
        if not files:
            files = None
        if 'media_license' in item:
            license = item['media_license']['href']
        else:
            license = item.get('license')
        return {
            'link': link,
            'title': convert_entities(item.get('title', '')),
            'description': description,
            'thumbnail_url': get_entry_thumbnail_url(item),
            'files': files,
            'publish_datetime': best_date,
            'guid': item.get('id'),
            'embed_code': embed_code,
            'tags': [tag['term'] for tag in item['tags']
                     if tag['scheme'] is None] if 'tags' in item else None,
            'license': license
        }