elif 'video_src' in tag['rel']: data['flash_enclosure_url'] = unicode(tag['href']) elif 'canonical' in tag['rel']: data['link'] = u"http://fora.tv{0}".format( unicode(tag['href'])) elif tag.name == 'span' and tag['id'] == 'program_title_text': data['title'] = unicode(tag.string) elif tag.name == 'dd' and 'description' in tag['class']: data['description'] = ''.join((unicode(t) for t in tag)).strip() elif tag.name == 'a' and 'partner_header' in tag['class']: data['user'] = unicode(tag.string) data['user_url'] = unicode(tag['href']) elif tag.name == 'div' and 'information_left' in tag['class']: dds = tag.find_all('dd') date = unicode(dds[2].string) date = datetime.datetime.strptime(date, "%m.%d.%y") data['publish_date'] = date return data class Suite(BaseSuite): """ Suite for fora.tv. As of 25-03-2012 fora does not offer any public API, only video pages and rss feeds. """ loader_classes = (ScrapeLoader,) registry.register(Suite)
return data def get_next_feed_page_url(self, feed, feed_response): parsed = urlparse.urlparse(feed_response.href) params = urlparse.parse_qs(parsed.query) try: page = int(params.get('page', ['1'])[0]) except ValueError: page = 1 params['page'] = unicode(page + 1) return "%s?%s" % (urlparse.urlunparse(parsed[:4] + (None, None)), urllib.urlencode(params, True)) def get_api_url(self, video): if '-' not in video.url: # http://blip.tv/file/1077145/ # oh no, an older URL; get the redirected URL resp = urllib.urlopen(video.url) video.url = resp.geturl() resp.close() parsed_url = urlparse.urlparse(video.url) post_id = parsed_url[2].rsplit('-', 1)[1] new_parsed_url = parsed_url[:2] + ("/rss/%s" % post_id, None, None, None) return urlparse.urlunparse(new_parsed_url) def parse_api_response(self, response_text): parsed = feedparser.parse(response_text) return self.parse_feed_entry(parsed.entries[0]) registry.register(BlipSuite)
parsed = urlparse.urlsplit(url) if (parsed.scheme in ('http', 'https') and parsed.netloc == 'video.google.com' and parsed.path == '/videoplay' and 'docid' in parsed.query): return {'url': url} raise UnhandledVideo(url) def get_video_data(self, response): soup = BeautifulSoup(response.text).findAll(id=self.id_regex) data = {} for tag in soup: if tag['id'] == 'video-title': data['title'] = unicode(tag.string) elif tag['id'] == 'video-description': data['description'] = ''.join( (unicode(t) for t in tag)).strip() elif tag['id'] == 'embed-video-code': # This isn't the cleanest way of handling the gt/lt problem, # but this is a scrape and liable to break anyway. KISS. data['embed_code'] = unicode(tag.string).replace( ">", ">").replace("<", "<") return data class Suite(BaseSuite): """Suite for scraping video pages from videos.google.com""" loader_classes = (ScrapeLoader, ) registry.register(Suite)
"user_url", ] ) def get_api_url(self, video): video_id = self.video_regex.match(video.url).group("id") if video.api_keys is None or "ustream_key" not in video.api_keys: raise ValueError("API key must be set for Ustream API requests.") return "http://api.ustream.tv/json/video/%s/getInfo/?key=%s" % (video_id, video.api_keys["ustream_key"]) def parse_api_response(self, response_text): parsed = json.loads(response_text)["results"] url = parsed["embedTagSourceUrl"] publish_date = datetime.datetime.strptime(parsed["createdAt"], "%Y-%m-%d %H:%M:%S") data = { "link": parsed["url"], "title": parsed["title"], "description": parsed["description"], "flash_enclosure_url": url, "embed_code": "<iframe src='%s' width='320' height='260' />" % url, "thumbnail_url": parsed["imageUrl"]["medium"], "publish_date": publish_date, "tags": [unicode(tag) for tag in parsed["tags"]], "user": parsed["user"]["userName"], "user_url": parsed["user"]["url"], } return data registry.register(UstreamSuite)
# NOTE(review): the functions below take ``self`` and call one another through
# it, so they are presumably methods of the suite class registered at the end
# of this chunk; the class header is outside the visible source -- confirm.
def get_next_page_url_params(self, response):
    """Work out the paging parameters for the page after ``response``.

    Reads the ``opensearch_startindex``, ``opensearch_itemsperpage`` and
    ``opensearch_totalresults`` entries of ``response['feed']``.

    :param response: mapping with a ``'feed'`` entry that supports ``.get``.
    :returns: a dict with ``start-index`` and ``max-results`` keys, or
              ``None`` when any of the three entries is missing or when the
              next start index would be greater than the total.
    """
    feed_info = response['feed']
    first = feed_info.get('opensearch_startindex')
    page_size = feed_info.get('opensearch_itemsperpage')
    total = feed_info.get('opensearch_totalresults')
    if any(value is None for value in (first, page_size, total)):
        return None

    next_start = int(first) + int(page_size)
    if next_start <= int(total):
        # ``max-results`` carries the page size exactly as it was read
        # (it is not converted to int).
        return {'start-index': next_start, 'max-results': page_size}
    return None


def get_next_search_page_url(self, search, search_response):
    """Return the URL of the next page of search results, if there is one.

    :param search: passed through unchanged to ``self.get_search_url``.
    :param search_response: response handed to
                            ``get_next_page_url_params``.
    :returns: the value of ``self.get_search_url`` called with the paging
              parameters as ``extra_params``, or ``None`` when no paging
              parameters could be derived.
    """
    paging = self.get_next_page_url_params(search_response)
    if paging:
        return self.get_search_url(search, extra_params=paging)
    return None


def get_next_feed_page_url(self, feed, feed_response):
    """Return the URL of the next page of a feed, if there is one.

    :param feed: object whose ``url`` attribute is passed to
                 ``self.get_feed_url``.
    :param feed_response: response handed to ``get_next_page_url_params``.
    :returns: the value of ``self.get_feed_url`` called with the paging
              parameters as ``extra_params``, or ``None`` when no paging
              parameters could be derived.
    """
    paging = self.get_next_page_url_params(feed_response)
    if paging:
        return self.get_feed_url(feed.url, extra_params=paging)
    return None


registry.register(YouTubeSuite)
request = client.request(search_url) return json.loads(request[1]) def get_search_total_results(self, search, search_response): return int(search_response["videos"]["total"]) def get_search_results(self, search, search_response): return search_response["videos"]["video"] def parse_search_result(self, search, result): # TODO: results have an embed_privacy key. What is this? Should # vidscraper return that information? Doesn't youtube have something # similar? video_id = result["id"] data = { "title": result["title"], "link": [u["_content"] for u in result["urls"]["url"] if u["type"] == "video"][0], "description": result["description"], "thumbnail_url": result["thumbnails"]["thumbnail"][1]["_content"], "user": result["owner"]["realname"], "user_url": result["owner"]["profileurl"], "publish_datetime": datetime.strptime(result["upload_date"], "%Y-%m-%d %H:%M:%S"), "tags": [t["_content"] for t in result.get("tags", {}).get("tag", [])], "flash_enclosure_url": self._flash_enclosure_url_from_id(video_id), "embed_code": self._embed_code_from_id(video_id), } return data registry.register(VimeoSuite)
for tag in soup: if tag.name == "link": if tag["rel"] == "image_src": data["thumbnail_url"] = unicode(tag["href"]) elif tag["rel"] == "video_src": src = unicode(tag["href"]) data["flash_enclosure_url"] = src flash_url, flash_vars = src.split("?", 1) flash_vars = urlparse.parse_qs(flash_vars) flash_vars["cliptype"] = "full" flash_vars = urllib.urlencode(flash_vars) data["embed_code"] = make_embed_code(flash_url, flash_vars) elif tag["rel"] == "canonical": data["link"] = u"http://fora.tv%s" % unicode(tag["href"]) elif tag.name == "span" and tag["id"] == "program_title_text": data["title"] = unicode(tag.string) elif tag.name == "dd" and tag["class"] == "description": data["description"] = "".join((unicode(t) for t in tag)).strip() elif tag.name == "a" and tag["class"] == "partner_header": data["user"] = unicode(tag.string) data["user_url"] = unicode(tag["href"]) elif tag.name == "div" and tag["class"] == "information_left": dds = tag.findAll("dd") date = unicode(dds[2].string) date = datetime.datetime.strptime(date, "%m.%d.%y") data["publish_date"] = date return data registry.register(ForaSuite)
# Matches the ``id`` attribute values of the page elements that are scraped.
ID_REGEX = re.compile(r"video-title|video-description|embed-video-code")


class GoogleSuite(BaseSuite):
    """Suite for scraping video pages from videos.google.com"""

    video_regex = r"^https?://video.google.com/videoplay"
    scrape_fields = set(["title", "description", "embed_code"])

    def get_scrape_url(self, video):
        """Return the URL to scrape for ``video``: the video's own URL."""
        return video.url

    def parse_scrape_response(self, response_text):
        """Extract the scraped fields from the markup of a video page.

        :param response_text: markup of the video page.
        :returns: dict holding whichever of ``title``, ``description`` and
                  ``embed_code`` had a matching element in the page;
                  fields with no matching element are left out.
        """
        tags = BeautifulSoup(response_text).findAll(attrs={"id": ID_REGEX})
        data = {}
        for tag in tags:
            if tag["id"] == "video-title":
                data["title"] = unicode(tag.string)
            elif tag["id"] == "video-description":
                # Join the string form of every child node so that nested
                # markup inside the description is kept.
                data["description"] = "".join(
                    (unicode(t) for t in tag)).strip()
            elif tag["id"] == "embed-video-code":
                # This isn't the cleanest way of handling the gt/lt problem,
                # but this is a scrape and liable to break anyway. KISS.
                # The element's text presumably holds the embed markup with
                # its angle brackets escaped as entities; turn them back
                # into real brackets (replacing ">" with ">" did nothing).
                data["embed_code"] = unicode(tag.string).replace(
                    "&gt;", ">").replace("&lt;", "<")
        return data


registry.register(GoogleSuite)