def _getData(url):
    '''Fetch ``url`` and return the parsed page plus its next-page value.

    The downloaded payload is decoded as UTF-8 and passed to ``soup``.
    Any exception raised while downloading, decoding or parsing is
    printed and reported to the caller as ``(None, None)``.

    Returns a ``(parsed_page, next_page)`` tuple, where ``next_page`` is
    whatever ``_getNextPage`` returns for the parsed page.
    '''
    try:
        page = soup(download_page(url).decode('utf-8'))
    except Exception as err:
        # Best effort: report the problem and signal failure to the caller.
        print('[Letterboxd][_getData] %s' % (err))
        return None, None
    # Kept outside the ``try`` so errors from _getNextPage still propagate.
    return page, _getNextPage(page)
def get_flashvideo_url(src=None, url=None):
    '''Return the flash video URL for a page, or None if it is unavailable.

    Args:
        src: HTML source of the page embedding the video. It is replaced
            by the downloaded page when ``url`` is given.
        url: Address of the page; when truthy it is fetched with
            ``download_page`` and the result is used as ``src``.

    Returns:
        Whatever the matching site handler returns, or ``None`` when
        neither argument was supplied or no handler recognises the page.
    '''
    if not url and not src:
        print('At least src or url required.')
        # Nothing to inspect. Falling through used to crash on None.find.
        return None
    if url:
        src = download_page(url)

    # The site embeds google video, archive.org (flowplayer) and youtube
    # players. ``in`` also matches a marker at offset 0, which the former
    # ``find(...) > 0`` comparison silently skipped.
    if 'googleplayer' in src:
        return GoogleVideo.get_flashvideo_url(src)
    if 'flowplayer' in src:
        return ArchiveVideo.get_flashvideo_url(src)
    if 'youtube' in src:
        return YouTube.get_flashvideo_url(src)

    print('no handler implementd for this url.')
    # No result was ever bound on this path, which used to raise
    # UnboundLocalError; report "not found" explicitly instead.
    return None
def get_flashvideo_url(src):
    '''Return the video URL behind a Google Video embed found in ``src``.

    The ``docid`` is read from the query string of the first ``<embed>``
    tag's ``src`` attribute, the matching videoplay page is downloaded,
    and the ``videoUrl`` parameter of its ``preview_url`` is returned
    after URL-unquoting.

    Args:
        src: HTML source containing the Google Video ``<embed>`` tag.

    Returns:
        The unquoted video URL, or ``None`` when any required piece (the
        embed tag, its ``src`` attribute, the ``docid``, the
        ``preview_url`` or its ``videoUrl`` parameter) cannot be found.
    '''
    embed = BS(src, parseOnlyThese=SS('embed')).find('embed')
    if embed is None:
        return None
    embed_url = embed.get('src')
    if not embed_url:
        return None

    # partition() yields an empty query string when there is no '?',
    # where split('?', 1)[1] would raise IndexError.
    docids = urlparse.parse_qs(embed_url.partition('?')[2]).get('docid')
    if not docids:
        return None

    # Load the googlevideo page for the docid taken from the embed URL.
    page_url = 'http://video.google.com/videoplay?docid=%s&hl=en' % docids[0]
    page = download_page(page_url)
    match = re.search(r"preview_url:'(.+?)'", page)
    if not match:
        return None

    # Replace hex escapes first, e.g. videoUrl\x3dhttp -> videoUrl=http,
    # so the query string can be parsed.
    preview_url = unhex(match.group(1))
    video_urls = urlparse.parse_qs(preview_url.partition('?')[2]).get('videoUrl')
    if not video_urls:
        return None
    return urllib.unquote_plus(video_urls[0])
def _htmlify(url):
    '''Download ``url`` and parse it with BS, converting HTML entities.'''
    page = download_page(url)
    return BS(page, convertEntities=BS.HTML_ENTITIES)
def htmlify(url):
    '''Download the page at ``url`` and return it parsed by BS.'''
    page = download_page(url)
    return BS(page)
def load_html(url):
    '''Fetch ``url`` with download_page and wrap the result in BS.'''
    markup = download_page(url)
    return BS(markup)
def _html(url):
    '''Fetch ``url`` and return it parsed by BeautifulSoup.

    HTML entities are converted during parsing
    (``convertEntities=BS.HTML_ENTITIES``).
    '''
    markup = download_page(url)
    return BS(markup, convertEntities=BS.HTML_ENTITIES)