def resolve(web_url):
    '''
    Resolve a web page to a media stream.

    It is usually as simple as::

        import urlresolver
        media_url = urlresolver.resolve(web_url)

    where ``web_url`` is the address of a web page which is associated with a
    media file and ``media_url`` is the direct URL to the media.

    Behind the scenes, :mod:`urlresolver` will check each of the available
    resolver plugins to see if they accept the ``web_url`` in priority order
    (lowest priority number first). When it finds a plugin willing to resolve
    the URL, it passes the ``web_url`` to the plugin and returns the direct
    URL to the media file, or ``False`` if it was not possible to resolve.

    .. seealso::

        :class:`HostedMediaFile`

    Args:
        web_url (str): A URL to a web page associated with a piece of media
            content.

    Returns:
        If the ``web_url`` could be resolved, a string containing the direct
        URL to the media file; if not, ``False``.
    '''
    source = HostedMediaFile(url=web_url)
    return source.resolve()
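
# A minimal usage sketch (not part of the library): resolve a hoster page and
# fall back gracefully when no plugin can handle it. The URL below and the
# helper name are hypothetical; callers typically hand the returned URL to a
# player.
def _example_resolve_usage(web_url='http://example-hoster.com/abc123'):
    media_url = resolve(web_url)
    if media_url:
        # a direct, playable URL was found
        return media_url
    # resolve() returns False when no plugin could handle the page
    return None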
def scrape_supported(html, regex=None, host_only=False):
    '''
    Returns a list of links scraped from the html that are supported by
    urlresolver.

    Args:
        html: the html to be scraped
        regex: an optional argument to override the default regex, which is:
            href\s*=\s*['"]([^'"]+)
        host_only: an optional argument, if True, to do only host validation
            instead of full url validation (default False)

    Returns:
        A list of links scraped from the html that passed validation.
    '''
    if regex is None:
        regex = r'''href\s*=\s*['"]([^'"]+)'''
    links = []
    for match in re.finditer(regex, html):
        stream_url = match.group(1)
        host = urlparse.urlparse(stream_url).hostname
        if host_only:
            if host is None:
                continue

            # reuse cached validation results for hosts already checked
            if host in host_cache:
                if host_cache[host]:
                    links.append(stream_url)
                continue
            else:
                # use a dummy media_id to allow host-only validation
                hmf = HostedMediaFile(host=host, media_id='dummy')
        else:
            hmf = HostedMediaFile(url=stream_url)

        is_valid = hmf.valid_url()
        host_cache[host] = is_valid
        if is_valid:
            links.append(stream_url)

    return links
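
# A minimal usage sketch (not part of the library): pull supported links out
# of a page and resolve the first one. The HTML snippet and hostname are
# hypothetical; host_only=True keeps validation to a cheap hostname check
# backed by host_cache.
def _example_scrape_usage():
    html = '<a href="http://example-hoster.com/abc123">mirror 1</a>'
    links = scrape_supported(html, host_only=True)
    if links:
        # resolve the first supported link to a direct media URL (or False)
        return resolve(links[0])
    return None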