def _open(resource_url):
    """
    Helper function that returns an open file object for a resource,
    given its resource URL.  If the given resource URL uses the "nltk:"
    protocol, or uses no protocol, then use ``find`` to locate the
    resource and open it; if the resource URL uses the 'file' protocol,
    then open the named file directly in binary mode; otherwise,
    delegate to ``compat.urlopen``.

    Protocol names are matched case-insensitively.

    :type resource_url: str
    :param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is "nltk:", which searches
        for the file in the NLTK data package.
    :return: An open file-like object for the resource.  The caller is
        responsible for closing it.
    """
    # Divide the resource name into "<protocol>:<path>".  The pattern is a
    # raw string so that the backslash in "\w" is passed to the regex engine
    # verbatim instead of being an invalid string escape.  Both groups are
    # optional/empty-capable, so the match itself always succeeds.
    protocol, path = re.match(r'(?:(\w+):)?(.*)', resource_url).groups()

    # A missing protocol is treated exactly like an explicit "nltk:".
    scheme = 'nltk' if protocol is None else protocol.lower()

    if scheme == 'nltk':
        return find(path).open()
    if scheme == 'file':
        # urllib might not use mode='rb', so handle this one ourselves:
        return open(path, 'rb')
    # Any other protocol: hand the full, unmodified URL to urlopen.
    return compat.urlopen(resource_url)
def _open(resource_url):
    """
    Return an open file object for the resource named by a resource URL.

    The URL is first passed through ``normalize_resource_url`` and then
    divided into a protocol and a location by ``split_resource_url``.
    What happens next depends on the protocol (compared
    case-insensitively):

    - no protocol, or "nltk": the location is looked up with ``find``,
      using the module-level ``path`` list extended with the empty
      string as the search path, and the result is opened;
    - "file": the location is looked up with ``find`` using only the
      empty string as the search path, and the result is opened;
    - anything else: the normalized URL is handed to
      ``compat.urlopen``.

    :type resource_url: str
    :param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is "nltk:".
    :return: The open file-like object produced by the selected branch.
    """
    normalized_url = normalize_resource_url(resource_url)
    scheme, location = split_resource_url(normalized_url)

    # Treat "no protocol" as the default "nltk" protocol, and lower-case
    # the name once so the comparisons below are case-insensitive.
    scheme_key = 'nltk' if scheme is None else scheme.lower()

    if scheme_key == 'nltk':
        search_dirs = path + ['']
        located = find(location, search_dirs)
        return located.open()

    if scheme_key == 'file':
        # urllib might not use mode='rb', so handle this one ourselves:
        located = find(location, [''])
        return located.open()

    return compat.urlopen(normalized_url)
def clean_url(url):
    """
    Download the content at ``url`` and return it after cleaning.

    The raw content is read through ``compat.urlopen`` and passed to
    ``clean_html``; whatever ``clean_html`` returns is returned as-is.

    :param url: The URL to fetch; passed unchanged to ``compat.urlopen``.
    :return: The result of ``clean_html`` applied to the downloaded
        content.
    """
    response = compat.urlopen(url)
    try:
        html = response.read()
    finally:
        # Always release the underlying connection, even if the read
        # fails; the original left the response open.
        response.close()
    return clean_html(html)