def bookmarklet_follow(request):
    """Bookmarklet entry point: fetch the page at ``u``, run it through the
    processing service, create a topic from the extracted features, and
    redirect to that topic's page.

    Expects two GET parameters:
      u -- the URI of the page to follow
      t -- the page title as seen by the bookmarklet

    Raises KeyError if either GET parameter is missing.
    """
    uri = request.GET["u"]
    title = request.GET["t"]

    # URL opener with a bounded number of redirects (configured globally).
    redirect_handler = HTTPRedirectHandler()
    redirect_handler.max_redirections = settings.CONFIG["url"]["max_redirections"]
    opener = urllib2.build_opener(redirect_handler)

    # Load the web page. Close the handle even if reading/decoding fails
    # (the original leaked the connection on error).
    handle = opener.open(uri)
    try:
        encoding = detect_header_encoding(handle.headers.dict)
        content = decode_html(handle.read(), encoding)
    finally:
        handle.close()

    # Build a resource from the fetched page.
    discovered_resource = DiscoveredResource()
    discovered_resource.uri = uri
    discovered_resource.title = title
    discovered_resource.content = content

    # Process the discovered resource and create a topic from its features.
    resource = processing_service_client.process(discovered_resource)
    topic = topic_manager.create_from_features(resource.title, resource.terms, resource.entities)

    return redirect("topic_tracking_web.demo.views.topics_show", topic._id)
def _process_web_page(self, resource):
    """Fetch the web page for *resource*, store its decoded content, and
    enqueue it for further processing.

    Normalizes the URI to have an ``http://`` scheme, follows redirects
    (updating ``resource.uri`` to the final URL), decodes the body using
    the encoding advertised in the response headers, and calls
    ``self._enqueue``. I/O failures are logged and swallowed so the
    caller's loop keeps running.
    """
    # Prepend the scheme if the caller supplied a bare host/path.
    if resource.uri[:7] != 'http://':
        resource.uri = 'http://' + resource.uri
    try:
        handle = self._opener.open(resource.uri)
        try:
            # The opener may have followed redirects; record the final URL.
            resource.uri = handle.url
            encoding = detect_header_encoding(handle.headers.dict)
            resource.content = decode_html(handle.read(), encoding)
        finally:
            # Close the connection even if read()/decode fails
            # (the original leaked the handle on a read error).
            handle.close()
        self._logger.info('Reading %s. Success.' % resource.uri)
        self._enqueue(resource)
    except (IOError, HTTPException) as e:
        # mark for retry
        # NOTE(review): no retry is actually scheduled here — the failure
        # is only logged; confirm whether a retry queue was intended.
        self._logger.error('Reading %s. IO error %s.' % (resource.uri, e))
from thrift.protocol import TBinaryProtocol
from thrift.transport import TSocket, TTransport
from topic_tracking.service.text_extraction import TextExtractionService
from topic_tracking.util.http import detect_header_encoding
from topic_tracking.util.xml import decode_html
import urllib

# Demo script: fetch a CNN article and run it through the local
# TextExtractionService over a framed binary-protocol Thrift connection.
transport = TSocket.TSocket('localhost', 9090)
transport = TTransport.TFramedTransport(transport)
protocol = TBinaryProtocol.TBinaryProtocol(transport)

url = 'http://edition.cnn.com/2010/US/11/29/wikileaks.new.documents/index.html?hpt=T1'

# Download and decode the page; close the HTTP handle even on failure
# (the original never closed it).
handle = urllib.urlopen(url)
try:
    encoding = detect_header_encoding(handle.headers.dict)
    html = decode_html(handle.read(), encoding)
finally:
    handle.close()

client = TextExtractionService.Client(protocol)
transport.open()
try:
    content = client.extract(html)
finally:
    # Always release the socket, even if extract() raises
    # (the original leaked the transport on error).
    transport.close()

print(content.__class__)
print(content)