示例#1
0
def bookmarklet_follow(request):
    """Fetch a bookmarked page, create a topic from it and redirect to it.

    The page address is read from the ``u`` query parameter and its title
    from ``t``. The page is downloaded, wrapped in a ``DiscoveredResource``,
    passed through the processing service, and a topic is created from the
    processed resource's title, terms and entities.

    Args:
        request: incoming request; ``request.GET`` must contain both ``u``
            and ``t`` (a missing key propagates whatever error the lookup
            raises).

    Returns:
        The result of ``redirect`` to the ``topics_show`` view, called with
        the new topic's ``_id``.
    """
    # NOTE(review): ``u`` comes straight from the query string and is fetched
    # server-side below - consider restricting schemes/hosts; confirm whether
    # this view is reachable by untrusted users.
    uri = request.GET["u"]
    title = request.GET["t"]

    # url opener, with the redirect limit taken from the project settings
    redirect_handler = HTTPRedirectHandler()
    redirect_handler.max_redirections = settings.CONFIG["url"]["max_redirections"]
    opener = urllib2.build_opener(redirect_handler)

    # web page loading; the handle is closed even if reading or decoding
    # fails, so a bad page does not leak the connection
    handle = opener.open(uri)
    try:
        encoding = detect_header_encoding(handle.headers.dict)
        content = decode_html(handle.read(), encoding)
    finally:
        handle.close()

    # build a resource
    discovered_resource = DiscoveredResource()
    discovered_resource.uri = uri
    discovered_resource.title = title
    discovered_resource.content = content

    # process the discovered resource
    resource = processing_service_client.process(discovered_resource)

    topic = topic_manager.create_from_features(resource.title, resource.terms, resource.entities)

    return redirect("topic_tracking_web.demo.views.topics_show", topic._id)
    def _process_web_page(self, resource):
        """Download ``resource.uri`` and enqueue the resource with its content.

        On success ``resource.uri`` is replaced by ``handle.url``,
        ``resource.content`` is set to the decoded page body, and the
        resource is passed to ``self._enqueue``. ``IOError`` and
        ``HTTPException`` are caught and logged; nothing is re-raised for
        them and the resource is not enqueued.

        Args:
            resource: object with a ``uri`` string attribute; ``uri`` and
                ``content`` are updated in place.
        """
        # Prepend a scheme only when none is present. The check is
        # case-insensitive and also accepts https://, so an existing
        # 'https://...' address is not turned into 'http://https://...'.
        if not resource.uri.lower().startswith(('http://', 'https://')):
            resource.uri = 'http://' + resource.uri

        try:
            handle = self._opener.open(resource.uri)
            try:
                # keep the address reported by the response
                # (presumably the final one after redirects - confirm)
                resource.uri = handle.url
                encoding = detect_header_encoding(handle.headers.dict)
                resource.content = decode_html(handle.read(), encoding)
            finally:
                # release the connection even when reading/decoding fails
                handle.close()
            self._logger.info('Reading %s. Success.' % resource.uri)
            self._enqueue(resource)
        except (IOError, HTTPException) as e:
            # NOTE(review): the original comment said "mark for retry", but
            # nothing here schedules a retry - the failure is only logged.
            self._logger.error('Reading %s. IO error %s.' % (resource.uri, e))
from thrift.protocol import TBinaryProtocol
from thrift.transport import TSocket, TTransport
from topic_tracking.service.text_extraction import TextExtractionService
from topic_tracking.util.http import detect_header_encoding
from topic_tracking.util.xml import decode_html
import urllib

# Thrift connection to the text-extraction service: a framed transport
# wrapped around a TCP socket to localhost:9090, using the binary protocol.
transport = TSocket.TSocket('localhost', 9090)
transport = TTransport.TFramedTransport(transport)
protocol = TBinaryProtocol.TBinaryProtocol(transport)

# Download the sample page and decode it with the encoding that
# detect_header_encoding reports for the response headers. The handle is
# closed even if reading or decoding fails.
url = 'http://edition.cnn.com/2010/US/11/29/wikileaks.new.documents/index.html?hpt=T1'
handle = urllib.urlopen(url)
try:
    encoding = detect_header_encoding(handle.headers.dict)
    html = decode_html(handle.read(), encoding)
finally:
    handle.close()

# Send the HTML to the service; the transport is closed whether or not the
# remote call succeeds.
client = TextExtractionService.Client(protocol)
transport.open()
try:
    content = client.extract(html)
finally:
    transport.close()

# Show the type of the returned value and the value itself.
print(content.__class__)
print(content)