예제 #1
0
    def _request(self, url, method):
        """Perform an HTTP(S) request and return the reply.

        The connection is made to the proxy configured in the environment
        for the URL's scheme when there is one, otherwise to the URL's own
        host.  At most ``self.max_size`` bytes of the body are read, and
        ``GET`` requests carry a ``Range`` header for bytes 0 through
        ``self.max_size``.

        Returns a ``(status, reason, data, headers)`` tuple.  ``data`` is
        decoded to unicode when the Content-Type header names a charset or
        is a ``text/`` type, falling back to a guessed encoding if that
        decoding fails; otherwise it is returned as read.

        Raises HTTPException if a socket-level error occurs.
        """
        scheme, host = urlparse(url)[:2]
        scheme = scheme.lower()
        proxies = getproxies_environment()
        if scheme in proxies:
            # Connect to the proxy instead; the full URL is still what is
            # requested below.
            scheme, host = urlparse(proxies[scheme])[:2]
            scheme = scheme.lower()

        kwargs = {}
        if version_info[1] >= 6:
            kwargs['timeout'] = self.timeout
        else:
            # On this branch the timeout is applied through the process-wide
            # socket default, which is reset in the ``finally`` below.
            socket.setdefaulttimeout(self.timeout)

        if scheme == "https":
            conn = HTTPSConnection(host, **kwargs)
        else:
            conn = HTTPConnection(host, **kwargs)

        headers = {}
        if method == 'GET':
            headers['Range'] = 'bytes=0-%s' % self.max_size

        try:
            try:
                conn.request(method.upper(), iri_to_uri(url),
                             headers=headers)
                response = conn.getresponse()
                data = response.read(self.max_size)
            except socket.error, e:
                raise HTTPException(e.message or e.args[1])
        finally:
            if version_info[1] < 6:
                socket.setdefaulttimeout(None)
            # Close on failure as well as success so the socket is not
            # leaked when the request raises.
            conn.close()

        contenttype = response.getheader('Content-Type', None)
        if contenttype:
            # The charset is a parameter that follows the media type (e.g.
            # "text/html; charset=utf-8"), so look for it anywhere in the
            # header value rather than only at its very start.
            match = re.search(r'charset=["\']?([a-zA-Z0-9_-]+)',
                              contenttype, re.IGNORECASE)
            try:
                if match:
                    data = data.decode(match.group(1))
                elif contenttype.startswith('text/'):
                    data = data.decode('utf-8')
            except (UnicodeDecodeError, LookupError):
                # LookupError: the header named a codec Python doesn't know.
                # Either way, fall back to guessing the encoding.
                guessed = detect(data)
                if guessed['confidence'] > 0.5:
                    charset = guessed['encoding']
                    # Common guessing mistake:
                    if charset.startswith('ISO-8859') and '\x92' in data:
                        charset = 'windows-1252'
                    data = unicode(data, charset, errors='replace')

        return response.status, response.reason, data, response.getheaders()
예제 #2
0
파일: network.py 프로젝트: GertBurger/ibid
    def _request(self, url, method):
        """Perform an HTTP(S) request and return the reply.

        The connection is made to the proxy configured in the environment
        for the URL's scheme when there is one, otherwise to the URL's own
        host.  At most ``self.max_size`` bytes of the body are read, and
        ``GET`` requests carry a ``Range`` header for bytes 0 through
        ``self.max_size``.

        Returns a ``(status, reason, data, headers)`` tuple.  ``data`` is
        decoded to unicode when the Content-Type header names a charset or
        is a ``text/`` type, falling back to a guessed encoding if that
        decoding fails; otherwise it is returned as read.

        Raises HTTPException if a socket-level error occurs.
        """
        scheme, host = urlparse(url)[:2]
        scheme = scheme.lower()
        proxies = getproxies_environment()
        if scheme in proxies:
            # Connect to the proxy instead; the full URL is still what is
            # requested below.
            scheme, host = urlparse(proxies[scheme])[:2]
            scheme = scheme.lower()

        kwargs = {}
        if version_info[1] >= 6:
            kwargs["timeout"] = self.timeout
        else:
            # On this branch the timeout is applied through the process-wide
            # socket default, which is reset in the ``finally`` below.
            socket.setdefaulttimeout(self.timeout)

        if scheme == "https":
            conn = HTTPSConnection(host, **kwargs)
        else:
            conn = HTTPConnection(host, **kwargs)

        headers = {}
        if method == "GET":
            headers["Range"] = "bytes=0-%s" % self.max_size

        try:
            try:
                conn.request(method.upper(), iri_to_uri(url), headers=headers)
                response = conn.getresponse()
                data = response.read(self.max_size)
            except socket.error, e:
                raise HTTPException(e.message or e.args[1])
        finally:
            if version_info[1] < 6:
                socket.setdefaulttimeout(None)
            # Close on failure as well as success so the socket is not
            # leaked when the request raises.
            conn.close()

        contenttype = response.getheader("Content-Type", None)
        if contenttype:
            # The charset is a parameter that follows the media type (e.g.
            # "text/html; charset=utf-8"), so look for it anywhere in the
            # header value rather than only at its very start.
            match = re.search(r"charset=[\"']?([a-zA-Z0-9_-]+)",
                              contenttype, re.IGNORECASE)
            try:
                if match:
                    data = data.decode(match.group(1))
                elif contenttype.startswith("text/"):
                    data = data.decode("utf-8")
            except (UnicodeDecodeError, LookupError):
                # LookupError: the header named a codec Python doesn't know.
                # Either way, fall back to guessing the encoding.
                guessed = detect(data)
                if guessed["confidence"] > 0.5:
                    charset = guessed["encoding"]
                    # Common guessing mistake:
                    if charset.startswith("ISO-8859") and "\x92" in data:
                        charset = "windows-1252"
                    data = unicode(data, charset, errors="replace")

        return response.status, response.reason, data, response.getheaders()
예제 #3
0
파일: ascii.py 프로젝트: adrianmoisey/ibid
    def draw(self, event, url, colour, width, height):
        if not urlparse(url).netloc:
            url = 'http://' + url
        if urlparse(url).scheme == 'file':
            event.addresponse(u'Are you trying to haxor me?')
            return
        if not urlparse(url).path:
            url += '/'

        try:
            f = urlopen(iri_to_uri(url))
        except HTTPError, e:
            event.addresponse(u'Sorry, error fetching URL: %s', BaseHTTPRequestHandler.responses[e.code][0])
            return
예제 #4
0
파일: ascii.py 프로젝트: vhata/ibid
    def draw(self, event, url, colour, width, height):
        if not urlparse(url).netloc:
            url = 'http://' + url
        if urlparse(url).scheme == 'file':
            event.addresponse(u'Are you trying to haxor me?')
            return
        if not urlparse(url).path:
            url += '/'

        try:
            f = urlopen(iri_to_uri(url))
        except HTTPError, e:
            event.addresponse(u'Sorry, error fetching URL: %s',
                              BaseHTTPRequestHandler.responses[e.code][0])
            return
예제 #5
0
def get_html_parse_tree(url, data=None, headers=None,
                        treetype='beautifulsoup'):
    """Request a URL, parse the HTML, and return a parse tree from it.

    ``data`` and ``headers`` are passed to ``urllib2.Request``; ``headers``
    defaults to no extra headers.  ``treetype`` selects the result:

    * ``'beautifulsoup'``: the body is handed straight to BeautifulSoup.
    * ``'etree'``: html5lib's HTMLParser with an ElementTree tree builder.
    * anything else: html5lib's HTMLParser with the tree builder of that
      name (``'html5lib-beautifulsoup'`` selects the ``'beautifulsoup'``
      builder).

    Bodies sent with a ``deflate`` or ``gzip`` Content-Encoding are
    decompressed first.  The charset from the Content-Type header, if any,
    is passed to the html5lib parser as the encoding.

    Raises ContentTypeException if the response is neither ``text/html``
    nor ``application/xhtml+xml``.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if headers is None:
        headers = {}

    req = urllib2.Request(iri_to_uri(url), data, headers)
    f = urllib2.urlopen(req)

    # Close the response whether the type check or the read fails.
    try:
        content_type = f.info().gettype()
        if content_type not in ('text/html', 'application/xhtml+xml'):
            raise ContentTypeException("Content type isn't HTML, but " +
                                       content_type)
        data = f.read()
    finally:
        f.close()

    encoding = None
    contentType = f.headers.get('content-type')
    if contentType:
        (mediaType, params) = cgi.parse_header(contentType)
        encoding = params.get('charset')

    compression = f.headers.get('content-encoding')
    if compression:
        compression = compression.lower()
        if compression == "deflate":
            try:
                data = zlib.decompress(data)
            except zlib.error:
                # Retry as a raw deflate stream (negative window bits tell
                # zlib not to expect its header).
                data = zlib.decompress(data, -zlib.MAX_WBITS)
        elif compression == "gzip":
            compressedstream = StringIO(data)
            gzipper = GzipFile(fileobj=compressedstream)
            data = gzipper.read()

    if treetype == "beautifulsoup":
        return BeautifulSoup(data, convertEntities=BeautifulSoup.HTML_ENTITIES)
    elif treetype == "etree":
        kwargs = {'tree': treebuilders.getTreeBuilder('etree', ElementTree)}
        # http://code.google.com/p/html5lib/issues/detail?id=138
        # Only pass the keyword if this HTMLParser's __init__ accepts it.
        if ('namespaceHTMLElements'
                in inspect.getargspec(HTMLParser.__init__)[0]):
            kwargs['namespaceHTMLElements'] = False
        parser = HTMLParser(**kwargs)
    else:
        if treetype == "html5lib-beautifulsoup":
            treetype = "beautifulsoup"
        parser = HTMLParser(tree=treebuilders.getTreeBuilder(treetype))

    return parser.parse(data, encoding=encoding)
예제 #6
0
파일: html.py 프로젝트: B-Rich/ibid-1
def get_html_parse_tree(url, data=None, headers=None,
                        treetype='beautifulsoup'):
    """Request a URL, parse the HTML, and return a parse tree from it.

    ``data`` and ``headers`` are passed to ``urllib2.Request``; ``headers``
    defaults to no extra headers.  ``treetype`` selects the result:

    * ``'beautifulsoup'``: the body is handed straight to BeautifulSoup.
    * ``'etree'``: html5lib's HTMLParser with an ElementTree tree builder.
    * anything else: html5lib's HTMLParser with the tree builder of that
      name (``'html5lib-beautifulsoup'`` selects the ``'beautifulsoup'``
      builder).

    Bodies sent with a ``deflate`` or ``gzip`` Content-Encoding are
    decompressed first.  The charset from the Content-Type header, if any,
    is passed to the html5lib parser as the encoding.

    Raises ContentTypeException if the response is neither ``text/html``
    nor ``application/xhtml+xml``.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if headers is None:
        headers = {}

    req = urllib2.Request(iri_to_uri(url), data, headers)
    f = urllib2.urlopen(req)

    # Close the response whether the type check or the read fails.
    try:
        content_type = f.info().gettype()
        if content_type not in ('text/html', 'application/xhtml+xml'):
            raise ContentTypeException("Content type isn't HTML, but " +
                                       content_type)
        data = f.read()
    finally:
        f.close()

    encoding = None
    contentType = f.headers.get('content-type')
    if contentType:
        (mediaType, params) = cgi.parse_header(contentType)
        encoding = params.get('charset')

    compression = f.headers.get('content-encoding')
    if compression:
        compression = compression.lower()
        if compression == "deflate":
            try:
                data = zlib.decompress(data)
            except zlib.error:
                # Retry as a raw deflate stream (negative window bits tell
                # zlib not to expect its header).
                data = zlib.decompress(data, -zlib.MAX_WBITS)
        elif compression == "gzip":
            compressedstream = StringIO(data)
            gzipper = GzipFile(fileobj=compressedstream)
            data = gzipper.read()

    if treetype == "beautifulsoup":
        return BeautifulSoup(data, convertEntities=BeautifulSoup.HTML_ENTITIES)
    elif treetype == "etree":
        kwargs = {'tree': treebuilders.getTreeBuilder('etree', ElementTree)}
        # http://code.google.com/p/html5lib/issues/detail?id=138
        # Only pass the keyword if this HTMLParser's __init__ accepts it.
        if ('namespaceHTMLElements'
                in inspect.getargspec(HTMLParser.__init__)[0]):
            kwargs['namespaceHTMLElements'] = False
        parser = HTMLParser(**kwargs)
    else:
        if treetype == "html5lib-beautifulsoup":
            treetype = "beautifulsoup"
        parser = HTMLParser(tree=treebuilders.getTreeBuilder(treetype))

    return parser.parse(data, encoding=encoding)
예제 #7
0
파일: languages.py 프로젝트: B-Rich/ibid-1
    def translate (self, event, text, src_lang, dest_lang):
        dest_lang = self.language_code(dest_lang or self.dest_lang)
        src_lang = self.language_code(src_lang or '')

        if is_url(text):
            if urlparse(text).scheme in ('', 'http'):
                url = iri_to_uri(text)
                query = {'sl': src_lang, 'tl': dest_lang, 'u': url}
                event.addresponse(u'http://translate.google.com/translate?' +
                                    urlencode(query))
            else:
                event.addresponse(u'I can only translate HTTP pages')
            return

        try:
            translated = self._translate(event, text, src_lang, dest_lang)[0]
            event.addresponse(translated)
        except TranslationException, e:
            event.addresponse(u"I couldn't translate that: %s.", unicode(e))
예제 #8
0
파일: languages.py 프로젝트: vhata/ibid
    def translate(self, event, text, src_lang, dest_lang):
        dest_lang = self.language_code(dest_lang or self.dest_lang)
        src_lang = self.language_code(src_lang or '')

        if is_url(text):
            if urlparse(text).scheme in ('', 'http'):
                url = iri_to_uri(text)
                query = {'sl': src_lang, 'tl': dest_lang, 'u': url}
                event.addresponse(u'http://translate.google.com/translate?' +
                                  urlencode(query))
            else:
                event.addresponse(u'I can only translate HTTP pages')
            return

        try:
            translated = self._translate(event, text, src_lang, dest_lang)[0]
            event.addresponse(translated)
        except TranslationException, e:
            event.addresponse(u"I couldn't translate that: %s.", unicode(e))