def geturl2(url, decode=False):
    """Fetch ``url`` with urllib2 and return its body wrapped in ``istr``.

    A ``User-Agent`` header taken from ``useragent()`` is sent with the
    request.  The response headers (``result.info()``) are attached to the
    returned object as its ``.info`` attribute.

    :param url: address to fetch.
    :param decode: when true, the body's encoding is looked up with
        ``get_encoding()`` and the body is converted with ``fromenc()``
        before being wrapped; otherwise the raw body is wrapped as-is.
    :returns: ``istr`` built from the body, with ``.info`` set.
    :raises: nothing is caught here; errors from ``opener.open()`` or
        ``read()`` propagate to the caller.
    """
    logging.warning('fetching %s' % url)
    request = urllib2.Request(url)
    request.add_header('User-Agent', useragent())
    opener = urllib2.build_opener()
    result = opener.open(request)
    try:
        tmp = result.read()
        info = result.info()
    finally:
        # Close the response even when reading fails so it is not leaked.
        result.close()
    if decode:
        encoding = get_encoding(tmp)
        logging.info('%s encoding: %s' % (url, encoding))
        res = istr(fromenc(tmp, encoding, url))
    else:
        res = istr(tmp)
    res.info = info
    return res
def geturl2(url, decode=False):
    """Fetch ``url`` with urllib2 and return its body wrapped in ``istr``.

    The request carries a ``User-Agent`` header taken from ``useragent()``.
    The response headers (``result.info()``) are stored on the returned
    object as ``.info``.

    :param url: address to fetch.
    :param decode: when true, detect the body's encoding with
        ``get_encoding()`` and convert it with ``fromenc()`` before
        wrapping; when false the raw body is wrapped unchanged.
    :returns: ``istr`` built from the body, with ``.info`` set.
    :raises: nothing is caught here; errors from ``opener.open()`` or
        ``read()`` propagate to the caller.
    """
    logging.info('fetching %s' % url)
    request = urllib2.Request(url)
    request.add_header('User-Agent', useragent())
    opener = urllib2.build_opener()
    result = opener.open(request)
    try:
        tmp = result.read()
        info = result.info()
    finally:
        # Always release the response, including when read() raises.
        result.close()
    if decode:
        encoding = get_encoding(tmp)
        logging.info('%s encoding: %s' % (url, encoding))
        res = istr(fromenc(tmp, encoding, url))
    else:
        res = istr(tmp)
    res.info = info
    return res
def geturl2(url, decode=False):
    """Fetch ``url`` with urllib2 and return its body wrapped in ``istr``.

    The request carries a ``User-Agent`` header taken from ``useragent()``.
    Progress is reported through ``rlog`` under the ``'url'`` tag.

    :param url: address to fetch.
    :param decode: when true, detect the body's encoding with
        ``get_encoding()`` and convert it with ``fromenc()`` before
        wrapping; when false the raw body is wrapped unchanged.
    :returns: ``istr`` built from the body; the response headers are
        available on it as ``.info``.
    :raises: nothing is caught here; errors from ``opener.open()`` or
        ``read()`` propagate to the caller.
    """
    rlog(10, 'url', 'fetching %s' % url)
    request = urllib2.Request(url)
    request.add_header('User-Agent', useragent())
    opener = urllib2.build_opener()
    result = opener.open(request)
    try:
        tmp = result.read()
        # header information, attached to the result's .info attribute below
        info = result.info()
    finally:
        # Close the response even when reading fails so it is not leaked.
        result.close()
    if decode:
        encoding = get_encoding(tmp)
        rlog(0, 'url', '%s encoding: %s' % (url, encoding))
        res = istr(fromenc(tmp, encoding, url))
    else:
        res = istr(tmp)
    res.info = info
    return res
def geturl2(url, decode=False, timeout=5):
    """Fetch ``url`` with urllib2 and return its body wrapped in ``istr``.

    Fetching is refused while the module-level ``enabled`` flag is false.
    The request carries a ``User-Agent`` header taken from ``useragent()``.

    :param url: address to fetch.
    :param decode: when true, detect the body's encoding with
        ``get_encoding()`` and convert it with ``fromenc()`` before
        wrapping; when false the raw body is wrapped unchanged.
    :param timeout: forwarded to ``opener.open()`` as its ``timeout``
        argument.
    :returns: ``istr`` built from the body, with ``.status`` set to the
        response's ``code`` and ``.info`` set to the response headers.
    :raises URLNotEnabled: when ``enabled`` is false.  Errors from
        ``opener.open()`` or ``read()`` are not caught and propagate.
    """
    # `enabled` is only read here, so no `global` declaration is needed.
    if not enabled:
        raise URLNotEnabled(url)
    logging.warning('fetching %s' % url)
    request = urllib2.Request(url)
    request.add_header('User-Agent', useragent())
    opener = urllib2.build_opener()
    result = opener.open(request, timeout=timeout)
    try:
        tmp = result.read()
        info = result.info()
    finally:
        # Close the response even when reading fails so it is not leaked.
        result.close()
    if decode:
        encoding = get_encoding(tmp)
        logging.info('%s encoding: %s' % (url, encoding))
        res = istr(fromenc(tmp, encoding, url))
    else:
        res = istr(tmp)
    res.status = result.code
    res.info = info
    return res
def strip(self, some_html):
    """Run ``some_html`` through this parser and return the collected text.

    ``self.theString`` is reset to an empty unicode string, the input is
    converted with ``fromenc(some_html, "ascii")`` and passed to
    ``self.feed()``, and ``self.close()`` is called before the accumulated
    ``self.theString`` is returned.
    """
    # start from a clean accumulator on every call
    self.theString = u""
    decoded_html = fromenc(some_html, "ascii")
    self.feed(decoded_html)
    self.close()
    return self.theString
def striphtml(txt):
    """Return ``txt`` after passing it through a fresh ``Stripper``.

    The input is first converted with ``fromenc()`` and the result of
    ``Stripper.strip()`` is returned unchanged.
    """
    decoded = fromenc(txt)
    return Stripper().strip(decoded)
def handle_data(self, data):
    """Append ``data``, converted with ``fromenc()``, to ``self.theString``."""
    chunk = fromenc(data)
    self.theString += chunk