def validateURL(url, firstOccurrenceOnly=1, wantRawData=0): """validate RSS from URL, returns events list, or (events, rawdata) tuple""" loggedEvents = [] request = urllib2.Request(url) request.add_header("Accept-encoding", "gzip, deflate") request.add_header("User-Agent", "FeedValidator/1.3") usock = None try: try: usock = urllib2.urlopen(request) rawdata = usock.read(MAXDATALENGTH) if usock.read(1): raise ValidationFailure( logging.ValidatorLimit({ 'limit': 'feed length > ' + str(MAXDATALENGTH) + ' bytes' })) # check for temporary redirects if usock.geturl() <> request.get_full_url(): from urlparse import urlsplit (scheme, netloc, path, query, fragment) = urlsplit(url) if scheme == 'http': from httplib import HTTPConnection requestUri = (path or '/') + (query and '?' + query) conn = HTTPConnection(netloc) conn.request("GET", requestUri) resp = conn.getresponse() if resp.status <> 301: loggedEvents.append(TempRedirect({})) except BadStatusLine, status: raise ValidationFailure( logging.HttpError({'status': status.__class__})) except urllib2.HTTPError, status: rawdata = status.read() if len(rawdata) < 512 or 'content-encoding' in status.headers: loggedEvents.append(logging.HttpError({'status': status})) usock = status else: rawdata = re.sub('<!--.*?-->', '', rawdata) lastline = rawdata.strip().split('\n')[-1].strip() if sniffPossibleFeed(rawdata): loggedEvents.append(logging.HttpError({'status': status})) loggedEvents.append(logging.HttpErrorWithPossibleFeed({})) usock = status else: raise ValidationFailure( logging.HttpError({'status': status}))
def validateURL(url, firstOccurrenceOnly=1, wantRawData=0): """validate RSS from URL, returns events list, or (events, rawdata) tuple""" loggedEvents = [] request = urllib2.Request(url) request.add_header("Accept-encoding", "gzip, deflate") request.add_header("User-Agent", "FeedValidator/1.3") try: usock = urllib2.urlopen(request) rawdata = usock.read(MAXDATALENGTH) if usock.read(1): raise ValidationFailure( logging.ValidatorLimit({ 'limit': 'feed length > ' + str(MAXDATALENGTH) + ' bytes' })) # check for temporary redirects if usock.geturl() <> request.get_full_url(): from httplib import HTTPConnection spliturl = url.split('/', 3) if spliturl[0] == "http:": conn = HTTPConnection(spliturl[2]) conn.request("GET", '/' + spliturl[3].split("#", 1)[0]) resp = conn.getresponse() if resp.status <> 301: loggedEvents.append(TempRedirect({})) except BadStatusLine, status: raise ValidationFailure(logging.HttpError({'status': status.__class__}))
def validateURL(url, firstOccurrenceOnly=1, wantRawData=0): """validate RSS from URL, returns events list, or (events, rawdata) tuple""" loggedEvents = [] request = urllib2.Request(url) request.add_header("Accept-encoding", "gzip, deflate") request.add_header("User-Agent", "FeedValidator/1.3") usock = None try: try: usock = urllib2.urlopen(request) rawdata = usock.read(MAXDATALENGTH) if usock.read(1): raise ValidationFailure( logging.ValidatorLimit({ 'limit': 'feed length > ' + str(MAXDATALENGTH) + ' bytes' })) # check for temporary redirects if usock.geturl() <> request.get_full_url(): from httplib import HTTPConnection spliturl = url.split('/', 3) if spliturl[0] == "http:": conn = HTTPConnection(spliturl[2]) conn.request("GET", '/' + spliturl[3].split("#", 1)[0]) resp = conn.getresponse() if resp.status <> 301: loggedEvents.append(TempRedirect({})) except BadStatusLine, status: raise ValidationFailure( logging.HttpError({'status': status.__class__})) except urllib2.HTTPError, status: rawdata = status.read() if len(rawdata) < 512 or 'content-encoding' in status.headers: loggedEvents.append(logging.HttpError({'status': status})) usock = status else: rawdata = re.sub('<!--.*?-->', '', rawdata) lastline = rawdata.strip().split('\n')[-1].strip() if lastline in ['</rss>', '</feed>', '</rdf:RDF>', '</kml>']: loggedEvents.append(logging.HttpError({'status': status})) usock = status else: raise ValidationFailure( logging.HttpError({'status': status}))
# NOTE(review): tail fragment of a validateURL revision — the `except`
# clauses below belong to a `try:` outside this span; `resp`, `usock`,
# `rawdata` and `loggedEvents` are bound earlier in that function.
if resp.status<>301:
    # anything other than a permanent (301) redirect is reported as temporary
    loggedEvents.append(TempRedirect({}))
except BadStatusLine, status:
    # server sent a malformed HTTP status line: fatal
    raise ValidationFailure(logging.HttpError({'status': status.__class__}))
except urllib2.HTTPError, status:
    # HTTP error status: the body may still be a feed worth validating
    rawdata = status.read()
    lastline = rawdata.strip().split('\n')[-1].strip()
    if lastline in ['</rss>','</feed>','</rdf:RDF>']:
        # last line closes a known feed root element: log the error and continue
        loggedEvents.append(logging.HttpError({'status': status}))
        usock = status
    else:
        raise ValidationFailure(logging.HttpError({'status': status}))
except urllib2.URLError, x:
    # could not reach the server at all
    raise ValidationFailure(logging.HttpError({'status': x.reason}))
except Timeout, x:
    raise ValidationFailure(logging.IOError({"message": 'Server timed out', "exception":x}))

# gzip/deflate was requested in the headers above; no content-encoding in
# the response means the server served the feed uncompressed — flag it
if usock.headers.get('content-encoding', None) == None:
    loggedEvents.append(Uncompressed({}))

if usock.headers.get('content-encoding', None) == 'gzip':
    import gzip, StringIO
    try:
        # decompress in memory before validation
        rawdata = gzip.GzipFile(fileobj=StringIO.StringIO(rawdata)).read()
    except:
        # server claimed gzip but the body would not decompress: fatal
        import sys
        exctype, value = sys.exc_info()[:2]
        event=logging.IOError({"message": 'Server response declares Content-Encoding: gzip', "exception":value})
        raise ValidationFailure(event)
# NOTE(review): tail fragment of a validateURL revision — the `except`
# clauses below belong to a `try:` outside this span; `spliturl`, `usock`,
# `rawdata` and `loggedEvents` are bound earlier in that function.
if spliturl[0] == "http:":
    # re-issue a bare GET to inspect the raw redirect status code
    conn = HTTPConnection(spliturl[2])
    # request-URI: everything after the host, with any #fragment dropped
    # NOTE(review): spliturl[3] will raise IndexError for URLs with no path
    # (e.g. http://example.com) — a later revision fixes this via urlsplit
    conn.request("GET", '/' + spliturl[3].split("#", 1)[0])
    resp = conn.getresponse()
    if resp.status <> 301:
        # anything other than a permanent (301) redirect is reported as temporary
        loggedEvents.append(TempRedirect({}))
except BadStatusLine, status:
    # server sent a malformed HTTP status line: fatal
    raise ValidationFailure(logging.HttpError({'status': status.__class__}))
except urllib2.HTTPError, status:
    # HTTP error status: the body may still be a feed worth validating
    rawdata = status.read()
    lastline = rawdata.strip().split('\n')[-1].strip()
    if lastline in ['</rss>', '</feed>', '</rdf:RDF>']:
        # last line closes a known feed root element: log the error and continue
        loggedEvents.append(logging.HttpError({'status': status}))
        usock = status
    else:
        raise ValidationFailure(logging.HttpError({'status': status}))
except urllib2.URLError, x:
    # could not reach the server at all
    raise ValidationFailure(logging.HttpError({'status': x.reason}))
except Timeout, x:
    raise ValidationFailure(
        logging.IOError({
            "message": 'Server timed out',
            "exception": x
        }))
# gzip/deflate was requested in the headers above; no content-encoding in
# the response means the server served the feed uncompressed — flag it
if usock.headers.get('content-encoding', None) == None:
    loggedEvents.append(Uncompressed({}))