def test_entries_coverage(self):
    success = 0
    fperrors = 0
    sperrors = 0
    errcompats = 0
    # cap the run at 1000 feeds so the totals match the slice below
    total = min(len(self.files), 1000)
    failedentries = []
    bozoentries = []
    for f in self.files[:total]:
        fperror = False
        with open(f) as fo:
            document = fo.read()
        try:
            fpresult = load_cache(f)
            if fpresult is None:
                fpresult = feedparser.parse(document)
        except Exception:
            fperrors += 1
            fperror = True
            fpresult = {}  # keep the bozo check below from raising NameError
        if fpresult.get('bozo', 0):
            fperrors += 1
            fperror = True
        try:
            spresult = speedparser.parse(document)
        except Exception:
            if fperror:
                errcompats += 1
            else:
                sperrors += 1
            bozoentries.append(f)
            continue
        if 'bozo_exception' in spresult:
            if fperror:
                errcompats += 1
            else:
                sperrors += 1
            bozoentries.append(f)
            continue
        try:
            entry_equivalence(self, fpresult, spresult)
            success += 1
        except Exception:
            import traceback
            print("Failure: %s" % f)
            traceback.print_exc()
            failedentries.append(f)
    print("Success: %d out of %d (%0.2f %%, fpe: %d, spe: %d, both: %d)" % (
        success, total, (100 * success) / float(total - fperrors),
        fperrors, sperrors, errcompats))
    print("Failed entries:\n%s" % pformat(failedentries))
    print("Bozo entries:\n%s" % pformat(bozoentries))
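# `load_cache` is not defined in this excerpt. A minimal sketch, assuming it
# memoizes feedparser results on disk with pickle, keyed by the feed file's
# name and invalidated by mtime (the cache location is hypothetical):
import os
import pickle

CACHE_DIR = '/tmp/fpcache'  # hypothetical location; adjust to taste

def load_cache(path):
    """Return a cached feedparser result for ``path``, or None on a miss.

    A miss is reported when no cache file exists or when the feed file has
    been modified since the result was cached.
    """
    cachefile = os.path.join(CACHE_DIR, os.path.basename(path) + '.pkl')
    if not os.path.exists(cachefile):
        return None
    if os.path.getmtime(cachefile) < os.path.getmtime(path):
        return None
    with open(cachefile, 'rb') as f:
        return pickle.load(f)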
def test_single_feed(self):
    fpresult = feedparser.parse(self.doc)
    spresult = speedparser.parse(self.doc)
    d = dict(fpresult)
    d['entries'] = d['entries'][:4]
    pprint(d)
    d = dict(spresult)
    d['entries'] = d['entries'][:4]
    pprint(d)
    feed_equivalence(self, fpresult, spresult)
    entry_equivalence(self, fpresult, spresult)
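# Neither `feed_equivalence` nor `entry_equivalence` is shown in this
# excerpt. A minimal sketch, assuming they assert that speedparser
# reproduces the fields feedparser produced; the real helpers presumably
# compare many more keys than the handful listed here:
def feed_equivalence(testcase, fpresult, spresult):
    """Assert that top-level feed metadata matches between the two parsers."""
    fpfeed, spfeed = fpresult['feed'], spresult['feed']
    for key in ('title', 'link', 'subtitle'):
        if key in fpfeed:
            testcase.assertEqual(fpfeed[key], spfeed.get(key))

def entry_equivalence(testcase, fpresult, spresult):
    """Assert entry counts and per-entry fields match between the parsers."""
    testcase.assertEqual(len(fpresult['entries']), len(spresult['entries']))
    for fpe, spe in zip(fpresult['entries'], spresult['entries']):
        for key in ('title', 'link', 'author'):
            if key in fpe:
                testcase.assertEqual(fpe[key], spe.get(key))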
def test_feeds(self):
    for path in self.filenames:
        with open(path) as f:
            doc = f.read()
        fpresult = feedparser.parse(doc)
        spresult = speedparser.parse(doc)
        try:
            feed_equivalence(self, fpresult, spresult)
            entry_equivalence(self, fpresult, spresult)
        except Exception:
            import traceback
            print("Comp Failure: %s" % path)
            traceback.print_exc()
def test_feed_coverage(self):
    success = 0
    fperrors = 0
    sperrors = 0
    total = 300
    failedpaths = []
    failedentries = []
    for f in self.files[:total]:
        with open(f) as fo:
            document = fo.read()
        try:
            fpresult = feedparser.parse(document)
        except Exception:
            fperrors += 1
            continue  # nothing to compare against without a feedparser result
        try:
            spresult = speedparser.parse(document)
        except Exception:
            sperrors += 1
            continue
        try:
            feed_equivalence(self, fpresult, spresult)
            success += 1
        except Exception:
            failedpaths.append(f)
        try:
            entry_equivalence(self, fpresult, spresult)
        except Exception:
            failedentries.append(f)
    print("Success: %d out of %d (%0.2f %%, fpe: %d, spe: %d)" % (
        success, total, (100 * success) / float(total - fperrors),
        fperrors, sperrors))
    print("Entry Success: %d out of %d (%0.2f %%)" % (
        success - len(failedentries), success,
        (100 * (success - len(failedentries))) / float(total - fperrors)))
    print("Failed Paths:\n%s" % pformat(failedpaths))
    print("Failed entries:\n%s" % pformat(failedentries))
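# The coverage tests above iterate over `self.files`, which this excerpt
# never builds. A plausible setUp, assuming a local directory of previously
# downloaded feed documents (the feeds/ path and *.xml pattern are guesses):
import glob
import unittest

class CoverageTestBase(unittest.TestCase):
    def setUp(self):
        # every feed document previously downloaded into feeds/
        self.files = sorted(glob.glob('feeds/*.xml'))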
def update_feeds(num=10):
    with SimpleBufferObject(Entry) as new_entry_buffer:
        current_time = now()
        # get all active feeds with subscribers that have not been checked
        # or are due to be checked based on "next_checked"
        feeds = Feed.active.filter(
            Q(next_checked=None) | Q(next_checked__lte=current_time))[:num]
        for feed in feeds:
            # update last checked to current time
            feed.last_checked = now()
            # set "next_checked" based on "check_frequency"
            feed.next_checked = feed.last_checked + timedelta(
                hours=feed.check_frequency)
            # create new FeedLog object
            log = FeedLog(feed=feed)
            notes = []
            # build conditional GET headers from the feed object; copy so
            # one feed's etag does not leak into the next feed's request
            headers = HEADERS.copy()
            if feed.etag:
                headers['If-None-Match'] = feed.etag
            if feed.last_modified:
                last_modified = make_naive(feed.last_modified)
                headers['If-Modified-Since'] = http_date(
                    last_modified.timestamp())
            try:
                req = requests.get(feed.feed_url, headers=headers,
                                   allow_redirects=True)
                log.status_code = req.status_code
                log.headers = ', '.join("{!s}={!r}".format(key, val)
                                        for (key, val) in headers.items())
                log.headers += "--\n"
                log.headers += ', '.join("{!s}={!r}".format(key, val)
                                         for (key, val) in req.headers.items())
                notes.append('updating {0}'.format(feed))
                # update feed URL if permanently redirected
                if (req.url != feed.feed_url) and (
                        req.history[-1].status_code == 301):
                    # if the updated feed URL already exists, something is wrong
                    if Feed.objects.filter(feed_url=req.url).exists():
                        feed.disabled = True
                        notes.append('Feed URL does not match response, '
                                     'but new feed already exists with '
                                     '{0}.'.format(req.url))
                    else:
                        notes.append('Updating feed url from {0} to {1}.'.format(
                            feed.feed_url, req.url))
                        feed.feed_url = req.url
                if req.status_code == requests.codes.not_modified:
                    notes.append('not modified')
                elif req.status_code == requests.codes.ok:
                    notes.append('status OK, parsing')
                    # update conditional GET data
                    feed.etag = alphanum.sub('', req.headers.get('etag', ''))
                    feed.last_modified = parse_http_date(
                        req.headers.get('last-modified', None),
                        default=feed.last_checked)
                    # must remove the encoding declaration from the feed
                    # or lxml will pitch a fit
                    text = XML_DECLARATION.sub('', req.text, 1)
                    parsed = speedparser.parse(text, encoding=req.encoding)
                    if parsed.bozo == 1:
                        notes.append('bozo feed')
                        notes.append(parsed.bozo_tb)
                        feed.increment_error_count()
                    else:
                        # update feed metadata, reset error count
                        feed.reset_error_count()
                        feed.title = shorten_string(
                            parsed.feed.get('title', feed.title))
                        feed.description = parsed.feed.get(
                            'description', parsed.feed.get('subtitle', None))
                        # icon/logo are not working in speedparser
                        # feed.icon = parsed.feed.get('logo', feed.icon)
                        # get the latest existing entry for the feed
                        try:
                            latest_entry = feed.entry_set.latest()
                        except Entry.DoesNotExist:
                            latest_entry = None
                        for entry in parsed.entries:
                            published = feed_datetime(
                                entry.get('published_parsed',
                                          entry.get('updated_parsed', None)),
                                default=feed.last_checked)
                            # only proceed if the entry is newer than the
                            # latest entry stored for the feed
                            if latest_entry is None or published > latest_entry.published:
                                # entry ID is a hash of the entry id or link
                                entry_id = hashlib.sha1(
                                    entry.get('id', entry.link).encode(
                                        'utf-8')).hexdigest()
                                author = bleach.clean(
                                    entry.get('author', 'no author'),
                                    strip=True, strip_comments=True)
                                author = shorten_string(author)
                                content = None
                                content_items = entry.get('content', None)
                                if content_items is None:
                                    content = entry.get('summary', 'No summary.')
                                else:
                                    # concatenate all text-like content items
                                    for c in content_items:
                                        if c.get('type', None) in (
                                                'text', 'html', 'xhtml', None):
                                            if content is None:
                                                content = c.get('value', '')
                                            else:
                                                content += c.get('value', '')
                                content = bleach.clean(
                                    content, tags=BLEACH_TAGS,
                                    attributes=BLEACH_ATTRS,
                                    strip=True, strip_comments=True)
                                title = bleach.clean(
                                    entry.get('title', 'no title'),
                                    strip=True, strip_comments=True)
                                title = shorten_string(title)
                                new_entry_buffer.add(Entry(
                                    feed=feed,
                                    entry_id=entry_id,
                                    link=entry.get('link', ''),
                                    title=title,
                                    author=author,
                                    content=content,
                                    published=published,
                                    updated=feed_datetime(
                                        entry.get('updated_parsed', None),
                                        default=feed.last_checked)))
                                log.entries += 1
                            else:
                                # entries are newest-first, so stop at the
                                # first one we have already stored
                                break
                        if log.entries > 0:
                            feed.has_new_feeds = True
                else:
                    notes.append('error: {0}'.format(req.status_code))
                    feed.increment_error_count()
            # append errors to notes so the join below records them instead
            # of overwriting them
            except requests.exceptions.Timeout:  # pragma: no cover
                notes.append('timeout error')
                feed.increment_error_count()
            except requests.exceptions.ConnectionError:  # pragma: no cover
                notes.append('connection error')
                feed.increment_error_count()
            except requests.exceptions.HTTPError:  # pragma: no cover
                notes.append('HTTP error')
                feed.increment_error_count()
            except requests.exceptions.TooManyRedirects:  # pragma: no cover
                notes.append('too many redirects')
                feed.increment_error_count()
            log.notes = '\n'.join(notes)
            duration = now() - feed.last_checked
            # note: .microseconds is only the sub-second component; use
            # duration.total_seconds() if the full elapsed time is wanted
            log.duration = duration.microseconds
            feed.save()
            log.save()
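# `SimpleBufferObject` is not shown in this excerpt. A minimal sketch,
# assuming it is a context manager that buffers model instances and flushes
# them with Django's bulk_create; the flush threshold is a guess:
class SimpleBufferObject:
    """Buffer model instances and bulk-insert them on exit.

    A sketch only: the real class may flush incrementally, handle
    integrity errors, and so on.
    """

    def __init__(self, model, flush_at=500):  # flush_at is a guess
        self.model = model
        self.flush_at = flush_at
        self._buffer = []

    def add(self, instance):
        # queue an unsaved instance; flush once the buffer gets large
        self._buffer.append(instance)
        if len(self._buffer) >= self.flush_at:
            self.flush()

    def flush(self):
        if self._buffer:
            self.model.objects.bulk_create(self._buffer)
            self._buffer = []

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # only persist the remaining buffered entries on a clean exit
        if exc_type is None:
            self.flush()
        return False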