def run(self):
  from singleton import db
  c = db.cursor()
  while True:
    item_uid, rating = self.in_q.get()
    try:
      c.execute("""update fm_items set item_rating=?, item_rated=julianday('now')
      where item_uid=?""", [rating, item_uid])
      fb_token = param.settings.get('fb_token', None)
      if rating == 1 and fb_token:
        c.execute("""select feed_uid, item_link, item_title, feed_private
        from fm_items, fm_feeds
        where item_uid=? and feed_uid=item_feed_uid""", [item_uid])
        feed_uid, url, title, private = c.fetchone()
      db.commit()
      if rating == 1 and fb_token and not private:
        callout = random.choice(
          ['Interesting: ', 'Notable: ', 'Recommended: ', 'Thumbs-up: ',
           'Noteworthy: ', 'FYI: ', 'Worth reading: '])
        try:
          social.fb_post(fb_token, callout + title, url)
        except social.ExpiredToken:
          notification(db, c, feed_uid,
                       'Service notification',
                       'The Facebook access token has expired',
                       link='/settings#facebook')
    except:
      util.print_stack()
  # this will never be reached
  c.close()
def apply(self, content, *args, **kwargs):
  item = args[1]
  if self.link_substr in item['link']:
    try:
      # check if this item has not already been loaded before
      guid = item['id']
      from singleton import db, sqlite
      c = db.cursor()
      if sqlite.paramstyle == 'qmark':
        c.execute("select item_link from fm_items where item_guid=?", [guid])
      elif sqlite.paramstyle == 'pyformat':
        c.execute("select item_link from fm_items where item_guid=%(guid)s",
                  {'guid': guid})
      link = c.fetchone()
      c.close()
      if link:
        print >> param.log, 'not dereferencing', guid, '->', link[0]
        item['link'] = link[0]
        return content
      # we haven't seen this article before, buck up and load it
      deref = urllib2.urlopen(item['link']).read()
      m = self.re.search(deref)
      if m and m.groups():
        item['link'] = m.groups()[0]
    except:
      util.print_stack()
  return content
def update_feed(db, c, f, feed_uid, feed_xml, feed_etag, feed_modified,
                feed_dupcheck=None):
  print >> param.activity, feed_xml
  if 'why' in f and f['why'] == 'no change since Etag':
    return
  # check for errors - HTTP code 304 means no change
  if not hasattr(f, 'feed') \
     or 'title' not in f.feed and 'link' not in f.feed:
    if not hasattr(f, 'feed'):
      print >> param.log, """FFFFF not hasattr(f, 'feed')""",
    else:
      print >> param.log, """FFFFF title=%r link=%r""" % (
        'title' not in f.feed, 'link' not in f.feed
      ),
    if 'why' in f:
      print >> param.log, feed_xml, f['why']
    else:
      print >> param.log, feed_xml
    # error or timeout - increment error count
    increment_errors(db, c, feed_uid)
  else:
    # no error - reset etag and/or modified date and error count
    clear_errors(db, c, feed_uid, f)
    try:
      process_parsed_feed(db, c, f, feed_uid, feed_dupcheck)
    except:
      util.print_stack(['c', 'f'])
def evaluate_rules(item, feed, feed_uid, exempt):
  for rule in rules * (not exempt) + feed_rules.get(feed_uid, list()):
    try:
      if rule.test(item, feed, feed_uid):
        return True, rule
    except:
      util.print_stack(['f'])
  return False, None
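The `rules * (not exempt)` expression above relies on Python's list-times-boolean behavior: multiplying a list by False (0) yields an empty list, so feeds marked exempt are only tested against their own per-feed rules. A minimal illustration with made-up rule names (not actual rule objects from the codebase):

# illustration only -- the string "rules" below are hypothetical stand-ins
rules = ['global rule 1', 'global rule 2']
print rules * (not False) + ['feed rule']  # not exempt: global + per-feed rules
print rules * (not True) + ['feed rule']   # exempt: per-feed rules only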
def run(self):
  while True:
    # XXX should wrap this in a try/except clause
    self.event.wait(param.refresh_interval)
    print >> param.activity, time.ctime(), '- refreshing feeds'
    try:
      update()
    except:
      util.print_stack()
    self.event.clear()
def update_feed(db, c, f, feed_uid, feed_xml, feed_etag, feed_modified,
                feed_dupcheck=None):
  print >> param.activity, feed_xml
  # check for errors - HTTP code 304 means no change
  if not hasattr(f, 'feed') or 'status' not in f or \
     'title' not in f.feed and 'link' not in f.feed and \
     ('status' not in f or f['status'] not in [304]):
    # error or timeout - increment error count
    increment_errors(db, c, feed_uid)
  else:
    # no error - reset etag and/or modified date and error count
    clear_errors(db, c, feed_uid, f)
    try:
      process_parsed_feed(db, c, f, feed_uid, feed_dupcheck)
    except:
      util.print_stack(['c', 'f'])
def fetch_feed(feed_uid, feed_xml, feed_etag, feed_modified):
  if not feed_etag:
    feed_etag = None
  if not feed_modified:
    feed_modified = None
  try:
    f = feedparser.parse(feed_xml, etag=feed_etag, modified=feed_modified)
  except socket.timeout:
    if param.debug:
      print >> param.log, 'EEEEE error fetching feed', feed_xml
    f = {'channel': {}, 'items': []}
  except:
    if param.debug:
      util.print_stack()
    f = {'channel': {}, 'items': []}
  normalize.normalize_feed(f)
  return f
def update(where_clause=''):
  from singleton import db
  c = db.cursor()
  # refresh filtering rules
  filters.load_rules(db, c)
  # at 3AM by default, perform house-cleaning
  if time.localtime()[3] == param.backup_hour:
    cleanup(db, c)
  # create worker threads and the queues used to communicate with them
  work_q = Queue.Queue()
  process_q = Queue.Queue()
  workers = []
  for i in range(param.feed_concurrency):
    workers.append(FeedWorker(i + 1, work_q, process_q))
    workers[-1].start()
  # assign work
  c.execute("""select feed_uid, feed_xml, feed_etag, feed_dupcheck,
  strftime('%s', feed_modified) from fm_feeds where feed_status=0 """
            + where_clause)
  for feed_uid, feed_xml, feed_etag, feed_dupcheck, feed_modified in c:
    if feed_modified:
      feed_modified = float(feed_modified)
      feed_modified = time.localtime(feed_modified)
    else:
      feed_modified = None
    work_q.put((feed_uid, feed_xml, feed_etag, feed_modified, feed_dupcheck))
  # None is an indication for workers to stop
  for i in range(param.feed_concurrency):
    work_q.put(None)
  workers_left = param.feed_concurrency
  while workers_left > 0:
    feed_info = process_q.get()
    # exited worker
    if not feed_info:
      workers_left -= 1
    else:
      try:
        update_feed(db, c, *feed_info)
      except:
        util.print_stack()
      db.commit()
      # give reader threads an opportunity to get their work done
      time.sleep(1)
  c.close()
def update(where_clause=''):
  with dbop.db() as db:
    c = db.cursor()
    # refresh filtering rules
    filters.load_rules(c)
    # at 3AM by default, perform house-cleaning
    if time.localtime()[3] == param.backup_hour:
      cleanup(db, c)
    # create worker threads and the queues used to communicate with them
    work_q = Queue.Queue()
    process_q = Queue.Queue()
    workers = []
    for i in range(param.feed_concurrency):
      workers.append(FeedWorker(i + 1, work_q, process_q))
      workers[-1].start()
    # assign work
    c.execute("""select feed_uid, feed_xml, feed_etag, feed_dupcheck,
    strftime('%s', feed_modified) from fm_feeds where feed_status=0 """
              + where_clause)
    for feed_uid, feed_xml, feed_etag, feed_dupcheck, feed_modified in c:
      if feed_modified:
        feed_modified = float(feed_modified)
        feed_modified = time.localtime(feed_modified)
      else:
        feed_modified = None
      work_q.put(
        (feed_uid, feed_xml, feed_etag, feed_modified, feed_dupcheck))
    # None is an indication for workers to stop
    for i in range(param.feed_concurrency):
      work_q.put(None)
    workers_left = param.feed_concurrency
    while workers_left > 0:
      feed_info = process_q.get()
      # exited worker
      if not feed_info:
        workers_left -= 1
      else:
        try:
          update_feed(db, c, *feed_info)
        except:
          util.print_stack()
        db.commit()
        # give reader threads an opportunity to get their work done
        time.sleep(1)
def load_rules(db, c):
  global loaded, rules, feed_rules
  if loaded:
    return
  rules = []
  feed_rules = dict()
  try:
    try:
      c.execute("""select rule_uid, rule_type, rule_text, rule_feed_uid,
      strftime('%s', rule_expires) from fm_rules
      where rule_expires is null or rule_expires > julianday('now')""")
      for uid, rtype, rule, feed_uid, expires in c:
        if expires:
          expires = int(expires)
        if feed_uid:
          container = feed_rules.setdefault(feed_uid, list())
        else:
          container = rules
        if rtype == 'python':
          rule = PythonRule(uid, expires, rule)
          container.append(rule)
        elif rtype == 'tag':
          rule = TagRule(uid, expires, rule)
          container.append(rule)
        elif rtype == 'author':
          rule = AuthorRule(uid, expires, rule)
          container.append(rule)
        elif rtype.startswith('union_'):
          # XXX this convention of adding a second rule object with UID -uid
          # XXX is an ugly hack
          container.append(KeywordRule(
            -uid, expires, rule, rtype.replace('union_', 'title_')))
          container.append(KeywordRule(
            uid, expires, rule, rtype.replace('union_', 'content_')))
        else:
          container.append(KeywordRule(uid, expires, rule, rtype))
      c.execute("""select feed_uid, feed_filter from fm_feeds
      where feed_filter is not null""")
      for feed_uid, rule in c:
        rule = PythonRule('feed_%d' % feed_uid, None, rule)
        feed_rules.setdefault(feed_uid, list()).append(rule)
    except:
      util.print_stack()
  finally:
    loaded = True
def load_rules(c):
  global loaded, rules, feed_rules
  if loaded:
    return
  rules = []
  feed_rules = dict()
  try:
    try:
      for uid, rtype, rule, feed_uid, expires in \
          c.execute("""select rule_uid, rule_type, rule_text, rule_feed_uid,
          strftime('%s', rule_expires) from fm_rules
          where rule_expires is null or rule_expires > julianday('now')"""):
        if expires:
          expires = int(expires)
        if feed_uid:
          container = feed_rules.setdefault(feed_uid, list())
        else:
          container = rules
        if rtype == 'python':
          rule = PythonRule(uid, expires, rule)
          container.append(rule)
        elif rtype == 'tag':
          rule = TagRule(uid, expires, rule)
          container.append(rule)
        elif rtype == 'author':
          rule = AuthorRule(uid, expires, rule)
          container.append(rule)
        elif rtype.startswith('union_'):
          # XXX this convention of adding a second rule object with UID -uid
          # XXX is an ugly hack
          container.append(KeywordRule(
            -uid, expires, rule, rtype.replace('union_', 'title_')))
          container.append(KeywordRule(
            uid, expires, rule, rtype.replace('union_', 'content_')))
        else:
          container.append(KeywordRule(uid, expires, rule, rtype))
      for feed_uid, rule in \
          c.execute("""select feed_uid, feed_filter from fm_feeds
          where feed_filter is not null"""):
        rule = PythonRule('feed_%d' % feed_uid, None, rule)
        feed_rules.setdefault(feed_uid, list()).append(rule)
    except:
      util.print_stack()
  finally:
    loaded = True
def run(self):
  while True:
    item_uid = None
    try:
      item_uid, rating = self.in_q.get()
      with dbop.db() as db:
        c = db.cursor()
        try:
          c.execute(
            """update fm_items set item_rating=?, item_rated=julianday('now')
            where item_uid=?""", [rating, item_uid])
          fb_token = param.settings.get('fb_token', None)
          if rating == 1 and fb_token:
            c.execute(
              """select feed_uid, item_link, item_title, feed_private
              from fm_items, fm_feeds
              where item_uid=? and feed_uid=item_feed_uid""", [item_uid])
            feed_uid, url, title, private = c.fetchone()
          db.commit()
          if rating == 1 and fb_token and not private:
            callout = random.choice([
              'Interesting: ', 'Notable: ', 'Recommended: ', 'Thumbs-up: ',
              'Noteworthy: ', 'FYI: ', 'Worth reading: '
            ])
            try:
              social.fb_post(fb_token, callout + title, url)
            except social.ExpiredToken:
              notification(
                db, c, feed_uid,
                'Service notification',
                'The Facebook access token has expired',
                link='/settings#facebook')
        except:
          util.print_stack()
    except:
      util.print_stack()
      if item_uid is not None:
        self.in_q.put((item_uid, rating))
def step(self, rating, date, decay):
  """The aggregate function takes the following parameters:
  rating: value of item_rating
  date: value of item_created
  decay: half-life to use, in days
  """
  # articles older than param.garbage_items cannot be counted towards
  # the SNR, as the uninteresting ones have been purged and thus skew
  # the metric towards 100%
  try:
    if self.ref_date - date < param.garbage_items:
      # by convention, 0 means do not decay (i.e. infinite half-life)
      if decay == 0:
        decay = 1
      else:
        decay = .5 ** ((self.ref_date - date) / decay)
      self.sum_rated += decay * int(rating not in [0, -2])
      self.sum_good += decay * int(rating == 1)
  except:
    util.print_stack()
    raise
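For clarity, here is the half-life weighting applied by step() above, worked through with made-up numbers: an item exactly one half-life old counts for half as much as a fresh one, two half-lives for a quarter, and a decay of 0 disables the discount entirely.

# illustration only -- the 7-day half-life and the ages are hypothetical values
half_life = 7.0
for age_days in [0.0, 7.0, 14.0, 28.0]:
  weight = 1 if half_life == 0 else .5 ** (age_days / half_life)
  print age_days, weight  # prints 1.0, 0.5, 0.25, 0.0625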
def fetch_feed(feed_uid, feed_xml, feed_etag, feed_modified):
  if not feed_etag:
    feed_etag = None
  if not feed_modified:
    feed_modified = None
  try:
    r = requests.get(feed_xml, headers={'If-None-Match': feed_etag})
    if r.content == '':
      return {'channel': {}, 'items': [], 'why': 'no change since Etag'}
    f = feedparser.parse(r.content, etag=r.headers.get('Etag'),
                         modified=feed_modified)
  except (socket.timeout, requests.exceptions.RequestException) as e:
    if param.debug:
      print >> param.log, 'EEEEE error fetching feed', feed_xml, e
    f = {'channel': {}, 'items': [], 'why': repr(e)}
  except:
    if param.debug:
      util.print_stack()
    f = {'channel': {}, 'items': [], 'why': repr(sys.exc_info()[1])}
  normalize.normalize_feed(f)
  return f
def apply(self, content, *args, **kwargs):
  item = args[1]
  if self.link_substr in item['link']:
    try:
      # check if this item has not already been loaded before
      guid = item['id']
      with dbop.db() as db:
        c = db.cursor()
        c.execute("select item_link from fm_items where item_guid=?", [guid])
        link = c.fetchone()
        c.close()
        if link:
          print >> param.log, 'not dereferencing', guid, '->', link[0]
          item['link'] = link[0]
          return content
        # we haven't seen this article before, buck up and load it
        deref = requests.get(item['link']).content
        m = self.re.search(deref)
        if m and m.groups():
          item['link'] = m.groups()[0]
    except:
      util.print_stack()
  return content
def dereference(url, seen=None, level=0):
  """Recursively dereference a URL"""
  # this set is used to detect redirection loops
  if seen is None:
    seen = set([url])
  else:
    seen.add(url)
  # stop recursion if it is too deep
  if level > 16:
    return url
  try:
    r = requests.get(url, allow_redirects=False)
    if not r.is_redirect:
      return url
    else:
      # break a redirection loop if it occurs
      redir = r.headers.get('Location')
      if True not in [redir.startswith(p)
                      for p in ['http://', 'https://', 'ftp://']]:
        return url
      if redir in seen:
        return url
      # some servers redirect to Unicode URLs, which are not legal
      try:
        unicode(redir)
      except UnicodeDecodeError:
        return url
      # there might be several levels of redirection
      return dereference(redir, seen, level + 1)
  except (requests.exceptions.RequestException, ValueError, socket.error):
    return url
  except:
    util.print_stack()
    return url
def process_request(self):
  try:
    if self.path in ['', '/']:
      self.browser_output(301, None, 'This document has moved.',
                          ['Location: /view'])
      return
    path, query_string = urlparse.urlparse(self.path)[2:5:2]
    vars = []
    if query_string:
      # parse_qsl does not comply with RFC 3986, we have to decode UTF-8
      query_list = [(n, v.decode('UTF-8'))
                    for n, v in urlparse.parse_qsl(query_string, 1)]
      self.input.update(dict(query_list))
    if param.debug:
      logging.info((self.command, self.path, self.request_version, vars))
    if path.endswith('.gif') and path[1:] in self.images:
      self.browser_output(200, 'image/gif', self.images[path[1:]],
                          http_headers=no_expire)
      return
    if path.endswith('.js') and path[1:] in self.rsrc:
      self.browser_output(200, 'text/javascript', self.rsrc[path[1:]],
                          http_headers=no_expire)
      return
    if path.startswith('/tiny_mce'):
      # guard against attempts to subvert security using ../
      path = os.path.normpath('.' + path)
      assert path.startswith('tiny_mce')
      self.set_mime_type(path)
      self.browser_output(200, self.mime_type, open(path).read(),
                          http_headers=no_expire)
      return
    if path.count('favicon.ico') > 0:
      self.favicon()
    if path.endswith('.css'):
      path = path.replace('.css', '_css')
      tmpl = path.split('/', 1)[1].strip('/')
      self.use_template(tmpl, [self.input])
      return
    if not self.require_auth(param.auth_dict):
      return
    if path.startswith('/redirect/'):
      from singleton import db
      c = db.cursor()
      item_uid = int(path[10:])
      c.execute('select item_link from fm_items where item_uid=%d' % item_uid)
      redirect_url = c.fetchone()[0]
      c.close()
      self.browser_output(301, None, 'This document has moved.',
                          ['Location: ' + redirect_url])
      return
    if path.startswith('/threads'):
      frames = sys._current_frames()
      row = 0
      out = []
      if singleton.c_opened:
        out.append('<h1>Open Cursors</h1>\n')
        for curs, tb in singleton.c_opened.iteritems():
          if curs not in singleton.c_closed:
            row += 1
            if row % 2:
              color = '#ddd'
            else:
              color = 'white'
            out.append('<div style="background-color: ' + color + '">\n<pre>')
            out.append(curs.replace('<', '&lt;').replace('>', '&gt;') + '\n')
            out.append('\n'.join(tb[:-2]))
            out.append('</pre></div>\n')
      out.append('<h1>Threads</h1>\n')
      row = 0
      for thread_id, frame in sorted(frames.iteritems()):
        if thread_id == threading.currentThread()._Thread__ident:
          continue
        row += 1
        if row % 2:
          color = '#ddd'
        else:
          color = 'white'
        out.append('<div style="background-color: ' + color + '">\n<pre>')
        out.append('Thread %s (%d refs)\n'
                   % (thread_id, sys.getrefcount(frame)))
        out.append(''.join(traceback.format_stack(frame)).replace(
          '&', '&amp;').replace('<', '&lt;').replace('>', '&gt;'))
        out.append('\n<hr>\n')
        out.append(pprint.pformat(frame.f_locals).replace(
          '&', '&amp;').replace('<', '&lt;').replace('>', '&gt;'))
        out.append('\n</pre>\n</div>\n')
      del frames
      self.browser_output(200, 'text/html', ''.join(out))
      return
    if path.startswith('/xmlfeedback/'):
      op, item_uid = path.split('/')[2::2]
      item_uid = item_uid.split('.')[0]
      # for safety, these operations should be idempotent
      if op in ['promote', 'demote', 'basic', 'yappi']:
        if op != 'yappi':
          item_uid = int(item_uid)
        getattr(self, 'op_' + op)(item_uid)
      self.xml()
      return
    if path.startswith('/stem'):
      txt = self.input['q']
      stem = ' '.join(normalize.stem(normalize.get_words(txt)))
      self.browser_output(200, 'text/plain', stem)
      return
    if path.startswith('/add_kw_rule'):
      from singleton import db
      c = db.cursor()
      try:
        filters.add_kw_rule(db, c, **self.input)
      except:
        util.print_stack()
      db.commit()
      c.close()
      self.xml()
      return
    if path.startswith('/del_kw_rule'):
      from singleton import db
      c = db.cursor()
      try:
        filters.del_kw_rule(db, c, **self.input)
      except:
        util.print_stack()
      db.commit()
      c.close()
      self.xml()
      return
    if path.startswith('/stats'):
      from singleton import db
      c = db.cursor()
      c.execute("""select date(item_loaded) as date,
      count(*) as articles,
      sum(case when item_rating=1 then 1 else 0 end) as interesting,
      sum(case when item_rating=0 then 1 else 0 end) as unread,
      sum(case when item_rating=-1 then 1 else 0 end) as filtered
      from fm_items
      where item_loaded > julianday('now') - 30
      group by 1 order by 1""")
      csvfile = cStringIO.StringIO()
      out = csv.writer(csvfile, dialect='excel', delimiter=',')
      out.writerow([col[0].capitalize() for col in c.description])
      for row in c:
        out.writerow(row)
      self.browser_output(200, 'text/csv', csvfile.getvalue())
      csvfile.close()
      c.close()
      return
    if path.endswith('.css'):
      path = path.replace('.css', '_css')
    tmpl = path.split('/', 1)[1].strip('/')
    self.use_template(tmpl, [self.input])
  except TembozTemplate.Redirect, e:
    redirect_url = e.args[0]
    self.browser_output(301, None, 'This document has moved.',
                        ['Location: ' + redirect_url])
    return
def add_feed(feed_xml):
  """Try to add a feed. Returns a tuple
  (feed_uid, feed_title, num_added, num_filtered)"""
  with dbop.db() as db:
    c = db.cursor()
    feed_xml = feed_xml.replace('feed://', 'http://')
    # verify the feed
    r = requests.get(feed_xml)
    f = feedparser.parse(r.content)
    if 'url' not in f:
      f['url'] = feed_xml
    # CVS versions of feedparser are not throwing exceptions as they should
    # see:
    # http://sourceforge.net/tracker/index.php?func=detail&aid=1379172&group_id=112328&atid=661937
    if not f.feed or ('link' not in f.feed or 'title' not in f.feed):
      # some feeds have multiple links, one for self and one for PuSH
      if f.feed and 'link' not in f.feed and 'links' in f.feed:
        try:
          for l in f.feed['links']:
            if l['rel'] == 'self':
              f.feed['link'] = l['href']
        except KeyError:
          pass
    if not f.feed or ('link' not in f.feed or 'title' not in f.feed):
      # try autodiscovery
      try:
        feed_xml = AutoDiscoveryHandler().feed_url(feed_xml)
      except HTMLParser.HTMLParseError:
        # in desperate conditions, regexps ride to the rescue
        try:
          feed_xml = re_autodiscovery(feed_xml)[0][1]
        except:
          util.print_stack()
          raise AutodiscoveryParseError
      if not feed_xml:
        raise ParseError
      r = requests.get(feed_xml)
      f = feedparser.parse(r.content)
      if not f.feed:
        raise ParseError
    # we have a valid feed, normalize it
    normalize.normalize_feed(f)
    feed = {
      'xmlUrl': f['url'],
      'htmlUrl': str(f.feed['link']),
      'etag': r.headers.get('Etag'),
      'title': f.feed['title'].encode('ascii', 'xmlcharrefreplace'),
      'desc': f.feed['description'].encode('ascii', 'xmlcharrefreplace')
    }
    for key, value in feed.items():
      if type(value) == str:
        feed[key] = value
    filters.load_rules(c)
    try:
      c.execute("""insert into fm_feeds (feed_xml, feed_etag, feed_html,
      feed_title, feed_desc) values
      (:xmlUrl, :etag, :htmlUrl, :title, :desc)""", feed)
      feed_uid = c.lastrowid
      num_added, num_filtered = process_parsed_feed(db, c, f, feed_uid)
      db.commit()
      return feed_uid, feed['title'], num_added, num_filtered
    except sqlite3.IntegrityError, e:
      if 'feed_xml' in str(e):
        db.rollback()
        raise FeedAlreadyExists
      else:
        db.rollback()
        raise UnknownError(str(e))
def process_parsed_feed(db, c, f, feed_uid, feed_dupcheck=None, exempt=None):
  """Insert the entries from a feedparser parsed feed f in the database using
  the cursor c for feed feed_uid. Returns a tuple (number of items added
  unread, number of filtered items)"""
  num_added = 0
  num_filtered = 0
  filters.load_rules(c)
  # check if duplicate title checking is in effect
  if feed_dupcheck is None:
    c.execute("select feed_dupcheck from fm_feeds where feed_uid=?",
              [feed_uid])
    feed_dupcheck = bool(c.fetchone()[0])
  # check if the feed is exempt from filtering
  if exempt is None:
    c.execute("select feed_exempt from fm_feeds where feed_uid=?", [feed_uid])
    exempt = bool(c.fetchone()[0])
  # the Radio convention is reverse chronological order
  f['items'].reverse()
  for item in f['items']:
    try:
      normalize.normalize(item, f)
    except:
      util.print_stack()
      continue
    # evaluate the FilteringRules
    skip, rule = filters.evaluate_rules(item, f, feed_uid, exempt)
    filtered_by = None
    if skip:
      skip = -2
      if type(rule.uid) == int:
        filtered_by = rule.uid
      else:
        # XXX clunky convention for feed_rule, but that should disappear
        # XXX eventually
        filtered_by = 0
    title = item['title']
    link = item['link']
    guid = item['id']
    author = item['author']
    created = item['created']
    modified = item['modified']
    if not modified:
      modified = None
    content = item['content']
    # check if the item already exists, using the GUID as key
    # but cache all seen GUIDs in a dictionary first, since most articles are
    # existing ones and we can save a database query this way
    if feed_uid in feed_guid_cache and guid in feed_guid_cache[feed_uid]:
      # existing entry and we've seen it before in this process instance
      # update the time stamp to prevent premature garbage-collection
      # in prune_feed_guid_cache
      feed_guid_cache.setdefault(feed_uid, dict())[guid] = time.time()
      continue
    else:
      feed_guid_cache.setdefault(feed_uid, dict())[guid] = time.time()
    # not seen yet, it may or may not be a duplicate, we have to find out the
    # hard way
    c.execute("""select item_uid, item_link, item_loaded, item_created,
    item_modified, item_md5hex, item_title, item_content, item_creator
    from fm_items where item_feed_uid=? and item_guid=?""", [feed_uid, guid])
    l = c.fetchall()
    # unknown GUID, but title/link duplicate checking may be in effect
    if not l:
      if feed_dupcheck:
        c.execute("""select count(*) from fm_items
        where item_feed_uid=? and (item_title=? or item_link=?)""",
                  [feed_uid, title, link])
        l = bool(c.fetchone()[0])
        if l:
          print >> param.activity, 'DUPLICATE TITLE', title
      # XXX Runt items (see normalize.py) are almost always spurious, we just
      # XXX skip them, although we may revisit this decision in the future
      if not l and item.get('RUNT', False):
        print >> param.activity, 'RUNT ITEM', item
        l = True
    # GUID already exists, this is a change
    else:
      assert len(l) == 1
      (item_uid, item_link, item_loaded, item_created, item_modified,
       item_md5hex, item_title, item_content, item_creator) = l[0]
      # if this is a feed without timestamps, use our timestamp to determine
      # the oldest item in the feed XML file
      if 'oldest' in f and f['oldest'] == '1970-01-01 00:00:00':
        if 'oldest_ts' not in f:
          f['oldest_ts'] = item_created
        else:
          f['oldest_ts'] = min(f['oldest_ts'], item_created)
      # XXX update item here
      # XXX update tags if required
    # GUID doesn't exist yet, insert it
    if not l:
      # finally, dereference the URL to get rid of annoying tracking servers
      # like feedburner, but only do this once to avoid wasting bandwidth
      link = normalize.dereference(link)
      try:
        c.execute("""insert into fm_items (item_feed_uid, item_guid,
        item_created, item_modified, item_link, item_md5hex, item_title,
        item_content, item_creator, item_rating, item_rule_uid) values
        (?, ?, julianday(?), julianday(?), ?, ?, ?, ?, ?, ?, ?)""",
                  [feed_uid, guid, created, modified, link,
                   hashlib.md5(content).hexdigest(), title, content, author,
                   skip, filtered_by])
        # if we have tags, insert them
        # note: feedparser.py handles 'category' as a special case, so we
        # need to work around that to get to the data
        if item['item_tags']:
          c.execute("""select item_uid from fm_items
          where item_feed_uid=? and item_guid=?""", [feed_uid, guid])
          item_uid = c.fetchone()[0]
          for tag in item['item_tags']:
            c.execute("""insert or ignore into fm_tags (tag_name, tag_item_uid)
            values (?, ?)""", [tag, item_uid])
        if skip:
          num_filtered += 1
          print >> param.activity, 'SKIP', title, rule
        else:
          num_added += 1
          print >> param.activity, ' ' * 4, title
      except:
        util.print_stack(['c', 'f'])
        continue
  # update timestamp of the oldest item still in the feed file
  if 'oldest' in f and f['oldest'] != '9999-99-99 99:99:99':
    if f['oldest'] == '1970-01-01 00:00:00' and 'oldest_ts' in f:
      c.execute("update fm_feeds set feed_oldest=? where feed_uid=?",
                [f['oldest_ts'], feed_uid])
    else:
      c.execute("""update fm_feeds set feed_oldest=julianday(?)
      where feed_uid=?""", [f['oldest'], feed_uid])
  return (num_added, num_filtered)
def normalize(item, f, run_filters=True):
  # get rid of RDF lossage...
  for key in ['title', 'link', 'created', 'modified', 'author', 'content',
              'content_encoded', 'description']:
    if type(item.get(key)) == list:
      if len(item[key]) == 1:
        item[key] = item[key][0]
      else:
        candidate = [i for i in item[key] if i.get('type') == 'text/html']
        if len(candidate) > 1 and key == 'content':
          candidate = sorted(candidate,
                             key=lambda i: len(i.get('value', '')),
                             reverse=True)[:1]
        if len(candidate) == 1:
          item[key] = candidate[0]
        else:
          # XXX not really sure how to handle these cases
          print >> param.log, 'E' * 16, 'ambiguous RDF', key, item[key]
          item[key] = item[key][0]
    if isinstance(item.get(key), dict) and 'value' in item[key]:
      item[key] = item[key]['value']
  ########################################################################
  # title
  if 'title' not in item or not item['title'].strip():
    item['title'] = 'Untitled'
  # XXX for debugging
  if type(item['title']) not in [str, unicode]:
    print >> param.log, 'TITLE' * 15
    import code
    from sys import exit
    code.interact(local=locals())
  item['title_lc'] = lower(item['title'])
  item['title_words_exact'] = get_words(item['title_lc'])
  item['title_words'] = stem(item['title_words_exact'])
  ########################################################################
  # link
  #
  # The RSS 2.0 specification allows items not to have a link if the entry
  # is complete in itself
  # that said this is almost always spurious, so we filter it below
  if 'link' not in item:
    item['link'] = f['channel']['link']
    # We have to be careful not to assign a default URL as the GUID
    # otherwise only one item will ever be recorded
    if 'id' not in item:
      item['id'] = 'HASH_CONTENT'
    item['RUNT'] = True
  if type(item['link']) == unicode:
    item['link'] = str(item['link'].encode('UTF-8'))
  if type(item['link']) != str:
    print >> param.log, 'LINK IS NOT str', repr(item['link'])
  # XXX special case handling for annoying Sun/Roller malformed entries
  if 'blog.sun.com' in item['link'] or 'blog.sun.com' in item['link']:
    item['link'] = item['link'].replace(
      'blog.sun.com', 'blogs.sun.com').replace(
      'blogs.sun.com/page', 'blogs.sun.com/roller/page')
  ########################################################################
  # GUID
  if 'id' not in item:
    item['id'] = item['link']
  ########################################################################
  # creator
  if 'author' not in item or item['author'] == 'Unknown':
    item['author'] = 'Unknown'
    if 'author' in f['channel']:
      item['author'] = f['channel']['author']
  ########################################################################
  # created and modified dates
  if 'modified' not in item:
    item['modified'] = f['channel'].get('modified')
  # created - use modified if not available
  if 'created' not in item:
    if 'modified_parsed' in item:
      created = item['modified_parsed']
    else:
      created = None
  else:
    created = item['created_parsed']
  if not created:
    # XXX use HTTP last-modified date here
    created = time.gmtime()
    # feeds that do not have timestamps cannot be garbage-collected
    # XXX need to find a better heuristic, as high-volume sites such as
    # XXX The Guardian, CNET.com or Salon.com lack item-level timestamps
    f['oldest'] = '1970-01-01 00:00:00'
  created = fix_date(created)
  item['created'] = time.strftime(date_fmt, created)
  # keep track of the oldest item still in the feed file
  if 'oldest' not in f:
    f['oldest'] = '9999-99-99 99:99:99'
  if item['created'] < f['oldest']:
    f['oldest'] = item['created']
  # finish modified date
  if 'modified_parsed' in item and item['modified_parsed']:
    modified = fix_date(item['modified_parsed'])
    # add a fudge factor time window within which modifications are not
    # counted as such, 10 minutes here
    if not modified or abs(time.mktime(modified) - time.mktime(created)) < 600:
      item['modified'] = None
    else:
      item['modified'] = time.strftime(date_fmt, modified)
  else:
    item['modified'] = None
  ########################################################################
  # content
  if 'content' in item:
    content = item['content']
  elif 'content_encoded' in item:
    content = item['content_encoded']
  elif 'description' in item:
    content = item['description']
  else:
    content = '<a href="' + item['link'] + '">' + item['title'] + '</a>'
  if not content:
    content = '<a href="' + item['link'] + '">' + item['title'] + '</a>'
  # strip embedded NULs as a defensive measure
  content = content.replace('\0', '')
  # apply ad filters and other degunking to content
  old_content = None
  while old_content != content:
    old_content = content
    try:
      for filter in transform.filter_list:
        content = filter.apply(content, f, item)
    except:
      util.print_stack(black_list=['item'])
  # balance tags like <b>...</b>
  content = balance(content)
  content_lc = lower(content)
  # the content might have invalid 8-bit characters.
  # Heuristic suggested by Georg Bauer
  if type(content) != unicode:
    try:
      content = content.decode('utf-8')
    except UnicodeError:
      content = content.decode('iso-8859-1')
  #
  item['content'] = content
  # we recalculate this as content may have changed due to tag rebalancing, etc
  item['content_lc'] = lower(content)
  item['content_words_exact'] = get_words(item['content_lc'])
  item['content_words'] = stem(item['content_words_exact'])
  item['union_lc'] = item['title_lc'] + '\n' + item['content_lc']
  item['union_words'] = item['title_words'].union(item['content_words'])
  item['urls'] = url_re.findall(content)
  ########################################################################
  # categories/tags
  # we used 'category' before, but 'category' and 'categories' are
  # intercepted by feedparser.FeedParserDict.__getitem__ and treated as
  # special case
  if 'tags' in item and type(item['tags']) == list:
    item['item_tags'] = set([lower(t['term']) for t in item['tags']])
  else:
    item['item_tags'] = []
  ########################################################################
  # map unicode
  for key in ['title', 'link', 'created', 'modified', 'author', 'content']:
    if type(item.get(key)) == unicode:
      item[key] = item[key].encode('ascii', 'xmlcharrefreplace')
  # hash the content as the GUID if required
  if item['id'] == 'HASH_CONTENT':
    item['id'] = hashlib.md5(item['title'] + item['content']).hexdigest()
def normalize(item, f, run_filters=True):
    # get rid of RDF lossage...
    for key in [
            'title', 'link', 'created', 'modified', 'author', 'content',
            'content_encoded', 'description'
    ]:
        if type(item.get(key)) == list:
            if len(item[key]) == 1:
                item[key] = item[key][0]
            else:
                candidate = [
                    i for i in item[key] if i.get('type') == 'text/html'
                ]
                if len(candidate) > 1 and key == 'content':
                    candidate = sorted(candidate,
                                       key=lambda i: len(i.get('value', '')),
                                       reverse=True)[:1]
                if len(candidate) == 1:
                    item[key] = candidate[0]
                else:
                    # XXX not really sure how to handle these cases
                    print >> param.log, 'E' * 16, 'ambiguous RDF', key, item[
                        key]
                    item[key] = item[key][0]
        if isinstance(item.get(key), dict) and 'value' in item[key]:
            item[key] = item[key]['value']
    ########################################################################
    # title
    if 'title' not in item or not item['title'].strip():
        item['title'] = 'Untitled'
    # XXX for debugging
    if type(item['title']) not in [str, unicode]:
        print >> param.log, 'TITLE' * 15
        import code
        from sys import exit
        code.interact(local=locals())
    item['title_lc'] = lower(item['title'])
    item['title_words_exact'] = get_words(item['title_lc'])
    item['title_words'] = stem(item['title_words_exact'])
    ########################################################################
    # link
    #
    # The RSS 2.0 specification allows items not to have a link if the entry
    # is complete in itself
    # that said this is almost always spurious, so we filter it below
    if 'link' not in item:
        item['link'] = f['channel']['link']
        # We have to be careful not to assign a default URL as the GUID
        # otherwise only one item will ever be recorded
        if 'id' not in item:
            item['id'] = 'HASH_CONTENT'
        item['RUNT'] = True
    if type(item['link']) == unicode:
        item['link'] = str(item['link'].encode('UTF-8'))
    if type(item['link']) != str:
        print >> param.log, 'LINK IS NOT str', repr(item['link'])
    # XXX special case handling for annoying Sun/Roller malformed entries
    if 'blog.sun.com' in item['link'] or 'blog.sun.com' in item['link']:
        item['link'] = item['link'].replace('blog.sun.com',
                                            'blogs.sun.com').replace(
                                                'blogs.sun.com/page',
                                                'blogs.sun.com/roller/page')
    ########################################################################
    # GUID
    if 'id' not in item:
        item['id'] = item['link']
    ########################################################################
    # creator
    if 'author' not in item or item['author'] == 'Unknown':
        item['author'] = 'Unknown'
        if 'author' in f['channel']:
            item['author'] = f['channel']['author']
    ########################################################################
    # created and modified dates
    if 'modified' not in item:
        item['modified'] = f['channel'].get('modified')
    # created - use modified if not available
    if 'created' not in item:
        if 'modified_parsed' in item:
            created = item['modified_parsed']
        else:
            created = None
    else:
        created = item['created_parsed']
    if not created:
        # XXX use HTTP last-modified date here
        created = time.gmtime()
        # feeds that do not have timestamps cannot be garbage-collected
        # XXX need to find a better heuristic, as high-volume sites such as
        # XXX The Guardian, CNET.com or Salon.com lack item-level timestamps
        f['oldest'] = '1970-01-01 00:00:00'
    created = fix_date(created)
    item['created'] = time.strftime(date_fmt, created)
    # keep track of the oldest item still in the feed file
    if 'oldest' not in f:
        f['oldest'] = '9999-99-99 99:99:99'
    if item['created'] < f['oldest']:
        f['oldest'] = item['created']
    # finish modified date
    if 'modified_parsed' in item and item['modified_parsed']:
        modified = fix_date(item['modified_parsed'])
        # add a fudge factor time window within which modifications are not
        # counted as such, 10 minutes here
        if not modified or abs(time.mktime(modified) -
                               time.mktime(created)) < 600:
            item['modified'] = None
        else:
            item['modified'] = time.strftime(date_fmt, modified)
    else:
        item['modified'] = None
    ########################################################################
    # content
    if 'content' in item:
        content = item['content']
    elif 'content_encoded' in item:
        content = item['content_encoded']
    elif 'description' in item:
        content = item['description']
    else:
        content = '<a href="' + item['link'] + '">' + item['title'] + '</a>'
    if not content:
        content = '<a href="' + item['link'] + '">' + item['title'] + '</a>'
    # strip embedded NULs as a defensive measure
    content = content.replace('\0', '')
    # apply ad filters and other degunking to content
    old_content = None
    while old_content != content:
        old_content = content
        try:
            for filter in transform.filter_list:
                content = filter.apply(content, f, item)
        except:
            util.print_stack(black_list=['item'])
    # balance tags like <b>...</b>
    content = balance(content)
    content_lc = lower(content)
    # the content might have invalid 8-bit characters.
    # Heuristic suggested by Georg Bauer
    if type(content) != unicode:
        try:
            content = content.decode('utf-8')
        except UnicodeError:
            content = content.decode('iso-8859-1')
    #
    item['content'] = content
    # we recalculate this as content may have changed due to tag rebalancing, etc
    item['content_lc'] = lower(content)
    item['content_words_exact'] = get_words(item['content_lc'])
    item['content_words'] = stem(item['content_words_exact'])
    item['union_lc'] = item['title_lc'] + '\n' + item['content_lc']
    item['union_words'] = item['title_words'].union(item['content_words'])
    item['urls'] = url_re.findall(content)
    ########################################################################
    # categories/tags
    # we used 'category' before, but 'category' and 'categories' are
    # intercepted by feedparser.FeedParserDict.__getitem__ and treated as
    # special case
    if 'tags' in item and type(item['tags']) == list:
        item['item_tags'] = set([lower(t['term']) for t in item['tags']])
    else:
        item['item_tags'] = []
    ########################################################################
    # map unicode
    for key in ['title', 'link', 'created', 'modified', 'author', 'content']:
        if type(item.get(key)) == unicode:
            item[key] = item[key].encode('ascii', 'xmlcharrefreplace')
    # hash the content as the GUID if required
    if item['id'] == 'HASH_CONTENT':
        item['id'] = hashlib.md5(item['title'] + item['content']).hexdigest()
def process_parsed_feed(db, c, f, feed_uid, feed_dupcheck=None, exempt=None):
  """Insert the entries from a feedparser parsed feed f in the database using
  the cursor c for feed feed_uid. Returns a tuple (number of items added
  unread, number of filtered items)"""
  num_added = 0
  num_filtered = 0
  filters.load_rules(db, c)
  # check if duplicate title checking is in effect
  if feed_dupcheck is None:
    c.execute("select feed_dupcheck from fm_feeds where feed_uid=?",
              [feed_uid])
    feed_dupcheck = bool(c.fetchone()[0])
  # check if the feed is exempt from filtering
  if exempt is None:
    c.execute("select feed_exempt from fm_feeds where feed_uid=?", [feed_uid])
    exempt = bool(c.fetchone()[0])
  # the Radio convention is reverse chronological order
  f['items'].reverse()
  for item in f['items']:
    try:
      normalize.normalize(item, f)
    except:
      util.print_stack()
      continue
    # evaluate the FilteringRules
    skip, rule = filters.evaluate_rules(item, f, feed_uid, exempt)
    filtered_by = None
    if skip:
      skip = -2
      if type(rule.uid) == int:
        filtered_by = rule.uid
      else:
        # XXX clunky convention for feed_rule, but that should disappear
        # XXX eventually
        filtered_by = 0
    title = item['title']
    link = item['link']
    guid = item['id']
    author = item['author']
    created = item['created']
    modified = item['modified']
    if not modified:
      modified = None
    content = item['content']
    # check if the item already exists, using the GUID as key
    # but cache all seen GUIDs in a dictionary first, since most articles are
    # existing ones and we can save a database query this way
    if feed_uid in feed_guid_cache and guid in feed_guid_cache[feed_uid]:
      # existing entry and we've seen it before in this process instance
      # update the time stamp to prevent premature garbage-collection
      # in prune_feed_guid_cache
      feed_guid_cache.setdefault(feed_uid, dict())[guid] = time.time()
      continue
    else:
      feed_guid_cache.setdefault(feed_uid, dict())[guid] = time.time()
    # not seen yet, it may or may not be a duplicate, we have to find out the
    # hard way
    c.execute("""select item_uid, item_link, item_loaded, item_created,
    item_modified, item_md5hex, item_title, item_content, item_creator
    from fm_items where item_feed_uid=? and item_guid=?""", [feed_uid, guid])
    l = c.fetchall()
    # unknown GUID, but title/link duplicate checking may be in effect
    if not l:
      if feed_dupcheck:
        c.execute("""select count(*) from fm_items
        where item_feed_uid=? and (item_title=? or item_link=?)""",
                  [feed_uid, title, link])
        l = bool(c.fetchone()[0])
        if l:
          print >> param.activity, 'DUPLICATE TITLE', title
      # XXX Runt items (see normalize.py) are almost always spurious, we just
      # XXX skip them, although we may revisit this decision in the future
      if not l and item.get('RUNT', False):
        print >> param.activity, 'RUNT ITEM', item
        l = True
    # GUID already exists, this is a change
    else:
      assert len(l) == 1
      (item_uid, item_link, item_loaded, item_created, item_modified,
       item_md5hex, item_title, item_content, item_creator) = l[0]
      # if this is a feed without timestamps, use our timestamp to determine
      # the oldest item in the feed XML file
      if 'oldest' in f and f['oldest'] == '1970-01-01 00:00:00':
        if 'oldest_ts' not in f:
          f['oldest_ts'] = item_created
        else:
          f['oldest_ts'] = min(f['oldest_ts'], item_created)
      # XXX update item here
      # XXX update tags if required
    # GUID doesn't exist yet, insert it
    if not l:
      # finally, dereference the URL to get rid of annoying tracking servers
      # like feedburner, but only do this once to avoid wasting bandwidth
      link = normalize.dereference(link)
      try:
        c.execute("""insert into fm_items (item_feed_uid, item_guid,
        item_created, item_modified, item_link, item_md5hex, item_title,
        item_content, item_creator, item_rating, item_rule_uid) values
        (?, ?, julianday(?), julianday(?), ?, ?, ?, ?, ?, ?, ?)""",
                  [feed_uid, guid, created, modified, link,
                   hashlib.md5(content).hexdigest(), title, content, author,
                   skip, filtered_by])
        # if we have tags, insert them
        # note: feedparser.py handles 'category' as a special case, so we
        # need to work around that to get to the data
        if item['item_tags']:
          c.execute("""select item_uid from fm_items
          where item_feed_uid=? and item_guid=?""", [feed_uid, guid])
          item_uid = c.fetchone()[0]
          for tag in item['item_tags']:
            c.execute("""insert or ignore into fm_tags (tag_name, tag_item_uid)
            values (?, ?)""", [tag, item_uid])
        if skip:
          num_filtered += 1
          print >> param.activity, 'SKIP', title, rule
        else:
          num_added += 1
          print >> param.activity, ' ' * 4, title
      except:
        util.print_stack(['c', 'f'])
        continue
  # update timestamp of the oldest item still in the feed file
  if 'oldest' in f and f['oldest'] != '9999-99-99 99:99:99':
    if f['oldest'] == '1970-01-01 00:00:00' and 'oldest_ts' in f:
      c.execute("update fm_feeds set feed_oldest=? where feed_uid=?",
                [f['oldest_ts'], feed_uid])
    else:
      c.execute("""update fm_feeds set feed_oldest=julianday(?)
      where feed_uid=?""", [f['oldest'], feed_uid])
  return (num_added, num_filtered)
    return url
  except (urllib2.URLError, ValueError, socket.error):
    return url
  except Redirect, e:
    # break a redirection loop if it occurs
    if e.url in seen:
      return url
    # some servers redirect to Unicode URLs, which are not legal
    try:
      unicode(e.url)
    except UnicodeDecodeError:
      return url
    # there might be several levels of redirection
    return dereference(e.url, seen, level + 1)
  except:
    util.print_stack()
    return url

url_re = re.compile('(?:href|src)="([^"]*)"', re.IGNORECASE)

def normalize(item, f, run_filters=True):
  # get rid of RDF lossage...
  for key in ['title', 'link', 'created', 'modified', 'author', 'content',
              'content_encoded', 'description']:
    if type(item.get(key)) == list:
      if len(item[key]) == 1:
        item[key] = item[key][0]
      else:
        candidate = [i for i in item[key] if i.get('type') == 'text/html']
        if len(candidate) > 1 and key == 'content':
          candidate = sorted(candidate,
def add_feed(feed_xml):
  """Try to add a feed. Returns a tuple
  (feed_uid, feed_title, num_added, num_filtered)"""
  from singleton import db
  c = db.cursor()
  feed_xml = feed_xml.replace('feed://', 'http://')
  try:
    # verify the feed
    f = feedparser.parse(feed_xml)
    # CVS versions of feedparser are not throwing exceptions as they should
    # see:
    # http://sourceforge.net/tracker/index.php?func=detail&aid=1379172&group_id=112328&atid=661937
    if not f.feed or ('link' not in f.feed or 'title' not in f.feed):
      # some feeds have multiple links, one for self and one for PuSH
      if f.feed and 'link' not in f.feed and 'links' in f.feed:
        try:
          for l in f.feed['links']:
            if l['rel'] == 'self':
              f.feed['link'] = l['href']
        except KeyError:
          pass
    if not f.feed or ('link' not in f.feed or 'title' not in f.feed):
      # try autodiscovery
      try:
        feed_xml = AutoDiscoveryHandler().feed_url(feed_xml)
      except HTMLParser.HTMLParseError:
        # in desperate conditions, regexps ride to the rescue
        try:
          feed_xml = re_autodiscovery(feed_xml)[0][1]
        except:
          util.print_stack()
          raise AutodiscoveryParseError
      if not feed_xml:
        raise ParseError
      f = feedparser.parse(feed_xml)
      if not f.feed:
        raise ParseError
    # we have a valid feed, normalize it
    normalize.normalize_feed(f)
    feed = {
      'xmlUrl': f['url'],
      'htmlUrl': str(f.feed['link']),
      'etag': f.get('etag'),
      'title': f.feed['title'].encode('ascii', 'xmlcharrefreplace'),
      'desc': f.feed['description'].encode('ascii', 'xmlcharrefreplace')
    }
    for key, value in feed.items():
      if type(value) == str:
        feed[key] = value
    filters.load_rules(db, c)
    try:
      c.execute("""insert into fm_feeds (feed_xml, feed_etag, feed_html,
      feed_title, feed_desc) values
      (:xmlUrl, :etag, :htmlUrl, :title, :desc)""", feed)
      feed_uid = c.lastrowid
      num_added, num_filtered = process_parsed_feed(db, c, f, feed_uid)
      db.commit()
      return feed_uid, feed['title'], num_added, num_filtered
    except sqlite.IntegrityError, e:
      if 'feed_xml' in str(e):
        db.rollback()
        raise FeedAlreadyExists
      else:
        db.rollback()
        raise UnknownError(str(e))
  finally:
    c.close()