import os, re, json, types, random

from twisted.internet import defer, reactor

# Project-internal names used below (BCRelay, HTTPClient, HTTPClientError,
#  RelayedEvent, FeedEntryInfo, PostHashDB, force_unicode, force_bytes,
#  unescape, log) are assumed to be imported from the surrounding project modules.


class Shortener(BCRelay):
    'Relay that replaces URLs in passing messages with shortened versions.'

    def __init__(self, *argz, **kwz):
        super(Shortener, self).__init__(*argz, **kwz)
        log.noise('Compiling regex: {!r}'.format(self.conf.regex))
        self.regex = re.compile(self.conf.regex)
        self.client = HTTPClient()

    @defer.inlineCallbacks
    def dispatch(self, msg):
        # Splice the shortened link back in place of the matched "url" group.
        match = self.regex.search(msg)
        if not match: defer.returnValue(msg)
        msg = u''.join(map(force_unicode, [
            msg[:match.start('url')],
            (yield self.shorten(match.group('url'))),
            msg[match.end('url'):] ]))
        defer.returnValue(msg)

    @defer.inlineCallbacks
    def shorten(self, url):
        # Pick the configured backend (shorten_<type>) for URLs that are long
        #  enough to bother with, then strip the scheme from the result.
        url = force_bytes(url)
        if len(url) >= self.conf.length_min:
            try: func = getattr(self, 'shorten_{}'.format(self.conf.api.type))
            except AttributeError:
                raise ValueError( 'URL shortener "{}"'
                    ' is not supported'.format(self.conf.api.type) )
            url = yield defer.maybeDeferred(func, url, self.conf.api.parameters)
        defer.returnValue(force_unicode(re.sub(r'(?i)^(https?|spdy)://', '', url)))

    def shorten_clean(self, url, params):
        # No-op backend - URL is passed through unchanged (scheme still gets stripped).
        return url

    def shorten_cut(self, url, params):
        # Naive backend - truncate the URL to the configured length (50 by default).
        return url[:(params or 50)]

    @defer.inlineCallbacks
    def shorten_m29(self, url, params):
        # m29.us shortener, based on https://github.com/netd/m29_python
        import Crypto.Cipher.AES # pycrypto
        key1, key2 = os.urandom(8), os.urandom(8)
        pad = lambda s: s + (16 - len(s) % 16) * '\0'
        encrypted = Crypto.Cipher.AES\
            .new(key1 + key2, Crypto.Cipher.AES.MODE_ECB)\
            .encrypt(pad(url))
        base64 = lambda url: url.encode('base64')\
            .strip().replace('+', '-').replace('/', '_').replace('=', '')
        data, headers = yield self.client.request(
            'http://api.m29.us/urlshortener/v1/url', 'post',
            encode='json', decode='json',
            data=dict( firstKey=base64(key1),
                longUrlEncrypted=base64(encrypted) ) )
        defer.returnValue(data['id'] + '/' + base64(key2))
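
# Usage note (illustrative, not taken from the project config): the "regex"
#  option is expected to define a named "url" group, since dispatch() relies on
#  match.group('url') / match.start('url') / match.end('url') to splice the
#  shortened link back into the message. A minimal sketch of that splicing with
#  a made-up pattern and message, roughly equivalent to the "cut" backend:
#
#    import re
#    regex = re.compile(r'(?P<url>https?://\S+)')
#    msg = 'see https://example.com/some/very/long/path?with=query for details'
#    match = regex.search(msg)
#    if match:
#        short = match.group('url')[:50]  # what shorten_cut() does by default
#        msg = msg[:match.start('url')] + short + msg[match.end('url'):]
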
class FeedSyndication(BCRelay):
    'Relay that polls RSS/Atom feeds (or reddit JSON listings) and dispatches new posts as events.'

    feeds = None

    def __init__(self, *argz, **kwz):
        super(FeedSyndication, self).__init__(*argz, **kwz)
        self.client = HTTPClient(
            use_cache_headers=self.conf.use_cache_headers,
            request_pool_options=self.conf.request_pool_options,
            ca_certs_files=self.conf.ca_certs_files,
            user_agent=self.conf.user_agent,
            hide_connection_errors=self.conf.hide_connection_errors )

        # Per-feed options, rebased on the "_default" config section.
        self.feeds = dict()
        base = self.conf.feeds.pop('_default')
        for url, opts in self.conf.feeds.viewitems():
            opts.rebase(base)
            if isinstance(opts.template, types.StringTypes): opts.template = [opts.template]
            opts.template = map(force_unicode, opts.template)
            assert opts.type in ['feed', 'reddit-json'],\
                'Feed type must be either "feed" or "reddit-json", not {!r}'.format(opts.type)
            self.feeds[url] = opts
            self.schedule_fetch(url, fast=opts.interval.fetch_on_startup)

        self.filter_db = PostHashDB(
            self.conf.deduplication_cache.path,
            self.conf.deduplication_cache.keep )

    def schedule_fetch(self, url, fast=False):
        # Next fetch is delayed by interval.base +/- random jitter;
        #  "fast" scheduling (startup, errors) waits for the jitter delay only.
        interval = self.feeds[url].interval
        jitter = interval.jitter * interval.base * random.random()
        interval = jitter if fast else (interval.base + (jitter * random.choice([-1, 1])))
        log.noise('Scheduling fetch for feed (url: {}) in {}s'.format(url, interval))
        reactor.callLater(interval, self.fetch_feed, url)

    @defer.inlineCallbacks
    def fetch_feed(self, url):
        feed_type = self.feeds[url].type

        err = None
        try: data = yield self.client.request(url)
        except HTTPClientError as err:
            log.warn('Failed to fetch feed ({}): {}'.format(url, err))
            data = None
        finally: self.schedule_fetch(url, fast=bool(err)) # do faster re-fetch on errors
        if data is None: defer.returnValue(None) # cache hit, not modified, error
        data, headers = data

        # Parse the response into a feed object and a list of posts.
        if feed_type == 'feed':
            import feedparser
            parser = feedparser.parse(data, response_headers=headers)
            feed, posts = parser.feed, parser.entries
        elif feed_type == 'reddit-json':
            from lya import AttrDict # mandatory dep anyway
            data = json.loads(data)['data']
            posts = list(AttrDict(post['data']) for post in data.pop('children'))
            feed = AttrDict(data)
        else:
            raise ValueError('Unrecognized feed type: {!r}'.format(feed_type))

        count = 0
        for post in reversed(posts): # process in reverse order, newest posts dispatched last
            if feed_type == 'reddit-json':
                # Some reddit-api-specific encoding hacks
                try: title = unescape(post['title'])
                except KeyError: pass
                else: post.title = title

            # Skip posts that were already relayed, as per configured deduplication keys.
            post_obj = FeedEntryInfo(feed, post, self.conf)
            post_id = list(
                force_bytes(post_obj.get_by_path(attr))
                for attr in self.feeds[url].deduplication )
            if not self.filter_db.add(url, post_id): continue

            # Try templates in order, dispatching the first one that formats cleanly.
            first_err = None
            for template in self.feeds[url].template:
                try: event = template.format(**post_obj._asdict())
                except (KeyError, IndexError, AttributeError) as err:
                    if not first_err:
                        first_err = ValueError(
                            'Failed to format template {!r} (data: {}): {}'\
                                .format(template, post_obj, err) )
                    continue
                event = RelayedEvent(event)
                event.data = post_obj # for any further tricky filtering
                reactor.callLater(0, self.interface.dispatch, event, source=self)
                break
            else: raise first_err # all templates failed

            count += 1
            if self.feeds[url].process_max and count >= self.feeds[url].process_max: break
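
# Hypothetical configuration sketch (YAML-ish, inferred only from the attributes
#  this class reads - opts.type, opts.template, opts.interval.*, opts.deduplication,
#  opts.process_max; actual key names and defaults live in the project config):
#
#    feeds:
#      _default:
#        type: feed
#        interval: {base: 1800, jitter: 0.2, fetch_on_startup: true}
#        deduplication: [post.link]
#        process_max: 10
#      'http://example.com/feed.atom':
#        template: ['{feed.title}: {post.title} - {post.link}']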