Exemplo n.º 1
0
	def fetch_feed(self, url):
		'''Fetch feed at `url` and dispatch an event for each new post in it.

		Generator that yields whatever self.client.request() returns and gets
		its result sent back - presumably driven by twisted inlineCallbacks
		(decorator is not visible here, TODO confirm).

		Next fetch gets scheduled as soon as request is over, a faster one if
		it has failed. Posts are processed in reversed order: de-duplicated via
		self.filter_db, formatted with the first template of the feed that
		works, and passed to self.interface.dispatch through the reactor.
		At most "process_max" new posts get dispatched per call, if that is set.

		Raises:
			ValueError: feed type is not recognized, or none of the feed
				templates (including the case of an empty list) can format a post.
		'''
		feed_conf = self.feeds[url]
		feed_type = feed_conf.type

		# Plain flag is used instead of checking the "except ... as err" name
		#  in "finally", as that name gets unbound after the handler on python3
		fetch_failed = False
		try: data = yield self.client.request(url)
		except HTTPClientError as err:
			fetch_failed = True
			log.warn('Failed to fetch feed ({}): {}'.format(url, err))
			data = None
		finally: self.schedule_fetch(url, fast=fetch_failed) # do faster re-fetch on errors

		if data is None: defer.returnValue(None) # cache hit, not modified, error
		data, headers = data

		if feed_type == 'feed':
			import feedparser
			parser = feedparser.parse(data, response_headers=headers)
			feed, posts = parser.feed, parser.entries
		elif feed_type == 'reddit-json':
			from lya import AttrDict # mandatory dep anyway
			data = json.loads(data)['data']
			posts = [AttrDict(child['data']) for child in data.pop('children')]
			feed = AttrDict(data)
		else:
			raise ValueError('Unrecognized feed type: {!r}'.format(feed_type))

		count = 0
		for post in reversed(posts):
			if feed_type == 'reddit-json':
				# Some reddit-api-specific encoding hacks
				try: title = unescape(post['title'])
				except KeyError: pass
				else: post.title = title

			post_obj = FeedEntryInfo(feed, post, self.conf)

			# Values of configured attributes identify the post in filter_db
			post_id = [
				force_bytes(post_obj.get_by_path(attr))
				for attr in feed_conf.deduplication ]
			if not self.filter_db.add(url, post_id): continue # falsy result - skip the post

			first_err = None
			for template in feed_conf.template:
				try: event = template.format(**post_obj._asdict())
				except (KeyError, IndexError, AttributeError) as err:
					if not first_err:
						first_err = ValueError(
							'Failed to format template {!r} (data: {}): {}'\
							.format(template, post_obj, err) )
					continue
				event = RelayedEvent(event)
				event.data = post_obj # for any further tricky filtering
				reactor.callLater(0, self.interface.dispatch, event, source=self)
				break
			else: # all templates failed
				# first_err stays None if there were no templates to try at all,
				#  and "raise None" would only produce an unrelated TypeError
				raise first_err or ValueError(
					'No templates to format feed entry (feed: {}, data: {})'.format(url, post_obj) )

			count += 1
			if feed_conf.process_max and count >= feed_conf.process_max: break
Exemplo n.º 2
0
	def dispatch(self, msg):
		msg_etree = self.lxml_soup(msg)
		if not self.conf.process_links or self.conf.process_links.enabled:
			for tag in msg_etree.iter(tag='a'):
				try:
					assert tag.text
					if self.conf.process_links.detect_hashtags:
						parent = tag.getparent()
						assert not parent.tag == 'span' and parent.get('class') == 'tag'
					if self.conf.process_links.detect_inlined_urls:
						link = tag.get('href')
						assert not op.eq(*it.imap(ft.partial(re.sub, ur'^https?://', ''), [link, tag.text]))
				except AssertionError: pass
				else: tag.text = u'{} <{}>'.format(tag.text, tag.attrib['href'])
				tag.drop_tag()
		msg_new = msg_etree.text_content()
		if isinstance(msg, RelayedEvent) and hasattr(msg, 'data'):
			msg_new = RelayedEvent(msg_new)
			msg_new.data = msg.data
		return msg_new
Exemplo n.º 3
0
	def handle_line(self, line, path):
		'''Relay one `line` as an event, tagged with the path it came from.

		Line is logged, converted with force_unicode() and wrapped into
		RelayedEvent, which gets "data" with path=path.path attached and is
		then passed to self.interface.dispatch via reactor.callLater(0, ...),
		with this object as its source.
		'''
		log.noise('New line: {!r}'.format(line))
		text = force_unicode(line)
		origin = AttrDict(path=path.path)
		relayed = RelayedEvent(text)
		relayed.data = origin
		reactor.callLater(0, self.interface.dispatch, relayed, source=self)
Exemplo n.º 4
0
	def dispatch(self, msg):
		if not msg.strip(): return

		## Event lines are cached until EOE msg is encountered
		match = self._re_base.search(msg)
		if not match:
			log.warn('Failed to match audit event spec: {!r}'.format(msg))
			return

		node, ev_id, ev_type, msg = (match.group(k) for k in ['node', 'ev_id', 'type', 'msg'])
		ev_key = node, ev_id

		if ev_key not in self._ev_cache:
			self._ev_cache[ev_key] = defaultdict(list)
			self._ev_cache[ev_key].update(ts=time.time(), node=node, ev_id=ev_id)
			self._ev_cache_gc()
		ev = self._ev_cache[ev_key]

		if ev_type != 'EOE': # cache event data
			ev[ev_type].append(msg)
			return
		del self._ev_cache[ev_key]

		## Get "key" value for event, if present
		ev_key = None
		try: syscall, = ev['SYSCALL'] # currently handled events always have it
		except ValueError: pass
		else:
			try: ev_key = self.get_msg_val(syscall, 'key', ur'"(?P<val>[^"]+)"')
			except KeyError as err:
				log.noise('Failed to get ev_key from syscall: {}'.format(err))
		if not ev_key:
			log.noise('Unhandled event: {!r}'.format(ev))
			return

		## Processing

		if ev_key in self.conf.events.watches.ev_keys:
			# Extract all necessary attributes
			ev_vals = dict(node=ev['node'], ev_id=ev['ev_id'], key=ev_key)
			for k in it.imap(''.join, it.product(['', 'e', 's', 'fs'], ['uid', 'gid'])):
				ev_vals[k] = self.get_msg_val(syscall, k)
			for k in 'comm', 'exe':
				ev_vals[k] = self.get_msg_val(syscall, k, ur'"(?P<val>[^"]+)"')
			ev_vals['tty'] = self.get_msg_val(syscall, 'tty', '(?P<val>\S+)')
			paths = ev_vals['paths'] = list()
			for msg in ev['PATH']:
				path = self.get_msg_val(msg, 'name', ur'(?P<val>"[^"]+"|\(null\)|[0-9A-F]+)')
				paths.append(dict( path=path,
					inode=self.get_msg_val(msg, 'inode', fallback='nil'),
					dev=self.get_msg_val(msg, 'dev', '(?P<val>[a-f\d]{2}:[a-f\d]{2})', fallback='nil') ))

			# Formatting
			err, tpl = None, force_unicode(self.conf.events.watches.template_path)
			ev_vals['paths'] = list()
			for val in paths:
				try: ev_vals['paths'].append(tpl.format(**val))
				except self._lookup_error as err: break
			if not err:
				ev_vals['paths'] = ', '.join(ev_vals['paths'])
				tpl, val = force_unicode(self.conf.events.watches.template), ev_vals
				try: event = tpl.format(**val)
				except self._lookup_error as err: pass
				event = RelayedEvent(event)
				event.data = ev_vals
				return event
			raise ValueError( 'Failed to format template {!r} (data: {}): {}'.format(tpl, val, err))