def dispatch(self, msg):
    match = self.regex.search(msg)
    if not match:
        log.debug('Failed to match snort rule-sid in msg: {!r}'.format(msg))
        return msg
    sid = match.group('sid')

    # Optionally drop alerts from ignored generator-ids (the gid group may be absent)
    if self.gid_ignore:
        try:
            gid = match.group('gid')
        except IndexError:
            pass
        else:
            if gid in self.gid_ignore:
                return msg

    # Periodically check whether the sid-db needs a rebuild/reopen,
    # based on mtimes of its source files
    ts = time()
    if self.sid_db_ts < ts - self.conf.sid_db_mtime_check_interval:
        if not os.path.exists(self.conf.paths.sid_db)\
                or max(0, *(
                    os.stat(p).st_mtime
                    for p in [self.conf.paths.sid_src, self.conf.paths.refs]
                    if os.path.exists(p) )) > os.stat(self.conf.paths.sid_db).st_mtime:
            self.update_sid_db()
        self.sid_db = anydbm.open(self.conf.paths.sid_db)

    try:
        ref = force_unicode(self.sid_db[force_bytes(sid)])
    except KeyError:
        log.info('Failed to find refs for sid: {!r} (msg: {!r})'.format(sid, msg))
    else:
        msg += u'\n refs: {}'.format(ref)
    return msg
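# Illustrative sketch only: the actual self.regex comes from configuration and is not
# shown in this snippet. Snort alert messages typically carry a "[gid:sid:rev]" triplet,
# so a hypothetical pattern like the one below would provide the named groups that
# dispatch() expects ('sid' required, 'gid' optional):
#
#   re.compile(r'\[(?P<gid>\d+):(?P<sid>\d+):\d+\]')
#
# e.g. a message containing "[1:2019401:2]" would yield gid='1', sid='2019401'.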
def shorten(self, url):
    url = force_bytes(url)
    if len(url) >= self.conf.length_min:
        # Pick the shortener backend (a shorten_<type> method) from configuration
        try:
            func = getattr(self, 'shorten_{}'.format(self.conf.api.type))
        except AttributeError:
            raise ValueError(
                'URL shortener "{}" is not supported'.format(self.conf.api.type) )
        url = yield defer.maybeDeferred(func, url, self.conf.api.parameters)
    # Drop the scheme prefix from the result
    defer.returnValue(force_unicode(re.sub(r'^(?i)(https?|spdy)://', '', url)))
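# A minimal sketch of the backend contract that shorten() dispatches to: a method named
# shorten_<conf.api.type> taking (url, params) and returning either a string or a
# Deferred (both are handled by defer.maybeDeferred above). The "shorten_clean" name and
# its behavior are hypothetical, shown here only to illustrate that interface.
#
# def shorten_clean(self, url, params):
#     # Strip query string and fragment instead of calling an external service
#     return url.split('?', 1)[0].split('#', 1)[0]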
def fetch_feed(self, url):
    feed_type = self.feeds[url].type
    err = None
    try:
        data = yield self.client.request(url)
    except HTTPClientError as err:
        log.warn('Failed to fetch feed ({}): {}'.format(url, err))
        data = None
    finally:
        self.schedule_fetch(url, fast=bool(err)) # do faster re-fetch on errors
    if data is None:
        defer.returnValue(None) # cache hit, not modified, error
    data, headers = data

    if feed_type == 'feed':
        import feedparser
        parser = feedparser.parse(data, response_headers=headers)
        feed, posts = parser.feed, parser.entries
    elif feed_type == 'reddit-json':
        from lya import AttrDict # mandatory dep anyway
        data = json.loads(data)['data']
        posts = list(AttrDict(post['data']) for post in data.pop('children'))
        feed = AttrDict(data)
    else:
        raise ValueError('Unrecognized feed type: {!r}'.format(self.feeds[url].type))

    count = 0
    for post in reversed(posts):
        if feed_type == 'reddit-json':
            # Some reddit-api-specific encoding hacks
            try:
                title = unescape(post['title'])
            except KeyError:
                pass
            else:
                post.title = title

        post_obj = FeedEntryInfo(feed, post, self.conf)

        post_id = list(
            force_bytes(post_obj.get_by_path(attr))
            for attr in self.feeds[url].deduplication )
        if not self.filter_db.add(url, post_id):
            continue

        first_err = None
        for template in self.feeds[url].template:
            try:
                event = template.format(**post_obj._asdict())
            except (KeyError, IndexError, AttributeError) as err:
                if not first_err:
                    first_err = ValueError(
                        'Failed to format template {!r} (data: {}): {}'\
                            .format(template, post_obj, err) )
                continue
            event = RelayedEvent(event)
            event.data = post_obj # for any further tricky filtering
            reactor.callLater(0, self.interface.dispatch, event, source=self)
            break
        else:
            raise first_err # all templates failed

        count += 1
        if self.feeds[url].process_max and count >= self.feeds[url].process_max:
            break
def name_from_patch_link( self, link,
        _re_path=re.compile(r'\bpackages/[\w\-]+/(?P<name>[\w\-]+)/') ):
    names = set()
    try:
        page = yield getPage(force_bytes(link), timeout=120)
    except Exception as err:
        log.warn('Failed to download patch: {}'.format(err))
        defer.returnValue(None)
    page = it.imap(op.methodcaller('strip'), page.splitlines())
    for line in page:
        if re.search( r'^\s*(\S+\s+\|\s+\d+\s+[\-+]*\s*$'
                r'|rename |diff --git |[\-+]{3} )', line ):
            line = _re_path.search(line)
            if line:
                names.add(line.group('name'))
    defer.returnValue(names)
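# For reference, the line filter above is meant to match diffstat and unified-diff
# header lines in the fetched patch, e.g. (paths here are made-up examples):
#
#   packages/net-misc/openssh/some-fix.patch |  14 ++++++-------
#   diff --git a/packages/sys-apps/foo/bar.c b/packages/sys-apps/foo/bar.c
#   --- a/packages/sys-apps/foo/bar.c
#
# from which _re_path then extracts the package name ("openssh", "foo") out of the
# "packages/<category>/<name>/" path component.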
def hash(self, val):
    if not isinstance(val, types.StringTypes):
        val = '\0'.join(val)
    val = force_bytes(val)
    return hashlib.sha256(val).digest()
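# Usage sketch (assumed, based on how deduplication keys are built in fetch_feed above):
# a non-string iterable is NUL-joined before hashing, so a key list and the equivalent
# pre-joined string produce the same 32-byte sha256 digest:
#
#   self.hash(['post title', 'http://example.com/link'])
#   self.hash('post title\0http://example.com/link')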
def request(self, url, method='get', decode=None, encode=None, data=None):
    method, url = force_bytes(method).upper(), force_bytes(url)
    headers = {'User-Agent': self.user_agent}

    if method == 'GET' and self.use_cache_headers:
        # Avoid doing extra work
        cache = self.fetch_cache.get(url, dict())
        if 'cache-control' in cache and cache['cache-control'] >= time.time():
            defer.returnValue(None) # no need to re-process same thing
        if 'last-modified' in cache:
            headers['If-Modified-Since'] = rfc822date(cache['last-modified'])
        if 'etag' in cache:
            headers['If-None-Match'] = '"{}"'.format(cache['etag'])

    log.noise(
        'HTTP request: {} {} (h: {}, enc: {}, dec: {}, data: {!r})'\
            .format(method, url[:100], headers, encode, decode, type(data)) )

    if data is not None:
        if encode is None:
            if isinstance(data, types.StringTypes):
                data = io.BytesIO(data)
        elif encode == 'form':
            headers.setdefault('Content-Type', 'application/x-www-form-urlencoded')
            data = io.BytesIO(urlencode(data))
        elif encode == 'json':
            headers.setdefault('Content-Type', 'application/json')
            data = io.BytesIO(json.dumps(data))
        else:
            raise ValueError('Unknown request encoding: {}'.format(encode))
        data_raw, data = data, FileBodyProducer(data)
    else:
        data_raw = None

    if decode not in ['json', None]:
        raise ValueError('Unknown response decoding method: {}'.format(decode))

    requests = None # set on fallback to the requests module (e.g. for an ipv6-only site)
    err = None
    try:
        res = yield self.request_agent.request(
            method, url,
            Headers(dict((k, [v]) for k, v in (headers or dict()).viewitems())), data )
    except error.DNSLookupError:
        import requests, socket
        try:
            res = yield self.sync_wrap(
                getattr(requests, method.lower()), url, headers=headers, data=data_raw )
        except (socket.error, SyncTimeout, requests.exceptions.RequestException) as err:
            pass
    except (RequestTransmissionFailed, RequestNotSent, ResponseFailed) as err:
        pass

    if err:
        if not self.hide_connection_errors:
            raise HTTPClientError(None, 'Lookup/connection error: {}'.format(err))
        else:
            log.debug('Lookup/connection error (suppressed): {}'.format(err))
            defer.returnValue(None) # should also suppress fast refetching

    code, phrase, version = (res.code, res.phrase, res.version)\
        if not requests else (
            res.status_code, http.RESPONSES[res.status_code], ('HTTP', 1, 1) )
    log.noise(
        'HTTP request done ({} {}): {} {} {}'\
            .format(method, url[:100], code, phrase, version) )

    if code in [http.NO_CONTENT, http.NOT_MODIFIED]:
        defer.returnValue(None)
    if code not in [http.OK, http.CREATED]:
        raise HTTPClientError(code, phrase)

    if not requests:
        data = defer.Deferred()
        res.deliverBody(DataReceiver(data))
        data = yield data
        headers = dict((k, v[-1]) for k, v in res.headers.getAllRawHeaders())
    else:
        try:
            data = yield self.sync_wrap(getattr, res, 'text')
            headers = yield self.sync_wrap(getattr, res, 'headers')
        except (requests.exceptions.RequestException, SyncTimeout) as err:
            raise HTTPClientError(None, 'Sync connection error: {}'.format(err))

    if method == 'GET' and self.use_cache_headers:
        # Update headers' cache
        cache = dict((k.lower(), v) for k, v in headers.items())
        cache = dict(
            (k, cache[k]) for k in ['last-modified', 'cache-control', 'etag']
            if k in cache )
        if 'last-modified' in cache:
            ts = rfc822.parsedate_tz(cache['last-modified'])
            cache['last-modified'] = time.mktime(ts[:9]) + (ts[9] or 0)
        if 'cache-control' in cache:
            match = re.search(r'\bmax-age=(\d+)\b', cache.pop('cache-control'))
            if match:
                cache['cache-control'] = time.time() + int(match.group(1))
        if cache:
            self.fetch_cache[url] = cache

    defer.returnValue((json.loads(data) if decode is not None else data, headers))
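# Usage sketch (assumed caller context - another inlineCallbacks-style generator, as in
# fetch_feed above): a successful request yields a (body, headers) tuple, while cache
# hits, 304s and suppressed connection errors yield None.
#
#   res = yield self.client.request(url, decode='json')
#   if res is not None:
#       data, headers = res  # body already passed through json.loads() here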