class EventLogSelectionDownstreamMixin(object):
    """Parameter bundle forwarded upstream to tasks built on EventLogSelectionMixin.

    The class only declares luigi parameters; it carries no behavior of its
    own.  ``source``, ``expand_interval`` and ``pattern`` read their values
    from the ``event-logs`` configuration section.
    """

    # Location(s) of the raw event logs; declared as a list parameter.
    source = luigi.Parameter(
        is_list=True,
        config_path=dict(section='event-logs', name='source'),
        description=(
            'A URL to a path that contains log files that contain the events. (e.g., s3://my_bucket/foo/).'
        ),
    )

    # Date range selecting which logs are exported.
    interval = luigi.DateIntervalParameter(
        description='The range of dates to export logs for.',
    )

    # Padding applied on both sides of ``interval`` when picking files.
    expand_interval = luigi.TimeDeltaParameter(
        config_path=dict(section='event-logs', name='expand_interval'),
        description=(
            'A time interval to add to the beginning and end of the interval to expand the windows of '
            'files captured.'
        ),
    )

    # Regular expression(s) with a named date group, matched against file names.
    pattern = luigi.Parameter(
        is_list=True,
        config_path=dict(section='event-logs', name='pattern'),
        description=(
            'A regex with a named capture group for the date that approximates the date that the events '
            'within were emitted. Note that the search interval is expanded, so events don\'t have to be in exactly '
            'the right file in order for them to be processed.'
        ),
    )

    # strftime-style format of the date captured by ``pattern``.
    date_pattern = luigi.Parameter(
        default='%Y%m%d',
        description=(
            'The format of the date as it appears in the source file name. Note that this correlates with the '
            'named capture group for date in the pattern parameter. This is intended to select relevant event log files '
            'by making sure the date is within the interval.'
        ),
    )
def testTimeDelta8601Weeks(self):
    """The parameter configured at foo/bar should resolve to five weeks.

    The configuration fixture that supplies the raw value is not part of
    this snippet.
    """
    param = luigi.TimeDeltaParameter(
        config_path={"section": "foo", "name": "bar"},
    )
    expected = timedelta(weeks=5)
    self.assertEqual(expected, param.value)
def testTimeDelta8601NoTimeComponent(self):
    """The parameter configured at foo/bar should resolve to five days.

    The configuration fixture that supplies the raw value is not part of
    this snippet.
    """
    param = luigi.TimeDeltaParameter(
        config_path={"section": "foo", "name": "bar"},
    )
    expected = timedelta(days=5)
    self.assertEqual(expected, param.value)
def testTimeDelta8601(self):
    """The parameter configured at foo/bar should resolve to 4d 12h 30m 5s.

    The configuration fixture that supplies the raw value is not part of
    this snippet.
    """
    param = luigi.TimeDeltaParameter(
        config_path={"section": "foo", "name": "bar"},
    )
    expected = timedelta(days=4, hours=12, minutes=30, seconds=5)
    self.assertEqual(expected, param.value)
def testTimeDeltaMultiple(self):
    """The parameter configured at foo/bar should resolve to 3w 4h 5m.

    The configuration fixture that supplies the raw value is not part of
    this snippet.
    """
    param = luigi.TimeDeltaParameter(
        config_path={"section": "foo", "name": "bar"},
    )
    expected = timedelta(weeks=3, hours=4, minutes=5)
    self.assertEqual(expected, param.value)
def testTimeDeltaPlural(self):
    """The parameter configured at foo/bar should resolve to two seconds.

    The configuration fixture that supplies the raw value is not part of
    this snippet.
    """
    param = luigi.TimeDeltaParameter(
        config_path={"section": "foo", "name": "bar"},
    )
    expected = timedelta(seconds=2)
    self.assertEqual(expected, param.value)
def testTimeDelta(self):
    """The parameter configured at foo/bar should resolve to one day.

    The value is obtained through the ``_value`` helper, which is defined
    outside this snippet, as is the configuration fixture.
    """
    param = luigi.TimeDeltaParameter(
        config_path={"section": "foo", "name": "bar"},
    )
    expected = timedelta(days=1)
    self.assertEqual(expected, _value(param))
def testTimeDelta8601MAfterT(self):
    """The default loaded from config foo/bar should equal six minutes.

    The configuration fixture that supplies the raw value is not part of
    this snippet.
    """
    p = luigi.TimeDeltaParameter(
        default_from_config=dict(section="foo", name="bar"))
    # assertEqual replaces the deprecated assertEquals alias, which newer
    # Python versions no longer provide.
    self.assertEqual(timedelta(minutes=6), p.default)
def testTimeDelta8601(self):
    """The default loaded from config foo/bar should equal 4d 12h 30m 5s.

    The configuration fixture that supplies the raw value is not part of
    this snippet.
    """
    p = luigi.TimeDeltaParameter(
        default_from_config=dict(section="foo", name="bar"))
    # assertEqual replaces the deprecated assertEquals alias, which newer
    # Python versions no longer provide.
    self.assertEqual(
        timedelta(days=4, hours=12, minutes=30, seconds=5), p.default)
def testTimeDelta8601Weeks(self):
    """The default loaded from config foo/bar should equal five weeks.

    The configuration fixture that supplies the raw value is not part of
    this snippet.
    """
    p = luigi.TimeDeltaParameter(
        default_from_config=dict(section="foo", name="bar"))
    # assertEqual replaces the deprecated assertEquals alias, which newer
    # Python versions no longer provide.
    self.assertEqual(timedelta(weeks=5), p.default)
def testTimeDeltaMultiple(self):
    """The default loaded from config foo/bar should equal 3w 4h 5m.

    The configuration fixture that supplies the raw value is not part of
    this snippet.
    """
    p = luigi.TimeDeltaParameter(
        default_from_config=dict(section="foo", name="bar"))
    # assertEqual replaces the deprecated assertEquals alias, which newer
    # Python versions no longer provide.
    self.assertEqual(timedelta(weeks=3, hours=4, minutes=5), p.default)
def testTimeDeltaPlural(self):
    """The default loaded from config foo/bar should equal two seconds.

    The configuration fixture that supplies the raw value is not part of
    this snippet.
    """
    p = luigi.TimeDeltaParameter(
        default_from_config=dict(section="foo", name="bar"))
    # assertEqual replaces the deprecated assertEquals alias, which newer
    # Python versions no longer provide.
    self.assertEqual(timedelta(seconds=2), p.default)
class ContentTask(Task):
    """Fetch the URL of every upstream message and store the bodies in HDF5.

    Each successfully fetched body is snappy-compressed and written as one
    dataset whose path is derived from a SHA-1 of the content; the message
    fields are attached as dataset attributes.  Processing stops early once
    ``limit`` has elapsed.
    """

    # Wall-clock budget for ``run``; a falsy value disables the check.
    limit = luigi.TimeDeltaParameter(default=settings.TIME_LIMIT)

    def requires(self):
        """Yield the upstream ``Task`` for the same ``dtstart``."""
        yield Task(dtstart=self.dtstart)

    def output(self):
        """Return the target for this task's ``.hdf`` output file."""
        return self._default_target('{ns}_content_{dtstart}.hdf')

    def _channel(self):
        """Yield ``(message_id, message)`` pairs from every input channel.

        Each input is opened and parsed as a JSON object mapping message ids
        to message dicts.  Any error while reading one input is logged and
        the generator moves on to the next input (best effort).
        """
        for channel in self.input():
            with channel.open() as f:
                # Pre-bind so the handler below can always report the last
                # message seen, even when json.load() itself fails.
                msg_id = msg = None
                try:
                    for msg_id, msg in json.load(f).items():
                        yield self._message(msg_id, msg)
                except Exception as e:
                    logger.error(
                        '%s: %s (last message id: %r, message: %r)',
                        self.__class__, e, msg_id, msg)

    def _message(self, _id, _msg):
        """Return ``(_id, message)`` with ``_msg`` wrapped in a namedtuple."""
        # override to produce typed messages
        Message = namedtuple('Message', _msg.keys())
        return _id, Message(**_msg)

    def _githash(self, data):
        """Return the hex SHA-1 of ``data`` prefixed with ``"blob <len>\\0"``.

        ``data`` must be bytes; the prefix is the header git uses for blobs.
        """
        s = sha1()
        lede = "blob {}\0".format(len(data))
        s.update(lede.encode())
        s.update(data)
        return s.hexdigest()

    def _create_dataset(self, hdf, dataset_id, content):
        """Create a snappy-compressed dataset; return it, or None on failure.

        A ``RuntimeError`` from the HDF5 layer is logged and swallowed so a
        single bad dataset does not abort the whole run.
        """
        # NOTE(review): some h5py versions may raise ValueError rather than
        # RuntimeError for an already-existing name -- confirm.
        try:
            ds = hdf.create_dataset(
                dataset_id,
                data=np.void(snappy.compress(content)),
            )
            ds.attrs['x_compression'] = 'snappy'
        except RuntimeError:
            logger.error("Could not create dataset: %s", dataset_id)
            return None
        return ds

    def _store_message(self, hdf, message_id, message):
        """Fetch ``message.url`` and write its body into ``hdf``.

        Request failures, non-OK responses and dataset creation failures are
        skipped (the first and last are logged); nothing is raised for them.
        """
        logger.info('> %s', message.url)
        try:
            r = requests.get(message.url, stream=True)
        except (requests.exceptions.TooManyRedirects,
                requests.exceptions.ConnectionError) as e:
            logger.error('%s: %s', self.__class__, e)
            # No response object exists for this message, so stop here
            # instead of touching an unbound or stale ``r``.
            return

        try:
            if not r.ok:
                return
            content = r.raw.data
            content_id = self._githash(content)
            # NOTE(review): this is set after the body has been read, so it
            # appears to have no effect on ``content`` -- confirm intent.
            r.raw.decode_content = True
            dataset_id = "{}/{}".format(content_id[:2], content_id[2:])
            ds = self._create_dataset(hdf, dataset_id, content)
            if ds is None:
                return
            ds.attrs['content_id'] = content_id
            ds.attrs['message_id'] = message_id
            for field, value in zip(message._fields, message):
                ds.attrs[field] = value
            hdf.flush()
        finally:
            # stream=True keeps the connection open until it is released.
            r.close()

    def run(self):
        """Store every message's content, stopping once ``limit`` has passed."""
        start_time = dt.datetime.now()
        with h5.File(self.output().path, 'w') as hdf:
            for message_id, message in self._channel():
                self._store_message(hdf, message_id, message)
                tdelt = dt.datetime.now() - start_time
                if self.limit and tdelt > self.limit:
                    logger.warning('Time limit exceeded.')
                    break
def testTimeDelta8601MAfterT(self):
    """The parameter configured at foo/bar should resolve to six minutes.

    The configuration fixture that supplies the raw value is not part of
    this snippet.
    """
    param = luigi.TimeDeltaParameter(
        config_path={"section": "foo", "name": "bar"},
    )
    expected = timedelta(minutes=6)
    self.assertEqual(expected, param.value)
def f():
    """Build a TimeDeltaParameter from config foo/bar and return its default."""
    param = luigi.TimeDeltaParameter(
        default_from_config={"section": "foo", "name": "bar"},
    )
    return param.default
def f():
    """Build a TimeDeltaParameter from config foo/bar and return its value."""
    param = luigi.TimeDeltaParameter(
        config_path={"section": "foo", "name": "bar"},
    )
    return param.value
def testSerialize(self):
    """serialize() renders a timedelta as '<w> w <d> d <h> h <m> m <s> s'."""
    cases = (
        (
            timedelta(weeks=5, days=4, hours=3, minutes=2, seconds=1),
            '5 w 4 d 3 h 2 m 1 s',
        ),
        (timedelta(seconds=0), '0 w 0 d 0 h 0 m 0 s'),
    )
    for tdelta, expected in cases:
        # A fresh parameter per case, as in the straight-line version.
        serialized = luigi.TimeDeltaParameter().serialize(tdelta)
        self.assertEqual(serialized, expected)