Example #1
class EventLogSelectionDownstreamMixin(object):
    """Defines parameters for passing upstream to tasks that use EventLogSelectionMixin."""

    source = luigi.Parameter(
        is_list=True,
        config_path={'section': 'event-logs', 'name': 'source'},
        description='A URL to a path that contains log files that contain the events (e.g., s3://my_bucket/foo/).',
    )
    interval = luigi.DateIntervalParameter(
        description='The range of dates to export logs for.',
    )
    expand_interval = luigi.TimeDeltaParameter(
        config_path={'section': 'event-logs', 'name': 'expand_interval'},
        description='A time interval to add to the beginning and end of the interval to expand the window of '
                    'files captured.',
    )
    pattern = luigi.Parameter(
        is_list=True,
        config_path={'section': 'event-logs', 'name': 'pattern'},
        description='A regex with a named capture group for the date that approximates the date that the events '
                    'within were emitted. Note that the search interval is expanded, so events don\'t have to be '
                    'in exactly the right file in order for them to be processed.',
    )

    date_pattern = luigi.Parameter(
        default='%Y%m%d',
        description='The format of the date as it appears in the source file name. Note that this correlates with '
                    'the named capture group for date in the pattern parameter. This is intended to select relevant '
                    'event log files by making sure the date is within the interval.',
    )
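
The `pattern` and `date_pattern` parameters work together: the regex's named `date` capture group is parsed with `date_pattern` to decide whether a file falls inside the (expanded) interval. A minimal illustration with made-up values (not taken from the original project):

import re
from datetime import datetime

# Hypothetical values for illustration only.
pattern = r'.*?tracking\.log-(?P<date>\d{8})(\.gz)?$'
date_pattern = '%Y%m%d'

match = re.match(pattern, 's3://my_bucket/foo/tracking.log-20160301.gz')
if match:
    file_date = datetime.strptime(match.group('date'), date_pattern).date()
    # file_date (2016-03-01) is then compared against the expanded interval.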
Example #2
 def testTimeDelta8601Weeks(self):
     p = luigi.TimeDeltaParameter(
         config_path=dict(section="foo", name="bar"))
     self.assertEqual(timedelta(weeks=5), p.value)
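
The config-driven tests in Examples #2 through #7 (and #14, #16) all read the `bar` option from the `[foo]` section of the luigi configuration; the expected timedeltas and the test names imply values such as the ISO 8601 duration `P5W`. A standalone sketch of the same idea, mirroring the `config_path`/`.value` API used in these snippets (the config value and setup here are assumptions, not the original test fixture):

from datetime import timedelta

import luigi
import luigi.configuration

# Assumed setup: place an ISO 8601 duration in luigi's in-memory configuration.
cfg = luigi.configuration.get_config()
if not cfg.has_section('foo'):
    cfg.add_section('foo')
cfg.set('foo', 'bar', 'P5W')  # five weeks

p = luigi.TimeDeltaParameter(config_path=dict(section='foo', name='bar'))
assert p.value == timedelta(weeks=5)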
Example #3
 def testTimeDelta8601NoTimeComponent(self):
     p = luigi.TimeDeltaParameter(
         config_path=dict(section="foo", name="bar"))
     self.assertEqual(timedelta(days=5), p.value)
Example #4
 def testTimeDelta8601(self):
     p = luigi.TimeDeltaParameter(
         config_path=dict(section="foo", name="bar"))
     self.assertEqual(timedelta(days=4, hours=12, minutes=30, seconds=5),
                      p.value)
Example #5
 def testTimeDeltaMultiple(self):
     p = luigi.TimeDeltaParameter(
         config_path=dict(section="foo", name="bar"))
     self.assertEqual(timedelta(weeks=3, hours=4, minutes=5), p.value)
Example #6
 def testTimeDeltaPlural(self):
     p = luigi.TimeDeltaParameter(
         config_path=dict(section="foo", name="bar"))
     self.assertEqual(timedelta(seconds=2), p.value)
Example #7
 def testTimeDelta(self):
     p = luigi.TimeDeltaParameter(config_path=dict(section="foo", name="bar"))
     self.assertEqual(timedelta(days=1), _value(p))
Example #8
 def testTimeDelta8601MAfterT(self):
     p = luigi.TimeDeltaParameter(
         default_from_config=dict(section="foo", name="bar"))
     self.assertEqual(timedelta(minutes=6), p.default)
Example #9
 def testTimeDelta8601(self):
     p = luigi.TimeDeltaParameter(
         default_from_config=dict(section="foo", name="bar"))
     self.assertEqual(timedelta(days=4, hours=12, minutes=30, seconds=5),
                      p.default)
Example #10
 def testTimeDelta8601Weeks(self):
     p = luigi.TimeDeltaParameter(
         default_from_config=dict(section="foo", name="bar"))
     self.assertEqual(timedelta(weeks=5), p.default)
Example #11
 def testTimeDeltaMultiple(self):
     p = luigi.TimeDeltaParameter(
         default_from_config=dict(section="foo", name="bar"))
     self.assertEqual(timedelta(weeks=3, hours=4, minutes=5), p.default)
Example #12
 def testTimeDeltaPlural(self):
     p = luigi.TimeDeltaParameter(
         default_from_config=dict(section="foo", name="bar"))
     self.assertEqual(timedelta(seconds=2), p.default)
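
Examples #8 through #12 (and #15) are the same tests written against an older luigi API, where the keyword was `default_from_config` and the configured value was exposed as `p.default`; the newer snippets use `config_path` and `p.value`. The two forms side by side (illustrative, using the same `[foo] bar` option as above):

older = luigi.TimeDeltaParameter(default_from_config=dict(section='foo', name='bar'))
newer = luigi.TimeDeltaParameter(config_path=dict(section='foo', name='bar'))
# older.default and newer.value both resolve to the timedelta parsed from the config.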
Example #13
import datetime as dt
import json
import logging
from collections import namedtuple
from hashlib import sha1

import h5py as h5
import numpy as np
import requests
import snappy

import luigi
from luigi import Task

# `settings` (which supplies TIME_LIMIT), `self.dtstart`, and `_default_target`
# come from the surrounding project and are not shown in this excerpt.
logger = logging.getLogger(__name__)


class ContentTask(Task):

    limit = luigi.TimeDeltaParameter(default=settings.TIME_LIMIT)

    def requires(self):
        yield Task(dtstart=self.dtstart)

    def output(self):
        return self._default_target('{ns}_content_{dtstart}.hdf')

    def _channel(self):
        for channel in self.input():
            with channel.open() as f:
                try:
                    for msg_id, msg in json.load(f).items():
                        yield self._message(msg_id, msg)
                except Exception as e:
                    # json.load can fail before msg_id/msg are bound, so report the channel instead
                    logger.error('Failed to read messages from %s: %s', channel, e)

    def _message(self, _id, _msg):
        # override to produce typed messages
        Message = namedtuple('Message', _msg.keys())
        return _id, Message(**_msg)

    def _githash(self, data):
        s = sha1()
        lede = "blob {}\0".format(len(data))
        s.update(lede.encode())
        s.update(data)
        return s.hexdigest()

    def _create_dataset(self, hdf, dataset_id, content):
        try:
            ds = hdf.create_dataset(
                dataset_id,
                data=np.void(
                    snappy.compress(content)
                )
            )
            ds.attrs['x_compression'] = 'snappy'
        except RuntimeError as e:
            logger.error("Could not create dataset: %s" % dataset_id)
            ds = None
        return ds

    def run(self):
        start_time = dt.datetime.now()
        with h5.File(self.output().path, 'w') as hdf:
            for message_id, message in self._channel():
                logger.info('> %s' % message.url)
                r = None
                try:
                    r = requests.get(message.url, stream=True)
                except requests.exceptions.TooManyRedirects as e:
                    logger.error('%s: %s' % (self.__class__, e))
                except requests.exceptions.ConnectionError as e:
                    logger.error('%s: %s' % (self.__class__, e))
                # guard against `r` being unbound (or stale) when the request failed
                if r is not None and r.ok:
                    content = r.raw.data
                    content_id = self._githash(content)
                    r.raw.decode_content = True
                    dataset_id = "{}/{}".format(content_id[:2], content_id[2:])
                    ds = self._create_dataset(hdf, dataset_id, content)
                    if ds:
                        ds.attrs['content_id'] = content_id
                        ds.attrs['message_id'] = message_id
                        for field, value in zip(message._fields, message):
                            ds.attrs[field] = value
                        hdf.flush()
                tdelt = dt.datetime.now() - start_time
                if self.limit and tdelt > self.limit:
                    logger.warning('Time limit exceeded.')
                    break
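
Because `limit` is a `TimeDeltaParameter`, the time budget can be overridden per run instead of always coming from `settings.TIME_LIMIT`. A hypothetical invocation (assuming `dtstart` is declared on the task or a base class, which the excerpt only implies):

import datetime

import luigi

# Hypothetical: cap the crawl at two hours for this run.
task = ContentTask(dtstart=datetime.date(2016, 3, 1), limit=datetime.timedelta(hours=2))
luigi.build([task], local_scheduler=True)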
Example #14
 def testTimeDelta8601MAfterT(self):
     p = luigi.TimeDeltaParameter(
         config_path=dict(section="foo", name="bar"))
     self.assertEqual(timedelta(minutes=6), p.value)
Example #15
 def f():
     return luigi.TimeDeltaParameter(
         default_from_config=dict(section="foo", name="bar")).default
Example #16
 def f():
     return luigi.TimeDeltaParameter(
         config_path=dict(section="foo", name="bar")).value
Example #17
 def testSerialize(self):
     tdelta = timedelta(weeks=5, days=4, hours=3, minutes=2, seconds=1)
     self.assertEqual(luigi.TimeDeltaParameter().serialize(tdelta), '5 w 4 d 3 h 2 m 1 s')
     tdelta = timedelta(seconds=0)
     self.assertEqual(luigi.TimeDeltaParameter().serialize(tdelta), '0 w 0 d 0 h 0 m 0 s')
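
`serialize()` is the opposite direction of the parsing exercised in the earlier tests; a quick round-trip sketch (the parse-able string formats are inferred from the test names, e.g. the simple `'1 day'` style and ISO 8601 durations):

from datetime import timedelta

import luigi

p = luigi.TimeDeltaParameter()
print(p.serialize(timedelta(weeks=5, days=4, hours=3, minutes=2, seconds=1)))
# -> '5 w 4 d 3 h 2 m 1 s'

# Parsing accepts the simple and ISO 8601 forms exercised by the tests above.
assert p.parse('1 day') == timedelta(days=1)
assert p.parse('P5W') == timedelta(weeks=5)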