Example #1
from datetime import date, datetime

import pytz
from dateutil.parser import parse as dateutil_parse

# timestampFromDatetime comes from pipe_tools.timestamp (see Example #5);
# Timestamp is assumed to live in the same module.
from pipe_tools.timestamp import Timestamp, timestampFromDatetime


def as_timestamp(d):
    """
    Attempt to convert the passed parameter to a datetime.  Note that for strings this
    uses python-dateutil, which is flexible, but SLOW.  So do not use this methig if you
    are going to be parsing a billion dates.  Use the high performance methods in timestamp.py

    return: float   seconds since epoch (unix timestamp)
    """
    if d is None:
        return None
    elif isinstance(d, datetime):
        return timestampFromDatetime(
            pytz.UTC.localize(d) if d.tzinfo is None else d)
    elif isinstance(d, date):
        return timestampFromDatetime(
            pytz.UTC.localize(datetime.combine(d, datetime.min.time())))
    elif isinstance(d, (float, int, Timestamp)):
        return float(d)
    elif isinstance(d, basestring):
        return as_timestamp(dateutil_parse(d))
    else:
        raise ValueError(
            'Unsupported data type. Unable to convert value "%s" to a timestamp'
            % d)
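
A minimal usage sketch, assuming the imports above; the epoch values in the comments are illustrative:

as_timestamp(None)                                    # None passes through
as_timestamp(1483228800)                              # ints/floats are returned as float
as_timestamp(datetime(2017, 1, 1, tzinfo=pytz.UTC))   # tz-aware datetimes convert directly
as_timestamp(date(2017, 1, 1))                        # dates are treated as midnight UTC
as_timestamp('2017-01-01T00:00:00Z')                  # strings go through python-dateutil (slow)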
Example #2
def _date_to_sql_timestamp(date):
    if isinstance(date, basestring):
        return 'TIMESTAMP({})'.format(date)
    elif isinstance(date, datetime):
        timestamp = timestampFromDatetime(date)
    else:
        # assume that date is already a timestamp
        timestamp = date
    return 'SEC_TO_TIMESTAMP({})'.format(int(timestamp))
Example #3
def _date_to_sql_timestamp(date, use_legacy_sql=True):
    if isinstance(date, six.string_types):
        return 'TIMESTAMP({})'.format(date)
    elif isinstance(date, datetime):
        timestamp = timestampFromDatetime(date)
    else:
        # assume that date is already a timestamp
        timestamp = date
    ts_fn = 'SEC_TO_TIMESTAMP' if use_legacy_sql else 'TIMESTAMP_SECONDS'
    return '{}({})'.format(ts_fn, int(timestamp))
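
SEC_TO_TIMESTAMP is BigQuery legacy SQL; TIMESTAMP_SECONDS is its standard-SQL equivalent. A sketch of the outputs, inferred from the code above (note that a string argument is interpolated verbatim, so it must already carry its own SQL quotes):

ts = datetime(2017, 1, 1, tzinfo=pytz.UTC)
_date_to_sql_timestamp(ts, use_legacy_sql=True)    # -> 'SEC_TO_TIMESTAMP(1483228800)'
_date_to_sql_timestamp(ts, use_legacy_sql=False)   # -> 'TIMESTAMP_SECONDS(1483228800)'
_date_to_sql_timestamp("'2017-01-01'")             # -> "TIMESTAMP('2017-01-01')"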
Example #4
    def segment_source(self):
        if self.date_range[0] is None:
            return beam.Create([])

        dt = datetimeFromTimestamp(self.date_range[0])
        ts = timestampFromDatetime(dt - timedelta(days=1))

        try:
            source = GCPSource(gcp_path=self.options.segments,
                               first_date_ts=ts,
                               last_date_ts=ts)
        except HttpError as exn:
            logging.warn("Segment source not found: %s %s" %
                         (self.options.segments, dt))
            if exn.status_code == 404:
                return beam.Create([])
            else:
                raise
        return source
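
The detail worth noting is the one-day shift before building the source. A sketch of just that conversion, assuming pipe_tools.timestamp round-trips UTC datetimes (values illustrative):

from datetime import timedelta

dt = datetimeFromTimestamp(1483228800.0)             # 2017-01-01 00:00 UTC
ts = timestampFromDatetime(dt - timedelta(days=1))   # 2016-12-31 00:00 UTC -> 1483142400.0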
Example #5
from datetime import datetime
import pytz

import apache_beam as beam
from apache_beam import typehints

from pipe_tools.timestamp import timestampFromDatetime
from pipe_tools.coders import JSONDict

DEFAULT_START_TS = timestampFromDatetime(
    datetime(2017, 1, 1, 0, 0, 0, tzinfo=pytz.UTC))
HOUR_IN_SECONDS = 60 * 60


class MessageGenerator(object):
    def __init__(self,
                 start_ts=DEFAULT_START_TS,
                 increment=HOUR_IN_SECONDS,
                 count=72):
        self.start_ts = start_ts
        self.increment = increment
        self.count = count

    def __iter__(self):
        return self.messages()

    def messages(self):
        ts = self.start_ts
        for idx in xrange(self.count):
            yield JSONDict(mmsi=1, timestamp=ts, idx=idx)
            ts += self.increment
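
A quick iteration sketch, assuming JSONDict supports dict-style access; the expected output follows from the defaults above:

for msg in MessageGenerator(count=3):
    print('%s %s' % (msg['idx'], msg['timestamp']))
# 0 1483228800.0
# 1 1483232400.0
# 2 1483236000.0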
Example #6
class TestTransforms(object):
    ts = timestampFromDatetime(datetime(2017, 1, 1, 0, 0, 0, tzinfo=pytz.UTC))

    @staticmethod
    def _seg_id(ssvid, ts):
        ts = datetimeFromTimestamp(ts)
        return '{}-{}'.format(ssvid, datetime2str(ts))

    @staticmethod
    def groupby_fn(msg):
        return (msg['ssvid'], msg)

    def _run_segment(self, messages_in, segments_in, temp_dir):
        messages_file = pp.join(temp_dir, '_run_segment', 'messages')
        segments_file = pp.join(temp_dir, '_run_segment', 'segments')

        with _TestPipeline() as p:
            messages = (
                p | 'CreateMessages' >> beam.Create(messages_in)
                | 'AddKeyMessages' >> beam.Map(self.groupby_fn)
                | "MessagesGroupByKey" >> beam.GroupByKey()
            )
            segments = (
                p | 'CreateSegments' >> beam.Create(segments_in)
                | 'AddKeySegments' >> beam.Map(self.groupby_fn)
                | "SegmentsGroupByKey" >> beam.GroupByKey()
            )
            segmented = (
                messages
                | "Segment" >> Segment(segments)
            )
            messages = segmented['messages']
            segments = segmented[Segment.OUTPUT_TAG_SEGMENTS]
            messages | "WriteMessages" >> beam.io.WriteToText(
                messages_file, coder=JSONDictCoder())
            segments | "WriteSegments" >> beam.io.WriteToText(
                segments_file, coder=JSONDictCoder())

            p.run()

            with open_shards('%s*' % messages_file) as output:
                messages = sorted(list(nlj.load(output)), key=lambda m: (m['ssvid'], m['timestamp']))
            with open_shards('%s*' % segments_file) as output:
                segments = list(nlj.load(output))

            assert list_contains(messages, messages_in)

            return messages, segments

    def test_segment_empty(self, temp_dir):
        self._run_segment([], [], temp_dir=temp_dir)

    def test_segment_single(self, temp_dir):
        messages_in = [{'ssvid': 1, 'timestamp': self.ts}]
        segments_in = []
        messages_out, segments_out = self._run_segment(messages_in, segments_in, temp_dir=temp_dir)

    def test_segment_segments_in(self, temp_dir):
        prev_ts = self.ts - 1
        messages_in = [{'ssvid': "1", 'timestamp': self.ts}]
        segments_in = [{'ssvid': "1", 'timestamp': prev_ts,
                     'seg_id': self._seg_id(1, prev_ts),
                     'origin_ts': prev_ts,
                     'timestamp_last': self.ts,
                     'noise': False,
                     'last_pos_lat': 0,
                     'last_pos_lon': 0,
                     'message_count': 1}]
        messages_out, segments_out = self._run_segment(messages_in, segments_in, temp_dir=temp_dir)
        assert messages_out[0]['seg_id'] == segments_in[0]['seg_id']

    def test_segment_out_in(self, temp_dir):
        prev_ts = self.ts - 1
        messages_in = [{'ssvid': "1", 'timestamp': self.ts-1},
                       {'ssvid': "2", 'timestamp': self.ts-1}]
        segments_in = []
        messages_out, segments_out = self._run_segment(messages_in, segments_in, temp_dir=temp_dir)
        messages_in = [{'ssvid': "1", 'timestamp': self.ts},
                       {'ssvid': "2", 'timestamp': self.ts}]
        segments_in = segments_out
        messages_out, segments_out = self._run_segment(messages_in, segments_in, temp_dir=temp_dir)

        assert len(segments_out) == 2
        assert all(seg['message_count'] == 2 for seg in segments_out)
        assert all(seg['seg_id'] == self._seg_id(seg['ssvid'], prev_ts) for seg in segments_out)


    @pytest.mark.parametrize("message, expected", [
        ({}, {}),
        ({'shipname': 'f/v boaty Mc Boatface'}, {'n_shipname': 'BOATYMCBOATFACE'}),
        ({'shipname': 'Bouy 42%'}, {'n_shipname': 'BOUY'}),
        ({'callsign': '@@123'}, {'n_callsign': '123'}),
        ({'imo': 8814275}, {'n_imo': 8814275}),
    ])
    def test_normalize(self, message, expected):
        normalize = NormalizeDoFn()
        assert list_contains(list(normalize.process(message)), [expected])

    def test_normalize_invalid_imo(self):
        normalize = NormalizeDoFn()
        assert all('n_imo' not in m for m in normalize.process({'imo': 0000000}))

    def test_noise_segment(self, temp_dir):
        messages_in = [
            {"timestamp": as_timestamp("2017-07-20T05:59:35.000000Z"),
             "ssvid": "338013000",
             "lon": -161.3321333333,
             "lat": -9.52616,
             "speed": 11.1},
            {"timestamp": as_timestamp("2017-07-20T06:00:38.000000Z"),
             "ssvid": "338013000",
             "lon": -161.6153106689,
             "lat": -9.6753702164,
             "speed": 11.3999996185},
            {"timestamp": as_timestamp("2017-07-20T06:01:00.000000Z"),
             "ssvid": "338013000"}
        ]

        segments_in = []
        messages_out, segments_out = self._run_segment(messages_in, segments_in, temp_dir=temp_dir)

        seg_stats = {(seg['seg_id'], seg['message_count'], seg['noise']) for seg in segments_out}

        assert seg_stats == {('338013000-2017-07-20T05:59:35.000000Z', 2, False),
                             ('338013000-2017-07-20T06:00:38.000000Z', 1, True)}

        messages_in = [
            {"timestamp": as_timestamp("2017-07-20T06:02:00.000000Z"),
             "ssvid": "338013000"}
        ]
        segments_in = segments_out
        messages_out, segments_out = self._run_segment(messages_in, segments_in, temp_dir=temp_dir)

        seg_stats = {(seg['seg_id'], seg['message_count'], seg['noise']) for seg in segments_out}

        assert seg_stats == {('338013000-2017-07-20T05:59:35.000000Z', 3, False)}


    def test_expected_segments(self, temp_dir):
        messages_in = [
            {"timestamp": as_timestamp("2017-11-15T11:14:32.000000Z"),
             "ssvid": 257666800,
             "lon": 5.3108466667,
             "lat": 60.40065,
             "speed": 6.5},
            {"timestamp": as_timestamp("2017-11-26T11:20:16.000000Z"),
             "ssvid": 257666800,
             "lon": 5.32334,
             "lat": 60.396235,
             "speed": 3.2000000477},
        ]

        segments_in = []
        messages_out, segments_out = self._run_segment(messages_in, segments_in, temp_dir=temp_dir)
        seg_stats = [(seg['seg_id'], seg['message_count'], seg['noise']) for seg in segments_out]

        expected = [('257666800-2017-11-15T11:14:32.000000Z', 1, False),
                    ('257666800-2017-11-26T11:20:16.000000Z', 1, False)]
        assert seg_stats == expected


    def test_message_type(self, temp_dir):
        messages_in = [
            {"timestamp": as_timestamp("2018-01-01 00:00"),
             "ssvid": "123456789",
             "type": "AIS.1",
             "lon": 0.0,
             "lat": 0.0},
            {"timestamp": as_timestamp("2018-01-01 01:00"),
             "ssvid": "123456789",
             "type": "AIS.18",
             "lon": 0.0,
             "lat": 2.0},
            {"timestamp": as_timestamp("2018-01-01 02:00"),
             "ssvid": "123456789",
             "type": "AIS.1",
             "lon": 0.0,
             "lat": 0.5},
            {"timestamp": as_timestamp("2018-01-01 03:00"),
             "ssvid": "123456789",
             "type": "AIS.18",
             "lon": 0.0,
             "lat": 1.5},
            {"timestamp": as_timestamp("2018-01-01 04:00"),
             "ssvid": "123456789",
             "type": "AIS.5",
             "shipname": "Boaty"},
        ]

        segments_in = []
        messages_out, segments_out = self._run_segment(messages_in, segments_in, temp_dir=temp_dir)
        seg_stats = [(seg['seg_id'], seg['message_count'], seg['shipname_most_common']) for seg in segments_out]

        expected = [('123456789-2018-01-01T00:00:00.000000Z', 3, 'Boaty'),
                    ('123456789-2018-01-01T01:00:00.000000Z', 2, None)]
        assert seg_stats == expected