/
pipeline.py
120 lines (89 loc) · 3.85 KB
/
pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from __future__ import absolute_import
import logging
from datetime import datetime
import apache_beam as beam
import cv2
import numpy as np
import six
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.transforms import window
from apache_beam.transforms.trigger import AccumulationMode, AfterProcessingTime
from ml_processing.model.inference import DetectLabelsFn
class AddTimestampFn(beam.DoFn):
    """Attach an event-time timestamp parsed from each element's date string.

    Expects elements shaped ``(date_string, payload)`` where the date string
    is formatted ``'%Y-%m-%d %H:%M:%S'``; emits the same element wrapped in a
    ``TimestampedValue``.
    """

    def process(self, element, **kwargs):
        date_str, _ = element
        parsed = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
        yield window.TimestampedValue(element, datetime.timestamp(parsed))
class WindowFormatterFn(beam.DoFn):
    """Render each element alongside the window and timestamp Beam injects."""

    def process(self, x,
                win=beam.DoFn.WindowParam,
                tsp=beam.DoFn.TimestampParam):
        # Defaults are Beam DoFn parameter markers; the runner substitutes
        # the element's actual window and timestamp at execution time.
        formatted = '%s - %s - %s' % (win, tsp, x)
        yield formatted
class KeyIntoWindow(beam.DoFn):
    """Re-key every element by the window it belongs to."""

    def process(self, element, win=beam.DoFn.WindowParam):
        yield win, element
class DropKey(beam.DoFn):
    """Strip the key from a ``(key, value)`` pair, emitting only the value."""

    def process(self, kv):
        _key, value = kv
        yield value
class TransformToNumpyArrayFn(beam.DoFn):
    """Decode raw encoded image bytes into a BGR frame.

    Emits the decoded frame wrapped in a single-element list so the
    downstream label-detection step receives a batch of one.
    """

    def process(self, raw_bytes):
        pixel_buffer = np.frombuffer(raw_bytes, np.uint8)
        decoded_frame = cv2.imdecode(pixel_buffer, cv2.IMREAD_COLOR)
        yield [decoded_frame]
class ComputeMean(beam.DoFn):
    """Compute the mean occurrence rate of a label within a window.

    Receives ``(label, occurrences)`` pairs (from a preceding GroupByKey)
    plus the window's total frame count as a singleton side input, and
    yields ``(label, mean)``.
    """

    def process(self, x, num_frames):
        label, occurrences = x
        # float() is required for correctness under Python 2 (this file
        # imports six and __future__.absolute_import but NOT division, so
        # int / int would silently truncate). The num_frames guard avoids
        # a ZeroDivisionError on an empty window pane.
        mean = float(sum(occurrences)) / num_frames if num_frames else 0.0
        yield label, mean
def run(argv=None):
    """Build and run the streaming label-detection pipeline.

    Reads encoded frames from Pub/Sub, decodes them, windows them into
    fixed 10-second windows, runs label detection, and logs the
    per-window mean occurrence of each detected label.

    Args:
        argv: optional list of pipeline command-line flags.
    """
    class TemplateOptions(PipelineOptions):
        # Registered so --input/--output are recognized on the command line;
        # PipelineOptions(flags=...) parses args from every subclass, so this
        # class does not need to be instantiated directly.
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument(
                '--input', default='frames.log')
            parser.add_argument(
                '--output', default='output.txt')
    options = PipelineOptions(flags=argv)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True
    # Uncomment this to run the pipeline on the Cloud (Dataflow)
    # options.view_as(StandardOptions).runner = 'DataflowRunner'
    with beam.Pipeline(options=options) as p:
        # Decoded frames in fixed 10s windows; an early firing 5s into
        # processing time, with panes discarded (not accumulated).
        windowed = \
            (p
             | 'Read From Pub/Sub' >> beam.io.ReadFromPubSub(
                 topic='projects/alert-shape-256811/topics/ml-flow',
                 timestamp_attribute='timestamp').with_output_types(six.binary_type)
             | 'Transform To Numpy Array' >> beam.ParDo(TransformToNumpyArrayFn())
             | beam.WindowInto(
                 window.FixedWindows(10),
                 trigger=AfterProcessingTime(5),
                 accumulation_mode=AccumulationMode.DISCARDING))
        # Per-window frame count: key everything to a single dummy key,
        # sum the ones, then drop the key. Consumed below as a singleton
        # side input so ComputeMean can normalize label counts.
        counted = \
            (windowed
             | 'Add Default Key' >> beam.Map(lambda x: (0, 1))
             | 'Count Num Frames' >> beam.CombinePerKey(sum)
             | 'Drop Default Key' >> beam.ParDo(DropKey()))
        (windowed
         | 'Detect Labels' >> beam.ParDo(DetectLabelsFn())
         | 'Flatten' >> beam.FlatMap(lambda x: x)
         | 'Pair With One' >> beam.Map(lambda x: (x, 1))
         | 'Group For Mean' >> beam.GroupByKey()
         | 'Mboulouté' >> beam.ParDo(ComputeMean(), beam.pvalue.AsSingleton(counted))
         | 'Just Print' >> beam.Map(lambda x: logging.info(x)))
if __name__ == '__main__':
    # Raise the root logger level so the pipeline's logging.info sink
    # (the 'Just Print' step) is actually visible, then launch.
    logging.getLogger().setLevel(logging.INFO)
    run()