def generate_events(self):
  publish_client = pubsub.Client(project=self.project)
  topic = publish_client.topic(self.topic_name)
  sub = topic.subscription(self.subscription_name)

  logging.info('Generating auction events to topic %s', topic.name)

  if self.args.input.startswith('gs://'):
    from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
    fs = GCSFileSystem(self.pipeline_options)
    with fs.open(self.args.input) as infile:
      for line in infile:
        topic.publish(line)
  else:
    with open(self.args.input) as infile:
      for line in infile:
        topic.publish(line)

  logging.info('Finished event generation.')

  # Read from PubSub into a PCollection.
  if self.args.subscription_name:
    raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
        subscription=sub.full_name)
  else:
    raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
        topic=topic.full_name)

  return raw_events
def generate_events(self):
  publish_client = pubsub.Client(project=self.project)
  topic = publish_client.topic(self.topic_name)
  sub = topic.subscription(self.subscription_name)

  logging.info('Generating auction events to topic %s', topic.name)

  if self.args.input.startswith('gs://'):
    from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
    fs = GCSFileSystem(self.pipeline_options)
    with fs.open(self.args.input) as infile:
      for line in infile:
        topic.publish(line)
  else:
    with open(self.args.input) as infile:
      for line in infile:
        topic.publish(line)

  logging.info('Finished event generation.')

  # Read from PubSub into a PCollection.
  if self.args.subscription_name:
    raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
        subscription=sub.full_name)
  else:
    raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
        topic=topic.full_name)

  # Parse each JSON line into an event object and attach its event-time
  # timestamp so downstream windowing uses event time rather than arrival time.
  raw_events = (
      raw_events
      | 'deserialization' >> beam.ParDo(nexmark_util.ParseJsonEvnetFn())
      | 'timestamping' >> beam.Map(
          lambda e: window.TimestampedValue(e, e.date_time)))
  return raw_events
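# Hedged usage sketch (illustrative only, not from the original source): one
# way the timestamped PCollection returned by generate_events() above might be
# consumed downstream. The `launcher` object and the fixed-window count are
# assumptions made for this example.
import apache_beam as beam
from apache_beam.transforms import window


def count_events_per_window(launcher, window_size_sec=10):
  # `launcher` is assumed to expose generate_events() as defined above.
  events = launcher.generate_events()
  return (
      events
      | 'window' >> beam.WindowInto(window.FixedWindows(window_size_sec))
      | 'pair' >> beam.Map(lambda _: ('events', 1))
      | 'count' >> beam.CombinePerKey(sum))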
def generate_events(self):
  from google.cloud import pubsub
  publish_client = pubsub.Client(project=self.project)
  topic = publish_client.topic(self.topic_name)

  logging.info('Generating auction events to topic %s', topic.name)

  if self.args.input.startswith('gs://'):
    from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
    fs = GCSFileSystem(self.pipeline_options)
    with fs.open(self.args.input) as infile:
      for line in infile:
        topic.publish(line)
  else:
    with open(self.args.input) as infile:
      for line in infile:
        topic.publish(line)

  logging.info('Finished event generation.')
# Module-level imports assumed by this transform (not shown in the original
# excerpt); the stackdriver_logging alias is inferred from its usage below.
import json
import logging
import re
import sys
import textwrap
import urllib2

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from google.cloud import logging as stackdriver_logging


class ReadGCSNotifications(beam.PTransform):

  def __init__(self, env, bucket_name, log_name, pipeline_args):
    self.bucket_name = bucket_name
    self.env = env
    self.gcs = None
    self.pipeline_args = pipeline_args
    self.log_name = log_name

  def parse_element(self, element):
    message = json.loads(element.data)
    bucket = message['bucket']

    # Only import from the bucket we are expecting.
    if bucket != self.bucket_name:
      return []

    filepath = message['name']
    logging.info('Got file: %s, %s', bucket, filepath)
    logging.info('Got message: %s', message)

    logline_metadata = None

    # Split path components. Expecting logs/date/bundleId/env/<file>.
    # Require at least four components so path_comps[3] is a valid index.
    path_comps = filepath.split('/')
    if len(path_comps) < 4 or (path_comps[3] != self.env and
                               self.env is not None):
      logging.info('Skipping %s', filepath)
      return []

    name = path_comps[len(path_comps) - 1]
    if name.endswith('.txt'):
      name = name[0:len(name) - 4]
    # The file name is expected to contain underscore-separated components,
    # starting with the URL-encoded phone and ending with the suffix.
    name_comps = name.split('_')

    self.env = path_comps[3]
    self.log_name = ('client-logs-%s' % self.env
                     if self.log_name is None else self.log_name)

    logline_metadata = {
        'suffix': name_comps[2],
        'bundleId': path_comps[2],
        'env': path_comps[3],
        'phone': urllib2.unquote(name_comps[0]).decode('utf8'),
        'filepath': filepath
    }
    self.logline_metadata = logline_metadata
    logging.info('Got file: %s with %s', filepath, logline_metadata)

    if not self.gcs:
      # These imports have to be nested (ugh) because the constructor and the
      # main pipeline get evaluated locally when deploying remotely from the
      # cmdline, and this class is only available when running on GCS.
      from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
      self.gcs = GCSFileSystem(PipelineOptions(self.pipeline_args))
      self.logger = stackdriver_logging.Client().logger(self.log_name)

    # Read the whole file (ugh) from GCS. Without Splittable DoFn support in
    # Python, that's the best we can do in Dataflow right now.
    with self.gcs.open('gs://%s/%s' % (bucket, filepath),
                       mime_type='text/plain') as infile:
      for line in infile:
        if sys.getsizeof(line) > 1000:
          lines = textwrap.wrap(line, 1000, break_long_words=False)
          for text in lines:
            self.writeLog(text)
        else:
          self.writeLog(line)
    return []

  def writeLog(self, text):
    severity_pattern = re.compile('^([A-Za-z]+)')
    severity_remappings = {
        'TRACE': 'DEBUG',
        'LOG': 'DEBUG',
        'WARN': 'WARNING',
        'CRIT': 'CRITICAL'
    }

    # Build the log element from the message, and labels from the metadata.
    log_element = dict(self.logline_metadata)
    log_element['msg'] = text

    # Try to parse the severity from the start of the line and map it to a
    # valid Stackdriver severity.
    match = severity_pattern.match(text)
    if match:
      log_severity = match.group(1).upper()
      log_severity = severity_remappings.get(log_severity, log_severity)
      try:
        # Write the struct to Stackdriver using the hopefully valid severity.
        self.logger.log_struct(log_element, severity=log_severity)
      except:
        # Write the struct to Stackdriver without a severity.
        self.logger.log_struct(log_element)
    else:
      # Write the struct to Stackdriver without a severity.
      self.logger.log_struct(log_element)

  def expand(self, pcoll):
    return pcoll | 'ReadGCSNotifications' >> beam.FlatMap(self.parse_element)
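# Hedged usage sketch (illustrative only, not from the original source): how
# ReadGCSNotifications might be wired into a streaming pipeline. The project,
# subscription, bucket name, and pipeline flags are placeholder assumptions.
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


def run(pipeline_args=None):
  pipeline_args = pipeline_args or ['--streaming']
  options = PipelineOptions(pipeline_args)
  with beam.Pipeline(options=options) as p:
    _ = (
        p
        # The subscription is assumed to receive GCS object-change
        # notifications; with_attributes=True yields PubsubMessage objects,
        # whose .data field parse_element() reads.
        | 'ReadNotifications' >> beam.io.ReadFromPubSub(
            subscription='projects/my-project/subscriptions/gcs-notifications',
            with_attributes=True)
        | ReadGCSNotifications(
            env=None,
            bucket_name='my-log-bucket',
            log_name=None,
            pipeline_args=pipeline_args))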