def generate_events(self):
  publish_client = pubsub.Client(project=self.project)
  topic = publish_client.topic(self.topic_name)
  sub = topic.subscription(self.subscription_name)

  logging.info('Generating auction events to topic %s', topic.name)

  if self.args.input.startswith('gs://'):
    from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
    fs = GCSFileSystem(self.pipeline_options)
    with fs.open(self.args.input) as infile:
      for line in infile:
        topic.publish(line)
  else:
    with open(self.args.input) as infile:
      for line in infile:
        topic.publish(line)

  logging.info('Finished event generation.')

  # Read from PubSub into a PCollection.
  if self.args.subscription_name:
    raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
        subscription=sub.full_name)
  else:
    raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
        topic=topic.full_name)

  return raw_events
def generate_events(self):
  publish_client = pubsub.Client(project=self.project)
  topic = publish_client.topic(self.topic_name)
  sub = topic.subscription(self.subscription_name)

  logging.info('Generating auction events to topic %s', topic.name)

  if self.args.input.startswith('gs://'):
    from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
    fs = GCSFileSystem(self.pipeline_options)
    with fs.open(self.args.input) as infile:
      for line in infile:
        topic.publish(line)
  else:
    with open(self.args.input) as infile:
      for line in infile:
        topic.publish(line)

  logging.info('Finished event generation.')

  # Read from PubSub into a PCollection.
  if self.args.subscription_name:
    raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
        subscription=sub.full_name)
  else:
    raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
        topic=topic.full_name)

  raw_events = (
      raw_events
      | 'deserialization' >> beam.ParDo(nexmark_util.ParseJsonEvnetFn())
      | 'timestamping' >>
      beam.Map(lambda e: window.TimestampedValue(e, e.date_time)))

  return raw_events
def parse_element(self, element):
  message = json.loads(element.data)
  bucket = message['bucket']

  # Only import from the bucket we are expecting.
  if bucket != self.bucket_name:
    return []

  filepath = message['name']
  logging.info('Got file: %s, %s', bucket, filepath)
  logging.info('Got -: %s', message)

  logline_metadata = None

  # try:
  # Split path component. Expecting logs/date/bundleId/env/
  path_comps = filepath.split('/')
  if len(path_comps) < 4 or (path_comps[3] != self.env and
                             self.env is not None):
    logging.info('Skipping %s', filepath)
    return []

  name = path_comps[len(path_comps) - 1]
  if name.endswith('.txt'):
    name = name[0:len(name) - 4]
  name_comps = name.split('_')

  self.env = path_comps[3]
  self.log_name = 'client-logs-%s' % (
      self.env) if self.log_name is None else self.log_name

  logline_metadata = {
      'suffix': name_comps[2],
      'bundleId': path_comps[2],
      'env': path_comps[3],
      'phone': urllib2.unquote(name_comps[0]).decode('utf8'),
      'filepath': filepath
  }
  self.logline_metadata = logline_metadata

  logging.info('Got file: %s with %s', filepath, logline_metadata)

  if not self.gcs:
    # These imports have to be nested (ugh) because the constructor and the
    # main pipeline get evaluated locally when deploying remotely from
    # the cmdline, and this class is only available when running on GCS
    from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
    self.gcs = GCSFileSystem(PipelineOptions(self.pipeline_args))
    self.logger = stackdriver_logging.Client().logger(self.log_name)

  # Read the whole file (ugh) from GCS. Without SDoFns support in Python,
  # that's the best we can do in dataflow right now.
  with self.gcs.open('gs://%s/%s' % (bucket, filepath),
                     mime_type='text/plain') as infile:
    for line in infile:
      if sys.getsizeof(line) > 1000:
        lines = textwrap.wrap(line, 1000, break_long_words=False)
        for text in lines:
          self.writeLog(text)
      else:
        self.writeLog(line)

  return []
def _data_to_load(self,
                  gcs: GCSFileSystem,
                  scan_type: str,
                  incremental_load: bool,
                  table_name: str,
                  start_date: Optional[datetime.date] = None,
                  end_date: Optional[datetime.date] = None) -> List[str]:
  """Select the right files to read.

  Args:
    gcs: GCSFileSystem object
    scan_type: one of 'echo', 'discard', 'http', 'https'
    incremental_load: boolean. If true, only read the latest new data
    table_name: dataset.table name like 'base.scan_echo'
    start_date: date object, only files after or at this date will be read
    end_date: date object, only files at or before this date will be read

  Returns:
    A List of filename strings. ex
     ['gs://firehook-scans/echo/CP_Quack-echo-2020-08-22-06-08-03/results.json',
      'gs://firehook-scans/echo/CP_Quack-echo-2020-08-23-06-01-02/results.json']
  """
  if incremental_load:
    full_table_name = self._get_full_table_name(table_name)
    existing_sources = _get_existing_datasources(full_table_name)
  else:
    existing_sources = []

  # Both zipped and unzipped data to be read in
  zipped_regex = self.bucket + scan_type + '/**/results.json.gz'
  unzipped_regex = self.bucket + scan_type + '/**/results.json'

  zipped_metadata = [m.metadata_list for m in gcs.match([zipped_regex])][0]
  unzipped_metadata = [
      m.metadata_list for m in gcs.match([unzipped_regex])
  ][0]
  file_metadata = zipped_metadata + unzipped_metadata

  filenames = [metadata.path for metadata in file_metadata]
  file_sizes = [metadata.size_in_bytes for metadata in file_metadata]

  filtered_filenames = [
      filename for (filename, file_size) in zip(filenames, file_sizes)
      if (_between_dates(filename, start_date, end_date) and
          _source_from_filename(filename) not in existing_sources and
          file_size != 0)
  ]
  return filtered_filenames
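# Hedged sketch (not from the original repo): the _between_dates helper used in
# the filter above is defined elsewhere in the source. This hypothetical
# stand-in, _between_dates_sketch, only illustrates the assumed behaviour of
# checking the date embedded in paths such as
# 'gs://firehook-scans/echo/CP_Quack-echo-2020-08-22-06-08-03/results.json'
# against the optional start/end bounds.
import datetime
import re
from typing import Optional


def _between_dates_sketch(filename: str, start_date: Optional[datetime.date],
                          end_date: Optional[datetime.date]) -> bool:
  # Pull the first YYYY-MM-DD date out of the filepath.
  match = re.search(r'\d{4}-\d{2}-\d{2}', filename)
  if not match:
    return False
  file_date = datetime.date.fromisoformat(match.group(0))
  # Unset bounds are treated as open-ended.
  return ((start_date is None or start_date <= file_date) and
          (end_date is None or file_date <= end_date))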
def run(argv=None):
  """The main function which creates the pipeline and runs it."""
  parser = argparse.ArgumentParser()

  # Here we add the specific command line arguments we expect: the input
  # Pub/Sub subscription to read from, the output bucket for the copied data,
  # and the bucket used for log output.
  parser.add_argument(
      '--input_subscription',
      required=True,
      help=('Input PubSub subscription of the form '
            '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
  parser.add_argument('--output',
                      required=True,
                      help='Output bucket for data',
                      default='')
  parser.add_argument('--log', required=True, help='log bucket', default='')

  # Parse arguments from the command line.
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(StandardOptions).streaming = True

  # Get options.
  project_id = pipeline_options.get_all_options()['project']
  output_bucket_name = known_args.output
  log_bucket_name = known_args.log
  log_file_path = 'gs://{}/logs'.format(log_bucket_name)

  fs = GCSFileSystem(pipeline_options=pipeline_options)

  # DataCopier is a class we built in this script to hold the logic for
  # copying each incoming object to the output bucket.
  data_copier = DataCopier()

  # Initiate the pipeline using the pipeline arguments passed in from the
  # command line. This includes information such as where Dataflow should
  # store temp files and what the project id is.
  p = beam.Pipeline(options=pipeline_options)

  (p
   | beam.io.ReadFromPubSub(subscription=known_args.input_subscription)
   | 'Copying customer data to the final data-bucket/customer-id' >>
   beam.Map(lambda m: data_copier.parse_method(m, project_id, fs,
                                               output_bucket_name))
   | 'Write results to the output bucket' >>
   WriteToText(file_path_prefix=log_file_path))

  p.run().wait_until_finish()
def run_beam_pipeline(self, scan_type: str, incremental_load: bool,
                      job_name: str, table_name: str,
                      start_date: Optional[datetime.date],
                      end_date: Optional[datetime.date]) -> None:
  """Run a single apache beam pipeline to load json data into bigquery.

  Args:
    scan_type: one of 'echo', 'discard', 'http', 'https' or 'satellite'
    incremental_load: boolean. If true, only load the latest new data, if
      false reload all data.
    job_name: string name for this pipeline job.
    table_name: dataset.table name like 'base.scan_echo'
    start_date: date object, only files after or at this date will be read.
      Mostly only used during development.
    end_date: date object, only files at or before this date will be read.
      Mostly only used during development.

  Raises:
    Exception: if any arguments are invalid or the pipeline fails.
  """
  logging.getLogger().setLevel(logging.INFO)
  pipeline_options = self._get_pipeline_options(scan_type, job_name)
  gcs = GCSFileSystem(pipeline_options)

  new_filenames = self._data_to_load(gcs, scan_type, incremental_load,
                                     table_name, start_date, end_date)
  if not new_filenames:
    logging.info('No new files to load')
    return

  with beam.Pipeline(options=pipeline_options) as p:
    # PCollection[Tuple[filename,line]]
    lines = _read_scan_text(p, new_filenames)

    if scan_type == satellite.SCAN_TYPE_SATELLITE:
      # PCollection[Row], PCollection[Row]
      satellite_rows, blockpage_rows = satellite.process_satellite_lines(
          lines)

      # PCollection[Row]
      rows_with_metadata = self._add_metadata(satellite_rows)

      self._write_to_bigquery(
          satellite.SCAN_TYPE_BLOCKPAGE, blockpage_rows,
          satellite.get_blockpage_table_name(table_name, scan_type),
          incremental_load)
    else:  # Hyperquack scans
      # PCollection[Row]
      rows = (
          lines
          | 'flatten json' >> beam.ParDo(
              flatten.FlattenMeasurement()).with_output_types(Row))

      # PCollection[Row]
      rows_with_metadata = self._add_metadata(rows)

    _raise_error_if_collection_empty(rows_with_metadata)

    self._write_to_bigquery(scan_type, rows_with_metadata, table_name,
                            incremental_load)
def get_filesystem(path):
  """Function that returns the FileSystem class to use based on the path
  provided in the input.
  """
  if path.startswith('gs://'):
    from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
    return GCSFileSystem()
  else:
    return LocalFileSystem()
def generate_events(self):
  from google.cloud import pubsub
  publish_client = pubsub.Client(project=self.project)
  topic = publish_client.topic(self.topic_name)

  logging.info('Generating auction events to topic %s', topic.name)

  if self.args.input.startswith('gs://'):
    from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
    fs = GCSFileSystem(self.pipeline_options)
    with fs.open(self.args.input) as infile:
      for line in infile:
        topic.publish(line)
  else:
    with open(self.args.input) as infile:
      for line in infile:
        topic.publish(line)

  logging.info('Finished event generation.')
def get_filesystem(path):
  """Function that returns the FileSystem class to use based on the path
  provided in the input.
  """
  if path.startswith('gs://'):
    try:
      from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
    except ImportError:
      raise ImportError('Google Cloud Platform IO not available, '
                        'please install apache_beam[gcp]')
    return GCSFileSystem()
  return LocalFileSystem()
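# Minimal usage sketch for get_filesystem (assumption: not part of the
# original source). The paths below are hypothetical; the same code works for
# GCS and local files because GCSFileSystem and LocalFileSystem share the
# FileSystem.open API.
def count_lines(path):
  fs = get_filesystem(path)
  with fs.open(path) as infile:
    return sum(1 for _ in infile)


# e.g. count_lines('gs://my-bucket/input/data.txt') or count_lines('/tmp/data.txt')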
def _data_to_load(self,
                  gcs: GCSFileSystem,
                  scan_type: str,
                  incremental_load: bool,
                  table_name: str,
                  start_date: Optional[datetime.date] = None,
                  end_date: Optional[datetime.date] = None) -> List[str]:
  """Select the right files to read.

  Args:
    gcs: GCSFileSystem object
    scan_type: one of 'echo', 'discard', 'http', 'https', 'satellite'
    incremental_load: boolean. If true, only read the latest new data
    table_name: dataset.table name like 'base.scan_echo'
    start_date: date object, only files after or at this date will be read
    end_date: date object, only files at or before this date will be read

  Returns:
    A List of filename strings. ex
     ['gs://firehook-scans/echo/CP_Quack-echo-2020-08-22-06-08-03/results.json',
      'gs://firehook-scans/echo/CP_Quack-echo-2020-08-23-06-01-02/results.json']
  """
  if incremental_load:
    full_table_name = self._get_full_table_name(table_name)
    existing_sources = _get_existing_datasources(full_table_name)
  else:
    existing_sources = []

  if scan_type == satellite.SCAN_TYPE_SATELLITE:
    files_to_load = flatten_satellite.SATELLITE_FILES
  else:
    files_to_load = SCAN_FILES

  # Filepath like `gs://firehook-scans/echo/**/*'
  files_regex = f'{self.bucket}{scan_type}/**/*'
  file_metadata = [m.metadata_list for m in gcs.match([files_regex])][0]

  filepaths = [metadata.path for metadata in file_metadata]
  file_sizes = [metadata.size_in_bytes for metadata in file_metadata]

  filtered_filenames = [
      filepath for (filepath, file_size) in zip(filepaths, file_sizes)
      if (_between_dates(filepath, start_date, end_date) and
          _filename_matches(filepath, files_to_load) and
          flatten_base.source_from_filename(filepath) not in
          existing_sources and file_size > EMPTY_GZIPPED_FILE_SIZE)
  ]
  return filtered_filenames
def run():
  p = beam.Pipeline(options=PipelineOptions())
  gcs = GCSFileSystem(PipelineOptions())

  pattern_1 = [
      'gs://dataflow-buffer/parent-unpack/2018/i20180130/PxpFJwJabD-untarI20180130/DESIGN/USD0808610-20180130.ZIP'
  ]
  input_pattern = [
      'gs://dataflow-buffer/parent-unpack/2018/i20180130/PxpFJwJabD-untar*/**/*.ZIP'
  ]
  input_pattern_1 = 'gs://dataflow-buffer/parent-unpack/2018/i20180130/PxpFJwJabD-untar*/**/*.ZIP'
  parent_zip = 'gs://bulk_pdfimages_dump/bulkdata.uspto.gov/data/patent/grant/redbook/2010/I20100202.zip'

  result = [m.metadata_list for m in gcs.match(input_pattern)]
  metadata_list = result.pop()

  parts = (p
           # | 'Match Files' >> fileio.MatchFiles(pattern_1)
           | 'Return nested files' >> beam.Create(metadata_list)
           # | 'print Files' >> beam
           | 'Print read file' >> beam.ParDo(ImageExtract())
           # | 'one' >> beam.Map()
          )

  p.run().wait_until_finish()
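# Hedged sketch: ImageExtract is referenced above but not shown. This
# hypothetical DoFn, ImageExtractSketch, only illustrates what such a step
# might do with the FileMetadata records produced by beam.Create(metadata_list):
# open each matched ZIP via GCSFileSystem and log its size.
import logging

import apache_beam as beam
from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
from apache_beam.options.pipeline_options import PipelineOptions


class ImageExtractSketch(beam.DoFn):

  def process(self, file_metadata):
    # file_metadata is an apache_beam.io.filesystem.FileMetadata record.
    gcs = GCSFileSystem(PipelineOptions())
    with gcs.open(file_metadata.path) as infile:
      logging.info('Read %d bytes from %s', len(infile.read()),
                   file_metadata.path)
    yield file_metadata.path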
def run_beam_pipeline(self, scan_type: str, incremental_load: bool,
                      job_name: str, table_name: str,
                      start_date: Optional[datetime.date],
                      end_date: Optional[datetime.date]) -> None:
  """Run a single apache beam pipeline to load json data into bigquery.

  Args:
    scan_type: one of 'echo', 'discard', 'http', 'https'
    incremental_load: boolean. If true, only load the latest new data, if
      false reload all data.
    job_name: string name for this pipeline job.
    table_name: dataset.table name like 'base.scan_echo'
    start_date: date object, only files after or at this date will be read.
      Mostly only used during development.
    end_date: date object, only files at or before this date will be read.
      Mostly only used during development.

  Raises:
    Exception: if any arguments are invalid or the pipeline fails.
  """
  logging.getLogger().setLevel(logging.INFO)
  pipeline_options = self._get_pipeline_options(scan_type, job_name)
  gcs = GCSFileSystem(pipeline_options)

  new_filenames = self._data_to_load(gcs, scan_type, incremental_load,
                                     table_name, start_date, end_date)
  if not new_filenames:
    logging.info('No new files to load incrementally')
    return

  with beam.Pipeline(options=pipeline_options) as p:
    # PCollection[Tuple[filename,line]]
    lines = _read_scan_text(p, new_filenames)

    # PCollection[Row]
    rows = (
        lines
        | 'flatten json' >>
        beam.FlatMapTuple(_flatten_measurement).with_output_types(Row))

    # PCollection[Row]
    rows_with_metadata = self._add_metadata(rows)

    self._write_to_bigquery(rows_with_metadata, table_name, incremental_load)
class ReadGCSNotifications(beam.PTransform):

  def __init__(self, env, bucket_name, log_name, pipeline_args):
    self.bucket_name = bucket_name
    self.env = env
    self.gcs = None
    self.pipeline_args = pipeline_args
    self.log_name = log_name

  def parse_element(self, element):
    message = json.loads(element.data)
    bucket = message['bucket']

    # Only import from the bucket we are expecting.
    if bucket != self.bucket_name:
      return []

    filepath = message['name']
    logging.info('Got file: %s, %s', bucket, filepath)
    logging.info('Got -: %s', message)

    logline_metadata = None

    # try:
    # Split path component. Expecting logs/date/bundleId/env/
    path_comps = filepath.split('/')
    if len(path_comps) < 4 or (path_comps[3] != self.env and
                               self.env is not None):
      logging.info('Skipping %s', filepath)
      return []

    name = path_comps[len(path_comps) - 1]
    if name.endswith('.txt'):
      name = name[0:len(name) - 4]
    name_comps = name.split('_')

    self.env = path_comps[3]
    self.log_name = 'client-logs-%s' % (
        self.env) if self.log_name is None else self.log_name

    logline_metadata = {
        'suffix': name_comps[2],
        'bundleId': path_comps[2],
        'env': path_comps[3],
        'phone': urllib2.unquote(name_comps[0]).decode('utf8'),
        'filepath': filepath
    }
    self.logline_metadata = logline_metadata

    logging.info('Got file: %s with %s', filepath, logline_metadata)

    if not self.gcs:
      # These imports have to be nested (ugh) because the constructor and the
      # main pipeline get evaluated locally when deploying remotely from
      # the cmdline, and this class is only available when running on GCS
      from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
      self.gcs = GCSFileSystem(PipelineOptions(self.pipeline_args))
      self.logger = stackdriver_logging.Client().logger(self.log_name)

    # Read the whole file (ugh) from GCS. Without SDoFns support in Python,
    # that's the best we can do in dataflow right now.
    with self.gcs.open('gs://%s/%s' % (bucket, filepath),
                       mime_type='text/plain') as infile:
      for line in infile:
        if sys.getsizeof(line) > 1000:
          lines = textwrap.wrap(line, 1000, break_long_words=False)
          for text in lines:
            self.writeLog(text)
        else:
          self.writeLog(line)

    return []

  def writeLog(self, text):
    severity_pattern = re.compile('^([A-Za-z]+)')
    severity_remappings = {
        'TRACE': 'DEBUG',
        'LOG': 'DEBUG',
        'WARN': 'WARNING',
        'CRIT': 'CRITICAL'
    }

    # Build log element from message, and labels from metadata
    log_element = dict(self.logline_metadata)
    log_element['msg'] = text

    # Try to parse out the severity from the start of the line
    # And try and make sure it maps to a valid SD severity
    match = severity_pattern.match(text)
    if match:
      log_severity = match.group(1).upper()
      log_severity = severity_remappings.get(log_severity, log_severity)
      try:
        # Write the struct to SD using the hopefully valid severity
        self.logger.log_struct(log_element, severity=log_severity)
      except:
        # Write the struct to SD without a severity
        self.logger.log_struct(log_element)
    else:
      # Write the struct to SD without a severity
      self.logger.log_struct(log_element)

  def expand(self, pcoll):
    return pcoll | 'ReadGCSNotifications' >> beam.FlatMap(self.parse_element)
def parse_element(self, element):
  if not self.gcs:
    # These imports have to be nested (ugh) because the constructor and the
    # main pipeline get evaluated locally when deploying remotely from
    # the cmdline, and this class is only available when running on GCS
    from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
    self.gcs = GCSFileSystem(PipelineOptions(self.pipeline_args))
    self.logger = stackdriver_logging.Client().logger(self.log_name)

  message = json.loads(element.data)
  bucket = message['bucket']

  # Only import from the bucket we are expecting.
  if bucket != self.bucket_name:
    return []

  filepath = message['name']
  logging.info('Got file: %s, %s', bucket, filepath)
  logging.info('Got -: %s', message)

  logline_metadata = None

  # try:
  # Split path component. Expecting logs/bundleId/env/
  path_comps = filepath.split('/')
  if len(path_comps) < 3 or path_comps[2] != self.env:
    logging.info('Skipping %s', filepath)
    return []

  name = path_comps[len(path_comps) - 1]
  if name.endswith('.txt'):
    name = name[0:len(name) - 4]
  name_comps = name.split('_')

  logline_metadata = {
      'suffix': name_comps[2],
      'bundleId': path_comps[1],
      'env': path_comps[2],
      'phone': urllib2.unquote(name_comps[0]).decode('utf8'),
      'filepath': filepath
  }
  # except:
  #   logging.warn("Couldn't read metadata for %s", filepath)
  #   return []

  logging.info('Got file: %s with %s', filepath, logline_metadata)

  severity_pattern = re.compile('^([A-Za-z]+)')
  severity_remappings = {
      'TRACE': 'DEBUG',
      'LOG': 'DEBUG',
      'WARN': 'WARNING',
      'CRIT': 'CRITICAL'
  }

  # Read the whole file (ugh) from GCS. Without SDoFns support in Python,
  # that's the best we can do in dataflow right now.
  with self.gcs.open('gs://%s/%s' % (bucket, filepath),
                     mime_type='text/plain') as infile:
    for line in infile:
      # Build log element from message, and labels from metadata
      log_element = dict(logline_metadata)
      log_element['msg'] = line

      # Try to parse out the severity from the start of the line
      # And try and make sure it maps to a valid SD severity
      match = severity_pattern.match(line)
      if match:
        log_severity = match.group(1).upper()
        log_severity = severity_remappings.get(log_severity, log_severity)
        try:
          # Write the struct to SD using the hopefully valid severity
          self.logger.log_struct(log_element, severity=log_severity)
        except:
          # Write the struct to SD without a severity
          self.logger.log_struct(log_element)
      else:
        # Write the struct to SD without a severity
        self.logger.log_struct(log_element)

  return []
def tearDownClass(cls):
  GCSFileSystem(pipeline_options=PipelineOptions()) \
      .delete([cls.staging_bucket_name])
def tearDown(self):
  GCSFileSystem(pipeline_options=PipelineOptions()) \
      .delete([self.staging_bucket_name])
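# Hedged sketch (not from the original test suite) of a setUp counterpart to
# the tearDown above: it stages a small fixture object under the path that the
# tearDown later deletes. The bucket path and file name are hypothetical.
from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
from apache_beam.options.pipeline_options import PipelineOptions


def setUp(self):
  self.staging_bucket_name = 'gs://my-staging-bucket/integration-test/'
  fs = GCSFileSystem(pipeline_options=PipelineOptions())
  # FileSystem.create returns a writable file handle for the new object.
  with fs.create(self.staging_bucket_name + 'fixture.txt') as handle:
    handle.write(b'staging fixture\n')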