def main(gcs_path, out, start=None, end=None, pipeline_args=None):
  steps = [
      apache_beam.FlatMap('Parse XML and filter', parse_xml),
      apache_beam.Map(
          'Coerce "wikitext" key to string type',
          force_string_function('wikitext')),
      apache_beam.FlatMap('Parse markdown into plaintext', parse_wikitext),
      apache_beam.Map(
          'Coerce "text" key to string type',
          force_string_function('text')),
      apache_beam.Map(
          'Filter out any vestigial HTML', html_to_text),
      core.ParDo('batch', BatchFn(10)),
      apache_beam.FlatMap(
          'Entities (batch)', analyze_entities_batch),
  ]

  p = apache_beam.Pipeline(argv=pipeline_args)

  if start:
    value = p | apache_beam.Read(
        'Pick up at step {}'.format(start),
        apache_beam.io.TextFileSource(gcs_path)) | \
        apache_beam.Map('Parse JSON', json.loads)
  else:
    value = p | apache_beam.Read(
        'Read XML', custom_sources.XmlFileSource('page', gcs_path))

  for step in steps[start:end]:
    value = value | step

  if end:
    if not out.startswith('gs://'):
      raise ValueError('Output must be GCS path if an end is specified.')
    value = value | apache_beam.Map('to JSON', json.dumps) | \
        apache_beam.Write('Dump to GCS', apache_beam.io.TextFileSink(out))
  else:
    value = value | apache_beam.Write(
        'Dump metadata to BigQuery',
        apache_beam.io.BigQuerySink(
            out,
            schema=', '.join([
                'article_id:STRING',
                'article_title:STRING',
                'article_sentiment_polarity:FLOAT',
                'article_sentiment_magnitude:FLOAT',
                'entity_name:STRING',
                'entity_type:STRING',
                'entity_wikipedia_url:STRING',
                'entity_salience:FLOAT',
                'entity_num_mentions:INTEGER',
            ]),
            create_disposition=(
                apache_beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
            write_disposition=(
                apache_beam.io.BigQueryDisposition.WRITE_APPEND)))

  p.run()
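The force_string_function helper used above is not defined in this snippet. A hypothetical sketch of what such a helper might look like, assuming each element is a dict that may carry unicode values (the body here is an assumption, not the original implementation):

def force_string_function(key):
  """Hypothetical sketch: returns a callable coercing record[key] to str."""
  def force_string(record):
    value = record[key]
    if isinstance(value, unicode):  # Python 2, matching the era of this code
      record[key] = value.encode('utf-8')
    return record
  return force_string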
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir):
  feature_set = iris.IrisFeatures()

  training_data_source = beam.io.TextFileSource(
      training_data, strip_trailing_newlines=True,
      coder=io.CsvCoder.from_feature_set(feature_set, feature_set.csv_columns))

  eval_data_source = beam.io.TextFileSource(
      eval_data, strip_trailing_newlines=True,
      coder=io.CsvCoder.from_feature_set(feature_set, feature_set.csv_columns))

  predict_data_source = beam.io.TextFileSource(
      predict_data, strip_trailing_newlines=True,
      coder=io.CsvCoder.from_feature_set(feature_set, feature_set.csv_columns,
                                         has_target_columns=False))

  train = pipeline | beam.Read('ReadTrainingData', training_data_source)
  evaluate = pipeline | beam.Read('ReadEvalData', eval_data_source)
  predict = pipeline | beam.Read('ReadPredictData', predict_data_source)

  # TODO(b/32726166) Update input_format and format_metadata to read these
  # values directly from the coder.
  (metadata, train_features, eval_features, predict_features) = (
      (train, evaluate, predict)
      | 'Preprocess' >> ml.Preprocess(
          feature_set, input_format='csv',
          format_metadata={'headers': feature_set.csv_columns}))

  # Writes the metadata.yaml, features_train, features_eval, and
  # features_predict files.
  # pylint: disable=expression-not-assigned
  (metadata | 'SaveMetadata' >> io.SaveMetadata(
      os.path.join(output_dir, 'metadata.yaml')))

  # We turn off sharding of these feature files because the dataset is
  # very small.
  (train_features | 'SaveTrain' >> io.SaveFeatures(
      os.path.join(output_dir, 'features_train')))
  (eval_features | 'SaveEval' >> io.SaveFeatures(
      os.path.join(output_dir, 'features_eval')))
  (predict_features | 'SavePredict' >> io.SaveFeatures(
      os.path.join(output_dir, 'features_predict')))
  # pylint: enable=expression-not-assigned

  return metadata, train_features, eval_features, predict_features
def run(project, bucket):
  argv = [
      '--project={0}'.format(project),
      '--job_name=ch03timecorr',
      '--save_main_session',
      '--staging_location=gs://{0}/flights/staging/'.format(bucket),
      '--temp_location=gs://{0}/flights/temp/'.format(bucket),
      '--setup_file=./setup.py',
      '--max_num_workers=10',
      '--autoscaling_algorithm=THROUGHPUT_BASED',
      '--runner=DataflowPipelineRunner'
  ]
  airports_filename = 'gs://{}/flights/airports/airports.csv.gz'.format(bucket)
  flights_raw_files = 'gs://{}/flights/raw/*.csv'.format(bucket)
  flights_output = 'gs://{}/flights/tzcorr/all_flights'.format(bucket)
  events_output = '{}:flights.simevents'.format(project)

  pipeline = beam.Pipeline(argv=argv)

  airports = (pipeline
              | 'airports:read' >> beam.Read(
                  beam.io.TextFileSource(airports_filename))
              | 'airports:fields' >> beam.Map(
                  lambda line: next(csv.reader([line])))
              | 'airports:tz' >> beam.Map(
                  lambda fields: (fields[0],
                                  addtimezone(fields[21], fields[26]))))

  flights = (pipeline
             | 'flights:read' >> beam.Read(
                 beam.io.TextFileSource(flights_raw_files))
             | 'flights:tzcorr' >> beam.FlatMap(
                 tz_correct, beam.pvalue.AsDict(airports)))

  (flights
   | 'flights:tostring' >> beam.Map(lambda fields: ','.join(fields))
   | 'flights:out' >> beam.io.textio.WriteToText(flights_output))

  events = flights | beam.FlatMap(get_next_event)

  schema = 'FL_DATE:date,UNIQUE_CARRIER:string,AIRLINE_ID:string,CARRIER:string,FL_NUM:string,ORIGIN_AIRPORT_ID:string,ORIGIN_AIRPORT_SEQ_ID:integer,ORIGIN_CITY_MARKET_ID:string,ORIGIN:string,DEST_AIRPORT_ID:string,DEST_AIRPORT_SEQ_ID:integer,DEST_CITY_MARKET_ID:string,DEST:string,CRS_DEP_TIME:timestamp,DEP_TIME:timestamp,DEP_DELAY:float,TAXI_OUT:float,WHEELS_OFF:timestamp,WHEELS_ON:timestamp,TAXI_IN:float,CRS_ARR_TIME:timestamp,ARR_TIME:timestamp,ARR_DELAY:float,CANCELLED:string,CANCELLATION_CODE:string,DIVERTED:string,DISTANCE:float,DEP_AIRPORT_LAT:float,DEP_AIRPORT_LON:float,DEP_AIRPORT_TZOFFSET:float,ARR_AIRPORT_LAT:float,ARR_AIRPORT_LON:float,ARR_AIRPORT_TZOFFSET:float,EVENT:string,NOTIFY_TIME:timestamp,EVENT_DATA:string'

  (events
   | 'events:totablerow' >> beam.Map(lambda fields: create_row(fields))
   | 'events:out' >> beam.io.Write(beam.io.BigQuerySink(
       events_output,
       schema=schema,
       write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
       create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)))

  pipeline.run()
def run():
  parser = argparse.ArgumentParser(description='Compute monthly NDVI')
  parser.add_argument(
      '--index_file',
      default='2015index.txt.gz',
      help='default=2015index.txt.gz Use gs://gcp-public-data-landsat/index.csv.gz to process full dataset')
  parser.add_argument(
      '--output_file',
      default='output.txt',
      help='default=output.txt Supply a location on GCS when running on cloud')
  parser.add_argument(
      '--output_dir',
      required=True,
      help='Where should the ndvi images be stored? Supply a GCS location when running on cloud')
  known_args, pipeline_args = parser.parse_known_args()

  p = beam.Pipeline(argv=pipeline_args)
  index_file = known_args.index_file
  output_file = known_args.output_file
  output_dir = known_args.output_dir

  # lat = 4.37; lon = -7.71  # Cape Palmas
  lat = -21.1; lon = 55.50   # Reunion Island

  # Read the index file and find the best look
  scenes = (p
            | 'read_index' >> beam.Read(beam.io.TextFileSource(index_file))
            | 'filter_scenes' >> beam.FlatMap(
                lambda line: filterScenes(line, lat, lon))
            | 'least_cloudy' >> beam.CombinePerKey(clearest))

  # write out info about scene
  (scenes
   | beam.Map(lambda (yrmon, scene): scene.__dict__)
   | 'scene_info' >> beam.io.textio.WriteToText(output_file))

  # compute ndvi on scene
  scenes | 'compute_ndvi' >> beam.Map(
      lambda (yrmon, scene): ndvi.computeNdvi(scene.BASE_URL, output_dir))

  p.run()
def configure_pipeline(p, opt):
  """Specify PCollection and transformations in pipeline."""
  input_source = beam.io.TextFileSource(
      opt.input_path, strip_trailing_newlines=True)
  label_source = beam.io.TextFileSource(
      opt.input_dict, strip_trailing_newlines=True)
  labels = (p | 'Read dictionary' >> beam.Read(label_source))
  _ = (p
       | 'Read input' >> beam.Read(input_source)
       | 'Parse input' >> beam.Map(lambda line: csv.reader([line]).next())
       | 'Extract label ids' >> beam.ParDo(ExtractLabelIdsDoFn(),
                                           beam.pvalue.AsIter(labels))
       | 'Read and convert to JPEG' >> beam.ParDo(
           ReadImageAndConvertToJpegDoFn())
       | 'Embed and make TFExample' >> beam.ParDo(TFExampleFromImageDoFn())
       | 'Save to disk' >> SaveFeatures(opt.output_path))
def test_run_direct(self):
  file_name = self._create_temp_file('aaaa\nbbbb\ncccc\ndddd')
  pipeline = TestPipeline()
  pcoll = pipeline | beam.Read(LineSource(file_name))
  assert_that(pcoll, equal_to(['aaaa', 'bbbb', 'cccc', 'dddd']))
  pipeline.run()
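The LineSource used by this and the following tests is defined elsewhere in the test module. A minimal, non-splittable sketch of such a source, assuming the FileBasedSource API of this Beam era (the class body here is an illustration, not the actual test helper):

from apache_beam.io import filebasedsource

class SimpleLineSource(filebasedsource.FileBasedSource):
  """Hypothetical stand-in for LineSource: emits each line of a file."""

  def __init__(self, file_pattern, **kwargs):
    # Marking the source non-splittable sidesteps offset bookkeeping.
    super(SimpleLineSource, self).__init__(
        file_pattern, splittable=False, **kwargs)

  def read_records(self, file_name, unused_range_tracker):
    f = self.open_file(file_name)  # honors the source's compression_type
    try:
      for line in f.read().split('\n'):
        yield line
    finally:
      f.close()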
def run(argv=None): """Constructs and runs the example filtering pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', help='BigQuery table to read from.', default='clouddataflow-readonly:samples.weather_stations') parser.add_argument('--output', required=True, help='BigQuery table to write to.') parser.add_argument('--month_filter', default=7, help='Numeric value of month to filter on.') known_args, pipeline_args = parser.parse_known_args(argv) p = beam.Pipeline(argv=pipeline_args) input_data = p | beam.Read(beam.io.BigQuerySource(known_args.input)) # pylint: disable=expression-not-assigned (filter_cold_days(input_data, known_args.month_filter) | 'SaveToBQ' >> beam.io.Write( beam.io.BigQuerySink( known_args.output, schema='year:INTEGER,month:INTEGER,day:INTEGER,mean_temp:FLOAT', create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))) # Actually run the pipeline (all operations above are deferred). p.run()
def run():
  import os
  parser = argparse.ArgumentParser(description='Compute monthly NDVI')
  parser.add_argument(
      '--index_file',
      default='2015index.txt.gz',
      help='default=2015index.txt.gz ... gs://cloud-training-demos/landsat/2015index.txt.gz Use gs://gcp-public-data-landsat/index.csv.gz to process full dataset')
  parser.add_argument(
      '--output_file',
      default='output.txt',
      help='default=output.txt Supply a location on GCS when running on cloud')
  parser.add_argument(
      '--output_dir',
      required=True,
      help='Where should the ndvi images be stored? Supply a GCS location when running on cloud')
  known_args, pipeline_args = parser.parse_known_args()

  p = beam.Pipeline(argv=pipeline_args)
  index_file = known_args.index_file
  output_file = known_args.output_file
  output_dir = known_args.output_dir

  lat = -21.1; lon = 55.50  # center of Reunion Island
  dlat = 0.4; dlon = 0.4

  # Read the index file and find all scenes that cover this area
  allscenes = (p
               | 'read_index' >> beam.Read(beam.io.TextFileSource(index_file))
               | 'to_scene' >> beam.Map(lambda line: SceneInfo(line))
               | 'by_area' >> beam.FlatMap(
                   lambda scene: filterByArea(scene,
                                              lat + dlat, lon - dlon,
                                              lat - dlat, lon + dlon)))

  # for each month and spacecraft-coverage-pattern (given by the path and
  # row), find the clearest scene
  scenes = (allscenes
            | 'cov_month' >> beam.Map(
                lambda scene: (scene.month_path_row(), scene))
            | 'least_cloudy' >> beam.CombinePerKey(clearest)
            | 'yrmon-scene' >> beam.Map(
                lambda (key, scene): (scene.yrmon(), scene)))

  # write out info about scene
  (scenes
   | beam.Map(lambda (yrmon, scene): '{}: {}'.format(yrmon, scene.SCENE_ID))
   | 'scene_info' >> beam.io.textio.WriteToText(output_file))

  # compute ndvi on scene
  scenes | 'compute_ndvi' >> beam.Map(
      lambda (yrmon, scene): ndvi.computeNdvi(
          scene.BASE_URL, os.path.join(output_dir, yrmon),
          scene.SPACECRAFT_ID))

  p.run()
def preprocess(pipeline):
  feature_set = iris.IrisFeatures()

  training_data = beam.io.TextFileSource(
      args.training_data, strip_trailing_newlines=True,
      coder=io.CsvCoder.from_feature_set(feature_set, feature_set.csv_columns))

  eval_data = beam.io.TextFileSource(
      args.eval_data, strip_trailing_newlines=True,
      coder=io.CsvCoder.from_feature_set(feature_set, feature_set.csv_columns))

  predict_data = beam.io.TextFileSource(
      args.predict_data, strip_trailing_newlines=True,
      coder=io.CsvCoder.from_feature_set(feature_set, feature_set.csv_columns,
                                         has_target_columns=False))

  train = pipeline | beam.Read('ReadTrainingData', training_data)
  evaluate = pipeline | beam.Read('ReadEvalData', eval_data)
  predict = pipeline | beam.Read('ReadPredictData', predict_data)

  (metadata, train_features, eval_features, predict_features) = (
      (train, evaluate, predict)
      | 'Preprocess' >> ml.Preprocess(
          feature_set, input_format='csv',
          format_metadata={'headers': feature_set.csv_columns}))

  # Writes metadata.yaml (text file), features_train, features_eval, and
  # features_predict (TFRecord files)
  (metadata | 'SaveMetadata' >> io.SaveMetadata(
      os.path.join(args.output_dir, 'metadata.yaml')))

  # We turn off sharding of the feature files because the dataset is
  # very small.
  (train_features | 'SaveTrain' >> io.SaveFeatures(
      os.path.join(args.output_dir, 'features_train')))
  (eval_features | 'SaveEval' >> io.SaveFeatures(
      os.path.join(args.output_dir, 'features_eval')))
  (predict_features | 'SavePredict' >> io.SaveFeatures(
      os.path.join(args.output_dir, 'features_predict')))

  return metadata, train_features, eval_features, predict_features
def test_run_concat_direct(self):
  source = ConcatSource([RangeSource(0, 10),
                         RangeSource(10, 100),
                         RangeSource(100, 1000),
                        ])
  pipeline = TestPipeline()
  pcoll = pipeline | beam.Read(source)
  assert_that(pcoll, equal_to(range(1000)))
  pipeline.run()
def test_process_auto(self):
  path = os.path.join(self._new_tempdir(), 'result.gz')
  self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
  with beam.Pipeline(DirectRunner()) as p:
    result = (p
              | beam.Read(
                  _TFRecordSource(
                      path,
                      coder=coders.BytesCoder(),
                      compression_type=fileio.CompressionTypes.AUTO)))
    beam.assert_that(result, beam.equal_to(['foo', 'bar']))
def test_process_single(self):
  path = os.path.join(self._new_tempdir(), 'result')
  self._write_file(path, FOO_RECORD_BASE64)
  with TestPipeline() as p:
    result = (p
              | beam.Read(
                  _TFRecordSource(
                      path,
                      coder=coders.BytesCoder(),
                      compression_type=fileio.CompressionTypes.AUTO)))
    beam.assert_that(result, beam.equal_to(['foo']))
def run(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument('--input', dest='input', required=True,
                      help='Input file to process.')
  parser.add_argument('--output', dest='output', required=True,
                      help='Output file to write results to.')
  parser.add_argument('--model', dest='model', required=True,
                      help='Checkpoint file of the model.')
  parser.add_argument('--source', dest='source', required=True,
                      help='Data source location (cs|bq).')
  known_args, pipeline_args = parser.parse_known_args(argv)

  if known_args.source == 'cs':

    def _to_dictionary(line):
      result = {}
      result['key'], result['image'] = line.split(':')
      return result

    p = beam.Pipeline(argv=pipeline_args)
    images = (p
              | 'ReadFromText' >> beam.io.ReadFromText(known_args.input)
              | 'ConvertToDict' >> beam.Map(_to_dictionary))
    predictions = images | 'Prediction' >> beam.ParDo(PredictDoFn(),
                                                      known_args.model)
    predictions | 'WriteToText' >> beam.io.WriteToText(known_args.output)

  else:
    schema = 'key:INTEGER'
    for i in range(10):
      schema += (', pred%d:FLOAT' % i)
    p = beam.Pipeline(argv=pipeline_args)
    images = p | 'ReadFromBQ' >> beam.Read(
        beam.io.BigQuerySource(known_args.input))
    predictions = images | 'Prediction' >> beam.ParDo(PredictDoFn(),
                                                      known_args.model)
    predictions | 'WriteToBQ' >> beam.Write(
        beam.io.BigQuerySink(
            known_args.output,
            schema=schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

  logging.getLogger().setLevel(logging.INFO)
  p.run()
def test_process_gzip(self):
  path = os.path.join(self._new_tempdir(), 'result')
  self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
  with TestPipeline() as p:
    result = (p
              | beam.Read(
                  _TFRecordSource(
                      path,
                      coder=coders.BytesCoder(),
                      compression_type=CompressionTypes.GZIP,
                      validate=True)))
    beam.assert_that(result, beam.equal_to(['foo', 'bar']))
def test_read_auto_single_file_gzip(self):
  _, lines = write_data(10)
  filename = tempfile.NamedTemporaryFile(
      delete=False, prefix=tempfile.template, suffix='.gz').name
  with gzip.GzipFile(filename, 'wb') as f:
    f.write('\n'.join(lines))
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> beam.Read(LineSource(
      filename, compression_type=CompressionTypes.AUTO))
  assert_that(pcoll, equal_to(lines))
  pipeline.run()
def test_read_file_bzip2(self):
  _, lines = write_data(10)
  filename = tempfile.NamedTemporaryFile(
      delete=False, prefix=tempfile.template).name
  with bz2.BZ2File(filename, 'wb') as f:
    f.write('\n'.join(lines))
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> beam.Read(LineSource(
      filename,
      splittable=False,
      compression_type=fileio.CompressionTypes.BZIP2))
  assert_that(pcoll, equal_to(lines))
  pipeline.run()
def test_read_pattern_bzip2(self):
  _, lines = write_data(200)
  splits = [0, 34, 100, 140, 164, 188, 200]
  chunks = [lines[splits[i - 1]:splits[i]] for i in xrange(1, len(splits))]
  compressed_chunks = []
  for c in chunks:
    compressobj = bz2.BZ2Compressor()
    compressed_chunks.append(
        compressobj.compress('\n'.join(c)) + compressobj.flush())
  file_pattern = write_prepared_pattern(compressed_chunks)
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> beam.Read(LineSource(
      file_pattern,
      splittable=False,
      compression_type=CompressionTypes.BZIP2))
  assert_that(pcoll, equal_to(lines))
  pipeline.run()
def test_read_auto_pattern(self):
  _, lines = write_data(200)
  splits = [0, 34, 100, 140, 164, 188, 200]
  chunks = [lines[splits[i - 1]:splits[i]] for i in xrange(1, len(splits))]
  compressed_chunks = []
  for c in chunks:
    out = cStringIO.StringIO()
    with gzip.GzipFile(fileobj=out, mode="w") as f:
      f.write('\n'.join(c))
    compressed_chunks.append(out.getvalue())
  file_pattern = write_prepared_pattern(
      compressed_chunks, suffixes=['.gz'] * len(chunks))
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> beam.Read(LineSource(
      file_pattern, compression_type=CompressionTypes.AUTO))
  assert_that(pcoll, equal_to(lines))
  pipeline.run()
def main(src_path, dest_table, pipeline_args):
  p = apache_beam.Pipeline(argv=pipeline_args)

  value = p | 'Read JSON' >> apache_beam.Read(JsonFileSource(src_path))
  value |= (
      'Remove records that lack location or year data' >>
      apache_beam.FlatMap(discard_incomplete))
  value |= (
      'Convert string values to their types' >>
      apache_beam.Map(convert_types))
  value |= (
      'Filter bad data' >> apache_beam.FlatMap(filter_suspicious))
  value |= (
      'Massage fields with "rec" prefix' >> apache_beam.Map(massage_rec))
  value |= (
      'Dump data to BigQuery' >> apache_beam.Write(apache_beam.io.BigQuerySink(
          dest_table,
          schema=', '.join([
              'fall:STRING',
              'year:INTEGER',
              'nametype:STRING',
              'mass:FLOAT',
              'name:STRING',
              'class:STRING',
              'latitude:FLOAT',
              'longitude:FLOAT',
              'id:STRING',
          ]),
          create_disposition=(
              apache_beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
          write_disposition=(
              apache_beam.io.BigQueryDisposition.WRITE_TRUNCATE))))

  p.run()
def run():
  argv = [
      '--project={0}'.format(PROJECT),
      '--job_name=examplejob2',
      '--save_main_session',
      '--staging_location=gs://{0}/staging/'.format(BUCKET),
      '--temp_location=gs://{0}/staging/'.format(BUCKET),
      '--runner=BlockingDataflowPipelineRunner'
  ]

  p = beam.Pipeline(argv=argv)
  input = 'gs://{0}/javahelp/*.java'.format(BUCKET)
  output_prefix = 'gs://{0}/javahelp/output'.format(BUCKET)
  searchTerm = 'import'

  # find all lines that contain the searchTerm
  (p
   | 'GetJava' >> beam.Read(beam.io.TextFileSource(input))
   | 'Grep' >> beam.FlatMap(lambda line: my_grep(line, searchTerm))
   | 'write' >> beam.io.textio.WriteToText(output_prefix))

  p.run()
def run(argv=None): """Run the workflow.""" parser = argparse.ArgumentParser() parser.add_argument('--output') parser.add_argument('--ignore_corpus', default='') parser.add_argument('--ignore_word', default='') parser.add_argument('--num_groups') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) group_ids = [] for i in xrange(0, int(known_args.num_groups)): group_ids.append('id' + str(i)) query_corpus = 'select UNIQUE(corpus) from publicdata:samples.shakespeare' query_word = 'select UNIQUE(word) from publicdata:samples.shakespeare' ignore_corpus = known_args.ignore_corpus ignore_word = known_args.ignore_word pcoll_corpus = p | 'read corpus' >> beam.io.Read( beam.io.BigQuerySource(query=query_corpus)) pcoll_word = p | 'read_words' >> beam.Read( beam.io.BigQuerySource(query=query_word)) pcoll_ignore_corpus = p | 'create_ignore_corpus' >> beam.Create( [ignore_corpus]) pcoll_ignore_word = p | 'create_ignore_word' >> beam.Create([ignore_word]) pcoll_group_ids = p | 'create groups' >> beam.Create(group_ids) pcoll_groups = create_groups(pcoll_group_ids, pcoll_corpus, pcoll_word, pcoll_ignore_corpus, pcoll_ignore_word) # pylint:disable=expression-not-assigned pcoll_groups | WriteToText(known_args.output) p.run()
def test_read_auto_pattern_compressed_and_uncompressed(self):
  _, lines = write_data(200)
  splits = [0, 34, 100, 140, 164, 188, 200]
  chunks = [lines[splits[i - 1]:splits[i]] for i in xrange(1, len(splits))]
  chunks_to_write = []
  for i, c in enumerate(chunks):
    if i % 2 == 0:
      out = cStringIO.StringIO()
      with gzip.GzipFile(fileobj=out, mode="w") as f:
        f.write('\n'.join(c))
      chunks_to_write.append(out.getvalue())
    else:
      chunks_to_write.append('\n'.join(c))
  file_pattern = write_prepared_pattern(
      chunks_to_write, suffixes=(['.gz', ''] * 3))
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> beam.Read(LineSource(
      file_pattern, compression_type=fileio.CompressionTypes.AUTO))
  assert_that(pcoll, equal_to(lines))
  pipeline.run()
def _run_source_test(self, pattern, expected_data, splittable=True):
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> beam.Read(
      LineSource(pattern, splittable=splittable))
  assert_that(pcoll, equal_to(expected_data))
  pipeline.run()
        'gamma': 1.2
    }
    model = xgb.train(best_params, dtrain, num_boost_round=1000,
                      evals=watchlist, evals_result=evals_result,
                      verbose_eval=True)
    test.loc[:, "predict"] = model.predict(dtest)
    return test[["shop_id", "date", "predict", "sales"]].to_dict(
        orient='records')

(pipeline
 | "Query data" >> beam.Read(beam.io.BigQuerySource(query=query))
 | "Assign time" >> beam.Map(assign_timevalue)
 | "Set window" >> beam.WindowInto(window.SlidingWindows(size=3, period=1))
 | "Set group key" >> beam.Map(lambda v: ('shop_id', v))
 | beam.GroupByKey()
 | "Learn and predict" >> beam.FlatMap(learn_predict)
 | "Write data" >> beam.Write(beam.io.BigQuerySink(
     'dataset.table',
     schema="shop_id:STRING, date:STRING, predict:FLOAT, sales:INTEGER",
     write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
     create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)))

pipeline.run()
def run(argv=None): """Runs the workflow.""" parser = argparse.ArgumentParser() parser.add_argument('--input', required=True, help='Input file to process.') parser.add_argument('--output', required=True, help='Output BigQuery table: PROJECT:DATASET.TABLE') known_args, pipeline_args = parser.parse_known_args(argv) schema = bigquery.TableSchema() schema.fields.append(field('Alexa_rank', 'integer')) schema.fields.append(field('Alexa_domain')) schema.fields.append(field('DMOZ_title')) schema.fields.append(field('DMOZ_description')) schema.fields.append(field('DMOZ_url')) schema.fields.append(field('DMOZ_topic', 'string', 'repeated')) schema.fields.append(field('Host')) schema.fields.append(field('FinalLocation')) schema.fields.append(field('HTTPOk', 'boolean')) schema.fields.append(field('HTTPSOk', 'boolean')) schema.fields.append(field('HTTPSOnly', 'boolean')) schema.fields.append(build_response_schema('HTTPResponses')) schema.fields.append(build_response_schema('HTTPSResponses')) schema.fields.append(field('Error')) options = PipelineOptions(pipeline_args) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). options.view_as(SetupOptions).save_main_session = True # https://cloud.google.com/dataflow/pipelines/specifying-exec-params gc_options = options.view_as(GoogleCloudOptions) gc_options.project = 'httparchive' gc_options.job_name = 'host-scan-import-' + str(datetime.date.today()) gc_options.staging_location = 'gs://httparchive/dataflow-binaries' gc_options.temp_location = 'gs://httparchive/dataflow-tmp' wk_options = options.view_as(WorkerOptions) wk_options.num_workers = 10 # options.view_as(StandardOptions).runner = 'DirectPipelineRunner' options.view_as(StandardOptions).runner = 'DataflowPipelineRunner' p = beam.Pipeline(options=options) (p | 'read' >> beam.Read( beam.io.TextFileSource(known_args.input, coder=JsonCoder())) | 'process' >> beam.FlatMap(process_record) # | 'local-write' >> beam.Write(beam.io.TextFileSink('./results'))) | 'bq-write' >> beam.io.Write( beam.io.BigQuerySink( known_args.output, schema=schema, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))) p.run()
  for f in [13, 14, 17]:  # crsdeptime, deptime, wheelsoff
    fields[f] = as_utc(fields[0], fields[f], dep_timezone)
  for f in [18, 20, 21]:  # wheelson, crsarrtime, arrtime
    fields[f] = as_utc(fields[0], fields[f], arr_timezone)

  yield ','.join(fields)

if __name__ == '__main__':
  pipeline = beam.Pipeline('DirectPipelineRunner')

  airports = (pipeline
              | 'airports:read' >> beam.Read(
                  beam.io.TextFileSource('airports.csv.gz'))
              | 'airports:fields' >> beam.Map(
                  lambda line: next(csv.reader([line])))
              | 'airports:tz' >> beam.Map(
                  lambda fields: (fields[0],
                                  addtimezone(fields[21], fields[26]))))

  flights = (pipeline
             | 'flights:read' >> beam.Read(
                 beam.io.TextFileSource('201501_part.csv'))
             | 'flights:tzcorr' >> beam.FlatMap(
                 tz_correct, beam.pvalue.AsDict(airports)))

  flights | beam.io.textio.WriteToText('all_flights')

  pipeline.run()
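The as_utc helper called above is defined earlier in this file and not shown. A simplified hypothetical sketch, assuming 'HHMM' local time strings and an IANA timezone name (the body here is an assumption, not the original):

import datetime
import pytz

def as_utc(date, hhmm, tzone):
  """Hypothetical sketch: convert an 'HHMM' local time on date to UTC."""
  if len(hhmm) > 0 and tzone is not None:
    loc_tz = pytz.timezone(tzone)
    # Localize the date at midnight, then add the time of day.
    loc_dt = loc_tz.localize(
        datetime.datetime.strptime(date, '%Y-%m-%d'), is_dst=False)
    loc_dt += datetime.timedelta(hours=int(hhmm[:2]), minutes=int(hhmm[2:]))
    utc_dt = loc_dt.astimezone(pytz.utc)
    return utc_dt.strftime('%Y-%m-%d %H:%M:%S')
  return ''  # leave empty times (e.g. cancelled flights) empty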
  (SELECT __key__.id as accnt_id
   FROM [lead-pages:leadpages.Account_cleansed] LIMIT 100)"""

options = PipelineOptions(flags=sys.argv)

# For Cloud execution, set the Cloud Platform project, job_name,
# staging location, temp_location and specify DataflowRunner.
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = project
google_cloud_options.job_name = 'lp-analysis'
google_cloud_options.staging_location = 'gs://lp_activity_transform/staging'
google_cloud_options.temp_location = 'gs://lp_activity_transform/temp'
options.view_as(StandardOptions).runner = 'DataflowRunner'

p = beam.Pipeline(options=options)

(p
 | 'read' >> beam.Read(beam.io.BigQuerySource(query=input_query))
 | 'cast ints' >> beam.Map(lambda row: (row['account_id'], int(row['views'])))
 | beam.CombinePerKey(sum)
 | 'format for gbq' >> beam.Map(
     lambda (k, v): {'account_id': k, 'total_views': v})
 | 'save' >> beam.Write(
     beam.io.BigQuerySink(
         output_table,
         schema='account_id:INTEGER, total_views:INTEGER',
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))

p.run()
import apache_beam as beam

project = 'teak-proton-148317'
input_table = 'clouddataflow-readonly:samples.weather_stations'
output_table = 'mydataset.weather_copy_from_dataflow1'

p = beam.Pipeline(argv=['--project', project])

read = beam.Read(beam.io.BigQuerySource(input_table))
tornadoesMonths = beam.FlatMap(
    lambda row: [(int(row['month']), 1)] if row['tornado'] else [])
monthlyCount = beam.CombinePerKey(sum)
frmat = beam.Map(lambda (k, v): {'month': k, 'tornado_count': v})
sve = beam.Write(
    beam.io.BigQuerySink(
        output_table,
        schema='month:INTEGER, tornado_count:INTEGER',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

(p | read | tornadoesMonths | monthlyCount | frmat | sve)
p.run()
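Because each transform is stored in a variable, the same chain can be applied with explicit stage labels, matching the 'label' >> transform idiom used elsewhere in these examples; a minimal re-expression, not part of the original:

# Equivalent application with named stages; the labels are illustrative.
(p
 | 'read' >> read
 | 'months' >> tornadoesMonths
 | 'count' >> monthlyCount
 | 'format' >> frmat
 | 'save' >> sve)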
import csv

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions()
pipeline = beam.Pipeline('DirectPipelineRunner')

airports = (pipeline
            | beam.Read(beam.io.TextFileSource('airports.csv.gz'))
            | beam.Map(lambda line: next(csv.reader([line])))
            | beam.Map(lambda fields: (fields[0], (fields[21], fields[26]))))
def expand(self, pvalue):
  return pvalue.pipeline | beam.Read(_TFRecordSource(*self._args))
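A transform written this way wraps the raw source so callers can apply it directly to a pipeline; a hypothetical usage sketch, assuming the method above belongs to a ReadFromTFRecord-style PTransform whose constructor stores its arguments in self._args:

# Hypothetical usage; the class name and constructor signature are assumptions.
with beam.Pipeline() as p:
  records = p | 'ReadTFRecords' >> ReadFromTFRecord('/tmp/data.tfrecord')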