def test_dict_to_csv(self):
    test_dict = {
        "a": 0,
        "b": 1,
        "c": 2,
    }
    csv1 = dict_to_csv(test_dict, ['a', 'b', 'c'])
    self.assertEqual(csv1, '0,1,2')
    csv2 = dict_to_csv(test_dict, ['c', 'b', 'a'])
    self.assertEqual(csv2, '2,1,0')
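
# The dict_to_csv helper exercised above is defined elsewhere in this repo.
# A minimal sketch consistent with the assertions in test_dict_to_csv is shown
# below for reference; the actual implementation may differ.
def _dict_to_csv_sketch(row_dict, column_order):
    """Illustrative stand-in for dict_to_csv: join the row's values with
    commas, in the requested column order."""
    return ','.join(str(row_dict[col]) for col in column_order)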
def run(argv=None):
    """
    This function parses the command line arguments and runs the Beam
    Pipeline.

    Args:
        argv: list containing the command line arguments for this call of the
            script.
    """
    # Keeps track of whether the schema was inferred from the input or output
    # table.
    schema_inferred = False

    data_args, pipeline_args = parse_data_generator_args(argv)
    data_args, schema_inferred = fetch_schema(data_args, schema_inferred)
    pipeline_options = PipelineOptions(pipeline_args)

    temp_location = pipeline_options.display_data()['temp_location']
    temp_blob = write_n_line_file_to_gcs(
        pipeline_options.display_data()['project'], temp_location,
        data_args.num_records)

    data_gen = DataGenerator(bq_schema_filename=data_args.schema_file,
                             input_bq_table=data_args.input_bq_table,
                             p_null=data_args.p_null,
                             n_keys=data_args.n_keys,
                             min_date=data_args.min_date,
                             max_date=data_args.max_date,
                             only_pos=data_args.only_pos,
                             max_int=data_args.max_int,
                             max_float=data_args.max_float,
                             float_precision=data_args.float_precision,
                             write_disp=data_args.write_disp,
                             key_skew=data_args.key_skew,
                             primary_key_cols=data_args.primary_key_cols)

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line. These include where Dataflow should store temp files,
    # the project id, and which runner to use.
    p = beam.Pipeline(options=pipeline_options)

    rows = (
        p
        # Read the file we created with num_records newlines.
        | 'Read file with num_records lines' >> beam.io.ReadFromText(
            os.path.join('gs://', temp_blob.bucket.name, temp_blob.name))
        # Use our instance of our custom DataGenerator class to generate 1
        # fake datum with the appropriate schema for each element in the
        # PCollection created above.
        | 'Generate Data' >> beam.ParDo(FakeRowGen(data_gen))
        | 'Parse Json Strings' >> beam.FlatMap(lambda row: [json.loads(row)]))

    if data_args.primary_key_cols:
        for key in data_args.primary_key_cols.split(','):
            rows |= 'Enforcing primary key: {}'.format(
                key) >> EnforcePrimaryKeys(key)

    if data_args.csv_schema_order:
        (rows
         | 'Order fields for CSV writing.' >> beam.FlatMap(
             lambda d: [dict_to_csv(d, data_args.csv_schema_order.split(','))])
         | 'Write to GCS' >> beam.io.textio.WriteToText(
             file_path_prefix=data_args.output_prefix,
             file_name_suffix='.csv'))

    if data_args.avro_schema_file:
        fastavro_avsc = fastavro.schema.load_schema(data_args.avro_schema_file)

        (rows
         # Need to convert timestamps from strings to timestamp-micros.
         | 'Fix date and time Types for Avro.' >> beam.FlatMap(
             lambda row: fix_record_for_avro(row, fastavro_avsc))
         | 'Write to Avro.' >> beam.io.avroio.WriteToAvro(
             file_path_prefix=data_args.output_prefix,
             codec='null',
             file_name_suffix='.avro',
             use_fastavro=True,
             schema=fastavro_avsc))

    if data_args.write_to_parquet:
        with open(data_args.schema_file, 'r') as infile:
            str_schema = json.load(infile)
        pa_schema = get_pyarrow_translated_schema(str_schema)
        (rows
         | 'Fix date and time Types for Parquet.' >> beam.FlatMap(
             lambda row: fix_record_for_parquet(row, str_schema))
         | 'Write to Parquet.' >> beam.io.WriteToParquet(
             file_path_prefix=data_args.output_prefix,
             codec='null',
             file_name_suffix='.parquet',
             schema=pa_schema))

    if data_args.output_bq_table:
        (rows
         | 'Write to BigQuery.' >> beam.io.gcp.bigquery.WriteToBigQuery(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command line.
             data_args.output_bq_table,
             schema=None if schema_inferred else data_gen.get_bq_schema(),
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=data_gen.write_disp,
             # Use the max recommended batch size.
             batch_size=500))

    p.run().wait_until_finish()

    # Manually clean up temp_num_records.txt because it is outside this job's
    # directory and Dataflow will not remove it for us.
    temp_blob.delete()
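
# A minimal sketch of how run() above might be invoked for a small local test
# run. The data-generator flag names (--schema_file, --num_records,
# --output_prefix, --csv_schema_order) are assumptions about what
# parse_data_generator_args accepts, and the bucket/project values are
# placeholders. --project, --temp_location, and --runner are standard Beam
# pipeline options consumed by PipelineOptions.
if __name__ == '__main__':
    run([
        '--schema_file=gs://my-bucket/schemas/example_schema.json',  # placeholder
        '--num_records=1000',
        '--output_prefix=gs://my-bucket/data/example',  # placeholder
        '--csv_schema_order=a,b,c',
        '--project=my-gcp-project',  # placeholder project id
        '--temp_location=gs://my-bucket/temp',
        '--runner=DirectRunner',
    ])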
def run(argv=None):
    """
    This function parses the command line arguments and runs the Beam
    Pipeline.

    Args:
        argv: list containing the command line arguments for this call of the
            script.
    """
    schema_inferred = False

    data_args, pipeline_args = parse_data_generator_args(argv)
    data_args, schema_inferred = fetch_schema(data_args, schema_inferred)
    pipeline_options = PipelineOptions(pipeline_args)

    data_gen = DataGenerator(bq_schema_filename=data_args.schema_file,
                             input_bq_table=data_args.input_bq_table,
                             hist_bq_table=data_args.hist_bq_table,
                             p_null=data_args.p_null,
                             n_keys=data_args.n_keys,
                             min_date=data_args.min_date,
                             max_date=data_args.max_date,
                             only_pos=data_args.only_pos,
                             max_int=data_args.max_int,
                             max_float=data_args.max_float,
                             float_precision=data_args.float_precision,
                             write_disp=data_args.write_disp,
                             key_skew=data_args.key_skew)

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line. These include where Dataflow should store temp files,
    # the project id, and which runner to use.
    p = beam.Pipeline(options=pipeline_options)

    rows = (
        p
        | 'Read Histogram Table.' >> beam.io.Read(
            beam.io.BigQuerySource(data_gen.hist_bq_table))
        | 'Generate Data' >> beam.ParDo(FakeRowGen(data_gen))
        | 'Parse Json Strings' >> beam.FlatMap(lambda row: [json.loads(row)]))

    if data_args.primary_key_cols:
        for key in data_args.primary_key_cols.split(','):
            rows |= 'Enforcing primary key: {}'.format(
                key) >> EnforcePrimaryKeys(key)

    if data_args.csv_schema_order:
        (rows
         | 'Order fields for CSV writing.' >> beam.FlatMap(
             lambda d: [dict_to_csv(d, data_args.csv_schema_order.split(','))])
         | 'Write to GCS' >> beam.io.textio.WriteToText(
             file_path_prefix=data_args.output_prefix,
             file_name_suffix='.csv'))

    if data_args.avro_schema_file:
        avsc = avro.schema.Parse(open(data_args.avro_schema_file, 'rb').read())

        (rows
         # Need to convert timestamps from strings to timestamp-micros.
         | 'Fix date and time Types for Avro.' >> beam.FlatMap(
             lambda row: fix_record_for_avro(row, avsc))
         | 'Write to Avro.' >> beam.io.avroio.WriteToAvro(
             file_path_prefix=data_args.output_prefix,
             codec='null',
             file_name_suffix='.avro',
             use_fastavro=True,
             schema=avsc))

    if data_args.output_bq_table:
        (rows
         | 'Write to BigQuery.' >> beam.io.gcp.bigquery.WriteToBigQuery(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command line.
             data_args.output_bq_table,
             schema=None if schema_inferred else data_gen.get_bq_schema(),
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=data_gen.write_disp,
             # Use the max recommended batch size.
             batch_size=500))

    p.run().wait_until_finish()
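
# Both pipelines above apply an EnforcePrimaryKeys transform that is defined
# elsewhere in this repo. The sketch below is an illustrative stand-in showing
# one way such a composite PTransform could deduplicate generated rows on a
# single key column; the repo's actual implementation may differ.
import apache_beam as beam


class EnforcePrimaryKeysSketch(beam.PTransform):
    """Illustrative stand-in for EnforcePrimaryKeys.

    Keeps a single row per value of `key_col` so the generated data behaves
    like a table with that column as a primary key.
    """

    def __init__(self, key_col):
        self.key_col = key_col

    def expand(self, pcoll):
        return (pcoll
                # Key each row dict by its primary key column.
                | 'Key by {}'.format(self.key_col) >> beam.Map(
                    lambda row: (row[self.key_col], row))
                # Collapse duplicates, keeping one arbitrary row per key.
                | 'Group by {}'.format(self.key_col) >> beam.GroupByKey()
                | 'Pick one row per {}'.format(self.key_col) >> beam.Map(
                    lambda kv: next(iter(kv[1]))))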