Example #1
    def setUp(self):

        # Note changed default for schema_file for ease of testing.
        dir_path = os.path.dirname(os.path.realpath(''))
        schema_file = os.path.join(dir_path, 'data-generator-pipeline', 'resources', 'lineorder-schema.json')

        self.data_gen = DataGenerator(bq_schema_filename=schema_file,
                                      p_null=0.0, n_keys=1000, min_date='2000-01-01',
                                      max_date=datetime.date.today().strftime('%Y-%m-%d'),
                                      only_pos=True, max_int=10**11, max_float=float(10**11),
                                      float_precision=2, write_disp='WRITE_APPEND')

        self.fakerowgen = FakeRowGen(self.data_gen)

        logging.basicConfig(level=logging.INFO)
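For context, bq_schema_filename above points at resources/lineorder-schema.json, a BigQuery JSON schema file. A minimal sketch of an excerpt of such a file, shown here as a Python literal so it can carry a comment, uses only field names that appear in the assertions of Example #3; the types for lo_linenumber and lo_tax are inferred from those assertions, and the exact contents of the real file are an assumption.

# Hypothetical excerpt of lineorder-schema.json (the real file has many more
# fields; this is an illustrative assumption, not the actual file contents).
example_schema_excerpt = [
    {"name": "lo_orderdate", "type": "DATE", "mode": "NULLABLE"},
    {"name": "lo_linenumber", "type": "INTEGER", "mode": "NULLABLE"},
    {"name": "lo_tax", "type": "FLOAT", "mode": "NULLABLE"},
    {"name": "lo_record_field", "type": "RECORD", "mode": "REPEATED", "fields": [
        {"name": "name", "type": "STRING", "mode": "NULLABLE"},
        {"name": "email", "type": "STRING", "mode": "NULLABLE"},
        {"name": "time_sec", "type": "INTEGER", "mode": "NULLABLE"},
        {"name": "tz_offset", "type": "INTEGER", "mode": "NULLABLE"},
        {"name": "date", "type": "TIMESTAMP", "mode": "NULLABLE"}
    ]}
]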
Example #2
def run(argv=None):
    """
    This function parses the command line arguments and runs the Beam Pipeline.

    Args:
        argv: list containing the commandline arguments for this call of the
         script.
    """
    # Keeps track of whether the schema was inferred from the input or output table.
    schema_inferred = False

    data_args, pipeline_args = parse_data_generator_args(argv)
    data_args, schema_inferred = fetch_schema(data_args, schema_inferred)
    pipeline_options = PipelineOptions(pipeline_args)

    temp_location = pipeline_options.display_data()['temp_location']
    temp_blob = write_n_line_file_to_gcs(
        pipeline_options.display_data()['project'], temp_location,
        data_args.num_records)

    data_gen = DataGenerator(bq_schema_filename=data_args.schema_file,
                             input_bq_table=data_args.input_bq_table,
                             p_null=data_args.p_null,
                             n_keys=data_args.n_keys,
                             min_date=data_args.min_date,
                             max_date=data_args.max_date,
                             only_pos=data_args.only_pos,
                             max_int=data_args.max_int,
                             max_float=data_args.max_float,
                             float_precision=data_args.float_precision,
                             write_disp=data_args.write_disp,
                             key_skew=data_args.key_skew,
                             primary_key_cols=data_args.primary_key_cols)

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line.  This includes information such as where Dataflow should
    # store temp files, what the project id is, and which runner to use.
    p = beam.Pipeline(options=pipeline_options)

    rows = (
        p
        # Read the file we created with num_records newlines.
        | 'Read file with num_records lines' >> beam.io.ReadFromText(
            os.path.join('gs://', temp_blob.bucket.name, temp_blob.name))

        # Use our instance of the custom DataGenerator class to generate one
        # fake datum with the appropriate schema for each element in the
        # PCollection created above.
        | 'Generate Data' >> beam.ParDo(FakeRowGen(data_gen))
        | 'Parse Json Strings' >> beam.FlatMap(lambda row: [json.loads(row)]))

    if data_args.primary_key_cols:
        for key in data_args.primary_key_cols.split(','):
            rows |= 'Enforcing primary key: {}'.format(
                key) >> EnforcePrimaryKeys(key)

    if data_args.csv_schema_order:
        (rows
         | 'Order fields for CSV writing.' >> beam.FlatMap(
             lambda d: [dict_to_csv(d, data_args.csv_schema_order.split(','))])
         | 'Write to GCS' >> beam.io.textio.WriteToText(
             file_path_prefix=data_args.output_prefix, file_name_suffix='.csv')
         )

    if data_args.avro_schema_file:
        fastavro_avsc = fastavro.schema.load_schema(data_args.avro_schema_file)

        (rows
         # Need to convert time stamps from strings to timestamp-micros
         | 'Fix date and time Types for Avro.' >>
         beam.FlatMap(lambda row: fix_record_for_avro(row, fastavro_avsc))
         | 'Write to Avro.' >> beam.io.avroio.WriteToAvro(
             file_path_prefix=data_args.output_prefix,
             codec='null',
             file_name_suffix='.avro',
             use_fastavro=True,
             schema=fastavro_avsc))

    if data_args.write_to_parquet:
        with open(data_args.schema_file, 'r') as infile:
            str_schema = json.load(infile)
        pa_schema = get_pyarrow_translated_schema(str_schema)
        (rows
         | 'Fix date and time Types for Parquet.' >>
         beam.FlatMap(lambda row: fix_record_for_parquet(row, str_schema))
         | 'Write to Parquet.' >> beam.io.WriteToParquet(
             file_path_prefix=data_args.output_prefix,
             codec='null',
             file_name_suffix='.parquet',
             schema=pa_schema))

    if data_args.output_bq_table:
        (rows
         | 'Write to BigQuery.' >> beam.io.gcp.bigquery.WriteToBigQuery(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command
             # line.
             data_args.output_bq_table,
             schema=None if schema_inferred else data_gen.get_bq_schema(),
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=data_gen.write_disp,
             # Use the max recommended batch size.
             batch_size=500))

    p.run().wait_until_finish()

    # Manually clean up temp_num_records.txt because it will be outside this
    # job's directory and Dataflow will not remove it for us.
    temp_blob.delete()
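As a usage note, a pipeline module like this is normally executed directly as a script. A minimal sketch of the entry point follows; the __main__ guard and logging level are assumptions and are not part of the listing above.

if __name__ == '__main__':
    # Assumed entry point: surface pipeline progress in the console and let
    # the argument parser fall back to sys.argv when argv is None.
    logging.getLogger().setLevel(logging.INFO)
    run()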
Example #3
class TestPrettyDataGenerator(unittest.TestCase):
    """The test cases are focused on the business logic.  In this case this is how we parse the
    schemas, generate data and label images.

    Execution Note:
        This script is stored in professional-services/data-analytics/dataflow-python-examples/tests
        but should be copied to professional-services/data-analytics/dataflow-python-examples/ and
        run from there.
    """

    def setUp(self):

        # Note changed default for schema_file for ease of testing.
        dir_path = os.path.dirname(os.path.realpath(''))
        schema_file = os.path.join(dir_path, 'data-generator-pipeline', 'resources', 'lineorder-schema.json')

        self.data_gen = DataGenerator(bq_schema_filename=schema_file,
                                      p_null=0.0, n_keys=1000, min_date='2000-01-01',
                                      max_date=datetime.date.today().strftime('%Y-%m-%d'),
                                      only_pos=True, max_int=10**11, max_float=float(10**11),
                                      float_precision=2, write_disp='WRITE_APPEND')

        self.fakerowgen = FakeRowGen(self.data_gen)

        logging.basicConfig(level=logging.INFO)

    def test_get_faker_schema(self):
        """
        This tests the get_faker_schema method of the DataGenerator class.
        """
        expected_faker_schema = {
            u'lo_recieptfile': 'file_name',  # This tests a field from special_map.
            u'lo_cust_key': 'word',  # The rest of the fields test type_map.
            u'lo_order_key': 'word',
            u'lo_ordpriority': 'random_number',
            u'lo_supp_key': 'word',
            u'lo_quantity': 'random_number',
            u'lo_revenue': 'pyfloat',
            u'lo_orderdate': 'date_this_century',
            u'lo_extendedprice': 'pyfloat',
            u'lo_supplycost': 'pyfloat',
            u'lo_part_key': 'word',
            u'lo_discount': 'pyfloat',
            u'lo_shippriority': 'random_number',
            u'lo_shipmode': 'pyfloat',
            u'lo_ordtotalprice': 'pyfloat',
            u'lo_linenumber': 'random_number',
            u'lo_tax': 'pyfloat',
            u'lo_record_field': {
                u'name': 'name',
                u'email': 'email',
                u'time_sec': 'random_number',
                u'tz_offset': 'random_number',
                u'date': 'date_time_this_century'}
        }
        actual_faker_schema = self.data_gen.get_faker_schema()
        self.assertDictEqual(actual_faker_schema, expected_faker_schema)

    def test_generate_fake(self):
        """
        This tests the generate_fake method of the FakeRowGen class, which is called by process and
        uses an instance of the DataGenerator class to return a fake json record that abides by the
        rules specified in the attributes of the DataGenerator instance. Note that this is a
        non-deterministic function, so the best we can do is spot-check that the values obey the
        rules for this call; for thoroughness we could run the unit test many times.
        """
        faker_schema = self.fakerowgen.data_gen.get_faker_schema()
        actual_row = json.loads(self.fakerowgen.generate_fake(faker_schema))

        # Check returns a dict representing a single record.
        self.assertIsInstance(actual_row, dict)

        # # Check the schema is correct.
        # self.assertEquals(actual_row.keys(), faker_schema.keys())

        # Check the date in range.
        self.assertGreaterEqual(
            datetime.datetime.strptime(actual_row[u'lo_orderdate'], '%Y-%m-%d').date(),
            self.data_gen.min_date)
        self.assertLessEqual(
            datetime.datetime.strptime(actual_row[u'lo_orderdate'], '%Y-%m-%d').date(),
            self.data_gen.max_date)

        # Check the integer is in range.
        self.assertLessEqual(actual_row[u'lo_linenumber'], self.data_gen.max_int)

        # Check the float is in range.
        self.assertLessEqual(actual_row[u'lo_tax'], self.data_gen.max_float)

        # Check the integer is non-negative (only_pos=True was set).
        self.assertGreaterEqual(actual_row[u'lo_linenumber'], 0)

        # Check the float is non-negative (only_pos=True was set).
        self.assertGreaterEqual(actual_row[u'lo_tax'], 0.0)
        
        # Check string size was parsed and enforced from description fields of lo_recieptfile.
        self.assertLessEqual(len(actual_row[u'lo_recieptfile']), 10)
        # Check if record type nesting worked.
        self.assertIsInstance(actual_row[u'lo_record_field'], list)

    def test_get_field_dict(self):
        """
        This tests the ability of the FakeRowGen.get_field_dict method to extract a single field
        dictionary from a FakeRowGen.data_gen.schema
        """
        expected_field_dict = {u'type': u'DATE', u'name': u'lo_orderdate', u'mode': u'NULLABLE'}
        actual_field_dict = self.fakerowgen.get_field_dict(field_name=u'lo_orderdate')
        self.assertDictEqual(actual_field_dict, expected_field_dict)
        expected_record_dict = {
            "name": "lo_record_field",
            "type": "RECORD",
            "mode": "REPEATED",
            "fields": [
                {"mode": "NULLABLE", "name": "name", "type": "STRING"},
                {"mode": "NULLABLE", "name": "email", "type": "STRING"},
                {"mode": "NULLABLE", "name": "time_sec", "type": "INTEGER"},
                {"mode": "NULLABLE", "name": "tz_offset", "type": "INTEGER"},
                {"mode": "NULLABLE", "name": "date", "type": "TIMESTAMP"}
            ]}
        actual_record_dict = self.fakerowgen.get_field_dict(field_name=u'lo_record_field')
        self.assertDictEqual(actual_record_dict, expected_record_dict)

    def test_sanity_check(self):
        fschema = self.data_gen.get_faker_schema()
        schema_faker = FakerSchema()
        data = schema_faker.generate_fake(fschema, 1)  # Generate one record.

        # Note at this point data[u'lo_orderdate'] is a datetime.date object while BigQuery
        # expects a string.
        self.assertIsInstance(data[u'lo_orderdate'], datetime.date)

        data = self.fakerowgen.sanity_check(record=data, fieldname=u'lo_orderdate')

        # Check that the date was converted to a string
        self.assertIsInstance(data[u'lo_orderdate'], unicode)

        # Check that the date is in the correct format
        _ = datetime.datetime.strptime(data[u'lo_orderdate'], '%Y-%m-%d')

        # Check if sanity check enforces integers < data_args.max_int
        data[u'lo_linenumber'] = 10**12  # Note that max_int is 10**11

        data = self.fakerowgen.sanity_check(record=data, fieldname=u'lo_linenumber')

        self.assertLessEqual(data[u'lo_linenumber'], self.data_gen.max_int)

        data = self.fakerowgen.sanity_check(record=data, fieldname=u'lo_record_field')
        self.assertIsInstance(data[u'lo_record_field'], list)

    def test_get_skewed_key(self):
        """
        This tests the get_skewed_key method of the FakeRowGen class.
        """         
        uniform_key = self.fakerowgen.get_skewed_key()        
        self.assertTrue(uniform_key)
        self.assertLessEqual(uniform_key, self.data_gen.n_keys)

        binomial_key = self.fakerowgen.get_skewed_key(distribution='binomial')        
        self.assertTrue(uniform_key)
        self.assertLessEqual(uniform_key, self.data_gen.n_keys)

        zipf_key = self.fakerowgen.get_skewed_key(distribution='zipf')        
        self.assertTrue(uniform_key)
        self.assertLessEqual(uniform_key, self.data_gen.n_keys)
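Finally, a minimal sketch of how this test module would typically be invoked with the standard unittest runner; the __main__ guard below is an assumption and is not shown in the listing above.

if __name__ == '__main__':
    # Discover and run every TestPrettyDataGenerator case in this module.
    unittest.main()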