def run(argv=None):
    """
    This function parses the command line arguments and runs the Beam Pipeline.

    Args:
        argv: list containing the commandline arguments for this call of the script.
    """
    schema_inferred = False

    data_args, pipeline_args = parse_data_generator_args(argv)
    data_args, schema_inferred = fetch_schema(data_args, schema_inferred)
    pipeline_options = PipelineOptions(pipeline_args)

    temp_location = pipeline_options.display_data()['temp_location']
    temp_blob = write_n_line_file_to_gcs(
        pipeline_options.display_data()['project'], temp_location,
        data_args.num_records)

    data_gen = data_generator_from_data_args(data_args)
    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line. These include information such as where Dataflow should
    # store temp files, what the project id is, and which runner to use.
    p = beam.Pipeline(options=pipeline_options)

    (p

     # Read the file we created with num_records newlines.
     | 'Read file with num_records lines' >>
     beam.io.ReadFromText(temp_location + '/temp_num_records.txt')
     # Use our instance of the custom DataGenerator class to generate one fake
     # datum with the appropriate schema for each element in the PCollection
     # created above.
     | 'Generate Data' >> beam.ParDo(FakeRowGen(data_gen))
     | 'Write to BigQuery' >> beam.io.gcp.bigquery.WriteToBigQuery(
         # The table name is a required argument for the BigQuery sink.
         # In this case we use the value passed in from the command line.
         data_args.output_bq_table,
         schema=None if schema_inferred else data_gen.get_bq_schema_string(),
         # Creates the table in BigQuery if it does not yet exist.
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=data_gen.write_disp,
         # Use the max recommended batch size.
         batch_size=500))

    p.run().wait_until_finish()

    # Manually clean up temp_num_records.txt because it is outside this job's
    # directory and Dataflow will not remove it for us.
    temp_blob.delete()
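
# write_n_line_file_to_gcs() is used above but not shown in this example. A
# minimal sketch of what it could look like, assuming the google-cloud-storage
# client library and a temp_location of the form 'gs://bucket/prefix'; the
# project's real helper may differ.
def write_n_line_file_to_gcs(project, temp_location, num_records):
    """Writes a file of num_records newlines to GCS and returns the Blob."""
    from google.cloud import storage  # assumed dependency

    bucket_name, _, prefix = temp_location.replace('gs://', '').partition('/')
    client = storage.Client(project=project)
    bucket = client.get_bucket(bucket_name)
    blob_name = '/'.join(filter(None, [prefix, 'temp_num_records.txt']))
    blob = bucket.blob(blob_name)
    # num_records newlines make ReadFromText emit num_records empty elements,
    # one per fake row to be generated downstream.
    blob.upload_from_string('\n' * num_records)
    return blob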
Example #2
def run(argv=None):
    """
    This function parses the command line arguments and runs the Beam Pipeline.

    Args:
        argv: list containing the commandline arguments for this call of the
         script.
    """
    # Keeps track of whether the schema was inferred from the input or output
    # table.
    schema_inferred = False

    data_args, pipeline_args = parse_data_generator_args(argv)
    data_args, schema_inferred = fetch_schema(data_args, schema_inferred)
    pipeline_options = PipelineOptions(pipeline_args)

    temp_location = pipeline_options.display_data()['temp_location']
    temp_blob = write_n_line_file_to_gcs(
        pipeline_options.display_data()['project'], temp_location,
        data_args.num_records)

    data_gen = DataGenerator(bq_schema_filename=data_args.schema_file,
                             input_bq_table=data_args.input_bq_table,
                             p_null=data_args.p_null,
                             n_keys=data_args.n_keys,
                             min_date=data_args.min_date,
                             max_date=data_args.max_date,
                             only_pos=data_args.only_pos,
                             max_int=data_args.max_int,
                             max_float=data_args.max_float,
                             float_precision=data_args.float_precision,
                             write_disp=data_args.write_disp,
                             key_skew=data_args.key_skew,
                             primary_key_cols=data_args.primary_key_cols)

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line. These include information such as where Dataflow should
    # store temp files, what the project id is, and which runner to use.
    p = beam.Pipeline(options=pipeline_options)

    rows = (
        p
        # Read the file we created with num_records newlines.
        | 'Read file with num_records lines' >> beam.io.ReadFromText(
            os.path.join('gs://', temp_blob.bucket.name, temp_blob.name))

        # Use our instance of the custom DataGenerator class to generate one
        # fake datum with the appropriate schema for each element in the
        # PCollection created above.
        | 'Generate Data' >> beam.ParDo(FakeRowGen(data_gen))
        | 'Parse Json Strings' >> beam.FlatMap(lambda row: [json.loads(row)]))

    if data_args.primary_key_cols:
        for key in data_args.primary_key_cols.split(','):
            rows |= 'Enforcing primary key: {}'.format(
                key) >> EnforcePrimaryKeys(key)

    if data_args.csv_schema_order:
        (rows
         | 'Order fields for CSV writing.' >> beam.FlatMap(
             lambda d: [dict_to_csv(d, data_args.csv_schema_order.split(','))])
         | 'Write to GCS' >> beam.io.textio.WriteToText(
             file_path_prefix=data_args.output_prefix, file_name_suffix='.csv')
         )

    if data_args.avro_schema_file:
        fastavro_avsc = fastavro.schema.load_schema(data_args.avro_schema_file)

        (rows
         # Need to convert time stamps from strings to timestamp-micros
         | 'Fix date and time Types for Avro.' >>
         beam.FlatMap(lambda row: fix_record_for_avro(row, fastavro_avsc))
         | 'Write to Avro.' >> beam.io.avroio.WriteToAvro(
             file_path_prefix=data_args.output_prefix,
             codec='null',
             file_name_suffix='.avro',
             use_fastavro=True,
             schema=fastavro_avsc))

    if data_args.write_to_parquet:
        with open(data_args.schema_file, 'r') as infile:
            str_schema = json.load(infile)
        pa_schema = get_pyarrow_translated_schema(str_schema)
        (rows
         | 'Fix date and time Types for Parquet.' >>
         beam.FlatMap(lambda row: fix_record_for_parquet(row, str_schema))
         | 'Write to Parquet.' >> beam.io.WriteToParquet(
             file_path_prefix=data_args.output_prefix,
             codec='null',
             file_name_suffix='.parquet',
             schema=pa_schema))

    if data_args.output_bq_table:
        (rows
         | 'Write to BigQuery.' >> beam.io.gcp.bigquery.WriteToBigQuery(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command
             # line.
             data_args.output_bq_table,
             schema=None if schema_inferred else data_gen.get_bq_schema(),
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=data_gen.write_disp,
             # Use the max recommended batch size.
             batch_size=500))

    p.run().wait_until_finish()

    # Manually clean up temp_num_records.txt because it is outside this
    # job's directory and Dataflow will not remove it for us.
    temp_blob.delete()
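
# EnforcePrimaryKeys is referenced above but not defined in this example. A
# minimal sketch of what such a composite transform could look like, assuming
# it simply keeps one row per value of the given key column; the project's
# actual implementation may differ.
class EnforcePrimaryKeys(beam.PTransform):
    """Deduplicates a PCollection of dicts on a single key column."""

    def __init__(self, key):
        super().__init__()
        self.key = key

    def expand(self, pcoll):
        key = self.key
        return (pcoll
                # Key each row dict by the primary key column.
                | 'Key by {}'.format(key) >>
                beam.Map(lambda row: (row[key], row))
                # Collapse duplicates so every key value appears exactly once.
                | 'Group by {}'.format(key) >> beam.GroupByKey()
                | 'Pick one row per {}'.format(key) >>
                beam.Map(lambda kv: list(kv[1])[0]))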
Example #3
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--core_host', dest='core_host', type=str)
    parser.add_argument('--core_port', dest='core_port', type=int)
    parser.add_argument('--core_username', dest='core_username', type=str)
    parser.add_argument('--core_password', dest='core_password', type=str)
    parser.add_argument('--core_database', dest='core_database', type=str)
    parser.add_argument('--remittances_host',
                        dest='remittances_host',
                        type=str)
    parser.add_argument('--remittances_port',
                        dest='remittances_port',
                        type=int)
    parser.add_argument('--remittances_username',
                        dest='remittances_username',
                        type=str)
    parser.add_argument('--remittances_password',
                        dest='remittances_password',
                        type=str)
    parser.add_argument('--remittances_database',
                        dest='remittances_database',
                        type=str)
    parser.add_argument('--auth_uri', dest='auth_uri', type=str)
    parser.add_argument('--auth_database', dest='auth_database', type=str)
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    project_id = pipeline_options.display_data()['project']

    with beam.Pipeline(options=pipeline_options) as p:
        # Reads from data sources

        read_core = (
            p
            | 'ReadCore' >> ReadFromDB(source_config=SourceConfiguration(
                drivername='postgresql+psycopg2',
                host=known_args.core_host,
                port=known_args.core_port,
                username=known_args.core_username,
                password=known_args.core_password,
                database=known_args.core_database,
                create_if_missing=False),
                                       table_name='transaction',
                                       query='SELECT * FROM transaction')
            | 'FilterTransactions' >> beam.Filter(filter_transaction))

        read_auth = (
            p
            | 'Read Users' >> ReadFromMongoDB(uri=known_args.auth_uri,
                                              db=known_args.auth_database,
                                              coll='users')
            | 'TransformUsers' >> beam.ParDo(TransformUser())
            | 'FilterUsers' >> beam.Filter(filter_user))

        read_remittances = (
            p
            | 'ReadRemittances' >> ReadFromMySQL(
                query='SELECT * FROM valiu_remittances.remittance',
                host=known_args.remittances_host,
                database=known_args.remittances_database,
                user=known_args.remittances_username,
                password=known_args.remittances_password,
                port=known_args.remittances_port,
                splitter=splitters.NoSplitter())
            | 'TransformRemittances' >> beam.ParDo(TransformRemittance())
            | 'FilterRemittances' >> beam.Filter(filter_remittance))

        # Merges

        merged_transactions = (
            (read_core, read_auth)
            | 'MergeTransactionsUsers' >> LeftJoin('id2', 'id', 'user_'))

        merged_remittances = (
            (read_remittances, read_auth)
            | 'MergeRemittanceUsers' >> MergeRemittancesUsers())

        # Writes to BigQuery

        write_users = (
            read_auth
            | 'WriteUsers' >> beam.io.WriteToBigQuery(
                table='auth_users',
                dataset='auth',
                project=project_id,
                create_disposition=beam.io.BigQueryDisposition.
                CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                schema=users_table_schema,
                additional_bq_parameters=users_table_partitioning))

        write_remittances = (
            merged_remittances
            | 'WriteRemittances' >> beam.io.WriteToBigQuery(
                table='remittances_movements',
                dataset='remittances',
                project=project_id,
                create_disposition=beam.io.BigQueryDisposition.
                CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                schema=remittances_table_schema,
                additional_bq_parameters=remittances_table_partitioning))

        write_transactions_cash_in = (
            merged_transactions
            | 'FilterCashIn' >> beam.Filter(filter_currency_operation, 'COP',
                                            'USDv')
            | 'CleanTransactionsCashIn' >> beam.ParDo(
                TransformTransaction('cash_in'))
            | 'WriteCashIn' >> beam.io.WriteToBigQuery(
                table='core_cash_in',
                dataset='core',
                project=project_id,
                create_disposition=beam.io.BigQueryDisposition.
                CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                schema=transactions_table_schema,
                additional_bq_parameters=transactions_table_partitioning))

        write_transactions_cash_out = (
            merged_transactions
            | 'FilterCashOut' >> beam.Filter(filter_currency_operation, 'USDv',
                                             'VES')
            | 'CleanTransactionsCashOut' >> beam.ParDo(
                TransformTransaction('cash_out'))
            | 'WriteCashOut' >> beam.io.WriteToBigQuery(
                table='core_cash_out',
                dataset='core',
                project=project_id,
                create_disposition=beam.io.BigQueryDisposition.
                CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                schema=transactions_table_schema,
                additional_bq_parameters=transactions_table_partitioning))

        write_transactions_p2p = (
            merged_transactions
            | 'FilterP2P' >> beam.Filter(filter_currency_operation, 'USDv',
                                         'USDv')
            | 'CleanTransactionsP2P' >> beam.ParDo(TransformTransaction('p2p'))
            | 'WriteP2P' >> beam.io.WriteToBigQuery(
                table='core_p2p',
                dataset='core',
                project=project_id,
                create_disposition=beam.io.BigQueryDisposition.
                CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                schema=transactions_table_schema,
                additional_bq_parameters=transactions_table_partitioning))
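
# LeftJoin is referenced above but not defined in this example. A minimal
# sketch of a left join built on CoGroupByKey, assuming the arguments are
# (left_key, right_key, right_field_prefix); the real transform may differ.
class LeftJoin(beam.PTransform):
    """Left-joins two PCollections of dicts on the given key columns."""

    def __init__(self, left_key, right_key, prefix):
        super().__init__()
        self.left_key = left_key
        self.right_key = right_key
        self.prefix = prefix

    def expand(self, pcolls):
        left, right = pcolls
        left_key = self.left_key
        right_key = self.right_key
        prefix = self.prefix

        def merge(join_result):
            _, grouped = join_result
            for left_row in grouped['left']:
                merged = dict(left_row)
                # Attach the first matching right row (if any), prefixing its
                # fields to avoid clashes with the left row's fields.
                for right_row in grouped['right']:
                    merged.update(
                        {prefix + k: v for k, v in right_row.items()})
                    break
                yield merged

        return (
            {'left': left |
             'KeyLeft' >> beam.Map(lambda r: (r[left_key], r)),
             'right': right |
             'KeyRight' >> beam.Map(lambda r: (r[right_key], r))}
            | 'CoGroup' >> beam.CoGroupByKey()
            | 'MergeRows' >> beam.FlatMap(merge))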
Example #4
        logging.info('running locally on DirectRunner')
        argv = [
            '--runner',
            'DirectRunner',
            '--staging_location',
            os.path.join(args.data_dir, "staging"),
            '--temp_location',
            os.path.join(args.data_dir, "temp"),
            # see https://beam.apache.org/documentation/sdks/python-pipeline-dependencies/ for more details
            '--setup_file',
            os.path.join(CURRENT_DIR, 'setup.py'),
        ]
    options = PipelineOptions(flags=argv)

    t1 = time.time()
    with tft_beam.Context(temp_dir=options.display_data()['temp_location']):
        pipeline = beam.Pipeline(options=options)
        # when training we want all the data
        if args.mode == "train":
            logging.info("TRAINING")
            if args.run_cloud:
                source = (
                    pipeline
                    | 'Read BQ table' >> beam.io.Read(
                        beam.io.gcp.bigquery.BigQuerySource(
                            query=BQQuery.train_query, use_standard_sql=True)))
            else:
                source = (pipeline
                          | 'Read local JSON' >> beam.io.ReadFromText(
                              os.path.join(args.data_dir, 'bq_sample.json'))
                          | 'Parse JSON' >> MapAndFilterErrors(json.loads))
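
# MapAndFilterErrors is referenced above but not defined in this excerpt. A
# minimal sketch modeled on the common "drop bad records and count them"
# pattern; the real helper may differ.
class MapAndFilterErrors(beam.PTransform):
    """Like beam.Map, but silently drops elements on which fn raises."""

    class _MapAndFilterErrorsFn(beam.DoFn):
        def __init__(self, fn):
            self._fn = fn
            # Counter so dropped elements are visible in pipeline metrics.
            self._bad_elements = beam.metrics.Metrics.counter(
                'MapAndFilterErrors', 'bad_elements')

        def process(self, element):
            try:
                yield self._fn(element)
            except Exception:  # pylint: disable=broad-except
                self._bad_elements.inc(1)

    def __init__(self, fn):
        super().__init__()
        self._fn = fn

    def expand(self, pcoll):
        return pcoll | beam.ParDo(self._MapAndFilterErrorsFn(self._fn))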