def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        default='PROJECT_ID:demos.small_teams',
        help=('Input BigQuery table to process specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    parser.add_argument(
        '--output',
        # required=True,
        required=False,
        help=('Output BigQuery table for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    parser.add_argument(
        '--gcs_location',
        required=False,
        help=('GCS Location to store files to load data into Bigquery'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    source_config = relational_db.SourceConfiguration(
        drivername='postgresql+pg8000',
        host='localhost',
        port=5432,
        username='******',
        password='******',
        database='postgres')

    table_config_teams = relational_db.TableConfiguration(
        name='teams',
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['id']  # and use 'id' column as primary key
    )
    table_config_category = relational_db.TableConfiguration(
        name='category',
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['category_ts']  # and use 'category_ts' column as primary key
    )

    with beam.Pipeline(argv=pipeline_args) as p:
        # Read the table rows into a PCollection.
        rows = p | 'read' >> beam.io.ReadFromBigQuery(
            query="""
                SELECT id, category
                FROM `PROJECT_ID.demos.small_teams`
                LIMIT 1500""",
            use_standard_sql=True)
        counted = count_categories(rows)

        # Write the output using a "Write" transform that has side effects.
        rows | 'Write Teams' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config_teams)
        counted | 'Write Counts' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config_category)
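
# The pipeline above calls a count_categories helper that is not shown.
# A minimal sketch of what it might look like, assuming each BigQuery row is a
# dict with a 'category' field; the output field names ('category', 'count')
# are illustrative assumptions, not part of the original source.
def count_categories(rows):
    return (
        rows
        | 'Extract category' >> beam.Map(lambda row: (row['category'], 1))
        | 'Count per category' >> beam.CombinePerKey(sum)
        | 'To dict' >> beam.Map(
            lambda kv: {'category': kv[0], 'count': kv[1]})
    )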
def evaluate_upsert(self, source_config):
    # Reconfigure the db for this TC (as the db configured in setup
    # used the default source config, which might be different from the
    # one used in this TC).
    self.destroy_db(self.db)
    self.db = self.configure_db(source_config)

    table_config = relational_db.TableConfiguration(
        name=self.table_name,
        create_if_missing=True,
        primary_key_columns=['num']
    )
    # execute the write pipeline and retrieve the table rows
    table_rows = self.execute_pipeline(
        source_config=source_config,
        table_config=table_config,
        records=self.records
    )
    # compare
    assert_equal(table_rows, self.records)

    # re-write the same records
    table_rows_after_rewrite = self.execute_pipeline(
        source_config=source_config,
        table_config=table_config,
        records=self.records
    )
    # verify the table records are the same
    assert_equal(table_rows_after_rewrite, table_rows)
def test_write_to_user_defined_table(self):
    user_defined_table = 'my_table'
    assert_not_equal(user_defined_table, self.table_name)

    def define_table(metadata):
        from sqlalchemy import Table, Column, Integer, String
        return Table(
            user_defined_table, metadata,
            Column('name', String(50), primary_key=True),
            Column('num', Integer)
        )

    table_config = relational_db.TableConfiguration(
        name=user_defined_table,
        define_table_f=define_table,
        create_if_missing=True,
    )
    # execute the write pipeline and retrieve the table rows
    table_rows = self.execute_pipeline(
        source_config=self.source_config,
        table_config=table_config,
        records=self.records
    )
    # assert nothing was written to the test default table
    assert_equal(len(self.db.read_rows(self.table_name)), 0)
    # and all records were written to the user defined table
    assert_equal(sort(table_rows, 'name'), sort(self.records, 'name'))
def test_write_to_existing_table(self):
    table_name = self.create_table()
    table_config = relational_db.TableConfiguration(
        name=table_name,
        create_if_missing=False
    )
    part1_size = 2
    part1_records = self.records[:part1_size]
    part2_records = self.records[part1_size:]

    # write part1 to the DB
    part1_table_rows = self.execute_pipeline(
        source_config=self.source_config,
        table_config=table_config,
        records=part1_records
    )
    assert_equal(sort(part1_table_rows, 'num'), sort(part1_records, 'num'))

    # write part2 to the DB
    part2_table_rows = self.execute_pipeline(
        source_config=self.source_config,
        table_config=table_config,
        records=part2_records
    )
    assert_equal(
        sort(part2_table_rows, 'num'),
        sort(part1_records + part2_records, 'num')
    )
def main():
    # get the cmd args
    db_args, pipeline_args = get_args()

    # Target database instance
    source_config = relational_db.SourceConfiguration(
        drivername=db_args.drivername,
        host=db_args.host,
        port=db_args.port,
        database=db_args.database,
        username=db_args.username,
        password=db_args.password,
        create_if_missing=db_args.create_if_missing)

    # The data to be written
    records = [
        {'name': 'Jan', 'num': 1},
        {'name': 'Feb', 'num': 2},
        {'name': 'Mar', 'num': 3},
        {'name': 'Apr', 'num': 4},
        {'name': 'May', 'num': 5},
        {'name': 'Jun', 'num': 6},
    ]

    # Target database table
    table_config = relational_db.TableConfiguration(
        name='months',
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['num']  # and use 'num' column as a primary key
    )

    # Create the pipeline
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        months = p | "Reading records" >> beam.Create(records, reshuffle=False)
        months | 'Writing to DB' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config)
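
# main() relies on a get_args() helper that is not shown. A possible sketch,
# assuming the flag names mirror the SourceConfiguration fields; the defaults
# below are placeholders, not part of the original source.
import argparse

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--drivername', default='postgresql+pg8000')
    parser.add_argument('--host', default='localhost')
    parser.add_argument('--port', type=int, default=5432)
    parser.add_argument('--database', default='postgres')
    parser.add_argument('--username', default='postgres')
    parser.add_argument('--password', default='postgres')
    parser.add_argument('--create_if_missing', action='store_true')
    # returns (db_args, pipeline_args), matching how main() unpacks it
    return parser.parse_known_args()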
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        type=str,
                        default='dataset/league_of_legends_dataset.csv',
                        help='Path to the data file(s) containing game data.')
    parser.add_argument('--database',
                        type=str,
                        required=True,
                        help='Database name')
    parser.add_argument('--database_host',
                        type=str,
                        required=True,
                        help='Database host')
    parser.add_argument('--table_name',
                        default='leader_board',
                        help='Table where to store the data')
    parser.add_argument('--database_user',
                        default='postgres',
                        help='Database user name')
    parser.add_argument('--database_password',
                        default='postgres',
                        help='Database user password')
    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    logging.info(pipeline_args)

    source_config = relational_db.SourceConfiguration(
        drivername='postgresql',
        host=args.database_host,
        port=5432,
        create_if_missing=True,
        username=args.database_user,
        password=args.database_password,
        database=args.database)

    table_config = relational_db.TableConfiguration(
        name=args.table_name,
        create_if_missing=True,
        primary_key_columns=['gameId'])

    with beam.Pipeline(options=options) as p:
        (  # pylint: disable=expression-not-assigned
            p
            | 'Setting Up File' >> beam.Create([args.input])
            | 'Reading Input Data' >> beam.FlatMap(get_csv_reader)
            | 'Writing to DB table' >> relational_db.Write(
                source_config=source_config,
                table_config=table_config))
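
# The pipeline above maps get_csv_reader over the file path but does not show
# it. A minimal sketch, assuming the CSV has a header row whose column names
# (including 'gameId') become the record keys; this is an assumption, not the
# original implementation.
import csv

def get_csv_reader(file_path):
    with open(file_path, newline='') as f:
        for row in csv.DictReader(f):
            yield dict(row)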
def test_write(self):
    table_config = relational_db.TableConfiguration(
        name=self.table_name,
        create_if_missing=True,
        primary_key_columns=['num'])
    # execute the write pipeline and retrieve the table rows
    table_rows = self.execute_pipeline(
        source_config=self.source_config,
        table_config=table_config,
        records=self.records)
    # compare
    assert_equal(sort(table_rows, 'num'), sort(self.records, 'num'))
def test_write_no_primary_key(self):
    table_config = relational_db.TableConfiguration(
        name=self.table_name,
        create_if_missing=True)
    # execute the write pipeline and retrieve the table rows
    table_rows = self.execute_pipeline(
        source_config=self.source_config,
        table_config=table_config,
        records=self.records)
    # An auto-increment column called 'id' should have been added
    expected_table_rows = [dict(record) for record in self.records]
    _ = [rec.update(id=i + 1) for i, rec in enumerate(expected_table_rows)]
    assert_equal(sort(table_rows, 'num'), sort(expected_table_rows, 'num'))
def test_auto_column_type_inference(self):
    from sqlalchemy import Integer, Float, String, DateTime

    NAME = 'name'
    NUM = 'num'
    TIME_STAMP = 'time_stamp'
    records = [
        {NAME: name, NUM: num, TIME_STAMP: ts}
        for name, num, ts in [
            ['Jan', 1, datetime.datetime.now()],
            ['Feb', 2, datetime.datetime.now()]
        ]
    ]
    expected_column_types = [
        (NAME, String),
        (NUM, Float),
        (TIME_STAMP, DateTime),
        ('id', Integer)  # Auto created when no primary key is specified
    ]
    table_config = relational_db.TableConfiguration(
        name=self.table_name,
        create_if_missing=True
    )
    # write the records using the pipeline
    _ = self.execute_pipeline(
        source_config=self.source_config,
        table_config=table_config,
        records=records
    )
    # load created table metadata
    table = self.db.load_table_class(self.table_name).__table__
    columns = [col for col in table.columns]

    def get_column(_name):
        [_column] = [c for c in columns if c.name == _name]
        return _column

    # verify the inferred column types are as expected
    for col_name, expec_col_type in expected_column_types:
        inferred_col_type = get_column(col_name).type
        assert_equal(isinstance(inferred_col_type, expec_col_type), True)
def run(db_args, pipeline_args):
    """Build and run the pipeline."""
    source_config = relational_db.SourceConfiguration(
        drivername=db_args['drivername'],
        host=db_args['host'],
        port=db_args['port'],
        database=db_args['database'],
        username=db_args['username'],
        password=db_args['password'],
        create_if_missing=db_args['create_if_missing']
    )
    table_config = relational_db.TableConfiguration(
        name='YOUR_TABLE_NAME',  # table name
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['id']
    )
    options = PipelineOptions(
        pipeline_args,
        save_main_session=True,
        streaming=True,
        runner='DataflowRunner',
        project='YOUR_PROJECT',
        job_name='YOUR_JOB',
        temp_location='YOUR_BUCKET',
        region='YOUR_REGION'
    )
    with beam.Pipeline(options=options) as pipeline:
        messages = (
            pipeline
            | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
                subscription=db_args['input_subscription']).with_output_types(bytes)
            | 'UTF-8 bytes to string' >> beam.Map(lambda msg: msg.decode('utf-8'))
            | 'Parse JSON messages' >> beam.Map(parse_json_message))

        # Output the results into Cloud SQL table.
        _ = messages | 'Write to Cloud SQL' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config
        )
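
# parse_json_message is referenced above but not defined. A minimal sketch,
# assuming each Pub/Sub payload is a JSON object whose keys match the target
# table columns (including the 'id' primary key); that schema is an assumption.
import json

def parse_json_message(message):
    return json.loads(message)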
from __future__ import division, print_function
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from beam_nuggets.io import relational_db

with beam.Pipeline(options=PipelineOptions()) as p:
    months = p | "Reading month records" >> beam.Create([
        {'name': 'Jan', 'num': 1},
        {'name': 'Feb', 'num': 2},
    ])
    months | 'Writing to Sqlite table' >> relational_db.Write(
        source_config=relational_db.SourceConfiguration(
            drivername='sqlite',
            database='/tmp/months_db.sqlite',
            create_if_missing=True),
        table_config=relational_db.TableConfiguration(
            name='months',
            create_if_missing=True))
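
# To check that the rows landed in the SQLite table, the same source
# configuration can be reused with relational_db.ReadFromDB. A minimal
# read-back sketch, assuming the database and table names used above.
with beam.Pipeline(options=PipelineOptions()) as p:
    months = p | "Reading months back" >> relational_db.ReadFromDB(
        source_config=relational_db.SourceConfiguration(
            drivername='sqlite',
            database='/tmp/months_db.sqlite'),
        table_name='months')
    months | 'Print rows' >> beam.Map(print)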
# Change the subscription details as per your GCP project
INPUT_SUBSCRIPTION = "projects/hadooptest-223316/subscriptions/vitualStoreSubscriber"

# Change the path to the dir where your service account private key file is kept
SERVICE_ACCOUNT_PATH = "/home/aakash/credentials/pubsubtest.json"

# Change the details as per your MySQL config
SOURCE_CONFIG_PROD = relational_db.SourceConfiguration(
    drivername="mysql+pymysql",
    host="35.200.253.253",
    port=3306,
    username="******",
    password="******",
    database="virtual_store",
    create_if_missing=False,  # create the database if not there
)

# Change the details as per your table name
TABLE_CONFIG = relational_db.TableConfiguration(
    name="transaction_data",
    create_if_missing=True,  # automatically create the table if not there
    primary_key_columns=["id"],  # and use 'id' column as primary key
)
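
# A sketch of how the configuration objects above might be wired into a
# streaming pipeline: read from the Pub/Sub subscription, decode and parse the
# JSON payload, and write into the MySQL table. The JSON parsing step and the
# pipeline options are assumptions, not part of the original source.
import json
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from beam_nuggets.io import relational_db

options = PipelineOptions(streaming=True)
with beam.Pipeline(options=options) as p:
    (
        p
        | 'Read transactions' >> beam.io.ReadFromPubSub(subscription=INPUT_SUBSCRIPTION)
        | 'Parse JSON' >> beam.Map(lambda msg: json.loads(msg.decode('utf-8')))
        | 'Write to MySQL' >> relational_db.Write(
            source_config=SOURCE_CONFIG_PROD,
            table_config=TABLE_CONFIG)
    )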
import apache_beam as beam
from beam_nuggets.io import relational_db

# target database configuration
source_config = relational_db.SourceConfiguration(
    drivername='postgresql',
    host='localhost',
    port=5432,
    username='******',
    password='******',
    database='conekta',
)
table_config = relational_db.TableConfiguration(
    name='charges',
    create_if_missing=True,
    primary_key_columns=['id'])

# start the Apache Beam pipeline
p = beam.Pipeline()


class PrepareData(beam.DoFn):
    def process(self, element):
        # process the element's id
        del element['_id']
        # drop the company field
        del element['name']
        # verify the amounts are floats
        try: