Example #1
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        default='PROJECT_ID:demos.small_teams',
                        help=('Input BigQuery table to process specified as: '
                              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    parser.add_argument(
        '--output',
        required=False,
        help=('Output BigQuery table for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))

    parser.add_argument('--gcs_location',
                        required=False,
                        help=('GCS location to store files used to load '
                              'data into BigQuery'))

    known_args, pipeline_args = parser.parse_known_args(argv)

    source_config = relational_db.SourceConfiguration(
        drivername='postgresql+pg8000',
        host='localhost',
        port=5432,
        username='******',
        password='******',
        database='postgres')

    table_config_teams = relational_db.TableConfiguration(
        name='teams',
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['id']  # and use 'id' column as primary key
    )

    table_config_category = relational_db.TableConfiguration(
        name='category',
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['category_ts']  # and use 'category_ts' as primary key
    )

    with beam.Pipeline(argv=pipeline_args) as p:
        # Read the table rows into a PCollection.
        rows = p | 'read' >> beam.io.ReadFromBigQuery(query="""
            SELECT id, category FROM `PROJECT_ID.demos.small_teams` limit 1500""",
                                                      use_standard_sql=True)
        counted = count_categories(rows)

        # Write the output using a "Write" transform that has side effects.

        rows | 'Write Teams' >> relational_db.Write(
            source_config=source_config, table_config=table_config_teams)
        counted | 'Write Counts' >> relational_db.Write(
            source_config=source_config, table_config=table_config_category)
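
The pipeline above assumes module-level imports of argparse, apache_beam (as beam) and beam_nuggets.io.relational_db, and it calls a count_categories helper that is not shown. A minimal sketch of what such a helper might look like, assuming it counts rows per category and emits one dict per category (the 'category_ts' and 'count' field names are guesses based on the 'category' table configuration):

def count_categories(rows):
    # Hypothetical helper: count how many rows fall into each category and
    # emit one dict per category, keyed the way the 'category' table expects.
    return (
        rows
        | 'Extract category' >> beam.Map(lambda row: (row['category'], 1))
        | 'Count per category' >> beam.CombinePerKey(sum)
        | 'To dicts' >> beam.Map(
            lambda kv: {'category_ts': kv[0], 'count': kv[1]})
    )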
Example #2
    def evaluate_upsert(self, source_config):
        # Reconfigure the DB for this test case (the DB configured in setUp
        # used the default source config, which might differ from the one
        # used in this test case).
        self.destroy_db(self.db)
        self.db = self.configure_db(source_config)

        table_config = relational_db.TableConfiguration(
            name=self.table_name,
            create_if_missing=True,
            primary_key_columns=['num']
        )

        # execute the write pipeline and retrieve the table rows
        table_rows = self.execute_pipeline(
            source_config=source_config,
            table_config=table_config,
            records=self.records
        )

        # compare
        assert_equal(table_rows, self.records)

        # re-write the same records
        table_rows_after_rewrite = self.execute_pipeline(
            source_config=source_config,
            table_config=table_config,
            records=self.records
        )

        # verify the table records are the same
        assert_equal(table_rows_after_rewrite, table_rows)
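
This and the following test examples call an execute_pipeline helper from their test base class that is not shown. A rough sketch of what it presumably does, writing the records through a Beam pipeline and reading the table back for inspection, assuming the self.db fixture exposes the read_rows method used elsewhere in these tests:

    def execute_pipeline(self, source_config, table_config, records):
        # Hypothetical helper: write the records with relational_db.Write,
        # then read the target table back so the test can compare rows.
        with beam.Pipeline() as p:
            rows = p | 'Create records' >> beam.Create(records)
            rows | 'Write to DB' >> relational_db.Write(
                source_config=source_config, table_config=table_config)
        return self.db.read_rows(table_config.name)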
Example #3
    def test_write_to_user_defined_table(self):
        user_defined_table = 'my_table'
        assert_not_equal(user_defined_table, self.table_name)

        def define_table(metadata):
            from sqlalchemy import Table, Column, Integer, String
            return Table(
                user_defined_table, metadata,
                Column('name', String(50), primary_key=True),
                Column('num', Integer)
            )

        table_config = relational_db.TableConfiguration(
            name=user_defined_table,
            define_table_f=define_table,
            create_if_missing=True,
        )

        # execute the write pipeline and retrieve the table rows
        table_rows = self.execute_pipeline(
            source_config=self.source_config,
            table_config=table_config,
            records=self.records
        )

        # assert nothing written to the test default table
        assert_equal(len(self.db.read_rows(self.table_name)), 0)

        # and all written to the user defined table
        assert_equal(sort(table_rows, 'name'), sort(self.records, 'name'))
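
The sort helper used in these assertions is not part of the snippet; it is presumably a small test utility that orders a list of row dicts by a column so lists can be compared independently of insertion order, roughly:

def sort(rows, key):
    # Hypothetical test utility: order row dicts by the given column name.
    return sorted(rows, key=lambda row: row[key])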
Example #4
    def test_write_to_existing_table(self):
        table_name = self.create_table()

        table_config = relational_db.TableConfiguration(
            name=table_name,
            create_if_missing=False
        )

        part1_size = 2
        part1_records = self.records[:part1_size]
        part2_records = self.records[part1_size:]

        # write part1 to the DB
        part1_table_rows = self.execute_pipeline(
            source_config=self.source_config,
            table_config=table_config,
            records=part1_records
        )
        assert_equal(sort(part1_table_rows, 'num'), sort(part1_records, 'num'))

        # write part2 to the DB
        part2_table_rows = self.execute_pipeline(
            source_config=self.source_config,
            table_config=table_config,
            records=part2_records
        )
        assert_equal(
            sort(part2_table_rows, 'num'),
            sort(part1_records + part2_records, 'num')
        )
Example #5
def main():
    # get the cmd args
    db_args, pipeline_args = get_args()

    # Target database instance
    source_config = relational_db.SourceConfiguration(
        drivername=db_args.drivername,
        host=db_args.host,
        port=db_args.port,
        database=db_args.database,
        username=db_args.username,
        password=db_args.password,
        create_if_missing=db_args.create_if_missing)

    # The data to be written
    records = [
        {
            'name': 'Jan',
            'num': 1
        },
        {
            'name': 'Feb',
            'num': 2
        },
        {
            'name': 'Mar',
            'num': 3
        },
        {
            'name': 'Apr',
            'num': 4
        },
        {
            'name': 'May',
            'num': 5
        },
        {
            'name': 'Jun',
            'num': 6
        },
    ]

    # Target database table
    table_config = relational_db.TableConfiguration(
        name='months',
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['num']  # and use 'num' column as a primary key
    )

    # Create the pipeline
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        months = p | "Reading records" >> beam.Create(records, reshuffle=False)
        months | 'Writing to DB' >> relational_db.Write(
            source_config=source_config, table_config=table_config)
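
The get_args helper called at the start of main() is not shown. A minimal sketch, assuming it collects the database connection flags that SourceConfiguration needs and returns them together with the remaining Beam pipeline arguments (the flag names and defaults are assumptions):

def get_args():
    # Hypothetical command-line parser for the DB connection settings.
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--drivername', default='postgresql+pg8000')
    parser.add_argument('--host', default='localhost')
    parser.add_argument('--port', type=int, default=5432)
    parser.add_argument('--database', required=True)
    parser.add_argument('--username', required=True)
    parser.add_argument('--password', required=True)
    parser.add_argument('--create_if_missing', action='store_true')
    db_args, pipeline_args = parser.parse_known_args()
    return db_args, pipeline_args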
Example #6
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the pipeline."""
    logging.info("HERE")
    parser = argparse.ArgumentParser()

    parser.add_argument('--input',
                        type=str,
                        default='dataset/league_of_legends_dataset.csv',
                        help='Path to the data file(s) containing game data.')
    parser.add_argument('--database',
                        type=str,
                        required=True,
                        help='Database Name')
    parser.add_argument('--database_host',
                        type=str,
                        required=True,
                        help='Database Host')
    parser.add_argument('--table_name',
                        default='leader_board',
                        help='table where to store the data')
    parser.add_argument('--database_user',
                        default='postgres',
                        help='Database user')
    parser.add_argument('--database_password',
                        default='postgres',
                        help='Database password')

    args, pipeline_args = parser.parse_known_args(argv)

    options = PipelineOptions(pipeline_args)
    logging.info(pipeline_args)
    source_config = relational_db.SourceConfiguration(
        drivername='postgresql',
        host=args.database_host,
        port=5432,
        create_if_missing=True,
        username=args.database_user,
        password=args.database_password,
        database=args.database)

    table_config = relational_db.TableConfiguration(
        name=args.table_name,
        create_if_missing=True,
        primary_key_columns=['gameId'])

    with beam.Pipeline(options=options) as p:
        (  # pylint: disable=expression-not-assigned
            p
            | 'Setting Up File' >> beam.Create([args.input])
            | 'Reading Input Data' >> beam.FlatMap(get_csv_reader)
            | 'Writing to DB table' >> relational_db.Write(
                source_config=source_config, table_config=table_config))
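
The get_csv_reader function used in the 'Reading Input Data' step is not defined in this snippet. A plausible sketch, assuming it opens the CSV file and yields one dict per row so every record can be written as a table row:

def get_csv_reader(file_path):
    # Hypothetical helper: read the CSV and yield one dict per data row.
    import csv
    with open(file_path, 'r') as f:
        for row in csv.DictReader(f):
            yield dict(row)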
Example #7
    def test_write(self):
        table_config = relational_db.TableConfiguration(
            name=self.table_name,
            create_if_missing=True,
            primary_key_columns=['num'])

        # execute the write pipeline and retrieve the table rows
        table_rows = self.execute_pipeline(source_config=self.source_config,
                                           table_config=table_config,
                                           records=self.records)

        # compare
        assert_equal(sort(table_rows, 'num'), sort(self.records, 'num'))
Example #8
    def test_write_no_primary_key(self):
        table_config = relational_db.TableConfiguration(name=self.table_name,
                                                        create_if_missing=True)

        # execute the write pipeline and retrieve the table rows
        table_rows = self.execute_pipeline(source_config=self.source_config,
                                           table_config=table_config,
                                           records=self.records)

        # An auto-increment column called 'id' should have been added
        expected_table_rows = [dict(record) for record in self.records]
        _ = [rec.update(id=i + 1) for i, rec in enumerate(expected_table_rows)]

        assert_equal(sort(table_rows, 'num'), sort(expected_table_rows, 'num'))
Example #9
    def test_auto_column_type_inference(self):
        from sqlalchemy import Integer, Float, String, DateTime
        NAME = 'name'
        NUM = 'num'
        TIME_STAMP = 'time_stamp'
        records = [
            {NAME: name, NUM: num, TIME_STAMP: ts}
            for name, num, ts in [
                ['Jan', 1, datetime.datetime.now()],
                ['Feb', 2, datetime.datetime.now()]
            ]
        ]
        expected_column_types = [
            (NAME, String),
            (NUM, Float),
            (TIME_STAMP, DateTime),
            ('id', Integer)  # Auto-created when no primary key is specified
        ]

        table_config = relational_db.TableConfiguration(
            name=self.table_name,
            create_if_missing=True
        )

        # write the records using the pipeline
        _ = self.execute_pipeline(
            source_config=self.source_config,
            table_config=table_config,
            records=records
        )

        # load created table metadata
        table = self.db.load_table_class(self.table_name).__table__
        columns = [col for col in table.columns]

        def get_column(_name):
            [_column] = [c for c in columns if c.name == _name]
            return _column

        # verify the inferred column types are as expected
        for col_name, expec_col_type in expected_column_types:
            inferred_col_type = get_column(col_name).type
            assert_equal(isinstance(inferred_col_type, expec_col_type), True)
Example #10
def run(db_args, pipeline_args):

    source_config = relational_db.SourceConfiguration(
        drivername=db_args['drivername'],
        host=db_args['host'],
        port=db_args['port'],
        database=db_args['database'],
        username=db_args['username'],
        password=db_args['password'],
        create_if_missing=db_args['create_if_missing']
    )

    table_config = relational_db.TableConfiguration(
        name='YOUR_TABLE_NAME', # table name
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['id']
    )


    """Build and run the pipeline."""
    options = PipelineOptions(
        pipeline_args, save_main_session=True, streaming=True, runner='DataflowRunner',
        project='YOUR_PROJECT', job_name='YOUR_JOB', temp_location='YOUR_BUCKET', 
        region='YOUR_REGION'
    )


    with beam.Pipeline(options=options) as pipeline:
        messages = (
            pipeline
            | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
                subscription=db_args['input_subscription']).with_output_types(bytes)
            | 'UTF-8 bytes to string' >> beam.Map(lambda msg: msg.decode('utf-8'))
            | 'Parse JSON messages' >> beam.Map(parse_json_message))

        # Output the results into Cloud SQL table.
        _ = messages | 'Write to Cloud SQL' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config
        )
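
parse_json_message is referenced but not defined here; it presumably just deserializes each decoded Pub/Sub message into a row dict, for example:

def parse_json_message(message):
    # Hypothetical helper: turn the JSON string into a dict ready for writing.
    import json
    return json.loads(message)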
Example #11
from __future__ import division, print_function

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

from beam_nuggets.io import relational_db

with beam.Pipeline(options=PipelineOptions()) as p:
    months = p | "Reading month records" >> beam.Create([
        {
            'name': 'Jan',
            'num': 1
        },
        {
            'name': 'Feb',
            'num': 2
        },
    ])
    months | 'Writing to Sqlite table' >> relational_db.Write(
        source_config=relational_db.SourceConfiguration(
            drivername='sqlite',
            database='/tmp/months_db.sqlite',
            create_if_missing=True),
        table_config=relational_db.TableConfiguration(name='months',
                                                      create_if_missing=True))
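
To check the result, the same table can be read back with beam_nuggets' relational_db.ReadFromDB transform; a minimal read-back sketch using the same SQLite database:

with beam.Pipeline(options=PipelineOptions()) as p:
    months = p | 'Reading month records' >> relational_db.ReadFromDB(
        source_config=relational_db.SourceConfiguration(
            drivername='sqlite',
            database='/tmp/months_db.sqlite'),
        table_name='months')
    months | 'Printing rows' >> beam.Map(print)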
Example #12
##
# Change the subscription details as per your GCP project
##
INPUT_SUBSCRIPTION = "projects/hadooptest-223316/subscriptions/vitualStoreSubscriber"

##
# Change the path to the dir where your service account private key file is kept
##
SERVICE_ACCOUNT_PATH = "/home/aakash/credentials/pubsubtest.json"

##
# Change the details as per your MYSQL config
##
SOURCE_CONFIG_PROD = relational_db.SourceConfiguration(
    drivername="mysql+pymysql",
    host="35.200.253.253",
    port=3306,
    username="******",
    password="******",
    database="virtual_store",
    create_if_missing=False,  # do not create the database if missing
)

##
# Change the details as per your table name
##
TABLE_CONFIG = relational_db.TableConfiguration(
    name="transaction_data",
    create_if_missing=True,  # automatically create the table if not there
    primary_key_columns=["id"],  # and use 'num' column as primary key
)
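
These constants would typically be wired into a streaming pipeline much like Example #10. A minimal sketch of that wiring, assuming apache_beam, json, PipelineOptions and relational_db are imported at module level and that the Pub/Sub messages are JSON-encoded rows:

def run(pipeline_args=None):
    # Hypothetical wiring: stream Pub/Sub messages into the MySQL table
    # using the configuration objects defined above.
    options = PipelineOptions(pipeline_args, streaming=True, save_main_session=True)
    with beam.Pipeline(options=options) as p:
        (p
         | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
             subscription=INPUT_SUBSCRIPTION)
         | 'Decode bytes' >> beam.Map(lambda msg: msg.decode('utf-8'))
         | 'Parse JSON' >> beam.Map(json.loads)
         | 'Write to MySQL' >> relational_db.Write(
             source_config=SOURCE_CONFIG_PROD, table_config=TABLE_CONFIG))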
Example #13
import apache_beam as beam
from beam_nuggets.io import relational_db

# target database configuration
source_config = relational_db.SourceConfiguration(
    drivername='postgresql',
    host='localhost',
    port=5432,
    username='******',
    password='******',
    database='conekta',
)

table_config = relational_db.TableConfiguration(name='charges',
                                                create_if_missing=True,
                                                primary_key_columns=['id'])

# start the Apache Beam pipeline
p = beam.Pipeline()


class PrepareData(beam.DoFn):
    def process(self, element):
        # drop the '_id' field
        del element['_id']

        # remove the company name field
        del element['name']

        # make sure the amounts are floats
        try: