示例#1
0
 def test_dataset_schema(self):
     """Check that schema defaulting changes a dataset built with ``None`` header rows.

     The dataset is serialized before and after ``default_and_validate``;
     the test expects the two dictionaries to differ.
     """
     cols = [Column('c1', DataType.VARCHAR, 50), Column('c2', DataType.BIGINT)]
     # The third positional DataFormat argument is deliberately None here.
     data_format = DataFormat(FileFormat.PARQUET, RowFormat.NONE, None)
     dataset = Dataset(data=DatasetData(
         'test-dataset',
         'test_dataset_table',
         's3://bucket/prefix',
         data_format,
         cols,
     ))
     before = dataset.to_dict()
     after = default_and_validate(dataset, dataset_schema()).to_dict()
     # The test expects validation to have defaulted num_header_rows to 0,
     # which makes the "after" dictionary differ from the "before" one.
     self.assertNotEqual(before, after)
示例#2
0
def put_dataset(dataset):
    """Update a dataset from the JSON body of the current request.

    :param dataset: the existing dataset; only its ``id`` is read here.
    :return: a dict whose ``'results'`` entry is the dict form of the object
        returned by ``update_dataset``.
    """
    service = dataset_service()
    dataset_id = dataset.id
    replacement = Dataset.from_dict(request.get_json())
    updated = service.update_dataset(dataset_id, replacement)
    return {'results': updated.to_dict()}
示例#3
0
def infer_dataset_data(s3_path, max_lines):
    """
    Build a guessed Dataset from a sample delimited file.

    :type s3_path: str
    :param s3_path: The full s3 path to a sample delimited dataset
        file.

    :type max_lines: int
    :param max_lines: The maximum number of lines to peek.

    :return: a Dataset named 'guessed_dataset' whose columns, location,
        compression and delimiter settings come from the guesses made on
        the sample file.
    """
    guesses, has_header, compression, dialect = get_guesses(s3_path, max_lines)
    guessed_columns = columns_with_best_guess(guesses, has_header)
    guessed_location = guess_location(s3_path)

    # Delimiter, escape and quote characters are taken from the detected dialect.
    guessed_format = DataFormat(
        file_format=FileFormat.TEXTFILE,
        row_format=RowFormat.DELIMITED,
        delimited_by=dialect.delimiter,
        escaped_by=dialect.escapechar,
        quoted_by=dialect.quotechar,
    )
    guessed_data = DatasetData(
        name='guessed_dataset',
        table_name='public',
        data_format=guessed_format,
        location=guessed_location,
        columns=guessed_columns,
        compression=compression,
        load_type=LoadType.INSERT,
    )
    return Dataset(data=guessed_data)
示例#4
0
    def test_crud(self):
        """Exercise create, read, update and delete of a dataset through the client."""
        cols = [Column('c1', DataType.VARCHAR, 50), Column('c2', DataType.BIGINT)]
        parquet = DataFormat(FileFormat.PARQUET, RowFormat.NONE)
        # The same no-op action name is used for both name and table name.
        action_name = NoOpActionTypes.action_that_succeeds.name
        new_data = DatasetData(action_name,
                               action_name,
                               's3://bucket/prefix',
                               parquet,
                               cols,
                               tags=['foo'])
        ds = Dataset(data=new_data)

        # create
        created = self.dart.save_dataset(ds)
        self.assertEqual(created.data.to_dict(), ds.data.to_dict())

        # read
        fetched = self.dart.get_dataset(created.id)
        self.assertEqual(created.to_dict(), fetched.to_dict())

        # update
        fetched.data.compression = Compression.GZIP
        updated = self.dart.save_dataset(fetched)
        self.assertEqual(updated.data.compression, Compression.GZIP)
        self.assertNotEqual(created.to_dict(), updated.to_dict())

        # delete: a follow-up read is expected to raise with a 404 status
        self.dart.delete_dataset(fetched.id)
        try:
            self.dart.get_dataset(fetched.id)
        except DartRequestException as e:
            self.assertEqual(e.response.status_code, 404)
        else:
            self.fail('dataset should have been missing after delete!')
示例#5
0
文件: resolve.py 项目: chrisborg/dart
 def _resolve_and_save_dataset(self, entity_id, entity_map, actual_entities_by_node_id, actual_entities_by_unsaved_id):
     """Return the id of the dataset for ``entity_id``, saving it first if it is unsaved.

     When ``self._resolve`` yields a truthy actual id, that id is returned
     unchanged. Otherwise the dataset is rebuilt from
     ``entity_map['unsaved_entities']``, saved with ``commit=False,
     flush=True``, recorded in both lookup dicts, and its new id returned.
     """
     resolved_id, unsaved_id = self._resolve(EntityType.dataset, entity_id, entity_map, actual_entities_by_unsaved_id)
     if not resolved_id:
         key = self._node_id(EntityType.dataset, unsaved_id)
         unsaved = Dataset.from_dict(entity_map['unsaved_entities'][key])
         saved = self._dataset_service.save_dataset(unsaved, commit=False, flush=True)
         # Record the saved entity under both its node id and its unsaved id.
         actual_entities_by_node_id[key] = saved
         actual_entities_by_unsaved_id[unsaved_id] = saved
         return saved.id
     return resolved_id
示例#6
0
文件: resolve.py 项目: ophiradi/dart
 def _resolve_and_save_dataset(self, entity_id, entity_map, actual_entities_by_node_id, actual_entities_by_unsaved_id):
     """Resolve ``entity_id`` to a dataset id, saving the dataset when it is still unsaved.

     A truthy actual id from ``self._resolve`` is returned directly.
     Otherwise the entry under ``entity_map['unsaved_entities']`` is turned
     into a Dataset, saved with ``commit=False, flush=True``, stored in both
     bookkeeping dicts, and the saved dataset's id is returned.
     """
     existing_id, pending_id = self._resolve(EntityType.dataset, entity_id, entity_map, actual_entities_by_unsaved_id)
     if not existing_id:
         node_key = self._node_id(EntityType.dataset, pending_id)
         pending = Dataset.from_dict(entity_map['unsaved_entities'][node_key])
         persisted = self._dataset_service.save_dataset(pending, commit=False, flush=True)
         # Both lookup dicts receive the saved entity.
         actual_entities_by_node_id[node_key] = persisted
         actual_entities_by_unsaved_id[pending_id] = persisted
         return persisted.id
     return existing_id
示例#7
0
    def test_dataset_schema_invalid(self):
        """Schema validation is expected to reject a dataset whose location is None."""
        columns = [Column('c1', DataType.VARCHAR, 50), Column('c2', DataType.BIGINT)]
        df = DataFormat(FileFormat.PARQUET, RowFormat.NONE)
        location = None
        ds = Dataset(data=DatasetData('test-dataset', 'test_dataset_table', location, df, columns))

        # Only the call under test sits inside assertRaises, so an exception
        # raised while building the fixture cannot satisfy the assertion.
        with self.assertRaises(DartValidationException) as context:
            # should fail because location is required
            default_and_validate(ds, dataset_schema())

        self.assertIsInstance(context.exception, DartValidationException)
    def test_impala_table_definition_step(self):
        """Generate the impala copy_to_table step for a weblogs dataset and compare it to the expected file.

        The SQL template is copied under /tmp/dart-emr-test/impala/, the step
        is rendered by ``impala_copy_to_table``, and the rendered file must
        equal ``copy_to_table_weblogs_parquet.sql`` located beside this test.
        """
        parquet_format = DataFormat(
            file_format=FileFormat.PARQUET,
            row_format=RowFormat.NONE,
        )
        weblog_columns = [
            Column('ip', 'STRING'),
            Column('user', 'STRING'),
            Column('requestDate', 'TIMESTAMP', date_pattern='dd/MMM/yyyy:HH:mm:ss Z'),
            Column('httpMethod', 'STRING'),
            Column('urlPath', 'STRING'),
            Column('queryString', 'STRING'),
            Column('httpVersion', 'STRING'),
            Column('statusCode', 'STRING'),
            Column('bytesSent', 'INT'),
            Column('referrer', 'STRING'),
            Column('userAgent', 'STRING'),
            Column('responseTime', 'BIGINT'),
            Column('hostname', 'STRING'),
            Column('userFingerprint', 'STRING'),
            Column('userId', 'STRING'),
            Column('sessionId', 'STRING'),
            Column('requestId', 'STRING'),
            Column('visitorId', 'STRING'),
            Column('vegSlice', 'STRING'),
            Column('fruitSlice', 'STRING'),
            Column('cacheHitMiss', 'STRING'),
        ]
        partition_columns = [
            Column('year', 'STRING'),
            Column('week', 'STRING'),
            Column('day', 'STRING'),
        ]
        ds = Dataset(data=DatasetData(
            name='weblogs_v01',
            table_name='weblogs_parquet',
            location='s3://wsm-log-servers/weblogs/www.retailmenot.com/ec2/',
            data_format=parquet_format,
            columns=weblog_columns,
            compression='GZIP',
            partitions=partition_columns,
        ))

        # Stage the SQL template where the step generator looks for it.
        call('mkdir -p /tmp/dart-emr-test/impala/')
        this_path = os.path.dirname(os.path.abspath(__file__))
        template_source = this_path + '/../../../engine/emr/steps/impala/copy_to_table.sql'
        shutil.copyfile(template_source, '/tmp/dart-emr-test/impala/copy_to_table.sql')

        # The same dataset is passed as both the stage and the target.
        impala_copy_to_table(ds, 'weblogs_stage', ds, 'weblogs_parquet', 's3://test', '/tmp/dart-emr-test/', 'actionid123', 1, 1)

        expected_path = os.path.join(this_path, 'copy_to_table_weblogs_parquet.sql')
        with open(expected_path) as expected_file:
            expected_contents = expected_file.read()

        actual_path = '/tmp/dart-emr-test/impala/copy_to_table_weblogs_parquet.sql'
        with open(actual_path) as actual_file:
            actual_contents = actual_file.read()

        self.assertEqual(expected_contents, actual_contents)
示例#9
0
    def setUp(self):
        """Create the dataset, subscription, datastore, workflow and two template actions used by the tests.

        Every entity is saved through a Dart client pointed at localhost:5000
        and kept on ``self``. The S3 locations are derived from the
        ``DART_TEST_BUCKET`` environment variable.
        """
        client = Dart(host='localhost', port=5000)
        """ :type client: dart.client.python.dart_client.Dart """
        self.dart = client

        columns = [
            Column('c1', DataType.VARCHAR, 50),
            Column('c2', DataType.BIGINT),
        ]
        delimited = DataFormat(FileFormat.TEXTFILE, RowFormat.DELIMITED)
        # Shared prefix for every S3 path built below.
        impala_root = 's3://' + os.environ['DART_TEST_BUCKET'] + '/impala'
        self.dataset = self.dart.save_dataset(Dataset(data=DatasetData(
            name='test-dataset',
            table_name='test_dataset_table',
            load_type=LoadType.INSERT,
            location=impala_root,
            data_format=delimited,
            columns=columns,
            tags=[])))

        # Subscription bounded by start/end keys and restricted to *.rpm files.
        start_key = impala_root + '/impala'
        end_key = impala_root + '/install'
        rpm_pattern = '.*\\.rpm'
        subscription = Subscription(data=SubscriptionData(
            'test-subscription', self.dataset.id, start_key, end_key, rpm_pattern))
        self.subscription = self.dart.save_subscription(subscription)

        datastore = Datastore(data=DatastoreData(
            'test-datastore',
            'no_op_engine',
            args={'action_sleep_time_in_seconds': 0},
            state=DatastoreState.TEMPLATE))
        self.datastore = self.dart.save_datastore(datastore)

        workflow = Workflow(data=WorkflowData(
            'test-workflow', self.datastore.id, state=WorkflowState.ACTIVE))
        self.workflow = self.dart.save_workflow(workflow, self.datastore.id)

        succeed_name = NoOpActionTypes.action_that_succeeds.name
        succeed_action = Action(data=ActionData(succeed_name,
                                                succeed_name,
                                                state=ActionState.TEMPLATE))
        consume_name = NoOpActionTypes.consume_subscription.name
        consume_action = Action(data=ActionData(consume_name,
                                                consume_name,
                                                {'subscription_id': self.subscription.id},
                                                state=ActionState.TEMPLATE))
        saved_actions = self.dart.save_actions(
            [succeed_action, consume_action], workflow_id=self.workflow.id)
        self.action0, self.action1 = saved_actions
示例#10
0
def _get_engineless_static_subgraphs_by_related_type(graph_entity_service):
    """Build the static sub-graph templates, keyed by the type of the related entity.

    :param graph_entity_service: object providing
        ``to_entity_models_with_randomized_ids``, ``to_graph`` and
        ``to_entity_map``.
    :return: dict mapping ``None``, ``EntityType.dataset``,
        ``EntityType.event`` and ``EntityType.subscription`` to lists of
        SubGraph instances.
    """
    randomize = graph_entity_service.to_entity_models_with_randomized_ids

    def make_sub_graph(name, description, related_type, related_is_a, models, icon):
        # Graph and entity map are both derived from the same entity models.
        return SubGraph(
            name=name,
            description=description,
            related_type=related_type,
            related_is_a=related_is_a,
            graph=graph_entity_service.to_graph(None, models),
            entity_map=graph_entity_service.to_entity_map(models),
            icon=icon,
        )

    result = {}

    # Stand-alone entities: no related type, no relationship.
    dataset_models = randomize([
        Dataset(id=Ref.dataset(1),
                data=DatasetData(None, None, None, None, None,
                                 columns=[],
                                 partitions=[])),
    ])
    event_models = randomize([
        Event(id=Ref.event(1), data=EventData('event')),
    ])
    result[None] = [
        make_sub_graph('dataset', 'create a new dataset entity',
                       None, None, dataset_models, '⬟'),
        make_sub_graph('event', 'create a new event entity',
                       None, None, event_models, '★'),
    ]

    # A subscription hangs off a parent dataset.
    subscription_models = randomize([
        Subscription(id=Ref.subscription(1),
                     data=SubscriptionData('subscription', Ref.parent())),
    ])
    result[EntityType.dataset] = [
        make_sub_graph('subscription', 'create a new subscription entity',
                       EntityType.dataset, Relationship.PARENT,
                       subscription_models, '⬢'),
    ]

    # An event trigger hangs off a parent event.
    event_trigger_data = TriggerData(
        name='%s_trigger' % event_trigger.name,
        trigger_type_name=event_trigger.name,
        state=TriggerState.INACTIVE,
        workflow_ids=[],
        args={'event_id': Ref.parent()},
    )
    event_trigger_models = randomize([
        Trigger(id=Ref.trigger(1), data=event_trigger_data),
    ])
    result[EntityType.event] = [
        make_sub_graph('event trigger', 'create a new event trigger entity',
                       EntityType.event, Relationship.PARENT,
                       event_trigger_models, '▼'),
    ]

    # A subscription batch trigger hangs off a parent subscription.
    batch_trigger_data = TriggerData(
        name='%s_trigger' % subscription_batch_trigger.name,
        trigger_type_name=subscription_batch_trigger.name,
        state=TriggerState.INACTIVE,
        workflow_ids=[],
        args={
            'subscription_id': Ref.parent(),
            'unconsumed_data_size_in_bytes': 1000000,
        },
    )
    batch_trigger_models = randomize([
        Trigger(id=Ref.trigger(1), data=batch_trigger_data),
    ])
    result[EntityType.subscription] = [
        make_sub_graph('subscription batch trigger',
                       'create a new subscription batch trigger entity',
                       EntityType.subscription, Relationship.PARENT,
                       batch_trigger_models, '▼'),
    ]

    return result
    def test_hive_table_definition_step(self):
        """Generate the hive copy_to_table step for a JSON dataset and compare it to the expected file.

        The HQL template is copied under /tmp/dart-emr-test/hive/, the step is
        rendered by ``hive_copy_to_table`` from an all-STRING stage copy into
        an RCFILE target copy, and the rendered file must equal
        ``copy_to_table_owen_eu.hql`` located beside this test.
        """
        json_format = DataFormat(
            file_format=FileFormat.TEXTFILE,
            row_format=RowFormat.JSON,
        )
        owen_columns = [
            Column('host', 'STRING', path='metadata.host'),
            Column('pageName', 'STRING', path='owen.context.pageName'),
            Column('viewInstanceUuid', 'STRING', path='owen.context.viewInstanceUuid'),
            Column('previousPageName', 'STRING', path='owen.context.previousPageName'),
            Column('previousViewInstanceUuid', 'STRING', path='owen.context.previousViewInstanceUuid'),
            Column('session', 'STRING', path='owen.context.session'),
            Column('pageType', 'STRING', path='owen.context.pageType'),
            Column('propertyName', 'STRING', path='owen.context.propertyName'),
            Column('enviroment', 'STRING', path='owen.context.environment'),
            Column('appForegroundFlag', 'BOOLEAN', path='owen.context.appForegroundFlag'),
            Column('bluetoothEnabledFlag', 'BOOLEAN', path='owen.context.bluetoothEnabledFlag'),
            Column('favoriteFlag', 'BOOLEAN', path='owen.context.favoriteFlag'),
            Column('locationEnabledFlag', 'BOOLEAN', path='owen.context.locationEnabledFlag'),
            Column('loggedInFlag', 'BOOLEAN', path='owen.context.loggedInFlag'),
            Column('notificationEnabledFlag', 'BOOLEAN', path='owen.context.notificationEnabledFlag'),
            Column('personalizationFlag', 'BOOLEAN', path='owen.context.personalizationFlag'),
            Column('advertiserUuid', 'STRING', path='owen.context.advertiserUuid'),
            Column('udid', 'STRING', path='owen.context.udid'),
            Column('userQualifier', 'STRING', path='owen.context.userQualifier'),
            Column('userId', 'STRING', path='owen.context.custom.legacy.userId'),
            Column('userUuid', 'STRING', path='owen.context.userUuid'),
            Column('macAddress', 'STRING', path='owen.context.macAddress'),
            Column('ipAddress', 'STRING', path='owen.context.ipAddress'),
            Column('osVersion', 'STRING', path='owen.context.osVersion'),
            Column('osFamily', 'STRING', path='owen.context.osFamily'),
            Column('osName', 'STRING', path='owen.context.osName'),
            Column('browserFamily', 'STRING', path='owen.context.browserFamily'),
            Column('deviceCategory', 'STRING', path='owen.context.deviceCategory'),
            Column('deviceMake', 'STRING', path='owen.context.mobileDeviceMake'),
            Column('deviceModel', 'STRING', path='owen.context.mobileDeviceModel'),
            Column('connectionType', 'STRING', path='owen.context.connectionType'),
            Column('userAgent', 'STRING', path='owen.context.userAgent'),
            Column('geofenceId', 'STRING', path='owen.context.custom.legacy.geofenceId'),
            Column('eventTimestamp', 'TIMESTAMP', path='owen.event.eventTimestamp', date_pattern="yyyy-MM-dd'T'HH:mm:ssZ"),
            Column('eventInstanceUuid', 'STRING', path='owen.event.eventInstanceUuid'),
            Column('eventPlatformVersion', 'STRING', path='owen.event.eventPlatformVersion'),
            Column('eventVersion', 'STRING', path='owen.event.eventVersion'),
            Column('eventCategory', 'STRING', path='owen.event.eventCategory'),
            Column('eventName', 'STRING', path='owen.event.eventName'),
            Column('eventAction', 'STRING', path='owen.event.eventAction'),
            Column('eventPlatform', 'STRING', path='owen.event.eventPlatform'),
            Column('testUnixTimestampSecondsPattern', 'TIMESTAMP', path='some.fake.path.testUnixTimestampSecondsPattern', date_pattern='UNIX_TIMESTAMP_SECONDS'),
            Column('testUnixTimestampMillisPattern', 'TIMESTAMP', path='some.fake.path.testUnixTimestampMillisPattern', date_pattern='UNIX_TIMESTAMP_MILLIS'),
        ]
        partition_columns = [
            Column('year', 'STRING'),
            Column('week', 'STRING'),
            Column('day', 'STRING'),
        ]
        ds = Dataset(data=DatasetData(
            name='owen_eu_v01',
            table_name='owen_eu',
            location='s3://s3-rpt-uss-dat-warehouse/prd/inbound/overlord/eu-all-events',
            data_format=json_format,
            columns=owen_columns,
            compression='GZIP',
            partitions=partition_columns,
        ))

        # Stage the HQL template where the step generator looks for it.
        call('mkdir -p /tmp/dart-emr-test/hive/')
        this_path = os.path.dirname(os.path.abspath(__file__))
        template_source = this_path + '/../../../engine/emr/steps/hive/copy_to_table.hql'
        shutil.copyfile(template_source, '/tmp/dart-emr-test/hive/copy_to_table.hql')
        action_id = 'actionid123'

        # Target: a copy of the dataset stored as RCFILE.
        target_dataset = Dataset.from_dict(ds.to_dict())
        # NOTE(review): this assignment is made on the data_format object that
        # is replaced on the very next line -- confirm whether it was meant to
        # apply to the new RCFILE format instead.
        target_dataset.data.data_format.num_header_rows = 0
        target_dataset.data.data_format = DataFormat(FileFormat.RCFILE, RowFormat.NONE)

        # Stage: a copy of the dataset in which every column is typed STRING.
        stage_dataset = Dataset.from_dict(ds.to_dict())
        assert isinstance(stage_dataset, Dataset)
        for stage_column in stage_dataset.data.columns:
            stage_column.data_type = DataType.STRING

        hive_copy_to_table(stage_dataset, 'owen_eu_stage', target_dataset, 'owen_eu', 's3://test', '/tmp/dart-emr-test/', action_id, None, 1, 1)

        expected_path = os.path.join(this_path, 'copy_to_table_owen_eu.hql')
        with open(expected_path) as expected_file:
            expected_contents = expected_file.read()

        actual_path = '/tmp/dart-emr-test/hive/copy_to_table_owen_eu.hql'
        with open(actual_path) as actual_file:
            actual_contents = actual_file.read()

        self.assertEqual(expected_contents, actual_contents)
示例#12
0
 Dataset(
     id='PDUZ8EDNOR',
     data=(DatasetData(
         name='beacon_native_app_parsed_gzipped_v03',
         table_name='beacon_native_app',
         location=
         's3://example-bucket/prd/beacon/native_app/v3/dwh-delimited/gzipped',
         load_type=LoadType.INSERT,
         distribution_keys=['created'],
         sort_keys=['created', 'eventtype'],
         hive_compatible_partition_folders=True,
         data_format=DataFormat(
             FileFormat.TEXTFILE,
             RowFormat.DELIMITED,
             delimited_by='\t',
             quoted_by='"',
             escaped_by='\\',
             null_string='NULL',
         ),
         compression=Compression.GZIP,
         partitions=[Column('createdpartition', DataType.STRING)],
         columns=[
             Column('logfileid', DataType.INT),
             Column('linenumber', DataType.INT),
             Column('created',
                    DataType.TIMESTAMP,
                    date_pattern="yyyy-MM-dd HH:mm:ss"),
             Column('remoteip', DataType.VARCHAR, 500),
             Column('useragent', DataType.VARCHAR, 2500),
             Column('eventtype', DataType.VARCHAR, 255),
             Column('appversion', DataType.VARCHAR, 255),
             Column('advertiserid', DataType.VARCHAR, 2048),
             Column('couponsonpage', DataType.INT),
             Column('coupons', DataType.VARCHAR, 10000),
             Column('channel', DataType.VARCHAR, 128),
             Column('geocouponcount', DataType.BIGINT),
             Column('geofence', DataType.VARCHAR, 255),
             Column('geofencetimespent',
                    DataType.NUMERIC,
                    precision=14,
                    scale=5),
             Column('loginstatus', DataType.VARCHAR, 25),
             Column('products', DataType.VARCHAR, 2500),
             Column('session', DataType.VARCHAR, 5000),
             Column('systemname', DataType.VARCHAR, 2500),
             Column('systemversion', DataType.VARCHAR, 2500),
             Column('udid', DataType.VARCHAR, 128),
             Column('userqualifier', DataType.VARCHAR, 64),
             Column('url', DataType.VARCHAR, 5000),
             Column('user_uuid', DataType.VARCHAR, 64),
             Column('userid', DataType.INT),
             Column('searchtype', DataType.VARCHAR, 128),
             Column('searchlistterm', DataType.VARCHAR, 512),
             Column('searchterm', DataType.VARCHAR, 512),
             Column('emailuuid', DataType.VARCHAR, 128),
             Column('userfingerprint', DataType.VARCHAR, 64),
             Column('locationstatus', DataType.VARCHAR, 128),
             Column('pushnotificationstatus', DataType.VARCHAR, 128),
             Column('placement', DataType.VARCHAR, 1024),
             Column('loc', DataType.VARCHAR, 128),
             Column('ppoi0', DataType.VARCHAR, 128),
             Column('ppoi1', DataType.VARCHAR, 128),
             Column('ppoi2', DataType.VARCHAR, 128),
             Column('ppoi3', DataType.VARCHAR, 128),
             Column('ppoi4', DataType.VARCHAR, 128),
             Column('applaunchnotificationtype', DataType.VARCHAR, 128),
             Column('scenarioname', DataType.VARCHAR, 128),
             Column('behaviorname', DataType.VARCHAR, 128),
             Column('coupontype', DataType.VARCHAR, 128),
             Column('couponposition', DataType.VARCHAR, 128),
             Column('hasqsrcontent', DataType.VARCHAR, 128),
             Column('promptname', DataType.VARCHAR, 128),
             Column('locationpermissionchanage', DataType.VARCHAR, 128),
             Column('couponproblemtype', DataType.VARCHAR, 128),
             Column('storetitle', DataType.VARCHAR, 128),
             Column('mallname', DataType.VARCHAR, 128),
             Column('restaurantname', DataType.VARCHAR, 128),
             Column('milesaway', DataType.VARCHAR, 128),
             Column('menuitem', DataType.VARCHAR, 128),
             Column('toolname', DataType.VARCHAR, 128),
             Column('toolaction', DataType.VARCHAR, 128),
             Column('toolstep', DataType.VARCHAR, 128),
             Column('mallposition', DataType.VARCHAR, 128),
             Column('recommendstorename', DataType.VARCHAR, 128),
             Column('recommendstoreposition', DataType.VARCHAR, 128),
             Column('favoritestorename', DataType.VARCHAR, 128),
             Column('favoritestoreaction', DataType.VARCHAR, 128),
             Column('favoritestoreposition', DataType.VARCHAR, 128),
             Column('favoritesiteid', DataType.VARCHAR, 128),
             Column('receivername', DataType.VARCHAR, 128),
             Column('outclickbuttonprompt', DataType.VARCHAR, 128),
             Column('datasource', DataType.VARCHAR, 1024),
             Column('searchresultcount', DataType.VARCHAR, 128),
             Column('searchresultposition', DataType.VARCHAR, 128),
             Column('sharetype', DataType.VARCHAR, 128),
             Column('daysuntilexpiration', DataType.VARCHAR, 128),
             Column('firedate', DataType.VARCHAR, 128),
             Column('settingschangevalue', DataType.VARCHAR, 128),
             Column('settingschangetype', DataType.VARCHAR, 128),
             Column('settingschangelocation', DataType.VARCHAR, 128),
             Column('clickaction', DataType.VARCHAR, 128),
             Column('tnt', DataType.VARCHAR, 128),
             Column('previouspage', DataType.VARCHAR, 2500),
             Column('clickpage', DataType.VARCHAR, 2500),
             Column('launchreason', DataType.VARCHAR, 128),
             Column('taplyticsData', DataType.VARCHAR, 150),
             Column('appCampaign', DataType.VARCHAR, 50),
             Column('accountMethod', DataType.VARCHAR, 60),
             Column('appState', DataType.VARCHAR, 100),
             Column('btStatus', DataType.BOOLEAN),
             Column('btBeaconId', DataType.VARCHAR, 500),
             Column('btBeaconFactoryId', DataType.VARCHAR, 500),
             Column('btBeaconName', DataType.VARCHAR, 500),
             Column('btTimeSpent', DataType.VARCHAR, 50),
             Column('purchaseId', DataType.VARCHAR, 500),
             Column('transactionId', DataType.VARCHAR, 500),
             Column('outclickLink', DataType.VARCHAR, 1000),
             Column('outclickPage', DataType.VARCHAR, 300),
             Column('featuredCouponPosition', DataType.INT),
             Column('commentCount', DataType.INT),
             Column('mallCount', DataType.INT),
             Column('clickCount', DataType.INT),
             Column('merchantName', DataType.VARCHAR, 100),
             Column('merchantPosition', DataType.INT),
             Column('couponUuids', DataType.VARCHAR, 10000),
             Column('favoriteSiteUuid', DataType.VARCHAR, 50),
             Column('deepLinkType', DataType.VARCHAR, 40),
             Column('adUnitUuid', DataType.VARCHAR, 50),
         ],
     ))))
示例#13
0
def post_dataset():
    """
    Build a Dataset from the JSON body of the current request, save it through the dataset service and
    return the saved dataset's dict form under the 'results' key.
    """
    save = dataset_service().save_dataset
    saved = save(Dataset.from_dict(request.get_json()))
    return {'results': saved.to_dict()}
示例#14
0
 # Dataset definition 'beacon_native_app_parsed_v01' (table 'beacon_native_app'): uncompressed,
 # tab-delimited text files with one header row, partitioned by year and week, loaded with LoadType.INSERT.
 Dataset(data=(DatasetData(
     name='beacon_native_app_parsed_v01',
     table_name='beacon_native_app',
     location='s3://example-bucket/nb.retailmenot.com/parsed_logs',
     load_type=LoadType.INSERT,
     # delimited text: tab separated, double-quote quoted, backslash escaped, 'NULL' as the null marker
     data_format=DataFormat(FileFormat.TEXTFILE,
                            RowFormat.DELIMITED,
                            delimited_by='\t',
                            quoted_by='"',
                            escaped_by='\\',
                            null_string='NULL',
                            num_header_rows=1),
     compression=Compression.NONE,
     # partition columns
     partitions=[
         Column('year', DataType.STRING),
         Column('week', DataType.STRING),
     ],
     # data columns
     columns=[
         Column('logFileId', DataType.BIGINT),
         Column('lineNumber', DataType.INT),
         # the only column here that carries an explicit date pattern
         Column('created',
                DataType.TIMESTAMP,
                date_pattern="yyyy-MM-dd HH:mm:ss"),
         Column('remoteip', DataType.STRING),
         Column('useragent', DataType.STRING),
         Column('eventType', DataType.STRING),
         Column('appVersion', DataType.STRING),
         Column('advertiserID', DataType.STRING),
         Column('couponsOnPage', DataType.INT),
         Column('coupons', DataType.STRING),
         Column('channel', DataType.STRING),
         Column('geoCouponCount', DataType.STRING),
         Column('geofence', DataType.STRING),
         Column('geofenceTimeSpent', DataType.STRING),
         Column('loginStatus', DataType.STRING),
         Column('products', DataType.STRING),
         Column('session', DataType.STRING),
         Column('systemName', DataType.STRING),
         Column('systemVersion', DataType.STRING),
         Column('udid', DataType.STRING),
         Column('userQualifier', DataType.STRING),
         Column('url', DataType.STRING),
         Column('user_uuid', DataType.STRING),
         Column('userId', DataType.STRING),
         Column('searchType', DataType.STRING),
         Column('searchListTerm', DataType.STRING),
         Column('searchTerm', DataType.STRING),
         Column('emailUUId', DataType.STRING),
         Column('userFingerprint', DataType.STRING),
         Column('locationStatus', DataType.STRING),
         Column('pushNotificationStatus', DataType.BOOLEAN),
         Column('placement', DataType.STRING),
         Column('loc', DataType.STRING),
         Column('ppoi0', DataType.STRING),
         Column('ppoi1', DataType.STRING),
         Column('ppoi2', DataType.STRING),
         Column('ppoi3', DataType.STRING),
         Column('ppoi4', DataType.STRING),
         Column('appLaunchNotificationType', DataType.STRING),
         Column('scenarioName', DataType.STRING),
         Column('behaviorName', DataType.STRING),
         Column('couponType', DataType.STRING),
         Column('couponPosition', DataType.STRING),
         Column('hasQSRContent', DataType.BOOLEAN),
         Column('promptName', DataType.STRING),
         # NOTE(review): 'Chanage' looks like a misspelling of 'Change'; this is a runtime column name,
         # so confirm with consumers of the table before renaming
         Column('locationPermissionChanage', DataType.STRING),
         Column('couponProblemType', DataType.STRING),
         Column('storeTitle', DataType.STRING),
         Column('mallName', DataType.STRING),
         Column('restaurantName', DataType.STRING),
         # NOTE(review): the type is the raw string 'float' while every other column uses a DataType
         # constant -- confirm whether a DataType member should be used here
         Column('milesAway', 'float'),
         Column('menuItem', DataType.STRING),
         Column('toolName', DataType.STRING),
         Column('toolAction', DataType.STRING),
         Column('toolStep', DataType.STRING),
         Column('mallPosition', DataType.INT),
         Column('recommendStoreName', DataType.STRING),
         Column('recommendStorePosition', DataType.INT),
         Column('favoriteStoreName', DataType.STRING),
         Column('favoriteStoreAction', DataType.STRING),
         Column('favoriteStorePosition', DataType.INT),
         Column('favoriteSiteId', DataType.STRING),
         Column('receiverName', DataType.STRING),
         Column('outclickButtonPrompt', DataType.STRING),
         Column('dataSource', DataType.STRING),
         Column('searchResultCount', DataType.INT),
         Column('searchResultPosition', DataType.INT),
         Column('shareType', DataType.STRING),
         Column('daysUntilExpiration', DataType.INT),
         Column('fireDate', DataType.BIGINT),
         Column('settingsChangeValue', DataType.STRING),
         Column('settingsChangeType', DataType.STRING),
         Column('settingsChangeLocation', DataType.STRING),
         Column('clickAction', DataType.STRING),
         Column('tnt', DataType.STRING),
         Column('previousPage', DataType.STRING),
         Column('clickPage', DataType.STRING),
         Column('launchReason', DataType.STRING),
         Column('taplyticsData', DataType.STRING),
         Column('appCampaign', DataType.STRING),
         Column('accountMethod', DataType.STRING),
         Column('appState', DataType.STRING),
         Column('btStatus', DataType.BOOLEAN),
         Column('btBeaconId', DataType.STRING),
         Column('btBeaconFactoryId', DataType.STRING),
         Column('btBeaconName', DataType.STRING),
         Column('btTimeSpent', DataType.STRING),
         Column('purchaseId', DataType.STRING),
         Column('transactionId', DataType.STRING),
         Column('outclickLink', DataType.STRING),
         Column('outclickPage', DataType.STRING),
         Column('featuredCouponPosition', DataType.INT),
         Column('commentCount', DataType.INT),
         Column('mallCount', DataType.INT),
         Column('clickCount', DataType.INT),
         Column('merchantName', DataType.STRING),
         Column('merchantPosition', DataType.INT),
     ],
 ))))
示例#15
0
 # Dataset definition 'owen_eu_DW-3213_v3' (table 'owen_eu'): gzip-compressed text files with JSON rows,
 # partitioned by year/month/day, loaded with LoadType.MERGE.
 # Every column carries a `path`; presumably this is the dotted location of the value inside each JSON
 # document -- confirm against the Column implementation.
 Dataset(data=(DatasetData(
     name='owen_eu_DW-3213_v3',
     table_name='owen_eu',
     location='s3://example-bucket/prd/inbound/overlord/eu-all-events',
     load_type=LoadType.MERGE,
     data_format=DataFormat(
         file_format=FileFormat.TEXTFILE,
         row_format=RowFormat.JSON,
     ),
     compression=Compression.GZIP,
     # partition columns
     partitions=[
         Column('year', DataType.STRING),
         Column('month', DataType.STRING),
         Column('day', DataType.STRING),
     ],
     # data columns
     columns=[
         Column('host', DataType.STRING, path='metadata.host'),
         Column('referer', DataType.STRING, path='metadata.referer'),
         Column('userAgent',
                DataType.STRING,
                path='owen.context.userAgent'),
         Column('ipAddress',
                DataType.STRING,
                path='owen.context.ipAddress'),
         Column('session', DataType.STRING,
                path='owen.context.session'),
         Column('propertyName',
                DataType.STRING,
                path='owen.context.propertyName'),
         Column(
             'pageName', DataType.STRING, path='owen.context.pageName'),
         Column('previousPageName',
                DataType.STRING,
                path='owen.context.previousPageName'),
         Column('viewInstanceUuid',
                DataType.STRING,
                path='owen.context.viewInstanceUuid'),
         Column('previousViewInstanceUuid',
                DataType.STRING,
                path='owen.context.previousViewInstanceUuid'),
         Column(
             'pageType', DataType.STRING, path='owen.context.pageType'),
         Column('udid', DataType.STRING, path='owen.context.udid'),
         Column('advertiserUuid',
                DataType.STRING,
                path='owen.context.advertiserUuid'),
         Column(
             'osFamily', DataType.STRING, path='owen.context.osFamily'),
         Column(
             'latitude', DataType.STRING, path='owen.context.latitude'),
         Column('longitude',
                DataType.STRING,
                path='owen.context.longitude'),
         Column('userId',
                DataType.STRING,
                path='owen.context.custom.legacy.userId'),
         Column('geofenceId',
                DataType.STRING,
                path='owen.context.custom.legacy.geofenceId'),
         Column(
             'userUuid', DataType.STRING, path='owen.context.userUuid'),
         # the inventory[0] paths below presumably read only the first entry of the inventory array --
         # TODO confirm how additional entries are handled
         Column('offerId',
                DataType.STRING,
                path='owen.context.inventory[0].inventoryUuid'),
         Column('inventorySource',
                DataType.STRING,
                path='owen.context.inventory[0].inventorySource'),
         Column('expirationDate',
                DataType.STRING,
                path='owen.context.inventory[0].expirationDate'),
         Column('position',
                DataType.STRING,
                path='owen.context.inventory[0].position'),
         Column('offerType',
                DataType.STRING,
                path='owen.context.inventory[0].inventoryType'),
         Column('eventInstanceUuid',
                DataType.STRING,
                path='owen.event.eventInstanceUuid'),
         # the only non-string, non-boolean, non-int column; it carries an explicit date pattern
         Column('eventTimestamp',
                DataType.TIMESTAMP,
                path='owen.event.eventTimestamp',
                date_pattern="yyyy-MM-dd'T'HH:mm:ss'Z'"),
         Column('eventPlatform',
                DataType.STRING,
                path='owen.event.eventPlatform'),
         Column('eventCategory',
                DataType.STRING,
                path='owen.event.eventCategory'),
         Column('eventAction',
                DataType.STRING,
                path='owen.event.eventAction'),
         Column(
             'eventName', DataType.STRING, path='owen.event.eventName'),
         Column('eventTarget',
                DataType.STRING,
                path='owen.event.eventTarget'),
         Column('eventVersion',
                DataType.STRING,
                path='owen.event.eventVersion'),
         Column('userQualifier',
                DataType.STRING,
                path='owen.context.userQualifier'),
         Column('outclickUuid',
                DataType.STRING,
                path='owen.context.inventory[0].outclickUuid'),
         Column('inventoryName',
                DataType.STRING,
                path='owen.context.inventory[0].inventoryName'),
         # NOTE(review): the column name is spelled 'enviroment' while its path uses 'environment';
         # the name is a runtime identifier, so confirm with consumers of the table before renaming
         Column('enviroment',
                DataType.STRING,
                path='owen.context.environment'),
         Column('loggedInFlag',
                DataType.STRING,
                path='owen.context.loggedInFlag'),
         Column('eventPlatformVersion',
                DataType.STRING,
                path='owen.event.eventPlatformVersion'),
         Column('appForegroundFlag',
                DataType.BOOLEAN,
                path='owen.context.appForegroundFlag'),
         Column('bluetoothEnabledFlag',
                DataType.BOOLEAN,
                path='owen.context.bluetoothEnabledFlag'),
         Column('favoriteFlag',
                DataType.BOOLEAN,
                path='owen.context.favoriteFlag'),
         Column('locationEnabledFlag',
                DataType.BOOLEAN,
                path='owen.context.locationEnabledFlag'),
         Column('notificationEnabledFlag',
                DataType.BOOLEAN,
                path='owen.context.notificationEnabledFlag'),
         Column('personalizationFlag',
                DataType.BOOLEAN,
                path='owen.context.personalizationFlag'),
         Column('macAddress',
                DataType.STRING,
                path='owen.context.macAddress'),
         Column('osVersion',
                DataType.STRING,
                path='owen.context.osVersion'),
         Column('osName', DataType.STRING, path='owen.context.osName'),
         Column('browserFamily',
                DataType.STRING,
                path='owen.context.browserFamily'),
         Column('deviceCategory',
                DataType.STRING,
                path='owen.context.deviceCategory'),
         Column('deviceMake',
                DataType.STRING,
                path='owen.context.mobileDeviceMake'),
         Column('deviceModel',
                DataType.STRING,
                path='owen.context.mobileDeviceModel'),
         Column('connectionType',
                DataType.STRING,
                path='owen.context.connectionType'),
         Column('browserVersion',
                DataType.STRING,
                path='owen.context.browserVersion'),
         Column('city', DataType.STRING, path='owen.context.city'),
         Column('country', DataType.STRING,
                path='owen.context.country'),
         Column('region', DataType.STRING, path='owen.context.region'),
         Column('partialSearchTerm',
                DataType.STRING,
                path='owen.context.partialSearchTerm'),
         Column('outclickURL',
                DataType.STRING,
                path='owen.context.inventory[0].outRedirectUrl'),
         Column('clickLocation',
                DataType.STRING,
                path='owen.context.inventory[0].clickLocation'),
         Column('inventoryChannel',
                DataType.STRING,
                path='owen.context.inventory[0].inventoryChannel'),
         Column('brand',
                DataType.STRING,
                path='owen.context.inventory[0].brand'),
         Column('commentsCount',
                DataType.INT,
                path='owen.context.inventory[0].commentsCount'),
         Column('legacyOfferId',
                DataType.STRING,
                path='owen.context.custom.legacy.offerIds.offerId'),
         Column('pageViewHash',
                DataType.STRING,
                path='owen.context.custom.legacy.pageViewHash'),
         Column('vIdInt',
                DataType.STRING,
                path='owen.context.custom.legacy.vIdInt'),
         Column('merchantId',
                DataType.STRING,
                path='owen.context.custom.legacy.merchantId'),
         Column('facebookConnect',
                DataType.STRING,
                path='owen.context.custom.facebookConnect'),
         Column('schemaKey', DataType.STRING, path='schema.key'),
     ],
 ))))
示例#16
0
 # Save the 'weblogs_v01' dataset (table 'weblogs'): bz2-compressed text files whose rows are parsed with
 # a regular expression, partitioned by year and week.
 dataset = dart.save_dataset(Dataset(data=DatasetData(
     name='weblogs_v01',
     table_name='weblogs',
     location='s3://example-bucket/weblogs/www.retailmenot.com/ec2/',
     data_format=DataFormat(
         file_format=FileFormat.TEXTFILE,
         row_format=RowFormat.REGEX,
         # 21 named capture groups, one per entry in `columns` below, in the same order
         regex_input="(?<ip>^(?:(?:unknown(?:,\\s)?|(?:\\d+\\.\\d+\\.\\d+\\.\\d+(?:,\\s)?))+)|\\S*)\\s+\\S+\\s+(?<userIdentifier>(?:[^\\[]+|\\$\\S+\\['\\S+'\\]|\\[username\\]))\\s*\\s+\\[(?<requestDate>[^\\]]+)\\]\\s+\"(?<httpMethod>(?:GET|HEAD|POST|PUT|DELETE|TRACE))\\s(?<urlPath>(?:[^ ?]+))(?:\\?(?<queryString>(?:[^ ]+)))?\\sHTTP/(?<httpVersion>(?:[\\d\\.]+))\"\\s+(?<statusCode>[0-9]+)\\s+(?<bytesSent>\\S+)\\s+\"(?<referrer>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userAgent>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+(?<responseTime>[-0-9]*)\\s+\"(?<hostName>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userFingerprint>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<sessionId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<requestId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<visitorId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<vegSlice>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<fruitSlice>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<cacheHitMiss>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s*\\Z",
         # every placeholder uses an explicit argument index (N$); the last one previously read '%21s',
         # which in Formatter-style syntax is a width-21 field bound to the first argument, not the 21st.
         # Presumably this string is consumed as a java.util.Formatter pattern (as Hive's RegexSerDe
         # does with output.format.string) -- confirm against the code that reads regex_output.
         regex_output="%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s %11$s %12$s %13$s %14$s %15$s %16$s %17$s %18$s %19$s %20$s %21$s",
     ),
     # NOTE(review): the column names 'user' and 'hostname' differ from the regex group names
     # 'userIdentifier' and 'hostName'; presumably columns map to groups by position -- confirm
     columns=[
         Column('ip', DataType.STRING),
         Column('user', DataType.STRING),
         Column('requestDate', DataType.TIMESTAMP, date_pattern='dd/MMM/yyyy:HH:mm:ss Z'),
         Column('httpMethod', DataType.STRING),
         Column('urlPath', DataType.STRING),
         Column('queryString', DataType.STRING),
         Column('httpVersion', DataType.STRING),
         Column('statusCode', DataType.STRING),
         Column('bytesSent', DataType.INT),
         Column('referrer', DataType.STRING),
         Column('userAgent', DataType.STRING),
         Column('responseTime', DataType.BIGINT),
         Column('hostname', DataType.STRING),
         Column('userFingerprint', DataType.STRING),
         Column('userId', DataType.STRING),
         Column('sessionId', DataType.STRING),
         Column('requestId', DataType.STRING),
         Column('visitorId', DataType.STRING),
         Column('vegSlice', DataType.STRING),
         Column('fruitSlice', DataType.STRING),
         Column('cacheHitMiss', DataType.STRING),
     ],
     compression=Compression.BZ2,
     # partition columns
     partitions=[
         Column('year', DataType.STRING),
         Column('week', DataType.STRING),
     ],
 )))
示例#17
0
def prepare_load_dataset_steps(dry_run,
                               args_by_name,
                               datastore,
                               dataset,
                               action_id,
                               s3_path_and_file_size_gen,
                               target_is_dynamodb=False):
    """
    Assemble the ordered list of step wrappers that load ``dataset`` into its target table.

    The first step always copies the data (s3distcp).  Depending on the requested target settings, the data
    is then exposed directly as the target table, pushed through one or two staging tables, or copied into
    a DynamoDB-backed table.  Steps queued after the load itself are flagged with
    ``action_considered_successful = True``.

    :param dry_run: when truthy, the locally generated step files are not copied to the s3 step path
    :param args_by_name: action arguments; a truthy ``target_*`` entry overrides the matching dataset setting
    :param datastore: handed to ``prepare_step_paths`` to resolve the step paths
    :type dataset: dart.model.dataset.Dataset
    :param action_id: string appended to the staging table names and passed to every step
    :param s3_path_and_file_size_gen: passed through unchanged to the s3distcp step
    :param target_is_dynamodb: when truthy, build the DynamoDB flavour of the load
    :return: list of StepWrapper instances in execution order
    """
    # some steps render dataset specific files from templates, so everything is staged in a scratch directory
    tempdir = tempfile.mkdtemp()
    try:
        local_step_path, s3_step_path, s3_temp_path = prepare_step_paths(datastore, tempdir)

        source = dataset.data
        source_format = source.data_format

        # explicitly supplied (truthy) action arguments win over what the dataset itself declares
        target_table_name = args_by_name.get('target_table_name') or source.table_name
        target_file_format = args_by_name.get('target_file_format') or source_format.file_format
        target_row_format = args_by_name.get('target_row_format') or source_format.row_format
        target_compression = args_by_name.get('target_compression') or source.compression
        target_delimited_by = args_by_name.get('target_delimited_by') or source_format.delimited_by
        target_quoted_by = args_by_name.get('target_quoted_by') or source_format.quoted_by
        target_escaped_by = args_by_name.get('target_escaped_by') or source_format.escaped_by
        target_null_string = args_by_name.get('target_null_string') or source_format.null_string

        stage_table_name = target_table_name + '_stage_for_action_' + action_id

        # a stage table can be skipped only when the target layout is exactly the source layout
        staging_not_needed = (
            target_file_format == source_format.file_format
            and target_row_format == source_format.row_format
            and target_compression == source.compression
            and target_delimited_by == source_format.delimited_by
            and target_quoted_by == source_format.quoted_by
            and target_escaped_by == source_format.escaped_by
            and target_null_string == source_format.null_string
        )
        if target_is_dynamodb or not staging_not_needed:
            first_table_name = stage_table_name
        else:
            first_table_name = target_table_name

        drop_table_names = []
        queued = []

        def enqueue(func, *args):
            # bind everything except the final argument (the total number of steps), which is only known
            # once the whole queue has been built; the 1-based step number goes right before it
            queued.append(functools.partial(func, *(args + (len(queued) + 1,))))

        # ------------------------------------------------------------------------------------------------------------
        # every code path needs the data copied to HDFS first; hive requires the table name to be lowercase
        # ------------------------------------------------------------------------------------------------------------
        enqueue(s3distcp_files_step, s3_path_and_file_size_gen, first_table_name.lower(), dataset,
                s3_step_path, local_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # s3 folder layouts that are not hive compatible get their directories renamed after the copy
        # ------------------------------------------------------------------------------------------------------------
        if source.partitions and not source.hive_compatible_partition_folders:
            enqueue(python_fix_partition_folder_names, first_table_name.lower(), source.partitions,
                    s3_step_path, action_id)

        if target_is_dynamodb:
            # --------------------------------------------------------------------------------------------------------
            # special case so this functionality can be shared with the dynamodb_engine
            # --------------------------------------------------------------------------------------------------------
            dyn_dataset = Dataset.from_dict(dataset.to_dict())
            assert isinstance(dyn_dataset, Dataset)
            dyn_dataset.data.data_format = DataFormat('DYNAMODB_TABLE', RowFormat.NONE, 0)
            dyn_dataset.data.compression = Compression.NONE
            dyn_dataset.data.columns = [
                Column(col.name, dynamodb_column_type(col)) for col in source.columns
            ]
            set_hive_vars = (
                'SET dynamodb.retry.duration = 0;\nSET dynamodb.throughput.write.percent = %s;'
                % args_by_name['write_capacity_utilization_percent']
            )

            enqueue(hive_table_definition_step, stage_table_name, dataset, s3_step_path,
                    local_step_path, action_id, False)
            enqueue(hive_table_definition_step, target_table_name, dyn_dataset, s3_step_path,
                    local_step_path, action_id, True)
            enqueue(hive_msck_repair_table_step, stage_table_name, s3_step_path, action_id)
            enqueue(hive_copy_to_table, dataset, stage_table_name, dyn_dataset, target_table_name,
                    s3_step_path, local_step_path, action_id, set_hive_vars)

        elif staging_not_needed:
            # --------------------------------------------------------------------------------------------------------
            # without stage tables most of the complexity disappears
            # --------------------------------------------------------------------------------------------------------
            enqueue(hive_table_definition_step, target_table_name, dataset, s3_step_path,
                    local_step_path, action_id, False)
            enqueue(hive_msck_repair_table_step, target_table_name, s3_step_path, action_id)

        else:
            # --------------------------------------------------------------------------------------------------------
            # at least one staging table is required
            # --------------------------------------------------------------------------------------------------------
            target_dataset = Dataset.from_dict(dataset.to_dict())
            target_dataset.data.data_format = DataFormat(
                target_file_format, target_row_format, 0, target_delimited_by,
                target_quoted_by, target_escaped_by, target_null_string)
            target_dataset.data.compression = target_compression
            drop_table_names.append(stage_table_name)

            # --------------------------------------------------------------------------------------------------------
            # JSON/REGEX based datasets are staged with string columns (safe) and cast during the insert
            # --------------------------------------------------------------------------------------------------------
            if source_format.row_format in (RowFormat.JSON, RowFormat.REGEX):
                # work on a copy, because the column types are overwritten
                stage_dataset = Dataset.from_dict(dataset.to_dict())
                assert isinstance(stage_dataset, Dataset)
                for column in stage_dataset.data.columns:
                    column.data_type = DataType.STRING
            else:
                stage_dataset = dataset

            enqueue(hive_table_definition_step, stage_table_name, stage_dataset, s3_step_path,
                    local_step_path, action_id, False)
            enqueue(hive_table_definition_step, target_table_name, target_dataset, s3_step_path,
                    local_step_path, action_id, False)
            enqueue(hive_msck_repair_table_step, stage_table_name, s3_step_path, action_id)

            # --------------------------------------------------------------------------------------------------------
            # hive copies straight into the target unless the target is parquet (hive has issues creating parquet
            # files); a parquet target fed from an RCFILE source (impala friendly) also needs no extra staging
            # --------------------------------------------------------------------------------------------------------
            if target_file_format != FileFormat.PARQUET or source_format.file_format == FileFormat.RCFILE:
                enqueue(hive_copy_to_table, stage_dataset, stage_table_name, target_dataset,
                        target_table_name, s3_step_path, local_step_path, action_id, None)

            # --------------------------------------------------------------------------------------------------------
            # impala is better at creating parquet files but cannot read all hive formats, so an intermediate
            # RCFILE staging table is introduced
            # --------------------------------------------------------------------------------------------------------
            else:
                rc_table_name = target_table_name + '_rcfile_stage_for_action_' + action_id
                rc_dataset = Dataset.from_dict(target_dataset.to_dict())
                rc_dataset.data.data_format = DataFormat(FileFormat.RCFILE, RowFormat.NONE, 0)
                rc_dataset.data.compression = Compression.NONE
                drop_table_names.append(rc_table_name)

                enqueue(hive_table_definition_step, rc_table_name, rc_dataset, s3_step_path,
                        local_step_path, action_id, False)
                enqueue(hive_copy_to_table, stage_dataset, stage_table_name, rc_dataset, rc_table_name,
                        s3_step_path, local_step_path, action_id, None)
                enqueue(impala_copy_to_table, rc_dataset, rc_table_name, target_dataset,
                        target_table_name, s3_step_path, local_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # from here on the load counts as complete even if a later step fails, so remember where the remaining
        # steps start and flag them in their step wrappers
        # ------------------------------------------------------------------------------------------------------------
        first_post_load_step = len(queued) + 1

        # ------------------------------------------------------------------------------------------------------------
        # drop whatever staging tables were created
        # ------------------------------------------------------------------------------------------------------------
        if drop_table_names:
            drop_script = '\n'.join('DROP TABLE %s;' % table for table in drop_table_names)
            enqueue(hive_run_script_contents_step, drop_script, s3_step_path, local_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # let impala know about the changes
        # ------------------------------------------------------------------------------------------------------------
        if not target_is_dynamodb:
            enqueue(impala_invalidate_metadata_step, s3_step_path, action_id)

        # the total is known now, so each queued partial can be completed
        total_steps = len(queued)
        steps = []
        for step_number, build_step in enumerate(queued, 1):
            wrapper = build_step(total_steps)
            assert isinstance(wrapper, StepWrapper)
            if step_number >= first_post_load_step:
                wrapper.action_considered_successful = True
            steps.append(wrapper)

        if not dry_run:
            s3_copy_recursive(local_step_path, s3_step_path)

        return steps

    finally:
        shutil.rmtree(tempdir)
示例#18
0
 # Register the 'beacon_native_app_v02' dataset via dart.save_dataset and keep
 # the returned value in `dataset`.  The definition below declares data stored
 # under the given S3 location, SNAPPY compression, LoadType.INSERT, and a
 # single partition column ('createdpartition').
 dataset = dart.save_dataset(Dataset(data=(DatasetData(
     name='beacon_native_app_v02',
     table_name='beacon_native_app',
     location='s3://example-bucket/prd/beacon/native_app/v2/parquet/snappy',
     hive_compatible_partition_folders=True,
     load_type=LoadType.INSERT,
     # NOTE(review): DataFormat receives the bare string 'parquet' positionally
     # here; presumably this is the file format -- confirm it is equivalent to
     # the corresponding FileFormat constant.
     data_format=DataFormat('parquet'),
     # The list order is the declared column order of the table; presumably it
     # has to match the layout of the stored files -- do not reorder without
     # checking.
     columns=[
         Column('logFileId', DataType.BIGINT),
         Column('lineNumber', DataType.INT),
         Column('created', DataType.BIGINT),
         Column('remoteip', DataType.STRING),
         Column('useragent', DataType.STRING),
         Column('eventType', DataType.STRING),
         Column('appVersion', DataType.STRING),
         Column('advertiserID', DataType.STRING),
         Column('couponsOnPage', DataType.INT),
         Column('coupons', DataType.STRING),
         Column('channel', DataType.STRING),
         Column('geoCouponCount', DataType.STRING),
         Column('geofence', DataType.STRING),
         Column('geofenceTimeSpent', DataType.STRING),
         Column('loginStatus', DataType.STRING),
         Column('products', DataType.STRING),
         Column('session', DataType.STRING),
         Column('systemName', DataType.STRING),
         Column('systemVersion', DataType.STRING),
         Column('udid', DataType.STRING),
         Column('userQualifier', DataType.STRING),
         Column('url', DataType.STRING),
         Column('user_uuid', DataType.STRING),
         Column('userId', DataType.STRING),
         Column('searchType', DataType.STRING),
         Column('searchListTerm', DataType.STRING),
         Column('searchTerm', DataType.STRING),
         Column('emailUUId', DataType.STRING),
         Column('userFingerprint', DataType.STRING),
         Column('locationStatus', DataType.STRING),
         Column('pushNotificationStatus', DataType.BOOLEAN),
         Column('placement', DataType.STRING),
         Column('loc', DataType.STRING),
         Column('ppoi0', DataType.STRING),
         Column('ppoi1', DataType.STRING),
         Column('ppoi2', DataType.STRING),
         Column('ppoi3', DataType.STRING),
         Column('ppoi4', DataType.STRING),
         Column('appLaunchNotificationType', DataType.STRING),
         Column('scenarioName', DataType.STRING),
         Column('behaviorName', DataType.STRING),
         Column('couponType', DataType.STRING),
         Column('couponPosition', DataType.STRING),
         Column('hasQSRContent', DataType.BOOLEAN),
         Column('promptName', DataType.STRING),
         # NOTE(review): 'Chanage' looks like a misspelling of 'Change', but
         # the name presumably has to match the field as it exists in the
         # data -- do not rename without checking.
         Column('locationPermissionChanage', DataType.STRING),
         Column('couponProblemType', DataType.STRING),
         Column('storeTitle', DataType.STRING),
         Column('mallName', DataType.STRING),
         Column('restaurantName', DataType.STRING),
         # NOTE(review): the only column in this definition typed with a raw
         # string ('float') instead of a DataType constant -- confirm intended.
         Column('milesAway', 'float'),
         Column('menuItem', DataType.STRING),
         Column('toolName', DataType.STRING),
         Column('toolAction', DataType.STRING),
         Column('toolStep', DataType.STRING),
         Column('mallPosition', DataType.INT),
         Column('recommendStoreName', DataType.STRING),
         Column('recommendStorePosition', DataType.INT),
         Column('favoriteStoreName', DataType.STRING),
         Column('favoriteStoreAction', DataType.STRING),
         Column('favoriteStorePosition', DataType.INT),
         Column('favoriteSiteId', DataType.STRING),
         Column('receiverName', DataType.STRING),
         Column('outclickButtonPrompt', DataType.STRING),
         Column('dataSource', DataType.STRING),
         Column('searchResultCount', DataType.INT),
         Column('searchResultPosition', DataType.INT),
         Column('shareType', DataType.STRING),
         Column('daysUntilExpiration', DataType.INT),
         Column('fireDate', DataType.BIGINT),
         Column('settingsChangeValue', DataType.STRING),
         Column('settingsChangeType', DataType.STRING),
         Column('settingsChangeLocation', DataType.STRING),
         Column('clickAction', DataType.STRING),
         Column('tnt', DataType.STRING),
         Column('previousPage', DataType.STRING),
         Column('clickPage', DataType.STRING),
         Column('launchReason', DataType.STRING),
         Column('taplyticsData', DataType.STRING),
         Column('appCampaign', DataType.STRING),
         Column('accountMethod', DataType.STRING),
         Column('appState', DataType.STRING),
         Column('btStatus', DataType.BOOLEAN),
         Column('btBeaconId', DataType.STRING),
         Column('btBeaconFactoryId', DataType.STRING),
         Column('btBeaconName', DataType.STRING),
         Column('btTimeSpent', DataType.STRING),
         Column('purchaseId', DataType.STRING),
         Column('transactionId', DataType.STRING),
         Column('outclickLink', DataType.STRING),
         Column('outclickPage', DataType.STRING),
         Column('featuredCouponPosition', DataType.INT),
         Column('commentCount', DataType.INT),
         Column('mallCount', DataType.INT),
         Column('clickCount', DataType.INT),
         Column('merchantName', DataType.STRING),
         Column('merchantPosition', DataType.INT),
     ],
     compression=Compression.SNAPPY,
     partitions=[Column('createdpartition', DataType.STRING)],
 ))))
示例#19
0
 # Dataset definition for 'owen_eu_DW-3411_v1' (table 'owen_eu'): gzip
 # compressed TEXTFILE data in JSON row format, LoadType.MERGE, partitioned by
 # year / month / day.  Every column is declared with a `path` argument;
 # presumably that is the location of the value inside each JSON record --
 # TODO confirm against the Column model.
 #
 # NOTE(review): as written this expression is built but not assigned or
 # passed anywhere.  The snippet used to close one more parenthesis than it
 # opens (a SyntaxError), presumably a left-over from an enclosing call that
 # is not part of this snippet; the unmatched parenthesis has been dropped.
 Dataset(data=(DatasetData(
     name='owen_eu_DW-3411_v1',
     table_name='owen_eu',
     location='s3://example-bucket/prd/inbound/overlord/eu-all-events',
     load_type=LoadType.MERGE,
     data_format=DataFormat(
         file_format=FileFormat.TEXTFILE,
         row_format=RowFormat.JSON,
     ),
     compression=Compression.GZIP,
     partitions=[
         Column('year', DataType.STRING),
         Column('month', DataType.STRING),
         Column('day', DataType.STRING),
     ],
     columns=[
         # Request metadata (metadata.*).
         Column('host', DataType.STRING, path='metadata.host'),
         Column('referer', DataType.STRING, path='metadata.referer'),
         # Event envelope (owen.event.*).
         Column(
             'eventName', DataType.STRING, path='owen.event.eventName'),
         Column('eventVersion',
                DataType.STRING,
                path='owen.event.eventVersion'),
         Column('eventPlatform',
                DataType.STRING,
                path='owen.event.eventPlatform'),
         Column('eventInstanceUuid',
                DataType.STRING,
                path='owen.event.eventInstanceUuid'),
         Column('eventCategory',
                DataType.STRING,
                path='owen.event.eventCategory'),
         Column('eventTimestamp',
                DataType.TIMESTAMP,
                path='owen.event.eventTimestamp',
                date_pattern="yyyy-MM-dd'T'HH:mm:ss'Z'"),
         Column('eventTarget',
                DataType.STRING,
                path='owen.event.eventTarget'),
         Column('eventAction',
                DataType.STRING,
                path='owen.event.eventAction'),
         Column('eventPlatformVersion',
                DataType.STRING,
                path='owen.event.eventPlatformVersion'),
         # Context attributes (owen.context.*).
         Column('osName', DataType.STRING, path='owen.context.osName'),
         Column('loggedInFlag',
                DataType.BOOLEAN,
                path='owen.context.loggedInFlag'),
         Column('custom', DataType.STRING, path='owen.context.custom'),
         Column('browserVersion',
                DataType.STRING,
                path='owen.context.browserVersion'),
         Column(
             'referrer', DataType.STRING, path='owen.context.referrer'),
         Column('previousPageName',
                DataType.STRING,
                path='owen.context.previousPageName'),
         Column('screenHeight',
                DataType.STRING,
                path='owen.context.screenHeight'),
         Column('breadCrumb',
                DataType.STRING,
                path='owen.context.breadCrumb'),
         Column(
             'pageName', DataType.STRING, path='owen.context.pageName'),
         Column('country', DataType.STRING,
                path='owen.context.country'),
         Column('propertyName',
                DataType.STRING,
                path='owen.context.propertyName'),
         Column('launchCount',
                DataType.STRING,
                path='owen.context.launchCount'),
         Column('viewInstanceUuid',
                DataType.STRING,
                path='owen.context.viewInstanceUuid'),
         Column('osVersion',
                DataType.STRING,
                path='owen.context.osVersion'),
         Column('connectionType',
                DataType.STRING,
                path='owen.context.connectionType'),
         Column('partialSearchTerm',
                DataType.STRING,
                path='owen.context.partialSearchTerm'),
         Column('carrier', DataType.STRING,
                path='owen.context.carrier'),
         Column('longitude',
                DataType.STRING,
                path='owen.context.longitude'),
         # Fields read from the first inventory entry
         # (owen.context.inventory[0].*), exposed with a '_0' suffix.
         Column(
             'productSectionPosition_0',
             DataType.STRING,
             path='owen.context.inventory[0].productSectionPosition'),
         Column('savedFlag_0',
                DataType.BOOLEAN,
                path='owen.context.inventory[0].savedFlag'),
         Column('position_0',
                DataType.STRING,
                path='owen.context.inventory[0].position'),
         Column('brand_0',
                DataType.STRING,
                path='owen.context.inventory[0].brand'),
         Column('affiliateNetwork_0',
                DataType.STRING,
                path='owen.context.inventory[0].affiliateNetwork'),
         Column('deepLinkUrl_0',
                DataType.STRING,
                path='owen.context.inventory[0].deepLinkUrl'),
         Column('conquestingFlag_0',
                DataType.BOOLEAN,
                path='owen.context.inventory[0].conquestingFlag'),
         Column('originalPrice_0',
                DataType.STRING,
                path='owen.context.inventory[0].originalPrice'),
         Column('adUnitUuid_0',
                DataType.STRING,
                path='owen.context.inventory[0].adUnitUuid'),
         Column('startDate_0',
                DataType.TIMESTAMP,
                path='owen.context.inventory[0].startDate',
                date_pattern="yyyy-MM-dd'T'HH:mm:ss'Z'"),
         Column('proximityUnit_0',
                DataType.STRING,
                path='owen.context.inventory[0].proximityUnit'),
         Column('commentsCount_0',
                DataType.STRING,
                path='owen.context.inventory[0].commentsCount'),
         Column('outRedirectUrl_0',
                DataType.STRING,
                path='owen.context.inventory[0].outRedirectUrl'),
         Column('productCardPosition_0',
                DataType.STRING,
                path='owen.context.inventory[0].productCardPosition'),
         Column('productSectionUuid_0',
                DataType.STRING,
                path='owen.context.inventory[0].productSectionUuid'),
         Column('lastVerifiedDate_0',
                DataType.TIMESTAMP,
                path='owen.context.inventory[0].lastVerifiedDate',
                date_pattern="yyyy-MM-dd'T'HH:mm:ss'Z'"),
         Column('productCardUuid_0',
                DataType.STRING,
                path='owen.context.inventory[0].productCardUuid'),
         Column('redemptionChannel_0',
                DataType.STRING,
                path='owen.context.inventory[0].redemptionChannel'),
         Column('noVotes_0',
                DataType.STRING,
                path='owen.context.inventory[0].noVotes'),
         Column('retailCategory_0',
                DataType.STRING,
                path='owen.context.inventory[0].retailCategory'),
         Column('couponRank_0',
                DataType.STRING,
                path='owen.context.inventory[0].couponRank'),
         Column('inventoryChannel_0',
                DataType.STRING,
                path='owen.context.inventory[0].inventoryChannel'),
         Column('yesVotes_0',
                DataType.STRING,
                path='owen.context.inventory[0].yesVotes'),
         Column('inventorySource_0',
                DataType.STRING,
                path='owen.context.inventory[0].inventorySource'),
         Column('inventoryName_0',
                DataType.STRING,
                path='owen.context.inventory[0].inventoryName'),
         Column('monetizableFlag_0',
                DataType.BOOLEAN,
                path='owen.context.inventory[0].monetizableFlag'),
         Column('recommendedFlag_0',
                DataType.BOOLEAN,
                path='owen.context.inventory[0].recommendedFlag'),
         Column('expirationDate_0',
                DataType.TIMESTAMP,
                path='owen.context.inventory[0].expirationDate',
                date_pattern="yyyy-MM-dd'T'HH:mm:ss'Z'"),
         Column('clickLocation_0',
                DataType.STRING,
                path='owen.context.inventory[0].clickLocation'),
         Column('finalPrice_0',
                DataType.STRING,
                path='owen.context.inventory[0].finalPrice'),
         Column('usedByCount_0',
                DataType.STRING,
                path='owen.context.inventory[0].usedByCount'),
         Column('proximity_0',
                DataType.STRING,
                path='owen.context.inventory[0].proximity'),
         Column('inventoryUuid_0',
                DataType.STRING,
                path='owen.context.inventory[0].inventoryUuid'),
         Column('siteUuid_0',
                DataType.STRING,
                path='owen.context.inventory[0].siteUuid'),
         Column('outclickUuid_0',
                DataType.STRING,
                path='owen.context.inventory[0].outclickUuid'),
         Column('adUnitType_0',
                DataType.STRING,
                path='owen.context.inventory[0].adUnitType'),
         Column('exclusivityFlag_0',
                DataType.BOOLEAN,
                path='owen.context.inventory[0].exclusivityFlag'),
         Column('inventoryType_0',
                DataType.STRING,
                path='owen.context.inventory[0].inventoryType'),
         Column('successPercentage_0',
                DataType.STRING,
                path='owen.context.inventory[0].successPercentage'),
         Column('claimUuid_0',
                DataType.STRING,
                path='owen.context.inventory[0].claimUuid'),
         Column('region', DataType.STRING, path='owen.context.region'),
         Column('session', DataType.STRING,
                path='owen.context.session'),
         # Marketing attributes (owen.context.marketing.*).
         Column('content',
                DataType.STRING,
                path='owen.context.marketing.content'),
         Column('marketingVendor',
                DataType.STRING,
                path='owen.context.marketing.vendor'),
         Column('campaign',
                DataType.STRING,
                path='owen.context.marketing.campaign'),
         Column('adGroup',
                DataType.STRING,
                path='owen.context.marketing.adGroup'),
         Column('campaignUuid',
                DataType.STRING,
                path='owen.context.marketing.campaignUuid'),
         Column('campaignSendCount',
                DataType.STRING,
                path='owen.context.marketing.campaignSendCount'),
         Column('source',
                DataType.STRING,
                path='owen.context.marketing.source'),
         Column('term',
                DataType.STRING,
                path='owen.context.marketing.term'),
         Column('channel',
                DataType.STRING,
                path='owen.context.marketing.channel'),
         Column('medium',
                DataType.STRING,
                path='owen.context.marketing.medium'),
         Column('cdRank',
                DataType.STRING,
                path='owen.context.marketing.cdRank'),
         Column('notificationUuid',
                DataType.STRING,
                path='owen.context.marketing.notificationUuid'),
         Column('inventoryCount',
                DataType.STRING,
                path='owen.context.inventoryCount'),
         Column('favoriteFlag',
                DataType.BOOLEAN,
                path='owen.context.favoriteFlag'),
         Column(
             'pageType', DataType.STRING, path='owen.context.pageType'),
         Column('bluetoothBeaconType',
                DataType.STRING,
                path='owen.context.bluetoothBeaconType'),
         # Fields read from the first experiment entry
         # (owen.context.experiment[0].*).
         Column('variation_0',
                DataType.STRING,
                path='owen.context.experiment[0].variation'),
         Column('campaign_0',
                DataType.STRING,
                path='owen.context.experiment[0].campaign'),
         Column('locationEnabledFlag',
                DataType.BOOLEAN,
                path='owen.context.locationEnabledFlag'),
         Column('macAddress',
                DataType.STRING,
                path='owen.context.macAddress'),
         Column('browserFamily',
                DataType.STRING,
                path='owen.context.browserFamily'),
         Column('geofenceUuid',
                DataType.STRING,
                path='owen.context.geofenceUuid'),
         Column('mobileDeviceMake',
                DataType.STRING,
                path='owen.context.mobileDeviceMake'),
         # Fields read from the first vendor entry
         # (owen.context.vendor[0].*).
         Column('vendor_0',
                DataType.STRING,
                path='owen.context.vendor[0].vendor'),
         Column('vendorClickUuid_0',
                DataType.STRING,
                path='owen.context.vendor[0].vendorClickUuid'),
         Column('udid', DataType.STRING, path='owen.context.udid'),
         Column(
             'latitude', DataType.STRING, path='owen.context.latitude'),
         Column('bluetoothEnabledFlag',
                DataType.BOOLEAN,
                path='owen.context.bluetoothEnabledFlag'),
         Column('environment',
                DataType.STRING,
                path='owen.context.environment'),
         Column('city', DataType.STRING, path='owen.context.city'),
         Column(
             'userUuid', DataType.STRING, path='owen.context.userUuid'),
         Column('dma', DataType.STRING, path='owen.context.dma'),
         Column('testUuid',
                DataType.STRING,
                path='owen.context.test.testUuid'),
         Column('userAgent',
                DataType.STRING,
                path='owen.context.userAgent'),
         Column('previousViewInstanceUuid',
                DataType.STRING,
                path='owen.context.previousViewInstanceUuid'),
         Column(
             'language', DataType.STRING, path='owen.context.language'),
         Column('deviceCategory',
                DataType.STRING,
                path='owen.context.deviceCategory'),
         Column('bluetoothBeaconId',
                DataType.STRING,
                path='owen.context.bluetoothBeaconId'),
         Column('screenWidth',
                DataType.STRING,
                path='owen.context.screenWidth'),
         Column('personalizationFlag',
                DataType.BOOLEAN,
                path='owen.context.personalizationFlag'),
         Column('appForegroundFlag',
                DataType.BOOLEAN,
                path='owen.context.appForegroundFlag'),
         Column('mobileDeviceModel',
                DataType.STRING,
                path='owen.context.mobileDeviceModel'),
         Column('userQualifier',
                DataType.STRING,
                path='owen.context.userQualifier'),
         Column('deviceFingerprint',
                DataType.STRING,
                path='owen.context.deviceFingerprint'),
         Column('ipAddress',
                DataType.STRING,
                path='owen.context.ipAddress'),
         Column(
             'osFamily', DataType.STRING, path='owen.context.osFamily'),
         Column('advertiserUuid',
                DataType.STRING,
                path='owen.context.advertiserUuid'),
         Column('notificationEnabledFlag',
                DataType.BOOLEAN,
                path='owen.context.notificationEnabledFlag'),
         # The inventory / vendor / experiment values as a whole, each
         # declared as a single STRING column.
         Column('inventory',
                DataType.STRING,
                path='owen.context.inventory'),
         Column('vendor', DataType.STRING, path='owen.context.vendor'),
         Column('experiment',
                DataType.STRING,
                path='owen.context.experiment'),
     ],
 )))
示例#20
0
    def test_hive_table_definition_step(self):
        """Compare the HQL produced for a regex-parsed weblogs dataset.

        Builds a TEXTFILE/REGEX dataset definition, copies the
        ``copy_to_table.hql`` template into ``/tmp/dart-emr-test/hive/``,
        calls ``hive_copy_to_table`` and then asserts that
        ``/tmp/dart-emr-test/hive/copy_to_table_weblogs.hql`` (presumably
        written by that call) has exactly the contents of the
        ``copy_to_table_weblogs.hql`` file located next to this test module.
        """
        # NOTE(review): the last placeholder of regex_output is '%21s', not
        # '%21$s' like the twenty before it.  It is kept verbatim because the
        # generated file is compared byte-for-byte below -- confirm whether
        # the difference is intentional.
        weblogs_format = DataFormat(
            file_format=FileFormat.TEXTFILE,
            row_format=RowFormat.REGEX,
            regex_input=
            "(?<ip>^(?:(?:unknown(?:,\\s)?|(?:\\d+\\.\\d+\\.\\d+\\.\\d+(?:,\\s)?))+)|\\S*)\\s+\\S+\\s+(?<userIdentifier>(?:[^\\[]+|\\$\\S+\\['\\S+'\\]|\\[username\\]))\\s*\\s+\\[(?<requestDate>[^\\]]+)\\]\\s+\"(?<httpMethod>(?:GET|HEAD|POST|PUT|DELETE|TRACE))\\s(?<urlPath>(?:[^ ?]+))(?:\\?(?<queryString>(?:[^ ]+)))?\\sHTTP/(?<httpVersion>(?:[\\d\\.]+))\"\\s+(?<statusCode>[0-9]+)\\s+(?<bytesSent>\\S+)\\s+\"(?<referrer>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userAgent>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+(?<responseTime>[-0-9]*)\\s+\"(?<hostName>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userFingerprint>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<sessionId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<requestId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<visitorId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<vegSlice>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<fruitSlice>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<cacheHitMiss>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s*\\Z",
            regex_output=
            "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s %11$s %12$s %13$s %14$s %15$s %16$s %17$s %18$s %19$s %20$s %21s",
        )

        # One column per capture group of regex_input, in the same order.
        weblogs_columns = [
            Column('ip', 'STRING'),
            Column('user', 'STRING'),
            Column(
                'requestDate',
                'TIMESTAMP',
                date_pattern='dd/MMM/yyyy:HH:mm:ss Z',
            ),
            Column('httpMethod', 'STRING'),
            Column('urlPath', 'STRING'),
            Column('queryString', 'STRING'),
            Column('httpVersion', 'STRING'),
            Column('statusCode', 'STRING'),
            Column('bytesSent', 'INT'),
            Column('referrer', 'STRING'),
            Column('userAgent', 'STRING'),
            Column('responseTime', 'BIGINT'),
            Column('hostname', 'STRING'),
            Column('userFingerprint', 'STRING'),
            Column('userId', 'STRING'),
            Column('sessionId', 'STRING'),
            Column('requestId', 'STRING'),
            Column('visitorId', 'STRING'),
            Column('vegSlice', 'STRING'),
            Column('fruitSlice', 'STRING'),
            Column('cacheHitMiss', 'STRING'),
        ]
        weblogs_partitions = [
            Column('year', 'STRING'),
            Column('week', 'STRING'),
            Column('day', 'STRING'),
        ]

        weblogs_dataset = Dataset(data=DatasetData(
            name='weblogs_v01',
            table_name='weblogs',
            location='s3://wsm-log-servers/weblogs/www.retailmenot.com/ec2/',
            data_format=weblogs_format,
            columns=weblogs_columns,
            compression='GZIP',
            partitions=weblogs_partitions,
        ))

        # Stage the HQL template in the scratch directory used by the step.
        call('mkdir -p /tmp/dart-emr-test/hive/')
        test_dir = os.path.dirname(os.path.abspath(__file__))
        template_path = (
            test_dir + '/../../../engine/emr/steps/hive/copy_to_table.hql')
        shutil.copyfile(template_path,
                        '/tmp/dart-emr-test/hive/copy_to_table.hql')

        # The same dataset is passed as both the stage and the target.
        hive_copy_to_table(weblogs_dataset, 'weblogs_stage', weblogs_dataset,
                           'weblogs', 's3://test', '/tmp/dart-emr-test/',
                           'actionid123', None, 1, 1)

        expected_path = os.path.join(test_dir, 'copy_to_table_weblogs.hql')
        with open(expected_path) as expected_file:
            expected_hql = expected_file.read()

        generated_path = '/tmp/dart-emr-test/hive/copy_to_table_weblogs.hql'
        with open(generated_path) as generated_file:
            generated_hql = generated_file.read()

        self.assertEqual(expected_hql, generated_hql)
示例#21
0
 Dataset(data=(DatasetData(
     name='owen_outclick_us_v02',
     description=
     'Owen outclick data, based on overlord schema version. Considered a replacement for outclick events.',
     table_name='outclick',
     location=
     's3://example-bucket/prd/inbound/overlord/raw-firehose-02/rmn-outclicks',
     load_type=LoadType.MERGE,
     data_format=DataFormat(
         file_format=FileFormat.TEXTFILE,
         row_format=RowFormat.JSON,
     ),
     compression=Compression.GZIP,
     partitions=[
         Column('year', DataType.STRING),
         Column('month', DataType.STRING),
         Column('day', DataType.STRING),
     ],
     primary_keys=['eventInstanceUuid'],
     merge_keys=['eventInstanceUuid'],
     sort_keys=[
         'eventTimestamp', 'eventInstanceUuid', 'derivedEventInstanceId'
     ],
     distribution_keys=['eventInstanceUuid'],
     batch_merge_sort_keys=['owenProcessed DESC'],
     columns=[
         Column('advertiserUuid',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.advertiserUuid'),
         Column('appBadgeCount',
                DataType.INT,
                path='owen.context.appBadgeCount'),
         Column('appForegroundFlag',
                DataType.BOOLEAN,
                path='owen.context.appForegroundFlag'),
         Column('bluetoothBeaconId',
                DataType.VARCHAR,
                length=50,
                path='owen.context.bluetoothBeaconId'),
         Column('bluetoothBeaconType',
                DataType.VARCHAR,
                length=25,
                path='owen.context.bluetoothBeaconType'),
         Column('bluetoothEnabledFlag',
                DataType.BOOLEAN,
                path='owen.context.bluetoothEnabledFlag'),
         Column('breadCrumb',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.breadCrumb'),
         Column('browserFamily',
                DataType.VARCHAR,
                length=50,
                path='owen.context.browserFamily'),
         Column('browserVersion',
                DataType.VARCHAR,
                length=50,
                path='owen.context.browserVersion'),
         Column('carrier',
                DataType.VARCHAR,
                length=25,
                path='owen.context.carrier'),
         Column('city',
                DataType.VARCHAR,
                length=75,
                path='owen.context.city'),
         Column('connectionType',
                DataType.VARCHAR,
                length=25,
                path='owen.context.connectionType'),
         Column('country',
                DataType.VARCHAR,
                length=2,
                path='owen.context.country'),
         Column('custom', DataType.VARCHAR, path='owen.context.custom'),
         Column('deviceCategory',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.deviceCategory'),
         Column('deviceFingerprint',
                DataType.VARCHAR,
                length=26,
                path='owen.context.deviceFingerprint'),
         Column('dma', DataType.INT, path='owen.context.dma'),
         Column('environment',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.environment'),
         Column('experimentObject',
                DataType.VARCHAR,
                length=1024,
                path='owen.context.experiment'),
         Column('failureFlag',
                DataType.BOOLEAN,
                path='owen.context.failureFlag'),
         Column('failureReason',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.failureReason'),
         Column('favoriteFlag',
                DataType.BOOLEAN,
                path='owen.context.favoriteFlag'),
         Column('featureFlags',
                DataType.VARCHAR,
                path='owen.context.featureFlags'),
         Column('geofenceUuid',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.geofenceUuid'),
         Column('inventoryCount',
                DataType.INT,
                path='owen.context.inventoryCount'),
         Column('inventory_affiliateNetwork',
                DataType.VARCHAR,
                length=50,
                path='owen.context.inventory[0].affiliateNetwork'),
         Column('inventory_brand',
                DataType.VARCHAR,
                length=100,
                path='owen.context.inventory[0].brand'),
         Column('inventory_claimUuid',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.inventory[0].claimUuid'),
         Column('inventory_clickLocation',
                DataType.VARCHAR,
                length=100,
                path='owen.context.inventory[0].clickLocation'),
         Column('inventory_commentsCount',
                DataType.INT,
                path='owen.context.inventory[0].commentsCount'),
         Column('inventory_conquestingFlag',
                DataType.BOOLEAN,
                path='owen.context.inventory[0].conquestingFlag'),
         Column('inventory_couponRank',
                DataType.NUMERIC,
                precision=18,
                scale=4,
                path='owen.context.inventory[0].couponRank'),
         Column('inventory_deepLinkUrl',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.inventory[0].deepLinkUrl'),
         Column('inventory_deepLinkUrlScheme',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.inventory[0].deepLinkUrlScheme'),
         Column('inventory_exclusivityFlag',
                DataType.BOOLEAN,
                path='owen.context.inventory[0].exclusivityFlag'),
         Column('inventory_expirationDate',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.inventory[0].expirationDate'),
         Column('inventory_finalPrice',
                DataType.NUMERIC,
                precision=18,
                scale=4,
                path='owen.context.inventory[0].finalPrice'),
         Column('inventory_instoreType',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.inventory[0].instoreType'),
         Column('inventory_inventoryChannel',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.inventory[0].inventoryChannel'),
         Column('inventory_inventoryName',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.inventory[0].inventoryName'),
         Column('inventory_inventorySource',
                DataType.VARCHAR,
                length=50,
                path='owen.context.inventory[0].inventorySource'),
         Column('inventory_inventoryType',
                DataType.VARCHAR,
                length=25,
                path='owen.context.inventory[0].inventoryType'),
         Column('inventory_inventoryUuid',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.inventory[0].inventoryUuid'),
         Column('inventory_lastVerifiedDate',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.inventory[0].lastVerifiedDate'),
         Column('inventory_monetizableFlag',
                DataType.BOOLEAN,
                path='owen.context.inventory[0].monetizableFlag'),
         Column('inventory_noVotes',
                DataType.INT,
                path='owen.context.inventory[0].noVotes'),
         Column('inventory_onlineType',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.inventory[0].onlineType'),
         Column('inventory_originalPrice',
                DataType.NUMERIC,
                precision=18,
                scale=4,
                path='owen.context.inventory[0].originalPrice'),
         Column('inventory_outRedirectUrl',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.inventory[0].outRedirectUrl'),
         Column('inventory_outclickUuid',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.inventory[0].outclickUuid'),
         Column('inventory_parentInventoryUuid',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.inventory[0].parentInventoryUuid'),
         Column('inventory_personalizationFlag',
                DataType.BOOLEAN,
                path='owen.context.inventory[0].personalizationFlag'),
         Column('inventory_position',
                DataType.INT,
                path='owen.context.inventory[0].position'),
         Column('inventory_proximity',
                DataType.NUMERIC,
                precision=18,
                scale=4,
                path='owen.context.inventory[0].proximity'),
         Column('inventory_proximityUnit',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.inventory[0].proximityUnit'),
         Column('inventory_recommendedFlag',
                DataType.BOOLEAN,
                path='owen.context.inventory[0].recommendedFlag'),
         Column('inventory_redemptionChannel',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.inventory[0].redemptionChannel'),
         Column('inventory_retailCategory',
                DataType.VARCHAR,
                length=75,
                path='owen.context.inventory[0].retailCategory'),
         Column('inventory_savedFlag',
                DataType.BOOLEAN,
                path='owen.context.inventory[0].savedFlag'),
         Column('inventory_siteUuid',
                DataType.VARCHAR,
                length=26,
                path='owen.context.inventory[0].siteUuid'),
         Column('inventory_startDate',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.inventory[0].startDate'),
         Column('inventory_successPercentage',
                DataType.NUMERIC,
                precision=18,
                scale=4,
                path='owen.context.inventory[0].successPercentage'),
         Column('inventory_usedByCount',
                DataType.INT,
                path='owen.context.inventory[0].usedByCount'),
         Column('inventory_yesVotes',
                DataType.INT,
                path='owen.context.inventory[0].yesVotes'),
         Column('ipAddress',
                DataType.VARCHAR,
                length=45,
                path='owen.context.ipAddress'),
         Column('language',
                DataType.VARCHAR,
                length=6,
                path='owen.context.language'),
         Column('latitude',
                DataType.NUMERIC,
                precision=18,
                scale=4,
                path='owen.context.latitude'),
         Column('locationEnabledFlag',
                DataType.BOOLEAN,
                path='owen.context.locationEnabledFlag'),
         Column('loggedInFlag',
                DataType.BOOLEAN,
                path='owen.context.loggedInFlag'),
         Column('longitude',
                DataType.NUMERIC,
                precision=18,
                scale=4,
                path='owen.context.longitude'),
         Column('macAddress',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.macAddress'),
         Column('marketing_adGroup',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.marketing.adGroup'),
         Column('marketing_campaign',
                DataType.VARCHAR,
                length=50,
                path='owen.context.marketing.campaign'),
         Column('marketing_campaignSendCount',
                DataType.INT,
                path='owen.context.marketing.campaignSendCount'),
         Column('marketing_campaignUuid',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.marketing.campaignUuid'),
         Column('marketing_cdRank',
                DataType.INT,
                path='owen.context.marketing.cdRank'),
         Column('marketing_channel',
                DataType.VARCHAR,
                length=50,
                path='owen.context.marketing.channel'),
         Column('marketing_content',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.marketing.content'),
         Column('marketing_medium',
                DataType.VARCHAR,
                length=50,
                path='owen.context.marketing.medium'),
         Column('marketing_notificationUuid',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.marketing.notificationUuid'),
         Column('marketing_source',
                DataType.VARCHAR,
                length=100,
                path='owen.context.marketing.source'),
         Column('marketing_term',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.marketing.term'),
         Column('marketing_vendor',
                DataType.VARCHAR,
                length=25,
                path='owen.context.marketing.vendor'),
         Column('mobileDeviceMake',
                DataType.VARCHAR,
                length=25,
                path='owen.context.mobileDeviceMake'),
         Column('mobileDeviceModel',
                DataType.VARCHAR,
                length=50,
                path='owen.context.mobileDeviceModel'),
         Column('notificationEnabledFlag',
                DataType.BOOLEAN,
                path='owen.context.notificationEnabledFlag'),
         Column('osFamily',
                DataType.VARCHAR,
                length=25,
                path='owen.context.osFamily'),
         Column('osName',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.osName'),
         Column('osVersion',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.osVersion'),
         Column('pageName',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.pageName'),
         Column('pageType',
                DataType.VARCHAR,
                length=100,
                path='owen.context.pageType'),
         Column('partialSearchTerm',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.partialSearchTerm'),
         Column('personalizationFlag',
                DataType.BOOLEAN,
                path='owen.context.personalizationFlag'),
         Column('previousPageName',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.previousPageName'),
         Column('previousViewInstanceUuid',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.previousViewInstanceUuid'),
         Column('promptName',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.promptName'),
         Column('propertyName',
                DataType.VARCHAR,
                length=20,
                path='owen.context.propertyName'),
         Column('referrer',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.referrer'),
         Column('region',
                DataType.VARCHAR,
                length=25,
                path='owen.context.region'),
         Column('screenHeight',
                DataType.INT,
                path='owen.context.screenHeight'),
         Column('screenWidth',
                DataType.INT,
                path='owen.context.screenWidth'),
         Column('session',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.session'),
         Column('test_testUuid',
                DataType.VARCHAR,
                length=26,
                path='owen.context.test.testUuid'),
         Column('udid',
                DataType.VARCHAR,
                length=40,
                path='owen.context.udid'),
         Column('userAgent',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.userAgent'),
         Column('userQualifier',
                DataType.VARCHAR,
                length=26,
                path='owen.context.userQualifier'),
         Column('userUuid',
                DataType.VARCHAR,
                length=2048,
                path='owen.context.userUuid'),
         Column('vendorObject',
                DataType.VARCHAR,
                length=512,
                path='owen.context.vendor'),
         Column('viewInstanceUuid',
                DataType.VARCHAR,
                length=128,
                path='owen.context.viewInstanceUuid'),
         Column('eventAction',
                DataType.VARCHAR,
                length=2048,
                path='owen.event.eventAction'),
         Column('eventCategory',
                DataType.VARCHAR,
                length=25,
                path='owen.event.eventCategory'),
         Column('eventInstanceUuid',
                DataType.VARCHAR,
                length=26,
                path='owen.event.eventInstanceUuid'),
         Column('eventName',
                DataType.VARCHAR,
                length=50,
                path='owen.event.eventName'),
         Column('eventPlatform',
                DataType.VARCHAR,
                length=25,
                path='owen.event.eventPlatform'),
         Column('eventPlatformVersion',
                DataType.VARCHAR,
                length=25,
                path='owen.event.eventPlatformVersion'),
         Column('eventTarget',
                DataType.VARCHAR,
                length=2048,
                path='owen.event.eventTarget'),
         Column('eventVersion',
                DataType.VARCHAR,
                length=25,
                path='owen.event.eventVersion'),
         Column('eventTimestamp',
                DataType.DATETIME,
                date_pattern="yyyy-MM-dd'T'HH:mm:ss'Z'",
                path='owen.event.eventTimestamp'),
         Column('derivedEventInstanceId',
                DataType.VARCHAR,
                length=64,
                path='metadata.derivedEventInstanceId'),
         Column('owenProcessed',
                DataType.DATETIME,
                date_pattern="yyyy-MM-dd'T'HH:mm:ss'Z'",
                path='metadata.analyticsTopologyFinishTime'),
     ],
 ))))
示例#22
0
def post_dataset():
    """Save the dataset described by the request's JSON body.

    :return: a dict whose ``'results'`` entry is the saved dataset in dict form
    """
    service = dataset_service()
    incoming = Dataset.from_dict(request.get_json())
    saved = service.save_dataset(incoming)
    return {'results': saved.to_dict()}
示例#23
0
    def setUp(self):
        """
        Create the fixture entities through the Dart client and keep them on ``self``:
        three datasets, two subscriptions, three datastores, three workflows (with
        their actions), two events and four triggers.

        Entities whose name ends in ``-no-show`` are wired only to each other
        (dataset2 -> subscription2, datastore2 -> workflow2 -> event_trigger2).
        NOTE(review): presumably they exist to verify they are excluded from
        results built around the other entities -- confirm against the tests.
        """
        dart = Dart(host='localhost', port=5000)
        """ :type dart: dart.client.python.dart_client.Dart """
        self.dart = dart

        # --- datasets: each gets a unique s3 location by interpolating a random id
        cs = [Column('c1', DataType.VARCHAR, 50), Column('c2', DataType.BIGINT)]
        df = DataFormat(FileFormat.PARQUET, RowFormat.NONE)
        dataset_data = DatasetData('test-dataset0', 'test_dataset_table0', 's3://test/dataset/0/%s' % random_id(), df, cs)
        self.dataset0 = self.dart.save_dataset(Dataset(data=dataset_data))

        cs = [Column('c1', DataType.VARCHAR, 50), Column('c2', DataType.BIGINT)]
        df = DataFormat(FileFormat.PARQUET, RowFormat.NONE)
        # kept in a variable because the copy_hdfs_to_s3 action below writes to this location
        dataset1_location = 's3://test/dataset/1/%s' % random_id()
        dataset_data = DatasetData('test-dataset1', 'test_dataset_table1', dataset1_location, df, cs)
        self.dataset1 = self.dart.save_dataset(Dataset(data=dataset_data))

        cs = [Column('c1', DataType.VARCHAR, 50), Column('c2', DataType.BIGINT)]
        df = DataFormat(FileFormat.PARQUET, RowFormat.NONE)
        dataset_data = DatasetData('test-dataset2-no-show', 'test_dataset_table2', 's3://test/dataset/2/%s' % random_id(), df, cs)
        self.dataset2 = self.dart.save_dataset(Dataset(data=dataset_data))

        # --- subscriptions (dataset1 intentionally has none here)
        s = Subscription(data=SubscriptionData('test-subscription0', self.dataset0.id))
        self.subscription0 = self.dart.save_subscription(s)

        s = Subscription(data=SubscriptionData('test-subscription2-no-show', self.dataset2.id))
        self.subscription2 = self.dart.save_subscription(s)

        # --- datastores: 0 and 1 are templates, 2 is active
        dst_args = {'action_sleep_time_in_seconds': 0}
        dst = Datastore(data=DatastoreData('test-datastore0', 'no_op_engine', args=dst_args, state=DatastoreState.TEMPLATE))
        self.datastore0 = self.dart.save_datastore(dst)
        dst = Datastore(data=DatastoreData('test-datastore1', 'no_op_engine', args=dst_args, state=DatastoreState.TEMPLATE))
        self.datastore1 = self.dart.save_datastore(dst)
        dst = Datastore(data=DatastoreData('test-datastore2-no-show', 'no_op_engine', args=dst_args, state=DatastoreState.ACTIVE))
        self.datastore2 = self.dart.save_datastore(dst)

        # --- one workflow per datastore
        wf0 = Workflow(data=WorkflowData('test-workflow0', self.datastore0.id, state=WorkflowState.ACTIVE))
        self.workflow0 = self.dart.save_workflow(wf0, self.datastore0.id)
        wf1 = Workflow(data=WorkflowData('test-workflow1', self.datastore1.id, state=WorkflowState.ACTIVE))
        self.workflow1 = self.dart.save_workflow(wf1, self.datastore1.id)
        wf2 = Workflow(data=WorkflowData('test-workflow2-no-show', self.datastore2.id, state=WorkflowState.ACTIVE))
        self.workflow2 = self.dart.save_workflow(wf2, self.datastore2.id)

        # --- workflow0 actions: consumes subscription0 and copies to dataset1's location
        a_args = {'source_hdfs_path': 'hdfs:///user/hive/warehouse/test', 'destination_s3_path': dataset1_location}
        a00 = Action(data=ActionData(NoOpActionTypes.action_that_succeeds.name, NoOpActionTypes.action_that_succeeds.name, state=ActionState.TEMPLATE))
        a01 = Action(data=ActionData(NoOpActionTypes.consume_subscription.name, NoOpActionTypes.consume_subscription.name, {'subscription_id': self.subscription0.id}, state=ActionState.TEMPLATE))
        a02 = Action(data=ActionData(NoOpActionTypes.action_that_succeeds.name, NoOpActionTypes.action_that_succeeds.name, state=ActionState.TEMPLATE))
        a03 = Action(data=ActionData(NoOpActionTypes.copy_hdfs_to_s3_action.name, NoOpActionTypes.copy_hdfs_to_s3_action.name, a_args, state=ActionState.TEMPLATE))
        a04 = Action(data=ActionData(NoOpActionTypes.action_that_succeeds.name, NoOpActionTypes.action_that_succeeds.name, state=ActionState.TEMPLATE))
        self.action00, self.action01, self.action02, self.action03, self.action04 = \
            self.dart.save_actions([a00, a01, a02, a03, a04], workflow_id=self.workflow0.id)

        # --- workflow1 action: loads dataset1
        # (action10/action20/action21 hold whatever save_actions returns for a one-element list)
        a10 = Action(data=ActionData(NoOpActionTypes.load_dataset.name, NoOpActionTypes.load_dataset.name, {'dataset_id': self.dataset1.id}, state=ActionState.TEMPLATE))
        self.action10 = self.dart.save_actions([a10], workflow_id=self.workflow1.id)

        # --- "no-show" actions: one directly on datastore2, one on workflow2
        a20 = Action(data=ActionData(NoOpActionTypes.action_that_succeeds.name, NoOpActionTypes.action_that_succeeds.name, state=ActionState.HAS_NEVER_RUN))
        a21 = Action(data=ActionData(NoOpActionTypes.load_dataset.name, NoOpActionTypes.load_dataset.name, {'dataset_id': self.dataset2.id}, state=ActionState.TEMPLATE))
        self.action20 = self.dart.save_actions([a20], datastore_id=self.datastore2.id)
        self.action21 = self.dart.save_actions([a21], workflow_id=self.workflow2.id)

        # --- events and the event triggers that fire workflow1 / workflow2
        self.event1 = self.dart.save_event(Event(data=EventData('test-event1', state=EventState.ACTIVE)))
        self.event2 = self.dart.save_event(Event(data=EventData('test-event2-no-show', state=EventState.ACTIVE)))

        tr_args = {'event_id': self.event1.id}
        tr = Trigger(data=TriggerData('test-event-trigger1', 'event', [self.workflow1.id], tr_args, TriggerState.ACTIVE))
        self.event_trigger1 = self.dart.save_trigger(tr)

        tr_args = {'event_id': self.event2.id}
        tr = Trigger(data=TriggerData('test-event-trigger2-no-show', 'event', [self.workflow2.id], tr_args, TriggerState.ACTIVE))
        self.event_trigger2 = self.dart.save_trigger(tr)

        # --- super triggers chained: event_trigger1 -> super_trigger1 -> super_trigger2 -> workflow1
        st_args = {'fire_after': 'ALL', 'completed_trigger_ids': [self.event_trigger1.id]}
        st = Trigger(data=TriggerData('test-super-trigger1', 'super', None, st_args, TriggerState.ACTIVE))
        self.super_trigger1 = self.dart.save_trigger(st)

        st_args = {'fire_after': 'ANY', 'completed_trigger_ids': [self.super_trigger1.id]}
        st = Trigger(data=TriggerData('test-super-trigger2', 'super', [self.workflow1.id], st_args, TriggerState.ACTIVE))
        self.super_trigger2 = self.dart.save_trigger(st)
示例#24
0
def prepare_load_dataset_steps(dry_run, args_by_name, datastore, dataset, action_id, s3_path_and_file_size_gen,
                               target_is_dynamodb=False):
    """
    Build the ordered list of step wrappers that load a dataset's files into a target table.

    Step files are produced under a temporary directory, which is always removed before
    returning. Unless ``dry_run`` is truthy, the local step path is copied to s3 with
    ``s3_copy_recursive`` first.

    :param dry_run: when truthy, skip copying the local step files to s3
    :param args_by_name: dict-like action args. The optional ``target_table_name``,
        ``target_file_format``, ``target_row_format``, ``target_compression``,
        ``target_delimited_by``, ``target_quoted_by``, ``target_escaped_by`` and
        ``target_null_string`` entries override the dataset's own values; missing or falsy
        entries fall back to the dataset. ``write_capacity_utilization_percent`` is read
        (and must be present) when ``target_is_dynamodb`` is true.
    :param datastore: passed to ``prepare_step_paths`` to derive the step paths
    :type dataset: dart.model.dataset.Dataset
    :param action_id: concatenated into staging table names (so it must be a str) and
        passed to every step
    :param s3_path_and_file_size_gen: passed through to ``s3distcp_files_step``
    :param target_is_dynamodb: when true, always stage the data and copy it into a
        ``DYNAMODB_TABLE`` formatted target; the impala metadata step is skipped
    :return: list of ``StepWrapper``; wrappers for the trailing cleanup steps have
        ``action_considered_successful`` set to True
    """

    def add_to(step_partials, step_num, func, *args):
        """ Queue ``func`` bound to ``args`` + ``step_num`` and return the next step number. """
        # add all params except the last one, which is the total steps (known at the end)
        step_partials.append(functools.partial(func, *(list(args) + [step_num])))
        return step_num + 1

    def stage_table_not_needed(ds, file_format, row_format, compression, delimited_by, quoted_by,
                               escaped_by, null_string):
        """
        Return True when every requested target setting equals the dataset's own setting.

        :type ds: dart.model.dataset.Dataset
        """
        return file_format == ds.data.data_format.file_format\
            and row_format == ds.data.data_format.row_format\
            and compression == ds.data.compression\
            and delimited_by == ds.data.data_format.delimited_by\
            and quoted_by == ds.data.data_format.quoted_by\
            and escaped_by == ds.data.data_format.escaped_by\
            and null_string == ds.data.data_format.null_string

    # some steps require producing a dataset specific file based on a template, so we will copy all to a tempdir
    tempdir = tempfile.mkdtemp()
    try:
        local_step_path, s3_step_path, s3_temp_path = prepare_step_paths(datastore, tempdir)
        # `or` means a missing *or falsy* override (None, '') falls back to the dataset's own value
        target_table_name = args_by_name.get('target_table_name') or dataset.data.table_name
        target_file_format = args_by_name.get('target_file_format') or dataset.data.data_format.file_format
        target_row_format = args_by_name.get('target_row_format') or dataset.data.data_format.row_format
        target_compression = args_by_name.get('target_compression') or dataset.data.compression
        target_delimited_by = args_by_name.get('target_delimited_by') or dataset.data.data_format.delimited_by
        target_quoted_by = args_by_name.get('target_quoted_by') or dataset.data.data_format.quoted_by
        target_escaped_by = args_by_name.get('target_escaped_by') or dataset.data.data_format.escaped_by
        target_null_string = args_by_name.get('target_null_string') or dataset.data.data_format.null_string

        stage_table_name = target_table_name + '_stage_for_action_' + action_id
        staging_not_needed = stage_table_not_needed(dataset, target_file_format, target_row_format, target_compression,
                                                    target_delimited_by, target_quoted_by, target_escaped_by, target_null_string)
        # the files are copied straight into the target table only when no format conversion is
        # required and the target is not dynamodb; otherwise they land in the stage table first
        first_table_name = target_table_name if staging_not_needed and not target_is_dynamodb else stage_table_name

        drop_table_names = []
        step_funcs = []
        # step numbers are 1-based; `i` is always the number of the next step to be added
        i = 1

        # ------------------------------------------------------------------------------------------------------------
        # all code paths below require copying the data to HDFS, and lowercasing the table is required because of hive
        # ------------------------------------------------------------------------------------------------------------
        i = add_to(step_funcs, i, s3distcp_files_step, s3_path_and_file_size_gen, first_table_name.lower(), dataset, s3_step_path, local_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # not all folder structures on s3 are hive compatible... if not, rename directories after copying
        # ------------------------------------------------------------------------------------------------------------
        if dataset.data.partitions and not dataset.data.hive_compatible_partition_folders:
            i = add_to(step_funcs, i, python_fix_partition_folder_names, first_table_name.lower(), dataset.data.partitions, s3_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # special case to share functionality with the dynamodb_engine
        # ------------------------------------------------------------------------------------------------------------
        if target_is_dynamodb:
            # work on a copy (dict round-trip) so the caller's dataset is left untouched
            dyn_dataset = Dataset.from_dict(dataset.to_dict())
            assert isinstance(dyn_dataset, Dataset)
            dyn_dataset.data.data_format = DataFormat('DYNAMODB_TABLE', RowFormat.NONE, 0)
            dyn_dataset.data.compression = Compression.NONE
            dyn_dataset.data.columns = [Column(c.name, dynamodb_column_type(c)) for c in dataset.data.columns]
            set_hive_vars = 'SET dynamodb.retry.duration = 0;\nSET dynamodb.throughput.write.percent = %s;'
            # raises KeyError if the action args do not carry write_capacity_utilization_percent
            set_hive_vars = set_hive_vars % args_by_name['write_capacity_utilization_percent']

            i = add_to(step_funcs, i, hive_table_definition_step, stage_table_name, dataset, s3_step_path, local_step_path, action_id, False)
            i = add_to(step_funcs, i, hive_table_definition_step, target_table_name, dyn_dataset, s3_step_path, local_step_path, action_id, True)
            i = add_to(step_funcs, i, hive_msck_repair_table_step, stage_table_name, s3_step_path, action_id)
            i = add_to(step_funcs, i, hive_copy_to_table, dataset, stage_table_name, dyn_dataset, target_table_name, s3_step_path, local_step_path, action_id, set_hive_vars)
            # NOTE(review): the stage table is not added to drop_table_names on this path -- confirm intended

        # ------------------------------------------------------------------------------------------------------------
        # if no stage tables are needed, much complexity can be skipped
        # ------------------------------------------------------------------------------------------------------------
        elif staging_not_needed:
            i = add_to(step_funcs, i, hive_table_definition_step, target_table_name, dataset, s3_step_path, local_step_path, action_id, False)
            i = add_to(step_funcs, i, hive_msck_repair_table_step, target_table_name, s3_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # one or more staging tables are needed
        # ------------------------------------------------------------------------------------------------------------
        else:
            stage_dataset = dataset
            # the target dataset is a copy of the source with the requested format and compression
            target_dataset = Dataset.from_dict(dataset.to_dict())
            target_dataset.data.data_format = DataFormat(target_file_format, target_row_format, 0, target_delimited_by,
                                                         target_quoted_by, target_escaped_by, target_null_string)
            target_dataset.data.compression = target_compression
            drop_table_names.append(stage_table_name)

            # --------------------------------------------------------------------------------------------------------
            # define string types for JSON/REGEX based datasets (safe), and we will cast appropriately during insert
            # --------------------------------------------------------------------------------------------------------
            if stage_dataset.data.data_format.row_format in [RowFormat.JSON, RowFormat.REGEX]:
                # make a copy since we are modifying the columns
                stage_dataset = Dataset.from_dict(dataset.to_dict())
                assert isinstance(stage_dataset, Dataset)
                for c in stage_dataset.data.columns:
                    c.data_type = DataType.STRING

            i = add_to(step_funcs, i, hive_table_definition_step, stage_table_name, stage_dataset, s3_step_path, local_step_path, action_id, False)
            i = add_to(step_funcs, i, hive_table_definition_step, target_table_name, target_dataset, s3_step_path, local_step_path, action_id, False)
            i = add_to(step_funcs, i, hive_msck_repair_table_step, stage_table_name, s3_step_path, action_id)

            # --------------------------------------------------------------------------------------------------------
            # hive has issues creating parquet files
            # --------------------------------------------------------------------------------------------------------
            if target_file_format != FileFormat.PARQUET:
                i = add_to(step_funcs, i, hive_copy_to_table, stage_dataset, stage_table_name, target_dataset, target_table_name, s3_step_path, local_step_path, action_id, None)

            # --------------------------------------------------------------------------------------------------------
            # impala is better for creating parquet files
            # --------------------------------------------------------------------------------------------------------
            else:
                # ----------------------------------------------------------------------------------------------------
                # no additional staging tables needed if the source dataset file format is RCFILE (impala friendly)
                # ----------------------------------------------------------------------------------------------------
                if dataset.data.data_format.file_format == FileFormat.RCFILE:
                    i = add_to(step_funcs, i, hive_copy_to_table, stage_dataset, stage_table_name, target_dataset, target_table_name, s3_step_path, local_step_path, action_id, None)

                # ----------------------------------------------------------------------------------------------------
                # impala cannot read all hive formats, so we will introduce another staging table
                # ----------------------------------------------------------------------------------------------------
                else:
                    rc_table_name = target_table_name + '_rcfile_stage_for_action_' + action_id
                    rc_dataset = Dataset.from_dict(target_dataset.to_dict())
                    rc_dataset.data.data_format = DataFormat(FileFormat.RCFILE, RowFormat.NONE, 0)
                    rc_dataset.data.compression = Compression.NONE
                    drop_table_names.append(rc_table_name)

                    # stage table -> (hive) -> rcfile table -> (impala) -> target table
                    i = add_to(step_funcs, i, hive_table_definition_step, rc_table_name, rc_dataset, s3_step_path, local_step_path, action_id, False)
                    i = add_to(step_funcs, i, hive_copy_to_table, stage_dataset, stage_table_name, rc_dataset, rc_table_name, s3_step_path, local_step_path, action_id, None)
                    i = add_to(step_funcs, i, impala_copy_to_table, rc_dataset, rc_table_name, target_dataset, target_table_name, s3_step_path, local_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # at this point, the load should be considered complete even if something goes wrong in the steps below,
        # so we will indicate this in the step wrapper
        # ------------------------------------------------------------------------------------------------------------
        considered_successful_at_this_index = i

        # ------------------------------------------------------------------------------------------------------------
        # drop any staging tables created
        # ------------------------------------------------------------------------------------------------------------
        if drop_table_names:
            script = '\n'.join(['DROP TABLE %s;' % name for name in drop_table_names])
            i = add_to(step_funcs, i, hive_run_script_contents_step, script, s3_step_path, local_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # inform impala about changes
        # ------------------------------------------------------------------------------------------------------------
        if not target_is_dynamodb:
            i = add_to(step_funcs, i, impala_invalidate_metadata_step, s3_step_path, action_id)

        # `i` is one past the last step number, so the total is i - 1
        total_steps = i - 1
        steps = []
        # enumerate from 1 so `index` lines up with the 1-based step numbers used above
        for index, f in enumerate(step_funcs, 1):
            # supplying the total step count completes the partial and yields the step wrapper
            step_wrapper = f(total_steps)
            assert isinstance(step_wrapper, StepWrapper)
            if index >= considered_successful_at_this_index:
                step_wrapper.action_considered_successful = True
            steps.append(step_wrapper)

        if not dry_run:
            s3_copy_recursive(local_step_path, s3_step_path)

        return steps

    finally:
        # the temp dir is only scratch space for generated step files; always clean it up
        shutil.rmtree(tempdir)
示例#25
0
def put_dataset(dataset):
    """Replace the given dataset with the one described by the request's JSON body.

    :param dataset: the existing dataset; only its ``id`` is used
    :return: a dict whose ``'results'`` entry is the updated dataset in dict form
    """
    service = dataset_service()
    dataset_id = dataset.id
    replacement = Dataset.from_dict(request.get_json())
    updated = service.update_dataset(dataset_id, replacement)
    return {'results': updated.to_dict()}