Пример #1
0
    def generate_subscription_elements(self, subscription):
        """Create the initial subscription elements from an s3 listing.

        The subscription is set to GENERATING, one UNCONSUMED element is
        inserted (in batches of ``_batch_size``) for every key produced by
        ``yield_s3_keys`` for the dataset location and the subscription's
        start/end prefix and regex settings, and the subscription is then set
        to ACTIVE.  A final listing is performed afterwards and each key it
        yields is passed to ``conditional_insert_subscription_element``.

        :type subscription: dart.model.subscription.Subscription
        """
        _update_subscription_state(subscription, SubscriptionState.GENERATING)

        dataset = self._dataset_service.get_dataset(
            subscription.data.dataset_id)
        conn = boto.connect_s3()
        bucket = get_bucket(conn, dataset.data.location)
        s3_keys = yield_s3_keys(
            bucket,
            dataset.data.location,
            subscription.data.s3_path_start_prefix_inclusive,
            subscription.data.s3_path_end_prefix_exclusive,
            subscription.data.s3_path_regex_filter,
        )

        # These do not change between keys, so read them once.
        sid = subscription.id
        state = SubscriptionElementState.UNCONSUMED

        elements = []
        last_s3_path = None
        for key_obj in s3_keys:
            now = datetime.now()
            last_s3_path = get_s3_path(key_obj)
            elements.append({
                'id': random_id(),
                'version_id': 0,
                'created': now,
                'updated': now,
                'subscription_id': sid,
                's3_path': last_s3_path,
                'file_size': key_obj.size,
                'state': state,
            })

            # The pending list is reset after every flush, so its length
            # reaching the batch size marks a full batch.
            if len(elements) >= _batch_size:
                self._insert_elements(elements)
                elements = []

        # Flush the final, partially filled batch.
        if elements:
            self._insert_elements(elements)

        _update_subscription_state(subscription, SubscriptionState.ACTIVE)

        # Now that the subscription is ACTIVE, s3 events for new files will
        # cause conditional inserts to be performed to keep the subscription
        # up to date.  However, in the time it took for the subscription
        # elements to be generated, s3 events for new objects could have been
        # missed.  So we will do one final s3 list operation (starting with
        # the last inserted key) to fill in the potential gap.
        #
        # If nothing was inserted there is no "last key"; fall back to the
        # subscription's own start prefix rather than None, so the gap listing
        # stays within the range the subscription was configured for.
        gap_start = last_s3_path
        if gap_start is None:
            gap_start = subscription.data.s3_path_start_prefix_inclusive

        s3_keys = yield_s3_keys(
            bucket,
            dataset.data.location,
            gap_start,
            subscription.data.s3_path_end_prefix_exclusive,
            subscription.data.s3_path_regex_filter,
        )
        for key_obj in s3_keys:
            self.conditional_insert_subscription_element(
                subscription, get_s3_path(key_obj), key_obj.size)
Пример #2
0
    def generate_subscription_elements(self, subscription):
        """Generate the elements of a subscription by listing its dataset in s3.

        Steps, in order:

        1. mark the subscription GENERATING;
        2. list keys via ``yield_s3_keys`` (dataset location plus the subscription's
           start prefix, end prefix and regex filter) and insert one UNCONSUMED
           element per key, flushing every ``_batch_size`` elements;
        3. mark the subscription ACTIVE;
        4. list once more and hand every key to
           ``conditional_insert_subscription_element`` to cover keys that may have
           appeared during step 2.

        :type subscription: dart.model.subscription.Subscription
        """
        _update_subscription_state(subscription, SubscriptionState.GENERATING)

        dataset = self._dataset_service.get_dataset(subscription.data.dataset_id)
        conn = boto.connect_s3()
        bucket = get_bucket(conn, dataset.data.location)
        s3_keys = yield_s3_keys(
            bucket,
            dataset.data.location,
            subscription.data.s3_path_start_prefix_inclusive,
            subscription.data.s3_path_end_prefix_exclusive,
            subscription.data.s3_path_regex_filter,
        )

        pending = []
        last_inserted_path = None
        for i, key_obj in enumerate(s3_keys):
            s3_path = get_s3_path(key_obj)
            now = datetime.now()
            pending.append({
                'id': random_id(),
                'version_id': 0,
                'created': now,
                'updated': now,
                'subscription_id': subscription.id,
                's3_path': s3_path,
                'file_size': key_obj.size,
                'state': SubscriptionElementState.UNCONSUMED,
            })
            last_inserted_path = s3_path

            # Flush a full batch so the pending list never grows past _batch_size.
            if (i + 1) % _batch_size == 0:
                self._insert_elements(pending)
                pending = []

        # Whatever is left over after the last full batch.
        if pending:
            self._insert_elements(pending)

        _update_subscription_state(subscription, SubscriptionState.ACTIVE)

        # Now that the subscription is ACTIVE, s3 events for new files will cause conditional inserts to be
        # performed to keep the subscription up to date.  However, in the time it took for the subscription
        # elements to be generated, s3 events for new objects could have been missed.  So we will do one final
        # s3 list operation (starting with the last inserted key) to fill in the potential gap.
        #
        # When the first listing produced no keys there is no last inserted key.  Starting from None would
        # drop the subscription's configured start prefix, so reuse that prefix as the starting point instead.
        if last_inserted_path is not None:
            gap_start = last_inserted_path
        else:
            gap_start = subscription.data.s3_path_start_prefix_inclusive

        s3_keys = yield_s3_keys(
            bucket,
            dataset.data.location,
            gap_start,
            subscription.data.s3_path_end_prefix_exclusive,
            subscription.data.s3_path_regex_filter,
        )
        for key_obj in s3_keys:
            self.conditional_insert_subscription_element(subscription, get_s3_path(key_obj), key_obj.size)
Пример #3
0
def _s3_path_and_updated_generator(action, dataset):
    """Yield ``(s3_path, None)`` for every s3 key listed for the action.

    Keys are produced by ``yield_s3_keys`` over the dataset location, using the
    optional ``s3_path_start_prefix_inclusive``, ``s3_path_end_prefix_exclusive``
    and ``s3_path_regex_filter`` entries of ``action.data.args``.  The second
    member of each yielded tuple is always ``None``.
    """
    s3_connection = boto.connect_s3()
    location = dataset.data.location
    bucket = get_bucket(s3_connection, location)

    args = action.data.args
    listing = yield_s3_keys(
        bucket,
        location,
        args.get('s3_path_start_prefix_inclusive'),
        args.get('s3_path_end_prefix_exclusive'),
        args.get('s3_path_regex_filter'),
    )
    for s3_key in listing:
        s3_path = get_s3_path(s3_key)
        yield s3_path, None
Пример #4
0
def load_dataset_s3_path_and_file_size_generator(emr_engine, action, dataset=None):
    """Yield ``(s3_path, file_size)`` for every s3 key listed for the action.

    When ``dataset`` is not supplied it is fetched through
    ``emr_engine.dart.get_dataset`` using ``action.data.args['dataset_id']``.
    Keys are produced by ``yield_s3_keys`` over the dataset location, using the
    optional start prefix, end prefix and regex filter entries of
    ``action.data.args``; the size is taken from each key object's ``size``.
    """
    if dataset is None:
        dart_client = emr_engine.dart
        dataset = dart_client.get_dataset(action.data.args['dataset_id'])

    s3_connection = boto.connect_s3()
    location = dataset.data.location
    bucket = get_bucket(s3_connection, location)

    args = action.data.args
    start_prefix = args.get('s3_path_start_prefix_inclusive')
    end_prefix = args.get('s3_path_end_prefix_exclusive')
    regex_filter = args.get('s3_path_regex_filter')

    for s3_key in yield_s3_keys(bucket, location, start_prefix, end_prefix, regex_filter):
        yield get_s3_path(s3_key), s3_key.size
Пример #5
0
def load_dataset_s3_path_and_file_size_generator(emr_engine,
                                                 action,
                                                 dataset=None):
    """Generate ``(s3_path, size)`` pairs for the s3 keys of a dataset.

    If ``dataset`` is ``None`` it is looked up with
    ``emr_engine.dart.get_dataset(action.data.args['dataset_id'])``.  The keys
    come from ``yield_s3_keys`` for the dataset location, with the optional
    start prefix, end prefix and regex filter read from ``action.data.args``.
    """
    if dataset is None:
        dart_client = emr_engine.dart
        dataset = dart_client.get_dataset(action.data.args['dataset_id'])

    connection = boto.connect_s3()
    bucket = get_bucket(connection, dataset.data.location)

    # Optional listing filters, in the positional order yield_s3_keys is
    # called with.
    filter_names = (
        's3_path_start_prefix_inclusive',
        's3_path_end_prefix_exclusive',
        's3_path_regex_filter',
    )
    filters = [action.data.args.get(name) for name in filter_names]

    found_keys = yield_s3_keys(bucket, dataset.data.location, *filters)
    for found in found_keys:
        path = get_s3_path(found)
        yield path, found.size