def generate_subscription_elements(self, subscription): """ :type subscription: dart.model.subscription.Subscription """ _update_subscription_state(subscription, SubscriptionState.GENERATING) dataset = self._dataset_service.get_dataset( subscription.data.dataset_id) conn = boto.connect_s3() bucket = get_bucket(conn, dataset.data.location) s3_keys = yield_s3_keys( bucket, dataset.data.location, subscription.data.s3_path_start_prefix_inclusive, subscription.data.s3_path_end_prefix_exclusive, subscription.data.s3_path_regex_filter, ) elements = [] subscription_element_dict = {} for i, key_obj in enumerate(s3_keys): sid = subscription.id s3_path = get_s3_path(key_obj) state = SubscriptionElementState.UNCONSUMED now = datetime.now() subscription_element_dict = { 'id': random_id(), 'version_id': 0, 'created': now, 'updated': now, 'subscription_id': sid, 's3_path': s3_path, 'file_size': key_obj.size, 'state': state } elements.append(subscription_element_dict) batch_size_reached = (i + 1) % _batch_size == 0 if batch_size_reached: self._insert_elements(elements) elements = [] if len(elements) > 0: self._insert_elements(elements) _update_subscription_state(subscription, SubscriptionState.ACTIVE) # Now that the subscription is ACTIVE, s3 events for new files will cause conditional inserts to be # performed to keep the subscription up to date. However, in the time it took for the subscription # elements to be generated, s3 events for new objects could have been missed. So we will do one final # s3 list operation (starting with the last inserted key) to fill in the potential gap. s3_keys = yield_s3_keys( bucket, dataset.data.location, subscription_element_dict.get('s3_path'), subscription.data.s3_path_end_prefix_exclusive, subscription.data.s3_path_regex_filter, ) for key_obj in s3_keys: self.conditional_insert_subscription_element( subscription, get_s3_path(key_obj), key_obj.size)
def generate_subscription_elements(self, subscription): """ :type subscription: dart.model.subscription.Subscription """ _update_subscription_state(subscription, SubscriptionState.GENERATING) dataset = self._dataset_service.get_dataset(subscription.data.dataset_id) conn = boto.connect_s3() bucket = get_bucket(conn, dataset.data.location) s3_keys = yield_s3_keys( bucket, dataset.data.location, subscription.data.s3_path_start_prefix_inclusive, subscription.data.s3_path_end_prefix_exclusive, subscription.data.s3_path_regex_filter, ) elements = [] subscription_element_dict = {} for i, key_obj in enumerate(s3_keys): sid = subscription.id s3_path = get_s3_path(key_obj) state = SubscriptionElementState.UNCONSUMED now = datetime.now() subscription_element_dict = { 'id': random_id(), 'version_id': 0, 'created': now, 'updated': now, 'subscription_id': sid, 's3_path': s3_path, 'file_size': key_obj.size, 'state': state } elements.append(subscription_element_dict) batch_size_reached = (i + 1) % _batch_size == 0 if batch_size_reached: self._insert_elements(elements) elements = [] if len(elements) > 0: self._insert_elements(elements) _update_subscription_state(subscription, SubscriptionState.ACTIVE) # Now that the subscription is ACTIVE, s3 events for new files will cause conditional inserts to be # performed to keep the subscription up to date. However, in the time it took for the subscription # elements to be generated, s3 events for new objects could have been missed. So we will do one final # s3 list operation (starting with the last inserted key) to fill in the potential gap. s3_keys = yield_s3_keys( bucket, dataset.data.location, subscription_element_dict.get('s3_path'), subscription.data.s3_path_end_prefix_exclusive, subscription.data.s3_path_regex_filter, ) for key_obj in s3_keys: self.conditional_insert_subscription_element(subscription, get_s3_path(key_obj), key_obj.size)
def _s3_path_and_updated_generator(action, dataset):
    # yields (s3_path, updated) pairs; the listing does not track modification times,
    # so the 'updated' slot is always None here
    conn = boto.connect_s3()
    s3_keys = yield_s3_keys(
        get_bucket(conn, dataset.data.location),
        dataset.data.location,
        action.data.args.get('s3_path_start_prefix_inclusive'),
        action.data.args.get('s3_path_end_prefix_exclusive'),
        action.data.args.get('s3_path_regex_filter')
    )
    for key_obj in s3_keys:
        yield get_s3_path(key_obj), None
def load_dataset_s3_path_and_file_size_generator(emr_engine, action, dataset=None):
    # yields (s3_path, file_size) pairs for every dataset object matching the optional
    # start/end/regex filters; fetches the dataset lazily if it was not passed in
    if dataset is None:
        dataset = emr_engine.dart.get_dataset(action.data.args['dataset_id'])
    conn = boto.connect_s3()
    s3_keys = yield_s3_keys(
        get_bucket(conn, dataset.data.location),
        dataset.data.location,
        action.data.args.get('s3_path_start_prefix_inclusive'),
        action.data.args.get('s3_path_end_prefix_exclusive'),
        action.data.args.get('s3_path_regex_filter')
    )
    for key_obj in s3_keys:
        yield get_s3_path(key_obj), key_obj.size
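# Hypothetical usage sketch (summarize_dataset_files is not part of the source): walk the
# generator once to count the matching objects and total their size before kicking off a load.
def summarize_dataset_files(emr_engine, action, dataset=None):
    file_count, total_bytes = 0, 0
    for s3_path, file_size in load_dataset_s3_path_and_file_size_generator(emr_engine, action, dataset):
        file_count += 1
        total_bytes += file_size
    return file_count, total_bytes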