def run(self):
    app.logger.info('Starting Canvas schema creation job...')
    # Prefer today's Canvas data dump; fall back to yesterday's if today's has not landed yet.
    canvas_path = get_s3_canvas_daily_path()
    if not s3.get_keys_with_prefix(canvas_path):
        canvas_path = get_s3_canvas_daily_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(canvas_path):
            raise BackgroundJobError('No timely Canvas data found, aborting')
        else:
            app.logger.info('Falling back to yesterday\'s Canvas data')
    external_schema = app.config['REDSHIFT_SCHEMA_CANVAS']
    s3_prefix = 's3://' + app.config['LOCH_S3_BUCKET'] + '/'
    s3_canvas_data_url = s3_prefix + canvas_path
    s3_canvas_data_path_current_term = s3_prefix + berkeley.s3_canvas_data_path_current_term()
    # Recreate the external schema against the freshest S3 paths, then verify the resulting tables.
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template(
        'create_canvas_schema.template.sql',
        loch_s3_canvas_data_path_today=s3_canvas_data_url,
        loch_s3_canvas_data_path_current_term=s3_canvas_data_path_current_term,
    )
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        return 'Canvas schema creation job completed.'
    else:
        raise BackgroundJobError('Canvas schema creation job failed.')
def generate_canvas_path(self):
    canvas_path = get_s3_canvas_daily_path()
    if not s3.get_keys_with_prefix(canvas_path):
        canvas_path = get_s3_canvas_daily_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(canvas_path):
            raise BackgroundJobError('No timely Canvas data found, aborting')
        else:
            app.logger.info('Falling back to yesterday\'s Canvas data')
    return canvas_path
def mock_metadata(job_id, snapshot, status, destination_size):
    metadata.create_canvas_sync_status(job_id, snapshot['filename'], snapshot['table'], snapshot['url'])
    key = '/'.join([get_s3_canvas_daily_path(), snapshot['table'], snapshot['filename']])
    metadata.update_canvas_sync_status(job_id, key, status, source_size=1048576, destination_size=destination_size)
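# A minimal usage sketch for mock_metadata above. The job id, filename, and signed URL
# are hypothetical test values, not taken from the source; status 'error' with no
# destination size is the shape the resync job below looks for.
snapshot = {
    'filename': 'quiz_dim-00000-0ab12cd3.gz',  # hypothetical snapshot filename
    'table': 'quiz_dim',
    'url': 'https://signed.example.com/quiz_dim-00000-0ab12cd3.gz',  # hypothetical signed URL
}
mock_metadata(job_id='sync_20180501', snapshot=snapshot, status='error', destination_size=None)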
def run(self, cleanup=True):
    job_id = self.generate_job_id()
    app.logger.info(f'Starting Canvas snapshot resync job... (id={job_id})')
    md = metadata.get_failures_from_last_sync()
    if not md['failures']:
        return f"No failures found for job_id {md['job_id']}, skipping resync."
    app.logger.info(f"Found {len(md['failures'])} failures for job_id {md['job_id']}, attempting resync.")
    failures = 0
    successes = 0
    for failure in md['failures']:
        # If requested, delete the partially synced object and its metadata before retrying.
        if cleanup and failure['destination_url']:
            destination_key = failure['destination_url'].split(app.config['LOCH_S3_BUCKET'] + '/')[1]
            if s3.delete_objects([destination_key]):
                metadata.delete_canvas_snapshots([destination_key])
            else:
                app.logger.error(f"Could not delete failed snapshot from S3 (url={failure['destination_url']})")
        metadata.create_canvas_sync_status(
            job_id=job_id,
            filename=failure['filename'],
            canvas_table=failure['canvas_table'],
            # The original signed source URL will remain valid if the resync job is run within an hour of the sync job.
            # TODO Add logic to fetch a new signed URL from the Canvas Data API for older jobs.
            source_url=failure['source_url'],
        )
        # Regenerate the S3 key, since the failed job may not have progressed far enough
        # to store a destination URL in its metadata.
        if failure['canvas_table'] == 'requests':
            key_components = [berkeley.s3_canvas_data_path_current_term(), failure['canvas_table'], failure['filename']]
        else:
            key_components = [get_s3_canvas_daily_path(), failure['canvas_table'], failure['filename']]
        key = '/'.join(key_components)
        response = dispatch('sync_file_to_s3', data={'canvas_sync_job_id': job_id, 'url': failure['source_url'], 'key': key})
        if not response:
            app.logger.error('Failed to dispatch S3 resync of snapshot ' + failure['filename'])
            metadata.update_canvas_sync_status(job_id, key, 'error', details=f'Failed to dispatch: {response}')
            failures += 1
        else:
            app.logger.info('Dispatched S3 resync of snapshot ' + failure['filename'])
            successes += 1
    return f'Canvas snapshot resync job dispatched to workers ({successes} successful dispatches, {failures} failures).'
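# Quick illustration of how the resync cleanup above derives an S3 key from a stored
# destination URL by splitting on the bucket name. The bucket name and URL here are
# hypothetical stand-ins for app.config['LOCH_S3_BUCKET'] and a real snapshot URL.
LOCH_S3_BUCKET = 'my-loch-bucket'
destination_url = f'https://s3.amazonaws.com/{LOCH_S3_BUCKET}/canvas/daily/2018/05/01/user_dim/user_dim-00000.gz'
destination_key = destination_url.split(LOCH_S3_BUCKET + '/')[1]
assert destination_key == 'canvas/daily/2018/05/01/user_dim/user_dim-00000.gz'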
def run(self, cleanup=True):
    job_id = self.generate_job_id()
    app.logger.info(f'Starting Canvas snapshot sync job... (id={job_id})')
    snapshot_response = canvas_data.get_snapshots()
    if not snapshot_response:
        raise BackgroundJobError('Error retrieving Canvas data snapshots, aborting job.')
    snapshots = snapshot_response.get('files', [])

    def should_sync(snapshot):
        # For tables other than requests, sync all snapshots.
        # For the requests table, sync snapshots that are partial or later than the configured cutoff date.
        def after_cutoff_date(url):
            match = re.search(r'requests/(20\d{6})', url)
            return match is not None and (match[1] >= app.config['LOCH_CANVAS_DATA_REQUESTS_CUTOFF_DATE'])
        return snapshot['table'] != 'requests' or snapshot['partial'] is True or after_cutoff_date(snapshot['url'])

    snapshots_to_sync = [s for s in snapshots if should_sync(s)]
    app.logger.info(f'Will sync {len(snapshots_to_sync)} of {len(snapshots)} available files from Canvas Data.')

    success = 0
    failure = 0
    for snapshot in snapshots_to_sync:
        metadata.create_canvas_sync_status(
            job_id=job_id,
            filename=snapshot['filename'],
            canvas_table=snapshot['table'],
            source_url=snapshot['url'],
        )
        # Requests snapshots are stored under the current-term path; all other tables go under the daily path.
        if snapshot['table'] == 'requests':
            key_components = [berkeley.s3_canvas_data_path_current_term(), snapshot['table'], snapshot['filename']]
        else:
            key_components = [get_s3_canvas_daily_path(), snapshot['table'], snapshot['filename']]
        key = '/'.join(key_components)
        response = dispatch('sync_file_to_s3', data={'canvas_sync_job_id': job_id, 'url': snapshot['url'], 'key': key})
        if not response:
            app.logger.error('Failed to dispatch S3 sync of snapshot ' + snapshot['filename'])
            metadata.update_canvas_sync_status(job_id, key, 'error', details=f'Failed to dispatch: {response}')
            failure += 1
        else:
            app.logger.info('Dispatched S3 sync of snapshot ' + snapshot['filename'])
            success += 1

    if cleanup:
        app.logger.info('Will remove obsolete snapshots from S3.')
        current_snapshot_filenames = [s['filename'] for s in snapshots_to_sync]
        requests_prefix = berkeley.s3_canvas_data_path_current_term() + '/requests'
        delete_result = s3.delete_objects_with_prefix(requests_prefix, whitelist=current_snapshot_filenames)
        if not delete_result:
            app.logger.error('Cleanup of obsolete snapshots failed.')
    return f'Canvas snapshot sync job dispatched to workers ({success} successful dispatches, {failure} failures).'
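# A small self-contained illustration of the requests-table cutoff logic used in
# should_sync() above: snapshots whose URL embeds a date on or after the configured
# cutoff are kept, older full requests dumps are skipped. The cutoff value and URLs
# below are hypothetical examples, not values from the source.
import re

CUTOFF = '20180401'  # stand-in for app.config['LOCH_CANVAS_DATA_REQUESTS_CUTOFF_DATE']

def after_cutoff_date(url):
    match = re.search(r'requests/(20\d{6})', url)
    return match is not None and match[1] >= CUTOFF

assert after_cutoff_date('https://host.example.com/requests/20180501_requests.gz')
assert not after_cutoff_date('https://host.example.com/requests/20180301_requests.gz')
assert not after_cutoff_date('https://host.example.com/user_dim/20180501_user_dim.gz')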