def test_canvas_sync_metadata(self, app, metadata_db):
    """When given a job id, updates metadata on file sync."""
    url = 'http://shakespeare.mit.edu/Poetry/sonnet.XLV.html'
    key = 'canvas/sonnet_submission_dim/sonnet-xlv.txt'
    with mock_s3(app):
        with open(_get_fixtures_path() + '/sonnet_xlv.html', 'r') as file:
            responses.add(responses.GET, url, body=file.read(), headers={'Content-Length': '767'})

        # Run two successive sync jobs on the same file. The first succeeds; the second is skipped as
        # a duplicate.
        metadata.create_canvas_sync_status('job_1', 'sonnet-xlv.txt', 'sonnet_submission_dim', url)
        result = SyncFileToS3().run(url=url, key=key, canvas_sync_job_id='job_1')
        assert result is True
        metadata.create_canvas_sync_status('job_2', 'sonnet-xlv.txt', 'sonnet_submission_dim', url)
        result = SyncFileToS3().run(url=url, key=key, canvas_sync_job_id='job_2')
        assert result is False

        schema = app.config['REDSHIFT_SCHEMA_METADATA']
        sync_metadata = redshift.fetch(f'SELECT * FROM {schema}.canvas_sync_job_status')
        snapshot_metadata = redshift.fetch(f'SELECT * FROM {schema}.canvas_synced_snapshots')

        assert len(sync_metadata) == 2
        assert sync_metadata[0]['job_id'] == 'job_1'
        assert sync_metadata[0]['destination_url'] == 's3://mock-bucket/canvas/sonnet_submission_dim/sonnet-xlv.txt'
        assert sync_metadata[0]['status'] == 'complete'
        assert sync_metadata[0]['source_size'] == 767
        assert sync_metadata[0]['destination_size'] == 767
        assert sync_metadata[0]['updated_at'] > sync_metadata[0]['created_at']
        assert sync_metadata[1]['job_id'] == 'job_2'
        assert sync_metadata[1]['destination_url'] == 's3://mock-bucket/canvas/sonnet_submission_dim/sonnet-xlv.txt'
        assert sync_metadata[1]['status'] == 'duplicate'
        assert sync_metadata[1]['source_size'] is None
        assert sync_metadata[1]['destination_size'] is None
        assert sync_metadata[1]['updated_at'] > sync_metadata[1]['created_at']

        assert len(snapshot_metadata) == 1
        assert snapshot_metadata[0]['filename'] == 'sonnet-xlv.txt'
        assert snapshot_metadata[0]['canvas_table'] == 'sonnet_submission_dim'
        assert snapshot_metadata[0]['url'] == 's3://mock-bucket/canvas/sonnet_submission_dim/sonnet-xlv.txt'
        assert snapshot_metadata[0]['size'] == 767
        assert snapshot_metadata[0]['created_at']
        assert snapshot_metadata[0]['deleted_at'] is None
def mock_metadata(job_id, snapshot, status, destination_size):
    metadata.create_canvas_sync_status(job_id, snapshot['filename'], snapshot['table'], snapshot['url'])
    key = '/'.join([get_s3_canvas_daily_path(), snapshot['table'], snapshot['filename']])
    metadata.update_canvas_sync_status(job_id, key, status, source_size=1048576, destination_size=destination_size)
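# A minimal usage sketch for the helper above. The snapshot dict shape (filename/table/url keys) matches
# what mock_metadata reads; the job ids, filename, and URL below are hypothetical fixture values, not real
# Canvas Data files or statuses from this codebase.
def test_resync_with_mocked_metadata(app, metadata_db):
    snapshot = {
        'filename': 'requests-00098-fictional.gz',
        'table': 'requests',
        'url': 'https://hypothetical-canvas-data.example.com/requests-00098-fictional.gz',
    }
    # Record one completed sync and one failed sync, so a later resync job has a failure to pick up.
    mock_metadata('sync_123', snapshot, 'complete', 1048576)
    mock_metadata('sync_456', snapshot, 'error', None)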
def run(self, cleanup=True):
    job_id = self.generate_job_id()
    app.logger.info(f'Starting Canvas snapshot resync job... (id={job_id})')
    md = metadata.get_failures_from_last_sync()
    if not md['failures']:
        return f"No failures found for job_id {md['job_id']}, skipping resync."
    app.logger.info(f"Found {len(md['failures'])} failures for job_id {md['job_id']}, attempting resync.")
    failures = 0
    successes = 0

    for failure in md['failures']:
        # Optionally clean up any partially synced object left behind by the failed job.
        if cleanup and failure['destination_url']:
            destination_key = failure['destination_url'].split(app.config['LOCH_S3_BUCKET'] + '/')[1]
            if s3.delete_objects([destination_key]):
                metadata.delete_canvas_snapshots([destination_key])
            else:
                app.logger.error(f"Could not delete failed snapshot from S3 (url={failure['destination_url']})")
        metadata.create_canvas_sync_status(
            job_id=job_id,
            filename=failure['filename'],
            canvas_table=failure['canvas_table'],
            # The original signed source URL will remain valid if the resync job is run within an hour of the sync job.
            # TODO Add logic to fetch a new signed URL from the Canvas Data API for older jobs.
            source_url=failure['source_url'],
        )

        # Regenerate the S3 key, since the failed job may not have progressed far enough to store a
        # destination URL in its metadata.
        if failure['canvas_table'] == 'requests':
            key_components = [berkeley.s3_canvas_data_path_current_term(), failure['canvas_table'], failure['filename']]
        else:
            key_components = [get_s3_canvas_daily_path(), failure['canvas_table'], failure['filename']]
        key = '/'.join(key_components)

        response = dispatch('sync_file_to_s3', data={'canvas_sync_job_id': job_id, 'url': failure['source_url'], 'key': key})
        if not response:
            app.logger.error('Failed to dispatch S3 resync of snapshot ' + failure['filename'])
            metadata.update_canvas_sync_status(job_id, key, 'error', details=f'Failed to dispatch: {response}')
            failures += 1
        else:
            app.logger.info('Dispatched S3 resync of snapshot ' + failure['filename'])
            successes += 1

    return f'Canvas snapshot resync job dispatched to workers ({successes} successful dispatches, {failures} failures).'
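# A quick standalone illustration, with made-up values, of the destination_url -> destination_key split used
# in the cleanup branch above. It assumes app.config['LOCH_S3_BUCKET'] holds the bare bucket name and that
# destination_url is the full s3:// URL recorded by the earlier sync job; 'mock-bucket' and the key path here
# are stand-ins, not real configuration.
bucket = 'mock-bucket'  # stand-in for app.config['LOCH_S3_BUCKET']
destination_url = 's3://mock-bucket/canvas/sonnet_submission_dim/sonnet-xlv.txt'
destination_key = destination_url.split(bucket + '/')[1]
assert destination_key == 'canvas/sonnet_submission_dim/sonnet-xlv.txt'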
def run(self, cleanup=True):
    job_id = self.generate_job_id()
    app.logger.info(f'Starting Canvas snapshot sync job... (id={job_id})')
    snapshot_response = canvas_data.get_snapshots()
    if not snapshot_response:
        raise BackgroundJobError('Error retrieving Canvas data snapshots, aborting job.')
    snapshots = snapshot_response.get('files', [])

    def should_sync(snapshot):
        return snapshot['table'] == 'requests' and snapshot['partial'] is False

    snapshots_to_sync = [s for s in snapshots if should_sync(s)]
    app.logger.info(f'Will sync {len(snapshots_to_sync)} of {len(snapshots)} available files from Canvas Data.')

    success = 0
    failure = 0

    for snapshot in snapshots_to_sync:
        metadata.create_canvas_sync_status(
            job_id=job_id,
            filename=snapshot['filename'],
            canvas_table=snapshot['table'],
            source_url=snapshot['url'],
        )
        key_components = [app.config['LOCH_S3_CANVAS_DATA_PATH_HISTORICAL'], snapshot['table'], snapshot['filename']]
        key = '/'.join(key_components)
        response = dispatch('sync_file_to_s3', data={'canvas_sync_job_id': job_id, 'url': snapshot['url'], 'key': key})
        if not response:
            app.logger.error('Failed to dispatch S3 sync of snapshot ' + snapshot['filename'])
            metadata.update_canvas_sync_status(job_id, key, 'error', details=f'Failed to dispatch: {response}')
            failure += 1
        else:
            app.logger.info('Dispatched S3 sync of snapshot ' + snapshot['filename'])
            success += 1

    if cleanup:
        app.logger.info('Will remove obsolete snapshots from S3.')
        current_snapshot_filenames = [s['filename'] for s in snapshots_to_sync]
        requests_prefix = app.config['LOCH_S3_CANVAS_DATA_PATH_HISTORICAL'] + '/requests'
        delete_result = s3.delete_objects_with_prefix(requests_prefix, whitelist=current_snapshot_filenames)
        if not delete_result:
            app.logger.error('Cleanup of obsolete snapshots failed.')

    return f'Canvas snapshot sync job dispatched to workers ({success} successful dispatches, {failure} failures).'
def run(self, cleanup=True):
    job_id = self.generate_job_id()
    app.logger.info(f'Starting Canvas snapshot sync job... (id={job_id})')
    snapshot_response = canvas_data.get_snapshots()
    if not snapshot_response:
        raise BackgroundJobError('Error retrieving Canvas data snapshots, aborting job.')
    snapshots = snapshot_response.get('files', [])

    def should_sync(snapshot):
        # For tables other than requests, sync all snapshots.
        # For the requests table, sync snapshots that are partial or later than the configured cutoff date.
        def after_cutoff_date(url):
            match = re.search(r'requests/(20\d{6})', url)
            return match is not None and (match[1] >= app.config['LOCH_CANVAS_DATA_REQUESTS_CUTOFF_DATE'])
        return snapshot['table'] != 'requests' or snapshot['partial'] is True or after_cutoff_date(snapshot['url'])

    snapshots_to_sync = [s for s in snapshots if should_sync(s)]
    app.logger.info(f'Will sync {len(snapshots_to_sync)} of {len(snapshots)} available files from Canvas Data.')

    success = 0
    failure = 0

    for snapshot in snapshots_to_sync:
        metadata.create_canvas_sync_status(
            job_id=job_id,
            filename=snapshot['filename'],
            canvas_table=snapshot['table'],
            source_url=snapshot['url'],
        )
        if snapshot['table'] == 'requests':
            key_components = [berkeley.s3_canvas_data_path_current_term(), snapshot['table'], snapshot['filename']]
        else:
            key_components = [get_s3_canvas_daily_path(), snapshot['table'], snapshot['filename']]
        key = '/'.join(key_components)
        response = dispatch('sync_file_to_s3', data={'canvas_sync_job_id': job_id, 'url': snapshot['url'], 'key': key})
        if not response:
            app.logger.error('Failed to dispatch S3 sync of snapshot ' + snapshot['filename'])
            metadata.update_canvas_sync_status(job_id, key, 'error', details=f'Failed to dispatch: {response}')
            failure += 1
        else:
            app.logger.info('Dispatched S3 sync of snapshot ' + snapshot['filename'])
            success += 1

    if cleanup:
        app.logger.info('Will remove obsolete snapshots from S3.')
        current_snapshot_filenames = [s['filename'] for s in snapshots_to_sync]
        requests_prefix = berkeley.s3_canvas_data_path_current_term() + '/requests'
        delete_result = s3.delete_objects_with_prefix(requests_prefix, whitelist=current_snapshot_filenames)
        if not delete_result:
            app.logger.error('Cleanup of obsolete snapshots failed.')

    return f'Canvas snapshot sync job dispatched to workers ({success} successful dispatches, {failure} failures).'
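# A standalone sketch of the cutoff-date check in should_sync above, using a made-up cutoff value in place of
# app.config['LOCH_CANVAS_DATA_REQUESTS_CUTOFF_DATE'] and invented example URLs. Requests snapshot URLs embed a
# YYYYMMDD date after 'requests/', so a lexicographic comparison on the captured digits is sufficient.
import re

CUTOFF_DATE = '20180901'  # hypothetical config value

def after_cutoff_date(url):
    match = re.search(r'requests/(20\d{6})', url)
    return match is not None and match[1] >= CUTOFF_DATE

assert after_cutoff_date('https://example.com/requests/20181015/part-00001.gz') is True
assert after_cutoff_date('https://example.com/requests/20180515/part-00001.gz') is False
# Non-requests URLs never match, so they fall through to the other should_sync conditions.
assert after_cutoff_date('https://example.com/account_dim/part-00001.gz') is False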