Example #1
    def test_canvas_sync_metadata(self, app, metadata_db):
        """When given a job id, updates metadata on file sync."""
        url = 'http://shakespeare.mit.edu/Poetry/sonnet.XLV.html'
        key = 'canvas/sonnet_submission_dim/sonnet-xlv.txt'

        with mock_s3(app):
            with open(_get_fixtures_path() + '/sonnet_xlv.html', 'r') as file:
                responses.add(responses.GET,
                              url,
                              body=file.read(),
                              headers={'Content-Length': '767'})

            # Run two successive sync jobs on the same file. The first succeeds, the second is skipped as
            # a duplicate.
            metadata.create_canvas_sync_status('job_1', 'sonnet-xlv.txt',
                                               'sonnet_submission_dim', url)
            result = SyncFileToS3().run(url=url,
                                        key=key,
                                        canvas_sync_job_id='job_1')
            assert result is True
            metadata.create_canvas_sync_status('job_2', 'sonnet-xlv.txt',
                                               'sonnet_submission_dim', url)
            result = SyncFileToS3().run(url=url,
                                        key=key,
                                        canvas_sync_job_id='job_2')
            assert result is False

            schema = app.config['REDSHIFT_SCHEMA_METADATA']
            sync_metadata = redshift.fetch(
                f'SELECT * FROM {schema}.canvas_sync_job_status')
            snapshot_metadata = redshift.fetch(
                f'SELECT * FROM {schema}.canvas_synced_snapshots')

            assert len(sync_metadata) == 2
            assert sync_metadata[0]['job_id'] == 'job_1'
            assert sync_metadata[0][
                'destination_url'] == 's3://mock-bucket/canvas/sonnet_submission_dim/sonnet-xlv.txt'
            assert sync_metadata[0]['status'] == 'complete'
            assert sync_metadata[0]['source_size'] == 767
            assert sync_metadata[0]['destination_size'] == 767
            assert sync_metadata[0]['updated_at'] > sync_metadata[0][
                'created_at']
            assert sync_metadata[1]['job_id'] == 'job_2'
            assert sync_metadata[1][
                'destination_url'] == 's3://mock-bucket/canvas/sonnet_submission_dim/sonnet-xlv.txt'
            assert sync_metadata[1]['status'] == 'duplicate'
            assert sync_metadata[1]['source_size'] is None
            assert sync_metadata[1]['destination_size'] is None
            assert sync_metadata[1]['updated_at'] > sync_metadata[1][
                'created_at']

            assert len(snapshot_metadata) == 1
            assert snapshot_metadata[0]['filename'] == 'sonnet-xlv.txt'
            assert snapshot_metadata[0][
                'canvas_table'] == 'sonnet_submission_dim'
            assert snapshot_metadata[0][
                'url'] == 's3://mock-bucket/canvas/sonnet_submission_dim/sonnet-xlv.txt'
            assert snapshot_metadata[0]['size'] == 767
            assert snapshot_metadata[0]['created_at']
            assert snapshot_metadata[0]['deleted_at'] is None
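The test in Example #1 depends on a mock_s3 test helper and the responses library to stub out S3 and the snapshot download; neither helper is shown in this listing. As a rough, hypothetical sketch only (assuming the moto library and the LOCH_S3_BUCKET config key that appears in Example #3), such a helper might look like:

import contextlib

import boto3
import moto


@contextlib.contextmanager
def mock_s3(app):
    # Hypothetical sketch: start an in-memory S3 and pre-create the bucket
    # the app is configured to write to, so SyncFileToS3 has somewhere to go.
    with moto.mock_s3():
        bucket = app.config['LOCH_S3_BUCKET']  # e.g. 'mock-bucket' in the assertions above
        boto3.resource('s3', region_name='us-east-1').create_bucket(Bucket=bucket)
        yield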
Example #2
def mock_metadata(job_id, snapshot, status, destination_size):
    metadata.create_canvas_sync_status(job_id, snapshot['filename'],
                                       snapshot['table'], snapshot['url'])
    key = '/'.join([
        get_s3_canvas_daily_path(), snapshot['table'], snapshot['filename']
    ])
    metadata.update_canvas_sync_status(
        job_id,
        key,
        status,
        source_size=1048576,
        destination_size=destination_size)
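A brief usage note: mock_metadata expects a snapshot dict with 'filename', 'table', and 'url' keys, mirroring the snapshot entries handled by the sync jobs below. A hypothetical call sequence (all values invented for illustration) that records one completed sync and one errored sync:

snapshot = {
    'filename': 'requests-00098-record.gz',
    'table': 'requests',
    'url': 'https://example.com/requests-00098-record.gz',
}
# Completed sync: destination size matches the 1 MB source size.
mock_metadata('job_1', snapshot, 'complete', destination_size=1048576)
# Errored sync: destination size does not match the source size.
mock_metadata('job_2', snapshot, 'error', destination_size=512000)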
Example #3
    def run(self, cleanup=True):
        job_id = self.generate_job_id()
        app.logger.info(f'Starting Canvas snapshot resync job... (id={job_id})')
        md = metadata.get_failures_from_last_sync()
        if not md['failures']:
            return f"No failures found for job_id {md['job_id']}, skipping resync."
        app.logger.info(f"Found {len(md['failures'])} failures for job_id {md['job_id']}, attempting resync.")

        failures = 0
        successes = 0

        for failure in md['failures']:
            if cleanup and failure['destination_url']:
                destination_key = failure['destination_url'].split(app.config['LOCH_S3_BUCKET'] + '/')[1]
                if s3.delete_objects([destination_key]):
                    metadata.delete_canvas_snapshots([destination_key])
                else:
                    app.logger.error(f'Could not delete failed snapshot from S3 (url={failure["destination_url"]})')
            metadata.create_canvas_sync_status(
                job_id=job_id,
                filename=failure['filename'],
                canvas_table=failure['canvas_table'],
                # The original signed source URL will remain valid if the resync job is run within an hour of the sync job.
                # TODO Add logic to fetch a new signed URL from the Canvas Data API for older jobs.
                source_url=failure['source_url'],
            )

            # Regenerate the S3 key, since the failed job may not have progressed far enough to store a destination URL in its metadata.
            if failure['canvas_table'] == 'requests':
                key_components = [berkeley.s3_canvas_data_path_current_term(), failure['canvas_table'], failure['filename']]
            else:
                key_components = [get_s3_canvas_daily_path(), failure['canvas_table'], failure['filename']]
            key = '/'.join(key_components)
            response = dispatch('sync_file_to_s3', data={'canvas_sync_job_id': job_id, 'url': failure['source_url'], 'key': key})

            if not response:
                app.logger.error('Failed to dispatch S3 resync of snapshot ' + failure['filename'])
                metadata.update_canvas_sync_status(job_id, key, 'error', details=f'Failed to dispatch: {response}')
                failures += 1
            else:
                app.logger.info('Dispatched S3 resync of snapshot ' + failure['filename'])
                successes += 1

        return f'Canvas snapshot resync job dispatched to workers ({successes} successful dispatches, {failures} failures).'
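One detail of the cleanup branch above: the S3 key is recovered by splitting the stored destination URL on the bucket name. A tiny illustration, using the bucket and key from the test in Example #1:

destination_url = 's3://mock-bucket/canvas/sonnet_submission_dim/sonnet-xlv.txt'
bucket = 'mock-bucket'
# Everything after '<bucket>/' is the object key.
destination_key = destination_url.split(bucket + '/')[1]
assert destination_key == 'canvas/sonnet_submission_dim/sonnet-xlv.txt'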
Example #4
    def run(self, cleanup=True):
        job_id = self.generate_job_id()
        app.logger.info(f'Starting Canvas snapshot sync job... (id={job_id})')

        snapshot_response = canvas_data.get_snapshots()
        if not snapshot_response:
            raise BackgroundJobError(
                'Error retrieving Canvas data snapshots, aborting job.')
        snapshots = snapshot_response.get('files', [])

        def should_sync(snapshot):
            # This variant syncs only complete (non-partial) snapshots of the requests table.
            return snapshot['table'] == 'requests' and snapshot['partial'] is False

        snapshots_to_sync = [s for s in snapshots if should_sync(s)]
        app.logger.info(
            f'Will sync {len(snapshots_to_sync)} of {len(snapshots)} available files from Canvas Data.'
        )

        success = 0
        failure = 0

        for snapshot in snapshots_to_sync:
            metadata.create_canvas_sync_status(
                job_id=job_id,
                filename=snapshot['filename'],
                canvas_table=snapshot['table'],
                source_url=snapshot['url'],
            )

            key_components = [
                app.config['LOCH_S3_CANVAS_DATA_PATH_HISTORICAL'],
                snapshot['table'], snapshot['filename']
            ]

            key = '/'.join(key_components)
            response = dispatch('sync_file_to_s3',
                                data={
                                    'canvas_sync_job_id': job_id,
                                    'url': snapshot['url'],
                                    'key': key
                                })

            if not response:
                app.logger.error('Failed to dispatch S3 sync of snapshot ' +
                                 snapshot['filename'])
                metadata.update_canvas_sync_status(
                    job_id,
                    key,
                    'error',
                    details=f'Failed to dispatch: {response}')
                failure += 1
            else:
                app.logger.info('Dispatched S3 sync of snapshot ' +
                                snapshot['filename'])
                success += 1

        if cleanup:
            app.logger.info('Will remove obsolete snapshots from S3.')
            current_snapshot_filenames = [
                s['filename'] for s in snapshots_to_sync
            ]
            requests_prefix = app.config[
                'LOCH_S3_CANVAS_DATA_PATH_HISTORICAL'] + '/requests'
            delete_result = s3.delete_objects_with_prefix(
                requests_prefix, whitelist=current_snapshot_filenames)
            if not delete_result:
                app.logger.error('Cleanup of obsolete snapshots failed.')
        return f'Canvas snapshot sync job dispatched to workers ({success} successful dispatches, {failure} failures).'
Example #5
    def run(self, cleanup=True):
        job_id = self.generate_job_id()
        app.logger.info(f'Starting Canvas snapshot sync job... (id={job_id})')

        snapshot_response = canvas_data.get_snapshots()
        if not snapshot_response:
            raise BackgroundJobError(
                'Error retrieving Canvas data snapshots, aborting job.')
        snapshots = snapshot_response.get('files', [])

        def should_sync(snapshot):
            # For tables other than requests, sync all snapshots.
            # For the requests table, sync snapshots that are partial or later
            # than the configured cutoff date.
            def after_cutoff_date(url):
                match = re.search(r'requests/(20\d{6})', url)
                return match is not None and (
                    match[1] >= app.config['LOCH_CANVAS_DATA_REQUESTS_CUTOFF_DATE'])

            return (snapshot['table'] != 'requests'
                    or snapshot['partial'] is True
                    or after_cutoff_date(snapshot['url']))

        snapshots_to_sync = [s for s in snapshots if should_sync(s)]
        app.logger.info(
            f'Will sync {len(snapshots_to_sync)} of {len(snapshots)} available files from Canvas Data.'
        )

        success = 0
        failure = 0

        for snapshot in snapshots_to_sync:
            metadata.create_canvas_sync_status(
                job_id=job_id,
                filename=snapshot['filename'],
                canvas_table=snapshot['table'],
                source_url=snapshot['url'],
            )
            if snapshot['table'] == 'requests':
                key_components = [
                    berkeley.s3_canvas_data_path_current_term(),
                    snapshot['table'], snapshot['filename']
                ]
            else:
                key_components = [
                    get_s3_canvas_daily_path(), snapshot['table'],
                    snapshot['filename']
                ]

            key = '/'.join(key_components)
            response = dispatch('sync_file_to_s3',
                                data={
                                    'canvas_sync_job_id': job_id,
                                    'url': snapshot['url'],
                                    'key': key
                                })

            if not response:
                app.logger.error('Failed to dispatch S3 sync of snapshot ' +
                                 snapshot['filename'])
                metadata.update_canvas_sync_status(
                    job_id,
                    key,
                    'error',
                    details=f'Failed to dispatch: {response}')
                failure += 1
            else:
                app.logger.info('Dispatched S3 sync of snapshot ' +
                                snapshot['filename'])
                success += 1

        if cleanup:
            app.logger.info('Will remove obsolete snapshots from S3.')
            current_snapshot_filenames = [
                s['filename'] for s in snapshots_to_sync
            ]
            requests_prefix = berkeley.s3_canvas_data_path_current_term() + '/requests'
            delete_result = s3.delete_objects_with_prefix(
                requests_prefix, whitelist=current_snapshot_filenames)
            if not delete_result:
                app.logger.error('Cleanup of obsolete snapshots failed.')
        return f'Canvas snapshot sync job dispatched to workers ({success} successful dispatches, {failure} failures).'
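The requests cutoff check in Example #5 extracts an eight-digit YYYYMMDD date from the snapshot URL and compares it to the configured cutoff as a string; because both values share the fixed YYYYMMDD layout, lexical comparison matches chronological order. A standalone sketch with hypothetical URLs and cutoff values:

import re


def after_cutoff_date(url, cutoff_date):
    # Pull the YYYYMMDD segment out of a requests snapshot URL,
    # e.g. '.../requests/20180523/part-00000.gz', and compare to the cutoff.
    match = re.search(r'requests/(20\d{6})', url)
    return match is not None and match[1] >= cutoff_date


assert after_cutoff_date('https://example.com/requests/20180523/part-00000.gz', '20180101') is True
assert after_cutoff_date('https://example.com/requests/20171231/part-00000.gz', '20180101') is False
assert after_cutoff_date('https://example.com/other_table/20180523.gz', '20180101') is False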