Example #1
    def test_remove_obsolete_files(self, app, caplog, cleanup_s3):
        """Removes files from S3 following prefix and whitelist rules."""
        caplog.set_level(logging.INFO)
        with capture_app_logs(app):
            prefix1 = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/001'
            prefix2 = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/002'

            assert s3.upload_from_url(
                'http://shakespeare.mit.edu/Poetry/sonnet.XX.html',
                prefix1 + '/xx/sonnet-xx.html')
            assert s3.upload_from_url(
                'http://shakespeare.mit.edu/Poetry/sonnet.XXI.html',
                prefix1 + '/xxi/sonnet-xxi.html')
            assert s3.upload_from_url(
                'http://shakespeare.mit.edu/Poetry/sonnet.XXII.html',
                prefix1 + '/xxii/sonnet-xxii.html')
            assert s3.upload_from_url(
                'http://shakespeare.mit.edu/Poetry/sonnet.XLV.html',
                prefix2 + '/xlv/sonnet-xlv.html')

            whitelist = ['sonnet-xxi.html', 'sonnet-xxii.html']
            assert s3.delete_objects_with_prefix(prefix1, whitelist) is True

            assert f'3 key(s) matching prefix "{prefix1}"' in caplog.text
            assert '2 key(s) in whitelist' in caplog.text
            assert 'will delete 1 object(s)' in caplog.text

            assert s3.object_exists(prefix1 + '/xx/sonnet-xx.html') is False
            assert s3.object_exists(prefix1 + '/xxi/sonnet-xxi.html') is True
            assert s3.object_exists(prefix1 + '/xxii/sonnet-xxii.html') is True
            assert s3.object_exists(prefix2 + '/xlv/sonnet-xlv.html') is True
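
The delete helper itself is not shown in this listing. A minimal sketch of what delete_objects_with_prefix could look like on top of boto3 (the bucket name, the use of print instead of the app logger, and the exact log wording are assumptions inferred from the test's assertions):

import boto3

def delete_objects_with_prefix(prefix, whitelist=()):
    """Delete every object under the prefix whose file name is not whitelisted."""
    client = boto3.client('s3')
    bucket = 'my-loch-bucket'  # hypothetical; the app reads it from LOCH_S3_BUCKET
    paginator = client.get_paginator('list_objects_v2')
    matched = 0
    to_delete = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get('Contents', []):
            matched += 1
            if obj['Key'].split('/')[-1] not in whitelist:
                to_delete.append({'Key': obj['Key']})
    print(f'{matched} key(s) matching prefix "{prefix}", {len(whitelist)} key(s) in whitelist; '
          f'will delete {len(to_delete)} object(s)')
    # delete_objects accepts at most 1000 keys per call
    for i in range(0, len(to_delete), 1000):
        client.delete_objects(Bucket=bucket, Delete={'Objects': to_delete[i:i + 1000]})
    return True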
Example #2
    def test_file_upload_and_delete(self, app, cleanup_s3):
        """Can upload and delete files in S3."""
        url1 = 'http://shakespeare.mit.edu/Poetry/sonnet.XLV.html'
        key1 = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00001/sonnet-xlv.html'

        url2 = 'http://shakespeare.mit.edu/Poetry/sonnet.LXII.html'
        key2 = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00002/sonnet-lxii.html'

        assert s3.object_exists(key1) is False
        assert s3.upload_from_url(url1, key1)['ContentLength'] == 767
        assert s3.object_exists(key1) is True
        assert s3.get_keys_with_prefix(app.config['LOCH_S3_PREFIX_TESTEXT'] +
                                       '/00001') == [key1]

        assert s3.object_exists(key2) is False
        assert s3.upload_from_url(url2, key2)['ContentLength'] == 743
        assert s3.object_exists(key2) is True
        assert s3.get_keys_with_prefix(app.config['LOCH_S3_PREFIX_TESTEXT'] +
                                       '/00002') == [key2]

        client = s3.get_client()
        contents1 = client.get_object(Bucket=app.config['LOCH_S3_BUCKET'],
                                      Key=key1)['Body'].read().decode('utf-8')
        assert 'These present-absent with swift motion slide' in contents1
        contents2 = client.get_object(Bucket=app.config['LOCH_S3_BUCKET'],
                                      Key=key2)['Body'].read().decode('utf-8')
        assert 'Beated and chopp\'d with tann\'d antiquity' in contents2
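
The tests above rely on upload_from_url streaming a remote file into S3 and returning the new object's metadata. A minimal sketch under that assumption (the bucket name is a placeholder; the on_stream_opened hook mirrors the callback used in Example #7):

import boto3
import requests

def upload_from_url(url, key, on_stream_opened=None):
    """Stream the response body of a URL straight into an S3 object."""
    client = boto3.client('s3')
    bucket = 'my-loch-bucket'  # hypothetical
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()
        if on_stream_opened:
            on_stream_opened(response.headers)
        client.upload_fileobj(response.raw, bucket, key)
    # head_object supplies the ContentLength the tests assert on
    return client.head_object(Bucket=bucket, Key=key)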
Example #3
 def test_s3_nonexistent_object(self, app, caplog, bad_bucket):
     """Returns false on S3 checks for nonexistent objects."""
     with capture_app_logs(app):
         key = app.config[
             'LOCH_S3_PREFIX_TESTEXT'] + '/00001/sonnet-xlv.html'
         response = s3.object_exists(key)
         assert response is False
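
object_exists is the call every example here exercises. A plausible sketch with boto3, assuming the helper swallows the ClientError that head_object raises for missing keys:

import boto3
from botocore.exceptions import ClientError

def object_exists(key):
    """Return True if the key is present in the bucket, False otherwise."""
    client = boto3.client('s3')
    try:
        client.head_object(Bucket='my-loch-bucket', Key=key)  # hypothetical bucket
        return True
    except ClientError:
        # head_object raises ClientError (404/403) when the object is absent
        return False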
Example #4
 def transform(self, s3_source, s3_dest, job_id):
     objects = s3.get_keys_with_prefix(s3_source)
     if len(objects) == 0:
         message = f'Zero objects found in {s3_source}. Quitting.'
         app.logger.info(message)
         return message
     app.logger.info(f'Will transform {len(objects)} objects from {s3_source} and put results to {s3_dest}.')
     objects_updated = 0
     new_objects = 0
     objects_in_error = 0
     total_objects = 0
     for o in objects:
         file_name = o.split('/')[-1]
         app.logger.debug(f'processing {file_name}')
         # file_name is like 'daily_2020-08-14.zip'
         piazza_zip_file = s3.get_object_compressed_text_reader(o)
         for subfile in piazza_zip_file.namelist():
             if subfile.endswith('.json'):
                 try:
                     json_file = subfile.split('/')[-1]
                     course_id = subfile.split('/')[-2]
                     file_type = json_file.split('_')[0]
                     record = piazza_zip_file.read(subfile)
                     with tempfile.TemporaryFile() as result:
                         s3_object = f'{s3_dest}/{file_type}/{course_id}/{json_file}'
                         if s3.object_exists(s3_object):
                             objects_updated += 1
                         else:
                             new_objects += 1
                         result.write(record)
                         s3.upload_file(result, s3_object)
                         total_objects += 1
                     # update job queue every 1000 files...
                     if total_objects % 1000 == 0:
                         message = f'{subfile}, {total_objects} so far; ' \
                                   + f'{new_objects} new files; ' \
                                   + f'{objects_updated} existing files; ' \
                                   + f'{objects_in_error} files in error ' \
                                   + f'({len(objects)} objects in all)'
                         update_background_job_status(job_id, 'transforming', details=message)
                 except Exception as e:
                     app.logger.error(f'could not extract {subfile}')
                     app.logger.error(e)
                     objects_in_error += 1
             else:
                 # not a json file, so we skip it
                 continue
     message = f'Transformed {len(objects)} input files; created {new_objects} new objects; ' \
               + f'updated {objects_updated} existing objects; {objects_in_error} objects in error.'
     app.logger.info(message)
     return message
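
Example #4 reads each zip member through get_object_compressed_text_reader. A minimal sketch of that helper, assuming it downloads the object and wraps the bytes in a ZipFile so the caller can use namelist() and read():

import io
import zipfile
import boto3

def get_object_compressed_text_reader(key):
    """Fetch a zipped S3 object and expose it as a ZipFile."""
    client = boto3.client('s3')
    body = client.get_object(Bucket='my-loch-bucket', Key=key)['Body'].read()  # hypothetical bucket
    return zipfile.ZipFile(io.BytesIO(body))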
Example #5
 def test_canvas_sync_metadata(self, app, metadata_db):
     """Makes an API call and puts the result in S3."""
     with mock_s3(app):
         bucket = app.config['LOCH_S3_BUCKET']
         path = '/api/v1/audit/grade_change/courses/7654321'
         s3_key = f'{bucket}/grade_change_log/grade_change_log_7654321'
         result = ImportCanvasApiData().run_wrapped(
             course_id='7654321',
             path=path,
             s3_key=s3_key,
             job_id='ImportCanvasGradeChangeLog_123',
         )
         assert result is True
         assert s3.object_exists(f'{s3_key}_0.json') is True
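
The mock_s3 context manager keeps this test offline. Its real implementation is not shown here; a sketch of how such a fixture is commonly built with the moto library (region and bucket setup are assumptions):

import boto3
from contextlib import contextmanager
from moto import mock_aws

@contextmanager
def mock_s3(app):
    """Run the enclosed block against moto's in-memory S3 instead of AWS."""
    with mock_aws():
        s3_client = boto3.client('s3', region_name='us-east-1')
        s3_client.create_bucket(Bucket=app.config['LOCH_S3_BUCKET'])
        yield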
Example #6
 def transform(self, s3_source, s3_dest, key=None):
     objects = s3.get_keys_with_prefix(s3_source)
     app.logger.info(
         f'Will transform {len(objects)} objects from {s3_source} and put results to {s3_dest}.'
     )
     skip_count = 0
     for o in objects:
         file_name = o.split('/')[-1]
         if s3.object_exists(f'{s3_dest}/{file_name}'):
             skip_count += 1
             continue
         canvas_api_data = s3.get_object_json(o)
         if key:
             canvas_api_data = canvas_api_data.get(key)
         with tempfile.TemporaryFile() as result:
             course_id = int(file_name.split('_')[-2])
             for record in canvas_api_data:
                 record['course_id'] = course_id
                 result.write(json.dumps(record).encode() + b'\n')
             s3.upload_file(result, f'{s3_dest}/{file_name}')
     app.logger.info(
         f'Transformed {len(objects) - skip_count} new objects; skipped {skip_count} existing objects.'
     )
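
Both transform methods in this listing write into a TemporaryFile and then hand it to s3.upload_file while the file position still sits at the end of the data. For that pattern to work, the helper has to rewind before reading; a sketch under that assumption:

import boto3

def upload_file(file_obj, key):
    """Rewind a file object written by the caller, then upload its contents."""
    file_obj.seek(0)  # the transforms write first, so rewind before the upload reads
    boto3.client('s3').upload_fileobj(file_obj, 'my-loch-bucket', key)  # hypothetical bucket
    return True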
Example #7
    def run(self, url, key, canvas_sync_job_id=None):
        if canvas_sync_job_id:
            update_canvas_sync_status(canvas_sync_job_id, key, 'started')
        if s3.object_exists(key):
            app.logger.info(f'Key {key} exists, skipping upload')
            if canvas_sync_job_id:
                update_canvas_sync_status(canvas_sync_job_id, key, 'duplicate')
            return False
        else:
            app.logger.info(f'Key {key} does not exist, starting upload')
            try:

                def update_streaming_status(headers):
                    update_canvas_sync_status(
                        canvas_sync_job_id,
                        key,
                        'streaming',
                        source_size=headers.get('Content-Length'))

                response = s3.upload_from_url(
                    url, key, on_stream_opened=update_streaming_status)
                if response and canvas_sync_job_id:
                    destination_size = response.get('ContentLength')
                    update_canvas_sync_status(
                        canvas_sync_job_id,
                        key,
                        'complete',
                        destination_size=destination_size)
                    create_canvas_snapshot(key, size=destination_size)
                return True
            except (ClientError, ConnectionError, ValueError) as e:
                if canvas_sync_job_id:
                    update_canvas_sync_status(canvas_sync_job_id,
                                              key,
                                              'error',
                                              details=str(e))
                return False
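
A hypothetical invocation of the sync job above; the class name, URL, key, and job id are placeholders, not names confirmed by this listing:

job = CanvasSyncFileJob()  # assumed class for the run() method shown above
ok = job.run(
    url='https://example.instructure.com/files/1234/download',
    key='canvas/daily/requests-00000.gz',
    canvas_sync_job_id='sync_123',
)
# True on a fresh upload; False when the key already exists or the transfer fails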