def test_old_submission_yaml(app, admin_idx):
    """
    Test we can validate against the old submission schema (for use when importing)

    :return:
    """
    test_dir = os.path.dirname(os.path.realpath(__file__))

    submission = HEPSubmission(publication_recid=12345,
                               overall_status='todo',
                               version=1)
    db.session.add(submission)
    db.session.commit()

    submission_dir = os.path.join(test_dir, 'test_data/test_v0_submission')
    submission_yaml = os.path.join(submission_dir, 'submission.yaml')

    # Validating v0 data against the current schema is expected to fail
    # with a single cmenergies error.
    errors = process_submission_directory(submission_dir, submission_yaml, 12345)
    assert 'submission.yaml' in errors
    reported = errors['submission.yaml']
    assert len(reported) == 1
    assert reported[0]['level'] == 'error'
    assert reported[0]['message'].decode().startswith(
        "Invalid value (in GeV) for cmenergies: 1.383-1.481")

    # Validation succeeds once the old schema is explicitly requested.
    errors = process_submission_directory(submission_dir, submission_yaml, 12345,
                                          old_submission_schema=True)
    assert errors == {}
def test_update_record_info(app):
    """Test update of publication information from INSPIRE."""
    # A missing Inspire ID short-circuits before any lookup.
    assert update_record_info(None) == 'Inspire ID is None'  # case where Inspire ID is None

    for inspire_id in ('1311487', '19999999'):  # check both a valid and invalid Inspire ID
        # No HEPSubmission exists yet for this Inspire ID.
        assert update_record_info(inspire_id) == 'No HEPData submission'  # before creation of HEPSubmission object
        submission = process_submission_payload(
            inspire_id=inspire_id,
            submitter_id=1,
            reviewer={'name': 'Reviewer', 'email': '*****@*****.**'},
            uploader={'name': 'Uploader', 'email': '*****@*****.**'},
            send_upload_email=False)

        # Process the files to create DataSubmission tables in the DB.
        base_dir = os.path.dirname(os.path.realpath(__file__))
        directory = os.path.join(base_dir, 'test_data/test_submission')
        # Work on a copy in a temp dir so processing cannot modify the
        # checked-in test data.
        tmp_path = os.path.join(tempfile.mkdtemp(dir=CFG_TMPDIR),
                                'test_submission')
        shutil.copytree(directory, tmp_path)
        process_submission_directory(tmp_path,
                                     os.path.join(tmp_path, 'submission.yaml'),
                                     submission.publication_recid)
        do_finalise(submission.publication_recid, force_finalise=True,
                    convert=False)

        if inspire_id == '19999999':
            # The made-up Inspire ID cannot be resolved upstream.
            assert update_record_info(inspire_id) == 'Invalid Inspire ID'
        else:
            # First change the publication information to that of a different record.
            different_inspire_record_information, status = \
                get_inspire_record_information('1650066')
            assert status == 'success'
            hep_submission = get_latest_hepsubmission(inspire_id=inspire_id)
            assert hep_submission is not None
            update_record(hep_submission.publication_recid,
                          different_inspire_record_information)
            # Then can check that the update works and that a further update is not required.
            assert update_record_info(inspire_id, send_email=True) == 'Success'
            assert update_record_info(inspire_id) == 'No update needed'  # check case where information already current

        # Clean up the submission created for this Inspire ID.
        unload_submission(submission.publication_recid)
def test_create_submission(app):
    """
    Test the whole submission pipeline in loading a file, ensuring the
    HEPSubmission object is created, all the files have been added, and
    the record has been indexed.

    :return:
    """
    with app.app_context():
        # Payload processing should create the HEPSubmission record.
        record = {'inspire_id': '19999999',
                  'title': 'HEPData Testing 1',
                  'reviewer': {'name': 'Testy McTester', 'email': '*****@*****.**'},
                  'uploader': {'name': 'Testy McTester', 'email': '*****@*****.**'},
                  'message': 'This is ready',
                  'user_id': 1}
        submission = process_submission_payload(**record)
        assert submission.version == 1
        assert submission.overall_status == 'todo'

        # Upload the test submission directory.
        here = os.path.dirname(os.path.realpath(__file__))
        submission_dir = os.path.join(here, 'test_data/test_submission')
        process_submission_directory(
            submission_dir,
            os.path.join(submission_dir, 'submission.yaml'),
            submission.publication_recid)

        table_count = DataSubmission.query.filter_by(
            publication_recid=submission.publication_recid).count()
        assert table_count == 8
        assert len(submission.resources) == 4
        assert len(submission.participants) == 4

        do_finalise(submission.publication_recid, force_finalise=True)

        assert record_exists(inspire_id=record['inspire_id'])

        # The finalised record should now be in the search index.
        index_records = get_records_matching_field('inspire_id',
                                                   record['inspire_id'],
                                                   doc_type='publication')
        print(index_records)
        assert len(index_records['hits']['hits']) == 1

        publication_record = get_record_contents(submission.publication_recid)
        print(publication_record)
        assert publication_record is not None

        # Rendering context for the record can be built from the submission.
        ctx = format_submission(submission.publication_recid,
                                publication_record,
                                submission.version, 1, submission)
        assert ctx is not None
        assert ctx['version'] == 1
        assert ctx['recid'] == submission.publication_recid
def test_submission_too_big(app, mocker):
    """
    Test the right thing happens when the submission data is too big

    :return:
    """
    here = os.path.dirname(os.path.realpath(__file__))

    submission = HEPSubmission(publication_recid=12345,
                               overall_status='todo',
                               version=1)
    db.session.add(submission)
    db.session.commit()

    # Shrink the allowed conversion size so the test data exceeds it.
    mocker.patch.dict('flask.current_app.config', {'CONVERT_MAX_SIZE': 1000})

    submission_dir = os.path.join(here, 'test_data/test_submission')
    errors = process_submission_directory(
        submission_dir, os.path.join(submission_dir, 'submission.yaml'), 12345)

    assert 'Archive' in errors
    archive_errors = errors['Archive']
    assert len(archive_errors) == 1
    assert archive_errors[0]['level'] == 'error'
    assert archive_errors[0]['message'].startswith(
        "Archive is too big for conversion to other formats.")
def load_submission(self, record_information, file_base_path,
                    submission_yaml_file_location, update=False):
    """
    Load a submission's data files into the database and validate them.

    Creates (or fetches) the HEPSubmission record for the publication and
    processes the submission directory.

    :param record_information: dict containing at least a "recid" key
        identifying the publication record.
    :param file_base_path: base directory containing the submission files.
    :param submission_yaml_file_location: path to the submission.yaml file.
    :param update: whether this is an update of an existing submission.
    :return: the record id on success.
    :raises FailedSubmission: if validation of the submission fails.
    """
    admin_user_id = 1

    # consume data payload and store in db.
    get_or_create_hepsubmission(record_information["recid"], admin_user_id)

    errors = process_submission_directory(file_base_path,
                                          submission_yaml_file_location,
                                          record_information["recid"],
                                          update=update)

    # Fix: the original checked `len(errors) > 0` and then tested `if errors`
    # again — a single truthiness guard covers both.
    if errors:
        print("ERRORS ARE: ")
        print(errors)
        raise FailedSubmission(
            "Submission failed for {0}.".format(record_information["recid"]),
            errors, record_information["recid"])

    return record_information["recid"]
def load_submission(self, record_information, file_base_path,
                    submission_yaml_file_location, update=False):
    """
    :param record_information:
    :param file_base_path:
    :param files:
    :return:
    """
    # create publication record,
    # load data tables,
    # create data table records (call finalise(recid))
    admin_user_id = 1

    recid = record_information['recid']

    # consume data payload and store in db.
    get_or_create_hepsubmission(recid, admin_user_id)

    errors = process_submission_directory(
        file_base_path, submission_yaml_file_location, recid, update=update)

    if len(errors) > 0:
        print('ERRORS ARE: ')
        print(errors)

    if not errors:
        return recid

    raise FailedSubmission(
        "Submission failed for {0}.".format(recid), errors, recid)
def process_zip_archive(file, id):
    """
    Save an uploaded submission file and process its contents.

    :param file: uploaded file object (has ``.filename`` and ``.save()``) —
        presumably a Flask/Werkzeug upload; confirm against caller.
    :param id: record id used to build the save directory (note: shadows the
        ``id`` builtin).
    :return: a dict of errors keyed by check name, or the result of
        ``process_submission_directory``.
    """
    filename = secure_filename(file.filename)
    # Timestamped directory keeps each upload for a record separate.
    time_stamp = str(int(round(time.time())))
    file_save_directory = os.path.join(current_app.config['CFG_DATADIR'],
                                       str(id), time_stamp)
    if not os.path.exists(file_save_directory):
        os.makedirs(file_save_directory)

    if not filename.endswith('.oldhepdata'):
        file_path = os.path.join(file_save_directory, filename)
        file.save(file_path)
        submission_path = os.path.join(file_save_directory,
                                       remove_file_extension(filename))
        if filename.endswith('.yaml'):
            # we split the singular yaml file and create a submission directory
            error, last_updated = split_files(file_path, submission_path)
            if error:
                return {
                    "Single YAML file splitter": [{
                        "level": "error",
                        "message": str(error)
                    }]
                }
        else:
            # we are dealing with a zip, tar, etc. so we extract the contents
            extract(filename, file_path, submission_path)
        submission_found = find_file_in_directory(
            submission_path, lambda x: x == "submission.yaml")
    else:
        # Old-format .oldhepdata file: save into a dedicated subdirectory
        # for later conversion.
        file_path = os.path.join(file_save_directory, 'oldhepdata')
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        print('Saving file to {}'.format(os.path.join(file_path, filename)))
        file.save(os.path.join(file_path, filename))
        submission_path = os.path.join(file_save_directory, 'oldhepdata')
        submission_found = False

    if submission_found:
        # find_file_in_directory returned (basepath, submission.yaml path).
        basepath, submission_file_path = submission_found
    else:
        # No submission.yaml found: attempt conversion from old HEPData format.
        result = check_and_convert_from_oldhepdata(submission_path, id,
                                                   time_stamp)
        # Check for errors
        if type(result) == dict:
            return result
        else:
            basepath, submission_file_path = result

    return process_submission_directory(basepath, submission_file_path, id)
def process_zip_archive(file, id):
    """
    Save an uploaded submission file and process its contents.

    :param file: uploaded file object (has ``.filename`` and ``.save()``).
    :param id: record id used to build the save directory.
    :return: a dict of errors keyed by check name, or the result of
        ``process_submission_directory``.
    """
    filename = secure_filename(file.filename)
    time_stamp = str(int(round(time.time())))
    file_save_directory = os.path.join(current_app.config['CFG_DATADIR'],
                                       str(id), time_stamp)
    if not os.path.exists(file_save_directory):
        os.makedirs(file_save_directory)

    # NOTE: deliberately a substring (not endswith) test so that files such
    # as 'foo.oldhepdata.txt' still reach the old-format branch, where the
    # '.txt' suffix is stripped below.
    if '.oldhepdata' not in filename:
        file_path = os.path.join(file_save_directory, filename)
        file.save(file_path)
        submission_path = os.path.join(file_save_directory,
                                       remove_file_extension(filename))
        # Bug fix: previously `'yaml' in filename`, which misrouted any
        # archive merely containing 'yaml' in its name (e.g.
        # 'yaml_tables.zip') to the single-YAML splitter. Only genuine
        # .yaml files should be split.
        if filename.endswith('.yaml'):
            # we split the singular yaml file and create a submission directory
            split_files(file_path, submission_path)
        else:
            # we are dealing with a zip, tar, etc. so we extract the contents
            extract(filename, file_path, submission_path)
        submission_found = find_file_in_directory(
            submission_path, lambda x: x == "submission.yaml")
    else:
        file_path = os.path.join(file_save_directory, 'oldhepdata')
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        # Drop a trailing '.txt' so the saved file keeps the .oldhepdata name.
        if filename.endswith('.txt'):
            filename = filename.replace(".txt", "")
        print('Saving file to {}'.format(os.path.join(file_path, filename)))
        file.save(os.path.join(file_path, filename))
        submission_path = os.path.join(file_save_directory, 'oldhepdata')
        submission_found = False

    if submission_found:
        basepath, submission_file_path = submission_found
    else:
        # No submission.yaml found: attempt conversion from old HEPData format.
        result = check_and_convert_from_oldhepdata(submission_path, id,
                                                   time_stamp)
        # Check for errors
        if type(result) == dict:
            return result
        else:
            basepath, submission_file_path = result

    return process_submission_directory(basepath, submission_file_path, id)
def test_invalid_submission_yaml(app, admin_idx):
    """
    Test the right thing happens when the submission.yaml is invalid

    :return:
    """
    here = os.path.dirname(os.path.realpath(__file__))
    bad_dir = os.path.join(here, 'test_data/test_invalid_submission_file')

    errors = process_submission_directory(
        bad_dir, os.path.join(bad_dir, 'submission.yaml'), 12345)

    # A single parse error should be reported against submission.yaml.
    assert 'submission.yaml' in errors
    reported = errors['submission.yaml']
    assert len(reported) == 1
    assert reported[0]['level'] == 'error'
    assert reported[0]['message'].startswith(
        "There was a problem parsing the file")
def test_duplicate_table_names(app):
    """
    Test that an error is returned for a submission.yaml file with
    duplicate table names.
    """
    here = os.path.dirname(os.path.realpath(__file__))

    submission = HEPSubmission(publication_recid=12345,
                               overall_status='todo',
                               version=1)
    db.session.add(submission)
    db.session.commit()

    dup_dir = os.path.join(here, 'test_data/test_duplicate_table_names')
    errors = process_submission_directory(
        dup_dir, os.path.join(dup_dir, 'submission.yaml'), 12345)

    # Two duplicate-name errors are expected, both at 'error' level.
    assert 'submission.yaml' in errors
    assert len(errors['submission.yaml']) == 2
    for reported in errors['submission.yaml']:
        assert reported['level'] == 'error'
        assert reported['message'].startswith("Duplicate table with name")
def test_invalid_data_yaml(app, admin_idx):
    """
    Test the right thing happens when a data yaml file is invalid

    :return:
    """
    here = os.path.dirname(os.path.realpath(__file__))

    submission = HEPSubmission(publication_recid=12345,
                               overall_status='todo',
                               version=1)
    db.session.add(submission)
    db.session.commit()

    bad_dir = os.path.join(here, 'test_data/test_invalid_data_file')
    errors = process_submission_directory(
        bad_dir, os.path.join(bad_dir, 'submission.yaml'), 12345)

    # The parse failure should be reported against the broken data file.
    assert 'data1.yaml' in errors
    reported = errors['data1.yaml']
    assert len(reported) == 1
    assert reported[0]['level'] == 'error'
    assert reported[0]['message'].startswith(
        "There was a problem parsing the file")
def process_zip_archive(file, id):
    """
    Save an uploaded submission file, stage its contents via a temp
    directory, and process it.

    Files are first extracted/split into a temp dir under CFG_TMPDIR and
    then copied to the final data directory (via ``xrdcp`` in production,
    ``cp`` locally) — the inline comment says this is to avoid problems
    with the EOS disk.

    :param file: uploaded file object (has ``.filename`` and ``.save()``).
    :param id: record id used to build the save directory (shadows the
        ``id`` builtin).
    :return: a dict of errors keyed by check name, or the result of
        ``process_submission_directory``.
    """
    filename = secure_filename(file.filename)
    # Timestamped directory keeps each upload for a record separate.
    time_stamp = str(int(round(time.time())))
    file_save_directory = os.path.join(current_app.config['CFG_DATADIR'],
                                       str(id), time_stamp)
    if not os.path.exists(file_save_directory):
        os.makedirs(file_save_directory)

    if not filename.endswith('.oldhepdata'):
        file_path = os.path.join(file_save_directory, filename)
        print('Saving file to {}'.format(file_path))
        file.save(file_path)
        submission_path = os.path.join(file_save_directory,
                                       remove_file_extension(filename))
        submission_temp_path = tempfile.mkdtemp(
            dir=current_app.config["CFG_TMPDIR"])
        if filename.endswith('.yaml'):
            # we split the singular yaml file and create a submission directory
            error, last_updated = split_files(file_path, submission_temp_path)
            if error:
                return {
                    "Single YAML file splitter": [{
                        "level": "error",
                        "message": str(error)
                    }]
                }
        else:
            # we are dealing with a zip, tar, etc. so we extract the contents
            if not extract(file_path, submission_temp_path):
                return {
                    "Archive file extractor": [{
                        "level": "error",
                        "message":
                            "{} is not a valid zip or tar archive file.".format(
                                file_path)
                    }]
                }
        if not os.path.exists(submission_path):
            os.makedirs(submission_path)
        # Move files from submission_temp_path to submission_path (try to avoid problems with EOS disk).
        if current_app.config.get('PRODUCTION_MODE', False):  # production instance at CERN
            copy_command = ['xrdcp', '-N', '-f']
            copy_submission_path = submission_path.replace(
                current_app.config['CFG_DATADIR'],
                current_app.config['EOS_DATADIR'])
        else:  # local instance
            copy_command = ['cp']
            copy_submission_path = submission_path
        print('Copying with: {} -r {} {}'.format(' '.join(copy_command),
                                                 submission_temp_path + '/.',
                                                 copy_submission_path))
        subprocess.check_output(
            copy_command +
            ['-r', submission_temp_path + '/.', copy_submission_path])
        rmtree(submission_temp_path,
               ignore_errors=True)  # can uncomment when this is definitely working
        submission_found = find_file_in_directory(
            submission_path, lambda x: x == "submission.yaml")
    else:
        # Old-format .oldhepdata file: save into a dedicated subdirectory
        # for later conversion.
        file_path = os.path.join(file_save_directory, 'oldhepdata')
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        print('Saving file to {}'.format(os.path.join(file_path, filename)))
        file.save(os.path.join(file_path, filename))
        submission_found = False

    if submission_found:
        # find_file_in_directory returned (basepath, submission.yaml path).
        basepath, submission_file_path = submission_found
    else:
        # NOTE(review): when the archive branch finds no submission.yaml,
        # file_path here is the saved archive path, not the 'oldhepdata'
        # directory — other variants pass submission_path; confirm intended.
        result = check_and_convert_from_oldhepdata(file_path, id, time_stamp)
        # Check for errors
        if type(result) == dict:
            return result
        else:
            basepath, submission_file_path = result

    return process_submission_directory(basepath, submission_file_path, id)
def test_create_submission(app, admin_idx):
    """
    Test the whole submission pipeline in loading a file, ensuring the
    HEPSubmission object is created, all the files have been added, and the
    record has been indexed.

    :return:
    """
    with app.app_context():
        admin_idx.recreate_index()

        # test submission part works
        record = {
            'inspire_id': '19999999',
            'title': 'HEPData Testing 1',
            'reviewer': {
                'name': 'Testy McTester',
                'email': '*****@*****.**'
            },
            'uploader': {
                'name': 'Testy McTester',
                'email': '*****@*****.**'
            },
            'message': 'This is ready',
            'user_id': 1
        }

        hepdata_submission = process_submission_payload(**record)
        assert (hepdata_submission.version == 1)
        assert (hepdata_submission.overall_status == 'todo')

        # test upload works
        base_dir = os.path.dirname(os.path.realpath(__file__))
        test_directory = os.path.join(base_dir, 'test_data/test_submission')
        time_stamp = str(int(round(time.time())))
        # Copy the test data into the record's own data path before processing.
        directory = get_data_path_for_record(
            hepdata_submission.publication_recid, time_stamp)
        shutil.copytree(test_directory, directory)
        assert (os.path.exists(directory))

        process_submission_directory(
            directory, os.path.join(directory, 'submission.yaml'),
            hepdata_submission.publication_recid)

        # The record should now be findable in the admin index.
        admin_idx_results = admin_idx.search(
            term=hepdata_submission.publication_recid, fields=['recid'])
        assert (admin_idx_results is not None)

        data_submissions = DataSubmission.query.filter_by(
            publication_recid=hepdata_submission.publication_recid).count()
        assert (data_submissions == 8)
        assert (len(hepdata_submission.resources) == 4)
        assert (len(hepdata_submission.participants) == 4)

        do_finalise(hepdata_submission.publication_recid,
                    force_finalise=True, convert=False)

        assert (record_exists(inspire_id=record['inspire_id']))

        # Test record is in index...
        index_records = get_records_matching_field('inspire_id',
                                                   record['inspire_id'],
                                                   doc_type='publication')
        assert (len(index_records['hits']['hits']) == 1)

        publication_record = get_record_contents(
            hepdata_submission.publication_recid)
        assert (publication_record is not None)

        ctx = format_submission(hepdata_submission.publication_recid,
                                publication_record,
                                hepdata_submission.version, 1,
                                hepdata_submission)
        assert (ctx is not None)
        assert (ctx['version'] == 1)
        assert (ctx['recid'] == hepdata_submission.publication_recid)

        # remove the submission and test that all is remove
        unload_submission(hepdata_submission.publication_recid)
        assert (not record_exists(inspire_id=record['inspire_id']))

        data_submissions = DataSubmission.query.filter_by(
            publication_recid=hepdata_submission.publication_recid).count()
        assert (data_submissions == 0)

        # Give the search index time to catch up with the deletion.
        sleep(2)
        admin_idx_results = admin_idx.search(
            term=hepdata_submission.publication_recid, fields=['recid'])
        assert (len(admin_idx_results) == 0)

        # Check file dir has been deleted
        assert (not os.path.exists(directory))
def process_zip_archive(file_path, id, old_submission_schema=False,
                        old_data_schema=False):
    """
    Process an already-saved submission file at ``file_path``.

    Handles ``.yaml.gz`` (gunzip then recurse), single ``.yaml`` files
    (split into a submission directory), generic archives (extract), and
    ``.oldhepdata`` files (convert from the old HEPData format).

    :param file_path: path to the saved upload on disk.
    :param id: record id (shadows the ``id`` builtin).
    :param old_submission_schema: validate submission.yaml against the old
        schema (used when importing legacy records).
    :param old_data_schema: validate data files against the old schema;
        forced to True for the .oldhepdata conversion path.
    :return: a dict of errors keyed by check name, or the result of
        ``process_submission_directory``.
    """
    (file_save_directory, filename) = os.path.split(file_path)

    if not filename.endswith('.oldhepdata'):
        file_save_directory = os.path.dirname(file_path)
        submission_path = os.path.join(file_save_directory,
                                       remove_file_extension(filename))
        # NOTE(review): this temp dir is created before the .yaml.gz branch,
        # which returns via recursion without using it — confirm whether it
        # is cleaned up elsewhere.
        submission_temp_path = tempfile.mkdtemp(
            dir=current_app.config["CFG_TMPDIR"])

        if filename.endswith('.yaml.gz'):
            # Gunzip to the same path minus '.gz', then process that file.
            print('Extracting: {} to {}'.format(file_path, file_path[:-3]))
            if not extract(file_path, file_path[:-3]):
                return {
                    "Archive file extractor": [{
                        "level": "error",
                        "message": "{} is not a valid .gz file.".format(
                            file_path)
                    }]
                }
            # NOTE(review): old_data_schema is explicitly reset to False on
            # recursion here (only old_submission_schema is propagated) —
            # confirm intended.
            return process_zip_archive(
                file_path[:-3], id,
                old_submission_schema=old_submission_schema,
                old_data_schema=False)
        elif filename.endswith('.yaml'):
            # we split the singular yaml file and create a submission directory
            error, last_updated = split_files(file_path, submission_temp_path)
            if error:
                return {
                    "Single YAML file splitter": [{
                        "level": "error",
                        "message": str(error)
                    }]
                }
        else:
            # we are dealing with a zip, tar, etc. so we extract the contents
            try:
                unzipped_path = extract(file_path, submission_temp_path)
            except Exception as e:
                unzipped_path = None
            if not unzipped_path:
                return {
                    "Archive file extractor": [{
                        "level": "error",
                        "message":
                            "{} is not a valid zip or tar archive file.".format(
                                file_path)
                    }]
                }

        # Stage extracted/split files into their final location.
        copy_errors = move_files(submission_temp_path, submission_path)
        if copy_errors:
            return copy_errors

        submission_found = find_file_in_directory(
            submission_path, lambda x: x == "submission.yaml")
        if not submission_found:
            return {
                "Archive file extractor": [{
                    "level": "error",
                    "message":
                        "No submission.yaml file has been found in the archive."
                }]
            }
        basepath, submission_file_path = submission_found
    else:
        # Old-format .oldhepdata file: derive the timestamp from the
        # directory layout and convert to the new format.
        file_dir = os.path.dirname(file_save_directory)
        time_stamp = os.path.split(file_dir)[1]
        result = check_and_convert_from_oldhepdata(
            os.path.dirname(file_save_directory), id, time_stamp)
        # Check for errors
        if type(result) == dict:
            return result
        else:
            basepath, submission_file_path = result
            # Converted data must be validated against the old data schema.
            old_data_schema = True

    return process_submission_directory(
        basepath, submission_file_path, id,
        old_data_schema=old_data_schema,
        old_submission_schema=old_submission_schema)
def mock_import_old_record(inspire_id=mock_inspire_ids[1], send_email=False):
    """Creates a submission but mimics the old migrated paths.

    (See hepdata master branch at ccd691b for old migrator module.)

    :param inspire_id: one of ``mock_inspire_ids``; zipped test data must
        exist under ``old_hepdata_zips/ins<id>.zip``.
    :param send_email: forwarded to ``do_finalise``.
    :return: False on failure; otherwise falls through after finalising.
    :raises ValueError: if ``inspire_id`` is not a known mock id.
    """
    if inspire_id not in mock_inspire_ids:
        raise ValueError('Invalid inspire id %s. Accepted values are: %s'
                         % (inspire_id, ', '.join(mock_inspire_ids)))

    # Use zipped test data for specific record(s)
    publication_information, status = get_inspire_record_information(
        inspire_id)
    publication_information["inspire_id"] = inspire_id

    # Create record
    if status == "success":
        record_information = create_record(publication_information)
    else:
        log.error("Failed to retrieve publication information for "
                  + inspire_id)
        return False

    # Unzip into correct data dir
    data_path = get_data_path_for_record(record_information['recid'])
    base_dir = os.path.dirname(os.path.realpath(__file__))
    zip_path = os.path.join(base_dir, 'old_hepdata_zips',
                            'ins%s.zip' % inspire_id)
    if os.path.isfile(zip_path):
        log.info('Unzipping %s to %s' % (zip_path, data_path))
        shutil.unpack_archive(zip_path, data_path)
        time_stamp = str(int(round(time.time())))
        yaml_path = os.path.join(data_path, time_stamp)
        # The outer zip contains a nested submission zip; unpack that into
        # a timestamped subdirectory, mimicking the migrated layout.
        sub_zip_path = os.path.join(data_path, 'ins%s.zip' % inspire_id)
        shutil.unpack_archive(sub_zip_path, yaml_path)
    else:
        log.error('Invalid path %s' % zip_path)
        return False

    # Create submission
    admin_user_id = 1

    # Consume data payload and store in db.
    get_or_create_hepsubmission(record_information["recid"], admin_user_id)

    errors = process_submission_directory(
        yaml_path, os.path.join(yaml_path, "submission.yaml"),
        record_information["recid"],
        old_submission_schema=True, old_data_schema=True)

    if errors:
        # Bug fix: the original call passed extra positional args to
        # log.error with a pre-formatted string containing no %-placeholders,
        # which makes the logging call itself raise a formatting error.
        # Use lazy %-style logging arguments instead.
        log.error("Submission failed for %s: %s",
                  record_information["recid"], errors)
        return False

    do_finalise(record_information['recid'],
                publication_record=record_information,
                force_finalise=True, convert=False, send_email=send_email)