def rewrite_file_paths(path, test_files_dir):
    """Load the import CSV, prepend pytest tmpdir to basename_orig
    """
    headers, rowds, csv_errs = csvfile.make_rowds(fileio.read_csv(path))
    for rowd in rowds:
        src = os.path.join(test_files_dir, rowd['basename_orig'])
        rowd['basename_orig'] = src
    headers, rows = csvfile.make_rows(rowds)
    fileio.write_csv(path, headers, rows)

def test_read_csv():
    # prep
    if os.path.exists(CSV_PATH):
        os.remove(CSV_PATH)
    with open(CSV_PATH, 'w') as f:
        f.write(CSV_FILE)
    # test
    # build a fresh list so the module-level CSV_ROWS is not mutated
    expected = [CSV_HEADERS] + CSV_ROWS
    assert fileio.read_csv(CSV_PATH) == expected
    # cleanup
    if os.path.exists(CSV_PATH):
        os.remove(CSV_PATH)

def test_read_csv(tmpdir):
    CSV_PATH = str(tmpdir / 'write_csv.csv')
    # prep
    if os.path.exists(CSV_PATH):
        os.remove(CSV_PATH)
    with open(CSV_PATH, 'w') as f:
        f.write(CSV_FILE)
    # test
    # build a fresh list so the module-level CSV_ROWS is not mutated
    expected = [CSV_HEADERS] + CSV_ROWS
    assert fileio.read_csv(CSV_PATH) == expected
    # cleanup
    if os.path.exists(CSV_PATH):
        os.remove(CSV_PATH)

def check_csv(csv_path, cidentifier, vocabs_path):
    """Load CSV, validate headers and rows

    Results dict includes:
    - 'passed'
    - 'headers'
    - 'rowds'
    - 'header_errs'
    - 'rowds_errs'

    @param csv_path: Absolute path to CSV data file.
    @param cidentifier: Identifier
    @param vocabs_path: Absolute path to vocab dir
    @returns: dict containing the keys listed above
    """
    logging.info('Checking CSV file')
    passed = False
    headers,rowds = csvfile.make_rowds(fileio.read_csv(csv_path))
    for rowd in rowds:
        rowd['identifier'] = identifier.Identifier(rowd['id'])
    logging.info('%s rows' % len(rowds))
    model = Checker._guess_model(rowds)
    module = Checker._get_module(model)
    vocabs = Checker._get_vocabs(module)
    header_errs,rowds_errs = Checker._validate_csv_file(
        module, vocabs, headers, rowds
    )
    if (not header_errs) and (not rowds_errs):
        passed = True
        logging.info('ok')
    else:
        logging.error('FAIL')
    return {
        'passed': passed,
        'headers': headers,
        'rowds': rowds,
        'header_errs': header_errs,
        'rowds_errs': rowds_errs,
    }

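# Usage sketch (not part of the original module): one way a caller might act on
# the dict returned by check_csv() above. Assumes the same module-level imports
# (logging) as the surrounding code; csv_path, cidentifier, and vocabs_path are
# placeholders supplied by the caller.
def example_check_csv_usage(csv_path, cidentifier, vocabs_path):
    results = check_csv(csv_path, cidentifier, vocabs_path)
    if not results['passed']:
        # header_errs/rowds_errs contain whatever _validate_csv_file reported
        logging.error('header errors: %s' % results['header_errs'])
        logging.error('row errors: %s' % results['rowds_errs'])
        raise Exception('CSV validation failed - aborting import')
    return results['rowds']
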
def register_entity_ids(csv_path, cidentifier, idservice_client, dryrun=True):
    """Register any entity IDs from the CSV that are unknown to the ID service.

    @param csv_path: Absolute path to CSV data file.
    @param cidentifier: Identifier
    @param idservice_client: idservice.IDServiceClient object
    @param dryrun: boolean If True, report IDs that would be registered without registering them.
    @returns: nothing
    """
    logging.info('-----------------------------------------------')
    logging.info('Reading %s' % csv_path)
    headers,rowds = csvfile.make_rowds(fileio.read_csv(csv_path))
    logging.info('%s rows' % len(rowds))

    logging.info('Looking up already registered IDs')
    csv_eids = [rowd['id'] for rowd in rowds]
    status1,reason1,registered,unregistered = idservice_client.check_eids(cidentifier, csv_eids)
    logging.info('%s %s' % (status1,reason1))
    if status1 != 200:
        raise Exception('%s %s' % (status1,reason1))

    num_unregistered = len(unregistered)
    logging.info('%s IDs to register.' % num_unregistered)

    if unregistered and dryrun:
        logging.info('These IDs would be registered if not --dryrun')
        for n,eid in enumerate(unregistered):
            logging.info('| %s/%s %s' % (n, num_unregistered, eid))
    elif unregistered:
        logging.info('Registering IDs')
        for n,eid in enumerate(unregistered):
            logging.info('| %s/%s %s' % (n, num_unregistered, eid))
        status2,reason2,created = idservice_client.register_eids(cidentifier, unregistered)
        logging.info('%s %s' % (status2,reason2))
        if status2 != 201:
            raise Exception('%s %s' % (status2,reason2))
        logging.info('%s registered' % len(created))

    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')

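# Usage sketch (not part of the original module): register_entity_ids() defaults
# to dryrun=True, so a cautious caller can preview unregistered IDs before writing
# to the ID service. csv_path, cidentifier, and idservice_client are placeholders.
def example_register_entity_ids_usage(csv_path, cidentifier, idservice_client):
    # first pass: only report which IDs would be registered
    register_entity_ids(csv_path, cidentifier, idservice_client, dryrun=True)
    # second pass: actually register them (raises on non-200/201 responses)
    register_entity_ids(csv_path, cidentifier, idservice_client, dryrun=False)
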
def import_files(csv_path, cidentifier, vocabs_path, git_name, git_mail, agent, log_path=None, dryrun=False):
    """Adds or updates files from a CSV file

    TODO how to handle excluded fields like XMP???

    @param csv_path: Absolute path to CSV data file.
    @param cidentifier: Identifier
    @param vocabs_path: Absolute path to vocab dir
    @param git_name: str
    @param git_mail: str
    @param agent: str
    @param log_path: str Absolute path to addfile log for all files
    @param dryrun: boolean
    """
    logging.info('batch import files ----------------------------')
    # TODO hard-coded model name...
    model = 'file'
    csv_dir = os.path.dirname(csv_path)
    logging.debug('csv_dir %s' % csv_dir)
    # TODO this still knows too much about entities and files...
    entity_class = identifier.class_for_name(
        identifier.MODEL_CLASSES['entity']['module'],
        identifier.MODEL_CLASSES['entity']['class']
    )
    logging.debug('entity_class %s' % entity_class)

    logging.info('Reading %s' % csv_path)
    headers,rowds = csvfile.make_rowds(fileio.read_csv(csv_path))
    logging.info('%s rows' % len(rowds))

    # check for modified or uncommitted files in repo
    repository = dvcs.repository(cidentifier.path_abs())
    logging.debug(repository)

    fidentifiers = {
        rowd['id']: identifier.Identifier(
            id=rowd['id'],
            base_path=cidentifier.basepath
        )
        for rowd in rowds
    }
    fidentifier_parents = {
        fi.id: Importer._fidentifier_parent(fi)
        for fi in fidentifiers.values()
    }
    # eidentifiers, removing duplicates
    eidentifiers = list(set([e for e in fidentifier_parents.values()]))
    entities = {}
    bad_entities = []
    for eidentifier in eidentifiers:
        if os.path.exists(eidentifier.path_abs()):
            entity = eidentifier.object()
            entities[eidentifier.id] = entity
        else:
            if eidentifier.id not in bad_entities:
                bad_entities.append(eidentifier.id)
    if bad_entities:
        for f in bad_entities:
            logging.error(' %s missing' % f)
        raise Exception('%s entities could not be loaded! - IMPORT CANCELLED!' % len(bad_entities))

    # separate into new and existing lists
    rowds_new = []
    rowds_existing = []
    for n,rowd in enumerate(rowds):
        if Importer._file_is_new(fidentifiers[rowd['id']]):
            rowds_new.append(rowd)
        else:
            rowds_existing.append(rowd)

    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
    logging.info('Updating existing files')
    start_updates = datetime.now()
    git_files = []
    updated = []
    elapsed_rounds_updates = []
    staged = []
    obj_metadata = None
    for n,rowd in enumerate(rowds_existing):
        logging.info('+ %s/%s - %s (%s)' % (n+1, len(rowds), rowd['id'], rowd['basename_orig']))
        start_round = datetime.now()

        fidentifier = fidentifiers[rowd['id']]
        eidentifier = fidentifier_parents[fidentifier.id]
        entity = entities[eidentifier.id]
        file_ = fidentifier.object()
        modified = file_.load_csv(rowd)
        # Getting obj_metadata takes about 1sec each time
        # TODO caching works as long as all objects have same metadata...
        if not obj_metadata:
            obj_metadata = models.object_metadata(
                fidentifier.fields_module(),
                repository.working_dir
            )

        if dryrun:
            pass
        elif modified:
            logging.debug(' writing %s' % file_.json_path)
            file_.write_json(obj_metadata=obj_metadata)
            # TODO better to write to collection changelog?
            Importer._write_entity_changelog(entity, git_name, git_mail, agent)
            # stage
            git_files.append(file_.json_path_rel)
            git_files.append(entity.changelog_path_rel)
            updated.append(file_)

        elapsed_round = datetime.now() - start_round
        elapsed_rounds_updates.append(elapsed_round)
        logging.debug('| %s (%s)' % (fidentifier, elapsed_round))

    elapsed_updates = datetime.now() - start_updates
    logging.debug('%s updated in %s' % (len(elapsed_rounds_updates), elapsed_updates))

    if dryrun:
        pass
    elif git_files:
        logging.info('Staging %s modified files' % len(git_files))
        start_stage = datetime.now()
        dvcs.stage(repository, git_files)
        staged = util.natural_sort(dvcs.list_staged(repository))
        for path in staged:
            if path in git_files:
                logging.debug('+ %s' % path)
            else:
                logging.debug('| %s' % path)
        elapsed_stage = datetime.now() - start_stage
        logging.debug('ok (%s)' % elapsed_stage)
        logging.debug('%s staged in %s' % (len(staged), elapsed_stage))

    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
    logging.info('Adding new files')
    start_adds = datetime.now()
    elapsed_rounds_adds = []
    logging.info('Checking source files')
    for rowd in rowds_new:
        rowd['src_path'] = os.path.join(csv_dir, rowd['basename_orig'])
        logging.debug('| %s' % rowd['src_path'])
        if not os.path.exists(rowd['src_path']):
            raise Exception('Missing file: %s' % rowd['src_path'])
    if log_path:
        logging.info('addfile logging to %s' % log_path)
    for n,rowd in enumerate(rowds_new):
        logging.info('+ %s/%s - %s (%s)' % (n+1, len(rowds), rowd['id'], rowd['basename_orig']))
        start_round = datetime.now()

        fidentifier = fidentifiers[rowd['id']]
        eidentifier = fidentifier_parents[fidentifier.id]
        entity = entities[eidentifier.id]
        logging.debug('| %s' % (entity))

        if dryrun:
            pass
        elif Importer._file_is_new(fidentifier):
            # ingest
            # TODO make sure this updates entity.files
            file_,repo2,log2 = ingest.add_file(
                entity,
                rowd['src_path'],
                fidentifier.parts['role'],
                rowd,
                git_name, git_mail, agent,
                log_path=log_path,
                show_staged=False
            )

        elapsed_round = datetime.now() - start_round
        elapsed_rounds_adds.append(elapsed_round)
        logging.debug('| %s (%s)' % (file_, elapsed_round))

    elapsed_adds = datetime.now() - start_adds
    logging.debug('%s added in %s' % (len(elapsed_rounds_adds), elapsed_adds))
    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')

    return git_files

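# Usage sketch (not part of the original module): import_files() stages file and
# changelog JSON but does not commit, and returns the list of staged paths. A dry
# run can be done first to validate the CSV and source files without touching the
# repository. All arguments here are placeholders supplied by the caller.
def example_import_files_usage(csv_path, cidentifier, vocabs_path,
                               git_name, git_mail, agent, log_path):
    # preview only: nothing is written or staged
    import_files(csv_path, cidentifier, vocabs_path,
                 git_name, git_mail, agent, log_path=log_path, dryrun=True)
    # real run: returns relative paths of the files that were staged
    git_files = import_files(csv_path, cidentifier, vocabs_path,
                             git_name, git_mail, agent, log_path=log_path)
    return git_files
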
def import_entities(csv_path, cidentifier, vocabs_path, git_name, git_mail, agent, dryrun=False):
    """Adds or updates entities from a CSV file

    Running function multiple times with the same CSV file is idempotent.
    After the initial pass, files will only be modified if the CSV data
    has been updated.

    This function writes and stages files but does not commit them!
    That is left to the user or to another function.

    @param csv_path: Absolute path to CSV data file.
    @param cidentifier: Identifier
    @param vocabs_path: Absolute path to vocab dir
    @param git_name: str
    @param git_mail: str
    @param agent: str
    @param dryrun: boolean
    @returns: list of updated entities
    """
    logging.info('------------------------------------------------------------------------')
    logging.info('batch import entity')
    model = 'entity'

    repository = dvcs.repository(cidentifier.path_abs())
    logging.info(repository)

    logging.info('Reading %s' % csv_path)
    headers,rowds = csvfile.make_rowds(fileio.read_csv(csv_path))
    logging.info('%s rows' % len(rowds))

    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')
    logging.info('Importing')
    start_updates = datetime.now()
    git_files = []
    updated = []
    elapsed_rounds = []
    obj_metadata = None

    if dryrun:
        logging.info('Dry run - no modifications')
    for n,rowd in enumerate(rowds):
        logging.info('%s/%s - %s' % (n+1, len(rowds), rowd['id']))
        start_round = datetime.now()

        eidentifier = identifier.Identifier(id=rowd['id'], base_path=cidentifier.basepath)
        # if there is an existing object it will be loaded
        entity = eidentifier.object()
        if not entity:
            entity = models.Entity.create(eidentifier.path_abs(), eidentifier)
        modified = entity.load_csv(rowd)
        # Getting obj_metadata takes about 1sec each time
        # TODO caching works as long as all objects have same metadata...
        if not obj_metadata:
            obj_metadata = models.object_metadata(
                eidentifier.fields_module(),
                repository.working_dir
            )

        if dryrun:
            pass
        elif modified:
            # write files
            if not os.path.exists(entity.path_abs):
                os.makedirs(entity.path_abs)
            logging.debug(' writing %s' % entity.json_path)
            entity.write_json(obj_metadata=obj_metadata)
            # TODO better to write to collection changelog?
            # TODO write all additions to changelog at one time
            Importer._write_entity_changelog(entity, git_name, git_mail, agent)
            # stage
            git_files.append(entity.json_path_rel)
            git_files.append(entity.changelog_path_rel)
            updated.append(entity)

        elapsed_round = datetime.now() - start_round
        elapsed_rounds.append(elapsed_round)
        logging.debug('| %s (%s)' % (eidentifier, elapsed_round))

    if dryrun:
        logging.info('Dry run - no modifications')
    elif updated:
        logging.info('Staging %s modified files' % len(git_files))
        start_stage = datetime.now()
        dvcs.stage(repository, git_files)
        for path in util.natural_sort(dvcs.list_staged(repository)):
            if path in git_files:
                logging.debug('+ %s' % path)
            else:
                logging.debug('| %s' % path)
        elapsed_stage = datetime.now() - start_stage
        logging.debug('ok (%s)' % elapsed_stage)

    elapsed_updates = datetime.now() - start_updates
    logging.debug('%s updated in %s' % (len(elapsed_rounds), elapsed_updates))
    logging.info('- - - - - - - - - - - - - - - - - - - - - - - -')

    return updated

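# Usage sketch (not part of the original module): import_entities() writes and
# stages entity JSON and changelogs but leaves committing to the caller, as its
# docstring notes. This shows one way to commit afterward using the same dvcs
# module and GitPython repo object the surrounding code already uses; the commit
# message is an arbitrary example, and all arguments are placeholders.
def example_import_entities_then_commit(csv_path, cidentifier, vocabs_path,
                                        git_name, git_mail, agent):
    updated = import_entities(csv_path, cidentifier, vocabs_path,
                              git_name, git_mail, agent)
    if updated:
        repository = dvcs.repository(cidentifier.path_abs())
        repository.index.commit('Batch import of %s entities' % len(updated))
    return updated
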
def test_files_import_external_nohashes_rename(tmpdir, collection, test_csv_dir, test_files_dir):
    """Test importing *external* files with *no* hash cols but binaries present

    If file is external, binary is present, and no hash cols,
    rename binary in place
    ddr-testing-123-1-master-684e15e967
    ddr-testing-123-2-master-b9773b9aef
    """
    print('collection_path %s' % collection.path_abs)
    file_csv_path = os.path.join(
        test_csv_dir, 'ddrimport-files-import-external-nohashes-rename.csv')
    print('file_csv_path %s' % file_csv_path)
    rewrite_file_paths(file_csv_path, test_files_dir)
    log_path = os.path.join(
        test_files_dir, 'ddrimport-files-import-external-nohashes-rename.log')
    print('log_path %s' % log_path)
    print('test_files_dir %s' % test_files_dir)
    for path in os.listdir(test_files_dir):
        print(path)

    # copy test files so later tests don't crash
    # replace basename_orig in CSV with copied file
    # and rewrite CSV
    headers, rowds, csv_errs = csvfile.make_rowds(
        fileio.read_csv(file_csv_path))
    renamed_files = []
    copied_files = []
    ingested_files = []
    access_files = []
    for rowd in rowds:
        print(rowd)
        src_file = os.path.join(test_files_dir, rowd['basename_orig'])
        path, ext = os.path.splitext(src_file)
        dest_file = path + '-rename' + ext
        print('shutil.copy(%s, %s)' % (src_file, dest_file))
        shutil.copy(src_file, dest_file)
        if os.path.exists(dest_file):
            renamed_files.append(os.path.basename(dest_file))
        else:
            print('could not copy')
            assert False
        rowd['basename_orig'] = dest_file
        # figure out new file ID
        sha1 = util.file_hash(dest_file, 'sha1')[:10]
        idparts = rowd['id'].split('-') + [rowd['role']] + [sha1]
        final_file = '-'.join(idparts) + ext
        final_access = '-'.join(idparts + ['a.jpg'])
        copied_files.append(final_file)
        ingested_files.append(final_file)
        access_files.append(final_access)
    headers, rows = csvfile.make_rows(rowds)
    fileio.write_csv(file_csv_path, headers, rows)

    out = batch.Importer.import_files(
        file_csv_path,
        collection.identifier,
        VOCABS_URL,
        GIT_USER, GIT_MAIL, AGENT,
        log_path=log_path,
        tmp_dir=test_files_dir,
    )
    # save and commit
    repo = dvcs.repository(collection.path_abs)
    print('STAGED FILES')
    staged_files = sorted([path for path in dvcs.list_staged(repo)])
    for path in staged_files:
        print(' %s' % path)

    # after import_files, we expect to see
    offenses = 0
    # assert final_file in os.listdir(test_files_dir)
    print('test_files_dir')
    test_files = [path for path in os.listdir(test_files_dir)]
    for path in copied_files:
        print(path)
        if path not in test_files:
            print('RENAMED SRC FILE NOT PRESENT %s' % path)
            offenses += 1
    # assert files not ingested
    # assert no access files created
    for path in staged_files:
        if os.path.basename(path) in ingested_files:
            print('ERROR %s HAS BEEN IMPORTED!!' % path)
            offenses += 1
        if os.path.basename(path) in access_files:
            print('ERROR %s ACCESS FILE GENERATED!!' % path)
            offenses += 1
    commit = repo.index.commit('test_files_import_external_nohashes_rename')
    print('commit %s' % commit)
    if offenses:
        assert False

    # test hashes present
    check_file_hashes(collection.path_abs)
    # ensure no binaries in .git/objects
    print('log_path %s' % log_path)
    assert not find_binaries_in_git_objects(repo)
    assert not find_missing_annex_binaries(repo)