# The snippets below are collected from several sources; they share the
# python-irodsclient imports listed here. Names such as create_session,
# new_session, paths, helpers, logger and phengs_path_prefix, as well as the
# Flask jsonify, are defined elsewhere in their respective projects.
import base64
import fnmatch
import itertools
import os
import subprocess
import sys
import time
import warnings

from irods.collection import iRODSCollection
from irods.column import Criterion, Like
from irods.data_object import iRODSDataObject
from irods.exception import DataObjectDoesNotExist, MultipleResultsFound
from irods.models import (Collection, CollectionMeta, DataObject,
                          DataObjectMeta, Resource, User)
from irods.session import iRODSSession


def size(self, iterator, recurse=False, verbose=False):
    """
    Yields (path, size-in-bytes) tuples for the selected data objects
    and collections.

    Examples:

    >>> session.bulk.size('~/data/out*.txt')
    >>> session.bulk.size('./data', recurse=True)

    Arguments:

    iterator: iterator or str
        Defines which items are subject to the bulk operation.
        Can be an iterator (e.g. using search_manager.find()) or a
        string (which will be used to construct a
        search_manager.iglob() iterator). Data sizes will be returned
        for matching data objects and, if used recursively,
        collections.

    recurse: bool (default: False)
        Whether to use recursion, meaning that the data size of
        matching collections will be calculated as the sum of their
        data object and subcollection sizes.

    verbose: bool (default: False)
        Whether to print more output.
    """
    if isinstance(iterator, str):
        iterator = self.session.search.iglob(iterator)

    for item in iterator:
        path = self.session.path.get_absolute_irods_path(item)

        if self.session.collections.exists(path):
            if recurse:
                new_iterator = self.size(item + '/*', recurse=True,
                                         verbose=verbose)
                size = sum(result[1] for result in new_iterator)
            else:
                self.log('Skipping collection %s (no recursion)' % item,
                         verbose)
                continue
        else:
            dirname = os.path.dirname(path)
            basename = os.path.basename(path)
            criteria = [
                Criterion('=', Collection.name, dirname),
                Criterion('=', DataObject.name, basename),
            ]
            fields = [DataObject.size]
            q = self.session.query(*fields).filter(*criteria)
            results = list(q.get_results())
            if len(results) > 1:
                # Replicas with identical sizes collapse into a single
                # row, so more than one row means the sizes disagree.
                raise MultipleResultsFound('Different replicas of data '
                                           'object %s have different sizes'
                                           % path)
            size = results[0][DataObject.size]

        yield (item, size)
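
# A minimal usage sketch for size() above (an illustration, not part of the
# original source): `session` stands for the wrapper object exposing the
# bulk/search managers referenced in the docstring.
def print_collection_sizes(session, pattern='./data/*', recurse=True):
    """Print each matching item with its size, then a total."""
    total = 0
    for path, nbytes in session.bulk.size(pattern, recurse=recurse):
        total += nbytes
        print('%12d  %s' % (nbytes, path))
    print('%12d  (total)' % total)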
def get_logical_location_get(filename, match_exact=None, include_trash=None):
    max_rows = 1000
    offset = 0
    sess = create_session()

    conditions = []
    if match_exact:
        conditions += [DataObject.name == str(filename)]
    else:
        conditions += [
            Criterion('like', DataObject.name, '%' + str(filename) + '%')
        ]
    if not include_trash:
        conditions += [Criterion('not like', Collection.name, '%/trash/%')]

    results = sess.query(DataObject.name, Collection.name).\
        filter(*conditions).\
        offset(offset).\
        limit(max_rows).all()
    sess.cleanup()

    data = {'irods_filenames': []}
    for r in results.rows:
        # Rows are ordered dicts keyed by the queried columns; popitem()
        # returns the last entry first, i.e. Collection.name, then
        # DataObject.name.
        l_path = r.popitem()
        f_name = r.popitem()
        data['irods_filenames'].append(str(l_path[1]) + '/' + str(f_name[1]))
    return jsonify(data)
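
# Hedged sketch (an assumption, not from the original source): since the
# function above returns a Flask response via jsonify(), it is presumably
# registered as a GET route roughly like this; the URL rule and app object
# are illustrative only.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/logical_location/<filename>', 'logical_location',
                 get_logical_location_get, methods=['GET'])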
def get_metadata_value_for_collection(session, coll_name, key):
    results = session.query(Collection, CollectionMeta).filter(
        Criterion('=', Collection.name, coll_name)).filter(
        Criterion('=', CollectionMeta.name, key))
    for r in results:
        return r[CollectionMeta.value]
    return ''
def get_metadata_value(session, coll_name, data_name, key):
    results = session.query(DataObject, DataObjectMeta).filter(
        Criterion('=', Collection.name, coll_name)).filter(
        Criterion('=', DataObject.name, data_name)).filter(
        Criterion('=', DataObjectMeta.name, key))
    for r in results:
        return r[DataObjectMeta.value]
    return ''
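
# Illustrative usage of the two metadata getters above. The AVU key
# 'filesystem::path' follows the conventions used elsewhere in this file;
# the concrete collection and object names are made up.
def example_read_metadata(session):
    coll = '/tempZone/home/rods/run_data'
    print(get_metadata_value_for_collection(session, coll,
                                            'filesystem::path'))
    print(get_metadata_value(session, coll, 'sample.fastq',
                             'filesystem::path'))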
def get_flag_dataobj_names_by_user(self, user_id):
    """
    Returns the names of all flag data objects associated with the given
    user.

    :param user_id: wdk id of the user to whom the flags pertain
    :return: list of all flag data object names satisfying the criteria
    """
    criteria = [
        Criterion('=', Collection.name, paths.FLAGS_PATH),
        Criterion('like', DataObject.name, '%_u' + user_id + '%')
    ]
    return self.get_dataobj_names_by_query(criteria)
def get_event_dataobj_names_created_since(self, start_time):
    """
    Returns the names of all event data objects created since the given
    start time.

    :param start_time: datetime in sec
    :return: list of all event data object names satisfying the criteria
    """
    criteria = [
        Criterion('=', Collection.name, paths.EVENTS_PATH),
        Criterion('>=', DataObject.create_time, start_time)
    ]
    return self.get_dataobj_names_by_query(criteria)
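
# The helper get_dataobj_names_by_query() that both methods above delegate
# to is not shown in this excerpt; a minimal sketch of what it presumably
# does, assuming `self.session` is a python-irodsclient iRODSSession:
def get_dataobj_names_by_query(self, criteria):
    query = self.session.query(DataObject.name).filter(*criteria)
    return [row[DataObject.name] for row in query]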
def get_syncable_irods_groups(sess):
    irods_group_names_set = set()
    # filter only rodsgroups
    query = sess.query(User.name, User.id, User.type).filter(
        Criterion('=', User.type, 'rodsgroup'))
    n = 0
    for result in query:
        n = n + 1
        irodsGroup = sess.users.get(result[User.name])
        syncAVUs = irodsGroup.metadata.get_all(LDAP_SYNC_AVU)
        if not syncAVUs:
            irods_group_names_set.add(irodsGroup.name)
        elif (len(syncAVUs) == 1) and (syncAVUs[0].value == "true"):
            irods_group_names_set.add(irodsGroup.name)
        elif (len(syncAVUs) == 1) and (syncAVUs[0].value == "false"):
            logger.debug("AVU ldapSync=false found for group: {}".format(
                irodsGroup.name))
            continue
        else:
            logger.error(
                "found unexpected number of AVUs for key ldapSync and group: {} {}"
                .format(irodsGroup.name, len(syncAVUs)))
    logger.debug(
        "iRODS groups found: {} (allowed for synchronization: {})".format(
            n, len(irods_group_names_set)))
    return irods_group_names_set
def syncable_irods_users(sess):
    irods_user_names_set = set()
    # filter only rodsusers, filter the special users, and check which ones
    # are not in the LDAP list
    query = sess.query(User.name, User.id, User.type).filter(
        Criterion('=', User.type, 'rodsuser'))
    n = 0
    for result in query:
        n = n + 1
        irodsUser = sess.users.get(result[User.name])
        syncAVUs = irodsUser.metadata.get_all(LDAP_SYNC_AVU)
        if not syncAVUs:
            irods_user_names_set.add(result[User.name])
        elif (len(syncAVUs) == 1) and (syncAVUs[0].value == "true"):
            irods_user_names_set.add(result[User.name])
        elif (len(syncAVUs) == 1) and (syncAVUs[0].value == "false"):
            logger.debug("AVU ldapSync=false found for user: {}".format(
                irodsUser.name))
            continue
        else:
            logger.error(
                "found unexpected number of AVUs for key ldapSync and user: {} {}"
                .format(irodsUser.name, len(syncAVUs)))
    logger.debug(
        "iRODS users found: {} (allowed for synchronization: {})".format(
            n, len(irods_user_names_set)))
    return irods_user_names_set
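
# LDAP_SYNC_AVU and `logger` are module-level names defined outside this
# excerpt. Given the "key ldapSync" wording in the log messages above, a
# plausible minimal setup (an assumption) would be:
import logging

LDAP_SYNC_AVU = 'ldapSync'
logger = logging.getLogger(__name__)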
def test_query_like(self):
    '''Equivalent to:
    iquest "select RESC_NAME where RESC_NAME like 'dem%'"
    '''
    rows = self.sess.query(Resource).filter(
        Criterion('like', Resource.name, 'dem%')).get_results()
    self.assertIn('demoResc', [row[Resource.name] for row in rows])
def get_data_object_physical_path(coll_match, data_match, resc_match):
    session = session_object()
    NotInTrash = Criterion('not like', Collection.name, '%/trash/%')
    q = session.query(DataObject).filter(NotInTrash).filter(
        Collection.name == coll_match).filter(
        DataObject.name == data_match).filter(
        Resource.name == resc_match)
    return q.one()[DataObject.path]
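
# Hedged usage sketch: q.one() raises NoResultFound (or MultipleResultsFound)
# when zero or several rows match, so a caller may want to handle that; all
# literal names here are examples only.
from irods.exception import NoResultFound

def try_get_physical_path(coll, name, resc):
    try:
        return get_data_object_physical_path(coll, name, resc)
    except NoResultFound:
        return None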
def search_collection_metadata(q, config={}):
    print("Search::CollectionMeta", q)
    with new_session(config) as session:
        query = session.query(Collection, CollectionMeta) \
            .add_keyword('zone', 'seq')
        for k in q:
            query = query.filter(Criterion('=', CollectionMeta.name, k)) \
                .filter(Criterion('=', CollectionMeta.value, q[k]))
        return [{
            "type": iRODSCollection,
            "id": result[Collection.id],
            "name": result[Collection.name]
        } for result in query]
def search_data_object_metadata(q, config={}):
    print("Search::DataObjectMeta", q)
    with new_session(config) as session:
        query = session.query(DataObject, Collection.name) \
            .filter(Like(DataObject.path, "/irods-seq-sr%")) \
            .add_keyword('zone', 'seq')
        print("Searching for", q)
        for k in q:
            query = query.filter(Criterion('=', DataObjectMeta.name, k)) \
                .filter(Criterion('=', DataObjectMeta.value, q[k]))
        return [{
            "type": iRODSDataObject,
            "id": result[DataObject.id],
            "name": result[DataObject.name],
            "size": sizeof_fmt(result[DataObject.size]),
            "modified": result[DataObject.modify_time].isoformat(),
            "path": "{}/{}".format(result[Collection.name],
                                   result[DataObject.name])
        } for result in query]
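
# Illustrative call of the metadata search above (the AVU key/value pair is
# hypothetical): find data objects carrying a given metadata pair.
def demo_metadata_search():
    hits = search_data_object_metadata({'study': 'my_study'})
    for hit in hits:
        print(hit['path'], hit['size'])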
def do_audit(run_handle):
    run_data_dir_filesystem = "%s/hpc_storage/run_data/%s" % (
        phengs_path_prefix, run_handle)
    machine_fastqs_dir_filesystem = "%s/hpc_storage/machine_fastqs/%s" % (
        phengs_path_prefix, run_handle)
    env_file = os.path.expanduser('~/.irods/irods_environment.json')
    with iRODSSession(irods_env_file=env_file) as session:
        # select COLL_NAME, DATA_NAME where META_DATA_ATTR_NAME =
        # 'filesystem::run_handle' and META_DATA_ATTR_VALUE = <run_handle>
        results = session.query(Collection.name, DataObject).filter(
            Criterion('=', DataObjectMeta.name, 'filesystem::run_handle')).filter(
            Criterion('=', DataObjectMeta.value, run_handle))
        for r in results:
            filesystem_path = get_metadata_value(session,
                                                 r[Collection.name],
                                                 r[DataObject.name],
                                                 'filesystem::path')
            if not os.path.exists(filesystem_path):
                print("%s does not exist" % filesystem_path)
            else:
                print("%s exists" % filesystem_path)
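
# The do_audit/do_restore/do_register functions in this file read like
# subcommands of an archive maintenance script; a plausible (assumed)
# command-line entry point might look like this:
def main():
    action, run_handle = sys.argv[1], sys.argv[2]
    {'audit': do_audit,
     'restore': do_restore,
     'register': do_register}[action](run_handle)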
def test_force_unlink(self):
    collection = self.coll_path
    filename = 'test_force_unlink.txt'
    file_path = '{collection}/{filename}'.format(**locals())

    # make object
    obj = helpers.make_object(self.sess, file_path)

    # force remove object
    obj.unlink(force=True)

    # should be gone
    with self.assertRaises(DataObjectDoesNotExist):
        obj = self.sess.data_objects.get(file_path)

    # make sure it's not in the trash either
    conditions = [DataObject.name == filename,
                  Criterion('like', Collection.name, "/dev/trash/%%")]
    query = self.sess.query(DataObject.id, DataObject.name,
                            Collection.name).filter(*conditions)
    results = query.all()
    self.assertEqual(len(results), 0)
def search_data_object(q, config={}):
    print("Search::DataObject", q)
    with new_session(config) as session:
        # 'like' (not '=') is needed here so that the '%' wildcards act as
        # a substring match rather than being compared literally.
        query = session.query(DataObject, Collection.name) \
            .filter(Like(DataObject.path, "/irods-seq-sr%")) \
            .filter(Criterion('like', DataObject.name,
                              '%{}%'.format(q['value']))) \
            .add_keyword('zone', 'seq')
        results = [{
            "type": iRODSDataObject,
            "id": result[DataObject.id],
            "name": result[DataObject.name],
            "size": sizeof_fmt(result[DataObject.size]),
            "modified": result[DataObject.modify_time].isoformat(),
            "path": "{}/{}".format(result[Collection.name],
                                   result[DataObject.name])
        } for result in query]
        return {
            "id": "search-result",
            "name": q['value'],
            "count": len(results),
            "children": results
        }
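
# sizeof_fmt() is referenced by the search functions above but not defined
# in this excerpt; a common minimal implementation (an assumption) is:
def sizeof_fmt(num, suffix='B'):
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti']:
        if abs(num) < 1024.0:
            return '%3.1f%s%s' % (num, unit, suffix)
        num /= 1024.0
    return '%.1f%s%s' % (num, 'Pi', suffix)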
def do_restore(run_handle):
    run_data_dir_filesystem = "%s/hpc_storage/run_data/%s" % (
        phengs_path_prefix, run_handle)
    machine_fastqs_dir_filesystem = "%s/hpc_storage/machine_fastqs/%s" % (
        phengs_path_prefix, run_handle)
    env_file = os.path.expanduser('~/.irods/irods_environment.json')
    with iRODSSession(irods_env_file=env_file) as session:
        # Restore files
        # select COLL_NAME, DATA_NAME where META_DATA_ATTR_NAME =
        # 'filesystem::run_handle' and META_DATA_ATTR_VALUE = <run_handle>
        results = session.query(Collection.name, DataObject).filter(
            Criterion('=', DataObjectMeta.name, 'filesystem::run_handle')).filter(
            Criterion('=', DataObjectMeta.value, run_handle))
        for r in results:
            # get filesystem attributes
            filesystem_path = get_metadata_value_for_data_object(
                session, r[Collection.name], r[DataObject.name],
                'filesystem::path')
            atime = get_metadata_value_for_data_object(
                session, r[Collection.name], r[DataObject.name],
                'filesystem::atime')
            mtime = get_metadata_value_for_data_object(
                session, r[Collection.name], r[DataObject.name],
                'filesystem::mtime')
            owner = get_metadata_value_for_data_object(
                session, r[Collection.name], r[DataObject.name],
                'filesystem::owner')
            perms = get_metadata_value_for_data_object(
                session, r[Collection.name], r[DataObject.name],
                'filesystem::perms')
            group = get_metadata_value_for_data_object(
                session, r[Collection.name], r[DataObject.name],
                'filesystem::group')
            print(r[Collection.name], r[DataObject.name], filesystem_path,
                  atime, mtime, owner, perms, group)
            restore_to_lustre(session, r[Collection.name],
                              r[DataObject.name], filesystem_path, atime,
                              mtime, owner, perms, group)

        # Restore directory metadata
        # select COLL_NAME where META_COLL_ATTR_NAME =
        # 'filesystem::run_handle' and META_COLL_ATTR_VALUE = <run_handle>
        results = session.query(Collection, CollectionMeta).filter(
            Criterion('=', CollectionMeta.name, 'filesystem::run_handle')).filter(
            Criterion('=', CollectionMeta.value, run_handle))
        for r in results:
            # get filesystem attributes
            filesystem_path = get_metadata_value_for_collection(
                session, r[Collection.name], 'filesystem::path')
            atime = get_metadata_value_for_collection(
                session, r[Collection.name], 'filesystem::atime')
            mtime = get_metadata_value_for_collection(
                session, r[Collection.name], 'filesystem::mtime')
            owner = get_metadata_value_for_collection(
                session, r[Collection.name], 'filesystem::owner')
            perms = get_metadata_value_for_collection(
                session, r[Collection.name], 'filesystem::perms')
            group = get_metadata_value_for_collection(
                session, r[Collection.name], 'filesystem::group')
            print(r[Collection.name], filesystem_path, atime, mtime, owner,
                  perms, group)
            restore_to_lustre(session, r[Collection.name], None,
                              filesystem_path, atime, mtime, owner, perms,
                              group)

    # create the restore_from_archive file
    os.system("touch %s/restore_from_archive" % run_data_dir_filesystem)

    # remove the written_to_archive file; ignore the error if it does not
    # exist
    os.system("rm %s/written_to_archive 2>/dev/null" % run_data_dir_filesystem)
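
# The two os.system() calls at the end of do_restore() only manage marker
# files; a shell-free sketch with the standard library (same behaviour
# assumed) would be:
import contextlib
from pathlib import Path

def set_restore_markers(run_data_dir):
    Path(run_data_dir, 'restore_from_archive').touch()
    with contextlib.suppress(FileNotFoundError):
        Path(run_data_dir, 'written_to_archive').unlink()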
def iglob(self, pattern, debug=False):
    """
    Returns an iterator of iRODS collection and data object paths which
    match the given pattern, similar to the glob.iglob builtin.

    .. note::

        Currently only '*' is expanded. The other special characters
        '?' and '[]' are not (yet) taken into account.

    Examples:

    >>> session.glob('m*/ch4.xyz')
    ['molecules_database/ch4.xyz']
    >>> session.glob('./*/*')
    ['./molecule_database/a.out', './foo/bar.so']
    >>> session.glob('~/foo/c*.xyz')
    ['~/foo/ch4.xyz', '~/foo/co2.xyz']

    Arguments:

    pattern: str
        The search pattern

    debug: bool (default: False)
        Set to True for debugging info
    """
    self.log('DBG| search.iglob pattern: %s' % pattern, debug)

    if '*' in pattern:
        index = pattern.index('*')
        path_root = os.path.dirname(pattern[:index])
    else:
        path_root = pattern
    path_root = path_root.rstrip('/') if path_root else '.'
    path_root_abs = self.session.path.get_absolute_irods_path(path_root)

    # First, the collections
    pattern_collection = self.session.path.get_absolute_irods_path(pattern)
    pattern_collection = pattern_collection.replace('*', '%')
    self.log('DBG| search.iglob pattern_collection: %s' %
             pattern_collection, debug)

    fields = [Collection.name]
    criteria = [
        Criterion('like', Collection.name, pattern_collection),
        Criterion('not like', Collection.name, pattern_collection + '/%')
    ]
    q = self.session.query(*fields).filter(*criteria)

    for result in q.get_results():
        path = result[Collection.name].replace(path_root_abs, path_root, 1)
        yield path

    # Next, the data objects
    pattern_collection = os.path.dirname(pattern_collection)
    pattern_object = os.path.basename(pattern)
    pattern_object = pattern_object.replace('*', '%')
    self.log('DBG| search.iglob pattern_object: %s' % pattern_object, debug)

    fields = [Collection.name, DataObject.name]
    criteria = [
        Criterion('like', Collection.name, pattern_collection),
        Criterion('not like', Collection.name, pattern_collection + '/%'),
        Criterion('like', DataObject.name, pattern_object)
    ]
    q = self.session.query(*fields).filter(*criteria)

    for result in q.get_results():
        path = os.path.join(result[Collection.name],
                            result[DataObject.name])
        path = path.replace(path_root_abs, path_root, 1)
        yield path
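
# Hedged usage sketch for iglob(): since it is a generator, matches can be
# consumed lazily; `session` again stands for the wrapper object assumed by
# these search methods.
def first_match(session, pattern):
    """Return the first matching path, or None if nothing matches."""
    return next(session.search.iglob(pattern), None)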
def do_register(run_handle):
    error_file = '%s_errors.log' % run_handle
    error_file = error_file.replace('/', '_')
    with open(error_file, 'w') as error_file:
        checksum_map = {}
        run_data_dir_filesystem = "%s/hpc_storage/run_data/%s" % (
            phengs_path_prefix, run_handle)
        machine_fastqs_dir_filesystem = "%s/hpc_storage/machine_fastqs/%s" % (
            phengs_path_prefix, run_handle)

        # remove the restore_from_archive and written_to_archive files;
        # ignore the error if they do not exist
        os.system("rm %s/restore_from_archive 2>/dev/null" %
                  run_data_dir_filesystem)
        os.system("rm %s/written_to_archive 2>/dev/null" %
                  run_data_dir_filesystem)

        recursively_register_and_checksum(run_data_dir_filesystem,
                                          checksum_map, run_handle,
                                          error_file)
        recursively_register_and_checksum(machine_fastqs_dir_filesystem,
                                          checksum_map, run_handle,
                                          error_file)

        # open results_ngssample_dirs and register the directories in it
        results_file = "%s/results_ngssample_dirs" % run_data_dir_filesystem
        try:
            with open(results_file) as f:
                for line in f:
                    os_path = line.strip()
                    # register
                    recursively_register_and_checksum(os_path, checksum_map,
                                                      run_handle, error_file)
                    # replicate and trim
                    recursively_replicate_and_trim(os_path, run_handle)
        except IOError:
            print('WARNING: No results_ngssample_dirs file found.')
            error_file.write(
                'WARNING: No results_ngssample_dirs file found.\n')

        # replicate and trim run_data
        recursively_replicate_and_trim(run_data_dir_filesystem, run_handle)

        # replicate and trim fastqs
        recursively_replicate_and_trim(machine_fastqs_dir_filesystem,
                                       run_handle)

        # do post verification: wait for all rules to complete
        rule_count_query = ("select COUNT(RULE_EXEC_ID) where "
                            "RULE_EXEC_USER_NAME = 'ngsservicearchive' and "
                            "RULE_EXEC_NAME like '%{run_handle}%'"
                            .format(**locals()))
        active_rules = int(subprocess.check_output(
            ['iquest', '%s', rule_count_query]).strip())
        while active_rules > 0:
            print("Waiting for replication jobs to complete. Job count = %d"
                  % active_rules)
            time.sleep(5)
            active_rules = int(subprocess.check_output(
                ['iquest', '%s', rule_count_query]).strip())
        print("Replication jobs completed...")

        # now compare checksums with those in checksum_map
        print("-------------------")
        print("Validating files...")
        print("-------------------")
        env_file = os.path.expanduser('~/.irods/irods_environment.json')
        validation_status = True
        with iRODSSession(irods_env_file=env_file) as session:
            for file_path in checksum_map:
                # find the object in iRODS and get its checksum
                # select COLL_NAME, DATA_NAME where META_DATA_ATTR_NAME =
                # 'filesystem::path' and META_DATA_ATTR_VALUE = <file_path>
                found_file = False
                results = session.query(
                    Collection.name, DataObject, Resource.name).filter(
                    Criterion('=', DataObjectMeta.name, 'filesystem::path')).filter(
                    Criterion('=', DataObjectMeta.value, file_path)).filter(
                    Criterion('=', Resource.name, 's3_resc'))
                for result in results:
                    found_file = True
                    # the stored checksum is 'sha2:<base64>'; strip the
                    # prefix and render it as hex for comparison
                    stored_checksum = "".join(
                        "%02x" % b for b in bytearray(
                            base64.b64decode(result[DataObject.checksum][5:])))
                    if checksum_map[file_path] == stored_checksum:
                        pass  # print("Checksum validated for %s" % file_path)
                    else:
                        validation_status = False
                        print("ERROR: Checksum validation failed for %s: "
                              "%s vs %s" % (file_path,
                                            checksum_map[file_path],
                                            stored_checksum))
                        error_file.write(
                            "ERROR: Checksum validation failed for %s: "
                            "%s vs %s\n" % (file_path,
                                            checksum_map[file_path],
                                            stored_checksum))
                if found_file is False:
                    validation_status = False
                    print("ERROR: File %s was not found in archive..."
                          % file_path)
                    error_file.write(
                        "ERROR: File %s was not found in archive...\n"
                        % file_path)

        # create the written_to_archive file
        os.system("touch %s/written_to_archive" % run_data_dir_filesystem)

        if validation_status is False:
            print("ERROR: Post replication validation failed for at least "
                  "one file.")
            error_file.write("ERROR: Post replication validation failed "
                             "for at least one file.\n")
            sys.exit(1)
        else:
            print("Post replication validation succeeded.")
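
# checksum_map is filled by recursively_register_and_checksum(), which is
# not shown here. For the hex comparison in do_register() to work, the
# local side is presumably a plain SHA-256 hex digest, along these lines:
import hashlib

def local_sha256_hex(path, chunk_size=1 << 20):
    digest = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()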
def find(self, irods_path='.', pattern='*', use_wholename=False,
         types='d,f', mindepth=0, maxdepth=-1, collection_avu=[],
         object_avu=[], debug=False):
    """
    Yields iRODS collection and data object paths which match the given
    pattern, similar to the UNIX `find` command.

    Examples:

    >>> session.find('.', pattern='*mol*/*.xyz', types='f',
    ...              object_avu=('=,kind', 'like,%organic'))
    ['data/molecules/c6h6.xyz', './data/molecules/ch3cooh.xyz']
    >>> session.find('~/data*', pattern='molecules', types='d')
    ['~/data/molecules']

    Arguments:

    irods_path: str (default: '.')
        Glob pattern of the roots of the iRODS collection trees in
        which to search

    pattern: str (default: '*')
        The search pattern

    use_wholename: bool (default: False)
        Whether it is the whole (absolute) path name that has to match
        the pattern, or only the basename of the collection or data
        object.

    types: str (default: 'd,f')
        Comma-separated list of one or more of the following characters
        to select the type of results to include:

        * 'd' for directories (i.e. collections)
        * 'f' for files (i.e. data objects)

    mindepth: int (default: 0)
        Minimal depth with respect to the root collections

    maxdepth: int (default: -1)
        Maximal depth with respect to the root collections

    collection_avu: tuple or list of tuples (default: [])
        One or several attribute[-value[-unit]] patterns to be used in
        filtering collections.

    object_avu: tuple or list of tuples (default: [])
        One or several attribute[-value[-unit]] patterns to be used in
        filtering data objects.

    debug: bool (default: False)
        Set to True for debugging info
    """
    # Process arguments:
    assert mindepth >= 0, 'mindepth argument must be >= 0'

    if isinstance(object_avu, tuple):
        object_avu = [object_avu]
    if isinstance(collection_avu, tuple):
        collection_avu = [collection_avu]

    if not use_wholename and '/' in pattern:
        msg = "Pattern %s contains a slash. UNIX file names usually don't, "
        msg += "so this search will probably yield no results. Setting "
        msg += "'use_wholename=True' may help you find what you're "
        msg += "looking for."
        warnings.warn(msg % pattern)

    # Set up the metadata fields and criteria for the queries:
    def parse_avu_component(component):
        if component.count(',') == 0:
            operation, meta_pattern = '=', component
        elif component.count(',') == 1:
            operation, meta_pattern = component.split(',')
        else:
            raise ValueError('Cannot parse AVU component: %s' % component)
        return operation, meta_pattern

    meta_fields = {Collection: [], DataObject: []}
    meta_criteria = {Collection: [], DataObject: []}

    for model, avu_list in zip([Collection, DataObject],
                               [collection_avu, object_avu]):
        for avu in avu_list:
            if model == Collection:
                fields = [CollectionMeta.name, CollectionMeta.value,
                          CollectionMeta.units]
            elif model == DataObject:
                fields = [DataObjectMeta.name, DataObjectMeta.value,
                          DataObjectMeta.units]
            for item, field in zip(avu, fields):
                operation, meta_pattern = parse_avu_component(item)
                self.log('DBG| AVU criterion: %s %s %s' %
                         (operation, field, meta_pattern), debug)
                criterion = Criterion(operation, field, meta_pattern)
                meta_criteria[model].append(criterion)
                meta_fields[model].append(field)

    # Loop over the glob-pattern-matching collections and data objects
    for path_root in self.iglob(irods_path, debug=debug):
        self.log('DBG| search.find path_root: %s' % path_root, debug)
        path_root_abs = self.session.path.get_absolute_irods_path(path_root)

        if not self.session.collections.exists(path_root_abs):
            if 'f' in types.split(','):
                yield path_root
            continue

        # Walk the collection trees
        iterators = [
            self.walk(path_root, mindepth=mindepth, maxdepth=maxdepth,
                      return_objects=True, debug=debug)
        ]
        if mindepth == 0:
            # Also include the root collection,
            # which is not covered by self.walk
            collection = self.session.collections.get(path_root_abs)
            iterators.insert(0, [(collection, [collection], [])])
        iterator = itertools.chain(*iterators)

        for (collection, subcollections, data_objects) in iterator:
            self.log('DBG| search.find collection: %s' % collection.path,
                     debug)
            # Now we are left with collections and data objects
            # which match the depths and the given 'irods_path'
            # glob pattern, and we just need to further filter
            # on the (whole)name pattern and the AVUs.
            # Things to keep in mind:
            # * Collection: 'name' attribute refers to the full path,
            #               'path' attribute non-existent
            # * DataObject: 'name' attribute refers to the basename,
            #               'path' attribute non-existent
            # * iRODSCollection and iRODSDataObject:
            #               'name' refers to the basename,
            #               'path' refers to the full path
            for t, items in zip(['d', 'f'], [subcollections, data_objects]):
                if t not in types.split(','):
                    continue
                for item in items:
                    name = item.path if use_wholename else item.name
                    if not fnmatch.fnmatch(name, pattern):
                        continue
                    if t == 'd':
                        q = self.session.query(Collection.name,
                                               *meta_fields[Collection])
                        criterion = Criterion('=', Collection.name,
                                              item.path)
                        q = q.filter(criterion, *meta_criteria[Collection])
                    elif t == 'f':
                        q = self.session.query(Collection.name,
                                               DataObject.name,
                                               *meta_fields[DataObject])
                        criteria = [
                            Criterion('=', Collection.name,
                                      collection.path),
                            Criterion('=', DataObject.name, item.name)
                        ]
                        q = q.filter(*criteria, *meta_criteria[DataObject])
                    results = [result for result in q.get_results()]
                    assert len(results) in [0, 1], results
                    if len(results) == 1:
                        path = item.path.replace(path_root_abs,
                                                 path_root.rstrip('/'), 1)
                        yield path
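
# Hedged usage sketch for find(): collect data objects whose AVU 'kind'
# matches a pattern, using the '<operation>,<pattern>' AVU syntax described
# in the docstring above; the paths and AVU values are illustrative.
def list_organic_molecules(session):
    return list(session.search.find('./data', pattern='*.xyz', types='f',
                                    object_avu=('=,kind', 'like,%organic')))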