def custom_result_summary_renderer(results):  # pragma: no cover
    # fish out sizes of annexed files. those will only be present
    # with --annex ...
    annexed = [
        (int(r['bytesize']), r.get('has_content', None))
        for r in results
        if r.get('action', None) == 'status'
        and 'key' in r and 'bytesize' in r]
    if annexed:
        have_availability = any(a[1] is not None for a in annexed)
        total_size = bytes2human(sum(a[0] for a in annexed))
        # we have availability info encoded in the results
        from datalad.ui import ui
        if have_availability:
            ui.message(
                "{} annex'd {} ({}/{} present/total size)".format(
                    len(annexed),
                    single_or_plural('file', 'files', len(annexed)),
                    bytes2human(sum(a[0] for a in annexed if a[1])),
                    total_size))
        else:
            ui.message(
                "{} annex'd {} ({} recorded total size)".format(
                    len(annexed),
                    single_or_plural('file', 'files', len(annexed)),
                    total_size))
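For reference, a minimal sketch of the two helpers driving the summary above, assuming single_or_plural comes from datalad.dochelpers (as imported verbatim elsewhere in these snippets) and bytes2human from datalad.utils:

# hedged sketch; import locations as assumed above
from datalad.dochelpers import single_or_plural
from datalad.utils import bytes2human

print(single_or_plural('file', 'files', 1))   # file
print(single_or_plural('file', 'files', 3))   # files
# include_count=True prepends the count to the chosen form
print(single_or_plural('file', 'files', 3, include_count=True))  # 3 files
# bytes2human renders a byte count with a human-readable unit suffix
print(bytes2human(2048))  # e.g. '2.0 kB' (exact formatting may vary)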
def custom_result_summary_renderer(results):  # pragma: no cover
    # fish out sizes of annexed files. those will only be present
    # with --annex ...
    annexed = [
        (int(r['bytesize']), r.get('has_content', None))
        for r in results
        if r.get('action', None) == 'status'
        and 'key' in r and 'bytesize' in r]
    if annexed:
        have_availability = any(a[1] is not None for a in annexed)
        total_size = bytes2human(sum(a[0] for a in annexed))
        # we have availability info encoded in the results
        from datalad.ui import ui
        if have_availability:
            ui.message("{} annex'd {} ({}/{} present/total size)".format(
                len(annexed),
                single_or_plural('file', 'files', len(annexed)),
                bytes2human(sum(a[0] for a in annexed if a[1])),
                total_size))
        else:
            ui.message("{} annex'd {} ({} recorded total size)".format(
                len(annexed),
                single_or_plural('file', 'files', len(annexed)),
                total_size))
    if all(r.get('action', None) == 'status'
           and r.get('state', None) == 'clean' for r in results):
        from datalad.ui import ui
        ui.message("nothing to save, working tree clean")
def __call__(self, query, max_nresults=None, force_reindex=False):
    with self.idx_obj.searcher() as searcher:
        wquery = self.get_query(query)

        # perform the actual search
        hits = searcher.search(
            wquery,
            terms=True,
            limit=max_nresults if max_nresults > 0 else None)
        # report query stats
        topstr = '{} top {}'.format(
            max_nresults,
            single_or_plural('match', 'matches', max_nresults)
        )
        lgr.info('Query completed in {} sec.{}'.format(
            hits.runtime,
            ' Reporting {}.'.format(
                ('up to ' + topstr)
                if max_nresults > 0
                else 'all matches'
            ) if not hits.is_empty() else ' No matches.'
        ))

        if not hits:
            return

        nhits = 0
        # annotate hits for full metadata report
        hits = [dict(
            path=normpath(opj(self.ds.path, hit['path'])),
            query_matched={assure_unicode(k):
                           assure_unicode(v) if isinstance(v, unicode_srctypes) else v
                           for k, v in hit.matched_terms()},
            parentds=normpath(
                opj(self.ds.path, hit['parentds'])) if 'parentds' in hit else None,
            type=hit.get('type', None))
            for hit in hits]
        for res in query_aggregated_metadata(
                # type is taken from hit record
                reporton=None,
                ds=self.ds,
                aps=hits,
                # never recursive, we have direct hits already
                recursive=False):
            res.update(
                refds=self.ds.path,
                action='search',
                status='ok',
                logger=lgr,
            )
            yield res
            nhits += 1

        if max_nresults and nhits == max_nresults:
            lgr.info(
                "Reached the limit of {}, there could be more which "
                "were not reported.".format(topstr)
            )
def result_renderer_cmdline(res, args):
    from datalad.ui import ui
    from os import linesep
    if res is None:
        res = []
    if not isinstance(res, list):
        res = [res]
    if not len(res):
        ui.message("Got nothing new")
        return

    # provide summary
    nsuccess = sum(item.get('success', False) if isinstance(item, dict) else True
                   for item in res)
    nfailure = len(res) - nsuccess
    msg = "Tried to get %d %s." % (
        len(res), single_or_plural("file", "files", len(res)))
    if nsuccess:
        msg += " Got %d. " % nsuccess
    if nfailure:
        msg += " Failed to get %d." % (nfailure,)
    ui.message(msg)

    # if just a few or less than initially explicitly requested
    if len(res) < 10 or args.verbose:
        msg = linesep.join([
            "{path} ... {suc}".format(
                suc="ok." if isinstance(item, Dataset) or item.get('success', False)
                    else "failed. (%s)" % item.get('note', 'unknown reason'),
                path=item.get('file') if isinstance(item, dict) else item.path)
            for item in res])
        ui.message(msg)
def custom_result_summary_renderer(res):
    from datalad.ui import ui
    from os import linesep
    if not len(res):
        ui.message("Got nothing new")
        return

    nfiles = count_results(res, type='file')
    nsuccess_file = count_results(res, type='file', status='ok')
    nfailure = nfiles - nsuccess_file
    msg = "Tried to get %d %s that had no content yet." % (
        nfiles, single_or_plural("file", "files", nfiles))
    if nsuccess_file:
        msg += " Successfully obtained %d. " % nsuccess_file
    if nfailure:
        msg += " %d (failed)." % (nfailure,)
    ui.message(msg)

    # if just a few or less than initially explicitly requested
    if len(res) < 10:
        msg = linesep.join([
            "{path}{type} ... {suc}".format(
                suc=item.get('status'),
                path=item.get('path'),
                type=' [{}]'.format(item['type']) if 'type' in item else '')
            for item in res])
        ui.message(msg)
def _handle_and_return_installed_items(ds, installed_items, failed_items, save):
    if save and ds is not None:
        _save_installed_datasets(ds, installed_items)
    if failed_items:
        msg = ''
        for act, l in (("succeeded", installed_items), ("failed", failed_items)):
            if not l:
                continue
            if msg:
                msg += ', and '
            msg += "%s %s" % (
                single_or_plural("dataset", "datasets", len(l),
                                 include_count=True),
                act)
            if ds:
                paths = [relpath(i.path, ds.path)
                         if hasattr(i, 'path')
                         else i if not i.startswith(ds.path) else relpath(i, ds.path)
                         for i in l]
            else:
                paths = l
            msg += " (%s)" % (", ".join(map(str, paths)))
        msg += ' to install'

        # we were asked for multiple installations
        if installed_items or len(failed_items) > 1:
            raise IncompleteResultsError(
                results=installed_items, failed=failed_items, msg=msg)
        else:
            raise InstallFailedError(msg=msg)

    return installed_items[0] \
        if len(installed_items) == 1 else installed_items
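A hedged sketch of the message this helper assembles (dataset names invented), showing how single_or_plural(..., include_count=True) reads in context:

from datalad.dochelpers import single_or_plural

# condensed version of the loop above, with invented success/failure lists
msg = ''
for act, items in (("succeeded", ['sub1']), ("failed", ['sub2', 'sub3'])):
    if not items:
        continue
    if msg:
        msg += ', and '
    msg += "%s %s (%s)" % (
        single_or_plural("dataset", "datasets", len(items), include_count=True),
        act, ", ".join(items))
msg += ' to install'
print(msg)  # 1 dataset succeeded (sub1), and 2 datasets failed (sub2, sub3) to install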
def __call__(self, query, max_nresults=None, consider_ucn=False, full_record=True):
    query_re = re.compile(self.get_query(query))

    nhits = 0
    for res in query_aggregated_metadata(
            reporton=self.documenttype,
            ds=self.ds,
            aps=[dict(path=self.ds.path, type='dataset')],
            # MIH: I cannot see a case when we would not want recursion (within
            # the metadata)
            recursive=True):
        # this assumes that files are reported after each dataset report,
        # and after a subsequent dataset report no files for the previous
        # dataset will be reported again
        meta = res.get('metadata', {})
        # produce a flattened metadata dict to search through
        doc = _meta2autofield_dict(meta, val2str=True, consider_ucn=consider_ucn)
        # use search instead of match to not just get hits at the start of the string
        # this will be slower, but avoids having to use actual regex syntax at the user
        # side even for simple queries
        # DOTALL is needed to handle multiline description fields and such, and still
        # be able to match content coming for a later field
        lgr.log(7, "Querying %s among %d items", query_re, len(doc))
        t0 = time()
        matches = {k: query_re.search(v.lower()) for k, v in iteritems(doc)}
        dt = time() - t0
        lgr.log(7, "Finished querying in %f sec", dt)
        # retain what actually matched
        matches = {k: match.group() for k, match in matches.items() if match}
        if matches:
            hit = dict(
                res,
                action='search',
                query_matched=matches,
            )
            yield hit
            nhits += 1
            if max_nresults and nhits == max_nresults:
                # report query stats
                topstr = '{} top {}'.format(
                    max_nresults,
                    single_or_plural('match', 'matches', max_nresults))
                lgr.info(
                    "Reached the limit of {}, there could be more which "
                    "were not reported.".format(topstr))
                break
def _display_suppressed_message(nsimilar, ndisplayed, final=False):
    # +1 because there was the original result + nsimilar displayed.
    n_suppressed = nsimilar - ndisplayed + 1
    if n_suppressed > 0:
        ui.message(' [{} similar {} been suppressed]'.format(
            n_suppressed,
            single_or_plural("message has", "messages have",
                             n_suppressed, False)),
            cr="\n" if final else "\r")
def custom_result_summary_renderer(results):  # pragma: no cover
    # fish out sizes of annexed files. those will only be present
    # with --annex ...
    annexed = [
        (int(r['bytesize']), r.get('has_content', False))
        for r in results
        if r.get('action', None) == 'status'
        and 'key' in r and 'bytesize' in r]
    if annexed:
        from datalad.ui import ui
        ui.message(
            "{} annex'd {} ({}/{} present/total size)".format(
                len(annexed),
                single_or_plural('file', 'files', len(annexed)),
                bytes2human(sum(a[0] for a in annexed if a[1])),
                bytes2human(sum(a[0] for a in annexed))))
def _display_suppressed_message(nsimilar, ndisplayed, last_ts, final=False):
    # +1 because there was the original result + nsimilar displayed.
    n_suppressed = nsimilar - ndisplayed + 1
    if n_suppressed > 0:
        ts = time()
        # rate-limit update of suppression message, with a large number
        # of fast-paced results updating for each one can result in more
        # CPU load than the actual processing
        # arbitrarily go for a 2Hz update frequency -- it "feels" good
        if last_ts is None or final or (ts - last_ts > 0.5):
            ui.message(' [{} similar {} been suppressed]'.format(
                n_suppressed,
                single_or_plural("message has", "messages have",
                                 n_suppressed, False)),
                cr="\n" if final else "\r")
            return ts
    return last_ts
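A hypothetical caller sketch (loop bounds and counts invented): threading the returned timestamp back into each call caps the carriage-return repaint at roughly 2Hz, and a final call with final=True terminates the line with a newline:

last_ts = None
for nsimilar in range(11, 1000):
    # each call decides for itself whether enough time has passed to repaint
    last_ts = _display_suppressed_message(nsimilar, 10, last_ts)
# force the last repaint, ending with "\n" instead of "\r"
_display_suppressed_message(999, 10, last_ts, final=True)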
def _mk_search_index(self, force_reindex):
    """Generic entrypoint to index generation

    The actual work that determines the structure and content of the index
    is done by functions that are passed in as arguments

    `meta2doc` - must return dict for index document from result input
    """
    from whoosh import index as widx
    from .metadata import agginfo_relpath
    # what is the latest state of aggregated metadata
    metadata_state = self.ds.repo.get_last_commit_hash(agginfo_relpath)
    # use location common to all index types, they would all invalidate
    # simultaneously
    stamp_fname = opj(self.index_dir, 'datalad_metadata_state')
    index_dir = opj(self.index_dir, self._mode_label)

    if (not force_reindex) and \
            exists(index_dir) and \
            exists(stamp_fname) and \
            open(stamp_fname).read() == metadata_state:
        try:
            # TODO check that the index schema is the same
            # as the one we would have used for reindexing
            # TODO support incremental re-indexing, whoosh can do it
            idx = widx.open_dir(index_dir)
            lgr.debug(
                'Search index contains %i documents',
                idx.doc_count())
            self.idx_obj = idx
            return
        except widx.LockError as e:
            raise e
        except widx.IndexError as e:
            # Generic index error.
            # we try to regenerate
            lgr.warning(
                "Cannot open existing index %s (%s), will regenerate",
                index_dir, exc_str(e)
            )
        except widx.IndexVersionError as e:  # (msg, version, release=None)
            # Raised when you try to open an index using a format that the
            # current version of Whoosh cannot read. That is, when the index
            # you're trying to open is either not backward or forward
            # compatible with this version of Whoosh.
            # we try to regenerate
            lgr.warning(exc_str(e))
            pass
        except widx.OutOfDateError as e:
            # Raised when you try to commit changes to an index which is not
            # the latest generation.
            # this should not happen here, but if it does ... KABOOM
            raise
        except widx.EmptyIndexError as e:
            # Raised when you try to work with an index that has no indexed
            # terms.
            # we can just continue with generating an index
            pass
        except ValueError as e:
            if 'unsupported pickle protocol' in str(e):
                lgr.warning(
                    "Cannot open existing index %s (%s), will regenerate",
                    index_dir, exc_str(e)
                )
            else:
                raise

    lgr.info('{} search index'.format(
        'Rebuilding' if exists(index_dir) else 'Building'))

    if not exists(index_dir):
        os.makedirs(index_dir)

    # this is a pretty cheap call that just pulls this info from a file
    dsinfo = self.ds.metadata(
        get_aggregates=True,
        return_type='list',
        result_renderer='disabled')

    self._mk_schema(dsinfo)

    idx_obj = widx.create_in(index_dir, self.schema)
    idx = idx_obj.writer(
        # cache size per process
        limitmb=cfg.obtain('datalad.search.indexercachesize'),
        # disable parallel indexing for now till #1927 is resolved
        ## number of processes for indexing
        #procs=multiprocessing.cpu_count(),
        ## write separate index segments in each process for speed
        ## asks for writer.commit(optimize=True)
        #multisegment=True,
    )

    # load metadata of the base dataset and what it knows about all its
    # subdatasets (recursively)
    old_idx_size = 0
    old_ds_rpath = ''
    idx_size = 0
    log_progress(
        lgr.info,
        'autofieldidxbuild',
        'Start building search index',
        total=len(dsinfo),
        label='Building search index',
        unit=' Datasets',
    )
    for res in query_aggregated_metadata(
            reporton=self.documenttype,
            ds=self.ds,
            aps=[dict(path=self.ds.path, type='dataset')],
            # MIH: I cannot see a case when we would not want recursion (within
            # the metadata)
            recursive=True):
        # this assumes that files are reported after each dataset report,
        # and after a subsequent dataset report no files for the previous
        # dataset will be reported again
        meta = res.get('metadata', {})
        doc = self._meta2doc(meta)
        admin = {
            'type': res['type'],
            'path': relpath(res['path'], start=self.ds.path),
        }
        if 'parentds' in res:
            admin['parentds'] = relpath(res['parentds'], start=self.ds.path)
        if admin['type'] == 'dataset':
            if old_ds_rpath:
                lgr.debug(
                    'Added %s on dataset %s',
                    single_or_plural(
                        'document',
                        'documents',
                        idx_size - old_idx_size,
                        include_count=True),
                    old_ds_rpath)
            log_progress(lgr.info, 'autofieldidxbuild',
                         'Indexed dataset at %s', old_ds_rpath,
                         update=1, increment=True)
            old_idx_size = idx_size
            old_ds_rpath = admin['path']
            admin['id'] = res.get('dsid', None)

        doc.update({k: assure_unicode(v) for k, v in admin.items()})
        lgr.debug("Adding document to search index: {}".format(doc))
        # inject into index
        idx.add_document(**doc)
        idx_size += 1

    if old_ds_rpath:
        lgr.debug(
            'Added %s on dataset %s',
            single_or_plural(
                'document',
                'documents',
                idx_size - old_idx_size,
                include_count=True),
            old_ds_rpath)

    lgr.debug("Committing index")
    idx.commit(optimize=True)
    log_progress(
        lgr.info, 'autofieldidxbuild', 'Done building search index')

    # "timestamp" the search index to allow for automatic invalidation
    with open(stamp_fname, 'w') as f:
        f.write(metadata_state)

    lgr.info('Search index contains %i documents', idx_size)
    self.idx_obj = idx_obj
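A self-contained sketch of the whoosh index lifecycle relied on above (directory, schema, and document values invented), covering create_in, writer, add_document, commit, and the open_dir/doc_count path taken when a valid cached index already exists:

import tempfile

from whoosh import index as widx
from whoosh.fields import ID, TEXT, Schema

index_dir = tempfile.mkdtemp()
schema = Schema(path=ID(stored=True), description=TEXT)

# build: create_in + writer/add_document/commit, as in the function above
idx_obj = widx.create_in(index_dir, schema)
writer = idx_obj.writer()
writer.add_document(path=u'sub01/anat', description=u'T1-weighted image')
writer.commit(optimize=True)

# reuse: the cached-index branch above reopens and reports the document count
idx = widx.open_dir(index_dir)
print(idx.doc_count())  # 1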
def __call__(
        path=None,
        dataset=None,
        get_aggregates=False,
        reporton='all',
        recursive=False):
    # prep results
    refds_path = Interface.get_refds_path(dataset)
    res_kwargs = dict(action='metadata', logger=lgr)
    if refds_path:
        res_kwargs['refds'] = refds_path

    if get_aggregates:
        # yield all datasets for which we have aggregated metadata as results
        # these are actual dataset results, so we can turn them into dataset
        # instances using generic top-level code if desired
        ds = require_dataset(
            refds_path,
            check_installed=True,
            purpose='aggregate metadata query')
        info_fpath = opj(ds.path, agginfo_relpath)
        if not exists(info_fpath):
            # if there had ever been an aggregation run, this file would
            # exist; hence there has not been, and we need to tell this
            # to people
            yield get_status_dict(
                ds=ds,
                status='impossible',
                action='metadata',
                logger=lgr,
                message='metadata aggregation has never been performed in '
                        'this dataset')
            return
        agginfos = _load_json_object(info_fpath)
        parentds = []
        for sd in sorted(agginfos):
            info = agginfos[sd]
            dspath = normpath(opj(ds.path, sd))
            if parentds and not path_is_subpath(dspath, parentds[-1]):
                parentds.pop()
            info.update(
                path=dspath,
                type='dataset',
                status='ok',
            )
            if sd == curdir:
                info['layout_version'] = aggregate_layout_version
            if parentds:
                info['parentds'] = parentds[-1]
            yield dict(
                info,
                **res_kwargs
            )
            parentds.append(dspath)
        return

    if not dataset and not path:
        # makes no sense to have no dataset, go with "here"
        # error generation happens during annotation
        path = curdir

    content_by_ds = OrderedDict()
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            # MIH: we are querying the aggregated metadata anyways, and that
            # mechanism has its own, faster way to go down the hierarchy
            #recursive=recursive,
            #recursion_limit=recursion_limit,
            action='metadata',
            # uninstalled subdatasets could be queried via aggregated metadata
            # -> no 'error'
            unavailable_path_status='',
            nondataset_path_status='error',
            # we need to know when to look into aggregated data
            force_subds_discovery=True,
            force_parentds_discovery=True,
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo(ap['path']):
            ap['process_content'] = True
        to_query = None
        if ap.get('state', None) == 'absent' or \
                ap.get('type', 'dataset') != 'dataset':
            # this is a lonely absent dataset/file or content in a present dataset
            # -> query through parent
            # there must be a parent, otherwise this would be a non-dataset path
            # and would have errored during annotation
            to_query = ap['parentds']
        else:
            to_query = ap['path']
        if to_query:
            pcontent = content_by_ds.get(to_query, [])
            pcontent.append(ap)
            content_by_ds[to_query] = pcontent

    # test for datasets that will be queried, but have never been aggregated
    # TODO add option, even by default, to re-aggregate metadata prior to the
    # query if it was found to be outdated.
    # This is superior to re-aggregation upon manipulation, as manipulation
    # can happen in a gazillion ways and may even be incremental over multiple
    # steps where intermediate re-aggregation is pointless and wasteful
    to_aggregate = [d for d in content_by_ds
                    if not exists(opj(d, agginfo_relpath))]
    if to_aggregate:
        lgr.warning(
            'Metadata query results might be incomplete, initial '
            'metadata aggregation was not yet performed in %s at: %s',
            single_or_plural(
                'dataset', 'datasets', len(to_aggregate), include_count=True),
            to_aggregate)

    for ds_path in content_by_ds:
        ds = Dataset(ds_path)
        query_agg = [ap for ap in content_by_ds[ds_path]
                     # this is an available subdataset, will be processed in
                     # another iteration
                     if ap.get('state', None) == 'absent' or
                     not (ap.get('type', None) == 'dataset' and
                          ap['path'] != ds_path)]
        if not query_agg:
            continue
        # report from aggregated metadata
        for r in query_aggregated_metadata(
                reporton,
                # by default query the reference dataset, only if there is none
                # try our luck in the dataset that contains the queried path
                # this is consistent with e.g. `get_aggregates` reporting the
                # situation in the reference dataset only
                Dataset(refds_path) if refds_path else ds,
                query_agg,
                # recursion above could only recurse into datasets
                # on the filesystem, but there might be any number of
                # uninstalled datasets underneath the last installed one
                # for which we might have metadata
                recursive=recursive,
                **res_kwargs):
            yield r
    return
def __call__(path, dataset=None, spec_file=None, properties=None,
             replace=False):
    # TODO: message

    dataset = require_dataset(dataset, check_installed=True,
                              purpose="hirni spec4anything")
    path = assure_list(path)
    path = [resolve_path(p, dataset) for p in path]

    res_kwargs = dict(action='hirni spec4anything', logger=lgr)
    res_kwargs['refds'] = Interface.get_refds_path(dataset)

    # ### This might become superfluous. See datalad-gh-2653
    ds_path = PathRI(dataset.path)
    # ###

    updated_files = []
    paths = []
    for ap in AnnotatePaths.__call__(
            dataset=dataset,
            path=path,
            action='hirni spec4anything',
            unavailable_path_status='impossible',
            nondataset_path_status='error',
            return_type='generator',
            # TODO: Check this one out:
            on_failure='ignore',
            # Note/TODO: Not sure yet whether and when we need those.
            # Generally we want to be able to create a spec for subdatasets,
            # too:
            # recursive=recursive,
            # recursion_limit=recursion_limit,
            # force_subds_discovery=True,
            # force_parentds_discovery=True,
    ):
        if ap.get('status', None) in ['error', 'impossible']:
            yield ap
            continue

        # ### This might become superfluous. See datalad-gh-2653
        ap_path = PathRI(ap['path'])
        # ###

        # find acquisition and respective specification file:
        rel_path = posixpath.relpath(ap_path.posixpath, ds_path.posixpath)
        path_parts = rel_path.split('/')

        # TODO: Note: Outcommented this warning for now. We used to not have
        # a spec file at the toplevel of the study dataset, but now we do.
        # The logic afterwards works, but should be revisited. At least,
        # `acq` should be called differently now.
        # if len(path_parts) < 2:
        #     lgr.warning("Not within an acquisition")
        acq = path_parts[0]

        # TODO: spec file specifiable or fixed path?
        #       if we want the former, what we actually need is an
        #       association of acquisition and its spec path
        #       => prob. not an option but a config
        spec_path = spec_file if spec_file \
            else posixpath.join(ds_path.posixpath, acq,
                                dataset.config.get(
                                    "datalad.hirni.studyspec.filename",
                                    "studyspec.json"))

        spec = [r for r in json_py.load_stream(spec_path)] \
            if posixpath.exists(spec_path) else list()

        lgr.debug("Add specification snippet for %s", ap['path'])
        # XXX 'add' does not seem to be the thing we want to do
        # rather 'set', so we have to check whether a spec for a location
        # is already known and fail or replace it (maybe with --force)

        # go through all existing specs and extract unique values
        # and also assign them to the new record (subjects, ...), but only
        # editable fields!!
        uniques = dict()
        for s in spec:
            for k in s:
                if isinstance(s[k], dict) and 'value' in s[k]:
                    if k not in uniques:
                        uniques[k] = set()
                    uniques[k].add(s[k]['value'])
        overrides = dict()
        for k in uniques:
            if len(uniques[k]) == 1:
                overrides[k] = _get_edit_dict(value=uniques[k].pop(),
                                              approved=False)

        if properties:
            # TODO: This entire reading of properties needs to be RF'd
            # into proper generalized functions.
            # spec got more complex. update() prob. can't simply override
            # (think: 'procedures' and 'tags' prob. need to be appended
            # instead)

            # load from file or json string
            if isinstance(properties, dict):
                props = properties
            elif op.exists(properties):
                props = json_py.load(properties)
            else:
                props = json_py.loads(properties)
            # turn into editable, pre-approved records
            spec_props = {
                k: dict(value=v, approved=True)
                for k, v in props.items()
                if k not in non_editables + ['tags', 'procedures']
            }
            spec_props.update({
                k: v
                for k, v in props.items()
                if k in non_editables + ['tags']
            })
            # TODO: still wrong. It's a list. Append or override? How to decide?
            spec_props.update({
                o_k: [{
                    i_k: dict(value=i_v, approved=True)
                    for i_k, i_v in o_v.items()
                }]
                for o_k, o_v in props.items()
                if o_k in ['procedures']
            })

            overrides.update(spec_props)

        # TODO: It's probably wrong to use uniques for overwriting! At least
        # they cannot be used to overwrite values explicitly set in
        # _add_to_spec like "location", "type", etc.
        #
        # But then: This should concern non-editable fields only, right?
        spec = _add_to_spec(spec, posixpath.split(spec_path)[0], ap,
                            dataset, overrides=overrides, replace=replace)

        # Note: Not sure whether we really want one commit per snippet.
        #       If not - consider:
        #       - What if we fail amidst? => Don't write to file yet.
        #       - What about input paths from different acquisitions?
        #         => store specs per acquisition in memory
        # MIH: One commit per line seems silly. why not update all files
        # collect paths of updated files, and give them to a single `add`
        # at the very end?
        # MIH: if we fail, we fail and nothing is committed
        from datalad_hirni.support.spec_helpers import sort_spec
        json_py.dump2stream(sorted(spec, key=lambda x: sort_spec(x)),
                            spec_path)
        updated_files.append(spec_path)

        yield get_status_dict(
            status='ok',
            type=ap['type'],
            path=ap['path'],
            **res_kwargs)
        paths.append(ap)

    from datalad.dochelpers import single_or_plural
    from os import linesep
    message = "[HIRNI] Add specification {n_snippets} for: {paths}".format(
        n_snippets=single_or_plural("snippet", "snippets", len(paths)),
        paths=linesep.join(" - " + op.relpath(p['path'], dataset.path)
                           for p in paths)
        if len(paths) > 1
        else op.relpath(paths[0]['path'], dataset.path))
    for r in dataset.save(updated_files,
                          to_git=True,
                          message=message,
                          return_type='generator',
                          result_renderer='disabled'):
        yield r
def __call__(query=None,
             dataset=None,
             force_reindex=False,
             max_nresults=20,
             show_keys=False,
             show_query=False):
    from whoosh import qparser as qparse

    try:
        ds = require_dataset(dataset, check_installed=True,
                             purpose='dataset search')
        if ds.id is None:
            raise NoDatasetArgumentFound(
                "This does not seem to be a dataset (no DataLad dataset ID "
                "found). 'datalad create --force %s' can initialize "
                "this repository as a DataLad dataset" % ds.path)
    except NoDatasetArgumentFound:
        for r in _search_from_virgin_install(dataset, query):
            yield r
        return

    # where does the bunny have the eggs?
    index_dir = opj(ds.path, get_git_dir(ds.path), 'datalad', 'search_index')

    idx_obj = _get_search_index(index_dir, ds, force_reindex)

    if show_keys:
        definitions_fname = opj(
            index_dir,
            'datalad_term_definitions.json.gz')
        try:
            defs = jsonload(gzopen(definitions_fname))
        except Exception as e:
            lgr.warning(
                'No term definitions found alongside search index: %s',
                exc_str(e))
            defs = {}

        for k in idx_obj.schema.names():
            print('{}{}'.format(
                k,
                ' {}'.format(
                    defs[k]
                    if isinstance(defs[k], dict)
                    else '({})'.format(defs[k]))
                if k in defs else ''))
        return

    if not query:
        return

    with idx_obj.searcher() as searcher:
        # parse the query string, default whoosh parser ATM, could be
        # tailored with plugins
        parser = qparse.MultifieldParser(
            idx_obj.schema.names(),
            idx_obj.schema)
        # XXX: plugin is broken in Debian's whoosh 2.7.0-2, but already fixed
        # upstream
        parser.add_plugin(qparse.FuzzyTermPlugin())
        parser.add_plugin(qparse.GtLtPlugin())
        # replace field definition to allow for colons to be part of a field's
        # name:
        parser.replace_plugin(
            qparse.FieldsPlugin(expr=r"(?P<text>[()<>:\w]+|[*]):"))
        # for convenience we accept any number of args-words from the
        # shell and put them together to a single string here
        querystr = ' '.join(assure_list(query))
        # this gives a formal whoosh query
        wquery = parser.parse(querystr)

        if show_query:
            print(wquery)
            return
        # perform the actual search
        hits = searcher.search(
            wquery,
            terms=True,
            limit=max_nresults if max_nresults > 0 else None)
        # cheap way to get an approximate number of hits, without an expensive
        # scoring of all items
        # disabled: unreliable estimate, often confusing
        #nhits = hits.estimated_min_length()
        # report query stats
        topstr = '{} top {}'.format(
            max_nresults,
            single_or_plural('match', 'matches', max_nresults))
        lgr.info('Query completed in {} sec.{}'.format(
            hits.runtime,
            ' Reporting {}.'.format(
                ('up to ' + topstr)
                if max_nresults > 0
                else 'all matches')
            if not hits.is_empty() else ' No matches.'))

        if not hits:
            return

        nhits = 0
        for hit in hits:
            res = dict(
                action='search',
                status='ok',
                logger=lgr,
                refds=ds.path,
                # normpath to avoid trailing dot
                path=normpath(opj(ds.path, hit['path'])),
                query_matched={assure_unicode(k):
                               assure_unicode(v)
                               if isinstance(v, unicode_srctypes) else v
                               for k, v in hit.matched_terms()},
                metadata={k: v for k, v in hit.fields().items()
                          if k not in ('path', 'parentds')})
            if 'parentds' in hit:
                res['parentds'] = normpath(opj(ds.path, hit['parentds']))
            yield res
            nhits += 1

        if max_nresults and nhits == max_nresults:
            lgr.info(
                "Reached the limit of {}, there could be more which "
                "were not reported.".format(topstr))
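A hedged sketch of the whoosh query-parser setup above (schema fields invented): MultifieldParser targets every named field when the query string carries no field prefix, and the two plugins enable fuzzy (~) and greater/less-than range terms:

from whoosh import qparser as qparse
from whoosh.fields import ID, TEXT, Schema

schema = Schema(path=ID(stored=True), description=TEXT)
parser = qparse.MultifieldParser(schema.names(), schema)
parser.add_plugin(qparse.FuzzyTermPlugin())
parser.add_plugin(qparse.GtLtPlugin())
# yields a formal whoosh query spanning both fields
print(parser.parse('imaging~2'))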
def _get_search_index(index_dir, ds, force_reindex):
    from whoosh import index as widx
    from .metadata import agginfo_relpath
    # what is the latest state of aggregated metadata
    metadata_state = ds.repo.get_last_commit_hash(agginfo_relpath)
    stamp_fname = opj(index_dir, 'datalad_metadata_state')
    definitions_fname = opj(index_dir, 'datalad_term_definitions.json.gz')

    if not force_reindex and \
            exists(stamp_fname) and \
            open(stamp_fname).read() == metadata_state:
        try:
            # TODO check that the index schema is the same
            # as the one we would have used for reindexing
            # TODO support incremental re-indexing, whoosh can do it
            idx = widx.open_dir(index_dir)
            lgr.debug(
                'Search index contains %i documents',
                idx.doc_count())
            return idx
        except widx.LockError as e:
            raise e
        except widx.IndexError as e:
            # Generic index error.
            # we try to regenerate
            # TODO log this
            pass
        except widx.IndexVersionError as e:  # (msg, version, release=None)
            # Raised when you try to open an index using a format that the
            # current version of Whoosh cannot read. That is, when the index
            # you're trying to open is either not backward or forward
            # compatible with this version of Whoosh.
            # we try to regenerate
            # TODO log this
            pass
        except widx.OutOfDateError as e:
            # Raised when you try to commit changes to an index which is not
            # the latest generation.
            # this should not happen here, but if it does ... KABOOM
            raise e
        except widx.EmptyIndexError as e:
            # Raised when you try to work with an index that has no indexed
            # terms.
            # we can just continue with generating an index
            pass

    lgr.info('{} search index'.format(
        'Rebuilding' if exists(index_dir) else 'Building'))

    if not exists(index_dir):
        os.makedirs(index_dir)

    schema, definitions, per_ds_defs = _get_search_schema(ds)

    idx_obj = widx.create_in(index_dir, schema)
    idx = idx_obj.writer(
        # cache size per process
        limitmb=cfg.obtain('datalad.search.indexercachesize'),
        # disable parallel indexing for now till #1927 is resolved
        ## number of processes for indexing
        #procs=multiprocessing.cpu_count(),
        ## write separate index segments in each process for speed
        ## asks for writer.commit(optimize=True)
        #multisegment=True,
    )

    # load metadata of the base dataset and what it knows about all its
    # subdatasets (recursively)
    old_idx_size = 0
    old_ds_rpath = ''
    idx_size = 0
    for res in _query_aggregated_metadata(
            reporton=ds.config.obtain(
                'datalad.metadata.searchindex-documenttype'),
            ds=ds,
            aps=[dict(path=ds.path, type='dataset')],
            # TODO expose? but this would likely only affect metadata in the
            # base dataset
            merge_mode='init',
            # MIH: I cannot see a case when we would not want recursion (within
            # the metadata)
            recursive=True):
        rpath = relpath(res['path'], start=ds.path)
        # this assumes that files are reported after each dataset report,
        # and after a subsequent dataset report no files for the previous
        # dataset will be reported again
        rtype = res['type']
        meta = res.get('metadata', {})
        meta = MetadataDict(meta)
        if rtype == 'dataset':
            if old_ds_rpath:
                lgr.info(
                    'Added %s on dataset %s',
                    single_or_plural(
                        'document',
                        'documents',
                        idx_size - old_idx_size,
                        include_count=True),
                    old_ds_rpath)
            old_idx_size = idx_size
            old_ds_rpath = rpath

            # get any custom dataset mappings
            ds_defs = per_ds_defs.get(res['path'], {})
            # now we merge all reported unique content properties (flattened
            # representation of content metadata) with the main metadata set,
            # using the 'add' strategy. This way any existing metadata value of
            # a dataset itself will be amended by those coming from the
            # content. E.g. a single dataset 'license' might be turned into a
            # sequence of unique license identifiers across all dataset
            # components
            meta.merge_add(meta.get('unique_content_properties', {}))
            meta.pop('unique_content_properties', None)
        doc_props = dict(
            path=rpath,
            type=rtype,
            **_meta2index_dict(meta, definitions, ds_defs))
        if 'parentds' in res:
            doc_props['parentds'] = relpath(res['parentds'], start=ds.path)
        _add_document(idx, **doc_props)
        idx_size += 1

    if old_ds_rpath:
        lgr.info(
            'Added %s on dataset %s',
            single_or_plural(
                'document',
                'documents',
                idx_size - old_idx_size,
                include_count=True),
            old_ds_rpath)

    idx.commit(optimize=True)

    # "timestamp" the search index to allow for automatic invalidation
    with open(stamp_fname, 'w') as f:
        f.write(metadata_state)

    # dump the term/field definitions records for later introspection
    # use compressed storage, there is no point in inflating the
    # diskspace requirements
    with gzopen(definitions_fname, 'wb') as f:
        # TODO actually go through all, incl. compound, definitions ('@id'
        # plus 'unit' or similar) and resolve terms to URLs, if anyhow possible
        jsondump2file(definitions, f)

    lgr.info('Search index contains %i documents', idx_size)
    return idx_obj
def __call__(self, query, max_nresults=None, consider_ucn=False, full_record=True):
    if max_nresults is None:
        # no limit by default
        max_nresults = 0
    query = self.get_query(query)

    nhits = 0
    for res in query_aggregated_metadata(
            reporton=self.documenttype,
            ds=self.ds,
            aps=[dict(path=self.ds.path, type='dataset')],
            # MIH: I cannot see a case when we would not want recursion (within
            # the metadata)
            recursive=True):
        # this assumes that files are reported after each dataset report,
        # and after a subsequent dataset report no files for the previous
        # dataset will be reported again
        meta = res.get('metadata', {})
        # produce a flattened metadata dict to search through
        doc = _meta2autofield_dict(meta, val2str=True, consider_ucn=consider_ucn)
        # inject a few basic properties into the dict
        # analog to what the other modes do in their index
        doc.update({
            k: res[k] for k in ('@id', 'type', 'path', 'parentds')
            if k in res})
        # use search instead of match to not just get hits at the start of the string
        # this will be slower, but avoids having to use actual regex syntax at the user
        # side even for simple queries
        # DOTALL is needed to handle multiline description fields and such, and still
        # be able to match content coming for a later field
        lgr.log(7, "Querying %s among %d items", query, len(doc))
        t0 = time()
        matches = {(q['query'] if isinstance(q, dict) else q, k):
                   q['query'].search(v) if isinstance(q, dict) else q.search(v)
                   for k, v in iteritems(doc)
                   for q in query
                   if not isinstance(q, dict) or q['field'].match(k)}
        dt = time() - t0
        lgr.log(7, "Finished querying in %f sec", dt)
        # retain what actually matched
        matched = {k[1]: match.group() for k, match in matches.items() if match}
        # implement AND behavior across query expressions, but OR behavior
        # across queries matching multiple fields for a single query expression
        # for multiple queries, this makes it consistent with a query that
        # has no field specification
        if matched and len(query) == len(set(k[0] for k in matches if matches[k])):
            hit = dict(
                res,
                action='search',
                query_matched=matched,
            )
            yield hit
            nhits += 1
            if max_nresults and nhits == max_nresults:
                # report query stats
                topstr = '{} top {}'.format(
                    max_nresults,
                    single_or_plural('match', 'matches', max_nresults))
                lgr.info(
                    "Reached the limit of {}, there could be more which "
                    "were not reported.".format(topstr))
                break
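The AND-across-expressions / OR-across-fields rule above is compact; here is a self-contained sketch with invented query and document values (a plain regex applies to every field, a dict-based query only to fields whose name matches its 'field' regex, and a hit requires every query expression to have matched somewhere):

import re

query = [re.compile('bids'),
         dict(field=re.compile('name'), query=re.compile('sub-01'))]
doc = {'type': 'dataset', 'name': 'sub-01', 'comment': 'bids dataset'}
matches = {(q['query'] if isinstance(q, dict) else q, k):
           (q['query'] if isinstance(q, dict) else q).search(v)
           for k, v in doc.items()
           for q in query
           if not isinstance(q, dict) or q['field'].match(k)}
matched = {k[1]: m.group() for k, m in matches.items() if m}
# every query expression (k[0]) must appear among the successful matches
is_hit = matched and len(query) == len(set(k[0] for k in matches if matches[k]))
print(bool(is_hit))  # True: 'bids' matched 'comment', 'sub-01' matched 'name'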
def postclonecfg_annexdataset(ds, reckless, description=None):
    """If ds "knows annex" -- annex init it, set into reckless etc

    Provides additional tune-up to what is possibly an annex repo, e.g.
    "enables" reckless mode, sets up description
    """
    # in any case check whether we need to annex-init the installed thing:
    if not knows_annex(ds.path):
        # not for us
        return

    # init annex when traces of a remote annex can be detected
    if reckless == 'auto':
        lgr.debug(
            "Instruct annex to hardlink content in %s from local "
            "sources, if possible (reckless)", ds.path)
        ds.config.set(
            'annex.hardlink', 'true', where='local', reload=True)

    lgr.debug("Initializing annex repo at %s", ds.path)
    # Note, that we cannot enforce annex-init via AnnexRepo().
    # If such an instance already exists, its __init__ will not be executed.
    # Therefore do quick test once we have an object and decide whether to call
    # its _init().
    #
    # Additionally, call init if we need to add a description (see #1403),
    # since AnnexRepo.__init__ can only do it with create=True
    repo = AnnexRepo(ds.path, init=True)
    if not repo.is_initialized() or description:
        repo._init(description=description)
    if reckless == 'auto' or (reckless and reckless.startswith('shared-')):
        repo.call_annex(['untrust', 'here'])
    elif reckless == 'ephemeral':
        # with ephemeral we declare 'here' as 'dead' right away, whenever
        # we symlink origin's annex, since availability from 'here' should
        # not be propagated for an ephemeral clone when we publish back to
        # origin.
        # This will cause stuff like this for a locally present annexed file:
        # % git annex whereis d1
        # whereis d1 (0 copies) failed
        # BUT this works:
        # % git annex find . --not --in here
        # % git annex find . --in here
        # d1

        # we don't want annex copy-to origin
        ds.config.set(
            'remote.origin.annex-ignore', 'true',
            where='local')

        ds.repo.set_remote_dead('here')

        if check_symlink_capability(ds.repo.dot_git / 'dl_link_test',
                                    ds.repo.dot_git / 'dl_target_test'):
            # symlink the annex to avoid needless copies in an ephemeral clone
            annex_dir = ds.repo.dot_git / 'annex'
            origin_annex_url = ds.config.get("remote.origin.url", None)
            origin_git_path = None
            if origin_annex_url:
                try:
                    # Deal with file:// scheme URLs as well as plain paths.
                    # If origin isn't local, we have nothing to do.
                    origin_git_path = Path(RI(origin_annex_url).localpath)

                    # we are local; check for a bare repo first to not mess w/
                    # the path
                    if GitRepo(origin_git_path, create=False).bare:
                        # origin is a bare repo -> use path as is
                        pass
                    elif origin_git_path.name != '.git':
                        origin_git_path /= '.git'
                except ValueError:
                    # Note, that accessing localpath on a non-local RI throws
                    # ValueError rather than resulting in an AttributeError.
                    # TODO: Warning level okay or is info level sufficient?
                    # Note, that setting annex-dead is independent of
                    # symlinking .git/annex. It might still make sense to
                    # have an ephemeral clone that doesn't propagate its avail.
                    # info. Therefore don't fail altogether.
                    lgr.warning("reckless=ephemeral mode: origin doesn't seem "
                                "local: %s\nno symlinks being used",
                                origin_annex_url)

            if origin_git_path:
                # TODO make sure that we do not delete any unique data
                rmtree(str(annex_dir)) \
                    if not annex_dir.is_symlink() else annex_dir.unlink()
                annex_dir.symlink_to(origin_git_path / 'annex',
                                     target_is_directory=True)
        else:
            # TODO: What level? + note, that annex-dead is independent of
            # symlinking .git/annex
            lgr.warning("reckless=ephemeral mode: Unable to create symlinks "
                        "on this file system.")

    srs = {True: [], False: []}  # special remotes by "autoenable" key
    remote_uuids = None  # might be necessary to discover known UUIDs

    repo_config = repo.config
    # Note: The purpose of this function is to inform the user. So if
    # something looks misconfigured, we'll warn and move on to the next item.
    for uuid, config in repo.get_special_remotes().items():
        sr_name = config.get('name', None)
        if sr_name is None:
            lgr.warning(
                'Ignoring special remote %s because it does not have a name. '
                'Known information: %s',
                uuid, config)
            continue
        sr_autoenable = config.get('autoenable', False)
        try:
            sr_autoenable = ensure_bool(sr_autoenable)
        except ValueError:
            lgr.warning(
                'Failed to process "autoenable" value %r for sibling %s in '
                'dataset %s as bool. '
                'You might need to enable it later manually and/or fix it up '
                'to avoid this message in the future.',
                sr_autoenable, sr_name, ds.path)
            continue

        # If it looks like a type=git special remote, make sure we have up to
        # date information. See gh-2897.
        if sr_autoenable and repo_config.get("remote.{}.fetch".format(sr_name)):
            try:
                repo.fetch(remote=sr_name)
            except CommandError as exc:
                lgr.warning("Failed to fetch type=git special remote %s: %s",
                            sr_name, exc_str(exc))

        # determine whether there is a registered remote with matching UUID
        if uuid:
            if remote_uuids is None:
                remote_uuids = {
                    # Check annex-config-uuid first. For sameas annex remotes,
                    # this will point to the UUID for the configuration (i.e.
                    # the key returned by get_special_remotes) rather than the
                    # shared UUID.
                    (repo_config.get('remote.%s.annex-config-uuid' % r) or
                     repo_config.get('remote.%s.annex-uuid' % r))
                    for r in repo.get_remotes()
                }
            if uuid not in remote_uuids:
                srs[sr_autoenable].append(sr_name)

    if srs[True]:
        lgr.debug(
            "configuration for %s %s added because of autoenable,"
            " but no UUIDs for them yet known for dataset %s",
            # since we are only at debug level, we could call things their
            # proper names
            single_or_plural("special remote",
                             "special remotes", len(srs[True]), True),
            ", ".join(srs[True]),
            ds.path
        )

    if srs[False]:
        # if has no auto-enable special remotes
        lgr.info(
            'access to %s %s not auto-enabled, enable with:\n'
            '\t\tdatalad siblings -d "%s" enable -s %s',
            # but since humans might read it, we better confuse them with our
            # own terms!
            single_or_plural("dataset sibling",
                             "dataset siblings", len(srs[False]), True),
            ", ".join(srs[False]),
            ds.path,
            srs[False][0] if len(srs[False]) == 1 else "SIBLING",
        )

    # we have just cloned the repo, so it has 'origin', configure any
    # reachable origin of origins
    yield from configure_origins(ds, ds)
def _get_metadata(ds, types, global_meta=None, content_meta=None, paths=None):
    """Make a direct query of a dataset to extract its metadata.

    Parameters
    ----------
    ds : Dataset
    types : list
    """
    errored = False
    dsmeta = dict()
    contentmeta = {}

    if global_meta is not None and content_meta is not None and \
            not global_meta and not content_meta:
        # both are false and not just none
        return dsmeta, contentmeta, errored

    context = {
        '@vocab': 'http://docs.datalad.org/schema_v{}.json'.format(
            vocabulary_version)}

    fullpathlist = paths
    if paths and isinstance(ds.repo, AnnexRepo):
        # Ugly? Jep: #2055
        content_info = zip(paths, ds.repo.file_has_content(paths),
                           ds.repo.is_under_annex(paths))
        paths = [p for p, c, a in content_info if not a or c]
        nocontent = len(fullpathlist) - len(paths)
        if nocontent:
            # TODO better fail, or support incremental and label this file as
            # not present
            lgr.warning(
                '{} files have no content present, '
                'some extractors will not operate on {}'.format(
                    nocontent,
                    'them' if nocontent > 10
                    else [p for p, c, a in content_info if not c and a]))

    # pull out potential metadata field blacklist config settings
    blacklist = [
        re.compile(bl) for bl in ensure_list(
            ds.config.obtain('datalad.metadata.aggregate-ignore-fields',
                             default=[]))]
    # enforce size limits
    max_fieldsize = ds.config.obtain('datalad.metadata.maxfieldsize')
    # keep local, who knows what some extractors might pull in
    from pkg_resources import iter_entry_points  # delayed heavy import
    extractors = {
        ep.name: ep
        for ep in iter_entry_points('datalad.metadata.extractors')}

    # we said that we want to fail, rather than just moan about less metadata
    # Do an early check if all extractors are available so not to wait hours
    # and then crash for some obvious reason
    absent_extractors = [t for t in types if t not in extractors]
    if absent_extractors:
        raise ValueError(
            '%d enabled metadata extractor%s not available in this installation'
            ': %s' %
            (len(absent_extractors),
             single_or_plural(" is", "s are", len(absent_extractors)),
             ', '.join(absent_extractors)))

    log_progress(
        lgr.info,
        'metadataextractors',
        'Start metadata extraction from %s', ds,
        total=len(types),
        label='Metadata extraction',
        unit=' extractors',
    )
    for mtype in types:
        mtype_key = mtype
        log_progress(
            lgr.info,
            'metadataextractors',
            'Engage %s metadata extractor', mtype_key,
            update=1, increment=True)
        try:
            extractor_cls = extractors[mtype_key].load()
            extractor = extractor_cls(
                ds,
                paths=paths if extractor_cls.NEEDS_CONTENT else fullpathlist)
        except Exception as e:
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s', mtype_key, ds,
            )
            raise ValueError(
                "Failed to load metadata extractor for '%s', "
                "broken dataset configuration (%s)?: %s" %
                (mtype, ds, exc_str(e)))
        try:
            dsmeta_t, contentmeta_t = extractor.get_metadata(
                dataset=global_meta
                if global_meta is not None
                else ds.config.obtain(
                    'datalad.metadata.aggregate-dataset-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()),
                content=content_meta
                if content_meta is not None
                else ds.config.obtain(
                    'datalad.metadata.aggregate-content-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()))
        except Exception as e:
            lgr.error('Failed to get dataset metadata ({}): {}'.format(
                mtype, exc_str(e)))
            if cfg.get('datalad.runtime.raiseonerror'):
                log_progress(
                    lgr.error,
                    'metadataextractors',
                    'Failed %s metadata extraction from %s', mtype_key, ds,
                )
                raise
            errored = True
            # if we don't get global metadata we do not want content metadata
            continue

        if dsmeta_t:
            if _ok_metadata(dsmeta_t, mtype, ds, None):
                dsmeta_t = _filter_metadata_fields(
                    dsmeta_t,
                    maxsize=max_fieldsize,
                    blacklist=blacklist)
                dsmeta[mtype_key] = dsmeta_t
            else:
                errored = True

        unique_cm = {}
        extractor_unique_exclude = getattr(extractor_cls, "_unique_exclude", set())
        # TODO: ATM neuroimaging extractors all provide their own internal
        # log_progress but if they are all generators, we could provide generic
        # handling of the progress here. Note also that the log message
        # actually seems to be ignored and not used, only the label ;-)
        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Metadata extraction per location for %s', mtype,
        #     # contentmeta_t is a generator... so no count is known
        #     # total=len(contentmeta_t or []),
        #     label='Metadata extraction per location',
        #     unit=' locations',
        # )
        for loc, meta in contentmeta_t or {}:
            lgr.log(5, "Analyzing metadata for %s", loc)
            # log_progress(
            #     lgr.debug,
            #     'metadataextractors_loc',
            #     'ignoredatm',
            #     label=loc,
            #     update=1,
            #     increment=True)
            if not _ok_metadata(meta, mtype, ds, loc):
                errored = True
                # log_progress(
                #     lgr.debug,
                #     'metadataextractors_loc',
                #     'ignoredatm',
                #     label='Failed for %s' % loc,
                # )
                continue
            # we also want to store info that there was no metadata (e.g. to
            # get a list of files that have no metadata)
            # if there is an issue that an extractor needlessly produces empty
            # records, the extractor should be fixed and not a general switch.
            # For example the datalad_core issues empty records to document the
            # presence of a file
            #elif not meta:
            #    continue

            # apply filters
            meta = _filter_metadata_fields(
                meta,
                maxsize=max_fieldsize,
                blacklist=blacklist)

            if not meta:
                continue

            # assign
            # only ask each metadata extractor once, hence no conflict possible
            loc_dict = contentmeta.get(loc, {})
            loc_dict[mtype_key] = meta
            contentmeta[loc] = loc_dict

            if ds.config.obtain(
                    'datalad.metadata.generate-unique-{}'.format(
                        mtype_key.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()):
                # go through content metadata and inject report of unique keys
                # and values into `dsmeta`
                for k, v in meta.items():
                    if k in dsmeta.get(mtype_key, {}):
                        # if the dataset already has a dedicated idea
                        # about a key, we skip it from the unique list
                        # the point of the list is to make missing info about
                        # content known in the dataset, not to blindly
                        # duplicate metadata. Example: list of samples data
                        # were recorded from. If the dataset has such under
                        # a 'sample' key, we should prefer that, over an
                        # aggregated list of a hopefully-kinda-ok structure
                        continue
                    elif k in extractor_unique_exclude:
                        # the extractor thinks this key is worthless for the
                        # purpose of discovering whole datasets
                        # we keep the key (so we know that some file is
                        # providing this key), but ignore any value it came
                        # with
                        unique_cm[k] = None
                        continue
                    vset = unique_cm.get(k, set())
                    vset.add(_val2hashable(v))
                    unique_cm[k] = vset

        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Finished metadata extraction across locations for %s', mtype)

        if unique_cm:
            # per source storage here too
            ucp = dsmeta.get('datalad_unique_content_properties', {})
            # important: we want to have a stable order regarding
            # the unique values (a list). we cannot guarantee the
            # same order of discovery, hence even when not using a
            # set above we would still need sorting. the challenge
            # is that any value can be an arbitrarily complex nested
            # beast
            # we also want to have each unique value set always come
            # in a top-level list, so we know if some unique value
            # was a list, as opposed to a list of unique values

            def _ensure_serializable(val):
                if isinstance(val, ReadOnlyDict):
                    return {k: _ensure_serializable(v) for k, v in val.items()}
                if isinstance(val, (tuple, list)):
                    return [_ensure_serializable(v) for v in val]
                else:
                    return val

            ucp[mtype_key] = {
                k: [_ensure_serializable(i)
                    for i in sorted(v, key=_unique_value_key)]
                if v is not None else None
                for k, v in unique_cm.items()
                # v == None (disable unique, but there was a value at some
                # point), otherwise we only want actual values, and also no
                # single-item-lists of a non-value
                # those contribute no information, but bloat the operation
                # (inflated number of keys, inflated storage, inflated search
                # index, ...)
                if v is None or (v and not v == {''})}
            dsmeta['datalad_unique_content_properties'] = ucp

    log_progress(
        lgr.info,
        'metadataextractors',
        'Finished metadata extraction from %s', ds,
    )

    # always identify the effective vocabulary - JSON-LD style
    if context:
        dsmeta['@context'] = context

    return dsmeta, contentmeta, errored
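A simplified sketch of the unique-value reporting above, with invented keys and values; the real code additionally applies _ensure_serializable and sorts with _unique_value_key:

unique_cm = {'species': {'human', 'mouse'},   # values collected across files
             'checksum': None,                # key excluded by the extractor
             'empty': {''}}                   # single non-value, dropped
ucp = {k: sorted(v) if v is not None else None
       for k, v in unique_cm.items()
       if v is None or (v and not v == {''})}
print(ucp)  # {'species': ['human', 'mouse'], 'checksum': None}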