def __init__(self, ds, force_reindex=False, **kwargs):
    super(_WhooshSearch, self).__init__(ds, **kwargs)

    self.idx_obj = None
    # where does the bunny have the eggs?
    self.index_dir = opj(self.ds.path, get_git_dir(self.ds.path),
                         SEARCH_INDEX_DOTGITDIR)
    self._mk_search_index(force_reindex)
def __call__(dataset=None, what=None, recursive=False, recursion_limit=None):
    ds = require_dataset(dataset, purpose='clean-up')
    res_kwargs = dict(action='clean', logger=lgr, refds=ds.path)
    for ap in AnnotatePaths.__call__(
            dataset=ds.path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='clean',
            unavailable_path_status='impossible',
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            yield ap
            continue
        if ap.get('type', None) != 'dataset':
            ap.update(status='impossible',
                      message='only datasets can be cleaned')
            yield ap
            continue
        d = ap['path']
        gitdir = get_git_dir(d)
        for dirpath, flag, msg, sing_pl in [
            (ARCHIVES_TEMP_DIR, "cached-archives",
             "temporary archive", ("directory", "directories")),
            (ANNEX_TEMP_DIR, "annex-tmp",
             "temporary annex", ("file", "files")),
            (opj(gitdir, SEARCH_INDEX_DOTGITDIR), 'search-index',
             "metadata search index", ("file", "files")),
        ]:
            topdir = opj(d, dirpath)
            lgr.debug("Considering to clean %s:%s", d, dirpath)
            if not ((what is None) or (flag in what)):
                yield get_status_dict(
                    path=topdir, status='notneeded', type='directory',
                    **res_kwargs)
                continue
            paths = glob(opj(topdir, '*'))
            if not paths:
                yield get_status_dict(
                    path=topdir, status='notneeded', type='directory',
                    **res_kwargs)
                continue
            pl = len(paths) > 1
            message = ("Removed %d %s %s: %s",
                       len(paths), msg, sing_pl[int(pl)],
                       ", ".join(sorted([x[len(topdir) + 1:] for x in paths])))
            rmtree(topdir)
            yield get_status_dict(
                path=topdir, status='ok', type='dir', message=message,
                **res_kwargs)
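# Usage sketch for the clean command above (the helper name is hypothetical; assumes
# the datalad Python API is importable and `dataset` points inside a DataLad dataset).
# The 'what' flags mirror the table iterated in __call__: 'cached-archives',
# 'annex-tmp', and 'search-index'.
def _example_clean_search_index(dataset='.'):
    from datalad.api import clean
    # drop only the metadata search index; archives and annex tmp files are kept
    return list(clean(dataset=dataset, what=['search-index']))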
def test_get_git_dir(path):
    # minimal, only missing coverage
    assert_raises(RuntimeError, get_git_dir, path)
    srcpath = opj(path, 'src')
    targetpath = opj(path, 'target')
    targetgitpath = opj(targetpath, '.git')
    os.makedirs(srcpath)
    os.makedirs(targetpath)
    if not on_windows:  # with PY3 would also work with Windows 6+
        os.symlink(srcpath, targetgitpath)
        eq_(srcpath, get_git_dir(targetpath))
        # cleanup for following test
        unlink(targetgitpath)
    with open(targetgitpath, 'w') as f:
        f.write('gitdir: {}'.format(srcpath))
    eq_(srcpath, get_git_dir(targetpath))
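# For orientation, a minimal sketch of the behavior the test above exercises. This is
# an illustrative reimplementation, not DataLad's actual get_git_dir(): it resolves a
# repository's git directory whether '.git' is a directory, a symlink to one, or a
# plain file carrying a 'gitdir: <path>' pointer, and raises RuntimeError otherwise.
def _sketch_get_git_dir(path):
    import os
    import os.path as op
    dot_git = op.join(path, '.git')
    if op.islink(dot_git):
        # symlinked .git -- report the link target (as the test expects)
        return os.readlink(dot_git)
    if op.isdir(dot_git):
        # regular repository; returning the path itself is an assumption here
        return dot_git
    if op.isfile(dot_git):
        # submodule/worktree style: a plain file with a 'gitdir: <path>' pointer
        with open(dot_git) as f:
            line = f.readline().rstrip()
        if line.startswith('gitdir: '):
            return line[len('gitdir: '):]
    raise RuntimeError("cannot determine git directory for %s" % path)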
def get_dataset_directories(top, ignore_datalad=True):
    """Return a list of directories in the same dataset under a given path

    Parameters
    ----------
    top : path
      Top-level path
    ignore_datalad : bool
      Whether to exclude the '.datalad' directory of a dataset and its content
      from the results.

    Returns
    -------
    list
      List of directories matching the top-level path, regardless of whether
      these directories are known to Git (i.e. contain tracked files). The
      list does not include the top-level path itself, but it does include
      any subdataset mount points (regardless of whether the particular
      subdatasets are installed or not).
    """
    def func(arg, top, names):
        refpath, ignore, dirs = arg
        legit_names = []
        for n in names:
            path = opj(top, n)
            if not isdir(path) or path in ignore:
                pass
            elif path != refpath and GitRepo.is_valid_repo(path):
                # mount point, keep but don't dive into
                dirs.append(path)
            else:
                legit_names.append(n)
                dirs.append(path)
        names[:] = legit_names

    # collects the directories
    refpath = get_dataset_root(top)
    if not refpath:
        raise ValueError("`top` path {} is not in a dataset".format(top))
    ignore = [opj(refpath, get_git_dir(refpath))]
    if ignore_datalad:
        ignore.append(opj(refpath, '.datalad'))
    d = []
    walk(top, func, (refpath, ignore, d))
    return d
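# Usage sketch (hypothetical path and helper name; assumes '/tmp/ds' lies inside an
# installed DataLad dataset). Per the docstring, the result excludes the git dir and,
# by default, '.datalad', but includes subdataset mount points even when those
# subdatasets are not installed.
def _example_list_dataset_dirs(top='/tmp/ds'):
    dirs = get_dataset_directories(top)
    with_datalad = get_dataset_directories(top, ignore_datalad=False)
    return sorted(dirs), sorted(with_datalad)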
def __call__(match,
             dataset=None,
             search=None,
             report=None,
             report_matched=False,
             format='custom',
             regex=False):
    """
    Yields
    ------
    location : str
        (relative) path to the dataset
    report : dict
        fields which were requested by `report` option
    """
    lgr.debug("Initiating search for match=%r and dataset %r",
              match, dataset)
    try:
        ds = require_dataset(dataset, check_installed=True, purpose='dataset search')
        if ds.id is None:
            raise NoDatasetArgumentFound(
                "This does not seem to be a dataset (no DataLad dataset ID "
                "found). 'datalad create --force %s' can initialize "
                "this repository as a DataLad dataset" % ds.path)
    except NoDatasetArgumentFound:
        exc_info = sys.exc_info()
        if dataset is None:
            if not ui.is_interactive:
                raise NoDatasetArgumentFound(
                    "No DataLad dataset found. Specify a dataset to be "
                    "searched, or run interactively to get assistance "
                    "installing a queryable superdataset."
                )
            # none was provided, so we could ask whether the user wants to
            # install our beautiful mega-duper-super-dataset
            # TODO: following logic could possibly benefit other actions.
            if os.path.exists(LOCAL_CENTRAL_PATH):
                central_ds = Dataset(LOCAL_CENTRAL_PATH)
                if central_ds.is_installed():
                    if ui.yesno(
                            title="No DataLad dataset found at current location",
                            text="Would you like to search the DataLad "
                                 "superdataset at %r?" % LOCAL_CENTRAL_PATH):
                        pass
                    else:
                        reraise(*exc_info)
                else:
                    raise NoDatasetArgumentFound(
                        "No DataLad dataset found at current location. "
                        "The DataLad superdataset location %r exists, "
                        "but does not contain a dataset." % LOCAL_CENTRAL_PATH)
            elif ui.yesno(
                    title="No DataLad dataset found at current location",
                    text="Would you like to install the DataLad "
                         "superdataset at %r?" % LOCAL_CENTRAL_PATH):
                from datalad.api import install
                central_ds = install(LOCAL_CENTRAL_PATH, source='///')
                ui.message(
                    "From now on you can refer to this dataset using the "
                    "label '///'"
                )
            else:
                reraise(*exc_info)

            lgr.info(
                "Performing search using DataLad superdataset %r",
                central_ds.path
            )
            for res in central_ds.search(
                    match,
                    search=search,
                    report=report,
                    report_matched=report_matched,
                    format=format,
                    regex=regex):
                yield res
            return
        else:
            raise

    cache_dir = opj(opj(ds.path, get_git_dir(ds.path)), 'datalad', 'cache')
    mcache_fname = opj(cache_dir, 'metadata.p%d' % pickle.HIGHEST_PROTOCOL)

    meta = None
    if os.path.exists(mcache_fname):
        lgr.debug("use cached metadata of '{}' from {}".format(ds, mcache_fname))
        meta, checksum = pickle.load(open(mcache_fname, 'rb'))
        # TODO add more sophisticated tests to decide when the cache is no
        # longer valid
        if checksum != ds.repo.get_hexsha():
            # errrr, try again below
            meta = None

    # don't put in 'else', as yet to be written tests above might fail and
    # require regenerating meta data
    if meta is None:
        lgr.info("Loading and caching local meta-data... might take a few seconds")
        if not exists(cache_dir):
            os.makedirs(cache_dir)

        meta = get_metadata(ds, guess_type=False, ignore_subdatasets=False,
                            ignore_cache=False)
        # merge all info on datasets into a single dict per dataset
        meta = flatten_metadata_graph(meta)
        # extract graph, if any
        meta = meta.get('@graph', meta)
        # build simple queryable representation
        if not isinstance(meta, list):
            meta = [meta]

        # sort entries by location (if present)
        sort_keys = ('location', 'description', 'id')
        # note: with str() instead of '%' we would be getting encoding issues...
        meta = sorted(meta, key=lambda m: tuple("%s" % (m.get(x, ""),)
                                                for x in sort_keys))

        # use pickle to store the optimized graph in the cache
        pickle.dump(
            # graph plus checksum from what it was built
            (meta, ds.repo.get_hexsha()),
            open(mcache_fname, 'wb'))
        lgr.debug("cached meta data graph of '{}' in {}".format(ds, mcache_fname))

    if report in ('', ['']):
        report = []
    elif report and not isinstance(report, list):
        report = [report]

    match = assure_list(match)
    search = assure_list(search)
    # convert all to lower case for case insensitive matching
    search = {x.lower() for x in search}

    def get_in_matcher(m):
        """Function generator to provide closure for a specific value of m"""
        mlower = m.lower()

        def matcher(s):
            return mlower in s.lower()
        return matcher

    matchers = [
        re.compile(match_).search
        if regex
        else get_in_matcher(match_)
        for match_ in match
    ]
    # location should be reported relative to current location
    # we will assume that no one chpwd's while we are yielding
    ds_path_prefix = get_path_prefix(ds.path)

    # so we could provide a useful message whenever there was not a single
    # dataset with the specified `--search` properties
    observed_properties = set()

    # for every meta data set
    for mds in meta:
        hit = False
        hits = [False] * len(matchers)
        matched_fields = set()
        if not mds.get('type', mds.get('schema:type', None)) == 'Dataset':
            # we are presently only dealing with datasets
            continue
        # TODO consider the possibility of nested and context/graph dicts
        # but so far we were trying to build simple lists of dicts, as much
        # as possible
        if not isinstance(mds, dict):
            raise NotImplementedError("nested meta data is not yet supported")

        # manual loop for now
        for k, v in iteritems(mds):
            if search:
                k_lower = k.lower()
                if k_lower not in search:
                    if observed_properties is not None:
                        # record for providing a hint later
                        observed_properties.add(k_lower)
                    continue
                # so we have a hit, no need to track
                observed_properties = None
            if isinstance(v, dict) or isinstance(v, list):
                v = text_type(v)
            for imatcher, matcher in enumerate(matchers):
                if matcher(v):
                    hits[imatcher] = True
                    matched_fields.add(k)
            if all(hits):
                hit = True
                # no need to do it longer than necessary
                if not report_matched:
                    break

        if hit:
            location = mds.get('location', '.')
            report_ = matched_fields.union(report if report else {}) \
                if report_matched else report
            if report_ == ['*']:
                report_dict = mds
            elif report_:
                report_dict = {k: mds[k] for k in report_ if k in mds}
                if report_ and not report_dict:
                    lgr.debug(
                        'meta data match for %s, but no to-be-reported '
                        'properties (%s) found. Present properties: %s',
                        location, ", ".join(report_), ", ".join(sorted(mds))
                    )
            else:
                report_dict = {}  # it was empty but not None -- asked to
                                  # not report any specific field
            if isinstance(location, (list, tuple)):
                # could be that the same dataset is installed into multiple
                # locations. For now report them separately
                for l in location:
                    yield opj(ds_path_prefix, l), report_dict
            else:
                yield opj(ds_path_prefix, location), report_dict

    if search and observed_properties is not None:
        import difflib
        suggestions = {
            s: difflib.get_close_matches(s, observed_properties)
            for s in search
        }
        suggestions_str = "\n ".join(
            "%s for %s" % (", ".join(choices), s)
            for s, choices in iteritems(suggestions) if choices
        )
        lgr.warning(
            "Found no properties which matched any of the ones you "
            "specified (%s). Maybe you meant one of: %s.\n"
            "Suggestions:\n"
            " %s",
            ", ".join(search),
            ", ".join(observed_properties),
            suggestions_str if suggestions_str.strip() else "none"
        )
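# Consumption sketch for the generator above (illustrative only; 'search_call' stands
# in for however __call__ ends up being bound, e.g. on the command class). Each
# yielded item is a (relative location, report dict) pair, as described in the
# docstring.
def _example_print_matches(search_call, match='neuro'):
    for location, report_dict in search_call(match, report=['location', 'description']):
        print(location, report_dict.get('description', ''))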
def __call__(query=None,
             dataset=None,
             force_reindex=False,
             max_nresults=20,
             show_keys=False,
             show_query=False):
    from whoosh import qparser as qparse

    try:
        ds = require_dataset(dataset, check_installed=True, purpose='dataset search')
        if ds.id is None:
            raise NoDatasetArgumentFound(
                "This does not seem to be a dataset (no DataLad dataset ID "
                "found). 'datalad create --force %s' can initialize "
                "this repository as a DataLad dataset" % ds.path)
    except NoDatasetArgumentFound:
        for r in _search_from_virgin_install(dataset, query):
            yield r
        return

    # where does the bunny have the eggs?
    index_dir = opj(ds.path, get_git_dir(ds.path), 'datalad', 'search_index')

    idx_obj = _get_search_index(index_dir, ds, force_reindex)

    if show_keys:
        definitions_fname = opj(index_dir, 'datalad_term_definitions.json.gz')
        try:
            defs = jsonload(gzopen(definitions_fname))
        except Exception as e:
            lgr.warning(
                'No term definitions found alongside search index: %s',
                exc_str(e))
            defs = {}

        for k in idx_obj.schema.names():
            print('{}{}'.format(
                k,
                ' {}'.format(
                    defs[k]
                    if isinstance(defs[k], dict)
                    else '({})'.format(defs[k]))
                if k in defs else ''))
        return

    if not query:
        return

    with idx_obj.searcher() as searcher:
        # parse the query string, default whoosh parser ATM, could be
        # tailored with plugins
        parser = qparse.MultifieldParser(
            idx_obj.schema.names(),
            idx_obj.schema)
        # XXX: plugin is broken in Debian's whoosh 2.7.0-2, but already fixed
        # upstream
        parser.add_plugin(qparse.FuzzyTermPlugin())
        parser.add_plugin(qparse.GtLtPlugin())
        # replace field definition to allow for colons to be part of a field's name:
        parser.replace_plugin(
            qparse.FieldsPlugin(expr=r"(?P<text>[()<>:\w]+|[*]):"))
        # for convenience we accept any number of args-words from the
        # shell and join them into a single string here
        querystr = ' '.join(assure_list(query))
        # this gives a formal whoosh query
        wquery = parser.parse(querystr)

        if show_query:
            print(wquery)
            return
        # perform the actual search
        hits = searcher.search(
            wquery,
            terms=True,
            limit=max_nresults if max_nresults > 0 else None)
        # cheap way to get an approximate number of hits, without an expensive
        # scoring of all items
        # disabled: unreliable estimate, often confusing
        # nhits = hits.estimated_min_length()
        # report query stats
        topstr = '{} top {}'.format(
            max_nresults,
            single_or_plural('match', 'matches', max_nresults))
        lgr.info('Query completed in {} sec.{}'.format(
            hits.runtime,
            ' Reporting {}.'.format(
                ('up to ' + topstr)
                if max_nresults > 0
                else 'all matches')
            if not hits.is_empty()
            else ' No matches.'))

        if not hits:
            return

        nhits = 0
        for hit in hits:
            res = dict(
                action='search',
                status='ok',
                logger=lgr,
                refds=ds.path,
                # normpath to avoid trailing dot
                path=normpath(opj(ds.path, hit['path'])),
                query_matched={assure_unicode(k):
                               assure_unicode(v)
                               if isinstance(v, unicode_srctypes) else v
                               for k, v in hit.matched_terms()},
                metadata={k: v for k, v in hit.fields().items()
                          if k not in ('path', 'parentds')})
            if 'parentds' in hit:
                res['parentds'] = normpath(opj(ds.path, hit['parentds']))
            yield res
            nhits += 1

        if max_nresults and nhits == max_nresults:
            lgr.info(
                "Reached the limit of {}, there could be more which "
                "were not reported.".format(topstr))
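# A standalone whoosh sketch mirroring the parser setup above (assumes the 'whoosh'
# package is installed; schema, document, and helper name are made up for
# illustration). It shows what the Fuzzy ('foo~2') and GtLt ('size:>10') plugins add
# to the query syntax.
def _example_whoosh_query(querystr='name:foo~2 size:>10'):
    from whoosh import fields, qparser
    from whoosh.filedb.filestore import RamStorage

    schema = fields.Schema(name=fields.TEXT(stored=True),
                           size=fields.NUMERIC(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(name=u'food', size=42)
    parser = qparser.MultifieldParser(schema.names(), schema)
    parser.add_plugin(qparser.FuzzyTermPlugin())  # enables 'foo~2' fuzzy terms
    parser.add_plugin(qparser.GtLtPlugin())       # enables 'size:>10' comparisons
    with ix.searcher() as searcher:
        # return the stored fields of every matching document
        return [hit.fields() for hit in searcher.search(parser.parse(querystr))]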
def __call__(match,
             dataset=None,
             search=None,
             report=None,
             report_matched=False,
             format='custom',
             regex=False):
    lgr.debug("Initiating search for match=%r and dataset %r",
              match, dataset)
    try:
        ds = require_dataset(dataset, check_installed=True, purpose='dataset search')
        if ds.id is None:
            raise NoDatasetArgumentFound(
                "This does not seem to be a dataset (no DataLad dataset ID "
                "found). 'datalad create --force %s' can initialize "
                "this repository as a DataLad dataset" % ds.path)
    except NoDatasetArgumentFound:
        exc_info = sys.exc_info()
        if dataset is None:
            if not ui.is_interactive:
                raise NoDatasetArgumentFound(
                    "No DataLad dataset found. Specify a dataset to be "
                    "searched, or run interactively to get assistance "
                    "installing a queryable superdataset."
                )
            # none was provided, so we could ask whether the user wants to
            # install our beautiful mega-duper-super-dataset
            # TODO: following logic could possibly benefit other actions.
            if os.path.exists(LOCAL_CENTRAL_PATH):
                central_ds = Dataset(LOCAL_CENTRAL_PATH)
                if central_ds.is_installed():
                    if ui.yesno(
                            title="No DataLad dataset found at current location",
                            text="Would you like to search the DataLad "
                                 "superdataset at %r?" % LOCAL_CENTRAL_PATH):
                        pass
                    else:
                        reraise(*exc_info)
                else:
                    raise NoDatasetArgumentFound(
                        "No DataLad dataset found at current location. "
                        "The DataLad superdataset location %r exists, "
                        "but does not contain a dataset." % LOCAL_CENTRAL_PATH)
            elif ui.yesno(
                    title="No DataLad dataset found at current location",
                    text="Would you like to install the DataLad "
                         "superdataset at %r?" % LOCAL_CENTRAL_PATH):
                from datalad.api import install
                central_ds = install(LOCAL_CENTRAL_PATH, source='///')
                ui.message(
                    "From now on you can refer to this dataset using the "
                    "label '///'"
                )
            else:
                reraise(*exc_info)

            lgr.info(
                "Performing search using DataLad superdataset %r",
                central_ds.path
            )
            for res in central_ds.search(
                    match,
                    search=search,
                    report=report,
                    report_matched=report_matched,
                    format=format,
                    regex=regex):
                yield res
            return
        else:
            raise

    cache_dir = opj(opj(ds.path, get_git_dir(ds.path)), 'datalad', 'cache')
    mcache_fname = opj(cache_dir, 'metadata.p%d' % pickle.HIGHEST_PROTOCOL)

    meta = None
    if os.path.exists(mcache_fname):
        lgr.debug("use cached metadata of '{}' from {}".format(ds, mcache_fname))
        meta, checksum = pickle.load(open(mcache_fname, 'rb'))
        # TODO add more sophisticated tests to decide when the cache is no
        # longer valid
        if checksum != ds.repo.get_hexsha():
            # errrr, try again below
            meta = None

    # don't put in 'else', as yet to be written tests above might fail and
    # require regenerating meta data
    if meta is None:
        lgr.info("Loading and caching local meta-data... might take a few seconds")
        if not exists(cache_dir):
            os.makedirs(cache_dir)

        meta = get_metadata(ds, guess_type=False, ignore_subdatasets=False,
                            ignore_cache=False)
        # merge all info on datasets into a single dict per dataset
        meta = flatten_metadata_graph(meta)
        # extract graph, if any
        meta = meta.get('@graph', meta)
        # build simple queryable representation
        if not isinstance(meta, list):
            meta = [meta]

        # sort entries by location (if present)
        sort_keys = ('location', 'description', 'id')
        meta = sorted(meta, key=lambda m: tuple(m.get(x, "") for x in sort_keys))

        # use pickle to store the optimized graph in the cache
        pickle.dump(
            # graph plus checksum from what it was built
            (meta, ds.repo.get_hexsha()),
            open(mcache_fname, 'wb'))
        lgr.debug("cached meta data graph of '{}' in {}".format(ds, mcache_fname))

    if report in ('', ['']):
        report = []
    elif report and not isinstance(report, list):
        report = [report]

    match = assure_list(match)
    search = assure_list(search)
    # convert all to lower case for case insensitive matching
    search = {x.lower() for x in search}

    def get_in_matcher(m):
        """Function generator to provide closure for a specific value of m"""
        mlower = m.lower()

        def matcher(s):
            return mlower in s.lower()
        return matcher

    matchers = [
        re.compile(match_).search
        if regex
        else get_in_matcher(match_)
        for match_ in match
    ]
    # location should be reported relative to current location
    # we will assume that no one chpwd's while we are yielding
    ds_path_prefix = get_path_prefix(ds.path)

    # so we could provide a useful message whenever there was not a single
    # dataset with the specified `--search` properties
    observed_properties = set()

    # for every meta data set
    for mds in meta:
        hit = False
        hits = [False] * len(matchers)
        matched_fields = set()
        if not mds.get('type', mds.get('schema:type', None)) == 'Dataset':
            # we are presently only dealing with datasets
            continue
        # TODO consider the possibility of nested and context/graph dicts
        # but so far we were trying to build simple lists of dicts, as much
        # as possible
        if not isinstance(mds, dict):
            raise NotImplementedError("nested meta data is not yet supported")

        # manual loop for now
        for k, v in iteritems(mds):
            if search:
                k_lower = k.lower()
                if k_lower not in search:
                    if observed_properties is not None:
                        # record for providing a hint later
                        observed_properties.add(k_lower)
                    continue
                # so we have a hit, no need to track
                observed_properties = None
            if isinstance(v, dict) or isinstance(v, list):
                v = text_type(v)
            for imatcher, matcher in enumerate(matchers):
                if matcher(v):
                    hits[imatcher] = True
                    matched_fields.add(k)
            if all(hits):
                hit = True
                # no need to do it longer than necessary
                if not report_matched:
                    break

        if hit:
            location = mds.get('location', '.')
            report_ = matched_fields.union(report if report else {}) \
                if report_matched else report
            if report_ == ['*']:
                report_dict = mds
            elif report_:
                report_dict = {k: mds[k] for k in report_ if k in mds}
                if report_ and not report_dict:
                    lgr.debug(
                        'meta data match for %s, but no to-be-reported '
                        'properties (%s) found. Present properties: %s',
                        location, ", ".join(report_), ", ".join(sorted(mds))
                    )
            else:
                report_dict = {}  # it was empty but not None -- asked to
                                  # not report any specific field
            if isinstance(location, (list, tuple)):
                # could be that the same dataset is installed into multiple
                # locations. For now report them separately
                for l in location:
                    yield opj(ds_path_prefix, l), report_dict
            else:
                yield opj(ds_path_prefix, location), report_dict

    if search and observed_properties is not None:
        import difflib
        suggestions = {
            s: difflib.get_close_matches(s, observed_properties)
            for s in search
        }
        suggestions_str = "\n ".join(
            "%s for %s" % (", ".join(choices), s)
            for s, choices in iteritems(suggestions) if choices
        )
        lgr.warning(
            "Found no properties which matched any of the ones you "
            "specified (%s). Maybe you meant one of: %s.\n"
            "Suggestions:\n"
            " %s",
            ", ".join(search),
            ", ".join(observed_properties),
            suggestions_str if suggestions_str.strip() else "none"
        )
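# For reference, a tiny standalone example of the difflib-based hinting used above:
# get_close_matches() proposes observed property names similar to a mistyped
# --search key (the values here are made up).
def _example_suggest(key='autor', observed=('author', 'name', 'license')):
    import difflib
    # returns ['author'] for this input
    return difflib.get_close_matches(key, observed)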