def __call__(self, query, max_nresults=None, force_reindex=False):
    with self.idx_obj.searcher() as searcher:
        wquery = self.get_query(query)

        # perform the actual search
        hits = searcher.search(
            wquery,
            terms=True,
            limit=max_nresults if max_nresults > 0 else None)
        # report query stats
        topstr = '{} top {}'.format(
            max_nresults,
            single_or_plural('match', 'matches', max_nresults)
        )
        lgr.info('Query completed in {} sec.{}'.format(
            hits.runtime,
            ' Reporting {}.'.format(
                ('up to ' + topstr)
                if max_nresults > 0
                else 'all matches'
            )
            if not hits.is_empty()
            else ' No matches.'
        ))

        if not hits:
            return

        nhits = 0
        # annotate hits for full metadata report
        hits = [dict(
            path=normpath(opj(self.ds.path, hit['path'])),
            query_matched={assure_unicode(k):
                           assure_unicode(v) if isinstance(v, unicode_srctypes) else v
                           for k, v in hit.matched_terms()},
            parentds=normpath(
                opj(self.ds.path, hit['parentds'])) if 'parentds' in hit else None,
            type=hit.get('type', None))
            for hit in hits]

        for res in query_aggregated_metadata(
                # type is taken from hit record
                reporton=None,
                ds=self.ds,
                aps=hits,
                # never recursive, we have direct hits already
                recursive=False):
            res.update(
                refds=self.ds.path,
                action='search',
                status='ok',
                logger=lgr,
            )
            yield res
            nhits += 1

        if max_nresults and nhits == max_nresults:
            lgr.info(
                "Reached the limit of {}, there could be more which "
                "were not reported.".format(topstr)
            )
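# Illustrative sketch (not part of the original module): the same whoosh
# round-trip used above -- build a schema, add a document, then search with
# terms=True so matched_terms() is available on each hit. All field names,
# values, and the temporary index location are made up for the example.
from whoosh import fields, index
from whoosh.qparser import QueryParser
import tempfile

schema = fields.Schema(path=fields.ID(stored=True), name=fields.TEXT())
ix = index.create_in(tempfile.mkdtemp(), schema)
w = ix.writer()
w.add_document(path=u'sub-01/anat.nii.gz', name=u'structural scan')
w.commit(optimize=True)

with ix.searcher() as searcher:
    wquery = QueryParser("name", schema).parse(u'structural')
    hits = searcher.search(wquery, terms=True, limit=None)
    for hit in hits:
        # matched_terms() yields (fieldname, term) tuples; terms may be bytes,
        # which is why the code above normalizes them with assure_unicode()
        print(hit['path'], hit.matched_terms())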
def __call__(self, query, max_nresults=None, consider_ucn=False, full_record=True):
    query_re = re.compile(self.get_query(query))

    nhits = 0
    for res in query_aggregated_metadata(
            reporton=self.documenttype,
            ds=self.ds,
            aps=[dict(path=self.ds.path, type='dataset')],
            # MIH: I cannot see a case when we would not want recursion (within
            # the metadata)
            recursive=True):
        # this assumes that files are reported after each dataset report,
        # and after a subsequent dataset report no files for the previous
        # dataset will be reported again
        meta = res.get('metadata', {})
        # produce a flattened metadata dict to search through
        doc = _meta2autofield_dict(meta, val2str=True, consider_ucn=consider_ucn)
        # use search instead of match to not just get hits at the start of the string
        # this will be slower, but avoids having to use actual regex syntax at the user
        # side even for simple queries
        # DOTALL is needed to handle multiline description fields and such, and still
        # be able to match content coming for a later field
        lgr.log(7, "Querying %s among %d items", query_re, len(doc))
        t0 = time()
        matches = {
            k: query_re.search(v.lower())
            for k, v in iteritems(doc)
        }
        dt = time() - t0
        lgr.log(7, "Finished querying in %f sec", dt)
        # retain what actually matched
        matches = {
            k: match.group()
            for k, match in matches.items()
            if match
        }
        if matches:
            hit = dict(
                res,
                action='search',
                query_matched=matches,
            )
            yield hit
            nhits += 1
            if max_nresults and nhits == max_nresults:
                # report query stats
                topstr = '{} top {}'.format(
                    max_nresults,
                    single_or_plural('match', 'matches', max_nresults))
                lgr.info(
                    "Reached the limit of {}, there could be more which "
                    "were not reported.".format(topstr))
                break
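# Illustrative sketch (not part of the original module): the regex matching
# step above applied to a hand-made flattened document instead of the output
# of _meta2autofield_dict; all keys and values here are made up.
import re

doc = {
    'bids.subject.sex': 'female',
    'datacite.description': 'Longitudinal MRI study\nof healthy adults',
}
query_re = re.compile('mri study.*adults', flags=re.DOTALL)

# search() each lowercased value (not match(), so hits can occur anywhere in
# the string), then retain only the fields that actually matched
query_matched = {
    k: m.group()
    for k, m in ((k, query_re.search(v.lower())) for k, v in doc.items())
    if m
}
print(query_matched)  # {'datacite.description': 'mri study\nof healthy adults'}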
def _mk_schema(self, dsinfo):
    from whoosh import fields as wf
    from whoosh.analysis import SimpleAnalyzer

    # haven for terms that have been found to be undefined
    # (for faster decision-making upon next encounter)
    # this will harvest all discovered term definitions
    definitions = {
        '@id': 'unique identifier of an entity',
        # TODO make proper JSON-LD definition
        'path': 'path name of an entity relative to the searched base dataset',
        # TODO make proper JSON-LD definition
        'parentds': 'path of the datasets that contains an entity',
        # 'type' will not come from a metadata field, hence will not be detected
        'type': 'type of a record',
    }

    schema_fields = {
        n.lstrip('@'): wf.ID(stored=True, unique=n == '@id')
        for n in definitions}

    lgr.debug('Scanning for metadata keys')
    # quick 1st pass over all datasets to gather the needed schema fields
    log_progress(
        lgr.info,
        'idxschemabuild',
        'Start building search schema',
        total=len(dsinfo),
        label='Building search schema',
        unit=' Datasets',
    )
    for res in query_aggregated_metadata(
            # XXX TODO After #2156 datasets may not necessarily carry all
            # keys in the "unique" summary
            reporton='datasets',
            ds=self.ds,
            aps=[dict(path=self.ds.path, type='dataset')],
            recursive=True):
        meta = res.get('metadata', {})
        # no stringification of values for speed, we do not need/use the
        # actual values at this point, only the keys
        idxd = _meta2autofield_dict(meta, val2str=False)

        for k in idxd:
            schema_fields[k] = wf.TEXT(stored=False,
                                       analyzer=SimpleAnalyzer())
        log_progress(lgr.info, 'idxschemabuild',
                     'Scanned dataset at %s', res['path'],
                     update=1, increment=True)
    log_progress(lgr.info, 'idxschemabuild', 'Done building search schema')

    self.schema = wf.Schema(**schema_fields)
def _get_keys(self, mode=None):
    """Return keys and their statistics if mode != 'name'."""
    class key_stat:
        def __init__(self):
            self.ndatasets = 0  # how many datasets have this field
            self.uvals = set()

    from collections import defaultdict
    keys = defaultdict(key_stat)

    for res in query_aggregated_metadata(
            # XXX TODO After #2156 datasets may not necessarily carry all
            # keys in the "unique" summary
            reporton='datasets',
            ds=self.ds,
            aps=[dict(path=self.ds.path, type='dataset')],
            recursive=True):
        meta = res.get('metadata', {})
        # inject a few basic properties into the dict
        # analog to what the other modes do in their index
        meta.update({
            k: res.get(k, None) for k in ('@id', 'type', 'path', 'parentds')
            # parentds is tricky all files will have it, but the dataset
            # queried above might not (single dataset), let's force it in
            if k == 'parentds' or k in res})
        # no stringification of values for speed
        idxd = _meta2autofield_dict(meta, val2str=False)

        for k, kvals in idxd.items():
            # TODO deal with conflicting definitions when available
            keys[k].ndatasets += 1
            if mode == 'name':
                continue
            try:
                kvals_set = assure_iter(kvals, set)
            except TypeError:
                # TODO: may be do show hashable ones???
                nunhashable = sum(
                    isinstance(x, collections.Hashable) for x in kvals)
                kvals_set = {
                    'unhashable %d out of %d entries'
                    % (nunhashable, len(kvals))}
            keys[k].uvals |= kvals_set
    return keys
def show_keys(self):
    # use a dict already, later we need to map to a definition
    keys = {}
    for res in query_aggregated_metadata(
            # XXX TODO After #2156 datasets may not necessarily carry all
            # keys in the "unique" summary
            reporton='datasets',
            ds=self.ds,
            aps=[dict(path=self.ds.path, type='dataset')],
            recursive=True):
        meta = res.get('metadata', {})
        # no stringification of values for speed
        idxd = _meta2autofield_dict(meta, val2str=False)

        for k in idxd:
            # TODO deal with conflicting definitions when available
            keys[k] = None
    for k in sorted(keys):
        print(k)
def test_aggregation(path=None):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything aggregated we would get nothing and only a log warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
    assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  scope='branch')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     scope='branch')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                        scope='branch')
    ds.save(recursive=True)
    assert_repo_status(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get success report for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_in_results(res, action='save', status="ok")
    # nice and tidy
    assert_repo_status(ds.path)

    # quick test of aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also report layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # Now that we have annex.key
    # three different IDs
    assert_equal(3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] \
                 == ensure_unicode(name)
                 for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone, should make no difference
    assert_status('ok', clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])
def show_keys(self, mode=None):
    maxl = 100  # maximal line length for unique values in mode=short
    # use a dict already, later we need to map to a definition
    # meanwhile map to the values
    class key_stat:
        def __init__(self):
            self.ndatasets = 0  # how many datasets have this field
            self.uvals = set()

    from collections import defaultdict
    keys = defaultdict(key_stat)

    for res in query_aggregated_metadata(
            # XXX TODO After #2156 datasets may not necessarily carry all
            # keys in the "unique" summary
            reporton='datasets',
            ds=self.ds,
            aps=[dict(path=self.ds.path, type='dataset')],
            recursive=True):
        meta = res.get('metadata', {})
        # inject a few basic properties into the dict
        # analog to what the other modes do in their index
        meta.update({
            k: res.get(k, None) for k in ('@id', 'type', 'path', 'parentds')
            # parentds is tricky all files will have it, but the dataset
            # queried above might not (single dataset), let's force it in
            if k == 'parentds' or k in res
        })
        # no stringification of values for speed
        idxd = _meta2autofield_dict(meta, val2str=False)

        for k, kvals in iteritems(idxd):
            # TODO deal with conflicting definitions when available
            keys[k].ndatasets += 1
            if mode == 'name':
                continue
            try:
                kvals_set = assure_iter(kvals, set)
            except TypeError:
                # TODO: may be do show hashable ones???
                nunhashable = sum(
                    isinstance(x, collections.Hashable) for x in kvals)
                kvals_set = {
                    'unhashable %d out of %d entries'
                    % (nunhashable, len(kvals))}
            keys[k].uvals |= kvals_set

    for k in sorted(keys):
        if mode == 'name':
            print(k)
            continue

        # do a bit more
        stat = keys[k]
        uvals = stat.uvals
        if mode == 'short':
            # show only up to X uvals
            if len(stat.uvals) > 10:
                uvals = {v for i, v in enumerate(uvals) if i < 10}
        # all unicode still scares yoh -- he will just use repr
        # def conv(s):
        #     try:
        #         return '{}'.format(s)
        #     except UnicodeEncodeError:
        #         return assure_unicode(s).encode('utf-8')
        stat.uvals_str = assure_unicode(
            "{} unique values: {}".format(
                len(stat.uvals), ', '.join(map(repr, uvals))))
        if mode == 'short':
            if len(stat.uvals) > 10:
                stat.uvals_str += ', ...'
            if len(stat.uvals_str) > maxl:
                stat.uvals_str = stat.uvals_str[:maxl - 4] + ' ....'
        elif mode == 'full':
            pass
        else:
            raise ValueError(
                "Unknown mode {!r}. Known modes are 'full' and 'short'".format(mode))

        print(
            '{k}\n in {stat.ndatasets} datasets\n has {stat.uvals_str}'.format(
                k=k, stat=stat))
def __call__(self, query, max_nresults=None, consider_ucn=False, full_record=True):
    if max_nresults is None:
        # no limit by default
        max_nresults = 0
    query = self.get_query(query)

    nhits = 0
    for res in query_aggregated_metadata(
            reporton=self.documenttype,
            ds=self.ds,
            aps=[dict(path=self.ds.path, type='dataset')],
            # MIH: I cannot see a case when we would not want recursion (within
            # the metadata)
            recursive=True):
        # this assumes that files are reported after each dataset report,
        # and after a subsequent dataset report no files for the previous
        # dataset will be reported again
        meta = res.get('metadata', {})
        # produce a flattened metadata dict to search through
        doc = _meta2autofield_dict(meta, val2str=True, consider_ucn=consider_ucn)
        # inject a few basic properties into the dict
        # analog to what the other modes do in their index
        doc.update({
            k: res[k] for k in ('@id', 'type', 'path', 'parentds')
            if k in res})
        # use search instead of match to not just get hits at the start of the string
        # this will be slower, but avoids having to use actual regex syntax at the user
        # side even for simple queries
        # DOTALL is needed to handle multiline description fields and such, and still
        # be able to match content coming for a later field
        lgr.log(7, "Querying %s among %d items", query, len(doc))
        t0 = time()
        matches = {
            (q['query'] if isinstance(q, dict) else q, k):
            q['query'].search(v) if isinstance(q, dict) else q.search(v)
            for k, v in iteritems(doc)
            for q in query
            if not isinstance(q, dict) or q['field'].match(k)}
        dt = time() - t0
        lgr.log(7, "Finished querying in %f sec", dt)
        # retain what actually matched
        matched = {
            k[1]: match.group()
            for k, match in matches.items()
            if match
        }
        # implement AND behavior across query expressions, but OR behavior
        # across queries matching multiple fields for a single query expression
        # for multiple queries, this makes it consistent with a query that
        # has no field specification
        if matched and len(query) == len(set(k[0] for k in matches if matches[k])):
            hit = dict(
                res,
                action='search',
                query_matched=matched,
            )
            yield hit
            nhits += 1
            if max_nresults and nhits == max_nresults:
                # report query stats
                topstr = '{} top {}'.format(
                    max_nresults,
                    single_or_plural('match', 'matches', max_nresults))
                lgr.info(
                    "Reached the limit of {}, there could be more which "
                    "were not reported.".format(topstr))
                break
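# Illustrative sketch (not part of the original module): the AND/OR semantics
# above with two hypothetical parsed query expressions -- one unconstrained
# regex and one limited to a 'type' field. Document and patterns are made up.
import re

doc = {'type': 'dataset', 'name': 'studyforrest phase2'}
query = [
    re.compile('forrest'),                                           # any field
    dict(field=re.compile('^type$'), query=re.compile('dataset')),   # 'type' only
]

matches = {
    (q['query'] if isinstance(q, dict) else q, k):
        (q['query'] if isinstance(q, dict) else q).search(v)
    for k, v in doc.items()
    for q in query
    if not isinstance(q, dict) or q['field'].match(k)
}
matched = {k[1]: m.group() for k, m in matches.items() if m}
# a hit requires every query expression to have matched at least one field
is_hit = bool(matched) and len(query) == len({k[0] for k, m in matches.items() if m})
print(matched, is_hit)  # {'type': 'dataset', 'name': 'forrest'} True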
def _mk_search_index(self, force_reindex):
    """Generic entrypoint to index generation

    The actual work that determines the structure and content of the index
    is done by functions that are passed in as arguments

    `meta2doc` - must return dict for index document from result input
    """
    from whoosh import index as widx
    from .metadata import agginfo_relpath
    # what is the latest state of aggregated metadata
    metadata_state = self.ds.repo.get_last_commit_hash(agginfo_relpath)
    # use location common to all index types, they would all invalidate
    # simultaneously
    stamp_fname = opj(self.index_dir, 'datalad_metadata_state')
    index_dir = opj(self.index_dir, self._mode_label)

    if (not force_reindex) and \
            exists(index_dir) and \
            exists(stamp_fname) and \
            open(stamp_fname).read() == metadata_state:
        try:
            # TODO check that the index schema is the same
            # as the one we would have used for reindexing
            # TODO support incremental re-indexing, whoosh can do it
            idx = widx.open_dir(index_dir)
            lgr.debug(
                'Search index contains %i documents',
                idx.doc_count())
            self.idx_obj = idx
            return
        except widx.LockError as e:
            raise e
        except widx.IndexError as e:
            # Generic index error.
            # we try to regenerate
            lgr.warning(
                "Cannot open existing index %s (%s), will regenerate",
                index_dir, exc_str(e))
        except widx.IndexVersionError as e:  # (msg, version, release=None)
            # Raised when you try to open an index using a format that the
            # current version of Whoosh cannot read. That is, when the index
            # you're trying to open is either not backward or forward
            # compatible with this version of Whoosh.
            # we try to regenerate
            lgr.warning(exc_str(e))
            pass
        except widx.OutOfDateError as e:
            # Raised when you try to commit changes to an index which is not
            # the latest generation.
            # this should not happen here, but if it does ... KABOOM
            raise
        except widx.EmptyIndexError as e:
            # Raised when you try to work with an index that has no indexed
            # terms.
            # we can just continue with generating an index
            pass
        except ValueError as e:
            if 'unsupported pickle protocol' in str(e):
                lgr.warning(
                    "Cannot open existing index %s (%s), will regenerate",
                    index_dir, exc_str(e))
            else:
                raise

    lgr.info('{} search index'.format(
        'Rebuilding' if exists(index_dir) else 'Building'))

    if not exists(index_dir):
        os.makedirs(index_dir)

    # this is a pretty cheap call that just pulls this info from a file
    dsinfo = self.ds.metadata(
        get_aggregates=True,
        return_type='list',
        result_renderer='disabled')

    self._mk_schema(dsinfo)

    idx_obj = widx.create_in(index_dir, self.schema)
    idx = idx_obj.writer(
        # cache size per process
        limitmb=cfg.obtain('datalad.search.indexercachesize'),
        # disable parallel indexing for now till #1927 is resolved
        ## number of processes for indexing
        #procs=multiprocessing.cpu_count(),
        ## write separate index segments in each process for speed
        ## asks for writer.commit(optimize=True)
        #multisegment=True,
    )

    # load metadata of the base dataset and what it knows about all its subdatasets
    # (recursively)
    old_idx_size = 0
    old_ds_rpath = ''
    idx_size = 0
    log_progress(
        lgr.info,
        'autofieldidxbuild',
        'Start building search index',
        total=len(dsinfo),
        label='Building search index',
        unit=' Datasets',
    )
    for res in query_aggregated_metadata(
            reporton=self.documenttype,
            ds=self.ds,
            aps=[dict(path=self.ds.path, type='dataset')],
            # MIH: I cannot see a case when we would not want recursion (within
            # the metadata)
            recursive=True):
        # this assumes that files are reported after each dataset report,
        # and after a subsequent dataset report no files for the previous
        # dataset will be reported again
        meta = res.get('metadata', {})
        doc = self._meta2doc(meta)
        admin = {
            'type': res['type'],
            'path': relpath(res['path'], start=self.ds.path),
        }
        if 'parentds' in res:
            admin['parentds'] = relpath(res['parentds'], start=self.ds.path)
        if admin['type'] == 'dataset':
            if old_ds_rpath:
                lgr.debug(
                    'Added %s on dataset %s',
                    single_or_plural(
                        'document',
                        'documents',
                        idx_size - old_idx_size,
                        include_count=True),
                    old_ds_rpath)
                log_progress(lgr.info, 'autofieldidxbuild',
                             'Indexed dataset at %s', old_ds_rpath,
                             update=1, increment=True)
            old_idx_size = idx_size
            old_ds_rpath = admin['path']
            admin['id'] = res.get('dsid', None)

        doc.update({k: assure_unicode(v) for k, v in admin.items()})
        lgr.debug("Adding document to search index: {}".format(doc))
        # inject into index
        idx.add_document(**doc)
        idx_size += 1

    if old_ds_rpath:
        lgr.debug(
            'Added %s on dataset %s',
            single_or_plural(
                'document',
                'documents',
                idx_size - old_idx_size,
                include_count=True),
            old_ds_rpath)

    lgr.debug("Committing index")
    idx.commit(optimize=True)
    log_progress(lgr.info, 'autofieldidxbuild', 'Done building search index')

    # "timestamp" the search index to allow for automatic invalidation
    with open(stamp_fname, 'w') as f:
        f.write(metadata_state)

    lgr.info('Search index contains %i documents', idx_size)
    self.idx_obj = idx_obj
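# Illustrative sketch (not part of the original module): the stamp-file
# invalidation used above, reduced to two helpers. The path layout mirrors the
# code above, but the state string here is just an illustrative value; in the
# original it is the last commit hash of the aggregated-metadata path.
import os

def index_is_current(index_dir, state):
    # reuse the index only if the recorded state matches the current one
    stamp = os.path.join(index_dir, 'datalad_metadata_state')
    return os.path.exists(stamp) and open(stamp).read() == state

def stamp_index(index_dir, state):
    # record the state the index was built from, for the next invocation
    with open(os.path.join(index_dir, 'datalad_metadata_state'), 'w') as f:
        f.write(state)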
def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything aggregated we would get nothing and only a log warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
    assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  where='dataset')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     where='dataset')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                        where='dataset')
    ds.add('.', recursive=True)
    ok_clean_git(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get success report for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 6)
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_result_count(res, 3, status='ok', action='save')
    # nice and tidy
    ok_clean_git(ds.path)

    # quick test of aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also report layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # Now that we have annex.key
    # three different IDs
    assert_equal(3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] \
                 == assure_unicode(name)
                 for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone, should make no difference
    assert_status('ok', clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])
def _get_search_index(index_dir, ds, force_reindex):
    from whoosh import index as widx
    from .metadata import agginfo_relpath
    # what is the latest state of aggregated metadata
    metadata_state = ds.repo.get_last_commit_hash(agginfo_relpath)
    stamp_fname = opj(index_dir, 'datalad_metadata_state')
    definitions_fname = opj(index_dir, 'datalad_term_definitions.json.gz')

    if not force_reindex and \
            exists(stamp_fname) and \
            open(stamp_fname).read() == metadata_state:
        try:
            # TODO check that the index schema is the same
            # as the one we would have used for reindexing
            # TODO support incremental re-indexing, whoosh can do it
            idx = widx.open_dir(index_dir)
            lgr.debug(
                'Search index contains %i documents',
                idx.doc_count())
            return idx
        except widx.LockError as e:
            raise e
        except widx.IndexError as e:
            # Generic index error.
            # we try to regenerate
            # TODO log this
            pass
        except widx.IndexVersionError as e:  # (msg, version, release=None)
            # Raised when you try to open an index using a format that the
            # current version of Whoosh cannot read. That is, when the index
            # you're trying to open is either not backward or forward
            # compatible with this version of Whoosh.
            # we try to regenerate
            lgr.warning(exc_str(e))
            pass
        except widx.OutOfDateError as e:
            # Raised when you try to commit changes to an index which is not
            # the latest generation.
            # this should not happen here, but if it does ... KABOOM
            raise e
        except widx.EmptyIndexError as e:
            # Raised when you try to work with an index that has no indexed
            # terms.
            # we can just continue with generating an index
            pass

    lgr.info('{} search index'.format(
        'Rebuilding' if exists(index_dir) else 'Building'))

    if not exists(index_dir):
        os.makedirs(index_dir)

    schema, definitions, per_ds_defs = _get_search_schema(ds)

    idx_obj = widx.create_in(index_dir, schema)
    idx = idx_obj.writer(
        # cache size per process
        limitmb=cfg.obtain('datalad.search.indexercachesize'),
        # disable parallel indexing for now till #1927 is resolved
        ## number of processes for indexing
        #procs=multiprocessing.cpu_count(),
        ## write separate index segments in each process for speed
        ## asks for writer.commit(optimize=True)
        #multisegment=True,
    )

    # load metadata of the base dataset and what it knows about all its subdatasets
    # (recursively)
    old_idx_size = 0
    old_ds_rpath = ''
    idx_size = 0
    for res in query_aggregated_metadata(
            reporton=ds.config.obtain(
                'datalad.metadata.searchindex-documenttype'),
            ds=ds,
            aps=[dict(path=ds.path, type='dataset')],
            # TODO expose? but this would likely only affect metadata in the
            # base dataset
            merge_mode='init',
            # MIH: I cannot see a case when we would not want recursion (within
            # the metadata)
            recursive=True):
        rpath = relpath(res['path'], start=ds.path)
        # this assumes that files are reported after each dataset report,
        # and after a subsequent dataset report no files for the previous
        # dataset will be reported again
        rtype = res['type']
        meta = res.get('metadata', {})
        meta = MetadataDict(meta)
        if rtype == 'dataset':
            if old_ds_rpath:
                lgr.info(
                    'Added %s on dataset %s',
                    single_or_plural(
                        'document',
                        'documents',
                        idx_size - old_idx_size,
                        include_count=True),
                    old_ds_rpath)
            old_idx_size = idx_size
            old_ds_rpath = rpath

            # get any custom dataset mappings
            ds_defs = per_ds_defs.get(res['path'], {})
            # now we merge all reported unique content properties (flattened representation
            # of content metadata) with the main metadata set, using the 'add' strategy
            # this way any existing metadata value of a dataset itself will be amended by
            # those coming from the content. E.g. a single dataset 'license' might be turned
            # into a sequence of unique license identifiers across all dataset components
            meta.merge_add(meta.get('unique_content_properties', {}))
            meta.pop('unique_content_properties', None)
        doc_props = dict(
            path=rpath,
            type=rtype,
            **_meta2index_dict(meta, definitions, ds_defs))
        if 'parentds' in res:
            doc_props['parentds'] = relpath(res['parentds'], start=ds.path)
        _add_document(idx, **doc_props)
        idx_size += 1

    if old_ds_rpath:
        lgr.info(
            'Added %s on dataset %s',
            single_or_plural(
                'document',
                'documents',
                idx_size - old_idx_size,
                include_count=True),
            old_ds_rpath)

    lgr.debug("Committing index")
    idx.commit(optimize=True)

    # "timestamp" the search index to allow for automatic invalidation
    with open(stamp_fname, 'w') as f:
        f.write(metadata_state)

    # dump the term/field definitions records for later introspection
    # use compressed storage, there is no point in inflating the
    # diskspace requirements
    lgr.debug("Storing definitions to %s", definitions_fname)
    with gzopen(definitions_fname, 'wb') as f:
        # TODO actually go through all, incl. compound, definitions ('@id' plus 'unit'
        # or similar) and resolve terms to URLs, if anyhow possible
        jsondump2file(definitions, f)

    lgr.info('Search index contains %i documents', idx_size)
    return idx_obj
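# Illustrative sketch (not part of the original module): the intent of the
# 'add' merge described above -- a scalar dataset value is amended with the
# unique values reported for its content. This mimics what
# MetadataDict.merge_add is used for here, not its actual API (hypothetical
# helper and toy data).
def merge_add(meta, unique_props):
    for k, vals in unique_props.items():
        have = meta.get(k, [])
        have = have if isinstance(have, list) else [have]
        for v in vals:
            if v not in have:
                have.append(v)
        meta[k] = have
    return meta

print(merge_add({'license': 'CC0'}, {'license': ['CC0', 'PDDL']}))
# -> {'license': ['CC0', 'PDDL']}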
def _get_search_schema(ds):
    from whoosh import fields as wf

    # haven for terms that have been found to be undefined
    # (for faster decision-making upon next encounter)
    undef = set()
    # this will harvest all discovered term definitions
    definitions = {
        '@id': 'unique identifier of an entity',
        # TODO make proper JSON-LD definition
        'path': 'path name of an entity relative to the searched base dataset',
        # TODO make proper JSON-LD definition
        'parentds': 'path of the datasets that contains an entity',
        # 'type' will not come from a metadata field, hence will not be detected
        'type': {
            '@id': _resolve_term(common_defs['type']['def'], {}, common_defs, undef),
            'description': common_defs['type']['descr']},
    }

    schema_fields = {
        n: wf.ID(stored=True, unique=n == '@id')
        for n in definitions}
    # this will contain any dataset-specific term mappings, in case we find
    # non-unique keys that are differently defined
    per_ds_defs = {}
    ds_defs = {}

    lgr.info('Scanning for metadata keys')
    # quick 1st pass over all datasets to gather the needed schema fields
    # sanitization of / should ideally be done while saving, but that would require
    # fixes in whoosh I guess
    sanitize_key = lambda k: k.replace(' ', '_').replace('/', '_')
    for res in query_aggregated_metadata(
            reporton='datasets',
            ds=ds,
            aps=[dict(path=ds.path, type='dataset')],
            merge_mode='init',
            recursive=True):
        ds_defs = {}
        meta = res.get('metadata', {})
        for k, v in meta.get('@context', {}).items():
            k = sanitize_key(k)
            if k not in definitions or definitions[k] == v:
                # this is new, but unique, or uniformly defined
                definitions[k] = v
            else:
                # non-unique key (across all seen datasets)
                # make unique
                # TODO we have to deal with @vocab fields in here, those
                # might be different when some aggregated metadata was
                # generated with an old version of datalad
                # in this case we should actually load the old vocabulary
                #set.add(', '.join(i for i in v) if isinstance(v, (tuple, list)) else v)
                # and perform the mapping to the current one in here
                count = 0
                uk = k
                while uk in definitions:
                    if definitions[uk] == v:
                        break  # already exists and matches
                    count += 1
                    uk = '{}_{}'.format(k, count)
                ds_defs[k] = k = uk
                definitions[k] = v
            # we register a field for any definition in the context.
            # while this has the potential to needlessly blow up the
            # index size, the only alternative would be to iterate over
            # all content metadata in this first pass too, in order to
            # do a full scan.
            if k == '@vocab' or isinstance(v, dict) and v.get(
                    'type', None) == vocabulary_id:
                continue
            schema_fields[k] = wf.TEXT(stored=True)
        if ds_defs:
            # store ds-specific mapping for the second pass that actually
            # generates the search index
            per_ds_defs[res['path']] = ds_defs

        # anything that is a direct metadata key or is reported as being a content metadata
        # key is a valid candidate for inclusion into the schema
        cand_keys = list(meta)
        cand_keys.extend(meta.get('unique_content_properties', []))
        # need a copy, we are going to reformat keys of ad-hoc defs
        final_defs = dict(definitions)
        for k in cand_keys:
            k = sanitize_key(k)
            if k in ('unique_content_properties', '@context'):
                # those are just means for something else and irrelevant
                # for searches
                continue
            # check if we have any kind of definitions for this key
            if k not in definitions:
                termdef = _resolve_term(k, definitions, common_defs, undef)
                if termdef is None:
                    # we know nothing about this key, ignore
                    lgr.debug("Ignoring term '%s', no definition found", k)
                    continue
                final_defs[k] = termdef
                # TODO treat keywords/tags separately
                schema_fields[k] = wf.TEXT(stored=True)
            else:
                if isinstance(definitions[k], dict):
                    final_defs[k] = {
                        k_ if k_ == '@id' else '{} ({})'.format(
                            k_, _resolve_term(k_, definitions, common_defs, undef)):
                        _resolve_term(v, definitions, common_defs, undef)
                        if k_ in ('@id', 'unit') else v
                        for k_, v in definitions[k].items()
                        if v  # skip if value is empty
                    }

    schema = wf.Schema(**schema_fields)
    return schema, final_defs, per_ds_defs
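# Illustrative sketch (not part of the original module): the key
# uniquification above in isolation. A conflicting re-definition of a context
# key gets a numeric suffix, and the per-dataset mapping records the rename
# (definitions and values here are toy data).
definitions = {'license': 'http://example.com/defA'}
ds_defs = {}

k, v = 'license', 'http://example.com/defB'
if k not in definitions or definitions[k] == v:
    definitions[k] = v
else:
    count = 0
    uk = k
    while uk in definitions:
        if definitions[uk] == v:
            break  # an identical definition already exists under this name
        count += 1
        uk = '{}_{}'.format(k, count)
    ds_defs[k] = k = uk
    definitions[k] = v

print(ds_defs)      # {'license': 'license_1'}
print(definitions)  # {'license': '...defA', 'license_1': '...defB'}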
def extract(ds, output_directory, repository_info=None):
    if pd is None:
        lgr.error(
            "This plugin requires Pandas to be available (error follows)")
        import pandas
        return
    # collect infos about dataset and ISATAB structure for use in investigator
    # template
    info = {}

    if not exists(output_directory):
        lgr.info(
            "creating output directory at '{}'".format(output_directory))
        os.makedirs(output_directory)

    # pull out everything we know about any file in the dataset, and the dataset
    # itself
    metadb = {
        relpath(r['path'], ds.path): r.get('metadata', {})
        for r in query_aggregated_metadata(
            'all',
            ds,
            [dict(path=ds.path, type='dataset')],
            'init')
    }

    # prep for assay table info
    protocols = OrderedDict()
    for prop in assay_props:
        info[prop] = []

    # pull out essential metadata bits about the dataset itself
    # (for study description)
    dsmeta = metadb.get('.', {})
    info['name'] = dsmeta.get('shortdescription', dsmeta.get('name', 'TODO'))
    info['author'] = '\t'.join(assure_list(dsmeta.get('author', [])))
    info['keywords'] = '\t'.join(assure_list(dsmeta.get('tag', [])))

    # generate: s_study.txt
    study_df = _get_study_df(ds)
    if study_df.empty:
        # no samples, no assays, no metadataset
        return None

    _gather_protocol_parameters_from_df(study_df, protocols)
    _store_beautiful_table(
        study_df,
        output_directory,
        "s_study.txt")
    info['studytab_filename'] = 's_study.txt'

    deface_df = None
    # all imaging modalities recognized in BIDS
    #TODO maybe fold 'defacemask' into each modality as a derivative
    for modality in ('defacemask', 'T1w', 'T2w', 'T1map', 'T2map', 'FLAIR',
                     'FLASH', 'PD', 'PDmap', 'PDT2', 'inplaneT1', 'inplaneT2',
                     'angio', 'sbref', 'bold', 'SWImagandphase'):
        # what files do we have for this modality
        modfiles = _get_file_matches(
            metadb, '^sub-.*_{}\.nii\.gz$'.format(modality))
        if not len(modfiles):
            # no files found, try next
            lgr.info(
                "no files match MRI modality '{}', skipping".format(modality))
            continue
        df = _get_assay_df(
            metadb,
            modality,
            "Magnetic Resonance Imaging",
            modfiles,
            _describe_file,
            repository_info)
        if df is None:
            continue
        if modality == 'defacemask':
            # rename columns to strip index
            df.columns = [c[6:] for c in df.columns]
            df.rename(columns={'Raw Data File': 'Derived Data File'}, inplace=True)
            df.drop(
                ['Assay Name', 'Sample Name'] +
                [c for c in df.columns if c.startswith('Factor')],
                axis=1,
                inplace=True)
            deface_df = df
            # re-prefix for merge logic compatibility below
            deface_df.columns = [
                _get_colkey(i, c) for i, c in enumerate(df.columns)]
            # do not save separate, but include into the others as a derivative
            continue
        elif deface_df is not None:
            # get any factor columns, put last in final table
            factors = []
            # find where they start
            for i, c in enumerate(df.columns):
                if '_Factor Value[' in c:
                    factors = df.columns[i:]
                    break
            factor_df = df[factors]
            df.drop(factors, axis=1, inplace=True)
            # merge relevant rows from deface df (hstack), by matching assay name
            df = df.join(deface_df, rsuffix='_deface')
            df.columns = [
                c[:-7] if c.endswith('_deface') else c for c in df.columns]
            # cannot have overlapping columns, we removed the factor before
            df = df.join(factor_df)
        # rename columns to strip index
        df.columns = [c[6:] for c in df.columns]
        # parse df to gather protocol info
        _gather_protocol_parameters_from_df(df, protocols)
        # store
        assay_fname = "a_mri_{}.txt".format(modality.lower())
        _store_beautiful_table(
            df,
            output_directory,
            assay_fname)
        info['assay_fname'].append(assay_fname)
        info['assay_techtype'].append('nuclear magnetic resonance')
        info['assay_techtype_term'].append('OBI:0000182')
        info['assay_techtype_termsrc'].append('OBI')
        info['assay_measurementtype'].append('MRI Scanner')
        info['assay_measurementtype_term'].append('ERO:MRI_Scanner')
        info['assay_measurementtype_termsrc'].append('ERO')

    # non-MRI modalities
    for modlabel, assaylabel, protoref in (
            ('physio', 'physio', "Physiological Measurement"),
            ('stim', 'stimulation', "Stimulation")):
        df = _get_assay_df(
            metadb,
            modlabel,
            protoref,
            _get_file_matches(metadb, '^sub-.*_{}.tsv.gz$'.format(modlabel)),
            _describe_file,
            repository_info)
        if df is None:
            continue
        # rename columns to strip index
        df.columns = [c[6:] for c in df.columns]
        assay_fname = "a_{}.txt".format(assaylabel)
        _store_beautiful_table(
            df,
            output_directory,
            assay_fname)
        info['assay_fname'].append(assay_fname)
        # ATM we cannot say anything definitive about these
        info['assay_techtype'].append('TODO')
        info['assay_techtype_term'].append('TODO')
        info['assay_techtype_termsrc'].append('TODO')
        info['assay_measurementtype'].append(assaylabel)
        info['assay_measurementtype_term'].append('TODO')
        info['assay_measurementtype_termsrc'].append('TODO')

    # post-proc assay-props for output
    for prop in assay_props:
        info[prop] = '\t'.join(assure_list(info[prop]))
    info['protocol_name'] = '\t'.join(protocols.keys())
    for k in ('type', 'term', 'termsrc'):
        info['protocol_{}'.format(k)] = '\t'.join(
            protocol_defs.get(p, {}).get(k, 'TODO') for p in protocols)
    info['protocol_parameters'] = '\t'.join(
        '; '.join(sorted(protocols[p])) for p in protocols)

    return info