# NOTE: Import paths are assumptions for this excerpt; in pybids, these
# model classes live in bids.layout.models. `create_session` (used below)
# is assumed to be a test helper that builds an in-memory SQLite session,
# mirroring the setup inlined in `writable_file`.
import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from bids.layout.models import Base, BIDSFile, Entity, Tag


def test_tag_dtype(sample_bidsfile, subject_entity):
    f, e = sample_bidsfile, subject_entity
    # Various ways of initializing--should all give the same result
    tags = [
        Tag(f, e, 4, int),
        Tag(f, e, '4', 'int'),
        Tag(f, e, '4', int),
        Tag(f, e, 4),
        Tag(file=f, entity=e, dtype=int, value='4')
    ]
    assert all(t.dtype == int for t in tags)
def test_entity_add_file(sample_bidsfile):
    session = create_session()
    bf = sample_bidsfile
    e = Entity('prop', r'-(\d+)')
    t = Tag(file=bf, entity=e, value=4)
    session.add_all([t, e, bf])
    session.commit()
    # The tag value should be retrievable from the entity, keyed by file path
    assert e.files[bf.path] == 4
@pytest.fixture
def writable_file(tmpdir):
    # Set up an in-memory SQLite database to hold the records
    engine = create_engine('sqlite://')
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    session = Session()

    # Write a throwaway BIDS-style file and wrap it in a BIDSFile record
    testfile = 'sub-03_ses-2_task-rest_acq-fullbrain_run-2_bold.nii.gz'
    fn = tmpdir.mkdir("tmp").join(testfile)
    fn.write('###')
    bf = BIDSFile(str(fn))

    # Tag the file with a few entities and persist everything
    tag_dict = {'task': 'rest', 'run': 2, 'subject': '3'}
    ents = {name: Entity(name) for name in tag_dict.keys()}
    tags = [Tag(bf, ents[k], value=v) for k, v in tag_dict.items()]
    session.add_all(list(ents.values()) + tags + [bf])
    session.commit()
    return bf
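
# A hedged usage sketch, not from the original suite: assuming
# `BIDSFile.entities` reflects the file's tags (in pybids it is an
# association proxy over them), a test consuming the fixture above might
# verify the round trip like this.
def test_writable_file_roundtrip(writable_file):
    assert writable_file.entities.get('task') == 'rest'
    assert writable_file.entities.get('subject') == '3'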
def test_tag_init(sample_bidsfile, subject_entity):
    f, e = sample_bidsfile, subject_entity
    tag = Tag(f, e, 'zzz')
    rep = str(tag)
    assert rep.startswith("<Tag file:") and f.path in rep and 'zzz' in rep
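
# The tests in this excerpt assume `sample_bidsfile` and `subject_entity`
# fixtures that are not shown here. A minimal sketch consistent with how
# the tests use them might look like this (the file name and entity
# pattern are illustrative assumptions, not the project's definitions):

@pytest.fixture
def sample_bidsfile(tmpdir):
    testfile = 'sub-03_ses-2_task-rest_acq-fullbrain_run-2_bold.nii.gz'
    fn = tmpdir.mkdir("data").join(testfile)
    fn.write('###')
    return BIDSFile(str(fn))


@pytest.fixture
def subject_entity():
    return Entity('subject', r"[/\\]sub-([a-zA-Z0-9]+)", dtype='str')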
def index_metadata(self, **filters):
    """Index metadata for all files in the BIDS dataset."""
    # Process JSON files first if we're indexing metadata
    all_files = self.layout.get(absolute_paths=True, **filters)

    # Track ALL entities we've seen in file names or metadata
    all_entities = {}
    for c in self.config:
        all_entities.update(c.entities)

    # If key/value pairs in JSON files duplicate ones extracted from file
    # names, we can end up with Tag collisions in the DB. To prevent this,
    # we store all filename/entity pairs and the value, and then check
    # against that store before adding each new Tag.
    all_tags = {}
    for t in self.session.query(Tag).all():
        key = '{}_{}'.format(t.file_path, t.entity_name)
        all_tags[key] = str(t.value)

    # We build up a store of all file data as we iterate files. It looks
    # like: {extension/suffix: {dirname: [(entities, payload, path)]}}.
    # The payload is left empty (None) for non-JSON files.
    file_data = {}

    for bf in all_files:
        file_ents = bf.entities.copy()
        suffix = file_ents.pop('suffix', None)
        ext = file_ents.pop('extension', None)

        if suffix is not None and ext is not None:
            key = "{}/{}".format(ext, suffix)
            if key not in file_data:
                file_data[key] = defaultdict(list)

            if ext == 'json':
                with open(bf.path, 'r') as handle:
                    try:
                        payload = json.load(handle)
                    except json.JSONDecodeError as e:
                        msg = ("Error occurred while trying to decode JSON"
                               " from file '{}'.".format(bf.path))
                        raise IOError(msg) from e
            else:
                payload = None

            to_store = (file_ents, payload, bf.path)
            file_data[key][bf.dirname].append(to_store)

    # To avoid integrity errors, track primary keys we've seen
    seen_assocs = set()

    def create_association_pair(src, dst, kind, kind2=None):
        # Record the association in both directions (e.g., Child/Parent),
        # skipping any pair that has already been added.
        kind2 = kind2 or kind
        pk1 = '#'.join([src, dst, kind])
        if pk1 not in seen_assocs:
            self.session.add(FileAssociation(src=src, dst=dst, kind=kind))
            seen_assocs.add(pk1)
        pk2 = '#'.join([dst, src, kind2])
        if pk2 not in seen_assocs:
            self.session.add(FileAssociation(src=dst, dst=src, kind=kind2))
            seen_assocs.add(pk2)

    # TODO: Efficiency of everything in this loop could be improved
    filenames = [bf for bf in all_files if not bf.path.endswith('.json')]

    for bf in filenames:
        file_ents = bf.entities.copy()
        suffix = file_ents.pop('suffix', None)
        ext = file_ents.pop('extension', None)
        file_ent_keys = set(file_ents.keys())

        if suffix is None or ext is None:
            continue

        # Extract metadata associated with the file. The idea is that we
        # loop over parent directories, and if we find payloads in the
        # file_data store (indexing by directory and current file suffix),
        # we check to see if the candidate JSON file's entities are
        # entirely consumed by the current file. If so, it's a valid
        # candidate, and we add the payload to the stack. Finally, we
        # invert the stack and merge the payloads in order.
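        # (Illustrative example, not from the original source:) for
        # sub-01/func/sub-01_task-rest_run-1_bold.nii.gz, the walk below
        # would consider sidecars such as
        #   sub-01/func/sub-01_task-rest_run-1_bold.json,
        #   sub-01/sub-01_task-rest_bold.json, and
        #   task-rest_bold.json (at the dataset root),
        # with payloads found deeper in the tree merged last, so more
        # specific sidecars take precedence.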
ext_key = "{}/{}".format(ext, suffix) json_key = "json/{}".format(suffix) dirname = bf.dirname payloads = [] ancestors = [] while True: # Get JSON payloads json_data = file_data.get(json_key, {}).get(dirname, []) for js_ents, js_md, js_path in json_data: js_keys = set(js_ents.keys()) if js_keys - file_ent_keys: continue matches = [ js_ents[name] == file_ents[name] for name in js_keys ] if all(matches): payloads.append((js_md, js_path)) # Get all files this file inherits from candidates = file_data.get(ext_key, {}).get(dirname, []) for ents, _, path in candidates: keys = set(ents.keys()) if keys - file_ent_keys: continue matches = [ents[name] == file_ents[name] for name in keys] if all(matches): ancestors.append(path) parent = os.path.dirname(dirname) if parent == dirname: break dirname = parent if not payloads: continue # Create DB records for metadata associations js_file = payloads[-1][1] create_association_pair(js_file, bf.path, 'Metadata') # Consolidate metadata by looping over inherited JSON files file_md = {} for pl, js_file in payloads[::-1]: file_md.update(pl) # Create FileAssociation records for JSON inheritance n_pl = len(payloads) for i, (pl, js_file) in enumerate(payloads): if (i + 1) < n_pl: other = payloads[i + 1][1] create_association_pair(js_file, other, 'Child', 'Parent') # Inheritance for current file n_pl = len(ancestors) for i, src in enumerate(ancestors): if (i + 1) < n_pl: dst = ancestors[i + 1] create_association_pair(src, dst, 'Child', 'Parent') # Files with IntendedFor field always get mapped to targets intended = listify(file_md.get('IntendedFor', [])) for target in intended: # Per spec, IntendedFor paths are relative to sub dir. target = os.path.join(self.root, 'sub-{}'.format(bf.entities['subject']), target) create_association_pair(bf.path, target, 'IntendedFor', 'InformedBy') # Link files to BOLD runs if suffix in ['physio', 'stim', 'events', 'sbref']: images = self.layout.get(extension=['nii', 'nii.gz'], suffix='bold', return_type='filename', **file_ents) for img in images: create_association_pair(bf.path, img, 'IntendedFor', 'InformedBy') # Link files to DWI runs if suffix == 'sbref' or ext in ['bvec', 'bval']: images = self.layout.get(extension=['nii', 'nii.gz'], suffix='dwi', return_type='filename', **file_ents) for img in images: create_association_pair(bf.path, img, 'IntendedFor', 'InformedBy') # Create Tag <-> Entity mappings, and any newly discovered Entities for md_key, md_val in file_md.items(): tag_string = '{}_{}'.format(bf.path, md_key) # Skip pairs that were already found in the filenames if tag_string in all_tags: file_val = all_tags[tag_string] if str(md_val) != file_val: msg = ( "Conflicting values found for entity '{}' in " "filename {} (value='{}') versus its JSON sidecar " "(value='{}'). Please reconcile this discrepancy.") raise ValueError( msg.format(md_key, bf.path, file_val, md_val)) continue if md_key not in all_entities: all_entities[md_key] = Entity(md_key, is_metadata=True) self.session.add(all_entities[md_key]) tag = Tag(bf, all_entities[md_key], md_val) self.session.add(tag) if len(self.session.new) >= 1000: self.session.commit() self.session.commit()