def test_pattern_1(self):
    self._test_pattern(["*"], [
        MetadataPath(".datalad_metadata"),
        MetadataPath("s1"),
        MetadataPath("s2"),
        MetadataPath("d3")
    ])
def get_path_info(dataset: Dataset,
                  element_path: Optional[Path],
                  into_dataset_path: Optional[Path] = None
                  ) -> Tuple[MetadataPath, MetadataPath]:
    """
    Determine the dataset tree path and the file tree path.

    If the path is absolute, we can determine the containing dataset
    and the metadatasets around it. If the path is not an element of
    a locally known dataset, we signal an error.

    If the path is relative, we convert it to an absolute path by
    appending it to the dataset or current directory and perform the
    above check.
    """
    full_dataset_path = Path(dataset.path).resolve()
    if into_dataset_path is None:
        dataset_tree_path = MetadataPath("")
    else:
        full_into_dataset_path = into_dataset_path.resolve()
        dataset_tree_path = MetadataPath(
            full_dataset_path.relative_to(full_into_dataset_path))

    if element_path is None:
        return dataset_tree_path, MetadataPath("")

    if element_path.is_absolute():
        full_file_path = element_path
    else:
        full_file_path = full_dataset_path / element_path

    file_tree_path = full_file_path.relative_to(full_dataset_path)
    return dataset_tree_path, MetadataPath(file_tree_path)
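# A hedged usage sketch for get_path_info; the Dataset location and the
# concrete paths below are illustrative assumptions, not taken from the
# code above:
#
#   ds = Dataset("/tmp/super/sub")        # a dataset below /tmp/super
#   dataset_tree_path, file_tree_path = get_path_info(
#       ds,
#       Path("dir/file.txt"),             # relative, resolved against ds
#       into_dataset_path=Path("/tmp/super"))
#
#   # dataset_tree_path == MetadataPath("sub")
#   # file_tree_path == MetadataPath("dir/file.txt")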
def test_tree_version(self):
    parser = MetadataURLParser("tree:/a/b/c@00112233:/x/y")
    result = parser.parse()
    self.assertIsInstance(result, TreeMetadataURL)
    self.assertEqual(result.version, "00112233")
    self.assertEqual(result.dataset_path, MetadataPath("/a/b/c"))
    self.assertEqual(result.local_path, MetadataPath("/x/y"))
def test_relative_path(self):
    parser = MetadataURLParser(":a/b/c")
    result = parser.parse()
    self.assertIsInstance(result, TreeMetadataURL)
    self.assertIsNone(result.version)
    self.assertEqual(result.dataset_path, MetadataPath(""))
    self.assertEqual(result.local_path, MetadataPath("a/b/c"))
def test_empty_paths(self):
    parser = MetadataURLParser("tree:@00112233")
    result = parser.parse()
    self.assertIsInstance(result, TreeMetadataURL)
    self.assertEqual(result.version, "00112233")
    self.assertEqual(result.dataset_path, MetadataPath(""))
    self.assertEqual(result.local_path, MetadataPath(""))
def test_auto_list_root_on(self):
    found, failed = self.tree_search.get_matching_paths(
        [""], False, auto_list_root=True)
    self.assertPathsInResult(
        found,
        [
            MetadataPath(".datalad_metadata"),
            MetadataPath("s1"),
            MetadataPath("s2"),
            MetadataPath("d3")
        ])
    self.assertListEqual(failed, [])
def dump_from_dataset_tree(mapper: str,
                           metadata_store: str,
                           tree_version_list: TreeVersionList,
                           metadata_url: TreeMetadataURL,
                           recursive: bool) -> Generator[dict, None, None]:
    """ Dump dataset tree elements that are referenced in path """

    # Normalize path representation
    if not metadata_url or metadata_url.dataset_path is None:
        metadata_url = TreeMetadataURL(MetadataPath(""), MetadataPath(""))

    # Get the requested version; if none is specified, take the first
    # version from the tree version list.
    requested_root_dataset_version = metadata_url.version
    if requested_root_dataset_version is None:
        # TODO: add an item() method to VersionList
        requested_root_dataset_version = tuple(
            tree_version_list.versions())[0]

    # Fetch the dataset tree for the requested version
    time_stamp, dataset_tree = tree_version_list.get_dataset_tree(
        requested_root_dataset_version)
    root_mrr = dataset_tree.get_metadata_root_record(MetadataPath(""))
    root_dataset_version = root_mrr.dataset_version
    root_dataset_identifier = root_mrr.dataset_identifier

    # Create a tree search object to search for the specified datasets
    tree_search = TreeSearch(dataset_tree)
    matches, not_found_paths = tree_search.get_matching_paths(
        [str(metadata_url.dataset_path)], recursive, auto_list_root=False)

    for missing_path in not_found_paths:
        lgr.error(
            f"could not locate metadata for dataset path {missing_path} "
            f"in tree version {metadata_url.version} in "
            f"metadata_store {mapper}:{metadata_store}")

    for match_record in matches:
        yield from show_dataset_metadata(
            mapper,
            metadata_store,
            root_dataset_identifier,
            root_dataset_version,
            match_record.path,
            match_record.node.value)

        yield from show_file_tree_metadata(
            mapper,
            metadata_store,
            root_dataset_identifier,
            root_dataset_version,
            MetadataPath(match_record.path),
            match_record.node.value,
            str(metadata_url.local_path),
            recursive)

    return
def get_matching_paths(self,
                       pattern_list: List[str],
                       recursive: bool,
                       auto_list_root: bool = True
                       ) -> Tuple[List[MatchRecord], List[MetadataPath]]:
    """
    Get all metadata paths that match the patterns in pattern_list.

    - Leading "/" are removed from patterns, since metadata paths
      are not absolute.
    - Empty pattern-specifications, i.e. '', are interpreted as
      root-dataset or root-file-tree nodes.
    """
    pattern_elements_list = [
        MetadataPath(pattern)
        for pattern in set(pattern_list)]
    matching, failed = self._get_matching_nodes(
        pattern_elements_list, auto_list_root)
    if recursive:
        matching = self._list_recursive(matching[:])
    return matching, failed
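# Usage example, grounded in the TreeSearch tests in this section (cf.
# test_pattern_4 and test_pattern_7): matched patterns yield match
# records, unmatched patterns are reported in the failed list.
#
#   found, failed = tree_search.get_matching_paths(["d3/*", "see"], False)
#   # [r.path for r in found] == [MetadataPath("d3/.datalad_metadata"),
#   #                             MetadataPath("d3/some_file")]
#   # failed == [MetadataPath("see")]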
def test_add_file_end_to_end(file_name):
    test_path = "d_0/d_0.0/f_0.0.0"
    with open(file_name, "tw") as f:
        json.dump({
            **metadata_template,
            "type": "file",
            "path": test_path
        }, f)

    with tempfile.TemporaryDirectory() as temp_dir:
        git_repo = GitRepo(temp_dir)

        res = meta_add(metadata=file_name, metadata_store=git_repo.path)
        assert_result_count(res, 1)
        assert_result_count(res, 1, type='file')
        assert_result_count(res, 0, type='dataset')

        # Verify that file-level metadata was added
        tree_version_list, uuid_set, mrr = _get_top_nodes(
            git_repo,
            UUID(metadata_template["dataset_id"]),
            metadata_template["dataset_version"])

        file_tree = mrr.get_file_tree()
        assert_is_not_none(file_tree)
        assert_true(test_path in file_tree)

        metadata = file_tree.get_metadata(MetadataPath(test_path))
        metadata_content = _get_metadata_content(metadata)
        eq_(metadata_content, metadata_template["extracted_metadata"])
def get_file_info(dataset: Dataset, file_path: MetadataPath) -> FileInfo:
    """
    Get information about a file in the dataset.

    Raise FileNotFoundError if the file is not found in the dataset,
    and ValueError if it is present but not tracked.
    """
    # Convert the metadata file-path into a system file path
    path = Path(file_path)
    try:
        relative_path = path.relative_to(dataset.pathobj)
    except ValueError:
        relative_path = path

    path = dataset.pathobj / relative_path

    path_status = (list(dataset.status(path, result_renderer="disabled"))
                   or [None])[0]
    if path_status is None:
        raise FileNotFoundError("file not found: {}".format(path))

    if path_status["state"] == "untracked":
        raise ValueError("file not tracked: {}".format(path))

    # noinspection PyUnresolvedReferences
    return FileInfo(
        type=path_status["type"],
        git_sha_sum=path_status["gitshasum"],
        byte_size=path_status.get("bytesize", 0),
        state=path_status["state"],
        path=path_status["path"],   # TODO: use the dataset-tree path here?
        intra_dataset_path=str(
            MetadataPath(
                *PurePath(path_status["path"]).relative_to(
                    dataset.pathobj).parts)))
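# Illustrative sketch of a FileInfo returned for a clean, tracked file;
# all field values below are assumptions for demonstration only:
#
#   FileInfo(type="file",
#            git_sha_sum="76dc8f09b48e5b4d768f8abe09e8b4d5a2d89ef5",
#            byte_size=1048,
#            state="clean",
#            path="/home/user/ds/dir/file.dat",
#            intra_dataset_path="dir/file.dat")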
def test_auto_list_root_off(self):
    """ Expect a single root record for non-autolist root search """
    found, failed = self.tree_search.get_matching_paths(
        [""], False, auto_list_root=False)
    self.assertListEqual(
        found,
        [MatchRecord(MetadataPath(""), self.tree_search.tree)])
    self.assertListEqual(failed, [])
def test_uuid_empty(self):
    parser = MetadataURLParser("uuid:00112233-0011-2233-4455-66778899aabb")
    result = parser.parse()
    self.assertIsInstance(result, UUIDMetadataURL)
    self.assertIsNone(result.version)
    self.assertEqual(
        result.uuid, UUID("00112233-0011-2233-4455-66778899aabb"))
    self.assertEqual(result.local_path, MetadataPath(""))
def __call__(metadata: Union[str, JSONObject],
             metadata_store: Optional[str] = None,
             additionalvalues: Optional[Union[str, JSONObject]] = None,
             allow_override: bool = False,
             allow_unknown: bool = False):

    additionalvalues = additionalvalues or dict()
    metadata_store = Path(metadata_store or curdir)

    metadata = process_parameters(
        metadata=read_json_object(metadata),
        additional_values=get_json_object(additionalvalues),
        allow_override=allow_override,
        allow_unknown=allow_unknown)

    lgr.debug(f"attempting to add metadata: {json.dumps(metadata)}")

    add_parameter = AddParameter(
        dataset_id=UUID(metadata["dataset_id"]),
        dataset_version=metadata["dataset_version"],
        file_path=(
            MetadataPath(metadata["path"])
            if "path" in metadata
            else None),
        root_dataset_id=(
            UUID(metadata["root_dataset_id"])
            if "root_dataset_id" in metadata
            else None),
        root_dataset_version=metadata.get("root_dataset_version", None),
        dataset_path=MetadataPath(metadata.get("dataset_path", "")),
        extractor_name=metadata["extractor_name"],
        extractor_version=metadata["extractor_version"],
        extraction_time=metadata["extraction_time"],
        extraction_parameter=metadata["extraction_parameter"],
        agent_name=metadata["agent_name"],
        agent_email=metadata["agent_email"],
        extracted_metadata=metadata["extracted_metadata"])

    # If the key "path" is present in the metadata dictionary, we
    # assume that the metadata-dictionary describes file-level
    # metadata. Otherwise, we assume that it contains dataset-level
    # metadata.
    if add_parameter.file_path:
        yield from add_file_metadata(metadata_store, add_parameter)
    else:
        yield from add_dataset_metadata(metadata_store, add_parameter)
    return
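# The required keys of a metadata record follow from the AddParameter
# construction above; a minimal sketch of a file-level record (all
# values are illustrative; the presence of "path" makes it file-level):
#
#   {
#       "dataset_id": "00112233-0011-2233-4455-66778899aabb",
#       "dataset_version": "<git hexsha>",
#       "path": "dir/file.dat",
#       "extractor_name": "some_extractor",
#       "extractor_version": "1.0",
#       "extraction_time": 1637221234.5,
#       "extraction_parameter": {},
#       "agent_name": "A Name",
#       "agent_email": "a.name@example.com",
#       "extracted_metadata": {"key": "value"}
#   }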
def test_subdataset_add_file_end_to_end(file_name):
    test_path = "d_1/d_1.0/f_1.0.0"
    with open(file_name, "tw") as f:
        json.dump({
            **metadata_template,
            **additional_keys_template,
            "type": "file",
            "path": test_path
        }, f)

    with tempfile.TemporaryDirectory() as temp_dir:
        git_repo = GitRepo(temp_dir)

        res = meta_add(metadata=file_name, metadata_store=git_repo.path)
        assert_result_count(res, 1)
        assert_result_count(res, 1, type='file')
        assert_result_count(res, 0, type='dataset')

        # Verify that dataset-level metadata was added
        root_dataset_id = UUID(additional_keys_template["root_dataset_id"])
        root_dataset_version = additional_keys_template[
            "root_dataset_version"]
        dataset_tree_path = MetadataPath(
            additional_keys_template["dataset_path"])

        tree_version_list, uuid_set, mrr = _get_top_nodes(
            git_repo, root_dataset_id, root_dataset_version)

        _, dataset_tree = tree_version_list.get_dataset_tree(
            root_dataset_version)

        mrr = dataset_tree.get_metadata_root_record(dataset_tree_path)
        eq_(mrr.dataset_identifier, UUID(metadata_template["dataset_id"]))

        file_tree = mrr.get_file_tree()
        assert_is_not_none(file_tree)
        assert_true(test_path in file_tree)

        metadata = file_tree.get_metadata(MetadataPath(test_path))
        metadata_content = _get_metadata_content(metadata)
        eq_(metadata_content, metadata_template["extracted_metadata"])
def _search_matches(self,
                    pattern_parts: Tuple[str],
                    tree: FileTree,
                    accumulated_path: MetadataPath) -> List[MatchRecord]:
    if not pattern_parts:
        return [MatchRecord(MetadataPath(accumulated_path), tree)]

    match_records = []
    for name, sub_tree in tree.child_nodes.items():
        if fnmatchcase(name, pattern_parts[0]):
            match_records.extend(
                self._search_matches(
                    pattern_parts[1:],
                    sub_tree,
                    accumulated_path / name))
    return match_records
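# A minimal, self-contained sketch of the recursive glob-matching
# technique used in _search_matches above: one pattern element is
# consumed per tree level, and an exhausted pattern matches the path
# accumulated so far. The dict-based tree and the names below are
# illustrative assumptions, not part of the real FileTree API.
from fnmatch import fnmatchcase
from pathlib import PurePosixPath
from typing import Dict, List, Tuple


def match_pattern(parts: Tuple[str, ...],
                  tree: Dict[str, dict],
                  accumulated: PurePosixPath = PurePosixPath("")
                  ) -> List[PurePosixPath]:
    # An exhausted pattern matches the accumulated path
    if not parts:
        return [accumulated]
    matches: List[PurePosixPath] = []
    for name, sub_tree in tree.items():
        # Shell-style matching of a single path component per level
        if fnmatchcase(name, parts[0]):
            matches.extend(
                match_pattern(parts[1:], sub_tree, accumulated / name))
    return matches


# "s*/*" selects the second-level nodes below s1 and s2 (cf.
# test_pattern_3 in this section)
tree = {"s1": {"s1.1": {}, "s1.2": {}}, "s2": {"d2.1": {}}, "d3": {}}
assert [str(p) for p in match_pattern(("s*", "*"), tree)] == [
    "s1/s1.1", "s1/s1.2", "s2/d2.1"]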
def _get_top_nodes(realm: str, ap: AddParameter):
    if ap.root_dataset_id is None:
        return get_top_nodes_and_metadata_root_record(
            default_mapper_family,
            realm,
            ap.dataset_id,
            ap.dataset_version,
            MetadataPath(""),
            auto_create=True)

    tree_version_list, uuid_set, mrr = get_top_nodes_and_metadata_root_record(
        default_mapper_family,
        realm,
        ap.root_dataset_id,
        ap.root_dataset_version,
        MetadataPath(""),
        auto_create=True)

    _, dataset_tree = tree_version_list.get_dataset_tree(
        ap.root_dataset_version)

    if ap.dataset_path != MetadataPath("") \
            and ap.dataset_path in dataset_tree:
        mrr = dataset_tree.get_metadata_root_record(ap.dataset_path)
        if mrr.dataset_identifier != ap.dataset_id:
            raise ValueError(
                f"add-metadata claims that the metadata store contains "
                f"dataset id {ap.dataset_id} at path {ap.dataset_path}, "
                f"but the id of the stored dataset is "
                f"{mrr.dataset_identifier}")
    else:
        dataset_level_metadata = Metadata(default_mapper_family, realm)
        file_tree = FileTree(default_mapper_family, realm)
        mrr = MetadataRootRecord(
            default_mapper_family,
            realm,
            ap.dataset_id,
            ap.dataset_version,
            Connector.from_object(dataset_level_metadata),
            Connector.from_object(file_tree))
        dataset_tree.add_dataset(ap.dataset_path, mrr)

    return tree_version_list, uuid_set, mrr
def parse(self):
    """
    Parse a metadata URL. It is either a uuid-spec or a tree-spec.
    If no scheme is provided, a tree-spec is assumed.

    Note: if the dataset_path is empty, the root dataset is assumed,
    and the primary data version of the youngest metadata record
    will be chosen.

    UUID: "uuid:" UUID-DIGITS ["@" VERSION-DIGITS] [":" [LOCAL_PATH]]
    TREE: ["tree:"] [DATASET_PATH] ["@" VERSION-DIGITS] [":" [LOCAL_PATH]]
    """
    # Try to parse a uuid-spec
    if self.match(MetadataURLParser.uuid_header):
        uuid = UUID(self.fetch(MetadataURLParser.uuid_string_length))
        _, version = self.parse_version()
        _, local_path = self.get_path()
        return UUIDMetadataURL(uuid, local_path, version)

    # Expect a tree-spec
    self.match(self.tree_header)
    success, dataset_path = self.fetch_upto("@")
    if success:
        dataset_path = MetadataPath(dataset_path)
        _, version = self.parse_version()
        self.match(":")
        local_path = MetadataPath(self.get_remaining())
    else:
        version = None
        success, dataset_path = self.fetch_upto(":")
        if success:
            dataset_path = MetadataPath(dataset_path)
            _, local_path = self.get_path()
        else:
            dataset_path = MetadataPath(self.get_remaining())
            local_path = MetadataPath("")

    return TreeMetadataURL(dataset_path, local_path, version)
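# Worked examples of the grammar above, taken from the parser tests in
# this section:
#
#   "uuid:00112233-0011-2233-4455-66778899aabb"
#       -> UUIDMetadataURL(uuid=UUID("00112233-0011-2233-4455-66778899aabb"),
#                          version=None,
#                          local_path=MetadataPath(""))
#   "tree:/a/b/c@00112233:/x/y"
#       -> TreeMetadataURL(dataset_path=MetadataPath("/a/b/c"),
#                          version="00112233",
#                          local_path=MetadataPath("/x/y"))
#   ":a/b/c"
#       -> TreeMetadataURL(dataset_path=MetadataPath(""),
#                          version=None,
#                          local_path=MetadataPath("a/b/c"))
#   "a/b/c"
#       -> TreeMetadataURL(dataset_path=MetadataPath("a/b/c"),
#                          version=None,
#                          local_path=MetadataPath(""))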
def _get_top_nodes(git_repo, dataset_id, dataset_version):
    # Ensure that metadata was created
    tree_version_list, uuid_set, mrr = \
        get_top_nodes_and_metadata_root_record(
            "git",
            git_repo.path,
            dataset_id,
            dataset_version,
            MetadataPath(""))

    assert_is_not_none(tree_version_list)
    assert_is_not_none(uuid_set)
    assert_is_not_none(mrr)

    return tree_version_list, uuid_set, mrr
def setUp(self) -> None:
    self.path_list = [
        MetadataPath(".datalad_metadata"),
        MetadataPath("s1/s1.1/d1.1.1/.datalad_metadata"),
        MetadataPath("s1/s1.2/d1.2.1/.datalad_metadata"),
        MetadataPath("s2/d2.1/.datalad_metadata"),
        MetadataPath("d3/.datalad_metadata"),
        MetadataPath("d3/some_file")
    ]
    self.tree_search = self.create_tree_search_from_paths(self.path_list)
def _check_metadata_record(metadata_record: dict,
                           dataset: Dataset,
                           extractor_name: str,
                           extractor_version: str,
                           extraction_parameter: dict,
                           path: Optional[str] = None):

    assert_in("extraction_time", metadata_record)
    eq_(metadata_record["dataset_id"], UUID(dataset.id))
    eq_(metadata_record["dataset_version"], dataset.repo.get_hexsha())
    eq_(metadata_record["extractor_version"], extractor_version)
    eq_(metadata_record["extractor_name"], extractor_name)
    eq_(metadata_record["extraction_parameter"], extraction_parameter)
    eq_(metadata_record["agent_name"], "DataLad Tester")
    eq_(metadata_record["agent_email"], "*****@*****.**")

    if path is not None:
        eq_(metadata_record["path"], MetadataPath(path))
def _get_matching_nodes(self,
                        pattern_list: List[MetadataPath],
                        auto_list_root: bool
                        ) -> Tuple[List[MatchRecord], List[MetadataPath]]:

    match_records: List[MatchRecord] = []
    failed_patterns: List[MetadataPath] = []

    for pattern in pattern_list:
        if pattern.parts == ():
            match_records.extend(self._get_root_nodes(auto_list_root))
        else:
            matching_path_records = self._search_matches(
                pattern.parts, self.tree, MetadataPath(""))
            if matching_path_records:
                match_records.extend(matching_path_records)
            else:
                failed_patterns.append(pattern)

    return match_records, failed_patterns
def test_pattern_3(self):
    self._test_pattern(["s*/*"], [
        MetadataPath("s1/s1.1"),
        MetadataPath("s1/s1.2"),
        MetadataPath("s2/d2.1")
    ])
def _get_root_nodes(self, auto_list_root: bool) -> List[MatchRecord]:
    return (
        [
            MatchRecord(MetadataPath(name), child_node)
            for name, child_node in self.tree.child_nodes.items()]
        if auto_list_root
        else [MatchRecord(MetadataPath(""), self.tree)])
def test_pattern_4(self):
    self._test_pattern(["d3/*"], [
        MetadataPath("d3/.datalad_metadata"),
        MetadataPath("d3/some_file")
    ])
def test_blank_path(self):
    parser = MetadataURLParser("a/b/c")
    result = parser.parse()
    self.assertIsInstance(result, TreeMetadataURL)
    self.assertEqual(result.dataset_path, MetadataPath("a/b/c"))
    self.assertEqual(result.local_path, MetadataPath(""))
def test_pattern_5(self):
    self._test_pattern(["*/s*"], [
        MetadataPath("s1/s1.1"),
        MetadataPath("s1/s1.2"),
        MetadataPath("d3/some_file")
    ])
def test_pattern_7(self):
    found, failed = self.tree_search.get_matching_paths(["see"], False)
    self.assertListEqual(found, [])
    self.assertListEqual(failed, [MetadataPath("see")])
def test_recursive_list_2(self):
    self._test_pattern_rec(["d3"], [
        MetadataPath("d3/.datalad_metadata"),
        MetadataPath("d3/some_file")
    ])
def legacy_extract_file(ep: ExtractionParameter) -> Iterable[dict]:

    if issubclass(ep.extractor_class, MetadataExtractor):

        # Metalad legacy extractor
        status = [{
            "type": "file",
            "path": str(ep.source_dataset.pathobj / ep.file_tree_path),
            "state": "clean",
            "gitshasum": ep.source_dataset_version
        }]

        extractor = ep.extractor_class()
        ensure_legacy_content_availability(ep, extractor, "content", status)

        for result in extractor(ep.source_dataset,
                                ep.source_dataset_version,
                                "content",
                                status):
            result["action"] = "meta_extract"
            if result["status"] == "ok":
                result["metadata_record"] = dict(
                    type="file",
                    dataset_id=ep.source_dataset_id,
                    dataset_version=ep.source_dataset_version,
                    path=ep.file_tree_path,
                    extractor_name=ep.extractor_name,
                    extractor_version=str(
                        extractor.get_state(ep.source_dataset)["version"]),
                    extraction_parameter=ep.extractor_arguments,
                    extraction_time=time.time(),
                    agent_name=ep.agent_name,
                    agent_email=ep.agent_email,
                    extracted_metadata=result["metadata"])
            yield result

    elif issubclass(ep.extractor_class, BaseMetadataExtractor):

        # Datalad legacy extractor
        path = str(ep.source_dataset.pathobj / ep.file_tree_path)
        if ep.extractor_class.NEEDS_CONTENT:
            ensure_legacy_path_availability(ep, path)

        extractor = ep.extractor_class(ep.source_dataset, [path])
        _, file_result = extractor.get_metadata(False, True)
        for path, metadata in file_result:
            result = dict(
                action="meta_extract",
                status="ok",
                type="file",
                metadata_record=dict(
                    type="file",
                    dataset_id=ep.source_dataset_id,
                    dataset_version=ep.source_dataset_version,
                    path=MetadataPath(path),
                    extractor_name=ep.extractor_name,
                    extractor_version="un-versioned",
                    extraction_parameter=ep.extractor_arguments,
                    extraction_time=time.time(),
                    agent_name=ep.agent_name,
                    agent_email=ep.agent_email,
                    extracted_metadata=metadata))
            yield result

    else:
        raise ValueError(
            f"unknown extractor class: {ep.extractor_class.__name__}")
def get_path(self):
    if self.match(":"):
        path = MetadataPath(self.get_remaining())
        return True, path
    return False, MetadataPath("")