def _get_upstream_projects(project: Project) -> List[Project]:
    """Get the projects immediately upstream of a given project.

    Args:
        project: the project to check

    Returns:
        A list of the projects immediately upstream of `project`
    """
    client = project.client

    # find upstream datasets - for a Golden Records project just use its input datasets
    if ProjectType[project.type] == ProjectType.GOLDEN_RECORDS:
        upstream_datasets = [x for x in project.input_datasets().stream()]
    # otherwise walk upstream from the unified dataset (not the input datasets)
    # so that datasets used only within transformations are also captured
    else:
        unified_dataset_id = project.unified_dataset().relative_id
        unified_dataset = client.datasets.by_relative_id(unified_dataset_id)
        upstream_datasets = unified_dataset.upstream_datasets()

    upstream_project_names = []
    # walk through the upstream datasets
    for upstream_result in upstream_datasets:
        # get the upstream object as a dataset
        upstream_dataset = client.datasets.by_resource_id(upstream_result.resource_id)
        # if the dataset is the output of any project steps, record those project names
        upstream_dataset_projects = set(
            x.project_name for x in upstream_dataset.usage().usage.output_from_project_steps
        )
        upstream_project_names.extend(upstream_dataset_projects)

    return [client.projects.by_name(x) for x in upstream_project_names]
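# Usage sketch for _get_upstream_projects: a minimal, hypothetical helper, not
# part of this module. It assumes a reachable Tamr instance; the credentials
# and the project name "my_mastering_project" are illustrative assumptions.
def _example_print_upstream_projects() -> None:
    from tamr_unify_client import Client
    from tamr_unify_client.auth import UsernamePasswordAuth

    client = Client(UsernamePasswordAuth("username", "password"))
    project = client.projects.by_name("my_mastering_project")
    # print the name of each project feeding into this one
    for upstream_project in _get_upstream_projects(project):
        print(upstream_project.name)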
def unmap_dataset(
    project: Project,
    *,
    source_dataset: Dataset,
    remove_dataset_from_project: bool = False,
    skip_if_missing: bool = False,
) -> None:
    """Wholly unmaps a dataset and optionally removes it from a project.

    Args:
        project: the project in which to unmap the dataset
        source_dataset: the source dataset (a Dataset object, not a string) to unmap
        remove_dataset_from_project: whether to also remove the dataset from the project
        skip_if_missing: whether to skip if the dataset is not in the project. If False
            and the dataset is not in the project, a RuntimeError is raised

    Returns:
        None

    Raises:
        RuntimeError: if `source_dataset` is not in `project` and `skip_if_missing`
            is not set to True
    """
    # check that the dataset is in the project and log a warning if it is not
    if source_dataset.name not in [x.name for x in project.input_datasets()]:
        if skip_if_missing:
            LOGGER.warning(
                f"Dataset to unmap {source_dataset.name} not in project {project.name}! "
                f"However, skip_if_missing flag is set so will do nothing"
            )
            return None
        else:
            error_message = (
                f"Dataset to unmap {source_dataset.name} not in project "
                f"{project.name} and skip_if_missing not set to True so failing!"
            )
            LOGGER.error(error_message)
            raise RuntimeError(error_message)

    # the resource ids of attribute mappings unfortunately change when one is deleted,
    # so re-fetch the mappings after each delete until none remain for this dataset
    while True:
        mappings = [
            x
            for x in project.attribute_mappings().stream()
            if x.input_dataset_name == source_dataset.name
        ]
        # if no mappings are found for this dataset then we are done
        if not mappings:
            break
        # only one mapping can safely be deleted per fetch
        project.attribute_mappings().delete_by_resource_id(mappings[0].resource_id)

    # optionally remove the dataset from the project
    if remove_dataset_from_project:
        project.remove_input_dataset(source_dataset)
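# Usage sketch for unmap_dataset: a hypothetical helper, not part of this
# module. The project and dataset names are illustrative assumptions; `client`
# is assumed to be an authenticated tamr_unify_client Client.
def _example_unmap_usage(client) -> None:
    project = client.projects.by_name("my_schema_mapping_project")
    dataset = client.datasets.by_name("my_source_dataset")
    # remove every attribute mapping for the dataset and drop it from the project;
    # skip_if_missing avoids a RuntimeError if the dataset was never added
    unmap_dataset(
        project,
        source_dataset=dataset,
        remove_dataset_from_project=True,
        skip_if_missing=True,
    )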
def test_project_remove_input_dataset(self):
    dataset_id = self.dataset_json[0]["relativeId"]
    responses.add(responses.GET, self.input_datasets_url, json=self.dataset_json)
    responses.add(
        responses.DELETE, f"{self.input_datasets_url}?id={dataset_id}", status=204
    )
    responses.add(responses.GET, self.input_datasets_url, json=[])

    project = Project(self.tamr, self.project_json[0])
    dataset = next(project.input_datasets().stream())
    response = project.remove_input_dataset(dataset)
    self.assertEqual(response.status_code, 204)

    input_datasets = project.input_datasets()
    self.assertEqual(list(input_datasets), [])
def bootstrap_dataset(
    project: Project, *, source_dataset: Dataset, force_add_dataset_to_project: bool = False
) -> List[AttributeMapping]:
    """Bootstraps a dataset (i.e. maps all source columns to themselves).

    Args:
        project: the project in which to do the mapping
        source_dataset: the source dataset (a Dataset object, not a string)
        force_add_dataset_to_project: whether to add the dataset to the project
            if it is not already a part of it

    Returns:
        List of the AttributeMappings generated

    Raises:
        RuntimeError: if `source_dataset` is not part of the given `project` and
            `force_add_dataset_to_project` is not set to True
    """
    # check whether the dataset is in the project - python doesn't handle comparison
    # of Dataset objects well so check on name
    if source_dataset.name not in [x.name for x in project.input_datasets()]:
        if force_add_dataset_to_project:
            LOGGER.info(f"adding dataset {source_dataset.name} to project {project.name}")
            project.add_input_dataset(source_dataset)
        else:
            raise RuntimeError(
                f"dataset {source_dataset.name} not in project {project.name}! "
                "Set 'force_add_dataset_to_project' flag to True to automatically add it"
            )

    # map each source attribute to a unified attribute of the same name
    source_dataset_name = source_dataset.name
    completed_mappings = []
    for attribute in source_dataset.attributes:
        attribute_name = attribute.name
        mapping = map_attribute(
            source_attribute_name=attribute_name,
            source_dataset_name=source_dataset_name,
            unified_attribute_name=attribute_name,
            project=project,
        )
        completed_mappings.append(mapping)

    return completed_mappings
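# Usage sketch for bootstrap_dataset: a hypothetical helper, not part of this
# module. The names are illustrative assumptions; force_add_dataset_to_project
# lets a brand-new dataset be added and fully self-mapped in one call.
def _example_bootstrap_usage(client) -> None:
    project = client.projects.by_name("my_schema_mapping_project")
    dataset = client.datasets.by_name("my_source_dataset")
    # map every attribute of the dataset to a unified attribute of the same name
    mappings = bootstrap_dataset(
        project, source_dataset=dataset, force_add_dataset_to_project=True
    )
    LOGGER.info(f"created {len(mappings)} attribute mappings")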
def test_project_get_input_datasets(self):
    p = Project(self.tamr, self.project_json[0])
    datasets = p.input_datasets()
    self.assertEqual(datasets.api_path, "projects/1/inputDatasets")