def _check_taxonomy_depth(project: Project, *, tier: int) -> None: """ Checks the maximum depth of the taxonomy associated wit Args: project: Tamr project object tier: integer specifying the tier from which to extract categories Returns: whether tier exceed the maximum taxonomy depth or not Raises: ValueError: if tier is greater than maximum taxonomy depth """ # depth check is not required for leaf nodes if tier == -1: return max_depth = 0 classification_project = project.as_categorization() taxonomy = classification_project.taxonomy() categories = taxonomy.categories() for category in categories: if len(category.path) > max_depth: max_depth = len(category.path) if max_depth < tier: invalid_tier_value_error = ( f"Invalid value for tier {tier}. Maximum depth detected is {max_depth}." ) LOGGER.error(invalid_tier_value_error) raise ValueError(invalid_tier_value_error)
def _get_categories_at_tier(project: Project, *, tier: int) -> set: """ Extracts categories at tier from a taxonomy associated with Project Args: project: Tamr project object tier: integer specifying the tier to extract the categories; -1 will return all leaf categories Returns: set of category paths at tier, joined by '|' if multi-level taxonomy """ classification_project = project.as_categorization() taxonomy = classification_project.taxonomy() categories = taxonomy.categories() category_set = set() if tier > 0: for category in categories: if len(category.path) == tier: category_set.add("|".join(category.path)) else: # leaf nodes category_set = _create_leaf_node_set(taxonomy) return category_set
def get_tier_confidence(
    project: Project, *, tier: int = -1, allow_dataset_refresh: bool = False
) -> data_type.JsonDict:
    """Extract tier-specific average confidence from the Tamr internal dataset
    `<unified dataset name>_classifications_average_confidences` as a dictionary.

    Args:
        project: Tamr project object
        tier: integer specifying the tier to extract the average confidence;
            default value will return the average confidence at all leaf categories
        allow_dataset_refresh: if True, allows running a job to refresh dataset to
            make it streamable

    Returns:
        dictionary - keys are category paths, joined by '|' if multi-level taxonomy,
        and values are average confidence of the corresponding keys

    Raises:
        RuntimeError: if `dataset` is not streamable and `allow_dataset_refresh` is
            False
        TypeError: if tier is not of type int; or if the project type is not
            classification
        ValueError: if tier is less than -1 or equal to 0
    """
    LOGGER.info(
        f"Retrieving average confidence for taxonomy nodes in project {project.name} "
        f"(id={project.resource_id})."
    )

    # Guard: this metric exists only for classification projects.
    try:
        project = project.as_categorization()
    except TypeError:
        not_categorization_error = f"Project {project.name} is not a classification project."
        LOGGER.error(not_categorization_error)
        raise TypeError(not_categorization_error)

    # Locate the internal average-confidence dataset for this project.
    dataset = _get_dataset_with_confidence(project)

    # Validate the tier argument (exact int only; bool is rejected by design).
    if type(tier) is not int:
        wrong_tier_type_error = f"Tier {tier} is not an integer."
        LOGGER.error(wrong_tier_type_error)
        raise TypeError(wrong_tier_type_error)
    if tier < -1 or tier == 0:
        invalid_tier_value_error = (
            f"Invalid value for tier {tier}. Tier cannot be 0 or less than -1."
        )
        LOGGER.error(invalid_tier_value_error)
        raise ValueError(invalid_tier_value_error)

    # Dataset must be streamable before confidences can be extracted.
    if not dataset.status().is_streamable:
        if not allow_dataset_refresh:
            cannot_stream_error = (
                f"Dataset {dataset.name} is not streamable. "
                f"Refresh it first, or run with allow_dataset_refresh=True"
            )
            LOGGER.error(cannot_stream_error)
            raise RuntimeError(cannot_stream_error)
        LOGGER.info(f"Refreshing dataset {dataset.name} to make streamable.")
        operation.enforce_success(dataset.refresh())

    # Verify the dataset schema, then ensure the tier fits the taxonomy.
    _check_dataset_with_confidence(dataset)
    _check_taxonomy_depth(project, tier=tier)

    # Average confidence for every category found at the requested tier.
    categories_at_tier = _get_categories_at_tier(project, tier=tier)
    return _extract_confidence(dataset=dataset, category_set=categories_at_tier)
def from_taxonomy(
    project: Project,
    export_file_path: Union[Path, str],
    *,
    csv_delimiter: str = ",",
    flatten_delimiter: str = "|",
    quote_character: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    overwrite: bool = False,
) -> int:
    """Export a Tamr taxonomy to a csv file. Records are streamed to disk and
    written according to a given buffer size.

    Args:
        project: Tamr Project object
        export_file_path: Path to the csv file where the dataset will be saved
        csv_delimiter: Delimiter of the csv file
        flatten_delimiter: Flatten list types to strings by concatenating with
            this delimiter
        quote_character: Character used to escape value for csv delimiter when
            it appears in the value.
        quoting: The escape strategy to use according to the Python csv writer.
            See https://docs.python.org/2/library/csv.html#csv.QUOTE_MINIMAL
        overwrite: if True and export_file_name already exists, overwrite the
            file. Otherwise throw an error

    Returns:
        The total number of records written

    Raises:
        FileExistsError: if `export_file_path` exists and `overwrite` is set to False
        IOError: if the specified filepath does not exist or cannot be accessed
        RuntimeError: if the classification project is not yet associated with a
            taxonomy or taxonomy cannot be written to a csv file
        TypeError: if the project type is not classification
        ValueError: if `csv_delimiter` and `flatten_delimiter` are identical values
    """
    LOGGER.info(
        f"Streaming taxonomy to csv file {export_file_path} from project {project.name} "
        f"(project id={project.resource_id})."
    )

    # Taxonomies only exist on classification projects.
    try:
        project = project.as_categorization()
    except TypeError:
        not_categorization_error = f"Project {project.name} is not a classification project."
        LOGGER.error(not_categorization_error)
        raise TypeError(not_categorization_error)

    if os.path.exists(export_file_path):
        if not overwrite:
            message = (
                f"CSV file {export_file_path} already exists. "
                f"(Set 'overwrite' flag to True if you wish to overwrite)"
            )
            LOGGER.error(message)
            raise FileExistsError(message)
        else:
            LOGGER.warning(
                f"CSV file {export_file_path} already exists and will be overwritten"
            )

    # Identical delimiters would make flattened paths ambiguous in the output.
    if csv_delimiter == flatten_delimiter:
        message = (
            f"The CSV delimiter '{csv_delimiter}' cannot be identical to the list "
            f"flattening delimiter '{flatten_delimiter}'"
        )
        LOGGER.error(message)
        raise ValueError(message)

    try:
        taxonomy = project.as_categorization().taxonomy()
    except requests.exceptions.RequestException:
        no_taxonomy_error = f"Project {project.name} is not associated with any taxonomy yet."
        LOGGER.error(no_taxonomy_error)
        raise RuntimeError(no_taxonomy_error)

    # Sorted list of category paths; each path is a list of taxonomy levels.
    taxonomy_list = sorted(category.path for category in taxonomy.categories())

    # Open CSV file and use newline='' as recommended by
    # https://docs.python.org/3/library/csv.html#csv.writer
    try:
        f = open(export_file_path, "w", newline="", encoding="utf-8")
    except (FileNotFoundError, IOError, PermissionError):
        cannot_open_error = f"File path {export_file_path} could not be opened for writing."
        LOGGER.error(cannot_open_error)
        raise IOError(cannot_open_error)

    # Context manager closes the file exactly once, even on error (the previous
    # implementation closed it twice on the csv.Error path).
    with f:
        try:
            csv_writer = csv.writer(
                f,
                delimiter=csv_delimiter,
                quotechar=quote_character,
                quoting=quoting,
            )
            csv_writer.writerows(taxonomy_list)
        except csv.Error as e:
            general_error = (
                "Encountered an error while writing taxonomy categories to "
                f"{export_file_path}: {e}"
            )
            LOGGER.error(general_error)
            raise RuntimeError(general_error)

    records_written = len(taxonomy_list)
    LOGGER.info(
        f"Wrote {records_written} categories from {project.name} taxonomy (project id"
        f"={project.resource_id}) to {export_file_path}"
    )
    return records_written