def _extract_confidence(*, dataset: Dataset, category_set: set) -> data_type.JsonDict:
    """
    Collects the average confidence of each requested category from a Tamr internal dataset
    `<unified dataset name>_classifications_average_confidences` into a dictionary

    Args:
        dataset: Tamr internal Dataset with a name ending in
            `_unified_dataset_classifications_average_confidences`
        category_set: set of category paths at the desired tier

    Returns:
        dictionary whose keys are category paths (path elements joined by '|') and whose
        values are the `averageConfidence` read from the matching record. Every category in
        `category_set` that has no matching record is present with the value None.
    """
    averages = {}
    for row in dataset.records():
        joined_path = "|".join(row["classificationPath"])
        if joined_path not in category_set:
            # not one of the requested categories, so the record is ignored
            continue
        averages[joined_path] = row["averageConfidence"]

    # Requested categories that never showed up in the records get an explicit None
    for missing_category in category_set - set(averages):
        averages[missing_category] = None
    return averages
def from_dataset(dataset: Dataset) -> Dict[str, TranslationDictionary]:
    """
    Stream a dictionary from Tamr

    Args:
        dataset: Tamr Dataset object

    Returns:
        A toolbox translation dictionary: each record of the dataset becomes one
        `TranslationDictionary` entry, keyed by its `standardized_phrase`

    Raises:
        ValueError: if the provided `dataset` is not a toolbox translation dictionary dataset,
            i.e. its first key attribute is not `standardized_phrase` or it reports no key
            attributes at all
        NameError: if a NameError occurs while building an entry from a record; it is reported
            as the dataset not being in a toolbox translation dictionary format
        RuntimeError: if there is any other problem while reading the `dataset` as a toolbox
            translation dictionary
    """
    key_attribute_names = dataset.key_attribute_names
    # An empty or missing key list cannot be a translation dictionary either; checking it
    # first reports the documented ValueError instead of failing on the `[0]` lookup
    if not key_attribute_names or key_attribute_names[0] != "standardized_phrase":
        error_message = "Provided Tamr Dataset is not a toolbox translation dictionary"
        LOGGER.error(error_message)
        raise ValueError(error_message)

    dictionary = {}
    for record in dataset.records():
        try:
            entry = TranslationDictionary(**record)
            # values are returned as a list of a single string, we change this to string
            entry.translated_phrase = entry.translated_phrase[0]
            entry.detected_language = entry.detected_language[0]
            # original phrases are stored on Tamr as lists, we save it as a set
            entry.original_phrases = set(entry.original_phrases)
        except NameError as e:
            error_message = (
                f"Supplied Tamr dataset is not in a toolbox translation dictionary format: {e}"
            )
            LOGGER.error(error_message)
            # chain explicitly so the original failure is kept as the cause
            raise NameError(error_message) from e
        except Exception as e:
            error_message = f"Error while reading the Tamr dataset translation dictionary: {e}"
            LOGGER.error(error_message)
            raise RuntimeError(error_message) from e

        # a later record with the same standardized phrase replaces the earlier entry
        dictionary[entry.standardized_phrase] = entry
    return dictionary
def _yield_records(
    dataset: Dataset,
    *,
    func: Optional[Callable] = None,
    columns: Optional[List] = None,
    flatten_columns: Optional[List] = None,
) -> Iterable:
    """
    Generator over the records of a Tamr Dataset.

    Each record is re-emitted as a new dictionary. If `columns` is given, only those columns
    are kept. If `func` is given, it is applied to the values of `flatten_columns`, or to the
    values of every column when `flatten_columns` is not given.

    Args:
        dataset: Tamr Dataset
        func: optional, callable function to transform records
        columns: optional, list of columns to keep
        flatten_columns: optional, list of columns to flatten

    Returns:
        iterable over Dataset records

    Raises:
        ValueError: if `flatten_columns` is not None but `func` is None
            since it won't know how to flatten them.
    """
    if func is None and flatten_columns is not None:
        message = (
            f"Columns specified for flatting ('flatten_columns'={flatten_columns}), "
            f"but no flattening function provided for parameter 'func'"
        )
        LOGGER.error(message)
        raise ValueError(message)

    for position, record in enumerate(dataset.records()):
        if position == 0:
            # The requested column lists are validated once, against the first record's keys
            for requested in (columns, flatten_columns):
                if requested is not None:
                    _check_columns_subset(
                        input_list=requested, reference_list=record.keys(), raise_error=True
                    )

        # With a function but no explicit list, every column of the first record is flattened
        if flatten_columns is None and func is not None:
            flatten_columns = record.keys()

        def _should_flatten(name) -> bool:
            return flatten_columns is not None and name in flatten_columns and func is not None

        yield {
            name: func(value) if _should_flatten(name) else value
            for name, value in record.items()
            # columns outside the requested subset are dropped
            if columns is None or name in columns
        }