Example #1
0
def _extract_confidence(*, dataset: Dataset,
                        category_set: set) -> data_type.JsonDict:
    """
    Extracts tier-specific average confidence from a Tamr internal dataset
    `<unified dataset name>_classifications_average_confidences` to a dictionary

    Args:
        dataset: Tamr internal Dataset with a name ending in
        `_unified_dataset_classifications_average_confidences`
        category_set: set of category paths at the desired tier

    Returns:
        dictionary - keys are category paths, joined by '|' if multi-level taxonomy. Values are
        average confidence of the corresponding keys, where it is None if no confidence exists for
        the category.
    """
    # Collect confidences for every record whose joined path is a requested category.
    confidences = {}
    for record in dataset.records():
        joined_path = "|".join(record["classificationPath"])
        if joined_path in category_set:
            confidences[joined_path] = record["averageConfidence"]

    # Categories never seen in the dataset get an explicit None entry.
    for missing_category in category_set.difference(confidences):
        confidences[missing_category] = None

    return confidences
Example #2
0
def from_dataset(dataset: Dataset) -> Dict[str, TranslationDictionary]:
    """
    Stream a dictionary from Tamr

    Args:
        dataset: Tamr Dataset object

    Returns:
        A toolbox translation dictionary

    Raises:
        ValueError: if the provided `dataset` is not a toolbox translation dictionary dataset
        NameError: if the provided `dataset` does not contain all the attributes of a
            toolbox translation dictionary
        RuntimeError: if there is any other problem while reading the `dataset` as a
            toolbox translation dictionary
    """
    # Guard against an empty key-attribute list as well as a wrong key name so
    # both cases raise the documented ValueError (not a bare IndexError).
    key_attributes = dataset.key_attribute_names
    if not key_attributes or key_attributes[0] != "standardized_phrase":
        error_message = "Provided Tamr Dataset is not a toolbox translation dictionary"
        LOGGER.error(error_message)
        raise ValueError(error_message)

    dictionary = {}
    for record in dataset.records():
        try:
            entry = TranslationDictionary(**record)
            # values are returned as a list of a single string, we change this to string
            entry.translated_phrase = entry.translated_phrase[0]
            entry.detected_language = entry.detected_language[0]

            # original phrases are stored on Tamr as lists, we save it as a set
            entry.original_phrases = set(entry.original_phrases)

        except NameError as e:
            error_message = (
                f"Supplied Tamr dataset is not in a toolbox translation dictionary format: {e}"
            )
            LOGGER.error(error_message)
            # Chain the original exception so the root cause is preserved.
            raise NameError(error_message) from e
        except Exception as e:
            error_message = f"Error while reading the Tamr dataset translation dictionary: {e}"
            LOGGER.error(error_message)
            raise RuntimeError(error_message) from e

        # Direct assignment; no need to build a single-entry dict and merge it.
        dictionary[entry.standardized_phrase] = entry
    return dictionary
Example #3
0
def _yield_records(
    dataset: Dataset,
    *,
    func: Optional[Callable] = None,
    columns: Optional[List] = None,
    flatten_columns: Optional[List] = None,
) -> Iterable:
    """
    Generator function for records from a Tamr Dataset.
    Can optionally apply a flattening function to some or all columns,
    and can optionally keep only a subset of columns from the original records.

    Args:
        dataset: Tamr Dataset
        func: optional, callable function to transform records
        columns: optional, list of columns to keep
        flatten_columns: optional, list of columns to flatten; if None and `func`
            is given, all columns are flattened
        ValueError: if `flatten_columns` is not None but `func` is None since it
            won't know how to flatten them.

    Returns: iterable over Dataset records

    Raises:
        ValueError: if `flatten_columns` is not None but `func` is None since it
            won't know how to flatten them.
    """

    if flatten_columns is not None and func is None:
        # Fixed typo in message: "flatting" -> "flattening".
        message = (
            f"Columns specified for flattening ('flatten_columns'={flatten_columns}), "
            "but no flattening function provided for parameter 'func'"
        )
        LOGGER.error(message)
        raise ValueError(message)

    checked_columns = False
    for record in dataset.records():
        if not checked_columns:
            # Validate requested columns against the first record only; records
            # streamed from a Tamr dataset are assumed to share one schema.
            if columns is not None:
                _check_columns_subset(
                    input_list=columns, reference_list=record.keys(), raise_error=True
                )
            if flatten_columns is not None:
                _check_columns_subset(
                    input_list=flatten_columns, reference_list=record.keys(), raise_error=True
                )
            checked_columns = True

        # Set flatten_columns to all columns if unspecified. Snapshot into a
        # list rather than keeping a live dict-keys view of this record.
        if func is not None and flatten_columns is None:
            flatten_columns = list(record.keys())

        output = {}
        for key, value in record.items():
            if columns is not None and key not in columns:
                # remove the column by skipping
                continue

            if func is not None and flatten_columns is not None and key in flatten_columns:
                output[key] = func(value)
            else:
                output[key] = value
        yield output