Example #1
def delete_attributes(
    *,
    dataset: Dataset,
    attributes: Iterable[str] = None,
) -> Dataset:
    """Remove attributes from dataset by attribute name

    Args:
        dataset: An existing TUC dataset
        attributes: list of attribute names to delete from dataset

    Returns:
        Updated Dataset

    Raises:
        ValueError: If the dataset is not a source dataset
        ValueError: If a passed attribute does not exist in the dataset
        ValueError: If a passed attribute is a primary key and can't be removed
        TypeError: If the attributes argument is not an Iterable
    """
    dataset_name = dataset.name
    if dataset.upstream_datasets():
        raise ValueError(f"{dataset_name} is not a source dataset")

    # Check input type is correct
    if not isinstance(attributes, Iterable):
        raise TypeError("attributes arg must be an Iterable")

    # Get current dataset attributes
    target_attribute_dict = {attr.name: attr for attr in dataset.attributes}
    existing_attributes = target_attribute_dict.keys()
    primary_keys = dataset.spec().to_dict()["keyAttributeNames"]

    # Check all attributes exist before starting to remove any
    for attribute_name in attributes:
        if attribute_name not in existing_attributes:
            raise ValueError(
                f"The attribute '{attribute_name}' does not exist in {dataset_name}"
            )
        elif attribute_name in primary_keys:
            # Cannot remove a primary key
            raise ValueError(
                f"The attribute '{attribute_name}' is a primary key and can't be removed"
            )

    # Remove attributes from dataset
    for attribute_name in attributes:
        dataset.attributes.delete_by_resource_id(
            target_attribute_dict[attribute_name].resource_id)
        LOGGER.info(f"Deleted attribute '{attribute_name}' in {dataset_name}")

    return dataset
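A minimal usage sketch for delete_attributes, assuming an authenticated tamr_unify_client Client and a source dataset named "my_source_dataset" (both placeholders):

from tamr_unify_client import Client
from tamr_unify_client.auth import UsernamePasswordAuth

# Placeholder connection details; point these at a real Tamr instance
auth = UsernamePasswordAuth("username", "password")
tamr = Client(auth, host="localhost")

# Hypothetical source dataset with a column that is no longer needed
dataset = tamr.datasets.by_name("my_source_dataset")
dataset = delete_attributes(dataset=dataset, attributes=["obsolete_column"])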
Example #2
def _extract_confidence(*, dataset: Dataset,
                        category_set: set) -> data_type.JsonDict:
    """
    Extracts tier-specific average confidence from a Tamr internal dataset
    `<unified dataset name>_classifications_average_confidences` to a dictionary

    Args:
        dataset: Tamr internal Dataset with a name ending in
            `_unified_dataset_classifications_average_confidences`
        category_set: set of category paths at the desired tier

    Returns:
        dictionary whose keys are category paths, joined by '|' for a multi-level taxonomy,
        and whose values are the average confidence of the corresponding key, or None if no
        confidence exists for the category.
    """
    confidence_dict = {}
    for record in dataset.records():
        path = "|".join(record["classificationPath"])
        if path in category_set:
            confidence_dict[path] = record["averageConfidence"]

    empty_confidence_categories = category_set - set(confidence_dict.keys())
    for category in empty_confidence_categories:
        confidence_dict[category] = None

    return confidence_dict
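For illustration, a hedged sketch of calling _extract_confidence against the internal average-confidence dataset of a categorization project; the dataset name and category paths are placeholders, and `tamr` is the client from the sketch under Example #1:

# Hypothetical internal dataset produced by a categorization project
avg_conf_dataset = tamr.datasets.by_name(
    "my_project_unified_dataset_classifications_average_confidences"
)
tier_categories = {"Electronics|Laptops", "Electronics|Phones"}
confidences = _extract_confidence(dataset=avg_conf_dataset, category_set=tier_categories)
# confidences maps each category path to its average confidence, or None if absent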
Example #3
    def unified_dataset(self):
        """Unified dataset for this project.

        :return: Unified dataset for this project.
        :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
        """
        alias = self.api_path + "/unifiedDataset"
        resource_json = self.client.get(alias).successful().json()
        return Dataset.from_json(self.client, resource_json, alias)
Example #4
    def create(self, creation_spec):
        """
        Create a Dataset in Tamr

        :param creation_spec: Dataset creation specification should be formatted as specified in the `Public Docs for Creating a Dataset <https://docs.tamr.com/reference#create-a-dataset>`_.
        :type creation_spec: dict[str, str]
        :returns: The created Dataset
        :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
        """
        data = self.client.post(self.api_path,
                                json=creation_spec).successful().json()
        return Dataset.from_json(self.client, data)
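A hedged example of a creation spec, using only field names that appear elsewhere in these snippets (name, keyAttributeNames, description); the values are placeholders and `tamr` is the client from the earlier sketch:

creation_spec = {
    "name": "my_new_dataset",            # placeholder dataset name
    "keyAttributeNames": ["unique_id"],  # primary key attribute(s)
    "description": "Dataset created via the versioned API",
}
new_dataset = tamr.datasets.create(creation_spec)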
Example #5
    def pairs(self):
        """Record pairs generated by Tamr's binning model.
        Pairs are displayed on the "Pairs" page in the Tamr UI.

        Call :func:`~tamr_unify_client.dataset.resource.Dataset.refresh` from
        this dataset to regenerate pairs according to the latest binning model.

        :returns: The record pairs represented as a dataset.
        :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
        """
        alias = self.api_path + "/recordPairs"
        return Dataset(self.client, None, alias)
Example #6
    def record_clusters(self):
        """Record Clusters as a dataset. Tamr clusters labeled pairs using pairs
        model. These clusters populate the cluster review page and get transient
        cluster ids, rather than published cluster ids (i.e., "Permanent Ids")

        Call :func:`~tamr_unify_client.dataset.resource.Dataset.refresh` from
        this dataset to generate clusters based on the latest pair-matching model.

        :returns: The record clusters represented as a dataset.
        :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
        """
        alias = self.api_path + "/recordClusters"
        return Dataset(self.client, None, alias)
Example #7
    def test_delete_attribute(self):
        url = "http://localhost:9100/api/versioned/v1/datasets/1/attributes/RowNum"
        responses.add(responses.GET, url, json=self._attributes_json[0])
        responses.add(responses.DELETE, url, status=204)
        responses.add(responses.GET, url, status=404)

        dataset = Dataset(self.tamr, self._dataset_json)
        attribute = dataset.attributes.by_resource_id("RowNum")
        self.assertEqual(attribute._data, self._attributes_json[0])

        response = attribute.delete()
        self.assertEqual(response.status_code, 204)
        self.assertRaises(HTTPError,
                          lambda: dataset.attributes.by_resource_id("RowNum"))
Example #8
def _find_associated_projects(dataset: Dataset) -> List[str]:
    """Return list of project_list that the dataset is part of.

    Args:
        dataset: The target dataset.

    Returns:
        List of Project IDs that the target `dataset` is part of.

    """
    project_list = {
        step.project().resource_id for step in dataset.usage().usage.input_to_project_steps
    }
    return list(project_list)
Example #9
def _request_upstream_datasets(dataset: Dataset) -> List[Dataset]:
    """Returns a dataset's upstream datasets

    Args:
        dataset: a Tamr Dataset object
    Returns:
        The upstream datasets
    """
    # Find upstream datasets, output is a DatasetURI
    upstream = dataset.upstream_datasets()
    dataset_upstream = []
    # Make Dataset out of DatasetURI
    for data in upstream:
        dataset_upstream.append(
            dataset.client.datasets.by_resource_id(data.resource_id))
    return dataset_upstream
Example #10
    def published_cluster_stats(self):
        """Retrieves published cluster stats for this project.

        :returns: The published cluster stats dataset.
        :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
        """
        # Replace this workaround with a direct API call once API
        # is fixed. APIs that need to work are: fetching the dataset and
        # being able to call refresh on resulting dataset. Until then, we grab
        # the dataset by constructing its name from the corresponding Unified Dataset's name
        unified_dataset = self.unified_dataset()
        name = unified_dataset.name + "_dedup_published_cluster_stats"
        dataset = self.client.datasets.by_name(name)

        path = self.api_path + "/publishedClusterStats"
        return Dataset.from_json(self.client, dataset._data, path)
Example #11
    def high_impact_pairs(self):
        """High-impact pairs as a dataset. Tamr labels pairs as "high-impact" if
        labeling these pairs would help it learn most quickly (i.e. "Active learning").

        High-impact pairs are displayed with a ⚡ lightning bolt icon on the
        "Pairs" page in the Tamr UI.

        Call :func:`~tamr_unify_client.dataset.resource.Dataset.refresh` from
        this dataset to produce new high-impact pairs according to the latest
        pair-matching model.

        :returns: The high-impact pairs represented as a dataset.
        :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
        """
        alias = self.api_path + "/highImpactPairs"
        return Dataset(self.client, None, alias)
Example #12
def from_dataset(dataset: Dataset) -> Dict[str, TranslationDictionary]:
    """
    Stream a dictionary from Tamr

    Args:
        dataset: Tamr Dataset object

    Returns:
        A toolbox translation dictionary

    Raises:
        ValueError: if the provided `dataset` is not a toolbox translation dictionary dataset
        NameError: if the provided `dataset` does not contain all the attributes of a
            toolbox translation dictionary
        RuntimeError: if there is any other problem while reading the `dataset` as a
            toolbox translation dictionary
    """
    if dataset.key_attribute_names[0] != "standardized_phrase":
        error_message = f"Provided Tamr Dataset is not a toolbox translation dictionary"
        LOGGER.error(error_message)
        raise ValueError(error_message)

    dictionary = {}
    for record in dataset.records():
        try:
            entry = TranslationDictionary(**record)
            # values are returned as a list containing a single string; convert it to a string
            entry.translated_phrase = entry.translated_phrase[0]
            entry.detected_language = entry.detected_language[0]

            # original phrases are stored in Tamr as lists; save them as a set
            entry.original_phrases = set(entry.original_phrases)

        except NameError as e:
            error_message = (
                f"Supplied Tamr dataset is not in a toolbox translation dictionary format: {e}"
            )
            LOGGER.error(error_message)
            raise NameError(error_message)
        except Exception as e:
            error_message = f"Error while reading the Tamr dataset translation dictionary: {e}"
            LOGGER.error(error_message)
            raise RuntimeError(error_message)

        formatted_dictionary = {entry.standardized_phrase: entry}
        dictionary.update(formatted_dictionary)
    return dictionary
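A short usage sketch, assuming `tamr` is the client from the earlier sketch and that a toolbox translation dictionary dataset exists under a placeholder name:

# Hypothetical dataset; it must have "standardized_phrase" as its key attribute
dictionary_dataset = tamr.datasets.by_name("my_translation_dictionary")
dictionary = from_dataset(dictionary_dataset)
for standardized_phrase, entry in dictionary.items():
    print(standardized_phrase, entry.translated_phrase, entry.detected_language)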
Example #13
    def published_clusters(self):
        """Published record clusters generated by Tamr's pair-matching model.

        :returns: The published clusters represented as a dataset.
        :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
        """

        unified_dataset = self.unified_dataset()

        # Replace this workaround with a direct API call once API
        # is fixed. APIs that need to work are: fetching the dataset and
        # being able to call refresh on resulting dataset. Until then, we grab
        # the dataset by constructing its name from the corresponding Unified Dataset's name
        name = unified_dataset.name + "_dedup_published_clusters"
        canonical = self.client.datasets.by_name(name)
        resource_json = canonical._data
        alias = self.api_path + "/publishedClusters"
        return Dataset.from_json(self.client, resource_json, alias)
Example #14
    def test_feature_to_record(self):
        feature = {"type": "Feature", "id": "1"}
        actual = Dataset._feature_to_record(feature, ["pk"], "geo")
        expected = {"pk": "1"}
        self.assertEqual(expected, actual)

        feature = {
            "type": "Feature",
            "id": "1",
            "geometry": {"type": "Point", "coordinates": [0, 0]},
        }
        actual = Dataset._feature_to_record(feature, ["pk"], "geo")
        expected = {"pk": "1", "geo": {"point": [0, 0]}}
        self.assertEqual(expected, actual)

        feature = {
            "type": "Feature",
            "id": "1",
            "geometry": {"type": "MultiPoint", "coordinates": [[0, 0], [1, 1]]},
        }
        actual = Dataset._feature_to_record(feature, ["pk"], "geo")
        expected = {"pk": "1", "geo": {"multiPoint": [[0, 0], [1, 1]]}}
        self.assertEqual(expected, actual)

        feature = {
            "type": "Feature",
            "id": "1",
            "geometry": {"type": "LineString", "coordinates": [[0, 0], [1, 1]]},
        }
        actual = Dataset._feature_to_record(feature, ["pk"], "geo")
        expected = {"pk": "1", "geo": {"lineString": [[0, 0], [1, 1]]}}
        self.assertEqual(expected, actual)

        feature = {
            "type": "Feature",
            "id": "1",
            "geometry": {
                "type": "MultiLineString",
                "coordinates": [[[0, 0], [1, 1], [2, 2]]],
            },
        }
        actual = Dataset._feature_to_record(feature, ["pk"], "geo")
        expected = {"pk": "1", "geo": {"multiLineString": [[[0, 0], [1, 1], [2, 2]]]}}
        self.assertEqual(expected, actual)

        feature = {
            "type": "Feature",
            "id": "1",
            "geometry": {"type": "Polygon", "coordinates": [[[0, 0], [1, 1], [2, 2]]]},
        }
        actual = Dataset._feature_to_record(feature, ["pk"], "geo")
        expected = {"pk": "1", "geo": {"polygon": [[[0, 0], [1, 1], [2, 2]]]}}
        self.assertEqual(expected, actual)

        feature = {
            "type": "Feature",
            "id": "1",
            "geometry": {
                "type": "MultiPolygon",
                "coordinates": [[[[0, 0], [1, 1], [2, 2]]]],
            },
        }
        actual = Dataset._feature_to_record(feature, ["pk"], "geo")
        expected = {"pk": "1", "geo": {"multiPolygon": [[[[0, 0], [1, 1], [2, 2]]]]}}
        self.assertEqual(expected, actual)

        feature = {"type": "Feature", "id": "1", "geometry": None}
        actual = Dataset._feature_to_record(feature, ["pk"], "geo")
        expected = {"pk": "1"}
        self.assertEqual(expected, actual)

        feature = {
            "type": "Feature",
            "id": "1",
            "bbox": [0, 0, 1, 1],
            "geometry": {"type": "Point", "coordinates": [0, 0]},
        }
        actual = Dataset._feature_to_record(feature, ["pk"], "geo")
        expected = {"pk": "1", "geo": {"point": [0, 0]}, "bbox": [0, 0, 1, 1]}
        self.assertEqual(expected, actual)

        feature = {
            "type": "Feature",
            "id": "1",
            "bbox": None,
            "geometry": {"type": "Point", "coordinates": [0, 0]},
        }
        actual = Dataset._feature_to_record(feature, ["pk"], "geo")
        expected = {"pk": "1", "geo": {"point": [0, 0]}}
        self.assertEqual(expected, actual)

        feature = {
            "type": "Feature",
            "id": "1",
            "bbox": [0, 0, 1, 1],
            "geometry": {"type": "Point", "coordinates": [0, 0]},
            "properties": {"prop1": "val1", "prop2": "val2"},
        }
        actual = Dataset._feature_to_record(feature, ["pk"], "geo")
        expected = {
            "pk": "1",
            "geo": {"point": [0, 0]},
            "bbox": [0, 0, 1, 1],
            "prop1": "val1",
            "prop2": "val2",
        }
        self.assertEqual(expected, actual)

        feature = {
            "type": "Feature",
            "id": "1",
            "bbox": [0, 0, 1, 1],
            "geometry": {"type": "Point", "coordinates": [0, 0]},
            "properties": None,
        }
        actual = Dataset._feature_to_record(feature, ["pk"], "geo")
        expected = {"pk": "1", "geo": {"point": [0, 0]}, "bbox": [0, 0, 1, 1]}
        self.assertEqual(expected, actual)

        feature = {
            "type": "Feature",
            "id": "1",
            "bbox": [0, 0, 1, 1],
            "geometry": {"type": "Point", "coordinates": [0, 0]},
            # Properties with names that conflict with
            # the props in the key or geometry
            # get ignored
            "properties": {"pk": "val1", "geo": "val2", "bbox": "val3"},
        }
        actual = Dataset._feature_to_record(feature, ["pk"], "geo")
        expected = {"pk": "1", "geo": {"point": [0, 0]}, "bbox": [0, 0, 1, 1]}
        self.assertEqual(expected, actual)

        feature = {
            "type": "Feature",
            "id": ["1", "2"],
            "geometry": {"type": "Point", "coordinates": [0, 0]},
        }
        actual = Dataset._feature_to_record(feature, ["pk1", "pk2"], "geo")
        expected = {"pk1": "1", "pk2": "2", "geo": {"point": [0, 0]}}
        self.assertEqual(expected, actual)

        feature = {"type": "Feature", "id": "1", "geometry": None}
        Dataset._feature_to_record(feature, ["pk"], "geo")
        # feature_to_record is required to not raise an exception

        feature = {
            "type": "Feature",
            "id": None,
            "geometry": {"type": "Point", "coordinates": [0, 0]},
        }
        with pytest.raises(ValueError):
            Dataset._feature_to_record(feature, ["pk"], "geo")

        feature = {
            "type": "Feature",
            "geometry": {"type": "Point", "coordinates": [0, 0]},
        }
        with pytest.raises(ValueError):
            Dataset._feature_to_record(feature, ["pk"], "geo")

        class NotAFeature:
            @property
            def __geo_interface__(self):
                return {
                    "type": "Feature",
                    "id": "1",
                    "geometry": {"type": "Point", "coordinates": [0, 0]},
                }

        naf = NotAFeature()
        actual = Dataset._feature_to_record(naf, ["pk"], "geo")
        expected = {"pk": "1", "geo": {"point": [0, 0]}}
        self.assertEqual(expected, actual)
Example #15
    def test_record_to_feature(self):
        empty_record = {"id": "1"}

        def key_value_single(rec):
            return rec["id"]

        actual = Dataset._record_to_feature(
            empty_record, key_value_single, ["id"], "geom"
        )
        expected = {"type": "Feature", "id": "1"}
        self.assertEqual(expected, actual)

        record_with_point = {"id": "1", "geom": {"point": [1, 1]}}
        actual = Dataset._record_to_feature(
            record_with_point, key_value_single, ["id"], "geom"
        )
        expected = {
            "type": "Feature",
            "id": "1",
            "geometry": {"type": "Point", "coordinates": [1, 1]},
        }
        self.assertEqual(expected, actual)

        record_with_multi_point = {"id": "1", "geom": {"multiPoint": [[1, 1]]}}
        actual = Dataset._record_to_feature(
            record_with_multi_point, key_value_single, ["id"], "geom"
        )
        expected = {
            "type": "Feature",
            "id": "1",
            "geometry": {"type": "MultiPoint", "coordinates": [[1, 1]]},
        }
        self.assertEqual(expected, actual)

        record_with_line = {"id": "1", "geom": {"lineString": [[1, 1], [2, 2]]}}
        actual = Dataset._record_to_feature(
            record_with_line, key_value_single, ["id"], "geom"
        )
        expected = {
            "type": "Feature",
            "id": "1",
            "geometry": {"type": "LineString", "coordinates": [[1, 1], [2, 2]]},
        }
        self.assertEqual(expected, actual)

        record_with_multi_line = {
            "id": "1",
            "geom": {"multiLineString": [[[1, 1], [2, 2]]]},
        }
        actual = Dataset._record_to_feature(
            record_with_multi_line, key_value_single, ["id"], "geom"
        )
        expected = {
            "type": "Feature",
            "id": "1",
            "geometry": {"type": "MultiLineString", "coordinates": [[[1, 1], [2, 2]]]},
        }
        self.assertEqual(expected, actual)

        record_with_polygon = {
            "id": "1",
            "geom": {"polygon": [[[1, 1], [2, 2], [3, 3]]]},
        }
        actual = Dataset._record_to_feature(
            record_with_polygon, key_value_single, ["id"], "geom"
        )
        expected = {
            "type": "Feature",
            "id": "1",
            "geometry": {"type": "Polygon", "coordinates": [[[1, 1], [2, 2], [3, 3]]]},
        }
        self.assertEqual(expected, actual)

        record_with_multi_polygon = {
            "id": "1",
            "geom": {"multiPolygon": [[[[1, 1], [2, 2], [3, 3]]]]},
        }
        actual = Dataset._record_to_feature(
            record_with_multi_polygon, key_value_single, ["id"], "geom"
        )
        expected = {
            "type": "Feature",
            "id": "1",
            "geometry": {
                "type": "MultiPolygon",
                "coordinates": [[[[1, 1], [2, 2], [3, 3]]]],
            },
        }
        self.assertEqual(expected, actual)

        record_with_full_geo = {
            "id": "1",
            "geom": {
                "point": None,
                "multiPoint": None,
                "lineString": None,
                "multiLineString": None,
                "polygon": None,
                "multiPolygon": [[[[1, 1], [2, 2], [3, 3]]]],
            },
        }
        actual = Dataset._record_to_feature(
            record_with_full_geo, key_value_single, ["id"], "geom"
        )
        expected = {
            "type": "Feature",
            "id": "1",
            "geometry": {
                "type": "MultiPolygon",
                "coordinates": [[[[1, 1], [2, 2], [3, 3]]]],
            },
        }
        self.assertEqual(expected, actual)

        record_with_null_geo = {
            "id": "1",
            "geom": {
                "point": None,
                "multiPoint": None,
                "lineString": None,
                "multiLineString": None,
                "polygon": None,
                "multiPolygon": None,
            },
        }
        actual = Dataset._record_to_feature(
            record_with_null_geo, key_value_single, ["id"], "geom"
        )
        expected = {"geometry": None, "type": "Feature", "id": "1"}
        self.assertEqual(expected, actual)

        record_with_bbox = {"id": "1", "bbox": [[0, 0], [1, 1]]}
        actual = Dataset._record_to_feature(
            record_with_bbox, key_value_single, ["id"], "geom"
        )
        expected = {"type": "Feature", "id": "1", "bbox": [[0, 0], [1, 1]]}
        self.assertEqual(expected, actual)

        record_with_props = {"id": "1", "p1": "v1", "p2": "v2"}
        actual = Dataset._record_to_feature(
            record_with_props, key_value_single, ["id"], "geom"
        )
        expected = {
            "type": "Feature",
            "id": "1",
            "properties": {"p1": "v1", "p2": "v2"},
        }
        self.assertEqual(expected, actual)

        def key_value_composite(rec):
            return [rec[v] for v in ["id1", "id2"]]

        record_with_composite_key = {"id1": "1", "id2": "2"}
        actual = Dataset._record_to_feature(
            record_with_composite_key, key_value_composite, ["id1", "id2"], "geom"
        )
        expected = {"type": "Feature", "id": ["1", "2"]}
        self.assertEqual(expected, actual)

        record_with_everything = {
            "id1": "1",
            "id2": "2",
            "bbox": [[0, 0], [1, 1]],
            "name": "record with everything",
            "geom": {
                "point": None,
                "multiPoint": None,
                "lineString": None,
                "multiLineString": None,
                "polygon": [[[0, 0], [0, 1], [1, 1], [1, 0], [0, 0]]],
                "multiPolygon": None,
            },
            "alternate_geom": {
                "point": [1, 1],
                "multiPoint": None,
                "lineString": None,
                "multiLineString": None,
                "polygon": None,
                "multiPolygon": None,
            },
        }
        actual = Dataset._record_to_feature(
            record_with_everything, key_value_composite, ["id1", "id2"], "geom"
        )
        expected = {
            "type": "Feature",
            "id": ["1", "2"],
            "bbox": [[0, 0], [1, 1]],
            "properties": {
                "name": "record with everything",
                "alternate_geom": {
                    "point": [1, 1],
                    "multiPoint": None,
                    "lineString": None,
                    "multiLineString": None,
                    "polygon": None,
                    "multiPolygon": None,
                },
            },
            "geometry": {
                "type": "Polygon",
                "coordinates": [[[0, 0], [0, 1], [1, 1], [1, 0], [0, 0]]],
            },
        }
        self.assertEqual(expected, actual)

        record_without_geo = {"id": "1", "prop1": "val1"}
        actual = Dataset._record_to_feature(
            record_without_geo, key_value_single, ["id"], None
        )
        expected = {"type": "Feature", "id": "1", "properties": {"prop1": "val1"}}
        self.assertEqual(expected, actual)
Example #16
def edit_attributes(
    *,
    dataset: Dataset,
    attribute_types: Optional[Dict[str, attribute_type.AttributeType]] = None,
    attribute_descriptions: Optional[Dict[str, str]] = None,
    override_existing_types: bool = True,
) -> Dataset:
    """Edit existing attributes in a dataset

    The attribute type and/or descriptions can be updated to new values. Attributes that will be
    updated must be in either the attribute_types or attribute_descriptions dictionaries or
    both. The default attribute type will be ARRAY STRING. To set non-default attribute types, they
    must be defined in the attribute_types dictionary. Any attribute descriptions can be specified
    in the attribute_descriptions dictionary. If only the attribute_descriptions dictionary is
    defined, the attribute type will not be updated.

    Args:
        dataset: An existing TUC dataset
        attribute_types: dictionary for non-default types, attribute name is the key and
            AttributeType is the value
        attribute_descriptions: dictionary for attribute descriptions, attribute name is the
            key and the attribute description is the value
        override_existing_types: bool flag, when true will alter existing attributes

    Returns:
        Updated Dataset

    Raises:
        requests.HTTPError: If any HTTP error is encountered
        ValueError: If the dataset is not a source dataset
        ValueError: If a passed attribute does not exist in the dataset
        ValueError: If a passed attribute is a primary key and can't be updated
        ValueError: If there are no updates to attributes in attribute_types or
            attribute_descriptions arguments
    """
    dataset_name = dataset.name
    if dataset.upstream_datasets():
        raise ValueError(f"{dataset_name} is not a source dataset")

    # Check description or type changes are passed in
    if attribute_types is None and attribute_descriptions is None:
        raise ValueError(
            """Updates to attributes must be passed in via attribute_types
            or attribute_descriptions arguments""")

    # Get list of attributes that need updating from attribute_types and
    # attribute_descriptions dictionaries
    attributes = {attr
                  for attr in attribute_types or list()
                  } | {attr
                       for attr in attribute_descriptions or list()}

    # Get current dataset attributes
    target_attribute_dict = {attr.name: attr for attr in dataset.attributes}
    existing_attributes = target_attribute_dict.keys()
    primary_keys = dataset.spec().to_dict()["keyAttributeNames"]

    # Check that all of the attribute names already exist in dataset
    for attribute_name in attributes:
        if attribute_name not in existing_attributes:
            # This attribute does not exist
            raise ValueError(
                f"An attribute with name '{attribute_name}' does not exist in {dataset_name}"
            )
        elif attribute_name in primary_keys:
            # Cannot edit a primary key
            raise ValueError(
                f"The attribute '{attribute_name}' is a primary key and can't be updated"
            )

    # Update attributes in dataset
    for attribute_name in attributes:
        attr_spec_dict = _make_spec_dict(
            attribute_name=attribute_name,
            attribute_types=attribute_types,
            attribute_descriptions=attribute_descriptions,
        )
        existing_attribute_spec = target_attribute_dict[attribute_name].spec()
        if attribute_types is None or attribute_name not in attribute_types:
            new_type_class = attribute_type.from_json(
                existing_attribute_spec.to_dict()["type"])
        else:
            new_type_class = attribute_type.from_json(attr_spec_dict["type"])
        old_type_class = attribute_type.from_json(
            existing_attribute_spec.to_dict()["type"])

        if new_type_class == old_type_class:
            # Update description
            if (attribute_descriptions is not None
                    and attribute_name in attribute_descriptions.keys()):
                existing_attribute_spec = existing_attribute_spec.with_description(
                    attribute_descriptions[attribute_name])
                existing_attribute_spec.put()
            else:
                LOGGER.info(
                    f"There are no updates to the attribute '{attribute_name}' in {dataset_name}"
                )
        elif override_existing_types:
            # Update type
            new_attr_spec = existing_attribute_spec.to_dict()
            new_attr_spec["type"] = attr_spec_dict["type"]

            # Update description
            if "description" in attr_spec_dict.keys():
                new_attr_spec["description"] = attr_spec_dict["description"]

            # Remove and add attribute with new spec
            dataset.attributes.delete_by_resource_id(
                target_attribute_dict[attribute_name].resource_id)
            dataset.attributes.create(new_attr_spec)
            LOGGER.info(
                f"Updated attribute '{attribute_name}' in {dataset_name}")
        else:
            LOGGER.info(
                f"The attribute '{attribute_name}' in {dataset_name} currently has the type "
                f"'{str(old_type_class)}'. Set 'override_existing_types' to True to update "
                f"the type to '{str(new_type_class)}'")

    return dataset
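A minimal sketch of editing attribute types and descriptions, mirroring the update() docstring example below; the dataset and attribute names are placeholders and `tamr` is the client from the earlier sketch:

from tamr_toolbox.models import attribute_type  # import path as used in the update() example

dataset = tamr.datasets.by_name("my_source_dataset")
dataset = edit_attributes(
    dataset=dataset,
    attribute_types={"total_sales": attribute_type.ARRAY(attribute_type.DOUBLE)},
    attribute_descriptions={"total_sales": "Total sales in USD"},
    override_existing_types=True,
)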
Example #17
def create_attributes(
    *,
    dataset: Dataset,
    attributes: Iterable[str],
    attribute_types: Optional[Dict[str, attribute_type.AttributeType]] = None,
    attribute_descriptions: Optional[Dict[str, str]] = None,
) -> Dataset:
    """Create new attributes in a dataset

    The default attribute type will be ARRAY STRING. To set non-default attribute types, they must
    be defined in the attribute_types dictionary. Any attribute descriptions can be specified in
    the attribute_descriptions dictionary.

    Args:
        dataset: An existing TUC dataset
        attributes: list of attribute names to be added to dataset
        attribute_types: dictionary for non-default types, attribute name is the key and
            AttributeType is the value
        attribute_descriptions: dictionary for attribute descriptions, attribute name is the
            key and the attribute description is the value

    Returns:
        Updated Dataset

    Raises:
        requests.HTTPError: If any HTTP error is encountered
        TypeError: If the attributes argument is not an Iterable
        ValueError: If the dataset is not a source dataset
        ValueError: If an attribute passed in already exists in the dataset

    """
    dataset_name = dataset.name
    if dataset.upstream_datasets():
        raise ValueError(f"{dataset_name} is not a source dataset")

    # Check input type is correct
    if not isinstance(attributes, Iterable):
        raise TypeError("attributes arg must be an Iterable")

    # Get current dataset attributes
    existing_attributes = [attr.name for attr in dataset.attributes]

    # Check that none of the new attribute names already exist
    for attribute_name in attributes:
        if attribute_name in existing_attributes:
            # This attribute already exists
            raise ValueError(
                f"An attribute with name '{attribute_name}' already exists in {dataset_name}"
            )

    # Add attributes to dataset
    for attribute_name in attributes:
        attr_spec_dict = _make_spec_dict(
            attribute_name=attribute_name,
            attribute_types=attribute_types,
            attribute_descriptions=attribute_descriptions,
        )
        dataset.attributes.create(attr_spec_dict)
        LOGGER.info(f"Created attribute '{attribute_name}' in {dataset_name}")

    return dataset
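A corresponding sketch for adding new attributes, again with placeholder names and the attribute_type import from the previous sketch:

dataset = tamr.datasets.by_name("my_source_dataset")
dataset = create_attributes(
    dataset=dataset,
    attributes=["name", "address", "total_sales"],
    attribute_types={"total_sales": attribute_type.ARRAY(attribute_type.DOUBLE)},
    attribute_descriptions={"address": "Mailing address"},
)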
Example #18
def update(
    dataset: Dataset,
    *,
    attributes: Optional[Iterable[str]] = None,
    attribute_types: Optional[Dict[str, attribute_type.AttributeType]] = None,
    attribute_descriptions: Optional[Dict[str, str]] = None,
    description: Optional[str] = None,
    tags: Optional[List[str]] = None,
    override_existing_types: bool = False,
) -> Dataset:
    """Flexibly update a source dataset in Tamr

    All the attributes that should exist in the dataset must be defined in the attributes argument.
    This function will add/remove attributes in the dataset until the dataset attributes match
    the set of attributes passed in as an argument. The default attribute type will be ARRAY
    STRING. To set non-default attribute types, they must be defined in the attribute_types
    dictionary. Any attribute descriptions can be specified in the attribute_descriptions
    dictionary. By default, the existing attribute types will not change unless
    override_existing_types is set to True. When False, the attribute type updates will only be
    logged.

    Args:
        dataset: An existing TUC dataset
        attributes: Complete list of attribute names that should exist in the updated dataset
        attribute_types: dictionary for non-default types, attribute name is the key and
            AttributeType is the value
        attribute_descriptions: dictionary for attribute descriptions, attribute name is the
            key and the attribute description is the value
        description: updated description of dataset, if None will not update the description
        tags: updated tags for the dataset, if None will not update tags
        override_existing_types: boolean flag, when true will alter existing attributes' types

    Returns:
        Updated Dataset

    Raises:
        requests.HTTPError: If any HTTP error is encountered
        ValueError: If the dataset is not a source dataset
        TypeError: If the attributes argument is not an Iterable

    Example:
        >>> import tamr_toolbox as tbox
        >>> from tamr_toolbox.models import attribute_type
        >>> tamr_client = tbox.utils.client.create(**instance_connection_info)
        >>> dataset = tamr_client.datasets.by_name("my_dataset_name")
        >>> tbox.dataset.manage.update(
        >>>     dataset=dataset,
        >>>     attributes=["unique_id","name","address","total_sales"],
        >>>     attribute_types={"total_sales":attribute_type.ARRAY(attribute_type.DOUBLE)},
        >>>     override_existing_types = True,
        >>> )
    """
    dataset_name = dataset.name
    if dataset.upstream_datasets():
        raise ValueError(f"{dataset_name} is not a source dataset")
    primary_keys = dataset.spec().to_dict()["keyAttributeNames"]

    # Check input type is correct
    if attributes and not isinstance(attributes, Iterable):
        raise TypeError("attributes arg must be an Iterable")

    # Update description and tags
    dataset_spec = dataset.spec()
    if description:
        dataset_spec = dataset_spec.with_description(description)
        LOGGER.info(f"Updating description for {dataset_name}")
    if tags:
        dataset_spec = dataset_spec.with_tags(tags)
        LOGGER.info(f"Updating tags for {dataset_name}")

    dataset_spec.put()

    if attributes:
        # Get current dataset attributes
        existing_attributes = [attr.name for attr in dataset.attributes]

        # Update attributes in dataset
        for attribute_name in attributes:
            if attribute_name in primary_keys:
                continue
            elif attribute_name in existing_attributes:
                # This attribute already exists, update to new type
                type_dict = {
                    attribute_name: (attribute_types
                                     or dict()).get(attribute_name,
                                                    attribute_type.DEFAULT)
                }
                desc_dict = {
                    attribute_name: (attribute_descriptions
                                     or dict()).get(attribute_name)
                }

                edit_attributes(
                    dataset=dataset,
                    attribute_types=type_dict,
                    attribute_descriptions=desc_dict,
                    override_existing_types=override_existing_types,
                )
            else:
                # This attribute does not already exist, create
                create_attributes(
                    dataset=dataset,
                    attributes=[attribute_name],
                    attribute_types=attribute_types,
                    attribute_descriptions=attribute_descriptions,
                )

        # Remove any attributes from dataset that aren't in the new list of attributes
        for attribute_name in existing_attributes:
            if attribute_name not in attributes and attribute_name not in primary_keys:
                delete_attributes(dataset=dataset, attributes=[attribute_name])

    return dataset
Example #19
def from_dataset(
    dataset: Dataset,
    *,
    columns: Optional[List[str]] = None,
    flatten_delimiter: Optional[str] = None,
    flatten_columns: Optional[List[str]] = None,
    force_flatten: bool = False,
    nrows: Optional[int] = None,
    allow_dataset_refresh: bool = False,
) -> "pandas.DataFrame":
    """
    Creates a DataFrame from a Tamr Dataset

    Args:
        dataset: Tamr Dataset object
        columns: optional, ordered list of columns to keep
        flatten_delimiter: if set, flatten list types to strings by concatenating with this
            delimiter
        flatten_columns: optional, list of columns to flatten
        force_flatten:  if False, arrays with inner types other than string will not be flattened.
            if True, will force all inner types to strings when flattening values.
        nrows: number of rows to read. default None will read all rows
        allow_dataset_refresh: if True, allows running a job to refresh dataset to make streamable

    Returns:
        DataFrame

    Raises:
        ValueError: if `columns` or `flatten_columns` contain columns that are not present in
            `dataset`
    """
    # This function requires pandas, an optional dependency
    import pandas

    LOGGER.info(
        f"Streaming records to DataFrame for dataset {dataset.name} (id={dataset.resource_id})."
    )
    dataset_attrs = [attr for attr in dataset.attributes]
    attr_names = [attr.name for attr in dataset_attrs]
    # check that specified columns exist
    if columns is not None:
        common._check_columns_subset(
            input_list=columns, reference_list=attr_names, raise_error=True
        )
    # checks on columns to flatten
    if flatten_delimiter is not None:
        if flatten_columns is None:
            flatten_columns = list(attr_names)
        else:
            # check that specified columns exist
            common._check_columns_subset(
                input_list=flatten_columns, reference_list=attr_names, raise_error=True
            )
        # check types of flatten_columns
        for attr in dataset_attrs:
            if attr.name not in flatten_columns:
                continue
            attr_type = attr.spec().to_dict()["type"]
            if attr_type["baseType"] == "ARRAY" and attr_type["innerType"]["baseType"] != "STRING":
                if force_flatten:
                    LOGGER.info(
                        f"Will force attribute to string: {attr.name}, with type: {attr_type}"
                    )
                else:
                    LOGGER.warning(
                        f"Will not flatten attribute: {attr.name}, with type: {attr_type}"
                    )
                    flatten_columns.remove(attr.name)

    if not dataset.status().is_streamable:
        if allow_dataset_refresh:
            LOGGER.info(f"Refreshing dataset {dataset.name} to make streamable.")
            op = dataset.refresh()
            operation.enforce_success(op)
        else:
            message = (
                f"Dataset {dataset.name} is not streamable. Refresh it first, or run"
                f" with allow_dataset_refresh=True"
            )
            LOGGER.error(message)
            raise RuntimeError(message)

    # if flattening, set the function to apply to records as _flatten_list
    # otherwise set as _identity
    func = None
    if flatten_delimiter is not None:
        func = partial(common._flatten_list, delimiter=flatten_delimiter, force=force_flatten)
    df = pandas.DataFrame.from_records(
        common._yield_records(
            dataset, func=func, columns=columns, flatten_columns=flatten_columns
        ),
        columns=columns,
        nrows=nrows,
    )
    return df
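A hedged sketch of streaming a dataset into a DataFrame; the dataset name and column list are placeholders and `tamr` is the client from the earlier sketch:

dataset = tamr.datasets.by_name("my_dataset")
df = from_dataset(
    dataset,
    columns=["unique_id", "name"],  # hypothetical columns
    flatten_delimiter="|",          # join array values into single strings
    allow_dataset_refresh=True,     # refresh first if the dataset is not streamable
)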
Example #20
    def test_get_usage(self):
        responses.add(responses.GET,
                      f"{self._base_url}/datasets/1/usage",
                      json=self._usage_json)
        u = Dataset(self.tamr, self._dataset_json).usage()
        self.assertEqual(u._data, self._usage_json)
Example #21
def from_dataset(
    dataset: Dataset,
    export_file_path: Union[Path, str],
    *,
    csv_delimiter: str = ",",
    columns: Optional[List[str]] = None,
    flatten_delimiter: str = "|",
    quote_character: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    na_value: str = "NaN",
    nrows: Optional[int] = None,
    allow_dataset_refresh: bool = False,
    buffer_size: int = 10000,
    overwrite: bool = False,
) -> int:
    """
    Export a Tamr Dataset to a csv file. Records are streamed to disk and written according to a
    given buffer size. As a result this is more memory efficient than first reading to a
    pandas.DataFrame and writing to CSV.

    Args:
        dataset: Tamr Dataset object
        export_file_path: Path to the csv file where the dataset will be saved
        csv_delimiter: Delimiter of the csv file
        columns: Optional, Ordered list of columns to write. If None, write all columns in
            arbitrary order.
        flatten_delimiter: Flatten list types to strings by concatenating with this delimiter
        quote_character: Character used to escape value for csv delimiter when it appears in the
            value.
        quoting: The escape strategy to use according to the Python csv writer.
            See https://docs.python.org/3/library/csv.html#csv.QUOTE_MINIMAL
        na_value: Value to write that represents empty or missing data.
            See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
            for the na_values supported by default in pandas.read_csv
        nrows: Optional, Number of rows to write. If None, then write all rows.
        allow_dataset_refresh: If True, allows running a job to refresh dataset to make streamable.
            Otherwise a RuntimeError will be thrown if the dataset is unstreamable.
        buffer_size: Number of records to store in memory before writing to disk
        overwrite: if True and export_file_name already exists, overwrite the file.
            Otherwise throw an error

    Returns:
        The total number of records written

    Raises:
        FileExistsError: if the csv file to which the dataset is to be streamed exists
            and `overwrite` is False
        RuntimeError: if `dataset` is not streamable and `allow_dataset_refresh` is False
        ValueError: if `columns` contains columns that are not present in `dataset`
    """
    LOGGER.info(
        f"Streaming records to csv file {export_file_path} from dataset {dataset.name} "
        f"(id={dataset.resource_id}).")

    if os.path.exists(export_file_path):
        if not overwrite:
            message = (
                f"CSV file {export_file_path} already exists. "
                f"(Set 'overwrite' flag to True if you wish to overwrite)")
            LOGGER.error(message)
            raise FileExistsError(message)
        else:
            LOGGER.warning(
                f"CSV file {export_file_path} already exists and will be overwritten"
            )

    if csv_delimiter == flatten_delimiter:
        message = (
            f"The CSV delimiter '{csv_delimiter}' cannot be identical to the list "
            f"flattening delimiter '{flatten_delimiter}'")
        LOGGER.error(message)
        raise ValueError(message)

    attribute_names = [attr.name for attr in dataset.attributes]

    # check that specified columns exist
    if columns is not None:
        common._check_columns_subset(input_list=columns,
                                     reference_list=attribute_names,
                                     raise_error=True)

    if not dataset.status().is_streamable:
        if allow_dataset_refresh:
            LOGGER.info(
                f"Refreshing dataset {dataset.name} to make streamable.")
            op = dataset.refresh()
            operation.enforce_success(op)
        else:
            message = (
                f"Dataset {dataset.name} is not streamable. Refresh it first, or "
                f"run with allow_dataset_refresh=True")
            LOGGER.error(message)
            raise RuntimeError(message)

    func = partial(common._flatten_list,
                   delimiter=flatten_delimiter,
                   force=True)

    # Open CSV file and use newline='' as recommended by
    # https://docs.python.org/3/library/csv.html#csv.writer
    with open(export_file_path, "w", newline="") as csv_file:
        csv_writer = csv.writer(
            csv_file,
            delimiter=csv_delimiter,
            quotechar=quote_character,
            quoting=quoting,
        )
        buffer = []
        header = None
        # Set record number to -1 in case the dataset streamed has no records
        record_number = -1

        for record_number, record in enumerate(
                common._yield_records(dataset, func=func, columns=columns)):
            # Obtain and write the header information only on the first pass
            if header is None:
                header = record.keys() if columns is None else columns
                csv_writer.writerow(header)

            # Replace empty values with a specific null value
            # This also allows nulls to be treated differently from empty strings
            record = [
                na_value if record[k] is None else record[k] for k in header
            ]
            buffer.append(record)

            at_max_buffer = buffer_size is not None and (len(buffer) >=
                                                         buffer_size)
            at_max_rows = nrows is not None and record_number >= nrows - 1
            if at_max_buffer or at_max_rows:
                csv_writer.writerows(buffer)
                LOGGER.debug(
                    f"Written dataset {dataset.name} up to record {record_number+1}"
                )
                buffer = []
                if at_max_rows:
                    break

        # Write anything remaining
        # This will occur whenever the buffer is non-zero and the number of records
        # is not exactly divisible by the buffer number
        # For example, writing a dataset with 1100 records using a buffer size of 500
        # will write in 3 chunks: 2 x 500 above and the remaining 100 handled here
        if len(buffer) != 0:
            LOGGER.debug(
                f"Written dataset {dataset.name} up to record {record_number + 1}"
            )
            csv_writer.writerows(buffer)

        if record_number == -1:
            # If record number is -1 then no records were streamed, possibly because the dataset
            # has no records. We therefore want to simply save the headers
            if columns is not None:
                csv_writer.writerow(columns)
            else:
                csv_writer.writerow(attribute_names)

    records_written = record_number + 1

    LOGGER.info(
        f"Wrote {records_written} from dataset {dataset.name} (id={dataset.resource_id}) "
        f"to {export_file_path}")

    return records_written
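A similar sketch for streaming the same dataset straight to disk; the export path is a placeholder:

dataset = tamr.datasets.by_name("my_dataset")  # `tamr` as in the earlier sketch
records_written = from_dataset(
    dataset,
    "/tmp/my_dataset.csv",          # hypothetical export path
    columns=["unique_id", "name"],
    allow_dataset_refresh=True,
    overwrite=True,
)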
Example #22
def _yield_records(
    dataset: Dataset,
    *,
    func: Optional[Callable] = None,
    columns: Optional[List] = None,
    flatten_columns: Optional[List] = None,
) -> Iterable:
    """
    Generator function for records from a Tamr Dataset.
    Can optionally apply a flattening function to some or all columns,
    and can optionally keep only a subset of columns from the original records.

    Args:
        dataset: Tamr Dataset
        func: optional, callable function to transform records
        columns: optional, list of columns to keep
        flatten_columns: optional, list of columns to flatten

    Returns: iterable over Dataset records

    Raises:
        ValueError: if `flatten_columns` is not None but `func` is None since it
            won't know how to flatten them.
    """

    if flatten_columns is not None and func is None:
        message = (
            f"Columns specified for flatting ('flatten_columns'={flatten_columns}), "
            f"but no flattening function provided for parameter 'func'"
        )
        LOGGER.error(message)
        raise ValueError(message)

    checked_columns = False
    for record in dataset.records():
        if not checked_columns:
            if columns is not None:
                _check_columns_subset(
                    input_list=columns, reference_list=record.keys(), raise_error=True
                )
            if flatten_columns is not None:
                _check_columns_subset(
                    input_list=flatten_columns, reference_list=record.keys(), raise_error=True
                )
            checked_columns = True

        # Set flatten_columns to all if unspecified
        if func is not None and flatten_columns is None:
            flatten_columns = record.keys()

        output = {}
        for k, v in record.items():
            if columns is not None and k not in columns:
                # remove the column by skipping
                continue

            do_flatten_column = flatten_columns is not None and k in flatten_columns
            if do_flatten_column and func is not None:
                output[k] = func(v)
            else:
                output[k] = v
        yield output
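For illustration, a hedged sketch of driving the generator with a simple flattening function (a stand-in for the toolbox's _flatten_list helper); `dataset` is a streamable Tamr Dataset as in the earlier sketches:

def join_lists(value, delimiter="|"):
    # Stand-in flattener: join list values into one string, pass scalars through unchanged
    return delimiter.join(str(v) for v in value) if isinstance(value, list) else value

for record in _yield_records(dataset, func=join_lists, columns=["unique_id", "name"]):
    print(record)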