def delete_attributes(
    *, dataset: Dataset, attributes: Iterable[str] = None,
) -> Dataset:
    """Remove attributes from dataset by attribute name

    Args:
        dataset: An existing TUC dataset
        attributes: list of attribute names to delete from dataset

    Returns:
        Updated Dataset

    Raises:
        ValueError: If the dataset is not a source dataset
        ValueError: If a passed attribute does not exist in the dataset
        ValueError: If a passed attribute is a primary key and can't be removed
        TypeError: If the attributes argument is not an Iterable
    """
    dataset_name = dataset.name
    if dataset.upstream_datasets():
        raise ValueError(f"{dataset_name} is not a source dataset")

    # Check input type is correct
    if not isinstance(attributes, Iterable):
        raise TypeError("attributes arg must be an Iterable")

    # Get current dataset attributes
    target_attribute_dict = {attr.name: attr for attr in dataset.attributes}
    existing_attributes = target_attribute_dict.keys()
    primary_keys = dataset.spec().to_dict()["keyAttributeNames"]

    # Check all attributes exist before starting to remove any
    for attribute_name in attributes:
        if attribute_name not in existing_attributes:
            raise ValueError(
                f"The attribute '{attribute_name}' does not exist in {dataset_name}"
            )
        elif attribute_name in primary_keys:
            # Can not edit a primary key
            raise ValueError(
                f"The attribute '{attribute_name}' is a primary key and can't be removed"
            )

    # Remove attributes from dataset
    for attribute_name in attributes:
        dataset.attributes.delete_by_resource_id(
            target_attribute_dict[attribute_name].resource_id
        )
        LOGGER.info(f"Deleted attribute '{attribute_name}' in {dataset_name}")

    return dataset
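# Example usage for delete_attributes -- a minimal sketch, not part of the original
# module. The dataset and attribute names are hypothetical; `tamr` is assumed to be
# an authenticated tamr_unify_client Client.
def _example_delete_attributes(tamr):
    dataset = tamr.datasets.by_name("my_source_dataset")
    # Remove two no-longer-needed attributes from the source dataset
    return delete_attributes(dataset=dataset, attributes=["obsolete_column", "temp_column"])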
def _extract_confidence(*, dataset: Dataset, category_set: set) -> data_type.JsonDict:
    """
    Extracts tier-specific average confidence from a Tamr internal dataset
    `<unified dataset name>_classifications_average_confidences` to a dictionary

    Args:
        dataset: Tamr internal Dataset with a name ending in
            `_unified_dataset_classifications_average_confidences`
        category_set: set of category paths at the desired tier

    Returns:
        dictionary - keys are category paths, joined by '|' if multi-level taxonomy.
        Values are average confidence of the corresponding keys, where it is None if
        no confidence exists for the category.
    """
    confidence_dict = {}
    for record in dataset.records():
        path = "|".join(record["classificationPath"])
        if path in category_set:
            confidence_dict[path] = record["averageConfidence"]

    empty_confidence_categories = category_set - set(confidence_dict.keys())
    for category in empty_confidence_categories:
        confidence_dict[category] = None

    return confidence_dict
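# Shape of the mapping _extract_confidence returns for a two-level taxonomy -- a
# sketch with hypothetical category paths and confidence values, shown here only to
# illustrate the documented return format.
EXAMPLE_CONFIDENCE_DICT = {
    "Electronics|Laptops": 0.87,
    "Electronics|Phones": 0.91,
    "Furniture|Desks": None,  # no confidence record existed for this category
}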
def unified_dataset(self):
    """Unified dataset for this project.

    :return: Unified dataset for this project.
    :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
    """
    alias = self.api_path + "/unifiedDataset"
    resource_json = self.client.get(alias).successful().json()
    return Dataset.from_json(self.client, resource_json, alias)
def create(self, creation_spec):
    """
    Create a Dataset in Tamr

    :param creation_spec: Dataset creation specification should be formatted as
        specified in the `Public Docs for Creating a Dataset
        <https://docs.tamr.com/reference#create-a-dataset>`_.
    :type creation_spec: dict[str, str]
    :returns: The created Dataset
    :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
    """
    data = self.client.post(self.api_path, json=creation_spec).successful().json()
    return Dataset.from_json(self.client, data)
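# Example usage for the dataset collection's create method -- a minimal sketch. The
# spec fields shown ("name", "keyAttributeNames", "description") follow the public
# "Create a Dataset" docs linked above; the values are hypothetical and `tamr` is
# assumed to be an authenticated tamr_unify_client Client.
def _example_create_dataset(tamr):
    creation_spec = {
        "name": "my_new_dataset",
        "keyAttributeNames": ["unique_id"],
        "description": "Example dataset created via the versioned API",
    }
    return tamr.datasets.create(creation_spec)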
def pairs(self):
    """Record pairs generated by Tamr's binning model.

    Pairs are displayed on the "Pairs" page in the Tamr UI.

    Call :func:`~tamr_unify_client.dataset.resource.Dataset.refresh` from
    this dataset to regenerate pairs according to the latest binning model.

    :returns: The record pairs represented as a dataset.
    :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
    """
    alias = self.api_path + "/recordPairs"
    return Dataset(self.client, None, alias)
def record_clusters(self):
    """Record Clusters as a dataset. Tamr clusters labeled pairs using the pairs
    model. These clusters populate the cluster review page and get transient
    cluster ids, rather than published cluster ids (i.e., "Permanent Ids").

    Call :func:`~tamr_unify_client.dataset.resource.Dataset.refresh` from
    this dataset to generate clusters based on the latest pair-matching model.

    :returns: The record clusters represented as a dataset.
    :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
    """
    alias = self.api_path + "/recordClusters"
    return Dataset(self.client, None, alias)
def test_delete_attribute(self):
    url = "http://localhost:9100/api/versioned/v1/datasets/1/attributes/RowNum"
    responses.add(responses.GET, url, json=self._attributes_json[0])
    responses.add(responses.DELETE, url, status=204)
    responses.add(responses.GET, url, status=404)

    dataset = Dataset(self.tamr, self._dataset_json)
    attribute = dataset.attributes.by_resource_id("RowNum")
    self.assertEqual(attribute._data, self._attributes_json[0])

    response = attribute.delete()
    self.assertEqual(response.status_code, 204)
    self.assertRaises(HTTPError, lambda: dataset.attributes.by_resource_id("RowNum"))
def _find_associated_projects(dataset: Dataset) -> List[str]:
    """Return a list of the projects that the dataset is part of.

    Args:
        dataset: The target dataset.

    Returns:
        List of Project IDs that the target `dataset` is part of.
    """
    project_list = set(
        [step.project().resource_id for step in dataset.usage().usage.input_to_project_steps]
    )
    return list(project_list)
def _request_upstream_datasets(dataset: Dataset) -> List[Dataset]:
    """
    Returns a dataset's upstream datasets

    Args:
        dataset: a Tamr Dataset Object

    Returns:
        The upstream datasets
    """
    # Find upstream datasets; the output is a list of DatasetURI
    upstream = dataset.upstream_datasets()
    dataset_upstream = []
    # Make a Dataset out of each DatasetURI
    for data in upstream:
        dataset_upstream.append(dataset.client.datasets.by_resource_id(data.resource_id))
    return dataset_upstream
def published_cluster_stats(self):
    """Retrieves published cluster stats for this project.

    :returns: The published cluster stats dataset.
    :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
    """
    # Replace this workaround with a direct API call once API is fixed. APIs that
    # need to work are: fetching the dataset and being able to call refresh on the
    # resulting dataset. Until then, we grab the dataset by constructing its name
    # from the corresponding Unified Dataset's name
    unified_dataset = self.unified_dataset()
    name = unified_dataset.name + "_dedup_published_cluster_stats"
    dataset = self.client.datasets.by_name(name)

    path = self.api_path + "/publishedClusterStats"
    return Dataset.from_json(self.client, dataset._data, path)
def high_impact_pairs(self):
    """High-impact pairs as a dataset. Tamr labels pairs as "high-impact" if
    labeling these pairs would help it learn most quickly (i.e. "Active learning").

    High-impact pairs are displayed with a ⚡ lightning bolt icon on the
    "Pairs" page in the Tamr UI.

    Call :func:`~tamr_unify_client.dataset.resource.Dataset.refresh` from
    this dataset to produce new high-impact pairs according to the latest
    pair-matching model.

    :returns: The high-impact pairs represented as a dataset.
    :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
    """
    alias = self.api_path + "/highImpactPairs"
    return Dataset(self.client, None, alias)
def from_dataset(dataset: Dataset) -> Dict[str, TranslationDictionary]:
    """
    Stream a translation dictionary from Tamr

    Args:
        dataset: Tamr Dataset object

    Returns:
        A toolbox translation dictionary

    Raises:
        ValueError: if the provided `dataset` is not a toolbox translation dictionary dataset
        NameError: if the provided `dataset` does not contain all the attributes of a
            toolbox translation dictionary
        RuntimeError: if there is any other problem while reading the `dataset` as a
            toolbox translation dictionary
    """
    if dataset.key_attribute_names[0] != "standardized_phrase":
        error_message = "Provided Tamr Dataset is not a toolbox translation dictionary"
        LOGGER.error(error_message)
        raise ValueError(error_message)

    dictionary = {}
    for record in dataset.records():
        try:
            entry = TranslationDictionary(**record)
            # values are returned as a list of a single string, we change this to string
            entry.translated_phrase = entry.translated_phrase[0]
            entry.detected_language = entry.detected_language[0]

            # original phrases are stored on Tamr as lists, we save it as a set
            entry.original_phrases = set(entry.original_phrases)
        except NameError as e:
            error_message = (
                f"Supplied Tamr dataset is not in a toolbox translation dictionary format: {e}"
            )
            LOGGER.error(error_message)
            raise NameError(error_message)
        except Exception as e:
            error_message = f"Error while reading the Tamr dataset translation dictionary: {e}"
            LOGGER.error(error_message)
            raise RuntimeError(error_message)

        formatted_dictionary = {entry.standardized_phrase: entry}
        dictionary.update(formatted_dictionary)
    return dictionary
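# Example usage for the translation-dictionary loader above -- a minimal sketch. The
# dataset name and lookup phrase are hypothetical; `tamr` is assumed to be an
# authenticated tamr_unify_client Client.
def _example_load_translation_dictionary(tamr):
    dictionary_dataset = tamr.datasets.by_name("my_translation_dictionary")
    dictionary = from_dataset(dictionary_dataset)
    # Keys are standardized phrases; values are TranslationDictionary entries
    entry = dictionary.get("standardized phrase to look up")
    return entry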
def published_clusters(self):
    """Published record clusters generated by Tamr's pair-matching model.

    :returns: The published clusters represented as a dataset.
    :rtype: :class:`~tamr_unify_client.dataset.resource.Dataset`
    """
    unified_dataset = self.unified_dataset()

    # Replace this workaround with a direct API call once API is fixed. APIs that
    # need to work are: fetching the dataset and being able to call refresh on the
    # resulting dataset. Until then, we grab the dataset by constructing its name
    # from the corresponding Unified Dataset's name
    name = unified_dataset.name + "_dedup_published_clusters"
    canonical = self.client.datasets.by_name(name)
    resource_json = canonical._data
    alias = self.api_path + "/publishedClusters"
    return Dataset.from_json(self.client, resource_json, alias)
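# Example usage for the mastering-project dataset accessors above (pairs,
# record_clusters, high_impact_pairs, published_clusters, published_cluster_stats) --
# a minimal sketch. The project name is hypothetical; `tamr` is assumed to be an
# authenticated tamr_unify_client Client.
def _example_mastering_project_datasets(tamr):
    project = tamr.projects.by_name("My Mastering Project").as_mastering()
    pairs = project.pairs()
    pairs.refresh()  # regenerate pairs from the latest binning model
    clusters = project.record_clusters()
    published = project.published_clusters()
    return pairs, clusters, published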
def test_feature_to_record(self):
    feature = {"type": "Feature", "id": "1"}
    actual = Dataset._feature_to_record(feature, ["pk"], "geo")
    expected = {"pk": "1"}
    self.assertEqual(expected, actual)

    feature = {
        "type": "Feature",
        "id": "1",
        "geometry": {"type": "Point", "coordinates": [0, 0]},
    }
    actual = Dataset._feature_to_record(feature, ["pk"], "geo")
    expected = {"pk": "1", "geo": {"point": [0, 0]}}
    self.assertEqual(expected, actual)

    feature = {
        "type": "Feature",
        "id": "1",
        "geometry": {"type": "MultiPoint", "coordinates": [[0, 0], [1, 1]]},
    }
    actual = Dataset._feature_to_record(feature, ["pk"], "geo")
    expected = {"pk": "1", "geo": {"multiPoint": [[0, 0], [1, 1]]}}
    self.assertEqual(expected, actual)

    feature = {
        "type": "Feature",
        "id": "1",
        "geometry": {"type": "LineString", "coordinates": [[0, 0], [1, 1]]},
    }
    actual = Dataset._feature_to_record(feature, ["pk"], "geo")
    expected = {"pk": "1", "geo": {"lineString": [[0, 0], [1, 1]]}}
    self.assertEqual(expected, actual)

    feature = {
        "type": "Feature",
        "id": "1",
        "geometry": {
            "type": "MultiLineString",
            "coordinates": [[[0, 0], [1, 1], [2, 2]]],
        },
    }
    actual = Dataset._feature_to_record(feature, ["pk"], "geo")
    expected = {"pk": "1", "geo": {"multiLineString": [[[0, 0], [1, 1], [2, 2]]]}}
    self.assertEqual(expected, actual)

    feature = {
        "type": "Feature",
        "id": "1",
        "geometry": {"type": "Polygon", "coordinates": [[[0, 0], [1, 1], [2, 2]]]},
    }
    actual = Dataset._feature_to_record(feature, ["pk"], "geo")
    expected = {"pk": "1", "geo": {"polygon": [[[0, 0], [1, 1], [2, 2]]]}}
    self.assertEqual(expected, actual)

    feature = {
        "type": "Feature",
        "id": "1",
        "geometry": {
            "type": "MultiPolygon",
            "coordinates": [[[[0, 0], [1, 1], [2, 2]]]],
        },
    }
    actual = Dataset._feature_to_record(feature, ["pk"], "geo")
    expected = {"pk": "1", "geo": {"multiPolygon": [[[[0, 0], [1, 1], [2, 2]]]]}}
    self.assertEqual(expected, actual)

    feature = {"type": "Feature", "id": "1", "geometry": None}
    actual = Dataset._feature_to_record(feature, ["pk"], "geo")
    expected = {"pk": "1"}
    self.assertEqual(expected, actual)

    feature = {
        "type": "Feature",
        "id": "1",
        "bbox": [0, 0, 1, 1],
        "geometry": {"type": "Point", "coordinates": [0, 0]},
    }
    actual = Dataset._feature_to_record(feature, ["pk"], "geo")
    expected = {"pk": "1", "geo": {"point": [0, 0]}, "bbox": [0, 0, 1, 1]}
    self.assertEqual(expected, actual)

    feature = {
        "type": "Feature",
        "id": "1",
        "bbox": None,
        "geometry": {"type": "Point", "coordinates": [0, 0]},
    }
    actual = Dataset._feature_to_record(feature, ["pk"], "geo")
    expected = {"pk": "1", "geo": {"point": [0, 0]}}
    self.assertEqual(expected, actual)

    feature = {
        "type": "Feature",
        "id": "1",
        "bbox": [0, 0, 1, 1],
        "geometry": {"type": "Point", "coordinates": [0, 0]},
        "properties": {"prop1": "val1", "prop2": "val2"},
    }
    actual = Dataset._feature_to_record(feature, ["pk"], "geo")
    expected = {
        "pk": "1",
        "geo": {"point": [0, 0]},
        "bbox": [0, 0, 1, 1],
        "prop1": "val1",
        "prop2": "val2",
    }
    self.assertEqual(expected, actual)

    feature = {
        "type": "Feature",
        "id": "1",
        "bbox": [0, 0, 1, 1],
        "geometry": {"type": "Point", "coordinates": [0, 0]},
        "properties": None,
    }
    actual = Dataset._feature_to_record(feature, ["pk"], "geo")
    expected = {"pk": "1", "geo": {"point": [0, 0]}, "bbox": [0, 0, 1, 1]}
    self.assertEqual(expected, actual)

    feature = {
        "type": "Feature",
        "id": "1",
        "bbox": [0, 0, 1, 1],
        "geometry": {"type": "Point", "coordinates": [0, 0]},
        # Properties with names that conflict with
        # the props in the key or geometry
        # get ignored
        "properties": {"pk": "val1", "geo": "val2", "bbox": "val3"},
    }
    actual = Dataset._feature_to_record(feature, ["pk"], "geo")
    expected = {"pk": "1", "geo": {"point": [0, 0]}, "bbox": [0, 0, 1, 1]}
    self.assertEqual(expected, actual)

    feature = {
        "type": "Feature",
        "id": ["1", "2"],
        "geometry": {"type": "Point", "coordinates": [0, 0]},
    }
    actual = Dataset._feature_to_record(feature, ["pk1", "pk2"], "geo")
    expected = {"pk1": "1", "pk2": "2", "geo": {"point": [0, 0]}}
    self.assertEqual(expected, actual)

    feature = {"type": "Feature", "id": "1", "geometry": None}
    Dataset._feature_to_record(feature, ["pk"], "geo")
    # feature_to_record is required to not raise an exception

    feature = {
        "type": "Feature",
        "id": None,
        "geometry": {"type": "Point", "coordinates": [0, 0]},
    }
    with pytest.raises(ValueError):
        Dataset._feature_to_record(feature, ["pk"], "geo")

    feature = {
        "type": "Feature",
        "geometry": {"type": "Point", "coordinates": [0, 0]},
    }
    with pytest.raises(ValueError):
        Dataset._feature_to_record(feature, ["pk"], "geo")

    class NotAFeature:
        @property
        def __geo_interface__(self):
            return {
                "type": "Feature",
                "id": "1",
                "geometry": {"type": "Point", "coordinates": [0, 0]},
            }

    naf = NotAFeature()
    actual = Dataset._feature_to_record(naf, ["pk"], "geo")
    expected = {"pk": "1", "geo": {"point": [0, 0]}}
    self.assertEqual(expected, actual)
def test_record_to_feature(self):
    empty_record = {"id": "1"}

    def key_value_single(rec):
        return rec["id"]

    actual = Dataset._record_to_feature(empty_record, key_value_single, ["id"], "geom")
    expected = {"type": "Feature", "id": "1"}
    self.assertEqual(expected, actual)

    record_with_point = {"id": "1", "geom": {"point": [1, 1]}}
    actual = Dataset._record_to_feature(
        record_with_point, key_value_single, ["id"], "geom"
    )
    expected = {
        "type": "Feature",
        "id": "1",
        "geometry": {"type": "Point", "coordinates": [1, 1]},
    }
    self.assertEqual(expected, actual)

    record_with_multi_point = {"id": "1", "geom": {"multiPoint": [[1, 1]]}}
    actual = Dataset._record_to_feature(
        record_with_multi_point, key_value_single, ["id"], "geom"
    )
    expected = {
        "type": "Feature",
        "id": "1",
        "geometry": {"type": "MultiPoint", "coordinates": [[1, 1]]},
    }
    self.assertEqual(expected, actual)

    record_with_line = {"id": "1", "geom": {"lineString": [[1, 1], [2, 2]]}}
    actual = Dataset._record_to_feature(
        record_with_line, key_value_single, ["id"], "geom"
    )
    expected = {
        "type": "Feature",
        "id": "1",
        "geometry": {"type": "LineString", "coordinates": [[1, 1], [2, 2]]},
    }
    self.assertEqual(expected, actual)

    record_with_multi_line = {
        "id": "1",
        "geom": {"multiLineString": [[[1, 1], [2, 2]]]},
    }
    actual = Dataset._record_to_feature(
        record_with_multi_line, key_value_single, ["id"], "geom"
    )
    expected = {
        "type": "Feature",
        "id": "1",
        "geometry": {"type": "MultiLineString", "coordinates": [[[1, 1], [2, 2]]]},
    }
    self.assertEqual(expected, actual)

    record_with_polygon = {
        "id": "1",
        "geom": {"polygon": [[[1, 1], [2, 2], [3, 3]]]},
    }
    actual = Dataset._record_to_feature(
        record_with_polygon, key_value_single, ["id"], "geom"
    )
    expected = {
        "type": "Feature",
        "id": "1",
        "geometry": {"type": "Polygon", "coordinates": [[[1, 1], [2, 2], [3, 3]]]},
    }
    self.assertEqual(expected, actual)

    record_with_multi_polygon = {
        "id": "1",
        "geom": {"multiPolygon": [[[[1, 1], [2, 2], [3, 3]]]]},
    }
    actual = Dataset._record_to_feature(
        record_with_multi_polygon, key_value_single, ["id"], "geom"
    )
    expected = {
        "type": "Feature",
        "id": "1",
        "geometry": {
            "type": "MultiPolygon",
            "coordinates": [[[[1, 1], [2, 2], [3, 3]]]],
        },
    }
    self.assertEqual(expected, actual)

    record_with_full_geo = {
        "id": "1",
        "geom": {
            "point": None,
            "multiPoint": None,
            "lineString": None,
            "multiLineString": None,
            "polygon": None,
            "multiPolygon": [[[[1, 1], [2, 2], [3, 3]]]],
        },
    }
    actual = Dataset._record_to_feature(
        record_with_full_geo, key_value_single, ["id"], "geom"
    )
    expected = {
        "type": "Feature",
        "id": "1",
        "geometry": {
            "type": "MultiPolygon",
            "coordinates": [[[[1, 1], [2, 2], [3, 3]]]],
        },
    }
    self.assertEqual(expected, actual)

    record_with_null_geo = {
        "id": "1",
        "geom": {
            "point": None,
            "multiPoint": None,
            "lineString": None,
            "multiLineString": None,
            "polygon": None,
            "multiPolygon": None,
        },
    }
    actual = Dataset._record_to_feature(
        record_with_null_geo, key_value_single, ["id"], "geom"
    )
    expected = {"geometry": None, "type": "Feature", "id": "1"}
    self.assertEqual(expected, actual)

    record_with_bbox = {"id": "1", "bbox": [[0, 0], [1, 1]]}
    actual = Dataset._record_to_feature(
        record_with_bbox, key_value_single, ["id"], "geom"
    )
    expected = {"type": "Feature", "id": "1", "bbox": [[0, 0], [1, 1]]}
    self.assertEqual(expected, actual)

    record_with_props = {"id": "1", "p1": "v1", "p2": "v2"}
    actual = Dataset._record_to_feature(
        record_with_props, key_value_single, ["id"], "geom"
    )
    expected = {
        "type": "Feature",
        "id": "1",
        "properties": {"p1": "v1", "p2": "v2"},
    }
    self.assertEqual(expected, actual)

    def key_value_composite(rec):
        return [rec[v] for v in ["id1", "id2"]]

    record_with_composite_key = {"id1": "1", "id2": "2"}
    actual = Dataset._record_to_feature(
        record_with_composite_key, key_value_composite, ["id1", "id2"], "geom"
    )
    expected = {"type": "Feature", "id": ["1", "2"]}
    self.assertEqual(expected, actual)

    record_with_everything = {
        "id1": "1",
        "id2": "2",
        "bbox": [[0, 0], [1, 1]],
        "name": "record with everything",
        "geom": {
            "point": None,
            "multiPoint": None,
            "lineString": None,
            "multiLineString": None,
            "polygon": [[[0, 0], [0, 1], [1, 1], [1, 0], [0, 0]]],
            "multiPolygon": None,
        },
        "alternate_geom": {
            "point": [1, 1],
            "multiPoint": None,
            "lineString": None,
            "multiLineString": None,
            "polygon": None,
            "multiPolygon": None,
        },
    }
    actual = Dataset._record_to_feature(
        record_with_everything, key_value_composite, ["id1", "id2"], "geom"
    )
    expected = {
        "type": "Feature",
        "id": ["1", "2"],
        "bbox": [[0, 0], [1, 1]],
        "properties": {
            "name": "record with everything",
            "alternate_geom": {
                "point": [1, 1],
                "multiPoint": None,
                "lineString": None,
                "multiLineString": None,
                "polygon": None,
                "multiPolygon": None,
            },
        },
        "geometry": {
            "type": "Polygon",
            "coordinates": [[[0, 0], [0, 1], [1, 1], [1, 0], [0, 0]]],
        },
    }
    self.assertEqual(expected, actual)

    record_without_geo = {"id": "1", "prop1": "val1"}
    actual = Dataset._record_to_feature(
        record_without_geo, key_value_single, ["id"], None
    )
    expected = {"type": "Feature", "id": "1", "properties": {"prop1": "val1"}}
    self.assertEqual(expected, actual)
def edit_attributes(
    *,
    dataset: Dataset,
    attribute_types: Optional[Dict[str, attribute_type.AttributeType]] = None,
    attribute_descriptions: Optional[Dict[str, str]] = None,
    override_existing_types: bool = True,
) -> Dataset:
    """Edit existing attributes in a dataset

    The attribute type and/or descriptions can be updated to new values. Attributes
    that will be updated must be in either the attribute_types or
    attribute_descriptions dictionaries or both. The default attribute type will be
    ARRAY STRING. To set non-default attribute types, they must be defined in the
    attribute_types dictionary. Any attribute descriptions can be specified in the
    attribute_descriptions dictionary. If only the attribute_descriptions dictionary
    is defined, the attribute type will not be updated.

    Args:
        dataset: An existing TUC dataset
        attribute_types: dictionary for non-default types, attribute name is the key
            and AttributeType is the value
        attribute_descriptions: dictionary for attribute descriptions, attribute name
            is the key and the attribute description is the value
        override_existing_types: bool flag, when true will alter existing attributes

    Returns:
        Updated Dataset

    Raises:
        requests.HTTPError: If any HTTP error is encountered
        ValueError: If the dataset is not a source dataset
        ValueError: If a passed attribute does not exist in the dataset
        ValueError: If a passed attribute is a primary key and can't be removed
        ValueError: If there are no updates to attributes in attribute_types or
            attribute_descriptions arguments
    """
    dataset_name = dataset.name
    if dataset.upstream_datasets():
        raise ValueError(f"{dataset_name} is not a source dataset")

    # Check description or type changes are passed in
    if attribute_types is None and attribute_descriptions is None:
        raise ValueError(
            "Updates to attributes must be passed in via attribute_types or "
            "attribute_descriptions arguments"
        )

    # Get list of attributes that need updating from attribute_types and
    # attribute_descriptions dictionaries
    attributes = {attr for attr in attribute_types or list()} | {
        attr for attr in attribute_descriptions or list()
    }

    # Get current dataset attributes
    target_attribute_dict = {attr.name: attr for attr in dataset.attributes}
    existing_attributes = target_attribute_dict.keys()
    primary_keys = dataset.spec().to_dict()["keyAttributeNames"]

    # Check that all of the attribute names already exist in dataset
    for attribute_name in attributes:
        if attribute_name not in existing_attributes:
            # This attribute does not exist
            raise ValueError(
                f"An attribute with name '{attribute_name}' does not exist in {dataset_name}"
            )
        elif attribute_name in primary_keys:
            # Can not edit a primary key
            raise ValueError(
                f"The attribute '{attribute_name}' is a primary key and can't be updated"
            )

    # Update attributes in dataset
    for attribute_name in attributes:
        attr_spec_dict = _make_spec_dict(
            attribute_name=attribute_name,
            attribute_types=attribute_types,
            attribute_descriptions=attribute_descriptions,
        )
        existing_attribute_spec = target_attribute_dict[attribute_name].spec()
        if attribute_types is None or attribute_name not in attribute_types:
            new_type_class = attribute_type.from_json(
                existing_attribute_spec.to_dict()["type"]
            )
        else:
            new_type_class = attribute_type.from_json(attr_spec_dict["type"])
        old_type_class = attribute_type.from_json(existing_attribute_spec.to_dict()["type"])

        if new_type_class == old_type_class:
            # Update description
            if (
                attribute_descriptions is not None
                and attribute_name in attribute_descriptions.keys()
            ):
                existing_attribute_spec = existing_attribute_spec.with_description(
                    attribute_descriptions[attribute_name]
                )
                existing_attribute_spec.put()
            else:
                LOGGER.info(
                    f"There are no updates to the attribute '{attribute_name}' "
                    f"in {dataset_name}"
                )
        elif override_existing_types:
            # Update type
            new_attr_spec = existing_attribute_spec.to_dict()
            new_attr_spec["type"] = attr_spec_dict["type"]

            # Update description
            if "description" in attr_spec_dict.keys():
                new_attr_spec["description"] = attr_spec_dict["description"]

            # Remove and add attribute with new spec
            dataset.attributes.delete_by_resource_id(
                target_attribute_dict[attribute_name].resource_id
            )
            dataset.attributes.create(new_attr_spec)
            LOGGER.info(f"Updated attribute '{attribute_name}' in {dataset_name}")
        else:
            LOGGER.info(
                f"The attribute '{attribute_name}' in {dataset_name} currently has "
                f"the type '{str(old_type_class)}'. Set 'override_existing_types' "
                f"to True to update the type to '{str(new_type_class)}'"
            )
    return dataset
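# Example usage for edit_attributes -- a minimal sketch. The dataset and attribute
# names are hypothetical; `tamr` is assumed to be an authenticated tamr_unify_client
# Client and attribute_type comes from the toolbox models.
def _example_edit_attributes(tamr):
    dataset = tamr.datasets.by_name("my_source_dataset")
    return edit_attributes(
        dataset=dataset,
        attribute_types={"total_sales": attribute_type.ARRAY(attribute_type.DOUBLE)},
        attribute_descriptions={"total_sales": "Total sales in USD"},
        override_existing_types=True,
    )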
def create_attributes(
    *,
    dataset: Dataset,
    attributes: Iterable[str],
    attribute_types: Optional[Dict[str, attribute_type.AttributeType]] = None,
    attribute_descriptions: Optional[Dict[str, str]] = None,
) -> Dataset:
    """Create new attributes in a dataset

    The default attribute type will be ARRAY STRING. To set non-default attribute
    types, they must be defined in the attribute_types dictionary. Any attribute
    descriptions can be specified in the attribute_descriptions dictionary.

    Args:
        dataset: An existing TUC dataset
        attributes: list of attribute names to be added to dataset
        attribute_types: dictionary for non-default types, attribute name is the key
            and AttributeType is the value
        attribute_descriptions: dictionary for attribute descriptions, attribute name
            is the key and the attribute description is the value

    Returns:
        Updated Dataset

    Raises:
        requests.HTTPError: If any HTTP error is encountered
        TypeError: If the attributes argument is not an Iterable
        ValueError: If the dataset is a unified dataset
        ValueError: If an attribute passed in already exists in the dataset
    """
    dataset_name = dataset.name
    if dataset.upstream_datasets():
        raise ValueError(f"{dataset_name} is not a source dataset")

    # Check input type is correct
    if not isinstance(attributes, Iterable):
        raise TypeError("attributes arg must be an Iterable")

    # Get current dataset attributes
    existing_attributes = [attr.name for attr in dataset.attributes]

    # Check that none of the new attribute names already exist
    for attribute_name in attributes:
        if attribute_name in existing_attributes:
            # This attribute already exists
            raise ValueError(
                f"An attribute with name '{attribute_name}' already exists in {dataset_name}"
            )

    # Add attributes to dataset
    for attribute_name in attributes:
        attr_spec_dict = _make_spec_dict(
            attribute_name=attribute_name,
            attribute_types=attribute_types,
            attribute_descriptions=attribute_descriptions,
        )
        dataset.attributes.create(attr_spec_dict)
        LOGGER.info(f"Created attribute '{attribute_name}' in {dataset_name}")
    return dataset
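# Example usage for create_attributes -- a minimal sketch. The dataset and attribute
# names are hypothetical; `tamr` is assumed to be an authenticated tamr_unify_client
# Client. "notes" keeps the default ARRAY STRING type; "total_sales" gets a
# non-default type via attribute_types.
def _example_create_attributes(tamr):
    dataset = tamr.datasets.by_name("my_source_dataset")
    return create_attributes(
        dataset=dataset,
        attributes=["notes", "total_sales"],
        attribute_types={"total_sales": attribute_type.ARRAY(attribute_type.DOUBLE)},
        attribute_descriptions={"notes": "Free-text notes"},
    )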
def update(
    dataset: Dataset,
    *,
    attributes: Optional[Iterable[str]] = None,
    attribute_types: Optional[Dict[str, attribute_type.AttributeType]] = None,
    attribute_descriptions: Optional[Dict[str, str]] = None,
    description: Optional[str] = None,
    tags: Optional[List[str]] = None,
    override_existing_types: bool = False,
) -> Dataset:
    """Flexibly update a source dataset in Tamr

    All the attributes that should exist in the dataset must be defined in the
    attributes argument. This function will add/remove attributes in the dataset
    until the dataset attributes match the set of attributes passed in as an
    argument. The default attribute type will be ARRAY STRING. To set non-default
    attribute types, they must be defined in the attribute_types dictionary. Any
    attribute descriptions can be specified in the attribute_descriptions dictionary.
    By default, the existing attribute types will not change unless
    override_existing_types is set to True. When False, the attribute type updates
    will only be logged.

    Args:
        dataset: An existing TUC dataset
        attributes: Complete list of attribute names that should exist in the
            updated dataset
        attribute_types: dictionary for non-default types, attribute name is the key
            and AttributeType is the value
        attribute_descriptions: dictionary for attribute descriptions, attribute name
            is the key and the attribute description is the value
        description: updated description of dataset, if None will not update the
            description
        tags: updated tags for the dataset, if None will not update tags
        override_existing_types: boolean flag, when true will alter existing
            attribute's types

    Returns:
        Updated Dataset

    Raises:
        requests.HTTPError: If any HTTP error is encountered
        ValueError: If the dataset is not a source dataset
        TypeError: If the attributes argument is not an Iterable

    Example:
        >>> import tamr_toolbox as tbox
        >>> from tamr_toolbox.models import attribute_type
        >>> tamr_client = tbox.utils.client.create(**instance_connection_info)
        >>> dataset = tamr_client.datasets.by_name("my_dataset_name")
        >>> tbox.dataset.manage.update(
        >>>     dataset=dataset,
        >>>     attributes=["unique_id", "name", "address", "total_sales"],
        >>>     attribute_types={"total_sales": attribute_type.ARRAY(attribute_type.DOUBLE)},
        >>>     override_existing_types=True,
        >>> )
    """
    dataset_name = dataset.name
    if dataset.upstream_datasets():
        raise ValueError(f"{dataset_name} is not a source dataset")
    primary_keys = dataset.spec().to_dict()["keyAttributeNames"]

    # Check input type is correct
    if attributes and not isinstance(attributes, Iterable):
        raise TypeError("attributes arg must be an Iterable")

    # Update description and tags
    dataset_spec = dataset.spec()
    if description:
        dataset_spec = dataset_spec.with_description(description)
        LOGGER.info(f"Updating description for {dataset_name}")
    if tags:
        dataset_spec = dataset_spec.with_tags(tags)
        LOGGER.info(f"Updating tags for {dataset_name}")
    dataset_spec.put()

    if attributes:
        # Get current dataset attributes
        existing_attributes = [attr.name for attr in dataset.attributes]

        # Update attributes in dataset
        for attribute_name in attributes:
            if attribute_name in primary_keys:
                continue
            elif attribute_name in existing_attributes:
                # This attribute already exists, update to new type
                type_dict = {
                    attribute_name: (attribute_types or dict()).get(
                        attribute_name, attribute_type.DEFAULT
                    )
                }
                desc_dict = {
                    attribute_name: (attribute_descriptions or dict()).get(attribute_name)
                }
                edit_attributes(
                    dataset=dataset,
                    attribute_types=type_dict,
                    attribute_descriptions=desc_dict,
                    override_existing_types=override_existing_types,
                )
            else:
                # This attribute does not already exist, create
                create_attributes(
                    dataset=dataset,
                    attributes=[attribute_name],
                    attribute_types=attribute_types,
                    attribute_descriptions=attribute_descriptions,
                )

        # Remove any attributes from dataset that aren't in the new list of attributes
        for attribute_name in existing_attributes:
            if attribute_name not in attributes and attribute_name not in primary_keys:
                delete_attributes(dataset=dataset, attributes=[attribute_name])

    return dataset
def from_dataset(
    dataset: Dataset,
    *,
    columns: Optional[List[str]] = None,
    flatten_delimiter: Optional[str] = None,
    flatten_columns: Optional[List[str]] = None,
    force_flatten: bool = False,
    nrows: Optional[int] = None,
    allow_dataset_refresh: bool = False,
) -> "pandas.DataFrame":
    """
    Creates a DataFrame from a Tamr Dataset

    Args:
        dataset: Tamr Dataset object
        columns: optional, ordered list of columns to keep
        flatten_delimiter: if set, flatten list types to strings by concatenating
            with this delimiter
        flatten_columns: optional, list of columns to flatten
        force_flatten: if False, arrays with inner types other than string will not
            be flattened. if True, will force all inner types to strings when
            flattening values.
        nrows: number of rows to read. default None will read all rows
        allow_dataset_refresh: if True, allows running a job to refresh dataset to
            make streamable

    Returns:
        DataFrame

    Raises:
        ValueError: if `columns` or `flatten_columns` contain columns that are not
            present in `dataset`
    """
    # This function requires pandas, an optional dependency
    import pandas

    LOGGER.info(
        f"Streaming records to DataFrame for dataset {dataset.name} "
        f"(id={dataset.resource_id})."
    )
    dataset_attrs = [attr for attr in dataset.attributes]
    attr_names = [attr.name for attr in dataset_attrs]

    # check that specified columns exist
    if columns is not None:
        common._check_columns_subset(
            input_list=columns, reference_list=attr_names, raise_error=True
        )

    # checks on columns to flatten
    if flatten_delimiter is not None:
        if flatten_columns is None:
            flatten_columns = list(attr_names)
        else:
            # check that specified columns exist
            common._check_columns_subset(
                input_list=flatten_columns, reference_list=attr_names, raise_error=True
            )
        # check types of flatten_columns
        for attr in dataset_attrs:
            if attr.name not in flatten_columns:
                continue
            attr_type = attr.spec().to_dict()["type"]
            if (
                attr_type["baseType"] == "ARRAY"
                and attr_type["innerType"]["baseType"] != "STRING"
            ):
                if force_flatten:
                    LOGGER.info(
                        f"Will force attribute to string: {attr.name}, with type: {attr_type}"
                    )
                else:
                    LOGGER.warning(
                        f"Will not flatten attribute: {attr.name}, with type: {attr_type}"
                    )
                    flatten_columns.remove(attr.name)

    if not dataset.status().is_streamable:
        if allow_dataset_refresh:
            LOGGER.info(f"Refreshing dataset {dataset.name} to make streamable.")
            op = dataset.refresh()
            operation.enforce_success(op)
        else:
            message = (
                f"Dataset {dataset.name} is not streamable. Refresh it first, or run"
                f" with allow_dataset_refresh=True"
            )
            LOGGER.error(message)
            raise RuntimeError(message)

    # if flattening, set the function to apply to records as _flatten_list
    # otherwise set as _identity
    func = None
    if flatten_delimiter is not None:
        func = partial(common._flatten_list, delimiter=flatten_delimiter, force=force_flatten)

    df = pandas.DataFrame.from_records(
        common._yield_records(
            dataset, func=func, columns=columns, flatten_columns=flatten_columns
        ),
        columns=columns,
        nrows=nrows,
    )
    return df
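# Example usage for the DataFrame loader above -- a minimal sketch. The dataset name
# and column names are hypothetical; `tamr` is assumed to be an authenticated
# tamr_unify_client Client. List values are flattened to pipe-delimited strings.
def _example_dataset_to_dataframe(tamr):
    dataset = tamr.datasets.by_name("my_dataset")
    df = from_dataset(
        dataset,
        columns=["unique_id", "name", "address"],
        flatten_delimiter="|",
        allow_dataset_refresh=True,
    )
    return df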
def test_get_usage(self):
    responses.add(
        responses.GET, f"{self._base_url}/datasets/1/usage", json=self._usage_json
    )
    u = Dataset(self.tamr, self._dataset_json).usage()
    self.assertEqual(u._data, self._usage_json)
def from_dataset(
    dataset: Dataset,
    export_file_path: Union[Path, str],
    *,
    csv_delimiter: str = ",",
    columns: Optional[List[str]] = None,
    flatten_delimiter: str = "|",
    quote_character: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    na_value: str = "NaN",
    nrows: Optional[int] = None,
    allow_dataset_refresh: bool = False,
    buffer_size: int = 10000,
    overwrite: bool = False,
) -> int:
    """
    Export a Tamr Dataset to a csv file. Records are streamed to disk and written
    according to a given buffer size. As a result this is more memory efficient than
    first reading to a pandas.DataFrame and writing to CSV.

    Args:
        dataset: Tamr Dataset object
        export_file_path: Path to the csv file where the dataset will be saved
        csv_delimiter: Delimiter of the csv file
        columns: Optional, Ordered list of columns to write. If None, write all
            columns in arbitrary order.
        flatten_delimiter: Flatten list types to strings by concatenating with this
            delimiter
        quote_character: Character used to escape value for csv delimiter when it
            appears in the value.
        quoting: The escape strategy to use according to the Python csv writer.
            See https://docs.python.org/2/library/csv.html#csv.QUOTE_MINIMAL
        na_value: Value to write that represents empty or missing data.
            See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
            for the na_values supported by default in pandas.read_csv
        nrows: Optional, Number of rows to write. If None, then write all rows.
        allow_dataset_refresh: If True, allows running a job to refresh dataset to
            make streamable. Otherwise a RuntimeError will be thrown if the dataset
            is unstreamable.
        buffer_size: Number of records to store in memory before writing to disk
        overwrite: if True and export_file_name already exists, overwrite the file.
            Otherwise throw an error

    Returns:
        The total number of records written

    Raises:
        FileExistsError: if the csv file to which the dataset is to be streamed
            exists and `overwrite` is False
        RuntimeError: if `dataset` is not streamable and `allow_dataset_refresh` is
            False
        ValueError: if `columns` or `flatten_columns` contain columns that are not
            present in `dataset`
    """
    LOGGER.info(
        f"Streaming records to csv file {export_file_path} from dataset {dataset.name} "
        f"(id={dataset.resource_id})."
    )

    if os.path.exists(export_file_path):
        if not overwrite:
            message = (
                f"CSV file {export_file_path} already exists. "
                f"(Set 'overwrite' flag to True if you wish to overwrite)"
            )
            LOGGER.error(message)
            raise FileExistsError(message)
        else:
            LOGGER.warning(
                f"CSV file {export_file_path} already exists and will be overwritten"
            )

    if csv_delimiter == flatten_delimiter:
        message = (
            f"The CSV delimiter '{csv_delimiter}' cannot be identical to the list "
            f"flattening delimiter '{flatten_delimiter}'"
        )
        LOGGER.error(message)
        raise ValueError(message)

    attribute_names = [attr.name for attr in dataset.attributes]

    # check that specified columns exist
    if columns is not None:
        common._check_columns_subset(
            input_list=columns, reference_list=attribute_names, raise_error=True
        )

    if not dataset.status().is_streamable:
        if allow_dataset_refresh:
            LOGGER.info(f"Refreshing dataset {dataset.name} to make streamable.")
            op = dataset.refresh()
            operation.enforce_success(op)
        else:
            message = (
                f"Dataset {dataset.name} is not streamable. Refresh it first, or "
                f"run with allow_dataset_refresh=True"
            )
            LOGGER.error(message)
            raise RuntimeError(message)

    func = partial(common._flatten_list, delimiter=flatten_delimiter, force=True)

    # Open CSV file and use newline='' as recommended by
    # https://docs.python.org/3/library/csv.html#csv.writer
    with open(export_file_path, "w", newline="") as csv_file:
        csv_writer = csv.writer(
            csv_file,
            delimiter=csv_delimiter,
            quotechar=quote_character,
            quoting=quoting,
        )

        buffer = []
        header = None
        # Set record number to -1 in case the dataset streamed has no records
        record_number = -1
        for record_number, record in enumerate(
            common._yield_records(dataset, func=func, columns=columns)
        ):
            # Obtain and write the header information only on the first pass
            if header is None:
                header = record.keys() if columns is None else columns
                csv_writer.writerow(header)

            # Replace empty values with a specific null value
            # This also allows nulls to be treated differently from empty strings
            record = [na_value if record[k] is None else record[k] for k in header]
            buffer.append(record)

            at_max_buffer = buffer_size is not None and (len(buffer) >= buffer_size)
            at_max_rows = nrows is not None and record_number >= nrows - 1
            if at_max_buffer or at_max_rows:
                csv_writer.writerows(buffer)
                LOGGER.debug(
                    f"Written dataset {dataset.name} up to record {record_number+1}"
                )
                buffer = []
                if at_max_rows:
                    break

        # Write anything remaining
        # This will occur whenever the buffer is non-zero and the number of records
        # is not exactly divisible by the buffer number
        # For example, writing a dataset with 1100 records using a buffer size of 500
        # will write in 3 chunks: 2 x 500 above and the remaining 100 handled here
        if len(buffer) != 0:
            LOGGER.debug(
                f"Written dataset {dataset.name} up to record {record_number + 1}"
            )
            csv_writer.writerows(buffer)

        if record_number == -1:
            # If record number is -1 then no records were streamed, possibly because
            # the dataset has no records. We therefore want to simply save the headers
            if columns is not None:
                csv_writer.writerow(columns)
            else:
                csv_writer.writerow(attribute_names)

    records_written = record_number + 1

    LOGGER.info(
        f"Wrote {records_written} from dataset {dataset.name} (id={dataset.resource_id}) "
        f"to {export_file_path}"
    )

    return records_written
def _yield_records(
    dataset: Dataset,
    *,
    func: Optional[Callable] = None,
    columns: Optional[List] = None,
    flatten_columns: Optional[List] = None,
) -> Iterable:
    """
    Generator function for records from a Tamr Dataset.
    Can optionally apply a flattening function to some or all columns,
    and can optionally keep only a subset of columns from the original records.

    Args:
        dataset: Tamr Dataset
        func: optional, callable function to transform records
        columns: optional, list of columns to keep
        flatten_columns: optional, list of columns to flatten

    Returns:
        iterable over Dataset records

    Raises:
        ValueError: if `flatten_columns` is not None but `func` is None since it
            won't know how to flatten them.
    """
    if flatten_columns is not None and func is None:
        message = (
            f"Columns specified for flattening ('flatten_columns'={flatten_columns}), "
            f"but no flattening function provided for parameter 'func'"
        )
        LOGGER.error(message)
        raise ValueError(message)

    checked_columns = False
    for record in dataset.records():
        if not checked_columns:
            if columns is not None:
                _check_columns_subset(
                    input_list=columns, reference_list=record.keys(), raise_error=True
                )
            if flatten_columns is not None:
                _check_columns_subset(
                    input_list=flatten_columns,
                    reference_list=record.keys(),
                    raise_error=True,
                )
            checked_columns = True

        # Set flatten_columns to all if unspecified
        if func is not None and flatten_columns is None:
            flatten_columns = record.keys()

        output = {}
        for k, v in record.items():
            if columns is not None and k not in columns:
                # remove the column by skipping
                continue
            do_flatten_column = flatten_columns is not None and k in flatten_columns
            if do_flatten_column and func is not None:
                output[k] = func(v)
            else:
                output[k] = v
        yield output
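# Example usage for _yield_records -- a minimal sketch. The join-based flattener
# below is a hypothetical stand-in for the module's own flattening helper, and the
# column names are hypothetical; `dataset` is any streamable Tamr Dataset.
def _example_yield_flat_records(dataset):
    def flatten(value):
        # Join list values into a single pipe-delimited string; pass scalars through
        return "|".join(str(v) for v in value) if isinstance(value, list) else value

    for record in _yield_records(dataset, func=flatten, columns=["unique_id", "name"]):
        print(record)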