def delete(
    session: Session,
    dataset: Dataset,
    records: Iterable[Dict],
    *,
    primary_key_name: Optional[str] = None,
) -> JsonDict:
    """Deletes the specified records, based on primary key values. Does not check
    that other attribute values match.

    Args:
        dataset: Dataset from which to delete records
        records: The records to delete, as dictionaries
        primary_key_name: The primary key for these records, which must be a key in
            each record dictionary. By default the key_attribute_name of dataset

    Returns:
        JSON response body from server

    Raises:
        requests.HTTPError: If an HTTP error is encountered
        primary_key.NotFound: If `primary_key_name` is not one of the dataset's key
            attribute names, or is missing from a record dictionary
    """
    if primary_key_name is None:
        primary_key_name = dataset.key_attribute_names[0]
    if primary_key_name not in dataset.key_attribute_names:
        raise primary_key.NotFound(
            f"Primary key: {primary_key_name} is not in dataset key attribute names:"
            f" {dataset.key_attribute_names}"
        )
    updates = (
        _delete_command(record, primary_key_name=primary_key_name)
        for record in records
    )
    return _update(session, dataset, updates)
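
# A minimal usage sketch for `delete`, assuming an authenticated `session`, a
# `dataset` fetched from the server, and a key attribute named "id" (the names
# and values here are hypothetical -- adapt them to your instance):
#
#     records = [{"id": "1"}, {"id": "2"}]
#     response = delete(session, dataset, records, primary_key_name="id")
#     assert response["allCommandsSucceeded"]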
def upsert(
    session: Session,
    dataset: Dataset,
    df: "pd.DataFrame",
    *,
    primary_key_name: Optional[str] = None,
) -> JsonDict:
    """Upserts a record for each row of `df` with attributes for each column in `df`.

    Args:
        dataset: Dataset to receive record updates
        df: The DataFrame containing records to be upserted
        primary_key_name: The primary key of the dataset. Must be a column or the
            index of `df`. By default the key_attribute_name of dataset

    Returns:
        JSON response body from the server

    Raises:
        requests.HTTPError: If an HTTP error is encountered
        primary_key.NotFound: If `primary_key_name` is not a column in `df` or the
            index of `df`
        primary_key.Ambiguous: If `primary_key_name` matches both a column in `df`
            and the index of `df`
    """
    if primary_key_name is None:
        primary_key_name = dataset.key_attribute_names[0]

    # preconditions
    _check_primary_key(df, primary_key_name)

    # promote primary key column to index
    if primary_key_name in df.columns:
        df = df.set_index(primary_key_name)

    # serialize records via to_json to handle `np.nan` values
    serialized_records = ((pk, row.to_json()) for pk, row in df.iterrows())
    records = (
        {primary_key_name: pk, **json.loads(row)}
        for pk, row in serialized_records
    )
    return record.upsert(session, dataset, records, primary_key_name=primary_key_name)
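
# A minimal usage sketch for `upsert`, assuming an authenticated `session` and a
# `dataset` whose key attribute is "id" (hypothetical names). The primary key may
# be a column of the DataFrame, as here, or its named index:
#
#     df = pd.DataFrame({"id": ["1", "2"], "name": ["alice", "bob"]})
#     response = upsert(session, dataset, df, primary_key_name="id")
#     assert response["allCommandsSucceeded"]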
def _check_primary_key(df: "pd.DataFrame", primary_key_name: str):
    """Check that the primary key name uniquely identifies a column or the index of
    the DataFrame

    Args:
        df: The DataFrame to inspect
        primary_key_name: The index or column name to be used as the primary key

    Raises:
        primary_key.Ambiguous: If the primary key name matches both the index and a
            column
        primary_key.NotFound: If the primary key name matches neither the index nor
            any column
    """
    if primary_key_name in df.columns and primary_key_name == df.index.name:
        raise primary_key.Ambiguous(
            f"Index {primary_key_name} has the same name as column {primary_key_name}"
        )
    elif primary_key_name not in df.columns and primary_key_name != df.index.name:
        raise primary_key.NotFound(
            f"Primary key: {primary_key_name} is not DataFrame index name:"
            f" {df.index.name} or in DataFrame column names: {df.columns}"
        )
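
# Illustrative behavior of `_check_primary_key` (a sketch; `pd` is pandas):
#
#     df = pd.DataFrame({"id": ["1"]})
#     _check_primary_key(df, "id")       # ok: "id" is a column, index is unnamed
#     _check_primary_key(df, "missing")  # raises primary_key.NotFound
#
#     df.index.name = "id"
#     _check_primary_key(df, "id")       # raises primary_key.Ambiguous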
def create(
    session: Session,
    instance: Instance,
    df: "pd.DataFrame",
    *,
    name: str,
    primary_key_name: Optional[str] = None,
    description: Optional[str] = None,
    external_id: Optional[str] = None,
) -> Dataset:
    """Create a dataset in Tamr from the DataFrame `df`, creating a record for each
    row.

    All attributes other than the primary key are created with the default type
    array(string).

    Args:
        instance: Tamr instance
        df: The DataFrame containing the records to create
        name: Dataset name
        primary_key_name: The primary key of the dataset. Must be a column or the
            index of `df`. By default the name of the index of `df`
        description: Dataset description
        external_id: External ID of the dataset

    Returns:
        Dataset created in Tamr

    Raises:
        primary_key.NotFound: If `primary_key_name` is not a column in `df` or the
            index of `df`, or if no primary key is specified and the index of `df`
            is unnamed
        primary_key.Ambiguous: If `primary_key_name` matches both a column in `df`
            and the index of `df`
        CreationFailure: If a step in creating the dataset fails, for example
            because a dataset with these specifications already exists
        requests.HTTPError: If any other HTTP error is encountered
    """
    # preconditions
    if primary_key_name is None:
        if df.index.name is not None:
            primary_key_name = df.index.name
        else:
            raise primary_key.NotFound(
                "No primary key was specified and DataFrame index is unnamed"
            )
    _check_primary_key(df, primary_key_name)

    # dataset creation
    try:
        ds = dataset.create(
            session,
            instance,
            name=name,
            key_attribute_names=(primary_key_name,),
            description=description,
            external_id=external_id,
        )
    except (TamrClientException, requests.HTTPError) as e:
        raise CreationFailure(f"Dataset was not created: {e}")

    # attribute creation
    for col in df.columns:
        if col == primary_key_name:
            # this attribute already exists as a key attribute
            continue
        try:
            attribute.create(session, ds, name=col, is_nullable=True)
        except (TamrClientException, requests.HTTPError) as e:
            _handle_creation_failure(session, ds, f"An attribute was not created: {e}")

    # record creation
    try:
        response = upsert(session, ds, df, primary_key_name=primary_key_name)
        if not response["allCommandsSucceeded"]:
            _handle_creation_failure(session, ds, "Some records had validation errors")
    except (TamrClientException, requests.HTTPError) as e:
        _handle_creation_failure(session, ds, f"Records could not be created: {e}")

    # get the finished Dataset from the server
    return dataset._dataset._by_url(session, ds.url)
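
# A minimal usage sketch for `create`, assuming an authenticated `session` and a
# Tamr `instance` (hypothetical values). Each non-key column becomes a nullable
# array(string) attribute of the new dataset:
#
#     df = pd.DataFrame({"id": ["1", "2"], "name": ["alice", "bob"]})
#     ds = create(session, instance, df, name="people", primary_key_name="id")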