def test_validate_invalid_milestone_name(self):
    """Validation must reject a custom milestone field name that is not recognized."""
    tampered = tableschema.Schema(TEST_SCHEMA2.descriptor)
    bad_names = {"10": "bad_milestone_name"}
    tampered.descriptor["fields"][4]["custom_milestone_field_names"] = bad_names
    tampered.commit()
    self.assertFalse(table_schema.validate_schema(tampered))
def simple_pipeline(
    member_id: str,
    row_format: bool,
    multiple_val_delimiter: str,
    data: Dict[str, pd.DataFrame],
    schema: Schema,
    column_mapping: pd.DataFrame,
    source_field_mappings: FieldMappings,
):
    """Simple pipeline to transform Mission Impact data to prepare it for
    upload to the Gateway system.

    Note that log changes are not noops; logs are captured and sent directly
    to users of the web endpoint, as well as saved by the Airflow server, so
    readability of all logs of level INFO and above is critical.

    Parameters
    ----------
    member_id : str
        The organization's Member ID.
    row_format : bool
        Whether the data is organized using the Mission Impact Row Format (if
        this is false, it implies that the data is organized using the
        Mission Impact Column Format).
    multiple_val_delimiter : str
        The separator for multiple values in the dataset.
    data : Dict[str, pd.DataFrame]
        Dictionary of {dataset name -> dataset}.
    schema : Schema
        Mission Impact Table Schema.
    column_mapping : pd.DataFrame
        Column Mapping.
    source_field_mappings : FieldMappings
        Field Mappings.

    Returns
    -------
    type
        Returns the transformed dataset and any resolved field mappings.
    """
    return_val = {}

    # Validate Table Schema.
    table_schema.validate_schema(schema)

    # Validate Column Mappings. On any failure the pipeline stops early,
    # recording the failures plus the id of the email task that should
    # notify the user.
    validation_failures = ColumnMappingValidator(
        schema, row_format).validate(column_mapping)
    if validation_failures:
        logging.error(
            "The pipeline could not finish, because some of your column "
            "mappings are not valid. Please review the following names in "
            "your Column Mappings Google sheet:<br>")
        logging.error(email.format_validation_failures(validation_failures))
        return_val[EMAIL_METADATA_KEY] = validation_failures
        return_val[
            FAILURE_EMAIL_TASK_ID_KEY] = SEND_COLUMN_MAPPING_INVALID_EMAIL_TASK_ID
        return return_val
    column_mapping = ColumnMappingLoader.convert_column_mapping_dataframe_to_dict(
        column_mapping)

    # Validate Field Mappings (one failure dict per source mapping).
    validation_failures: Dict[str, Dict] = FieldMappingValidator(
        schema).validate_multiple(source_field_mappings)
    if validation_failures:
        logging.error("Field mappings are not valid!")
        for _, validation_failure in validation_failures.items():
            logging.error(email.format_validation_failures(validation_failure))
        return_val[EMAIL_METADATA_KEY] = validation_failures
        return_val[
            FAILURE_EMAIL_TASK_ID_KEY] = SEND_FIELD_MAPPING_INVALID_EMAIL_TASK_ID
        return return_val

    # Validate Data Shape.
    validation_failures = DatasetShapeValidator(
        schema, column_mapping, row_format).validate_multiple_dataset_shape(data)
    if validation_failures:
        logging.error("Dataset shape is not valid!")
        for _, validation_failure in validation_failures.items():
            logging.error(email.format_validation_failures(validation_failure))
        return_val[EMAIL_METADATA_KEY] = validation_failures
        return_val[
            FAILURE_EMAIL_TASK_ID_KEY] = SEND_DATA_SHAPE_INVALID_EMAIL_TASK_ID
        return return_val

    # Shape Data.
    shape_transformer: DatasetShapeTransformer = DatasetShapeTransformer(
        member_id, schema, column_mapping, row_format, multiple_val_delimiter)
    # TODO: Move concatenation of multiple datasets into DatasetShapeTransformer
    # Combine all of the datasets into one.
    combined_shaped_dataset: pd.DataFrame = pd.concat(
        [
            shape_transformer.transform_dataset_shape(df)
            for df in data.values()
        ],
        ignore_index=True,
        sort=True,
    )
    combined_shaped_dataset = combined_shaped_dataset.fillna("")

    # Generate Field mappings from the shaped data, then merge with the
    # mappings from the source sheet. overwrite=False keeps existing source
    # mappings; unapproved mappings present only in the source are removed.
    generated_field_mappings: FieldMappings = FieldMappingGenerator(
        schema).generate_mappings_from_dataset(combined_shaped_dataset)
    resolved_field_mappings: FieldMappings = FieldMappingResolver.resolve_mappings(
        generated_field_mappings,
        source_field_mappings,
        overwrite=False,
        remove_unapproved_source_mappings=True,
    )
    return_val[FIELD_MAPPINGS_RETURN_KEY] = resolved_field_mappings

    # Validate Field Mapping Approvals.
    validation_failures: Dict[str, Dict] = FieldMappingApprovalValidator(
    ).validate_multiple(resolved_field_mappings)
    if validation_failures:
        logging.error(
            'The pipeline could not finish, because some of your field '
            'mappings do not have approved values. Most likely, your data '
            'has new responses, which require new mappings. Go to your Field '
            'Mappings Google sheet, and approve the new mappings by toggling '
            '"No" to "Yes" on the following fields:<br>')
        logging.error(
            email.format_unapproved_mappings(resolved_field_mappings))
        # No need to store email metadata, since resolved field mappings are
        # used to generate field mapping approval needed email.
        return_val[
            FAILURE_EMAIL_TASK_ID_KEY] = SEND_FIELD_MAPPING_APPROVAL_EMAIL_TASK_ID
        return return_val

    # Process Data.
    transformed_dataset, invalid_values, dropped_rows = DataProcessor(
        resolved_field_mappings, schema).process(combined_shaped_dataset)
    final_shaped_dataset = GatewayDatasetShapeTransformer(
        schema).transform_dataset_shape(transformed_dataset)

    # Store number of rows in processed data, plus dropped data info.
    logging.warning("<br>" + email.format_successful_upload(
        final_shaped_dataset.shape[0], dropped_rows, invalid_values))
    return_val[EMAIL_METADATA_KEY] = {
        NUM_ROWS_TO_UPLOAD_KEY: final_shaped_dataset.shape[0],
        DROPPED_ROWS_KEY: dropped_rows,
        DROPPED_VALUES_KEY: invalid_values,
    }
    return_val[DATASET_RETURN_KEY] = final_shaped_dataset
    return return_val
def test_validate_numeric_enum_not_int(self):
    """Validation must fail when a numeric-enum field's type is not integer."""
    tampered = tableschema.Schema(TEST_SCHEMA2.descriptor)
    tampered.descriptor["fields"][4]["type"] = "string"
    tampered.commit()
    self.assertFalse(table_schema.validate_schema(tampered))
def test_validate_numeric_enum_missing_value(self):
    """Validation must fail when the maximum constraint leaves an enum value uncovered."""
    tampered = tableschema.Schema(TEST_SCHEMA2.descriptor)
    tampered.descriptor["fields"][4]["constraints"]["maximum"] = 3
    tampered.commit()
    self.assertFalse(table_schema.validate_schema(tampered))
def test_validate_invalid_milestone(self):
    """Validation must reject a milestone number that is out of range."""
    tampered = tableschema.Schema(TEST_SCHEMA2.descriptor)
    tampered.descriptor["fields"][4]["milestones"] = [10]
    tampered.commit()
    self.assertFalse(table_schema.validate_schema(tampered))
def test_validate_valid_schema(self):
    """The unmodified reference schema must pass validation."""
    self.assertTrue(table_schema.validate_schema(TEST_SCHEMA2))