def test_lake_creator_setup(kube: TestClient):
    workspace = get_workspace()
    notebook_bucket = workspace['ScratchBucket']
    env_name = workspace['env_name']

    # Locate bucket paths
    demo_config = get_demo_configuration(env_name)
    lake_bucket = demo_config.get("LakeBucket").split(':::')[1]
    users_bucket = notebook_bucket.split("/")[2]
    logger.info(f"lake_bucket={lake_bucket}, users_bucket={users_bucket}")

    # Create databases (Athena doesn't support '-' in database or table names)
    database_name = f"cms_raw_db_{env_name}".replace('-', '_')
    create_db(database_name, lake_bucket, 'lake: claims data from cms')
    assert check_database_exists(database_name) == database_name
    create_db('default', lake_bucket)
    assert check_database_exists('default') == 'default'
    create_db('users', users_bucket)
    assert check_database_exists('users') == 'users'

    # Get orbit job parameters
    location = GLUE.get_database(Name=database_name)['Database']['LocationUri']
    bucket = location[5:].split('/')[0]
    logger.info(f"bucket={bucket}, location={location}")
    extracted_prefix = "extracted/"

    # S3 clean-up
    clean_bucket_prefix(bucket, extracted_prefix)
    assert len(get_s3_extracted_files(bucket, extracted_prefix)) == 0
    # sh.run("rm -f /home/jovyan/shared/regression/CREATOR_PASSED")
    clean_bucket_prefix(bucket, database_name)
    assert len(get_s3_extracted_files(bucket, database_name)) == 0
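# For context, a minimal sketch of the create_db / check_database_exists helpers the
# test above relies on, assuming they are thin wrappers around the module-level boto3
# Glue client (GLUE). The exact helpers in this repo may differ; signatures, the
# LocationUri layout, and the exception handling here are assumptions for illustration.
def create_db(name: str, bucket: str, description: str = "") -> None:
    # Create the Glue database only if it does not already exist.
    try:
        GLUE.create_database(
            DatabaseInput={
                "Name": name,
                "Description": description,
                "LocationUri": f"s3://{bucket}/{name}",
            }
        )
    except GLUE.exceptions.AlreadyExistsException:
        logger.info(f"database {name} already exists")


def check_database_exists(name: str) -> str:
    # Return the database name if Glue knows about it, otherwise an empty string.
    try:
        return GLUE.get_database(Name=name)["Database"]["Name"]
    except GLUE.exceptions.EntityNotFoundException:
        return ""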
def get_lake_creator_list_of_files():
    orbit_workspace = get_workspace()
    env_name = orbit_workspace['env_name']
    notebooks_run_config = {
        # Notebook names to skip. Example: ["Example-7-Data-Profiling"]
        "exclusion_list": [
            'Example-3-Ray Job Example',
            'Example-4-Ray Tune Example',
            'Example-92-Delete-DemoCronJobs',
            'Example-1-simple',
            'Example-2-spark',
            'Example-3-gpu',
            'Example-90-Failure-Behavior',
            'Example-6-Schedule-Notebook',
            'Example-8-SDK-Controller-Sched',
        ],
        # If not empty, only these notebooks will run. Example: ["Example-7-Data-Profiling"]
        "inclusion_list": [],
        # Failures of these notebooks are ignored. Example: ["Example-6-Schedule-Notebook", "Example-8-LakeFormation-Security"]
        "optional_list": [],
        # Minimum number of notebooks that must complete for the overall test to pass
        # (only has an effect when it is larger than the number of mandatory notebooks).
        "minimum_successful": 1,
        # Maximum number of attempts to execute a notebook
        "maxRetries": 3,
        # All notebooks selected for execution
        "notebooks_to_run": [],
        # SageMaker notebooks that run with the small profile
        "sagemaker_notebooks_list": [
            "Example-1-xgboost_mnist",
            "Example-2-SageMaker-Batch Transform - breast cancer prediction with high level SDK",
            "Example-5-SageMaker-on-EKS-xgboost_mnist",
        ],
    }

    # When running in an isolated env, extend the exclusion list
    if env_name.endswith('-iso'):
        notebooks_run_config["exclusion_list"].append('Example-91-LakeFormation-Security')
        notebooks_run_config["exclusion_list"].append('Example-5-SageMaker-on-EKS-xgboost_mnist')

    sample_notebooks_path = "../../samples/notebooks"
    analyst_folders = ["B-DataAnalyst", "I-Image", "H-Model-Development"]
    notebook_file_path = []

    # List specific folders for analyst notebooks
    for folder in analyst_folders:
        logger.info(f"Reading folder={folder}")
        notebooks = [str(nb) for nb in Path(f"{sample_notebooks_path}/{folder}").glob("*.ipynb")]
        notebook_file_path += notebooks

    sorted_notebook_paths = sorted(notebook_file_path)
    for p in sorted_notebook_paths:
        parts = p.split('/')
        nb_file_name, nb_folder = parts[-1], parts[-2]
        nb_name = nb_file_name.split('.')[0]
        logger.info(f"nb_folder={nb_folder}/nb_name={nb_name}")
        # exclusion_list has the highest priority and overrides inclusion_list
        if nb_name in notebooks_run_config["exclusion_list"]:
            logger.info(f"Ignoring notebook={nb_name}")
            continue
        # Run the notebook if the inclusion list is empty or the notebook is in it
        if not notebooks_run_config["inclusion_list"] or nb_name in notebooks_run_config["inclusion_list"]:
            if nb_folder in ["H-Model-Development"]:
                notebooks_run_config["notebooks_to_run"].append(
                    {"folder": nb_folder, "name": nb_file_name, "profile": "small"})
            else:
                notebooks_run_config["notebooks_to_run"].append({"folder": nb_folder, "name": nb_file_name})
    return notebooks_run_config
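# A minimal usage sketch of the configuration returned above: iterate over
# notebooks_to_run and submit each notebook, retrying up to maxRetries times.
# run_notebook is a hypothetical helper named only for illustration; the real
# regression harness may drive execution differently.
def run_selected_notebooks() -> None:
    config = get_lake_creator_list_of_files()
    for nb in config["notebooks_to_run"]:
        profile = nb.get("profile", "default")
        for attempt in range(1, config["maxRetries"] + 1):
            logger.info(f"Running {nb['folder']}/{nb['name']} (profile={profile}, attempt={attempt})")
            try:
                run_notebook(folder=nb["folder"], name=nb["name"], profile=profile)  # hypothetical helper
                break
            except Exception as e:
                logger.warning(f"Attempt {attempt} failed for {nb['name']}: {e}")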
def update_teamspace_lakeformation_permissions(db_name: Optional[str] = "*") -> None:
    """
    Scans the tables of the provided database. Based on the security selector of the
    given Team Space and the current column tags, permissions are updated to allow
    access to the permitted columns.

    Parameters
    ----------
    db_name : str, optional
        Name of the database for which to update permissions.

    Returns
    -------
    None
        None.

    Example
    -------
    >>> import aws_orbit_sdk.glue_catalog as glue
    >>> glue.update_teamspace_lakeformation_permissions(database_name)
    """
    workspace = get_workspace()
    lambda_client = boto3.client("lambda")
    inp = {
        "env_name": workspace["env_name"],
        "team_space": workspace["team_space"],
        "db_name": db_name,
        "role_arn": workspace["EksPodRoleArn"],
    }
    payload = json.dumps(inp)
    response = lambda_client.invoke(
        FunctionName=f"orbit-{workspace['env_name']}-authorize_lake_formation_for_role",
        InvocationType="RequestResponse",
        LogType="Tail",
        Payload=bytes(payload, "utf-8"),
    )
    if response["ResponseMetadata"]["HTTPStatusCode"] == 200:
        response_payload = json.loads(response["Payload"].read().decode("utf-8"))
        if "errorMessage" in response_payload:
            raise Exception(response_payload["errorMessage"])
    print("Lakeformation permissions have been updated")
def get_sample_data(self, database: str, table: str, sample: int, field: str, direction: str):
    workspace = get_workspace()
    logger.info(f"query staging location: {workspace['ScratchBucket']}/athena/query/")
    conn = pyathena.connect(
        s3_staging_dir=f"{workspace['ScratchBucket']}/athena/query/",
        region_name=workspace["region"],
    )
    if field and len(field) > 0:
        # Sort by the requested field and direction (e.g. "asc" or "desc")
        query = f'SELECT * FROM "{database}"."{table}" order by {field} {direction} LIMIT {sample}'
    else:
        query = f'SELECT * FROM "{database}"."{table}" LIMIT {sample}'
    df = pd.read_sql(query, conn)
    result = df.to_json(orient="records")
    return result
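# A brief usage sketch for get_sample_data above. The owning class is not shown in this
# section, so "catalog" below is a hypothetical instance of it, and "updated_at" is a
# placeholder column; the call itself mirrors the method signature and returns the
# sampled rows as a JSON string of records.
def preview_table(catalog, database: str, table: str) -> pd.DataFrame:
    # Fetch the 10 most recent rows and load them back into a DataFrame for display.
    rows_json = catalog.get_sample_data(
        database=database, table=table, sample=10, field="updated_at", direction="desc"
    )
    return pd.read_json(rows_json, orient="records")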
def get(self):
    global DATA
    self.log.info(f"GET - {self.__class__}")
    if "MOCK" not in os.environ or os.environ["MOCK"] == "0":
        DATA = get_workspace()
        cluster_name = "orbit-" + DATA["env_name"]
        eks_nodegroups = controller.get_nodegroups(cluster_name=cluster_name)
        self.log.debug(f"eks_nodegroups={eks_nodegroups}")
        if "MOCK" in os.environ:
            # MOCK=0: fetch live data and record it as mockup data
            path = f"{Path(__file__).parent.parent.parent}/test/mockup/compute-eks.json"
            self.log.info(f"writing mockup data to {path}")
            with open(path, "w") as outfile:
                json.dump(eks_nodegroups, outfile, indent=4)
    else:
        # MOCK enabled: serve previously recorded mockup data
        path = f"{Path(__file__).parent.parent.parent}/test/mockup/compute-eks.json"
        with open(path) as f:
            eks_nodegroups = json.load(f)
    self.finish(self._dump(eks_nodegroups))
def get(self):
    global DATA
    self.log.info(f"GET - {self.__class__}")
    if "MOCK" not in os.environ or os.environ["MOCK"] == "0":
        DATA = get_workspace()
        # Hide some details
        if "Elbs" in DATA:
            del DATA["Elbs"]
        if "Plugins" in DATA:
            del DATA["Plugins"]
        if "MOCK" in os.environ:
            # MOCK=0: fetch live data and record it as mockup data
            path = f"{Path(__file__).parent.parent.parent}/test/mockup/team.json"
            self.log.info(f"writing mockup data to {path}")
            with open(path, "w") as outfile:
                json.dump(DATA, outfile, indent=4)
    else:
        # MOCK enabled: serve previously recorded mockup data
        path = f"{Path(__file__).parent.parent.parent}/test/mockup/team.json"
        with open(path) as f:
            DATA = json.load(f)
    self.finish(self._dump(DATA))
# Initialize parameters
logging.basicConfig(
    format="%(asctime)s %(levelname)-8s %(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger()
glue = boto3.client("glue")
sns = boto3.client("sns")
s3 = boto3.client("s3")

# Output path and other parameters for the reports
notebook_name = "Automated-Data-Transformations.ipynb"
workspace = get_workspace()
team_space = workspace["team_space"]
env_name = workspace["env_name"]
source_path = "$ORBIT_TRANSFORMATION_NOTEBOOKS_ROOT"
base_path = "orbit/profiling"
logger.info(f"Team space: {team_space}, Environment name: {env_name}")


def create_tasks(glue_tables: Dict[str, Any], target_folder: str, database: str,
                 samplingRatio: float) -> List[Dict[str, Any]]:
    """
    Creating a data profiling task for each Glue table in the database.

    Parameters
    ----------
    glue_tables : list
def run_crawler(crawler: str, target_db: str, target_path: str, wait: Optional[bool] = True) -> str:
    """
    Starts a Glue crawler for the given path and creates tables in the target database
    based on the data found there. The call can wait until the crawler is done and the
    tables are created.

    Parameters
    ----------
    crawler : str
        A unique name for the crawler.
    target_db : str
        The name of the target database.
    target_path : str
        The S3 path where the data for the table resides.
    wait : bool, optional
        If True, wait until the crawler has finished.

    Returns
    -------
    state : str
        The state of the crawler after it has finished running and creating tables.

    Example
    -------
    >>> import aws_orbit_sdk.glue_catalog as glue
    >>> response = glue.run_crawler(crawler, target_db, target_path, wait=True)
    """
    role = get_workspace()["EksPodRoleArn"]
    glue = boto3.client("glue")
    try:
        glue.delete_crawler(Name=crawler)
        logger.info("existing crawler deleted")
    except Exception as e:
        error = str(e)
        if "EntityNotFoundException" not in error:
            logger.error(error)
    response = glue.create_crawler(
        Name=crawler,
        Role=role,
        DatabaseName=target_db,
        Targets={"S3Targets": [{"Path": target_path}]},
    )
    state = response["ResponseMetadata"]["HTTPStatusCode"]
    if state != 200:
        raise Exception("Failed to create crawler")
    glue.start_crawler(Name=crawler)
    logger.info("Crawler started...")
    state = "INIT"
    while state != "READY":
        response = glue.get_crawler(Name=crawler)
        state = response["Crawler"]["State"]
        if not wait:
            return state
        if state == "READY":
            # Done; avoid sleeping one more minute after the crawler has finished
            break
        logger.info(f"Crawler in state: {state}, waiting a minute...")
        time.sleep(60)
    response = glue.get_crawler_metrics(CrawlerNameList=[crawler])
    if "CrawlerMetricsList" not in response or "TablesCreated" not in response["CrawlerMetricsList"][0]:
        raise Exception("Crawler failed to create table")
    stats = response["CrawlerMetricsList"][0]
    logger.info(stats)
    logger.info("Crawler finished creating table")
    return state
def get_connection_to_athena(
    self,
    DbName: str,
    region_name: Optional[str] = None,
    S3QueryResultsLocation: Optional[str] = None,
) -> Dict[str, Union[str, sa.engine.Engine]]:
    """
    Connect Athena to an existing database.

    Parameters
    ----------
    DbName : str
        Name of the Glue database.
    region_name : str, optional
        The region to connect to Athena. The workspace default region is used if None.
    S3QueryResultsLocation : str, optional
        The S3 location where query results are stored. The team's scratch bucket is
        used if None.

    Returns
    -------
    db_url : str
        A SQLAlchemy connection string.
    engine : sqlalchemy.engine.Engine
        A SQLAlchemy engine.

    Example
    -------
    >>> from aws.utils.notebooks.database import AthenaUtils
    >>> from sqlalchemy.engine import create_engine
    >>> from aws.utils.notebooks.common import get_workspace
    >>> (db_url, engine) = AthenaUtils.get_connection_to_athena(
    ...     DbName=glue_db,
    ...     region_name=my_region,
    ...     S3QueryResultsLocation=results_location)
    """
    workspace = get_workspace()
    if region_name is None:
        region_name = workspace["region"]
    if S3QueryResultsLocation is None:
        S3QueryResultsLocation = f"{workspace['ScratchBucket']}/athena"

    template_con_str = (
        "awsathena+rest://athena.{region_name}.amazonaws.com:443/"
        "{schema_name}?s3_staging_dir={s3_staging_dir}"
    )
    conn_str = template_con_str.format(
        region_name=region_name,
        schema_name=DbName,
        s3_staging_dir=quote_plus(S3QueryResultsLocation),
    )
    engine = create_engine(conn_str)
    self.db_url = conn_str
    self.current_engine = engine
    self.db_class = "athena"
    return {
        "db_url": self.db_url,
        "engine": self.current_engine,
    }
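# A short usage sketch of the connection returned above: the SQLAlchemy engine can be
# handed straight to pandas to run an Athena query. The instance, database, and table
# names below are placeholders.
# >>> con = athena_utils.get_connection_to_athena(DbName="cms_raw_db_myenv")
# >>> df = pd.read_sql('SELECT * FROM claims LIMIT 10', con["engine"])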