def test_passing_overall_iam_action_override(self):
    """
    test_passing_overall_iam_action_override: iam:CreateAccessKey is listed as
    "Write" in the AWS docs but as "Permissions management" in the overrides
    file; the override must win.
    """
    iam_overrides = get_action_access_level_overrides_from_yml("iam")
    actual_level = determine_access_level_override(
        "iam",
        "CreateAccessKey",
        "Write",
        iam_overrides,
    )
    self.assertEqual(actual_level, "Permissions management")
def create_database(destination_directory, access_level_overrides_file):
    """
    Create the JSON Data source that holds the IAM data.

    Every regular file named ``list_*`` under BUNDLED_HTML_DIRECTORY_PATH is
    parsed with BeautifulSoup. For each page that has a ``main-content``
    element, the actions, resource-type and condition-key tables are scraped
    into ``schema[service_prefix]`` (keys: ``service_name``, ``prefix``,
    ``service_authorization_url``, ``privileges``, ``resources``,
    ``conditions``). The result is written as indented JSON to
    ``<destination_directory>/iam-definition.json``.

    :param destination_directory: Directory that receives ``iam-definition.json``;
        a ``docs`` sub-directory is created inside it if missing.
    :param access_level_overrides_file: The path to the file that we use for
        overriding access levels that are incorrect in the AWS documentation
    :return: None
    :raises Exception: if a row in a resource or condition-key table does not
        have exactly three cells.
    """
    # Create the docs directory if it doesn't exist
    Path(os.path.join(destination_directory, "docs")).mkdir(parents=True, exist_ok=True)

    # This holds the entire IAM definition
    schema = {}

    # Sorted so that pages are always processed in the same order.
    file_list = sorted(
        name
        for name in os.listdir(BUNDLED_HTML_DIRECTORY_PATH)
        if os.path.isfile(os.path.join(BUNDLED_HTML_DIRECTORY_PATH, name))
    )

    service_authorization_url_prefix = "https://docs.aws.amazon.com/service-authorization/latest/reference"

    for filename in file_list:
        if not filename.startswith("list_"):
            continue

        with open(os.path.join(BUNDLED_HTML_DIRECTORY_PATH, filename), "r") as f:
            soup = BeautifulSoup(f.read(), "html.parser")

        main_content = soup.find(id="main-content")
        if main_content is None:
            continue

        # Get service name
        title_element = main_content.find("h1", class_="topictitle")
        title = re.sub(
            ".*Actions, resources, and condition Keys for *",
            "",
            str(title_element.text),
            flags=re.IGNORECASE,
        )
        title = title.replace("</h1>", "")
        service_name = chomp(title)

        # The prefix lives in a <code class="code"> element of the first
        # sibling of the title whose markup mentions "prefix".
        service_prefix = ""
        for c in title_element.parent.children:
            if "prefix" in str(c):
                service_prefix = str(c).split('<code class="code">')[1]
                service_prefix = chomp(service_prefix.split("</code>")[0])
                break

        # The URL to that service's Actions, Resources, and Condition Keys page
        service_authorization_url = f"{service_authorization_url_prefix}/{filename}"
        schema[service_prefix] = {
            "service_name": service_name,
            "prefix": service_prefix,
            "service_authorization_url": service_authorization_url,
            "privileges": {},
            "resources": {},
            "conditions": {},
        }

        access_level_overrides_cfg = get_action_access_level_overrides_from_yml(
            service_prefix, access_level_overrides_file)

        tables = main_content.find_all("div", class_="table-contents")

        for table in tables:
            # There can be 3 tables, the actions table, an ARN table, and a condition key table
            # Example: https://docs.aws.amazon.com/IAM/latest/UserGuide/list_awssecuritytokenservice.html
            if not header_matches("actions", table) or not header_matches(
                    "description", table):
                continue

            rows = table.find_all("tr")
            row_number = 0
            while row_number < len(rows):
                row = rows[row_number]
                cells = row.find_all("td")
                if len(cells) == 0:
                    # Skip the header row, which has th, not td cells
                    row_number += 1
                    continue

                if len(cells) != 6:
                    # Sometimes the privilege contains Scenarios, and I don't know how to handle this.
                    # Stop processing the remaining rows of this table.
                    break

                # See if this cell spans multiple rows
                rowspan = 1
                if "rowspan" in cells[0].attrs:
                    rowspan = int(cells[0].attrs["rowspan"])

                priv = ""
                # Reset per row so a cell without any <a> tag can neither raise
                # NameError nor inherit the previous row's link.
                api_documentation_link = None
                # Get the privilege
                for link in cells[0].find_all("a"):
                    if "href" not in link.attrs:
                        # Skip the <a id='...'> tags
                        api_documentation_link = None
                        continue
                    api_documentation_link = link.attrs.get('href')
                    logger.debug(api_documentation_link)
                    priv = chomp(link.text)
                if priv == "":
                    priv = chomp(cells[0].text)
                action_name = priv
                description = chomp(cells[1].text)
                access_level = chomp(cells[2].text)

                # Access Level #####
                # access_level_overrides_cfg will only be true if the service in question is present
                # in the overrides YML file
                if access_level_overrides_cfg:
                    override_result = determine_access_level_override(
                        service_prefix,
                        action_name,
                        access_level,
                        access_level_overrides_cfg,
                    )
                    if override_result:
                        access_level = override_result
                        logger.debug(
                            "Override: Setting access level for %s:%s to %s",
                            service_prefix,
                            action_name,
                            access_level,
                        )

                resource_types = {}
                # In the first row of a privilege the resource columns start at
                # index 3; in the continuation rows of a rowspan they start at 0.
                resource_cell = 3

                while rowspan > 0:
                    if len(cells) == 3 or len(cells) == 6:
                        # ec2:RunInstances contains a few "scenarios" which start in the
                        # description field, len(cells) is 5.
                        # I'm ignoring these as I don't know how to handle them.
                        # These include things like "EC2-Classic-InstanceStore" and
                        # "EC2-VPC-InstanceStore-Subnet"
                        resource_type = chomp(cells[resource_cell].text)

                        condition_keys_element = cells[resource_cell + 1]
                        condition_keys = []
                        if condition_keys_element.text != "":
                            for key_element in condition_keys_element.find_all("p"):
                                condition_keys.append(chomp(key_element.text))

                        dependent_actions_element = cells[resource_cell + 2]
                        dependent_actions = []
                        if dependent_actions_element.text != "":
                            for action_element in dependent_actions_element.find_all("p"):
                                dependent_actions.append(chomp(action_element.text))

                        # A "*" in the resource type marks it as required.
                        if "*" in resource_type:
                            required = True
                            resource_type = resource_type.strip("*")
                        else:
                            required = False

                        resource_types[resource_type] = {
                            "resource_type": resource_type,
                            "required": required,
                            "condition_keys": condition_keys,
                            "dependent_actions": dependent_actions,
                        }
                    rowspan -= 1
                    if rowspan > 0:
                        row_number += 1
                        resource_cell = 0
                        row = rows[row_number]
                        cells = row.find_all("td")

                if "[permission only]" in priv:
                    priv = priv.split(" ")[0]

                privilege_schema = {
                    "privilege": priv,
                    "description": description,
                    "access_level": access_level,
                    "resource_types": resource_types,
                    "api_documentation_link": api_documentation_link
                }

                schema[service_prefix]["privileges"][priv] = privilege_schema
                row_number += 1

        # Get resource table
        for table in tables:
            if not header_matches("resource types", table) or not header_matches(
                    "arn", table):
                continue

            rows = table.find_all("tr")
            for row in rows:
                cells = row.find_all("td")

                if len(cells) == 0:
                    # Skip the header row, which has th, not td cells
                    continue

                if len(cells) != 3:
                    raise Exception(
                        "Unexpected number of resource cells {} in {}".
                        format(len(cells), filename))

                resource = chomp(cells[0].text)
                arn = no_white_space(cells[1].text)
                conditions = []
                for condition in cells[2].find_all("p"):
                    conditions.append(chomp(condition.text))

                schema[service_prefix]["resources"][resource] = {
                    "resource": resource,
                    "arn": arn,
                    "condition_keys": conditions
                }

        # Get condition keys table
        for table in tables:
            if not (header_matches("<th> condition keys </th>", table)
                    and header_matches("<th> type </th>", table)):
                continue

            rows = table.find_all("tr")
            for row in rows:
                cells = row.find_all("td")

                if len(cells) == 0:
                    # Skip the header row, which has th, not td cells
                    continue

                if len(cells) != 3:
                    raise Exception(
                        "Unexpected number of condition cells {} in {}".
                        format(len(cells), filename))

                condition = no_white_space(cells[0].text)
                description = chomp(cells[1].text)
                value_type = chomp(cells[2].text)

                schema[service_prefix]["conditions"][condition] = {
                    "condition": condition,
                    "description": description,
                    "type": value_type,
                }

    iam_definition_file = os.path.join(destination_directory, "iam-definition.json")
    with open(iam_definition_file, "w") as file:
        json.dump(schema, file, indent=4)
    # The path must go through a %s placeholder; without it logging cannot
    # format the record and the path is never printed.
    logger.info("Wrote IAM definition file to path: %s", iam_definition_file)
def create_database(destination_directory, access_level_overrides_file):
    """
    Create the JSON Data source that holds the IAM data.

    Every regular file named ``list_*`` under BUNDLED_HTML_DIRECTORY_PATH is
    parsed with BeautifulSoup. For each page that has a ``main-content``
    element a service entry (``service_name``, ``prefix``, ``privileges``,
    ``resources``, ``conditions``) is built from the actions, resource-type
    and condition-key tables. The entries are sorted by ``prefix`` and written
    as indented JSON to ``<destination_directory>/iam-definition.json``.

    :param destination_directory: Directory that receives ``iam-definition.json``;
        a ``data/docs`` sub-directory is created inside it if missing.
    :param access_level_overrides_file: The path to the file that we use for
        overriding access levels that are incorrect in the AWS documentation
    :return: None
    :raises Exception: if an actions-table row does not have six cells, or a
        resource / condition-key row does not have three cells.
    """
    # Create the docs directory if it doesn't exist
    Path(os.path.join(destination_directory, "data", "docs")).mkdir(parents=True, exist_ok=True)

    schema = []

    html_files = [
        name for name in os.listdir(BUNDLED_HTML_DIRECTORY_PATH)
        if os.path.isfile(os.path.join(BUNDLED_HTML_DIRECTORY_PATH, name))
    ]

    for filename in html_files:
        if not filename.startswith("list_"):
            continue

        with open(os.path.join(BUNDLED_HTML_DIRECTORY_PATH, filename), "r") as html_file:
            soup = BeautifulSoup(html_file.read(), "html.parser")

        main_content = soup.find(id="main-content")
        if main_content is None:
            continue

        # Get service name
        title_element = main_content.find("h1", class_="topictitle")
        title = re.sub(".*Actions, Resources, and Condition Keys for *", "",
                       str(title_element.text))
        title = title.replace("</h1>", "")
        service_name = chomp(title)

        # The prefix lives in a <code class="code"> element of the first
        # sibling of the title whose markup mentions "prefix".
        prefix = ""
        for c in title_element.parent.children:
            if "prefix" in str(c):
                prefix = str(c).split('<code class="code">')[1]
                prefix = chomp(prefix.split("</code>")[0])
                break

        service_schema = {
            "service_name": service_name,
            "prefix": prefix,
            "privileges": [],
            "resources": [],
            "conditions": [],
        }

        access_level_overrides_cfg = get_action_access_level_overrides_from_yml(
            prefix, access_level_overrides_file)

        tables = main_content.find_all("div", class_="table-contents")

        for table in tables:
            # There can be 3 tables, the actions table, an ARN table, and a condition key table
            # Example: https://docs.aws.amazon.com/IAM/latest/UserGuide/list_awssecuritytokenservice.html
            headers = [chomp(str(x)) for x in table.find_all("th")]
            if "<th> Actions </th>" not in headers:
                continue

            rows = table.find_all("tr")
            row_number = 0
            while row_number < len(rows):
                row = rows[row_number]
                cells = row.find_all("td")
                if len(cells) == 0:
                    # Skip the header row, which has th, not td cells
                    row_number += 1
                    continue

                if len(cells) != 6:
                    # Sometimes the privilege might span multiple rows.
                    # Example: amazonroute53-DisassociateVPCFromHostedZone
                    # We should be handling this, but if we are not, then bail
                    raise Exception("Unexpected format in {}: {}".format(
                        prefix, row))

                # See if this cell spans multiple rows
                rowspan = 1
                if "rowspan" in cells[0].attrs:
                    rowspan = int(cells[0].attrs["rowspan"])

                priv = ""
                # Get the privilege
                for link in cells[0].find_all("a"):
                    if "href" not in link.attrs:
                        # Skip the <a id='...'> tags
                        continue
                    priv = chomp(link.text)
                if priv == "":
                    priv = chomp(cells[0].text)

                service_prefix = prefix
                action_name = priv
                description = chomp(cells[1].text)
                access_level = chomp(cells[2].text)

                # Access Level #####
                # access_level_overrides_cfg will only be true if the service in question is present
                # in the overrides YML file
                if access_level_overrides_cfg:
                    override_result = determine_access_level_override(
                        service_prefix,
                        action_name,
                        access_level,
                        access_level_overrides_cfg,
                    )
                    if override_result:
                        access_level = override_result
                        logger.debug(
                            "Override: Setting access level for %s:%s to %s",
                            service_prefix,
                            action_name,
                            access_level,
                        )

                resource_types = []
                # In the first row of a privilege the resource columns start at
                # index 3; in the continuation rows of a rowspan they start at 0.
                resource_cell = 3

                while rowspan > 0:
                    if len(cells) == 3 or len(cells) == 6:
                        # ec2:RunInstances contains a few "scenarios" which start in the
                        # description field, len(cells) is 5.
                        # I'm ignoring these as I don't know how to handle them.
                        # These include things like "EC2-Classic-InstanceStore" and
                        # "EC2-VPC-InstanceStore-Subnet"
                        resource_type = chomp(cells[resource_cell].text)

                        condition_keys_element = cells[resource_cell + 1]
                        condition_keys = []
                        if condition_keys_element.text != "":
                            for key_element in condition_keys_element.find_all("p"):
                                condition_keys.append(chomp(key_element.text))

                        dependent_actions_element = cells[resource_cell + 2]
                        dependent_actions = []
                        if dependent_actions_element.text != "":
                            for action_element in dependent_actions_element.find_all("p"):
                                dependent_actions.append(chomp(action_element.text))

                        resource_types.append({
                            "resource_type": resource_type,
                            "condition_keys": condition_keys,
                            "dependent_actions": dependent_actions,
                        })
                    rowspan -= 1
                    if rowspan > 0:
                        row_number += 1
                        resource_cell = 0
                        row = rows[row_number]
                        cells = row.find_all("td")

                if "[permission only]" in priv:
                    priv = priv.split(" ")[0]

                privilege_schema = {
                    "privilege": priv,
                    "description": description,
                    "access_level": access_level,
                    "resource_types": resource_types,
                }

                service_schema["privileges"].append(privilege_schema)
                row_number += 1

        # Get resource table
        for table in tables:
            headers = [chomp(str(x)) for x in table.find_all("th")]
            if "<th> Resource Types </th>" not in headers:
                continue

            rows = table.find_all("tr")
            for row in rows:
                cells = row.find_all("td")

                if len(cells) == 0:
                    # Skip the header row, which has th, not td cells
                    continue

                if len(cells) != 3:
                    raise Exception(
                        "Unexpected number of resource cells {} in {}".
                        format(len(cells), filename))

                resource = chomp(cells[0].text)
                arn = no_white_space(cells[1].text)
                conditions = []
                for condition in cells[2].find_all("p"):
                    conditions.append(chomp(condition.text))

                service_schema["resources"].append({
                    "resource": resource,
                    "arn": arn,
                    "condition_keys": conditions
                })

        # Get condition keys table
        for table in tables:
            # Computed once per table instead of once per header test.
            headers = [chomp(str(x)) for x in table.find_all("th")]
            if ("<th> Condition Keys </th>" not in headers
                    or "<th> Type </th>" not in headers):
                continue

            rows = table.find_all("tr")
            for row in rows:
                cells = row.find_all("td")

                if len(cells) == 0:
                    # Skip the header row, which has th, not td cells
                    continue

                if len(cells) != 3:
                    raise Exception(
                        "Unexpected number of condition cells {} in {}".
                        format(len(cells), filename))

                condition = no_white_space(cells[0].text)
                description = chomp(cells[1].text)
                value_type = chomp(cells[2].text)

                service_schema["conditions"].append({
                    "condition": condition,
                    "description": description,
                    "type": value_type,
                })

        schema.append(service_schema)

    schema.sort(key=lambda x: x["prefix"])

    iam_definition_file = os.path.join(destination_directory, "iam-definition.json")
    with open(iam_definition_file, "w") as file:
        json.dump(schema, file, indent=4)
    # The path must go through a %s placeholder; without it logging cannot
    # format the record and the path is never printed.
    logger.info("Wrote IAM definition file to path: %s", iam_definition_file)
def build_action_table(db_session, service, access_level_overrides_file):
    """
    Builds the action table in the SQLite database.

    See the first Table on any service-specific page in the Actions, Resources, and Condition Keys documentation.
    That information is scraped, parsed, and stored in the SQLite database using this function.

    For every table returned by ``get_html`` that has both an ``Actions`` and
    an ``Access Level`` column, one ``ActionTable`` row is added per data row.
    Rows whose resource type has no matching ``ArnTable`` entry are skipped.

    :param db_session: Database session object
    :param service: AWS Service to query. This can be called in a loop or for a single service (see connect_db function above).
    :param access_level_overrides_file: The path to the file that we use for overriding access levels that are incorrect in the AWS documentation
    """

    def _comma_separate(cell):
        # None stays None; a cell that holds several entries is converted with
        # the same helper that is used for condition keys; a single entry is
        # stored unchanged.
        if cell is None:
            return None
        if ' ' in cell:
            return get_comma_separated_condition_keys(cell)
        return cell

    directory = os.path.abspath(os.path.dirname(__file__)) + '/data/docs/'
    html_list = get_html(directory, service)
    access_level_overrides_cfg = get_action_access_level_overrides_from_yml(
        service, access_level_overrides_file)

    for df_list in html_list:
        for df in df_list:  # pylint: disable=invalid-name
            # Only the Actions table is handled here; the Resource Types / ARN
            # table and any other table are ignored.
            if not ('Actions' in df and 'Access Level' in df):
                continue

            table = json.loads(df.to_json(orient='split'))

            for row in table['data']:
                resource_cell = row[3]

                # If the cell is blank, that indicates it needs wildcard
                if resource_cell is None:
                    resource_type_name = 'None'
                    resource_type_name_append_wildcard = 'False'
                    resource_arn_format = '*'
                # Check if resource type name has wildcard suffix - i.e., parameter* instead of parameter
                # If it does, set the append_wildcard flag to true,
                # and set the resource name to that but without the
                # wildcard to make searching easier
                elif '*' in resource_cell:
                    resource_type_name = resource_cell[:-1]
                    resource_type_name_append_wildcard = 'True'
                    first_result = db_session.query(ArnTable.raw_arn).filter(
                        and_(ArnTable.service.ilike(service),
                             ArnTable.resource_type_name.like(resource_type_name))
                    ).first()
                    try:
                        resource_arn_format = first_result.raw_arn
                    # For EC2 RunInstances, ResourceTypes have some duplicates.
                    # The Resource Types (*required) column has duplicates
                    # and the Access Level has `nan`
                    except AttributeError:
                        # Presumably first_result is None when no ARN row
                        # matched; skip this action row.
                        continue
                else:
                    resource_type_name = resource_cell
                    resource_type_name_append_wildcard = 'False'
                    first_result = db_session.query(ArnTable.raw_arn).filter(
                        ArnTable.service.ilike(service),
                        ArnTable.resource_type_name.like(resource_cell)
                    ).first()
                    try:
                        if '*' in first_result.raw_arn:
                            resource_arn_format = first_result.raw_arn[:-1]
                        else:
                            resource_arn_format = first_result.raw_arn
                    except AttributeError:
                        # Presumably no matching ARN row; skip this action row.
                        continue

                # For lambda:InvokeFunction, the cell is 'lambda:InvokeFunction [permission only]'.
                # Keep only the text before the first space; partition returns
                # the whole string when there is no space.
                action_name = row[0].partition(' ')[0]

                # Access Level #####
                # access_level_overrides_cfg will only be true if the service in question is present
                # in the overrides YML file
                access_level = row[2]
                if access_level_overrides_cfg:
                    override_result = determine_access_level_override(
                        service, action_name.lower(), row[2],
                        access_level_overrides_cfg)
                    if override_result:
                        access_level = override_result
                        print(
                            f"Override: Setting access level for {service}:{action_name} to {access_level}")

                # Condition keys and dependent actions share the same
                # multi-value handling.
                condition_keys = _comma_separate(row[4])
                dependent_actions = _comma_separate(row[5])

                db_session.add(ActionTable(
                    service=service,
                    name=action_name.lower(),
                    description=row[1],
                    access_level=access_level,
                    resource_type_name=resource_type_name,
                    resource_type_name_append_wildcard=resource_type_name_append_wildcard,
                    resource_arn_format=str(resource_arn_format),
                    condition_keys=condition_keys,
                    dependent_actions=dependent_actions
                ))
                db_session.commit()
    db_session.commit()