def ingest_elb(landing_table, aws_access_key=None, aws_secret_key=None,
               session=None, account=None):
    elbs = get_all_elbs(
        aws_access_key=aws_access_key,
        aws_secret_key=aws_secret_key,
        session=session,
        account=account,
    )
    monitor_time = datetime.utcnow().isoformat()
    db.insert(
        landing_table,
        values=[(
            row,
            monitor_time,
            row.get('CanonicalHostedZoneName', ''),
            row.get('CanonicalHostedZoneNameID', ''),
            row['CreatedTime'],
            row['DNSName'],
            row['LoadBalancerName'],
            row['Region']['RegionName'],
            row['Scheme'],
            # classic ELBs report 'VPCId'; v2 load balancers use 'VpcId'
            row.get('VPCId', row.get('VpcId', '')),
            row.get('Account', {}).get('ACCOUNT_ID'),
        ) for row in elbs],
        select='PARSE_JSON(column1), column2, column3, column4, column5, '
               'column6, column7, column8, column9, column10, column11',
    )
    return len(elbs)
def ingest_users(tio, table_name):
    users = tio.users.list()
    timestamp = datetime.utcnow()
    for user in users:
        user['role'] = {
            16: 'Basic',
            24: 'Scan Operator',
            32: 'Standard',
            40: 'Scan Manager',
            64: 'Administrator',
        }.get(user['permissions'], f"unknown permissions {user['permissions']}")
    db.insert(
        table=f'data.{table_name}',
        values=[(
            user.get('username'),
            user.get('role'),
            user,
            timestamp,
            user.get('uuid'),
            user.get('id'),
            user.get('user_name'),
            user.get('email'),
            user.get('type'),
            user.get('permissions'),
            user.get('last_login_attempt'),
            user.get('login_fail_count'),
            user.get('login_fail_total'),
            user.get('enabled'),
            user.get('two_factor'),
            user.get('lastlogin'),
            user.get('uuid_id'),
        ) for user in users],
        select="""
            column1, column2, PARSE_JSON(column3), column4, column5, column6,
            column7, column8, column9, column10,
            to_timestamp(column11, 3)::timestamp_ltz, column12, column13,
            column14, PARSE_JSON(column15),
            to_timestamp(column16, 3)::timestamp_ltz, column17
        """,
    )
def ingest(table_name, options):
    current_time = datetime.datetime.utcnow()
    org_client = sts_assume_role(
        src_role_arn=options['source_role_arn'],
        dest_role_arn=options['destination_role_arn'],
        dest_external_id=options['destination_role_external_id'],
    ).client('organizations')
    account_pages = org_client.get_paginator('list_accounts').paginate()
    accounts = [a for page in account_pages for a in page['Accounts']]
    db.insert(
        table=f'data.{table_name}',
        values=[(
            a,
            current_time,
            a['Arn'],
            a['Email'],
            a['Id'],
            a['JoinedMethod'],
            a['JoinedTimestamp'],
            a['Name'],
            a['Status'],
        ) for a in accounts],
        select=(
            'PARSE_JSON(column1)',
            'column2',
            'column3::STRING',
            'column4::STRING',
            'column5::NUMBER',
            'column6::STRING',
            'column7::TIMESTAMP_LTZ',
            'column8::STRING',
            'column9::STRING',
        ),
    )
    return len(accounts)
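# Hedged sketch of the sts_assume_role helper used above and in ami_dispatch
# below; the real helper lives elsewhere in this codebase. This version shows
# the assumed two-hop chain (source role -> destination role + external id)
# using only documented boto3 STS calls. The session name is an assumption.
def sts_assume_role_sketch(src_role_arn, dest_role_arn, dest_external_id=None):
    import boto3
    # first hop: assume the source role with ambient credentials
    src_creds = boto3.client('sts').assume_role(
        RoleArn=src_role_arn,
        RoleSessionName='snowalert-ingest',
    )['Credentials']
    src_session = boto3.Session(
        aws_access_key_id=src_creds['AccessKeyId'],
        aws_secret_access_key=src_creds['SecretAccessKey'],
        aws_session_token=src_creds['SessionToken'],
    )
    # second hop: assume the destination role, passing ExternalId if given
    kwargs = {'RoleArn': dest_role_arn, 'RoleSessionName': 'snowalert-ingest'}
    if dest_external_id:
        kwargs['ExternalId'] = dest_external_id
    dest_creds = src_session.client('sts').assume_role(**kwargs)['Credentials']
    return boto3.Session(
        aws_access_key_id=dest_creds['AccessKeyId'],
        aws_secret_access_key=dest_creds['SecretAccessKey'],
        aws_session_token=dest_creds['SessionToken'],
    )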
def ingest_users(url, headers, landing_table, timestamp):
    while True:
        response = requests.get(url=url, headers=headers)
        if response.status_code != 200:
            log.error('OKTA REQUEST FAILED:', response.text)
            return
        result = response.json()
        if result == []:
            break
        db.insert(
            landing_table,
            values=[(row, timestamp) for row in result],
            select='PARSE_JSON(column1), column2',
        )
        log.info(f'Inserted {len(result)} rows.')
        yield len(result)
        url = ''
        links = requests.utils.parse_header_links(response.headers['Link'])
        for link in links:
            if link['rel'] == 'next':
                url = link['url']
        if len(url) == 0:
            break
def ingest_iam(landing_table, aws_access_key=None, aws_secret_key=None,
               session=None, account=None):
    users = get_iam_users(
        aws_access_key=aws_access_key,
        aws_secret_key=aws_secret_key,
        session=session,
        account=account,
    )
    monitor_time = datetime.utcnow().isoformat()
    db.insert(
        landing_table,
        values=[(
            row,
            monitor_time,
            row['Path'],
            row['UserName'],
            row['UserId'],
            row.get('Arn'),
            row['CreateDate'],
            row.get('PasswordLastUsed'),
            row.get('Account', {}).get('ACCOUNT_ID'),
        ) for row in users],
        select=db.derive_insert_select(LANDING_TABLES_COLUMNS['IAM']),
        columns=db.derive_insert_columns(LANDING_TABLES_COLUMNS['IAM']),
    )
    return len(users)
def ingest_users(url, headers, landing_table, now):
    while True:
        response = requests.get(url=url, headers=headers)
        if response.status_code != 200:
            log.error('OKTA REQUEST FAILED:', response.text)
            return
        result = response.json()
        if result == []:
            break
        db.insert(
            landing_table,
            [{'raw': row, 'event_time': now} for row in result],
        )
        log.info(f'Inserted {len(result)} rows.')
        yield len(result)
        url = ''
        links = requests.utils.parse_header_links(response.headers['Link'])
        for link in links:
            if link['rel'] == 'next':
                url = link['url']
        if len(url) == 0:
            break
def run_baseline(name, comment):
    # initialize so the except clause can reference metadata even when
    # yaml parsing itself raises
    metadata = None
    try:
        metadata = yaml.safe_load(comment)
        assert type(metadata) is dict
        source = metadata['log source']
        required_values = metadata['required values']
        code_location = metadata['module name']
        time_filter = metadata['filter']
        time_column = metadata['history']
    except Exception as e:
        log.error(e, f"{name} has invalid metadata: >{metadata}<, skipping")
        return
    with open(f"../baseline_modules/{code_location}/{code_location}.R") as f:
        r_code = f.read()
    r_code = format_code(r_code, required_values)
    frame = query_log_source(source, time_filter, time_column)
    ro.globalenv['input_table'] = frame
    output = ro.r(r_code)
    output = output.to_dict()
    results = unpack(output)
    try:
        log.info(f"{name} generated {len(results)} rows")
        db.insert(f"{DATA_SCHEMA}.{name}", results, overwrite=True)
    except Exception as e:
        log.error("Failed to insert the results into the target table", e)
def ingest_sg(landing_table, aws_access_key=None, aws_secret_key=None,
              session=None, account=None):
    groups = get_all_security_groups(
        aws_access_key=aws_access_key,
        aws_secret_key=aws_secret_key,
        session=session,
        account=account,
    )
    monitor_time = datetime.utcnow().isoformat()
    db.insert(
        landing_table,
        values=[(
            row,
            row['Description'],
            monitor_time,
            row['GroupId'],
            row['GroupName'],
            row['OwnerId'],
            row['Region']['RegionName'],
            row.get('VpcId'),
        ) for row in groups],
        select='PARSE_JSON(column1), column2, column3, column4, column5, '
               'column6, column7, column8',
    )
    return len(groups)
def ingest_ec2(landing_table, aws_access_key=None, aws_secret_key=None,
               session=None, account=None):
    instances = get_ec2_instances(
        aws_access_key=aws_access_key,
        aws_secret_key=aws_secret_key,
        session=session,
        account=account,
    )
    monitor_time = datetime.utcnow().isoformat()
    db.insert(
        landing_table,
        values=[(
            row,
            row['InstanceId'],
            row['Architecture'],
            monitor_time,
            row['InstanceType'],
            # can be absent on managed instances such as EMR
            row.get('KeyName', ''),
            row['LaunchTime'],
            row['Region']['RegionName'],
            row['State']['Name'],
            row.get('InstanceName', ''),
            row.get('Account', {}).get('ACCOUNT_ID'),
        ) for row in instances],
        select='PARSE_JSON(column1), column2, column3, column4, column5, '
               'column6, column7, column8, column9, column10, column11',
    )
    return len(instances)
def run_baseline(name, comment):
    # lazy import: rpy2 requires a local R installation
    from rpy2 import robjects as ro
    metadata = None
    try:
        metadata = yaml.safe_load(comment)
        assert type(metadata) is dict
        source = metadata['log source']
        required_values = metadata['required values']
        code_location = metadata['module name']
        time_filter = metadata['filter']
        time_column = metadata['history']
    except Exception as e:
        log.error(e, f"{name} has invalid metadata: >{metadata}<, skipping")
        return
    os.mkdir(FORMATTED_CODE_DIRECTORY)
    files = os.listdir(f'../baseline_modules/{code_location}')
    shutil.copyfile(
        "../baseline_modules/run_module.R",
        f"{FORMATTED_CODE_DIRECTORY}/run_module.R",
    )
    for file in files:
        log.debug(file)
        if not file.startswith('.'):
            with open(f"../baseline_modules/{code_location}/{file}") as f:
                r_code = f.read()
            r_code = format_code(r_code, required_values)
            with open(f"{FORMATTED_CODE_DIRECTORY}/{file}", 'w+') as ff:
                ff.write(r_code)
    with open(f"{FORMATTED_CODE_DIRECTORY}/run_module.R") as fr:
        r_code = fr.read()
    frame = query_log_source(source, time_filter, time_column)
    ro.globalenv['input_table'] = frame
    # R's setwd() chdirs the whole embedded process, so Python paths below
    # are relative to the formatted-code directory from here on
    ro.r(f"setwd('./{FORMATTED_CODE_DIRECTORY}')")
    output = ro.r(r_code)
    output = output.to_dict()
    results = unpack(output)
    # Get the columns of the baseline table; find the timestamp column and
    # pop it from the list
    columns = [
        row['name'] for row in db.fetch(f'desc table {DATA_SCHEMA}.{name}')
    ]
    columns.remove('EXPORT_TIME')
    try:
        log.info(f"{name} generated {len(results)} rows")
        db.insert(f"{DATA_SCHEMA}.{name}", results, columns=columns,
                  overwrite=True)
    except Exception as e:
        log.error("Failed to insert the results into the target table", e)
    finally:
        shutil.rmtree(f"../{FORMATTED_CODE_DIRECTORY}")
def ami_dispatch(landing_table, aws_access_key='', aws_secret_key='',
                 accounts=None, source_role_arn='', destination_role_name='',
                 external_id=''):
    results = 0
    if accounts:
        for account in accounts:
            account_id = account['ACCOUNT_ID']
            account_alias = account['ACCOUNT_ALIAS']
            target_role = (
                f'arn:aws:iam::{account_id}:role/{destination_role_name}'
            )
            log.info(f"Using role {target_role}")
            try:
                session = sts_assume_role(source_role_arn, target_role,
                                          external_id)
                results += ingest_ami(landing_table, session=session,
                                      account=account)
                db.insert(
                    AWS_ACCOUNTS_METADATA,
                    values=[(datetime.utcnow(), RUN_ID, account_id,
                             account_alias, results)],
                    columns=[
                        'snapshot_at',
                        'run_id',
                        'account_id',
                        'account_alias',
                        'ami_count',
                    ],
                )
            except Exception as e:
                db.insert(
                    AWS_ACCOUNTS_METADATA,
                    values=[(datetime.utcnow(), RUN_ID, account_id,
                             account_alias, 0, e)],
                    columns=[
                        'snapshot_at',
                        'run_id',
                        'account_id',
                        'account_alias',
                        'ami_count',
                        'error',
                    ],
                )
                log.error(f"Unable to assume role {target_role} with error", e)
    else:
        results += ingest_ami(landing_table, aws_access_key=aws_access_key,
                              aws_secret_key=aws_secret_key)
    return results
def ingest(table_name, options):
    landing_table = f'data.{table_name}'
    service_user_creds = options['service_user_creds']
    for subject in options.get('subjects_list') or ['']:
        for event in LOGIN_EVENTS:
            items = get_logs(
                service_user_creds,
                with_subject=subject,
                event_name=event,
                start_time=db.fetch_latest(
                    landing_table,
                    where=(
                        f"delegating_subject='{subject}' AND "
                        f"event_name='{event}'"
                    ),
                ),
            ).get('items', [])
            db.insert(
                landing_table,
                values=[(
                    item['id']['time'],
                    item['etag'].strip('"'),
                    subject,
                    item.get('events', [{}])[0].get('name'),
                    {
                        p['name']: (
                            p.get('value')
                            or p.get('boolValue')
                            or p.get('multiValue')
                        )
                        for p in item.get('events', [{}])[0].get('parameters', [])
                    },
                    item['id']['customerId'],
                    item['actor'].get('email'),
                    item['actor'].get('profileId'),
                    item.get('ipAddress'),
                    item,
                ) for item in items],
                select=(
                    'CURRENT_TIMESTAMP()',
                    'column1',
                    'column2',
                    'column3',
                    'column4',
                    'PARSE_JSON(column5)',
                    'column6',
                    'column7',
                    'column8',
                    'column9',
                    'PARSE_JSON(column10)',
                ),
            )
            yield len(items)
def ingest(table_name, options):
    landing_table = f'data.{table_name}'
    token = options['token']
    asset_entity_id = options['asset_entity_id']
    general_url = (
        f"https://api.assetpanda.com:443//v2/entities/{asset_entity_id}/objects"
    )
    fields_url = f"https://api.assetpanda.com:443//v2/entities/{asset_entity_id}"
    params = {"offset": 0, "limit": PAGE_SIZE}
    total_object_count = 0
    insert_time = datetime.utcnow()
    while params['offset'] <= total_object_count:
        log.debug("total_object_count: ", total_object_count)
        assets = get_data(token=token, url=general_url, params=params)
        list_object, total_object_count = (
            get_list_objects_and_total_from_get_object(assets)
        )
        dict_fields = get_data(token, fields_url, params=params)
        list_field = dict_fields["fields"]
        # Strip the metadata down to the fields we care about, e.g.
        # {"field_140": "MAC_Address", "field_135": "IP"}
        clear_fields: dict = reduce(reduce_fields, list_field, {})
        # replace every "field_NO" key with the value of clear_fields["field_NO"]
        list_object_without_field_id = replace_device_key(
            list_object, clear_fields
        )
        db.insert(
            landing_table,
            values=[(entry, entry.get('id'), insert_time)
                    for entry in list_object_without_field_id],
            select=db.derive_insert_select(LANDING_TABLE_COLUMNS),
            columns=db.derive_insert_columns(LANDING_TABLE_COLUMNS),
        )
        log.info(
            f'Inserted {len(list_object_without_field_id)} rows ({landing_table}).'
        )
        yield len(list_object_without_field_id)
        # advance the offset so each loop iteration fetches the next page
        params["offset"] += PAGE_SIZE
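# Hedged sketch of the reduce_fields / replace_device_key helpers assumed by
# the AssetPanda ingest above: reduce_fields folds the field metadata into a
# {"field_NN": "Readable_Name"} map, and replace_device_key renames those keys
# on every object. The exact shape of the field-metadata records is an
# assumption.
def reduce_fields_sketch(acc, field):
    # each field record is assumed to carry a 'key' like "field_140"
    # and a human-readable 'name' like "MAC_Address"
    acc[field['key']] = field['name']
    return acc

def replace_device_key_sketch(objects, clear_fields):
    # keys without a mapping are kept unchanged
    return [
        {clear_fields.get(k, k): v for k, v in obj.items()}
        for obj in objects
    ]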
def ingest_ami(landing_table, aws_access_key=None, aws_secret_key=None,
               session=None, account=None):
    images = get_images(
        aws_access_key=aws_access_key,
        aws_secret_key=aws_secret_key,
        session=session,
        account=account,
    )
    monitor_time = datetime.utcnow().isoformat()
    db.insert(
        landing_table,
        values=[(
            row,
            monitor_time,
            row.get('VirtualizationType'),
            row.get('Description'),
            row.get('Tags'),
            row.get('Hypervisor'),
            row.get('EnaSupport'),
            row.get('SriovNetSupport'),
            row.get('ImageId'),
            row.get('State'),
            row.get('BlockDeviceMappings'),
            row.get('Architecture'),
            row.get('ImageLocation'),
            row.get('RootDeviceType'),
            row.get('RootDeviceName'),
            row.get('OwnerId'),
            row.get('CreationDate'),
            row.get('Public'),
            row.get('ImageType'),
            row.get('Name'),
            row.get('Account', {}).get('ACCOUNT_ID'),
            row['Region']['RegionName'],
        ) for row in images],
        select=db.derive_insert_select(LANDING_TABLES_COLUMNS['AMI']),
        columns=db.derive_insert_columns(LANDING_TABLES_COLUMNS['AMI']),
    )
    return len(images)
def ingest(table_name, options):
    tenant_id = options['tenant_id']
    client_id = options['client_id']
    client_secret = options['client_secret']
    cloud_type = options.get('cloud_type', 'reg')
    subscriptions_service = get_subscription_service(
        {
            "tenantId": tenant_id,
            "clientId": client_id,
            "clientSecret": client_secret,
        },
        cloud_type,
    )
    subscription_list = subscriptions_service.list()
    subscriptions = [s.as_dict() for s in subscription_list]
    db.insert(
        f'data.{table_name}',
        values=[(
            parse(subscription_list.raw.response.headers['Date']).isoformat(),
            tenant_id,
            row,
            row['id'],
            row['subscription_id'],
            row['display_name'],
            row['state'],
            row['subscription_policies'],
            row['authorization_source'],
        ) for row in subscriptions],
        select=(
            'column1',
            'column2',
            'PARSE_JSON(column3)',
            'column4',
            'column5',
            'column6',
            'column7',
            'PARSE_JSON(column8)',
            'column9',
        ),
    )
    yield len(subscriptions)
def ingest_agents(table_name, options):
    last_export_time = next(
        db.fetch(f'SELECT MAX(export_at) as time FROM data.{table_name}')
    )['TIME']
    timestamp = datetime.now(timezone.utc)
    if (last_export_time is None
            or (timestamp - last_export_time).total_seconds() > 86400):
        # de-duplicate agents by uuid before batching the inserts
        agents = {a['uuid']: a for a in get_agent_data()}.values()
        for page in groups_of(10000, agents):
            db.insert(
                table=f'data.{table_name}',
                values=[(agent, timestamp) for agent in page],
                select=db.derive_insert_select(AGENT_LANDING_TABLE),
                columns=db.derive_insert_columns(AGENT_LANDING_TABLE),
            )
    else:
        log.info('Not time to import Tenable Agents')
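# Hedged sketch of the groups_of helper used for batched inserts here and in
# ingest_vulns below: it yields successive size-n chunks of any iterable.
# This reflects the assumed behavior, not the canonical implementation.
from itertools import islice

def groups_of_sketch(n, iterable):
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            return
        yield chunk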
def ingest_agents(table_name, options):
    last_export_time = next(
        db.fetch(f'SELECT MAX(export_at) as time FROM data.{table_name}')
    )['TIME']
    timestamp = datetime.now(timezone.utc)
    if (last_export_time is None
            or (timestamp - last_export_time).total_seconds() > 86400):
        # sort by last_connect so the dict keeps the latest record per uuid
        all_agents = sorted(get_agent_data(),
                            key=lambda a: a.get('last_connect', 0))
        unique_agents = {a['uuid']: a for a in all_agents}.values()
        rows = [{'raw': ua, 'export_at': timestamp} for ua in unique_agents]
        log.debug(f'inserting {len(unique_agents)} unique (by uuid) agents')
        db.insert(f'data.{table_name}', rows)
        return len(rows)
    else:
        log.info('Not time to import Tenable Agents')
        return 0
def ingest(table_name, options):
    tenant_id = options['tenant_id']
    client_id = options['client_id']
    client_secret = options['client_secret']
    subscriptions_service = get_client_from_json_dict(
        SubscriptionClient,
        {
            "tenantId": tenant_id,
            "clientId": client_id,
            "clientSecret": client_secret,
            "activeDirectoryEndpointUrl": "https://login.microsoftonline.com",
            "resourceManagerEndpointUrl": "https://management.azure.com/",
            "managementEndpointUrl": "https://management.core.windows.net/",
        },
    ).subscriptions
    subscription_list = subscriptions_service.list()
    subscriptions = [s.as_dict() for s in subscription_list]
    db.insert(
        f'data.{table_name}',
        values=[(
            parse(subscription_list.raw.response.headers['Date']).isoformat(),
            tenant_id,
            row,
            row['id'],
            row['subscription_id'],
            row['display_name'],
            row['state'],
            row['subscription_policies'],
            row['authorization_source'],
        ) for row in subscriptions],
        select=(
            'column1',
            'column2',
            'PARSE_JSON(column3)',
            'column4',
            'column5',
            'column6',
            'column7',
            'PARSE_JSON(column8)',
            'column9',
        ),
    )
    yield len(subscriptions)
def ingest(table_name, options, dryrun=False):
    domainkey = options['domainkey']
    skey = options['skey']
    ikey = options['ikey']
    admin_api = duo_client.Admin(
        ikey=ikey,
        skey=skey,
        host=f'api-{domainkey}.duosecurity.com',
    )
    admins = list(admin_api.get_admins())
    db.insert(
        f'data.{table_name}',
        [{'raw': a} for a in admins],
        dryrun=dryrun,
    )
    return len(admins)
def ingest_vulns(table_name):
    last_export_time = next(
        db.fetch(f'SELECT MAX(export_at) as time FROM data.{table_name}')
    )['TIME']
    timestamp = datetime.now(timezone.utc)
    if (last_export_time is None
            or (timestamp - last_export_time).total_seconds() > 86400):
        log.info("Exporting vulnerabilities...")
        vulns = TIO.exports.vulns()
        for page in groups_of(10000, vulns):
            db.insert(
                table=f'data.{table_name}',
                values=[(vuln, timestamp) for vuln in page],
                select=db.derive_insert_select(VULN_LANDING_TABLE),
                columns=db.derive_insert_columns(VULN_LANDING_TABLE),
            )
    else:
        log.info('Not time to import Tenable vulnerabilities yet')
async def main(table_name):
    async with aiohttp.ClientSession() as session:
        cids = [
            c['id']
            for c in (await fetch(session, '/computers')).get('computers', [])
        ]
        log.info(f'loading {len(cids)} computer details')
        computers = await asyncio.gather(
            *[fetch_computer(session, cid) for cid in cids]
        )
        log.info(f'inserting {len(computers)} computers into {table_name}')
        rows = [
            updated(c.get('computer'), computer_id=cid,
                    recorded_at=c.get('recorded_at'))
            for cid, c in zip(cids, computers)
        ]
        db.insert(table_name, rows)
        return len(rows)
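# Hedged sketch of the updated() helper assumed by main() above: a pure
# dict-merge that tolerates a None base and returns a new dict with the
# keyword overrides applied.
def updated_sketch(d, **kwargs):
    out = dict(d or {})
    out.update(kwargs)
    return out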
def process_endpoint(endpoint):
    log.info(f"starting {endpoint}")
    json_body = {'links': {'next': {'href': endpoint}}}
    page = 1
    while json_body['links']['next'] is not None:
        log.info(f"Getting page {page}")
        r = get(json_body['links']['next']['href'])
        if r.status_code != 200:
            log.error(f"Ingest request for {endpoint} failed", r.text)
            db.record_failed_ingestion(ZENGRC_TABLE, r, TIMESTAMP)
            break
        json_body = r.json()
        data = [[json.dumps(i), TIMESTAMP] for i in json_body['data']]
        try:
            db.insert(ZENGRC_TABLE, data,
                      select='PARSE_JSON(column1), column2')
            page += 1
        except Exception as e:
            log.error(e)
def ingest(table_name, options):
    landing_table = f'data.{table_name}'
    timestamp = datetime.utcnow()
    organization_id = options['organization_id']
    api_secret = options['api_secret']
    api_key = options['api_key']
    params: dict = {"limit": PAGE_SIZE, "page": 1}  # API pages start at 1
    while True:
        devices: dict = get_data(organization_id, api_key, api_secret, params)
        params["page"] += 1
        if len(devices) == 0:
            break
        db.insert(
            landing_table,
            values=[(
                timestamp,
                device,
                device.get('deviceId'),
                device.get('osVersionName'),
                device.get('lastSyncStatus'),
                device.get('type'),
                device.get('version'),
                device.get('lastSync'),
                device.get('osVersion'),
                device.get('name'),
                device.get('status'),
                device.get('originId'),
                device.get('appliedBundle'),
                device.get('hasIpBlocking'),
            ) for device in devices],
            select=db.derive_insert_select(LANDING_TABLE_COLUMNS),
            columns=db.derive_insert_columns(LANDING_TABLE_COLUMNS),
        )
        log.info(f'Inserted {len(devices)} rows.')
        yield len(devices)
def ingest_elb(aws_access_key, aws_secret_key, landing_table, regions):
    elbs = get_all_elbs(aws_access_key, aws_secret_key, regions)
    monitor_time = datetime.utcnow().isoformat()
    db.insert(
        landing_table,
        values=[(
            row,
            monitor_time,
            row['CanonicalHostedZoneName'],
            row['CanonicalHostedZoneNameID'],
            row['CreatedTime'],
            row['DNSName'],
            row['LoadBalancerName'],
            row['Region']['RegionName'],
            row['Scheme'],
            row['VPCId'],
        ) for row in elbs],
        select='PARSE_JSON(column1), column2, column3, column4, column5, '
               'column6, column7, column8, column9, column10',
    )
    return len(elbs)
def ingest(table_name, options):
    landing_table = f'data.{table_name}'
    api_key = options['api_key']
    subdomain = options['subdomain']
    url = f'https://{subdomain}.okta.com/api/v1/logs'
    headers = {
        'Accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization': f'SSWS {api_key}',
    }
    ts = db.fetch_latest(landing_table, 'event_time')
    if ts is None:
        log.error("Unable to find a timestamp of most recent Okta log, "
                  "defaulting to one hour ago")
        ts = datetime.datetime.now() - datetime.timedelta(hours=1)
    params = {'since': ts.strftime("%Y-%m-%dT%H:%M:%S.000Z")}
    while True:
        response = requests.get(url=url, headers=headers, params=params)
        if response.status_code != 200:
            log.error('OKTA REQUEST FAILED:', response.text)
            return
        result = response.json()
        if result == []:
            break
        db.insert(
            landing_table,
            values=[(row, row['published']) for row in result],
            select='PARSE_JSON(column1), column2',
        )
        log.info(f'Inserted {len(result)} rows.')
        yield len(result)
        # assumes Okta always returns a second Link header entry holding the
        # 'next' cursor; the parse_header_links variant later in this file is
        # sturdier
        url = response.headers['Link'].split(', ')[1].split(';')[0][1:-1]
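# Hedged sketch of db.fetch_latest as used above and in the Google Workspace
# ingest earlier: return the max value of a timestamp column, or None for an
# empty table. The optional where clause mirrors the keyword usage seen
# above; the real helper's signature is an assumption.
def fetch_latest_sketch(table, col='event_time', where=None):
    condition = f' WHERE {where}' if where else ''
    return next(
        db.fetch(f'SELECT MAX({col}) AS latest FROM {table}{condition}')
    )['LATEST']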
def ingest_vulns(table_name):
    last_export_time = next(
        db.fetch(f'SELECT MAX(export_at) AS time FROM data.{table_name}')
    )['TIME']
    now = datetime.now(timezone.utc)
    if (last_export_time is None
            or (now - last_export_time) > timedelta(days=1)):
        log.debug('TIO export vulns')
        # insert empty row...
        db.insert(f'data.{table_name}', [{'export_at': now}])
        # ...because this line takes awhile
        vulns = TIO.exports.vulns()
        rows = [{'raw': v, 'export_at': now} for v in vulns]
        db.insert(f'data.{table_name}', rows)
        return len(rows)
    else:
        log.info('Not time to import Tenable vulnerabilities yet')
        return 0
def get_data_worker(account_id, account_name):
    try:
        ec2_session = get_aws_client(account_id)
        instances = []
        try:
            ec2_regions = [
                region['RegionName']
                for region in
                ec2_session.client('ec2').describe_regions()['Regions']
            ]
        except Exception as e:
            log.info(
                f"ec2_describe_instances account [{account_id}] "
                f"{account_name} exception",
                e,
            )
            return None
        for region in ec2_regions:
            try:
                client = ec2_session.client('ec2', region_name=region)
                paginator = client.get_paginator('describe_instances')
                page_iterator = paginator.paginate()
                # collect into a separate name so we don't shadow the
                # region loop variable
                region_instances = [
                    instance
                    for page in page_iterator
                    for instance_array in page['Reservations']
                    for instance in instance_array['Instances']
                ]
                instances.extend(region_instances)
            except Exception as e:
                log.info(
                    f"ec2_describe_instances: account [{account_id}] "
                    f"{account_name} exception",
                    e,
                )
                db.insert(
                    AWS_ACCOUNTS_INFORMATION_TABLE,
                    values=[(datetime.utcnow(), account_id, account_name,
                             None, e)],
                )
                return None
        instance_list = [
            json.dumps({**instance, "AccountId": account_id}, default=str)
            for instance in instances
        ]
        try:
            db.insert(
                AWS_ACCOUNTS_INFORMATION_TABLE,
                values=[(datetime.utcnow(), account_id, account_name,
                         len(instance_list), None)],
            )
        except Exception:
            log.error('Failed to insert into AWS_ACCOUNT_INFORMATION table.')
        log.info(
            f"ec2_describe_instances: account: {account_name} "
            f"instances: {len(instance_list)}"
        )
        return instance_list
    except Exception as e:
        log.error(
            f"ec2_describe_instances: account: {account_name} exception: {e}"
        )
        db.insert(
            AWS_ACCOUNTS_INFORMATION_TABLE,
            values=[(datetime.utcnow(), account_id, account_name, None, e)],
        )
        return None
def ingest(table_name, options):
    landing_table = f'data.{table_name}'
    timestamp = datetime.utcnow()
    client_id = options['client_id']
    client_secret = options['client_secret']
    # Call the authorization endpoint so we can make subsequent calls to the
    # API with an auth token
    token: str = get_token_basic(client_id, client_secret)
    offset = ""
    params_get_id_devices: dict = {"limit": PAGE_SIZE, "offset": offset}
    while True:
        dict_id_devices: dict = get_data(
            token, CROWDSTRIKE_DEVICES_BY_ID_URL, params_get_id_devices
        )
        resources: list = dict_id_devices["resources"]
        params_get_id_devices["offset"] = get_offset_from_devices_results(
            dict_id_devices
        )
        if len(resources) == 0:
            break
        device_details_url_and_params: str = create_url_params_get_devices(
            CROWDSTRIKE_DEVICE_DETAILS_URL, resources
        )
        dict_devices: dict = get_data(token, device_details_url_and_params)
        devices = dict_devices["resources"]
        db.insert(
            landing_table,
            values=[(
                timestamp,
                device,
                device.get('device_id'),
                device.get('first_seen'),
                device.get('system_manufacturer'),
                device.get('config_id_base'),
                device.get('last_seen'),
                device.get('policies'),
                device.get('slow_changing_modified_timestamp'),
                device.get('minor_version'),
                device.get('system_product_name'),
                device.get('hostname'),
                device.get('mac_address'),
                device.get('product_type_desc'),
                device.get('platform_name'),
                device.get('external_ip'),
                device.get('agent_load_flags'),
                device.get('group_hash'),
                device.get('provision_status'),
                device.get('os_version'),
                device.get('groups'),
                device.get('bios_version'),
                device.get('modified_timestamp'),
                device.get('local_ip'),
                device.get('agent_version'),
                device.get('major_version'),
                device.get('meta'),
                device.get('agent_local_time'),
                device.get('bios_manufacturer'),
                device.get('platform_id'),
                device.get('device_policies'),
                device.get('config_id_build'),
                device.get('config_id_platform'),
                device.get('cid'),
                device.get('status'),
                device.get('service_pack_minor'),
                device.get('product_type'),
                device.get('service_pack_major'),
                device.get('build_number'),
                device.get('pointer_size'),
                device.get('site_name'),
                device.get('machine_domain'),
                device.get('ou'),
            ) for device in devices],
            select=db.derive_insert_select(LANDING_TABLE_COLUMNS),
            columns=db.derive_insert_columns(LANDING_TABLE_COLUMNS),
        )
        log.info(f'Inserted {len(devices)} rows.')
        yield len(devices)
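# Hedged sketch of the get_token_basic helper assumed above: CrowdStrike's
# OAuth2 token endpoint takes the client id/secret as form fields and returns
# a bearer token. The URL constant is an assumption mirroring the other
# CROWDSTRIKE_* constants in this module.
CROWDSTRIKE_OAUTH_TOKEN_URL = 'https://api.crowdstrike.com/oauth2/token'

def get_token_basic_sketch(client_id, client_secret):
    response = requests.post(
        CROWDSTRIKE_OAUTH_TOKEN_URL,
        data={'client_id': client_id, 'client_secret': client_secret},
    )
    response.raise_for_status()
    return response.json()['access_token']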
def load_data(messages):
    data = [(m, m['date']) for m in messages]
    try:
        db.insert(AGARI_TABLE, data, select='PARSE_JSON(column1), column2')
    except Exception as e:
        log.error("failed to ingest data", e)
def ingest(table_name, options):
    ingest_type = (
        'users' if table_name.endswith('_USERS_CONNECTION')
        else 'groups' if table_name.endswith('_GROUPS_CONNECTION')
        else 'logs'
    )
    landing_table = f'data.{table_name}'
    api_key = options['api_key']
    subdomain = options['subdomain']
    ingest_urls = {
        'users': f'https://{subdomain}.okta.com/api/v1/users',
        'deprovisioned_users': (
            f'https://{subdomain}.okta.com/api/v1/users'
            '?filter=status+eq+"DEPROVISIONED"'
        ),
        'groups': f'https://{subdomain}.okta.com/api/v1/groups',
        'logs': f'https://{subdomain}.okta.com/api/v1/logs',
    }
    headers = {
        'Accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization': f'SSWS {api_key}',
    }
    timestamp = datetime.datetime.utcnow()
    if ingest_type == 'groups':
        response = requests.get(url=ingest_urls[ingest_type], headers=headers)
        result = response.json()
        for row in result:
            try:
                row['users'] = requests.get(
                    url=row['_links']['users']['href'], headers=headers
                ).json()
            except TypeError:
                log.info(row)
                raise
        db.insert(
            landing_table,
            values=[(row, timestamp) for row in result],
            select='PARSE_JSON(column1), column2',
        )
        log.info(f'Inserted {len(result)} rows.')
        yield len(result)
    elif ingest_type == 'users':
        yield from ingest_users(ingest_urls['users'], headers, landing_table,
                                timestamp)
        yield from ingest_users(ingest_urls['deprovisioned_users'], headers,
                                landing_table, timestamp)
    else:
        ts = db.fetch_latest(landing_table, 'event_time')
        if ts is None:
            log.error("Unable to find a timestamp of most recent Okta log, "
                      "defaulting to one hour ago")
            ts = datetime.datetime.now() - datetime.timedelta(hours=1)
        params = {'since': ts.strftime("%Y-%m-%dT%H:%M:%S.000Z"), 'limit': 500}
        i = 0
        log.debug(params['since'])
        url = ingest_urls[ingest_type]
        while True:
            response = requests.get(url=url, headers=headers, params=params)
            if response.status_code != 200:
                log.error('OKTA REQUEST FAILED:', response.text)
                return
            result = response.json()
            if result == []:
                break
            db.insert(
                landing_table,
                values=[(row, row['published']) for row in result],
                select='PARSE_JSON(column1), column2',
            )
            log.info(f'Inserted {len(result)} rows. {i}')
            i += 1
            yield len(result)
            url = ''
            links = requests.utils.parse_header_links(response.headers['Link'])
            for link in links:
                if link['rel'] == 'next':
                    url = link['url']
            if len(url) == 0:
                break