def main():
    config = load_config()
    metrics.init(config, 'iris-sync-targets', stats_reset)

    default_nap_time = 3600
    try:
        nap_time = int(config.get('sync_script_nap_time', default_nap_time))
    except ValueError:
        nap_time = default_nap_time

    engine = create_engine(config['db']['conn']['str'] % config['db']['conn']['kwargs'],
                           **config['db']['kwargs'])

    # Initialize these to zero at the start of the app, and don't reset them at every
    # metrics interval
    metrics.set('users_found', 0)
    metrics.set('teams_found', 0)

    metrics_task = spawn(metrics.emit_forever)

    while True:
        if not bool(metrics_task):
            logger.error('metrics task failed, %s', metrics_task.exception)
            metrics_task = spawn(metrics.emit_forever)

        sync(config, engine)
        logger.info('Sleeping for %d seconds', nap_time)
        sleep(nap_time)

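# For reference, a minimal sketch of the config shape main() above expects,
# inferred from the keys it reads. The values here are hypothetical
# placeholders, not the project's real settings.
EXAMPLE_CONFIG = {
    'sync_script_nap_time': 3600,  # seconds between sync runs
    'db': {
        'conn': {
            # %-interpolated into the DSN string using the kwargs below
            'str': '%(scheme)s://%(user)s:%(password)s@%(host)s/%(database)s',
            'kwargs': {
                'scheme': 'mysql+pymysql',
                'user': 'iris',
                'password': 'hunter2',
                'host': 'localhost',
                'database': 'iris',
            },
        },
        'kwargs': {'pool_recycle': 3600},  # passed straight through to create_engine()
    },
}
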
def allow_send(self, message):
    application = message.get('application')
    if not application:
        return True

    # Purpose of quotas is to protect downstreams. If we're already going to drop this
    # message, don't let it count against quota.
    if message.get('mode') == 'drop':
        return True

    rate = self.rates.get(application)
    if not rate:
        return True

    hard_buckets, soft_buckets, hard_limit, soft_limit, wait_time, plan_name, target = rate

    # Increment both buckets for this minute
    hard_buckets[-1] += 1
    soft_buckets[-1] += 1

    # If hard limit breached, disallow sending this message and create incident
    hard_quota_usage = sum(hard_buckets)
    hard_usage_pct = 0
    if hard_limit > 0:
        hard_usage_pct = (hard_quota_usage / hard_limit) * 100
    metrics.set('app_%s_quota_hard_usage_pct' % application, hard_usage_pct)
    if hard_quota_usage > hard_limit:
        metrics.incr('quota_hard_exceed_cnt')
        if plan_name:
            with self.last_incidents_mutex:
                self.notify_incident(application, hard_limit, len(hard_buckets),
                                     plan_name, wait_time)
        return False

    # If soft limit breached, just notify owner and still send
    soft_quota_usage = sum(soft_buckets)
    soft_usage_pct = 0
    if soft_limit > 0:
        soft_usage_pct = (soft_quota_usage / soft_limit) * 100
    metrics.set('app_%s_quota_soft_usage_pct' % application, soft_usage_pct)
    if soft_quota_usage > soft_limit:
        metrics.incr('quota_soft_exceed_cnt')
        if target:
            with self.last_soft_quota_notification_time_mutex:
                self.notify_target(application, soft_limit, len(soft_buckets), *target)
        return True

    return True

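# A plausible shape for self.rates, inferred from the tuple unpacking in
# allow_send() above: per-application minute buckets plus limits. The deque
# construction and the once-a-minute rotation below are an illustrative sketch,
# not the actual quota bookkeeping code.
from collections import deque


def example_rate_entry(hard_limit, soft_limit, wait_time, plan_name, target,
                       hard_minutes=60, soft_minutes=60):
    # One counter per minute; sum(buckets) is the usage over the whole window.
    hard_buckets = deque([0] * hard_minutes, maxlen=hard_minutes)
    soft_buckets = deque([0] * soft_minutes, maxlen=soft_minutes)
    return (hard_buckets, soft_buckets, hard_limit, soft_limit,
            wait_time, plan_name, target)


def rotate_buckets(rate):
    # Presumably called once a minute: appending to a full deque drops the
    # oldest minute, so rate[0][-1] and rate[1][-1] are always "this minute".
    rate[0].append(0)
    rate[1].append(0)
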
def main():
    boot_time = time.time()
    config = load_config()
    metrics.init(config, 'iris-owa-sync', default_metrics)

    owaconfig = config.get('owa')
    if not owaconfig:
        logger.critical('Missing OWA configs')
        sys.exit(1)

    api_host = owaconfig.get('api_host', 'http://localhost:16649')
    iris_client = IrisClient(api_host, 0, owaconfig['iris_app'], owaconfig['iris_app_key'])

    proxies = owaconfig.get('proxies')
    # only way to configure a proxy is to monkey-patch (http adapter) a monkey-patch (baseprotocol) :/
    if proxies:
        UseProxyHttpAdapter._my_proxies = proxies
        exchangelib.protocol.BaseProtocol.HTTP_ADAPTER_CLS = UseProxyHttpAdapter

    creds = exchangelib.Credentials(**owaconfig['credentials'])

    try:
        nap_time = int(owaconfig.get('sleep_interval', 60))
    except ValueError:
        nap_time = 60

    while True:
        start_time = time.time()
        message_count = 0
        try:
            config = exchangelib.Configuration(credentials=creds, **owaconfig['config'])
            account = exchangelib.Account(config=config,
                                          access_type=exchangelib.DELEGATE,
                                          **owaconfig['account'])
        except (exchangelib.errors.EWSError, requests.exceptions.RequestException):
            logger.exception('Failed authenticating to OWA365')
            metrics.incr('owa_api_failure_count')
        else:
            logger.info('Receiving mail on behalf of %s',
                        owaconfig['account'].get('primary_smtp_address'))
            message_count = poll(account, iris_client)

        now = time.time()
        run_time = now - start_time
        logger.info('Last run took %.2f seconds and processed %s messages. '
                    'Waiting %s seconds until next poll..',
                    run_time, message_count, nap_time)
        metrics.set('uptime', now - boot_time)
        metrics.emit()
        sleep(nap_time)

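# A sketch of the UseProxyHttpAdapter referenced above, assuming it follows the
# usual requests pattern: subclass HTTPAdapter and force our proxies into every
# send(). exchangelib then picks it up via BaseProtocol.HTTP_ADAPTER_CLS, as the
# monkey-patch comment in main() describes. The implementation details here are
# an assumption, not the project's confirmed adapter.
import requests.adapters


class UseProxyHttpAdapter(requests.adapters.HTTPAdapter):
    # populated at startup by main() before the class is installed
    _my_proxies = None

    def send(self, request, **kwargs):
        # Override whatever proxies requests would otherwise use
        if self._my_proxies:
            kwargs['proxies'] = self._my_proxies
        return super(UseProxyHttpAdapter, self).send(request, **kwargs)
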
def deactivate():
    # deactivate incidents that have expired
    logger.info('[-] start deactivate task...')
    start_deactivation = time.time()

    connection = db.engine.raw_connection()
    cursor = connection.cursor()
    cursor.execute(INACTIVE_SQL)
    connection.commit()
    cursor.close()
    connection.close()

    metrics.set('deactivation', time.time() - start_deactivation)
    logger.info('[*] deactivate task finished')

def aggregate(now):
    # see if it's time to send the batches
    logger.info('[-] start aggregate task - queued: %s', len(messages))
    start_aggregations = time.time()
    for key in queues.keys():
        aggregation_window = cache.plans[key[0]]['aggregation_window']
        if now - sent.get(key, 0) >= aggregation_window:
            aggregated_message_ids = queues[key]

            connection = db.engine.raw_connection()
            cursor = connection.cursor()
            cursor.execute('SELECT `id` FROM `message` WHERE active=1 AND `id` in %s',
                           [aggregated_message_ids])
            active_message_ids = {r[0] for r in cursor}
            cursor.close()
            connection.close()

            inactive_message_ids = aggregated_message_ids - active_message_ids
            l = len(active_message_ids)
            logger.info('[x] dropped %s messages from claimed incidents, %s remain for %r',
                        len(inactive_message_ids), l, key)

            # remove inactive messages from the queue
            for message_id in inactive_message_ids:
                del messages[message_id]

            if l == 1:
                m = messages.pop(next(iter(active_message_ids)))
                logger.info('aggregate - %(message_id)s pushing to send queue', m)
                send_queue.put(m)
            elif l > 1:
                uuid = uuid4().hex
                m = messages[next(iter(active_message_ids))]
                logger.info('aggregate - %s pushing to send queue', uuid)
                m['batch_id'] = uuid

                # Cast from set to list, as sets are not msgpack serializable
                m['aggregated_ids'] = list(active_message_ids)
                send_queue.put(m)

                for message_id in active_message_ids:
                    del messages[message_id]
                logger.info('[-] purged %s from messages %s remaining',
                            active_message_ids, len(messages))

            del queues[key]
            sent[key] = now
    metrics.set('aggregations', time.time() - start_aggregations)
    logger.info('[*] aggregate task finished - queued: %s', len(messages))

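# The module-level state aggregate() above manipulates, reconstructed from how
# it is used. The key layout is an assumption: key[0] indexes cache.plans, so
# each queue key presumably starts with a plan id. A standalone sketch, not the
# module's actual initialization:
from collections import defaultdict
from gevent.queue import Queue

messages = {}              # message_id -> message dict awaiting batching
queues = defaultdict(set)  # (plan_id, ...) -> set of queued message ids
sent = {}                  # (plan_id, ...) -> unix time the last batch for that key was sent
send_queue = Queue()       # consumed by the send task
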
def allow_send(self, message):
    application = message.get('application')
    if not application:
        return True
    rate = self.rates.get(application)
    if not rate:
        return True

    hard_buckets, soft_buckets, hard_limit, soft_limit, wait_time, plan_name, target = rate

    # Increment both buckets for this minute
    hard_buckets[-1] += 1
    soft_buckets[-1] += 1

    # If hard limit breached, disallow sending this message and create incident
    hard_quota_usage = sum(hard_buckets)
    hard_usage_pct = 0
    if hard_limit > 0:
        hard_usage_pct = (hard_quota_usage / hard_limit) * 100
    metrics.set('app_%s_quota_hard_usage_pct' % application, hard_usage_pct)
    if hard_quota_usage > hard_limit:
        metrics.incr('quota_hard_exceed_cnt')
        self.notify_incident(application, hard_limit, len(hard_buckets), plan_name, wait_time)
        return False

    # If soft limit breached, just notify owner and still send
    soft_quota_usage = sum(soft_buckets)
    soft_usage_pct = 0
    if soft_limit > 0:
        soft_usage_pct = (soft_quota_usage / soft_limit) * 100
    metrics.set('app_%s_quota_soft_usage_pct' % application, soft_usage_pct)
    if soft_quota_usage > soft_limit:
        metrics.incr('quota_soft_exceed_cnt')
        self.notify_target(application, soft_limit, len(soft_buckets), *target)
        return True

    return True

def main():
    global ldap_timeout

    config = load_config()
    metrics.init(config, 'iris-sync-targets', stats_reset)

    default_ldap_timeout = 20
    default_nap_time = 3600

    ldap_timeout = int(config.get('sync_script_ldap_timeout', default_ldap_timeout))
    try:
        nap_time = int(config.get('sync_script_nap_time', default_nap_time))
    except ValueError:
        nap_time = default_nap_time

    engine = create_engine(config['db']['conn']['str'] % config['db']['conn']['kwargs'],
                           **config['db']['kwargs'])

    # Optionally, maintain an internal list of mailing lists from ldap that can also be
    # used as targets.
    ldap_lists = config.get('ldap_lists')

    # Initialize these to zero at the start of the app, and don't reset them at every
    # metrics interval
    metrics.set('users_found', 0)
    metrics.set('teams_found', 0)
    metrics.set('ldap_lists_found', 0)
    metrics.set('ldap_memberships_found', 0)

    metrics_task = spawn(metrics.emit_forever)

    while True:
        if not bool(metrics_task):
            logger.error('metrics task failed, %s', metrics_task.exception)
            metrics_task = spawn(metrics.emit_forever)

        sync_from_oncall(config, engine)

        # Do ldap mailing list sync *after* we do the normal sync, to ensure we have the
        # users which will be in ldap already populated.
        if ldap_lists:
            if 'ldap_cert_path' in ldap_lists:
                ldap_cert_path = ldap_lists['ldap_cert_path']
                if not os.access(ldap_cert_path, os.R_OK):
                    logger.error('Failed to read ldap_cert_path certificate')
                    raise IOError
                else:
                    ldap_lists['cert_path'] = ldap_cert_path
            list_run_start = time.time()
            sync_ldap_lists(ldap_lists, engine)
            logger.info('Ldap mailing list sync took %.2f seconds',
                        time.time() - list_run_start)

        logger.info('Sleeping for %d seconds', nap_time)
        sleep(nap_time)

def leave_cluster(self):
    self.started_shutdown = True

    # Cancel any attempts to acquire the leader lock, which could make us hang
    self.lock.cancel()

    if self.zk.state == KazooState.CONNECTED:
        if self.party and self.party.participating:
            logger.info('Leaving party')
            self.party.leave()
        if self.lock and self.lock.is_acquired:
            logger.info('Releasing lock')
            self.lock.release()

    # Make us not the leader
    self.is_leader = False

    # Avoid emitting metrics saying we are still the leader when we're not
    metrics.set('is_leader_sender', 0)

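# A minimal sketch of how the zookeeper primitives used in leave_cluster()
# might be constructed with kazoo; the paths and identifier here are
# hypothetical, not the project's actual znode layout.
from kazoo.client import KazooClient


def example_coordinator_setup(hosts, instance_id):
    zk = KazooClient(hosts=hosts)
    zk.start()
    # Party tracks which sender instances are alive; Lock elects the leader.
    party = zk.Party('/iris/sender_nodes', identifier=instance_id)
    party.join()
    lock = zk.Lock('/iris/sender_leader', identifier=instance_id)
    return zk, party, lock
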
def gwatch_renewer():
    gmail_config = config['gmail']
    gcli = Gmail(gmail_config, config.get('gmail_proxy'))
    while True:
        logger.info('[-] start gmail watcher loop...')
        logger.info('renewing gmail watcher...')
        re = gcli.watch(gmail_config['project'], gmail_config['topic'])
        try:
            history_id, expiration = (int(re['historyId']),
                                      int(re['expiration']) / 1000 - time.time())
        except KeyError:
            logger.exception('[*] gmail watcher run failed. Skipping this run.')
        else:
            metrics.set('gmail_history_id', history_id)
            metrics.set('gmail_seconds_to_watch_expiration', expiration)
        logger.info('[*] gmail watcher loop finished')

        # only renew every 8 hours
        sleep(60 * 60 * 8)

def poll():
    # poll unsent messages
    logger.info('[-] start send task...')
    start_send = time.time()

    connection = db.engine.raw_connection()
    cursor = connection.cursor(db.dict_cursor)
    if messages:
        cursor.execute(UNSENT_MESSAGES_SQL + ' AND `message`.`id` NOT IN %s',
                       [tuple(messages)])
    else:
        cursor.execute(UNSENT_MESSAGES_SQL)

    new_msg_count = cursor.rowcount
    queued_msg_cnt = len(messages)
    metrics.set('new_msg_count', new_msg_count)
    logger.info('%d new messages waiting in database - queued: %d',
                new_msg_count, queued_msg_cnt)

    for m in cursor:
        # iris's own email response does not have context since content and
        # subject are already set
        if m.get('context'):
            context = ujson.loads(m['context'])
            # inject meta variables
            context['iris'] = {k: m[k] for k in m if k != 'context'}
            m['context'] = context
        message_queue.put(m)

    metrics.set('poll', time.time() - start_send)
    metrics.set('queue', len(messages))
    logger.info('[*] send task finished')
    cursor.close()
    connection.close()

def update_forever(self):
    while True:
        if self.started_shutdown:
            return

        old_status = self.is_master
        self.update_status()
        new_status = self.is_master

        if old_status != new_status:
            log = logger.info
        else:
            log = logger.debug

        if self.is_master:
            log('I am the master sender')
        else:
            log('I am a slave sender')

        metrics.set('slave_instance_count', self.slave_count)
        metrics.set('is_master_sender', int(self.is_master is True))

        sleep(UPDATE_FREQUENCY)

def poll(account, iris_client):
    try:
        metrics.set('total_inbox_count', account.inbox.total_count)
        metrics.set('unread_inbox_count', account.inbox.unread_count)
    except (exchangelib.errors.EWSError, requests.exceptions.RequestException):
        logger.exception('Failed to gather inbox counts from OWA API')
        metrics.incr('owa_api_failure_count')

    processed_messages = 0
    messages_to_mark_read = []

    try:
        for message in account.inbox.filter(is_read=False).order_by('-datetime_received'):
            processed_messages += 1
            try:
                relay(message, iris_client)
            except Exception:
                logger.exception('Uncaught exception during message relaying')
                metrics.incr('message_relay_failure_count')

            # Mark it as read in bulk later. (This syntax isn't documented)
            message.is_read = True
            messages_to_mark_read.append((message, ('is_read', )))
    except (exchangelib.errors.EWSError, requests.exceptions.RequestException):
        logger.exception('Failed to iterate through inbox')
        metrics.incr('owa_api_failure_count')

    if messages_to_mark_read:
        bulk_update_count = len(messages_to_mark_read)
        logger.info('will mark %s messages as read', bulk_update_count)
        try:
            account.bulk_update(items=messages_to_mark_read)
        except (exchangelib.errors.EWSError, requests.exceptions.RequestException):
            logger.exception('Failed to update read status on %s messages in bulk',
                             bulk_update_count)
            metrics.incr('owa_api_failure_count')

    metrics.set('message_process_count', processed_messages)
    return processed_messages

def sync_ldap_lists(ldap_settings, engine):
    try:
        l = ldap.ldapobject.ReconnectLDAPObject(ldap_settings['connection']['url'])
    except Exception:
        logger.exception('Connecting to ldap to get our mailing lists failed.')
        return

    try:
        l.simple_bind_s(*ldap_settings['connection']['bind_args'])
    except Exception:
        logger.exception('Binding to ldap to get our mailing lists failed.')
        return

    session = sessionmaker(bind=engine)()

    mailing_list_type_name = 'mailing-list'
    list_type_id = session.execute(
        'SELECT `id` FROM `target_type` WHERE `name` = :name',
        {'name': mailing_list_type_name}).scalar()
    if not list_type_id:
        try:
            list_type_id = session.execute(
                'INSERT INTO `target_type` (`name`) VALUES (:name)',
                {'name': mailing_list_type_name}).lastrowid
            session.commit()
            logger.info('Created target_type "%s" with id %s',
                        mailing_list_type_name, list_type_id)
        except (IntegrityError, DataError):
            logger.exception('Failed creating mailing-list type ID')
            return

    ldap_add_pause_interval = ldap_settings.get('user_add_pause_interval', None)
    ldap_add_pause_duration = ldap_settings.get('user_add_pause_duration', 1)

    ldap_lists = get_ldap_lists(l, ldap_settings['search_strings'])
    ldap_lists_count = len(ldap_lists)
    metrics.set('ldap_lists_found', ldap_lists_count)
    metrics.set('ldap_memberships_found', 0)
    logger.info('Found %s ldap lists', ldap_lists_count)

    existing_ldap_lists = {
        row[0]
        for row in session.execute(
            '''SELECT `name` FROM `target` WHERE `target`.`type_id` = :type_id''',
            {'type_id': list_type_id})
    }

    kill_lists = existing_ldap_lists - {item[1] for item in ldap_lists}
    if kill_lists:
        metrics.incr('ldap_lists_removed', len(kill_lists))
        for ldap_list in kill_lists:
            prune_target(engine, ldap_list, mailing_list_type_name)

    user_add_count = 0

    for list_cn, list_name in ldap_lists:
        try:
            members = get_ldap_flat_membership(l, ldap_settings['search_strings'],
                                               list_cn, ldap_settings['max_depth'],
                                               0, set())
        except ldap.SERVER_DOWN:
            # reconnect and retry once
            metrics.incr('ldap_reconnects')
            logger.warning('LDAP server went away for list %s. Reconnecting', list_name)
            l.reconnect(ldap_settings['connection']['url'])
            members = get_ldap_flat_membership(l, ldap_settings['search_strings'],
                                               list_cn, ldap_settings['max_depth'],
                                               0, set())

        if not members:
            logger.info('Ignoring/pruning empty ldap list %s', list_name)
            continue

        num_members = len(members)
        metrics.incr('ldap_memberships_found', num_members)

        created = False
        list_id = session.execute(
            '''SELECT `mailing_list`.`target_id`
               FROM `mailing_list`
               JOIN `target` ON `target`.`id` = `mailing_list`.`target_id`
               WHERE `target`.`name` = :name''',
            {'name': list_name}).scalar()

        if not list_id:
            try:
                list_id = session.execute(
                    '''INSERT INTO `target` (`type_id`, `name`) VALUES (:type_id, :name)''',
                    {'type_id': list_type_id, 'name': list_name}).lastrowid
                session.commit()
            except (IntegrityError, DataError):
                logger.exception(
                    'Failed adding row to target table for mailing list %s. '
                    'Skipping this list.', list_name)
                metrics.incr('ldap_lists_failed_to_add')
                continue

            try:
                session.execute(
                    '''INSERT INTO `mailing_list` (`target_id`, `count`)
                       VALUES (:list_id, :count)''',
                    {'list_id': list_id, 'count': num_members})
                session.commit()
            except (IntegrityError, DataError):
                logger.exception(
                    'Failed adding row to mailing_list table for mailing list %s (ID: %s). '
                    'Skipping this list.', list_name, list_id)
                metrics.incr('ldap_lists_failed_to_add')
                continue

            logger.info('Created list %s with id %s', list_name, list_id)
            metrics.incr('ldap_lists_added')
            created = True

        if not created:
            session.execute(
                'UPDATE `mailing_list` SET `count` = :count WHERE `target_id` = :list_id',
                {'count': num_members, 'list_id': list_id})
            session.commit()

        existing_members = {
            row[0]
            for row in session.execute(
                '''SELECT `target_contact`.`destination`
                   FROM `mailing_list_membership`
                   JOIN `target_contact`
                     ON `target_contact`.`target_id` = `mailing_list_membership`.`user_id`
                   WHERE `mailing_list_membership`.`list_id` = :list_id
                     AND `target_contact`.`mode_id` =
                         (SELECT `id` FROM `mode` WHERE `name` = 'email')''',
                {'list_id': list_id})
        }

        add_members = members - existing_members
        kill_members = existing_members - members

        if add_members:
            metrics.incr('ldap_memberships_added', len(add_members))
            for member in add_members:
                try:
                    session.execute(
                        '''INSERT IGNORE INTO `mailing_list_membership` (`list_id`, `user_id`)
                           VALUES (:list_id,
                                   (SELECT `target_id` FROM `target_contact`
                                    JOIN `target` ON `target`.`id` = `target_id`
                                    WHERE `destination` = :name
                                      AND `mode_id` =
                                          (SELECT `id` FROM `mode` WHERE `name` = 'email')
                                      AND `target`.`type_id` =
                                          (SELECT `id` FROM `target_type` WHERE `name` = 'user')))''',
                        {'list_id': list_id, 'name': member})
                    logger.info('Added %s to list %s', member, list_name)
                except (IntegrityError, DataError):
                    metrics.incr('ldap_memberships_failed_to_add')
                    logger.warning('Failed adding %s to %s', member, list_name)
                user_add_count += 1
                if (ldap_add_pause_interval is not None) and \
                        (user_add_count % ldap_add_pause_interval) == 0:
                    logger.info('Pausing for %s seconds every %s users.',
                                ldap_add_pause_duration, ldap_add_pause_interval)
                    time.sleep(ldap_add_pause_duration)

        if kill_members:
            metrics.incr('ldap_memberships_removed', len(kill_members))
            batch_remove_ldap_memberships(session, list_id, kill_members)

        session.commit()

    session.close()

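# A rough sketch of what a get_ldap_flat_membership() helper could look like:
# recursively expand nested groups up to max_depth, collecting member names.
# The attribute names, filter keys, and DN parsing here are all assumptions for
# illustration; the real helper and its search_strings layout live elsewhere in
# this module.
import ldap


def get_ldap_flat_membership_sketch(l, search_strings, group_cn, max_depth,
                                    depth, members):
    if depth > max_depth:
        return members
    results = l.search_s(search_strings['group_base'], ldap.SCOPE_SUBTREE,
                         search_strings['group_filter'] % group_cn, ['member'])
    for _dn, attrs in results:
        for member_dn in attrs.get('member', []):
            member_dn = member_dn.decode('utf-8')
            # hypothetical: pull the cn out of the member DN
            cn = member_dn.split(',', 1)[0].split('=', 1)[1]
            if member_dn.endswith(search_strings['group_base']):
                # nested list: recurse one level deeper
                get_ldap_flat_membership_sketch(l, search_strings, cn,
                                                max_depth, depth + 1, members)
            else:
                # leaf entry: hypothetically, the cn is the username
                members.add(cn)
    return members
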
def sync_from_oncall(config, engine, purge_old_users=True):
    # users and teams present in our oncall database
    oncall_base_url = config.get('oncall-api')

    if not oncall_base_url:
        logger.error('Missing URL to oncall-api, which we use for user/team lookups. Bailing.')
        return

    oncall = oncallclient.OncallClient(config.get('oncall-app', ''),
                                       config.get('oncall-key', ''),
                                       oncall_base_url)
    oncall_users = fetch_users_from_oncall(oncall)

    if not oncall_users:
        logger.warning('No users found. Bailing.')
        return

    oncall_team_names = fetch_teams_from_oncall(oncall)
    if not oncall_team_names:
        logger.warning('We do not have a list of team names')
    oncall_team_names = set(oncall_team_names)

    session = sessionmaker(bind=engine)()

    # users present in iris' database
    iris_users = {}
    for row in engine.execute(
            '''SELECT `target`.`name` as `name`, `mode`.`name` as `mode`,
                      `target_contact`.`destination`
               FROM `target`
               JOIN `user` on `target`.`id` = `user`.`target_id`
               LEFT OUTER JOIN `target_contact` ON `target`.`id` = `target_contact`.`target_id`
               LEFT OUTER JOIN `mode` ON `target_contact`.`mode_id` = `mode`.`id`
               WHERE `target`.`active` = TRUE
               ORDER BY `target`.`name`'''):
        contacts = iris_users.setdefault(row.name, {})
        if row.mode is None or row.destination is None:
            continue
        contacts[row.mode] = row.destination

    iris_usernames = iris_users.viewkeys()

    # users from the oncall endpoints and config files
    metrics.set('users_found', len(oncall_users))
    metrics.set('teams_found', len(oncall_team_names))
    oncall_users.update(get_predefined_users(config))
    oncall_usernames = oncall_users.viewkeys()

    # set of users not presently in iris
    users_to_insert = oncall_usernames - iris_usernames
    # set of existing iris users that are in the oncall database
    users_to_update = iris_usernames & oncall_usernames
    users_to_mark_inactive = iris_usernames - oncall_usernames

    # get objects needed for insertion
    target_types = {
        name: id
        for name, id in session.execute('SELECT `name`, `id` FROM `target_type`')
    }  # 'team' and 'user'
    modes = {
        name: id
        for name, id in session.execute('SELECT `name`, `id` FROM `mode`')
    }
    iris_team_names = {
        name
        for (name, ) in engine.execute(
            '''SELECT `name` FROM `target` WHERE `type_id` = %s''',
            target_types['team'])
    }

    target_add_sql = ('INSERT INTO `target` (`name`, `type_id`) VALUES (%s, %s) '
                      'ON DUPLICATE KEY UPDATE `active` = TRUE')
    user_add_sql = 'INSERT IGNORE INTO `user` (`target_id`) VALUES (%s)'
    target_contact_add_sql = '''INSERT INTO `target_contact` (`target_id`, `mode_id`, `destination`)
                                VALUES (%s, %s, %s)
                                ON DUPLICATE KEY UPDATE `destination` = %s'''

    # insert users that need to be
    logger.info('Users to insert (%d)', len(users_to_insert))
    for username in users_to_insert:
        logger.info('Inserting %s', username)
        try:
            target_id = engine.execute(target_add_sql,
                                       (username, target_types['user'])).lastrowid
            engine.execute(user_add_sql, (target_id, ))
        except SQLAlchemyError:
            metrics.incr('users_failed_to_add')
            metrics.incr('sql_errors')
            logger.exception('Failed to add user %s', username)
            continue
        metrics.incr('users_added')
        for key, value in oncall_users[username].iteritems():
            if value and key in modes:
                logger.info('%s: %s -> %s', username, key, value)
                engine.execute(target_contact_add_sql,
                               (target_id, modes[key], value, value))

    # update users that need to be
    contact_update_sql = ('UPDATE target_contact SET destination = %s '
                          'WHERE target_id = (SELECT id FROM target WHERE name = %s) '
                          'AND mode_id = %s')
    contact_insert_sql = ('INSERT INTO target_contact (target_id, mode_id, destination) '
                          'VALUES ((SELECT id FROM target WHERE name = %s), %s, %s)')
    contact_delete_sql = ('DELETE FROM target_contact '
                          'WHERE target_id = (SELECT id FROM target WHERE name = %s) '
                          'AND mode_id = %s')

    logger.info('Users to update (%d)', len(users_to_update))
    for username in users_to_update:
        try:
            db_contacts = iris_users[username]
            oncall_contacts = oncall_users[username]
            for mode in modes:
                if mode in oncall_contacts and oncall_contacts[mode]:
                    if mode in db_contacts:
                        if oncall_contacts[mode] != db_contacts[mode]:
                            logger.info('%s: updating %s', username, mode)
                            metrics.incr('user_contacts_updated')
                            engine.execute(contact_update_sql,
                                           (oncall_contacts[mode], username, modes[mode]))
                    else:
                        logger.info('%s: adding %s', username, mode)
                        metrics.incr('user_contacts_updated')
                        engine.execute(contact_insert_sql,
                                       (username, modes[mode], oncall_contacts[mode]))
                elif mode in db_contacts:
                    logger.info('%s: deleting %s', username, mode)
                    metrics.incr('user_contacts_updated')
                    engine.execute(contact_delete_sql, (username, modes[mode]))
                else:
                    logger.debug('%s: missing %s', username, mode)
        except SQLAlchemyError:
            metrics.incr('users_failed_to_update')
            metrics.incr('sql_errors')
            logger.exception('Failed to update user %s', username)
            continue

    # sync teams between iris and oncall
    teams_to_insert = oncall_team_names - iris_team_names
    teams_to_deactivate = iris_team_names - oncall_team_names

    logger.info('Teams to insert (%d)', len(teams_to_insert))
    for t in teams_to_insert:
        logger.info('Inserting %s', t)
        try:
            target_id = engine.execute(target_add_sql, (t, target_types['team'])).lastrowid
            metrics.incr('teams_added')
        except SQLAlchemyError as e:
            logger.exception('Error inserting team %s: %s', t, e)
            metrics.incr('teams_failed_to_add')
            continue

    session.commit()
    session.close()

    # mark users/teams inactive
    if purge_old_users:
        logger.info('Users to mark inactive (%d)', len(users_to_mark_inactive))
        for username in users_to_mark_inactive:
            prune_target(engine, username, 'user')
        for team in teams_to_deactivate:
            prune_target(engine, team, 'team')

def sync_from_oncall(config, engine, purge_old_users=True):
    # users and teams present in our oncall database
    oncall_base_url = config.get('oncall-api')

    if not oncall_base_url:
        logger.error('Missing URL to oncall-api, which we use for user/team lookups. Bailing.')
        return

    oncall = oncallclient.OncallClient(config.get('oncall-app', ''),
                                       config.get('oncall-key', ''),
                                       oncall_base_url)
    oncall_users = fetch_users_from_oncall(oncall)

    if not oncall_users:
        logger.warning('No users found. Bailing.')
        return

    # get teams from oncall-api and separate the list of tuples into two lists
    # of names and ids
    oncall_teams_api_response = fetch_teams_from_oncall(oncall)
    if not oncall_teams_api_response:
        logger.warning('No teams found. Bailing.')
        return

    oncall_team_response = list(zip(*oncall_teams_api_response))
    oncall_team_names = [name.lower() for name in oncall_team_response[0]]
    oncall_team_ids = oncall_team_response[1]
    oncall_response_dict_name_key = dict(zip(oncall_team_names, oncall_team_ids))
    oncall_response_dict_id_key = dict(zip(oncall_team_ids, oncall_team_names))
    oncall_case_sensitive_dict = {
        name.lower(): name
        for name in oncall_team_response[0]
    }

    if not oncall_team_names:
        logger.warning('We do not have a list of team names')
    oncall_team_names = set(oncall_team_names)
    oncall_team_ids = set(oncall_team_ids)

    session = sessionmaker(bind=engine)()

    # users present in iris' database
    iris_users = {}
    for row in engine.execute(
            '''SELECT `target`.`name` as `name`, `mode`.`name` as `mode`,
                      `target_contact`.`destination`
               FROM `target`
               JOIN `user` on `target`.`id` = `user`.`target_id`
               LEFT OUTER JOIN `target_contact` ON `target`.`id` = `target_contact`.`target_id`
               LEFT OUTER JOIN `mode` ON `target_contact`.`mode_id` = `mode`.`id`
               WHERE `target`.`active` = TRUE
               ORDER BY `target`.`name`'''):
        contacts = iris_users.setdefault(row.name, {})
        if row.mode is None or row.destination is None:
            continue
        contacts[row.mode] = row.destination

    iris_usernames = iris_users.keys()

    # users from the oncall endpoints and config files
    metrics.set('users_found', len(oncall_users))
    metrics.set('teams_found', len(oncall_team_names))
    oncall_users.update(get_predefined_users(config))
    oncall_usernames = oncall_users.keys()

    # set of users not presently in iris
    users_to_insert = oncall_usernames - iris_usernames
    # set of existing iris users that are in the oncall database
    users_to_update = iris_usernames & oncall_usernames
    users_to_mark_inactive = iris_usernames - oncall_usernames

    # get objects needed for insertion
    target_types = {
        name: target_id
        for name, target_id in session.execute('SELECT `name`, `id` FROM `target_type`')
    }  # 'team' and 'user'
    modes = {
        name: mode_id
        for name, mode_id in session.execute('SELECT `name`, `id` FROM `mode`')
    }
    iris_team_names = {
        name.lower()
        for (name, ) in engine.execute(
            '''SELECT `name` FROM `target` WHERE `type_id` = %s''',
            target_types['team'])
    }

    target_add_sql = ('INSERT INTO `target` (`name`, `type_id`) VALUES (%s, %s) '
                      'ON DUPLICATE KEY UPDATE `active` = TRUE')
    oncall_add_sql = 'INSERT INTO `oncall_team` (`target_id`, `oncall_team_id`) VALUES (%s, %s)'
    user_add_sql = 'INSERT IGNORE INTO `user` (`target_id`) VALUES (%s)'
    target_contact_add_sql = '''INSERT INTO `target_contact` (`target_id`, `mode_id`, `destination`)
                                VALUES (%s, %s, %s)
                                ON DUPLICATE KEY UPDATE `destination` = %s'''

    # insert users that need to be
    logger.info('Users to insert (%d)', len(users_to_insert))
    for username in users_to_insert:
        sleep(update_sleep)
        logger.info('Inserting %s', username)
        try:
            target_id = engine.execute(target_add_sql,
                                       (username, target_types['user'])).lastrowid
            engine.execute(user_add_sql, (target_id, ))
        except SQLAlchemyError:
            metrics.incr('users_failed_to_add')
            metrics.incr('sql_errors')
            logger.exception('Failed to add user %s', username)
            continue
        metrics.incr('users_added')
        for key, value in oncall_users[username].items():
            if value and key in modes:
                logger.info('%s: %s -> %s', username, key, value)
                try:
                    engine.execute(target_contact_add_sql,
                                   (target_id, modes[key], value, value))
                except SQLAlchemyError:
                    logger.exception('Error adding contact for target id: %s', target_id)
                    metrics.incr('sql_errors')
                    continue

    # update users that need to be
    contact_update_sql = ('UPDATE target_contact SET destination = %s '
                          'WHERE target_id = (SELECT id FROM target WHERE name = %s AND type_id = %s) '
                          'AND mode_id = %s')
    contact_insert_sql = ('INSERT INTO target_contact (target_id, mode_id, destination) '
                          'VALUES ((SELECT id FROM target WHERE name = %s AND type_id = %s), %s, %s)')
    contact_delete_sql = ('DELETE FROM target_contact '
                          'WHERE target_id = (SELECT id FROM target WHERE name = %s AND type_id = %s) '
                          'AND mode_id = %s')

    logger.info('Users to update (%d)', len(users_to_update))
    for username in users_to_update:
        sleep(update_sleep)
        try:
            db_contacts = iris_users[username]
            oncall_contacts = oncall_users[username]
            for mode in modes:
                if mode in oncall_contacts and oncall_contacts[mode]:
                    if mode in db_contacts:
                        if oncall_contacts[mode] != db_contacts[mode]:
                            logger.info('%s: updating %s', username, mode)
                            metrics.incr('user_contacts_updated')
                            engine.execute(contact_update_sql,
                                           (oncall_contacts[mode], username,
                                            target_types['user'], modes[mode]))
                    else:
                        logger.info('%s: adding %s', username, mode)
                        metrics.incr('user_contacts_updated')
                        engine.execute(contact_insert_sql,
                                       (username, target_types['user'], modes[mode],
                                        oncall_contacts[mode]))
                elif mode in db_contacts:
                    logger.info('%s: deleting %s', username, mode)
                    metrics.incr('user_contacts_updated')
                    engine.execute(contact_delete_sql,
                                   (username, target_types['user'], modes[mode]))
                else:
                    logger.debug('%s: missing %s', username, mode)
        except SQLAlchemyError:
            metrics.incr('users_failed_to_update')
            metrics.incr('sql_errors')
            logger.exception('Failed to update user %s', username)
            continue

    # sync teams between iris and oncall
    # iris_db_oncall_team_ids: team_ids in the oncall_team table
    # oncall_team_ids: team_ids from the oncall api call
    # oncall_team_names: names from the oncall api call
    # oncall_response_dict_name_key: oncall team name -> id, from the api call
    # oncall_response_dict_id_key: same as above but key and value inverted
    # oncall_case_sensitive_dict: maps the case-insensitive oncall name to the original capitalization
    # iris_team_names: names from the target table
    # iris_target_name_id_dict: target name -> target_id mappings
    # iris_db_oncall_team_id_name_dict: oncall team_id -> oncall name mappings

    # Get all incoming names that match a target, check whether that target has
    # an entry in the oncall_team table, and if not, make one.
    iris_target_name_id_dict = {
        name.lower(): target_id
        for name, target_id in engine.execute(
            '''SELECT `name`, `id` FROM `target` WHERE `type_id` = %s''',
            target_types['team'])
    }
    matching_target_names = iris_team_names.intersection(oncall_team_names)
    if matching_target_names:
        existing_up_to_date_oncall_teams = {
            name.lower()
            for (name, ) in session.execute(
                '''SELECT `target`.`name` FROM `target`
                   JOIN `oncall_team` ON `oncall_team`.`target_id` = `target`.`id`
                   WHERE `target`.`name` IN :matching_names''',
                {'matching_names': tuple(matching_target_names)})
        }
        # up to date target names that don't have an entry in the oncall_team table yet
        matching_target_names_no_oncall_entry = (matching_target_names -
                                                 existing_up_to_date_oncall_teams)
        for t in matching_target_names_no_oncall_entry:
            logger.info('Inserting existing team into oncall_team %s', t)
            sleep(update_sleep)
            try:
                engine.execute('''UPDATE `target` SET `active` = TRUE WHERE `id` = %s''',
                               iris_target_name_id_dict[t])
                engine.execute(oncall_add_sql,
                               (iris_target_name_id_dict[t],
                                oncall_response_dict_name_key[t]))
            except SQLAlchemyError as e:
                logger.exception('Error inserting oncall_team %s: %s', t, e)
                metrics.incr('sql_errors')
                continue

    # rename all mismatching target names
    iris_db_oncall_team_id_name_dict = {
        team_id: name.lower()
        for name, team_id in engine.execute(
            '''SELECT target.name, oncall_team.oncall_team_id
               FROM `target`
               JOIN `oncall_team` ON oncall_team.target_id = target.id''')
    }
    iris_db_oncall_team_ids = {
        oncall_team_id
        for (oncall_team_id, ) in engine.execute(
            '''SELECT `oncall_team_id` FROM `oncall_team`''')
    }
    matching_oncall_ids = oncall_team_ids.intersection(iris_db_oncall_team_ids)
    name_swaps = {}

    # find teams in the iris database whose names have changed
    for oncall_id in matching_oncall_ids:
        current_name = iris_db_oncall_team_id_name_dict[oncall_id]
        new_name = oncall_response_dict_id_key[oncall_id]
        try:
            if current_name != new_name:
                # handle edge case of teams swapping names
                if not iris_target_name_id_dict.get(new_name, None):
                    target_id_to_rename = iris_target_name_id_dict[current_name]
                    logger.info('Renaming team %s to %s', current_name, new_name)
                    engine.execute(
                        '''UPDATE `target` SET `name` = %s, `active` = TRUE WHERE `id` = %s''',
                        (oncall_case_sensitive_dict[new_name], target_id_to_rename))
                else:
                    # there is a name swap, so rename to a random name first to avoid
                    # violating the unique target name constraint
                    new_name = str(uuid.uuid4())
                    target_id_to_rename = iris_target_name_id_dict[current_name]
                    name_swaps[oncall_id] = target_id_to_rename
                    logger.info('Renaming team %s to %s', current_name, new_name)
                    engine.execute(
                        '''UPDATE `target` SET `name` = %s, `active` = TRUE WHERE `id` = %s''',
                        (new_name, target_id_to_rename))
                sleep(update_sleep)
        except SQLAlchemyError:
            logger.exception('Error changing team name of %s to %s', current_name, new_name)
            metrics.incr('sql_errors')

    # go back and rename name_swaps to their correct values
    for oncall_id, target_id_to_rename in name_swaps.items():
        new_name = oncall_response_dict_id_key[oncall_id]
        try:
            engine.execute(
                '''UPDATE `target` SET `name` = %s, `active` = TRUE WHERE `id` = %s''',
                (oncall_case_sensitive_dict[new_name], target_id_to_rename))
        except SQLAlchemyError:
            logger.exception('Error renaming target: %s', new_name)
            metrics.incr('sql_errors')
            continue
        sleep(update_sleep)

    # create new entries for new teams
    # if the team_id doesn't exist in oncall_team at this point then it is a new team
    new_team_ids = oncall_team_ids - iris_db_oncall_team_ids
    logger.info('Teams to insert (%d)', len(new_team_ids))
    for team_id in new_team_ids:
        t = oncall_case_sensitive_dict[oncall_response_dict_id_key[team_id]]
        new_target_id = None

        # add team to target table
        logger.info('Inserting team %s', t)
        sleep(update_sleep)
        try:
            new_target_id = engine.execute(target_add_sql,
                                           (t, target_types['team'])).lastrowid
            metrics.incr('teams_added')
        except SQLAlchemyError as e:
            logger.exception('Error inserting team %s: %s', t, e)
            metrics.incr('teams_failed_to_add')
            metrics.incr('sql_errors')
            continue

        # add team to oncall_team table
        if new_target_id:
            logger.info('Inserting new team into oncall_team %s', t)
            try:
                engine.execute(oncall_add_sql, (new_target_id, team_id))
            except SQLAlchemyError as e:
                logger.exception('Error inserting oncall_team %s: %s', t, e)
                metrics.incr('sql_errors')
                continue

    session.commit()
    session.close()

    # mark users/teams inactive
    if purge_old_users:
        # find active teams that don't exist in oncall anymore
        updated_iris_team_names = {
            name.lower()
            for (name, ) in engine.execute(
                '''SELECT `name` FROM `target` WHERE `type_id` = %s AND `active` = TRUE''',
                target_types['team'])
        }
        teams_to_deactivate = updated_iris_team_names - oncall_team_names

        logger.info('Users to mark inactive (%d)', len(users_to_mark_inactive))
        logger.info('Teams to mark inactive (%d)', len(teams_to_deactivate))
        for username in users_to_mark_inactive:
            prune_target(engine, username, 'user')
            sleep(update_sleep)
        for team in teams_to_deactivate:
            prune_target(engine, team, 'team')
            sleep(update_sleep)

def process_retention(engine, max_days, batch_size, cooldown_time, archive_path):
    time_start = time.time()

    connection = engine.raw_connection()
    cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)

    deleted_incidents = 0
    deleted_messages = 0
    deleted_comments = 0

    # First, archive/kill incidents and their messages
    while True:
        # Get incidents to archive and kill, in batches
        try:
            cursor.execute(
                '''SELECT %s
                   FROM `incident`
                   LEFT JOIN `plan` on `plan`.`id` = `incident`.`plan_id`
                   LEFT JOIN `application` on `application`.`id` = `incident`.`application_id`
                   LEFT JOIN `target` ON `incident`.`owner_id` = `target`.`id`
                   WHERE `incident`.`created` < (CURDATE() - INTERVAL %%s DAY)
                   LIMIT %%s''' % (', '.join(field[0] for field in incident_fields)),
                [max_days, batch_size])
        except Exception:
            logger.exception('Failed getting incidents')
            try:
                cursor.close()
            except Exception:
                pass
            cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)
            break

        incident_ids = deque()
        for incident in cursor:
            archive_incident(incident, archive_path)
            incident_ids.append(incident[0])

        if not incident_ids:
            break

        logger.info('Archived %d incidents', len(incident_ids))

        # Then, archive+kill all comments in these incidents
        while True:
            try:
                cursor.execute(
                    '''SELECT %s
                       FROM `comment`
                       LEFT JOIN `target` ON `comment`.`user_id` = `target`.`id`
                       WHERE `comment`.`incident_id` in %%s
                       LIMIT %%s''' % (', '.join(field[0] for field in comment_fields)),
                    [tuple(incident_ids), batch_size])
            except Exception:
                metrics.incr('sql_errors')
                logger.exception('Failed getting comments')
                try:
                    cursor.close()
                except Exception:
                    pass
                cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)
                break

            comment_ids = deque()
            for comment in cursor:
                archive_comment(comment, archive_path)
                comment_ids.append(comment[0])

            if not comment_ids:
                break

            logger.info('Archived %d comments', len(comment_ids))

            try:
                deleted_rows = cursor.execute('DELETE FROM `comment` WHERE `id` IN %s',
                                              [tuple(comment_ids)])
                connection.commit()
            except Exception:
                metrics.incr('sql_errors')
                logger.exception('Failed deleting comments from incidents')
                try:
                    cursor.close()
                except Exception:
                    pass
                cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)
                break
            else:
                if deleted_rows:
                    logger.info('Killed %d comments from %d incidents',
                                deleted_rows, len(incident_ids))
                    deleted_comments += deleted_rows
                    sleep(cooldown_time)
                else:
                    break

        # Kill all dynamic plan maps associated with these incidents
        while True:
            try:
                deleted_rows = cursor.execute(
                    'DELETE FROM `dynamic_plan_map` WHERE `incident_id` IN %s',
                    [tuple(incident_ids)])
                connection.commit()
            except Exception:
                metrics.incr('sql_errors')
                logger.exception('Failed deleting dynamic plan maps')
                try:
                    cursor.close()
                except Exception:
                    pass
                cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)
                break
            else:
                if deleted_rows:
                    logger.info('Killed %d dynamic plan maps', deleted_rows)
                    deleted_messages += deleted_rows
                    sleep(cooldown_time)
                else:
                    break

        # Archive+kill all messages in these incidents
        while True:
            try:
                cursor.execute(
                    '''SELECT %s
                       FROM `message`
                       JOIN `priority` on `priority`.`id` = `message`.`priority_id`
                       LEFT JOIN `mode` on `mode`.`id` = `message`.`mode_id`
                       LEFT JOIN `template` ON `message`.`template_id` = `template`.`id`
                       LEFT JOIN `target` ON `message`.`target_id` = `target`.`id`
                       WHERE `message`.`incident_id` in %%s
                       LIMIT %%s''' % (', '.join(field[0] for field in message_fields)),
                    [tuple(incident_ids), batch_size])
            except Exception:
                metrics.incr('sql_errors')
                logger.exception('Failed getting messages')
                try:
                    cursor.close()
                except Exception:
                    pass
                cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)
                break

            message_ids = deque()
            for message in cursor:
                archive_message(message, archive_path)
                message_ids.append(message[0])

            if not message_ids:
                break

            logger.info('Archived %d messages', len(message_ids))

            # explicitly delete all the extra message data
            try:
                cursor.execute('DELETE FROM `message_changelog` WHERE `message_id` IN %s',
                               [tuple(message_ids)])
                cursor.execute('DELETE FROM `response` WHERE `message_id` IN %s',
                               [tuple(message_ids)])
                cursor.execute('DELETE FROM `twilio_delivery_status` WHERE `message_id` IN %s',
                               [tuple(message_ids)])
                cursor.execute('DELETE FROM `twilio_retry` WHERE `message_id` IN %s',
                               [tuple(message_ids)])
                cursor.execute('DELETE FROM `generic_message_sent_status` WHERE `message_id` IN %s',
                               [tuple(message_ids)])
                connection.commit()
            except Exception:
                metrics.incr('sql_errors')
                logger.exception('Failed deleting message child rows')
                try:
                    cursor.close()
                except Exception:
                    pass
                cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)

            try:
                deleted_rows = cursor.execute('DELETE FROM `message` WHERE `id` IN %s',
                                              [tuple(message_ids)])
                connection.commit()
            except Exception:
                metrics.incr('sql_errors')
                logger.exception('Failed deleting messages from incidents')
                try:
                    cursor.close()
                except Exception:
                    pass
                cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)

                # try deleting individually to directly identify any issues and
                # prevent a single error from stopping cleanup
                deleted_rows = 0
                for msg_id in message_ids:
                    try:
                        deleted_rows += cursor.execute('DELETE FROM `message` WHERE `id`=%s',
                                                       msg_id)
                        connection.commit()
                    except Exception:
                        metrics.incr('sql_errors')
                        logger.exception('Failed deleting message id: %s', msg_id)
            else:
                if deleted_rows:
                    logger.info('Killed %d messages from %d incidents',
                                deleted_rows, len(incident_ids))
                    deleted_messages += deleted_rows
                    sleep(cooldown_time)
                else:
                    break

        # Finally kill incidents
        try:
            deleted_rows = cursor.execute('DELETE FROM `incident` WHERE `id` IN %s',
                                          [tuple(incident_ids)])
            connection.commit()
        except Exception:
            metrics.incr('sql_errors')
            logger.exception('Failed deleting incidents')
            try:
                cursor.close()
            except Exception:
                pass
            cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)

            # try deleting individually to directly identify any issues and
            # prevent a single error from stopping clean-up
            deleted_rows = 0
            for inc_id in incident_ids:
                try:
                    deleted_rows += cursor.execute('DELETE FROM `incident` WHERE `id`=%s',
                                                   inc_id)
                    connection.commit()
                except Exception:
                    metrics.incr('sql_errors')
                    logger.exception('Failed deleting incident id: %s', inc_id)

        logger.info('Deleted %s incidents', deleted_rows)
        deleted_incidents += deleted_rows
        sleep(cooldown_time)

    # Next, kill messages not tied to incidents, like quota notifs or incident tracking emails
    while True:
        try:
            deleted_rows = cursor.execute(
                'DELETE FROM `message` WHERE `created` < (CURDATE() - INTERVAL %s DAY) '
                'AND `incident_id` IS NULL LIMIT %s',
                [max_days, batch_size])
            connection.commit()
        except Exception:
            metrics.incr('sql_errors')
            logger.exception('Failed deleting messages')
            try:
                cursor.close()
            except Exception:
                pass
            break
        else:
            if deleted_rows:
                logger.info('Killed %d misc messages', deleted_rows)
                deleted_messages += deleted_rows
                sleep(cooldown_time)
            else:
                break

    try:
        cursor.close()
    except Exception:
        pass
    connection.close()

    logger.info('Run took %.2f seconds and deleted %d incidents and %d messages',
                time.time() - time_start, deleted_incidents, deleted_messages)
    metrics.set('deleted_messages', deleted_messages)
    metrics.set('deleted_incidents', deleted_incidents)
    metrics.set('deleted_comments', deleted_comments)

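# One plausible implementation of the archive_incident/archive_message/
# archive_comment helpers used above: append each row as CSV to a per-table
# file under archive_path. The field order comes from the incident_fields/
# message_fields/comment_fields tuples selected in process_retention(); the
# file naming and CSV format here are assumptions.
import csv
import os


def archive_row_sketch(row, archive_path, table_name):
    # e.g. archive_row_sketch(incident, archive_path, 'incident')
    with open(os.path.join(archive_path, '%s.csv' % table_name), 'a') as fh:
        csv.writer(fh).writerow(row)
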
def process_retention(engine, max_days, batch_size, cooldown_time, archive_path):
    time_start = time.time()

    connection = engine.raw_connection()
    cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)

    deleted_incidents = 0
    deleted_messages = 0

    # First, archive/kill incidents and their messages
    while True:
        # Get incidents to archive and kill, in batches
        try:
            cursor.execute(
                '''SELECT %s
                   FROM `incident`
                   LEFT JOIN `plan` on `plan`.`id` = `incident`.`plan_id`
                   LEFT JOIN `application` on `application`.`id` = `incident`.`application_id`
                   LEFT JOIN `target` ON `incident`.`owner_id` = `target`.`id`
                   WHERE `incident`.`created` < (CURDATE() - INTERVAL %%s DAY)
                   LIMIT %%s''' % (', '.join(field[0] for field in incident_fields)),
                [max_days, batch_size])
        except Exception:
            logger.exception('Failed getting incidents')
            try:
                cursor.close()
            except Exception:
                pass
            cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)
            break

        incident_ids = deque()
        for incident in cursor:
            archive_incident(incident, archive_path)
            incident_ids.append(incident[0])

        if not incident_ids:
            break

        logger.info('Archived %d incidents', len(incident_ids))

        # Archive+kill all messages in these incidents
        while True:
            try:
                cursor.execute(
                    '''SELECT %s
                       FROM `message`
                       JOIN `priority` on `priority`.`id` = `message`.`priority_id`
                       LEFT JOIN `mode` on `mode`.`id` = `message`.`mode_id`
                       LEFT JOIN `template` ON `message`.`template_id` = `template`.`id`
                       LEFT JOIN `target` ON `message`.`target_id` = `target`.`id`
                       WHERE `message`.`incident_id` in %%s
                       LIMIT %%s''' % (', '.join(field[0] for field in message_fields)),
                    [tuple(incident_ids), batch_size])
            except Exception:
                metrics.incr('sql_errors')
                logger.exception('Failed getting messages')
                try:
                    cursor.close()
                except Exception:
                    pass
                cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)
                break

            message_ids = deque()
            for message in cursor:
                archive_message(message, archive_path)
                message_ids.append(message[0])

            if not message_ids:
                break

            logger.info('Archived %d messages', len(message_ids))

            try:
                deleted_rows = cursor.execute('DELETE FROM `message` WHERE `id` IN %s',
                                              [tuple(message_ids)])
                connection.commit()
            except Exception:
                metrics.incr('sql_errors')
                logger.exception('Failed deleting messages from incidents')
                try:
                    cursor.close()
                except Exception:
                    pass
                cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)
                break
            else:
                if deleted_rows:
                    logger.info('Killed %d messages from %d incidents',
                                deleted_rows, len(incident_ids))
                    deleted_messages += deleted_rows
                    sleep(cooldown_time)
                else:
                    break

        # Finally kill incidents
        try:
            deleted_rows = cursor.execute('DELETE FROM `incident` WHERE `id` IN %s',
                                          [tuple(incident_ids)])
            connection.commit()
        except Exception:
            metrics.incr('sql_errors')
            logger.exception('Failed deleting incidents')
            try:
                cursor.close()
            except Exception:
                pass
            cursor = connection.cursor(engine.dialect.dbapi.cursors.SSCursor)
            break
        else:
            logger.info('Deleted %s incidents', deleted_rows)
            deleted_incidents += deleted_rows
            sleep(cooldown_time)

    # Next, kill messages not tied to incidents, like quota notifs or incident tracking emails
    while True:
        try:
            deleted_rows = cursor.execute(
                'DELETE FROM `message` WHERE `created` < (CURDATE() - INTERVAL %s DAY) '
                'AND `incident_id` IS NULL LIMIT %s',
                [max_days, batch_size])
            connection.commit()
        except Exception:
            metrics.incr('sql_errors')
            logger.exception('Failed deleting messages')
            try:
                cursor.close()
            except Exception:
                pass
            break
        else:
            if deleted_rows:
                logger.info('Killed %d misc messages', deleted_rows)
                deleted_messages += deleted_rows
                sleep(cooldown_time)
            else:
                break

    try:
        cursor.close()
    except Exception:
        pass
    connection.close()

    logger.info('Run took %.2f seconds and deleted %d incidents and %d messages',
                time.time() - time_start, deleted_incidents, deleted_messages)
    metrics.set('deleted_messages', deleted_messages)
    metrics.set('deleted_incidents', deleted_incidents)

def main():
    global ldap_timeout
    global ldap_pagination_size
    global update_sleep

    config = load_config()
    metrics.init(config, 'iris-sync-targets', stats_reset)

    default_ldap_timeout = 60
    default_ldap_pagination_size = 400
    default_update_sleep = 0
    default_ldap_nap_time = 3600
    default_oncall_nap_time = 60

    ldap_timeout = int(config.get('sync_script_ldap_timeout', default_ldap_timeout))
    ldap_pagination_size = int(config.get('sync_script_ldap_pagination_size',
                                          default_ldap_pagination_size))
    update_sleep = float(config.get('target_update_pause', default_update_sleep))
    try:
        ldap_nap_time = int(config.get('sync_script_ldap_nap_time', default_ldap_nap_time))
        oncall_nap_time = int(config.get('sync_script_oncall_nap_time',
                                         default_oncall_nap_time))
    except ValueError:
        ldap_nap_time = default_ldap_nap_time
        oncall_nap_time = default_oncall_nap_time

    # check if we are using special connection settings for this script
    if config.get('db_target_sync'):
        engine = create_engine(config['db_target_sync']['conn']['str'] %
                               config['db_target_sync']['conn']['kwargs'],
                               **config['db_target_sync']['kwargs'])
    else:
        engine = create_engine(config['db']['conn']['str'] % config['db']['conn']['kwargs'],
                               **config['db']['kwargs'])

    # Optionally, maintain an internal list of mailing lists from ldap that can also be
    # used as targets.
    ldap_lists = config.get('ldap_lists')

    # Initialize these to zero at the start of the app, and don't reset them at every
    # metrics interval
    metrics.set('users_found', 0)
    metrics.set('teams_found', 0)
    metrics.set('ldap_lists_found', 0)
    metrics.set('ldap_memberships_found', 0)

    metrics_task = spawn(metrics.emit_forever)
    oncall_task = spawn(oncall_sync_loop, config, engine, oncall_nap_time)

    ldap_task = None
    if ldap_lists:
        if 'ldap_cert_path' in ldap_lists:
            ldap_cert_path = ldap_lists['ldap_cert_path']
            if not os.access(ldap_cert_path, os.R_OK):
                logger.error('Failed to read ldap_cert_path certificate')
                raise IOError
            else:
                ldap_lists['cert_path'] = ldap_cert_path
        ldap_task = spawn(ldap_sync_loop, ldap_lists, engine, ldap_nap_time)

    while True:
        if not bool(metrics_task):
            metrics.incr('failed_tasks')
            logger.error('metrics task failed, %s', metrics_task.exception)
            metrics_task = spawn(metrics.emit_forever)

        if not bool(oncall_task):
            metrics.incr('failed_tasks')
            logger.error('oncall task failed, %s', oncall_task.exception)
            oncall_task = spawn(oncall_sync_loop, config, engine, oncall_nap_time)

        if ldap_lists:
            if not bool(ldap_task):
                metrics.incr('failed_tasks')
                logger.error('ldap task failed, %s', ldap_task.exception)
                ldap_task = spawn(ldap_sync_loop, ldap_lists, engine, ldap_nap_time)

        sleep(10)

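# Minimal sketches of the greenlet loop bodies spawned above, assuming they
# simply run one sync pass and nap, matching the single-threaded main()
# variants earlier in this file. The real loop bodies may differ.
def oncall_sync_loop(config, engine, nap_time):
    while True:
        sync_from_oncall(config, engine)
        logger.info('Oncall sync done, sleeping for %d seconds', nap_time)
        sleep(nap_time)


def ldap_sync_loop(ldap_lists, engine, nap_time):
    while True:
        list_run_start = time.time()
        sync_ldap_lists(ldap_lists, engine)
        logger.info('Ldap mailing list sync took %.2f seconds',
                    time.time() - list_run_start)
        sleep(nap_time)
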
def main():
    global config
    config = load_config()

    start_time = time.time()
    logger.info('[-] bootstrapping sender...')
    init_sender(config)
    init_plugins(config.get('plugins', {}))
    init_vendors(config.get('vendors', []), config.get('applications', []))

    send_task = spawn(send)
    worker_tasks = [spawn(worker) for x in xrange(100)]

    rpc.init(config['sender'], dict(send_message=send_message))
    rpc.run(config['sender'])

    spawn(coordinator.update_forever)

    gwatch_renewer_task = None
    prune_audit_logs_task = None

    interval = 60
    logger.info('[*] sender bootstrapped')
    while True:
        runtime = int(time.time())
        logger.info('--> sender loop started.')

        cache.refresh()
        cache.purge()

        # If we're currently a master, ensure our master-greenlets are running
        # and we're doing the master duties
        if coordinator.am_i_master():
            if not bool(gwatch_renewer_task):
                if should_mock_gwatch_renewer:
                    gwatch_renewer_task = spawn(mock_gwatch_renewer)
                else:
                    gwatch_renewer_task = spawn(gwatch_renewer)

            if not bool(prune_audit_logs_task):
                prune_audit_logs_task = spawn(prune_old_audit_logs_worker)

            try:
                escalate()
                deactivate()
                poll()
                aggregate(runtime)
            except Exception:
                metrics.incr('task_failure')
                logger.exception('Exception occurred in main loop.')

        # If we're not master, don't do the master tasks and make sure those other
        # greenlets are stopped if they're running
        else:
            logger.info('I am not the master so I am not doing master sender tasks.')

            # Stop these task greenlets if they're running. Technically this should
            # never happen because if we're the master, we'll likely only stop being the
            # master if our process exits, which would kill these greenlets anyway.
            if bool(gwatch_renewer_task):
                logger.info('I am not master anymore so stopping the gwatch renewer')
                gwatch_renewer_task.kill()

            if bool(prune_audit_logs_task):
                logger.info('I am not master anymore so stopping the audit logs worker')
                prune_audit_logs_task.kill()

        # check status for all background greenlets and respawn if necessary
        if not bool(send_task):
            logger.error('send task failed, %s', send_task.exception)
            metrics.incr('task_failure')
            send_task = spawn(send)

        bad_workers = []
        for i, task in enumerate(worker_tasks):
            if not bool(task):
                logger.error('worker task failed, %s', task.exception)
                metrics.incr('task_failure')
                bad_workers.append(i)
        for i in bad_workers:
            worker_tasks[i] = spawn(worker)

        now = time.time()
        metrics.set('sender_uptime', int(now - start_time))

        spawn(metrics.emit)

        elapsed_time = now - runtime
        nap_time = max(0, interval - elapsed_time)
        logger.info('--> sender loop finished in %s seconds - sleeping %s seconds',
                    elapsed_time, nap_time)
        sleep(nap_time)

                        # no message created due to role look up failure, reset step to
                        # 0 for retry
                        step = 0
                    cursor.execute(UPDATE_INCIDENT_SQL, (step, incident_id))
                    msg_count += step_msg_cnt
                else:
                    logger.error('plan id %d has no steps, incident id %d is invalid',
                                 plan_id, incident_id)
                    cursor.execute(INVALIDATE_INCIDENT, incident_id)

    connection.commit()
    cursor.close()
    connection.close()

    logger.info('[*] %s new messages', msg_count)
    logger.info('[*] escalate task finished')
    metrics.set('notifications', time.time() - start_notifications)