def log_event(owner_id, event_type, action, error=None, **kwargs):
    """Log a new event.

    Log a new event composed of the arguments provided. Once the new event
    has been prepared, additional processing is applied in order to
    associate any relevant stories, and, finally, it is pushed to RabbitMQ.

    Arguments:
        - owner_id: the current Owner's ID.
        - event_type: the event type - job, shell, session, request, incident.
        - action: the action described by the event, such as create_machine.
        - error: indicates the error included in the event, if one exists.
        - kwargs: extra parameters to be logged.

    """
    def _default(obj):
        return {'_python_object': str(obj)}

    try:
        # Prepare the base event to be logged.
        event = {
            'owner_id': owner_id or None,
            'log_id': uuid.uuid4().hex,
            'action': action,
            'type': event_type,
            'time': time.time(),
            'error': error if error else False,
            'extra': json.dumps(kwargs, default=_default)
        }
        # Bring more key-value pairs to the top level.
        for key in FIELDS:
            if key in kwargs:
                event[key] = kwargs.pop(key)
        if 'story_id' in kwargs:
            event['story_id'] = kwargs.pop('story_id')
        if 'user_id' in event:
            try:
                event['email'] = User.objects.get(id=event['user_id']).email
            except User.DoesNotExist:
                log.error('User %s does not exist', event['user_id'])
        # Associate event with relevant stories.
        for key in ('job_id', 'shell_id', 'session_id', 'incident_id'):
            if key == 'session_id' and event_type != 'session':  # AB Testing.
                continue
            if event.get(key):
                event.update({'story_id': event[key], 'stories': []})
                associate_stories(event)
                break
        else:
            # Special case for closing stories unless an error has been
            # raised, such as PolicyUnauthorizedError.
            # TODO: Can be used to close any type of story, not only
            # incidents.
            if action in ('close_story', ) and not error:
                event['stories'] = [('closes', 'incident', event['story_id'])]
        # Attempt to close open incidents.
        if action in CLOSES_INCIDENT:
            try:
                close_open_incidents(event)
            except Exception as exc:
                log.error('Event %s failed to close open incidents: %s',
                          event['log_id'], exc)
        # Cross populate session-log data.
        try:
            cross_populate_session_data(event, kwargs)
        except Exception as exc:
            log.error('Failed to cross-populate log/session data: %s', exc)
    except Exception as exc:
        log.error('Failed to log event %s: %s', event, exc)
    else:
        # FIXME: Deprecate.
        conn = MongoClient(config.MONGO_URI)
        coll = conn['mist'].logging
        coll.save(event.copy())
        conn.close()
        # Construct the RabbitMQ routing key.
        keys = [str(owner_id), str(event_type), str(action)]
        keys.append('true' if error else 'false')
        routing_key = '.'.join(map(str.lower, keys))
        # Broadcast the event to RabbitMQ's "events" exchange.
        amqp_publish('events', routing_key, event,
                     ex_type='topic', ex_declare=True, auto_delete=False)
        event.pop('extra')
        event.update(kwargs)
    return event
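
# A minimal sketch (not part of the original function) that mirrors the
# routing-key construction above, so it is clear what consumers of the
# "events" topic exchange should bind to. All values are hypothetical.

def _events_routing_key(owner_id, event_type, action, error=None):
    """Build "<owner_id>.<event_type>.<action>.<true|false>", lower-cased."""
    keys = [str(owner_id), str(event_type), str(action)]
    keys.append('true' if error else 'false')
    return '.'.join(map(str.lower, keys))

assert _events_routing_key('OwnerA', 'job', 'create_machine') == \
    'ownera.job.create_machine.false'
# A consumer interested only in failed actions of any type could bind a
# queue to the exchange with the pattern "*.*.*.true".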
def list_machines(schedule_id):
    """Perform list machines. Cloud controller stores results in mongodb."""
    # Fetch schedule and cloud from database.
    # FIXME: resolve circular deps error
    from mist.api.poller.models import ListMachinesPollingSchedule
    sched = ListMachinesPollingSchedule.objects.get(id=schedule_id)
    cloud = sched.cloud
    now = datetime.datetime.now()

    # Check if this cloud should be autodisabled.
    if sched.last_success:
        two_days = datetime.timedelta(days=2)
        if now - sched.last_success > two_days and sched.failure_count > 50:
            autodisable_cloud(sched.cloud)
            return
    elif sched.failure_count > 100:
        autodisable_cloud(sched.cloud)
        return

    # Find the last run. If it is too recent, abort.
    if sched.last_success and sched.last_failure:
        last_run = max(sched.last_success, sched.last_failure)
    else:
        last_run = sched.last_success or sched.last_failure
    if last_run and now - last_run < sched.interval.timedelta:
        log.warning("Running too soon for cloud %s, aborting!", cloud)
        return

    # Is another instance of the same task already running?
    if sched.last_attempt_started:
        # Another instance started recently; abort.
        if now - sched.last_attempt_started < datetime.timedelta(seconds=60):
            log.warning("Other same tasks started recently, aborting.")
            return
        # It has been running for too long or has died. Ignore it.
        log.warning("Other same task seems to have started, but it's been "
                    "quite a while, will ignore and run normally.")
    sched.last_attempt_started = now
    # Persist the bookkeeping on the schedule itself; saving the cloud would
    # not store fields set on `sched`.
    sched.save()

    try:
        # Run list_machines.
        machines = cloud.ctl.compute.list_machines()
    except Exception as exc:
        # Store the failure.
        log.warning("Failed to list_machines for cloud %s: %r", cloud, exc)
        sched.last_failure = datetime.datetime.now()
        sched.failure_count += 1
        sched.last_attempt_started = None
        sched.save()
        raise
    else:
        # Store the success.
        log.info("Succeeded to list_machines for cloud %s", cloud)
        sched.last_success = datetime.datetime.now()
        sched.failure_count = 0
        sched.last_attempt_started = None
        sched.save()

    # Publish results to rabbitmq (for backwards compatibility).
    if amqp_owner_listening(cloud.owner.id):
        amqp_publish_user(cloud.owner.id,
                          routing_key='list_machines',
                          data={'cloud_id': cloud.id,
                                'machines': [machine.as_dict()
                                             for machine in machines]})

    # Push historic information for inventory and cost reporting.
    for machine in machines:
        data = {'owner_id': machine.cloud.owner.id,
                'machine_id': machine.id,
                'cost_per_month': machine.cost.monthly}
        log.info("Will push to elastic: %s", data)
        amqp_publish(exchange='machines_inventory', routing_key='',
                     auto_delete=False, data=data)
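
# A minimal sketch (hypothetical helper, not in the original module) that
# isolates the auto-disable rule applied above: a cloud is disabled after two
# days without a successful run combined with more than 50 failures, or after
# more than 100 failures when it has never succeeded at all.

import datetime

def should_autodisable(last_success, failure_count, now=None,
                       max_age=datetime.timedelta(days=2)):
    now = now or datetime.datetime.now()
    if last_success:
        return now - last_success > max_age and failure_count > 50
    return failure_count > 100

assert should_autodisable(None, failure_count=101) is True
assert should_autodisable(datetime.datetime.now(), failure_count=60) is False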
def list_machines(self, persist=True):
    """Return list of machines for cloud.

    A list of nodes is fetched from libcloud, the data is processed, stored
    on machine models, and a list of machine models is returned.

    Subclasses SHOULD NOT override or extend this method.

    This method wraps `_list_machines` which contains the core
    implementation.

    """
    task_key = 'cloud:list_machines:%s' % self.cloud.id
    task = PeriodicTaskInfo.get_or_add(task_key)
    try:
        with task.task_runner(persist=persist):
            old_machines = {'%s-%s' % (m.id, m.machine_id): m.as_dict()
                            for m in self.list_cached_machines()}
            machines = self._list_machines()
    except PeriodicTaskThresholdExceeded:
        self.cloud.disable()
        raise

    # Initialize AMQP connection to reuse for multiple messages.
    amqp_conn = Connection(config.AMQP_URI)
    if amqp_owner_listening(self.cloud.owner.id):
        if not config.MACHINE_PATCHES:
            amqp_publish_user(self.cloud.owner.id,
                              routing_key='list_machines',
                              connection=amqp_conn,
                              data={'cloud_id': self.cloud.id,
                                    'machines': [machine.as_dict()
                                                 for machine in machines]})
        else:
            # Publish patches to rabbitmq.
            new_machines = {'%s-%s' % (m.id, m.machine_id): m.as_dict()
                            for m in machines}
            # Exclude last seen and probe fields from patch.
            for md in old_machines, new_machines:
                for m in md.values():
                    m.pop('last_seen')
                    m.pop('probe')
            patch = jsonpatch.JsonPatch.from_diff(old_machines,
                                                  new_machines).patch
            if patch:
                amqp_publish_user(self.cloud.owner.id,
                                  routing_key='patch_machines',
                                  connection=amqp_conn,
                                  data={'cloud_id': self.cloud.id,
                                        'patch': patch})

    # Push historic information for inventory and cost reporting.
    for machine in machines:
        data = {'owner_id': self.cloud.owner.id,
                'machine_id': machine.id,
                'cost_per_month': machine.cost.monthly}
        amqp_publish(exchange='machines_inventory', routing_key='',
                     auto_delete=False, data=data, connection=amqp_conn)
    return machines
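
# A minimal sketch of the patching mechanism used above: jsonpatch diffs the
# cached machine dicts against the fresh listing so that only the changes are
# pushed over AMQP instead of the full machine list. The machine dicts below
# are hypothetical.

import jsonpatch

old = {'c1-m1': {'name': 'db1', 'state': 'running'}}
new = {'c1-m1': {'name': 'db1', 'state': 'stopped'},
       'c1-m2': {'name': 'web1', 'state': 'running'}}

patch = jsonpatch.JsonPatch.from_diff(old, new).patch
# Yields RFC 6902 operations, e.g.:
#   [{'op': 'replace', 'path': '/c1-m1/state', 'value': 'stopped'},
#    {'op': 'add', 'path': '/c1-m2',
#     'value': {'name': 'web1', 'state': 'running'}}]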
def log_request_to_elastic(exception):
    """Log a request exception.

    The collected request log data is emitted as a regular event; if a bad
    exception occurred, the traceback is also published to the "exceptions"
    exchange on RabbitMQ (for heka to parse and forward to elastic) and
    written to a file under var/log/exceptions.
    """
    try:
        log_dict = g.log_dict
    except AttributeError:
        return

    # Log the original exception.
    if isinstance(exception, MistError):
        if exception.orig_exc:
            log_dict['_exc'] = repr(exception.orig_exc)
            log_dict['_exc_type'] = type(exception.orig_exc)
            if exception.orig_traceback:
                log_dict['_traceback'] = exception.orig_traceback
    elif isinstance(exception, Exception):
        log_dict['_exc'] = repr(exception)
        log_dict['_exc_type'] = type(exception)
        log_dict['_traceback'] = traceback.format_exc()
    if log_dict:
        log_methods.log_event(**log_dict)

    # If a bad exception didn't occur, return; otherwise log it to file.
    if g.exc_flag is False or exception is None:
        return

    # Publish the traceback to rabbitmq, for heka to parse and forward to
    # elastic.
    log.info('Bad exception occurred, logging to rabbitmq')
    es_dict = log_dict.copy()
    es_dict.pop('_exc_type', '')
    es_dict['time'] = time.time()
    es_dict['traceback'] = es_dict.pop('_traceback', '')
    es_dict['exception'] = es_dict.pop('_exc', '')
    es_dict['type'] = 'exception'
    routing_key = '%s.%s' % (es_dict['owner_id'], es_dict['action'])
    pickler = jsonpickle.pickler.Pickler()
    helpers.amqp_publish('exceptions', routing_key, pickler.flatten(es_dict),
                         ex_type='topic', ex_declare=True, auto_delete=False)

    # Log the bad exception to file.
    log.info('Bad exception occurred, logging to file')
    lines = []
    lines.append('Exception: %s' % log_dict.pop('_exc', ''))
    lines.append('Exception type: %s' % log_dict.pop('_exc_type', ''))
    lines.append('Time: %s' % time.strftime('%Y-%m-%d %H:%M %Z'))
    lines += ['%s: %s' % (key, value)
              for key, value in list(log_dict.items())
              if value and key != '_traceback']
    for key in ('owner', 'user', 'sudoer'):
        id_ = log_dict.get('%s_id' % key)
        if id_:
            try:
                value = Owner.objects.get(id=id_)
                lines.append('%s: %s' % (key, value))
            except Owner.DoesNotExist:
                pass
            except Exception as exc:
                log.error('Error finding user in logged exc: %r', exc)
    lines.append('-' * 10)
    lines.append(log_dict.get('_traceback', ''))
    lines.append('=' * 10)
    msg = '\n'.join(lines) + '\n'

    directory = 'var/log/exceptions'
    if not os.path.exists(directory):
        os.makedirs(directory)
    filename = '%s/%s' % (directory, int(time.time()))
    with open(filename, 'w+') as f:
        f.write(msg)
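
# A minimal sketch of the serialization step above: jsonpickle's Pickler
# flattens values that plain json cannot handle into JSON-safe structures
# before the payload is published. The dict below is hypothetical.

import jsonpickle

pickler = jsonpickle.pickler.Pickler()
es_dict = {'owner_id': 'abc123', 'action': 'create_machine',
           'type': 'exception', 'exception': repr(ValueError('boom')),
           'time': 1700000000.0}
flat = pickler.flatten(es_dict)
# "flat" now contains only JSON-safe primitives and would be published to the
# "exceptions" topic exchange with routing key "abc123.create_machine".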