def check_orphaned():
    """
    Machines created in providers might be in an error state, or some
    configuration in between may have prevented them from joining Jenkins (or
    they were manually removed). This task will go through the nodes it knows
    about, make sure they exist in Jenkins, and if not, remove them from the
    mita database and the provider.
    """
    conn = connections.jenkins_connection()
    try:
        nodes = models.Node.query.all()
    except InvalidRequestError:
        logger.exception('could not list nodes')
        models.rollback()
        # we can try again at the next scheduled task run
        return
    for node in nodes:
        # it is all good if this node exists in Jenkins. That is the whole
        # reason for its miserable existence, to work for Mr. Jenkins. Let it
        # be.
        if node.jenkins_name:
            if conn.node_exists(node.jenkins_name):
                continue
        # So this node is not in Jenkins. If it is less than 15 minutes old
        # then don't do anything, because it might just be taking a while to
        # join.
        # ALERT MR ROBINSON: 15 minutes is a magical number.
        now = datetime.utcnow()
        difference = now - node.created
        if difference.seconds > 900:  # magical number alert
            logger.info("found created node that didn't join Jenkins: %s", node)
            provider = providers.get(node.provider)
            # "We often miss opportunity because it's dressed in overalls and
            # looks like work". Node missed his opportunity here.
            try:
                provider.destroy_node(name=node.cloud_name)
            except CloudNodeNotFound:
                logger.info("node was not found on provider: %s", node.cloud_name)
                logger.info(
                    "will remove node from database, API confirms it no longer exists"
                )
                node.delete()
                models.commit()
            except Exception:
                logger.exception("unable to destroy node: %s", node.cloud_name)
                logger.error("will skip database removal")
                continue

    # providers can purge nodes in error state too, try to prune those as well
    providers_conf = pecan.conf.provider.to_dict()
    for provider_name in providers_conf.keys():
        provider = providers.get(provider_name)
        provider.purge()
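A periodic task like check_orphaned only helps if something actually schedules it. The apply_async usage elsewhere in this code suggests Celery; below is a minimal sketch of wiring it into Celery beat. The app name, task path, and 20-minute cadence are assumptions, not confirmed by this code:

from celery import Celery

app = Celery('mita')

# run the orphan check every 20 minutes; the task path is hypothetical
app.conf.beat_schedule = {
    'check-orphaned': {
        'task': 'mita.tasks.check_orphaned',
        'schedule': 1200.0,  # seconds between runs
    },
}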
def check_orphaned():
    """
    Machines created in providers might be in an error state, or some
    configuration in between may have prevented them from joining Jenkins (or
    they were manually removed). This task will go through the nodes it knows
    about, make sure they exist in Jenkins, and if not, remove them from the
    mita database and the provider.
    """
    conn = connections.jenkins_connection()
    nodes = models.Node.query.all()
    for node in nodes:
        # it is all good if this node exists in Jenkins. That is the whole
        # reason for its miserable existence, to work for Mr. Jenkins. Let it
        # be.
        if node.jenkins_name:
            if conn.node_exists(node.jenkins_name):
                continue
        # So this node is not in Jenkins. If it is less than 15 minutes old
        # then don't do anything, because it might just be taking a while to
        # join.
        # ALERT MR ROBINSON: 15 minutes is a magical number.
        now = datetime.utcnow()
        difference = now - node.created
        if difference.seconds > 900:  # magical number alert
            logger.info("found created node that didn't join Jenkins: %s", node)
            provider = providers.get(node.provider)
            # "We often miss opportunity because it's dressed in overalls and
            # looks like work". Node missed his opportunity here.
            try:
                provider.destroy_node(name=node.cloud_name)
            except Exception:
                logger.exception("unable to destroy node: %s", node.cloud_name)
                # keep the database record around so this node is retried on
                # the next run instead of silently leaking at the provider
                continue
            logger.info("removed useless node from provider and database: %s", node)
            node.delete()
            models.commit()
def delete(self):
    if request.method != 'POST':
        abort(405)
    if not self.node:
        abort(404)
    # XXX we need validation here
    # XXX WE REALLY NEED VALIDATION HERE
    try:
        delay = request.json.get('delay', 0)
    # simplejson.decoder.JSONDecodeError inherits from ValueError, which is
    # the same error the builtin Python json handler will raise when no JSON
    # is passed in.
    except ValueError:
        delay = 0
    if delay:
        delete_node.apply_async(
            (self.node.id,),
            countdown=delay)
    else:
        delete_provider_node(
            providers.get(self.node.provider),
            self.node.cloud_name
        )
        delete_jenkins_node(self.node.jenkins_name)
        self.node.delete()
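For reference, a hedged sketch of how a client might exercise the optional delay. The host and route are assumptions; only the JSON body shape comes from the request.json.get('delay', 0) call above:

import requests

# the URL is hypothetical; a truthy delay defers deletion to the async task
requests.post(
    'http://localhost:8080/api/nodes/centos6__some-uuid/delete/',
    json={'delay': 30},  # seconds before delete_node runs via countdown
)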
def idle(self):
    """
    Perform a check on the status of the current node, verifying how long it
    has been idle (if at all), marking the current timestamp since idle, and
    determining if the node needs to be terminated.
    """
    if not self.node:
        abort(404, 'could not find UUID: %s' % self.identifier)
    provider_for_node = conf.nodes[self.node.name]['provider']
    provider = providers.get(provider_for_node)
    if request.method != 'POST':
        abort(405)
    now = datetime.utcnow()
    if self.node.idle:
        # it was idle before, so check how many seconds since it was lazy.
        # `idle` is a property that will only be true-ish if idle_since has
        # been set.
        difference = now - self.node.idle_since
        if difference.seconds > 1200:  # 20 minutes
            # we need to terminate this couch potato
            logger.info("Destroying node: %s" % self.node.cloud_name)
            try:
                provider.destroy_node(name=self.node.cloud_name)
            except CloudNodeNotFound:
                logger.info("node does not exist in cloud provider")
            conn = jenkins_connection()
            if conn.node_exists(self.node.jenkins_name):
                logger.info("Deleting node in jenkins: %s" % self.node.jenkins_name)
                conn.delete_node(self.node.jenkins_name)
            # delete from our database
            self.node.delete()
    else:
        # mark it as being idle
        self.node.idle_since = now
def idle(self):
    """
    Perform a check on the status of the current node, verifying how long it
    has been idle (if at all), marking the current timestamp since idle, and
    determining if the node needs to be terminated.
    """
    if not self.node:
        abort(404, 'could not find UUID: %s' % self.identifier)
    provider_for_node = conf.nodes[self.node.name]['provider']
    provider = providers.get(provider_for_node)
    if request.method != 'POST':
        abort(405)
    now = datetime.utcnow()
    if self.node.idle:
        # it was idle before, so check how many seconds since it was lazy.
        # `idle` is a property that will only be true-ish if idle_since has
        # been set.
        difference = now - self.node.idle_since
        if difference.seconds > 600:  # 10 minutes
            # we need to terminate this couch potato
            logger.info("Destroying node: %s" % self.node.cloud_name)
            try:
                provider.destroy_node(name=self.node.cloud_name)
            except CloudNodeNotFound:
                logger.info("node does not exist in cloud provider")
            conn = jenkins_connection()
            if conn.node_exists(self.node.jenkins_name):
                logger.info("Deleting node in jenkins: %s" % self.node.jenkins_name)
                conn.delete_node(self.node.jenkins_name)
            # delete from our database
            self.node.delete()
    else:
        # mark it as being idle
        self.node.idle_since = now
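The comment in both idle variants leans on `idle` being "a property that will only be true-ish if idle_since has been set". A self-contained sketch of that idea; the class name is a stand-in for the real Node model, not taken from this code:

from datetime import datetime

class NodeSketch(object):
    """Illustrative stand-in for the real Node model."""
    def __init__(self):
        self.idle_since = None  # set to a datetime on the first idle POST

    @property
    def idle(self):
        # truthy only once idle_since has been recorded
        return self.idle_since is not None

node = NodeSketch()
assert not node.idle
node.idle_since = datetime.utcnow()
assert node.idle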
def status(self, **kw):
    # since this is a read-only request via GET we need to ask for query
    # args to determine the right node, because the name alone is not good
    # enough (we might have more than one node named 'centos6', for example).
    provider = providers.get(request.json['provider'])
    status = provider.node_status(self.node_name, **kw)
    state = NodeState[status]
    state_int = NodeState[state]
    return {'status': state, 'status_int': state_int}
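status indexes NodeState twice, first with the provider's raw status and then with the resulting value, which implies a mapping keyed both by integer state and by state name. A minimal sketch of such a two-way dict; the specific states mirror libcloud's conventions but are assumptions here:

# forward entries: integer state -> name (values are illustrative)
NodeState = {
    0: 'running',
    1: 'rebooting',
    2: 'terminated',
    3: 'pending',
    4: 'unknown',
}
# add reverse entries so a name maps back to its integer
NodeState.update({name: num for num, name in list(NodeState.items())})

assert NodeState[0] == 'running'
assert NodeState['running'] == 0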
def delete_node(node_id):
    node = models.Node.get(node_id)
    if not node:
        logger.warning('async node deletion could not be completed')
        logger.warning('%s node id no longer exists', node_id)
        return
    util.delete_provider_node(providers.get(node.provider), node.cloud_name)
    util.delete_jenkins_node(node.jenkins_name)
    node.delete()
    models.commit()
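util.delete_provider_node is not shown here; based on how destroy_node and CloudNodeNotFound are used elsewhere in this code, it plausibly looks something like the sketch below. The body is an assumption, not the actual util implementation:

import logging

logger = logging.getLogger(__name__)

class CloudNodeNotFound(Exception):
    """Stand-in for the not-found exception raised by the providers."""

def delete_provider_node(provider, cloud_name):
    # destroy the node at the cloud provider, tolerating the case where it
    # is already gone
    try:
        provider.destroy_node(name=cloud_name)
    except CloudNodeNotFound:
        logger.info('node no longer exists on provider: %s', cloud_name)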
def index(self):
    provider = providers.get(request.json['provider'])
    # request.json is read-only; since we are going to add extra metadata
    # to get the classes created, make a clean copy
    _json = deepcopy(request.json)
    # Before creating a node, check if it has already been created by us:
    name = _json['name']
    keyname = _json['keyname']
    image_name = _json['image_name']
    size = _json['size']
    labels = _json['labels']
    script = _json['script']
    existing_nodes = Node.filter_by(
        name=name,
        keyname=keyname,
        image_name=image_name,
        size=size,
    ).all()
    matching_nodes = [n for n in existing_nodes if n.labels_match(labels)]
    if not matching_nodes:
        # nothing that matches this has ever been created
        logger.info('requested node does not exist, will create one')
        # slap the UUID into the new node details
        _id = str(uuid.uuid4())
        logger.info('changing name: %s' % _json['name'])
        _json['name'] = "%s__%s" % (name, _id)
        logger.info('changed name into something else: %s' % _json['name'])
        # try to slap it into the script; it is not OK if we are not allowed
        # to, so assume we should
        try:
            _json['script'] = script % _id
        except TypeError:
            logger.error('attempted to add a UUID to the script but failed')
            logger.error('ensure that a formatting entry exists, like: %%s')
            return  # do not add anything if we haven't been able to format
        logger.warning('creating node with details: %s' % str(_json))
        provider.create_node(**_json)
        _json.pop('name')
        Node(
            name=request.json['name'],
            identifier=_id,
            **_json
        )
def delete(self):
    if request.method != 'POST':
        abort(405)
    if not self.node:
        abort(404)
    provider = providers.get(request.json['provider'])
    try:
        # destroy from the cloud provider
        destroyed = provider.destroy_node(**request.json)
        if not destroyed:
            # FIXME: this needs to return a proper response, not just a 500
            abort(500)
        # delete from the database
        self.node.delete()
    # FIXME: catch the exception from libcloud here
    except Exception:
        # find a way to REALLY sound the alarm, because if we can't delete
        # it means the user is going to pay for a resource that should no
        # longer exist
        abort(500)
def check_orphaned():
    """
    Machines created in providers might be in an error state, or some
    configuration in between may have prevented them from joining Jenkins (or
    they were manually removed). This task will go through the nodes it knows
    about, make sure they exist in Jenkins, and if not, remove them from the
    mita database and the provider.
    """
    conn = connections.jenkins_connection()
    nodes = models.Node.query.all()
    for node in nodes:
        # it is all good if this node exists in Jenkins. That is the whole
        # reason for its miserable existence, to work for Mr. Jenkins. Let it
        # be.
        if node.jenkins_name:
            if conn.node_exists(node.jenkins_name):
                continue
        # So this node is not in Jenkins. If it is less than 15 minutes old
        # then don't do anything, because it might just be taking a while to
        # join.
        # ALERT MR ROBINSON: 15 minutes is a magical number.
        now = datetime.utcnow()
        difference = now - node.created
        if difference.seconds > 900:  # magical number alert
            logger.info("found created node that didn't join Jenkins: %s", node)
            provider = providers.get(node.provider)
            # "We often miss opportunity because it's dressed in overalls and
            # looks like work". Node missed his opportunity here.
            try:
                provider.destroy_node(name=node.cloud_name)
            except Exception:
                logger.exception("unable to destroy node: %s", node.cloud_name)
                logger.error("will skip database removal")
                # move on to the next node instead of aborting the whole run
                continue
            logger.info("removed useless node from provider and database: %s", node)
            node.delete()
            models.commit()
def index(self):
    provider = providers.get(request.json['provider'])
    # request.json is read-only; since we are going to add extra metadata
    # to get the classes created, make a clean copy
    _json = deepcopy(request.json)
    # Before creating a node, check if it has already been created by us:
    name = _json['name']
    keyname = _json['keyname']
    image_name = _json['image_name']
    size = _json['size']
    labels = _json['labels']
    script = _json['script']
    count = _json.get('count', 1)
    # a buffered count is 3/4 of what is needed, rounded to the nearest integer
    buffered_count = int(round(count * 0.75))
    existing_nodes = Node.filter_by(
        name=name,
        keyname=keyname,
        image_name=image_name,
        size=size,
    ).all()
    # try to slap a UUID into the script; it is not OK if we are not allowed
    # to, so assume we should. This is just a validation step that should be
    # taken care of by proper schema validation.
    try:
        script % '0000-aaaaa'
    except TypeError:
        logger.error('attempted to add a UUID to the script but failed')
        logger.error(
            'ensure that a formatting entry for %s["script"] exists, like: %%s' % name
        )
        return  # do not add anything if we haven't been able to format
    logger.info('checking if an existing node matches required labels: %s', str(labels))
    matching_nodes = [n for n in existing_nodes if n.labels_match(labels)]
    if not matching_nodes:
        # nothing that matches this has ever been created
        logger.info('job needs %s nodes to get unstuck', count)
        logger.info(
            'no matching nodes were found, will create new ones. count: %s',
            buffered_count
        )
        for i in range(buffered_count):
            # slap the UUID into the new node details
            node_kwargs = deepcopy(request.json)
            _id = str(uuid.uuid4())
            node_kwargs['name'] = "%s__%s" % (name, _id)
            node_kwargs['script'] = script % _id
            provider.create_node(**node_kwargs)
            node_kwargs.pop('name')
            Node(
                name=name,
                identifier=_id,
                **node_kwargs
            )
            models.commit()
    else:
        logger.info('found existing nodes that match labels: %s', len(matching_nodes))
        now = datetime.utcnow()
        # we have something that matches; go over all of them and check if
        # *all of them* are over 6 (by default) minutes since creation. That
        # means they are probably busy, so create a new one.
        already_created_nodes = 0
        for n in matching_nodes:
            difference = now - n.created
            if difference.seconds < 360:  # 6 minutes
                already_created_nodes += 1
        if already_created_nodes > count:
            logger.info('job needs %s nodes to get unstuck', count)
            logger.info(
                'but there are %s node(s) created within the last 6 minutes',
                already_created_nodes
            )
            logger.info('will not create one')
            return
        logger.info('job needs %s nodes to get unstuck', count)
        logger.info(
            'no nodes created recently enough, will create new ones. count: %s',
            buffered_count
        )
        for i in range(buffered_count):
            # slap the UUID into the new node details
            node_kwargs = deepcopy(request.json)
            _id = str(uuid.uuid4())
            node_kwargs['name'] = "%s__%s" % (name, _id)
            node_kwargs['script'] = script % _id
            provider.create_node(**node_kwargs)
            node_kwargs.pop('name')
            Node(
                name=name,
                identifier=_id,
                **node_kwargs
            )
            models.commit()
def index(self):
    provider = providers.get(request.json['provider'])
    # request.json is read-only; since we are going to add extra metadata
    # to get the classes created, make a clean copy
    _json = deepcopy(request.json)
    # Before creating a node, check if it has already been created by us:
    name = _json['name']
    keyname = _json['keyname']
    image_name = _json['image_name']
    size = _json['size']
    labels = _json['labels']
    script = _json['script']
    count = _json.get('count', 1)
    # a buffered count is 3/4 of what is needed, rounded to the nearest integer
    buffered_count = int(round(count * 0.75))
    existing_nodes = Node.filter_by(
        name=name,
        keyname=keyname,
        image_name=image_name,
        size=size,
    ).all()
    # try to slap a UUID into the script; it is not OK if we are not allowed
    # to, so assume we should. This is just a validation step that should be
    # taken care of by proper schema validation.
    try:
        script % '0000-aaaaa'
    except TypeError:
        logger.error('attempted to add a UUID to the script but failed')
        logger.error(
            'ensure that a formatting entry for %s["script"] exists, like: %%s' % name
        )
        return  # do not add anything if we haven't been able to format
    logger.info('checking if an existing node matches required labels: %s', str(labels))
    matching_nodes = [n for n in existing_nodes if n.labels_match(labels)]
    if not matching_nodes:
        # nothing that matches this has ever been created
        logger.info('job needs %s nodes to get unstuck', count)
        logger.info(
            'no matching nodes were found, will create new ones. count: %s',
            buffered_count
        )
        for i in range(buffered_count):
            # slap the UUID into the new node details
            node_kwargs = deepcopy(request.json)
            _id = str(uuid.uuid4())
            node_kwargs['name'] = "%s__%s" % (name, _id)
            node_kwargs['script'] = script % _id
            provider.create_node(**node_kwargs)
            node_kwargs.pop('name')
            Node(
                name=name,
                identifier=_id,
                **node_kwargs
            )
            models.commit()
    else:
        logger.info('found existing nodes that match labels: %s', len(matching_nodes))
        now = datetime.utcnow()
        # we have something that matches; go over all of them and check if
        # *all of them* are over 6 (by default) minutes since creation. That
        # means they are probably busy, so create a new one.
        already_created_nodes = 0
        for n in matching_nodes:
            difference = now - n.created
            if difference.seconds < 360:  # 6 minutes
                already_created_nodes += 1
        if already_created_nodes >= count:
            logger.info('job needs %s nodes to get unstuck', count)
            logger.info(
                'but there are %s node(s) created within the last 6 minutes',
                already_created_nodes
            )
            logger.info('will not create one')
            return
        logger.info('job needs %s nodes to get unstuck', count)
        logger.info(
            'no nodes created recently enough, will create new ones. count: %s',
            buffered_count
        )
        for i in range(buffered_count):
            # slap the UUID into the new node details
            node_kwargs = deepcopy(request.json)
            _id = str(uuid.uuid4())
            node_kwargs['name'] = "%s__%s" % (name, _id)
            node_kwargs['script'] = script % _id
            provider.create_node(**node_kwargs)
            node_kwargs.pop('name')
            Node(
                name=name,
                identifier=_id,
                **node_kwargs
            )
            models.commit()
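An example of the kind of JSON body these index controllers expect, pieced together from the keys they read; every value is illustrative, not taken from a real deployment. Note the %s in script, which is where `script % _id` injects the generated UUID:

payload = {
    'provider': 'openstack',          # looked up via providers.get()
    'name': 'centos6',
    'keyname': 'jenkins-build',
    'image_name': 'centos-6.5',
    'size': 'm1.medium',
    'labels': ['centos6', 'x86_64'],
    'script': '#!/bin/bash\necho %s > /etc/mita_identifier',
    'count': 2,                       # optional, defaults to 1
}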
def index(self):
    provider = providers.get(request.json['provider'])
    # request.json is read-only; since we are going to add extra metadata
    # to get the classes created, make a clean copy
    _json = deepcopy(request.json)
    # Before creating a node, check if it has already been created by us:
    name = _json['name']
    keyname = _json['keyname']
    image_name = _json['image_name']
    size = _json['size']
    labels = _json['labels']
    script = _json['script']
    existing_nodes = Node.filter_by(
        name=name,
        keyname=keyname,
        image_name=image_name,
        size=size,
    ).all()
    # slap the UUID into the new node details
    _id = str(uuid.uuid4())
    _json['name'] = "%s__%s" % (name, _id)
    # try to slap it into the script; it is not OK if we are not allowed to,
    # so assume we should
    try:
        _json['script'] = script % _id
    except TypeError:
        logger.error('attempted to add a UUID to the script but failed')
        logger.error(
            'ensure that a formatting entry for %s["script"] exists, like: %%s' % name
        )
        return  # do not add anything if we haven't been able to format
    logger.info('checking if an existing node matches required labels: %s', str(labels))
    matching_nodes = [n for n in existing_nodes if n.labels_match(labels)]
    if not matching_nodes:
        # nothing that matches this has ever been created
        logger.info('no matching nodes were found, will create one')
        logger.warning('creating node with details: %s' % str(_json))
        provider.create_node(**_json)
        _json.pop('name')
        Node(
            name=request.json['name'],
            identifier=_id,
            **_json
        )
    else:
        logger.info('found existing nodes that match labels: %s', len(matching_nodes))
        now = datetime.utcnow()
        # we have something that matches; go over all of them and check if
        # *all of them* are over 8 (by default) minutes since creation. That
        # means they are probably busy, so create a new one.
        for n in matching_nodes:
            difference = now - n.created
            if difference.seconds < 480:  # 8 minutes
                logger.info(
                    'a matching node was already created in the past 8 minutes: %s',
                    n.name
                )
                logger.info('will not create one')
                return
        # FIXME: need to check with the cloud provider and see if this node
        # is running, otherwise it means this node is dead and should be
        # removed from the DB
        logger.info('no nodes created recently, will create a new one')
        provider.create_node(**_json)
        _json.pop('name')
        Node(
            name=request.json['name'],
            identifier=_id,
            **_json
        )
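All of the index variants filter with `n.labels_match(labels)`. A minimal sketch of what that check could look like, assuming labels are stored as a list of strings on the node (the storage format and class name are assumptions):

class LabeledNode(object):
    """Hypothetical stand-in for the real Node model."""
    def __init__(self, labels):
        self.labels = labels

    def labels_match(self, labels):
        # every requested label must be present on this node
        return all(label in self.labels for label in labels)

node = LabeledNode(['centos6', 'x86_64'])
assert node.labels_match(['centos6'])
assert not node.labels_match(['wheezy'])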