def run(self):
    """Build the journal mapping for the nodes and record it on the job.

    Reads ``Cluster.node_configuration`` (required) and
    ``TendrlContext.integration_id`` (optional) from ``self.parameters``,
    passes them to ``utils.generate_journal_mapping`` and stores the
    JSON-encoded result in the job's ``output`` under this class's name.
    """
    node_configuration = self.parameters['Cluster.node_configuration']
    integration_id = self.parameters.get("TendrlContext.integration_id")
    journal_mapping = utils.generate_journal_mapping(
        node_configuration,
        integration_id=integration_id
    )

    # Publish the mapping through the job's output dict, keyed by the
    # name of the class this method belongs to.
    output_key = self.__class__.__name__
    current_job = Job(job_id=self.job_id).load()
    current_job.output[output_key] = json.dumps(journal_mapping)
    current_job.save()
def test_save(mock_save):
    """Exercise ``Job.save()`` for several load-result/status combinations.

    ``mock_save`` is not used in the body; presumably it is injected by a
    patch decorator that is outside this view -- TODO confirm.
    """
    job = Job()
    job_payload = maps.NamedDict()
    job_payload['parent'] = "Test Parent Job Id"
    job.payload = job_payload

    # (argument handed to the ``load`` helper, status assigned before save)
    scenarios = (
        (False, "true"),
        (False, "failed"),
        (True, "failed"),
    )
    for load_arg, status in scenarios:
        # ``BaseObject.load`` is patched afresh for every scenario so that
        # each ``save()`` sees the object built by the ``load`` helper.
        with patch.object(objects.BaseObject, 'load') as mock_load:
            mock_load.return_value = load(load_arg)
            job.status = status
            job.save()
def process_job(job):
    """Claim a queued job and run its flow on this node, if eligible.

    Steps, in order:

    1. Return early if the job already has a lock holder or its status is
       "finished" or "processing".
    2. On nodes tagged ``tendrl/monitor``: stamp a ``valid_until`` epoch
       (now + 10 minutes) on jobs that have none, and mark jobs whose
       ``valid_until`` has passed as "failed" while they are still "new".
    3. Load the job; continue only if its payload type equals ``NS.type``,
       its status is "new" and at least one payload tag is among this
       node's tags.
    4. Write the lock info, move the status from "new" to "processing",
       run the flow, then move the status to "finished" or "failed".

    Args:
        job: etcd result for the job; only ``job.key`` is read, and its
            last path segment is used as the job id.

    Returns:
        None.

    Raises:
        FlowExecutionFailedError: if the flow failed and the job status
            could not be moved from "processing" to "failed".
    """
    jid = job.key.split('/')[-1]
    job_status_key = "/queue/%s/status" % jid
    job_lock_key = "/queue/%s/locked_by" % jid
    NS.node_context = NS.node_context.load()

    # Check job not already locked by some agent
    try:
        _locked_by = NS._int.client.read(job_lock_key).value
        if _locked_by:
            return
    except etcd.EtcdKeyNotFound:
        pass

    # Check job not already "finished", or "processing"
    try:
        _status = NS._int.client.read(job_status_key).value
        if _status in ["finished", "processing"]:
            return
    except etcd.EtcdKeyNotFound:
        pass

    # tendrl-node-agent tagged as tendrl/monitor will ensure
    # >10 min old "new" jobs are timed out and marked as
    # "failed" (the parent job of these jobs will also be
    # marked as "failed")
    if "tendrl/monitor" in NS.node_context.tags:
        _job_valid_until_key = "/queue/%s/valid_until" % jid
        _epoch_start = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)
        _valid_until = None
        try:
            _valid_until = NS._int.client.read(_job_valid_until_key).value
        except etcd.EtcdKeyNotFound:
            pass

        if _valid_until:
            _now_epoch = (time_utils.now() - _epoch_start).total_seconds()
            if int(_now_epoch) >= int(_valid_until):
                # Job has "new" status since 10 minutes,
                # mark status as "failed" and Job.error =
                # "Timed out"
                try:
                    NS._int.wclient.write(job_status_key,
                                          "failed",
                                          prevValue="new")
                except etcd.EtcdCompareFailed:
                    # Status is no longer "new"; nothing to time out.
                    pass
                else:
                    job = Job(job_id=jid).load()
                    _msg = str("Timed-out (>10min as 'new')")
                    job.errors = _msg
                    job.save()
                    return
        else:
            # First sighting by the monitor: give the job 10 minutes.
            _now_plus_10 = time_utils.now() + datetime.timedelta(minutes=10)
            # noinspection PyTypeChecker
            _now_plus_10_epoch = (_now_plus_10 - _epoch_start).total_seconds()
            NS._int.wclient.write(_job_valid_until_key,
                                  int(_now_plus_10_epoch))

    job = Job(job_id=jid).load()
    if job.payload["type"] == NS.type and job.status == "new":
        # Job routing
        # Flows created by tendrl-api use 'tags' from flow
        # definition to target jobs
        _tag_match = any(
            flow_tag in NS.node_context.tags
            for flow_tag in (job.payload.get("tags", []) or [])
        )

        if not _tag_match:
            _job_tags = ", ".join(job.payload.get("tags", []))
            _msg = "Node (%s)(type: %s)(tags: %s) will not " \
                   "process job-%s (tags: %s)" % \
                   (NS.node_context.node_id, NS.type,
                    NS.node_context.tags, jid, _job_tags)
            Event(
                Message(priority="info",
                        publisher=NS.publisher_id,
                        payload={"message": _msg}))
            return

        job_status_key = "/queue/%s/status" % job.job_id
        job_lock_key = "/queue/%s/locked_by" % job.job_id
        try:
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             tags=NS.node_context.tags,
                             type=NS.type)
            # NOTE(review): the lock key is written unconditionally before
            # the compare-and-swap on the status key, so a node that loses
            # the race may still overwrite the winner's lock info.
            # Confirm whether the two writes should be swapped.
            NS._int.wclient.write(job_lock_key, json.dumps(lock_info))
            NS._int.wclient.write(job_status_key, "processing",
                                  prevValue="new")
        except etcd.EtcdCompareFailed:
            # job is already being processed by some tendrl
            # agent
            return

        the_flow = None
        try:
            current_ns, flow_name, obj_name = \
                _extract_fqdn(job.payload['run'])

            if obj_name:
                runnable_flow = current_ns.ns.get_obj_flow(obj_name,
                                                           flow_name)
            else:
                runnable_flow = current_ns.ns.get_flow(flow_name)

            the_flow = runnable_flow(parameters=job.payload['parameters'],
                                     job_id=job.job_id)
            Event(
                Message(job_id=job.job_id,
                        flow_id=the_flow.parameters['flow_id'],
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={"message": "Processing Job %s" %
                                            job.job_id}))
            Event(
                Message(job_id=job.job_id,
                        flow_id=the_flow.parameters['flow_id'],
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={
                            "message": "Running Flow %s" %
                                       job.payload['run']
                        }))
            the_flow.run()
            try:
                NS._int.wclient.write(job_status_key,
                                      "finished",
                                      prevValue="processing")
            except etcd.EtcdCompareFailed:
                # This should not happen!
                # Raised inside the outer try, so it is reported by the
                # failure handler below like any other flow error.
                _msg = "Cannot mark job as 'finished', " \
                       "current job status invalid"
                raise FlowExecutionFailedError(_msg)

            Event(
                Message(job_id=job.job_id,
                        flow_id=the_flow.parameters['flow_id'],
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={
                            "message": "Job (%s):  Finished "
                                       "Flow %s" % (job.job_id,
                                                    job.payload['run'])
                        }))
        except (FlowExecutionFailedError,
                AtomExecutionFailedError,
                Exception) as e:
            # format_exc() takes an optional traceback *limit*, not the
            # exception; it always formats the exception being handled.
            _trace = str(traceback.format_exc())
            _msg = "Failure in Job %s Flow %s with error:" % \
                   (job.job_id, job.payload['run'])
            Event(
                ExceptionMessage(priority="error",
                                 publisher=NS.publisher_id,
                                 payload={
                                     "message": _msg + _trace,
                                     "exception": e
                                 }))
            if the_flow:
                Event(
                    Message(job_id=job.job_id,
                            flow_id=the_flow.parameters['flow_id'],
                            priority="error",
                            publisher=NS.publisher_id,
                            payload={"message": _msg + "\n" + _trace}))
            else:
                # The flow object was never built, so no flow_id exists.
                Event(
                    Message(priority="error",
                            publisher=NS.publisher_id,
                            payload={"message": _msg + "\n" + _trace}))

            try:
                NS._int.wclient.write(job_status_key,
                                      "failed",
                                      prevValue="processing")
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'failed', current " \
                       "job status invalid"
                raise FlowExecutionFailedError(_msg)
            else:
                job = job.load()
                job.errors = _trace
                job.save()
def run(self):
    """Generate a brick-to-sub-volume mapping and store it on the job.

    Takes up to ``Volume.brick_count`` free bricks from every node listed
    in ``Cluster.node_configuration``, interleaves them so that
    consecutive bricks come from different hosts, and cuts that list into
    sub-volumes of ``Volume.subvol_size`` bricks each.

    The outcome is JSON-encoded into ``job.output["GenerateBrickMapping"]``
    as ``{"message": ..., "result": ..., "optimal": ...}``. ``optimal`` is
    False when there are fewer hosts than bricks per sub-volume.

    Returns:
        True when a mapping was produced. False when a host has fewer than
        ``brick_count`` free bricks or the total number of bricks is below
        ``subvol_size``; the job output then carries an explanatory
        message and ``[[]]`` as result.

    Raises:
        ValueError: if ``subvol_size`` is 0 (the sub-volume slicing
            cannot advance).
    """
    Event(
        Message(
            priority="info",
            publisher=NS.publisher_id,
            payload={
                "message": "Generating brick mapping for gluster volume"
            },
            job_id=self.parameters["job_id"],
            flow_id=self.parameters["flow_id"],
            cluster_id=NS.tendrl_context.integration_id,
        )
    )
    brick_count = self.parameters.get('Volume.brick_count')
    subvol_size = self.parameters.get('Volume.subvol_size')

    def _store_result(message, result, optimal):
        # Write the outcome back to the job's output.
        job = Job(job_id=self.parameters["job_id"]).load()
        res = {"message": message, "result": result, "optimal": optimal}
        job.output["GenerateBrickMapping"] = json.dumps(res)
        job.save()

    # get brick_count number of bricks from all the selected nodes
    nodes = {}
    for node in self.parameters.get('Cluster.node_configuration'):
        key = "nodes/%s/NodeContext/fqdn" % node
        host = NS._int.client.read(key).value
        nodes[host] = []

    hosts = NS._int.client.read(
        '/clusters/%s/Bricks/free/' % NS.tendrl_context.integration_id
    )
    for host_entry in hosts.leaves:
        host = host_entry.key.split("/")[-1]
        if host not in nodes:
            # Not a selected node: its bricks are never used, so skip
            # the extra etcd read.
            continue
        bricks = NS._int.client.read(
            '/clusters/%s/Bricks/free/%s' % (
                NS.tendrl_context.integration_id,
                host
            )
        )
        for brick_entry in bricks.leaves:
            if len(nodes[host]) >= brick_count:
                break
            nodes[host].append(brick_entry.key.split("/")[-1])

    # Form a brick list such that when you fill sub volumes with
    # bricks from this list, it should honour the failure domains:
    # slot (node_count * i + n) holds the i-th brick of the n-th host.
    node_count = len(nodes)
    brick_list = [""] * (node_count * brick_count)
    for counter, (host, host_bricks) in enumerate(nodes.items()):
        if len(host_bricks) < brick_count:
            # Adjacent literals are joined before '%' is applied, so all
            # three values are formatted into the complete message.
            message = (
                "Host %s has %s bricks which is less than"
                " bricks per host %s" % (
                    host,
                    len(host_bricks),
                    brick_count
                )
            )
            _store_result(message, [[]], False)
            return False
        for i in range(brick_count):
            brick_list[node_count * i + counter] = host_bricks[i]

    # Check if total number of bricks available is less than the
    # sub volume size. If its less, then return accordingly
    if len(brick_list) < subvol_size:
        message = "Total bricks available %s less than subvol_size %s" % (
            len(brick_list),
            subvol_size
        )
        _store_result(message, [[]], False)
        return False

    # Fill the result list with as many complete sub volumes as the
    # brick list allows; leftover bricks that cannot fill a whole sub
    # volume are dropped.
    result = [
        brick_list[start:start + subvol_size]
        for start in range(0, len(brick_list) - subvol_size + 1,
                           subvol_size)
    ]

    # check if the mapping provided is optimal as per expected
    # failure domain or not
    optimal = node_count >= subvol_size

    # Write the result back to the job
    _store_result("", result, optimal)
    return True