def _on_cloud_journal_event(self, ev):
    #logging.debug('before journal event %s' % ev)

    with self.lock:
        tag = self.inmem_items.get(ev.tag_name)

        if not tag:
            logging.warning('no object in inmem_items for cloud tag %s' % ev.tag_name)
            return

        if not tag.IsCloud(): # it's like assert
            logging.error('tag %s is not cloud tag in inmem_items but receives event from cloud' % ev.tag_name)
            return

        if tag.version >= ev.version:
            # TODO warn even on equal versions, but not for initial _subscribe_all
            if tag.version > ev.version:
                logging.warning('local version (%d) > journal version (%d) for tag %s' \
                    % (tag.version, ev.version, ev.tag_name))
            return

        def add_event(event, version, msg=None):
            self._repr_modifier.add((tag, event, msg, version))

        # FIXME here with warning, on state sync without it
        if ev.version > ev.last_reset_version and tag.version < ev.last_reset_version:
            logging.debug('overtaking reset %s.%d.%d for %d' % (
                ev.tag_name, ev.version, ev.last_reset_version, tag.version))
            add_event(ETagEvent.Reset, ev.last_reset_version, ev.last_reset_comment) # TODO last_reset_comment is wrong

        add_event(ev.event, ev.version, ev.last_reset_comment if ev.event == ETagEvent.Reset else None)

        logging.debug('after journal event for %s' % ev.tag_name)
def _RawTag(self, tagname, dont_create=False):
    if not tagname:
        raise ValueError("Empty tag name")

    tag = self.inmem_items.get(tagname, None)
    if tag:
        return tag

    if not self.db_file_opened:
        self.DBConnect()

    tagDescr = self.infile_items.get(tagname, None)

    if tagDescr:
        tag = cPickle.loads(tagDescr)

        if tag.IsCloud():
            if not self._has_cloud_setup():
                logging.error("Tag %s is cloud on disk storage, but no setup for" \
                    " cloud in config. Restart server with proper setup!" % tagname)
        elif not tag.IsRemote(): # Hack for disable_remote_tags
            if self._is_cloud_tag_name(tag.GetFullname()):
                logging.error("Tag %s is not cloud on disk storage, but must be." \
                    " Convert tags in disk storage!" % tagname)

        self._set_modify_func(tag)
    elif dont_create:
        return None
    else:
        tag = self._create_tag(tagname)

    return tag
def cleanup_directory(directory, to_keep, max_removed_items_to_output=100):
    removed = []

    files = os.listdir(directory)

    for basename in files:
        if basename in to_keep:
            continue

        filename = directory + '/' + basename

        remove = shutil.rmtree \
            if os.path.isdir(filename) and not os.path.islink(filename) \
            else os.unlink

        try:
            remove(filename)
        except Exception as e:
            logging.error("Can't remove %s: %s" % (filename, e))
        else:
            removed.append(basename)

    if removed:
        logging.info('%d files removed from %s: %s' \
            % (len(removed), directory, ', '.join(removed[:max_removed_items_to_output])))
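# A minimal usage sketch of cleanup_directory(): the directory path and file
# names below are illustrative assumptions, not taken from the original source.
#
#   cleanup_directory('/var/rem/packets/pck-12345', to_keep={'state', 'io'})
#
# Everything in the directory except the names in `to_keep` is removed;
# failures are logged per entry and do not abort the loop.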
def RegisterTagEventForClient(self, clientname, tagname, event, message=None):
    logging.debug("%s remote tag %s on host %s", TagEventName[event], tagname, clientname)

    client = self.topologyInfo.GetClient(clientname, checkname=False)
    if client is None:
        logging.error("unknown client %s appeared", clientname)
        return False

    client.RegisterTagEvent("%s:%s" % (self.network_name, tagname), event, message)
def __reset_requests(self):
    rest, self._incoming.queue = self._incoming.queue, deque()

    for sock, _ in rest:
        try:
            _socket_send_reset(sock)
        except Exception as e:
            logging.error("Failed to send RST to RPC client: %s" % e)
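# _socket_send_reset() is not shown here. A common way to force a TCP RST on
# close is to enable SO_LINGER with a zero linger timeout before closing the
# socket; the sketch below assumes that technique (the helper's real
# implementation in this codebase may differ).

import socket
import struct

def _socket_send_reset(sock):
    # l_onoff=1, l_linger=0: close() aborts the connection with RST instead of FIN
    sock.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, struct.pack('ii', 1, 0))
    sock.close()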
def __put_request(self):
    try:
        request = self.get_request()
    except socket.error:
        logging.error("XMLRPCServer: socket error")
        return

    self._incoming.put(request)
def handle_request(self, timings):
    try:
        request = self.get_request()
    except socket.error:
        logging.error("XMLRPCServer: socket error")
        return

    timings.append(time.time())
    self._timings[id(request[0])] = timings

    self.requests.put(request)
def func(*args, **kwargs):
    penalty = 0.01
    _tries = tries
    while _tries:
        try:
            return f(*args, **kwargs)
        except tuple(exception_list) as e:
            time.sleep(penalty)
            penalty = min(penalty * penalty_factor, 5)
            _tries -= 1
            logging.error('Exception in %s, exception message: %s, attempts left: %s',
                          f.func_name, e.message, _tries)
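# The wrapper above closes over `f`, `tries`, `penalty_factor` and
# `exception_list`, so it is presumably the inner function of a retry-decorator
# factory along these lines (the name `retriable` and the argument defaults are
# assumptions, not taken from the original source):
#
#   def retriable(tries, penalty_factor, exception_list):
#       def decorator(f):
#           def func(*args, **kwargs):
#               ...  # the retry loop with exponential back-off shown above
#           return func
#       return decorator
#
# so that a flaky call could be wrapped as, e.g.:
#
#   @retriable(3, 2, [socket.error])
#   def send_update(data):
#       ...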
def _Communicate(self, f):
    self.Connect()
    try:
        f()
        self.errorsCnt = 0
        logging.debug("SendData to %s: ok", self.name)
    except (IOError, socket.timeout) as e:
        logging.warning("SendData to %s: failed: %s", self.name, e)
        self.lastError = e
        self.errorsCnt += 1
    except Exception as e:
        logging.error("SendData to %s: failed: %s", self.name, e)
def mark_as_too_old(self):
    with self.lock:
        if self.state not in ImplState.LimitedLifetimeSuspendedStates:
            raise NonTooOldMarkableStateError("mark_as_too_old called for %s" % self)

        if self.is_too_old:
            logging.error("mark_as_too_old called for already old %s" % self)
            return

        self._send_becomes_too_old_notification() # before _update_state

        self.is_too_old = True
        self.do_not_run = False # FIXME Clear or not? (change _check_add_files otherwise)

        self._update_state()
def _masks_reload_loop(self):
    while True:
        if self._masks_should_stop.wait(self._cloud_tags_masks_reload_interval):
            return

        try:
            match = self._load_masks()
        except Exception as e:
            logging.error("Failed to reload tags' masks from: %s" % e)
            continue

        if self._match_cloud_tag.count and not match.count:
            logging.warning("New cloud tags masks discarded: old count %d, new count %d" % (
                self._match_cloud_tag.count, match.count))
            continue

        logging.debug("Cloud tag's masks reloaded. Regexp count: %d" % match.count)

        self._match_cloud_tag = match
def _fetch_resource_list(self, pck):
    try:
        ans = pck._sandbox.list_task_resources(pck._sandbox_task_id)
    except:
        logging.exception('Failed to fetch task resources for %s' % pck)

        with pck._lock:
            if pck._target_stop_mode != StopMode.CANCEL:
                self._schedule(
                    pck,
                    self._start_fetch_resource_list,
                    self._SANDBOX_TASK_CREATION_RETRY_INTERVAL)
        return

    # TODO We don't have to _fetch_resource_list() in any TERMINATED task
    # state (e.g. ERROR, EXCEPTION)

    #import json
    #logging.debug('task #%s resources list answer: %s' % (pck._sandbox_task_id, json.dumps(ans, indent=3)))

    res_by_type = {
        resource['type']: resource
        for resource in ans['items']
    }

    #logging.debug('task #%s res_by_type: %s' % (pck._sandbox_task_id, json.dumps(res_by_type, indent=3)))

    with pck._lock:
        resource = res_by_type.get('REM_JOBPACKET_EXECUTION_SNAPSHOT')
        if not resource:
            logging.error("No REM_JOBPACKET_EXECUTION_SNAPSHOT resource in %s" % pck)
            err = "No REM_JOBPACKET_EXECUTION_SNAPSHOT resource"
            pck.set_error(err, False)
            self._mark_as_finished(pck, err)
            return

        pck._result_snapshot_resource_id = resource['id']

        if pck._final_state is None:
            pck._final_update_url = res_by_type['REM_JOBPACKET_GRAPH_UPDATE']['http']['proxy']
            pck._set_state(RemotePacketState.FETCHING_FINAL_UPDATE)
        else:
            self._mark_as_finished(pck, '_fetch_resource_list')
            return

    self._fetch_final_update(pck) # not under lock
def _is_cloud_tag_name(self, name):
    if self.IsRemoteTagName(name):
        return False

    try:
        if self._tags_random_cloudiness:
            return hash(name) % 3 == 0

        if self._all_tags_in_cloud:
            return True

        return self._match_cloud_tag(name)
    except Exception as e:
        now = time.time()
        if now - self._last_tag_mask_error_report_time > 5:
            logging.error("Failed to match tag masks: %s" % e)
            self._last_tag_mask_error_report_time = now
        return False
def _write(self, data):
    timeout = 1.0
    max_timeout = 15.0

    while True:
        with self._db_lock:
            try:
                if not self._db:
                    self._reopen()

                self._db.write(data)
                self._db.sync()
            except Exception as err:
                self._db = None
                logging.error("Can't write to journal (%d items left): %s" \
                    % (len(self._queue), err))
            else:
                break

        timeout = min(max_timeout, timeout * 2)
        time.sleep(timeout)
def convert_in_memory_tags_to_cloud_if_need(self):
    if not self._has_cloud_setup():
        return False

    updates = []

    for tag_name, tag in self.inmem_items.iteritems():
        must_be_cloud = self._is_cloud_tag_name(tag_name) \
            and not tag.IsRemote() # Hack for disable_remote_tags

        if must_be_cloud == tag.IsCloud():
            continue

        elif must_be_cloud:
            if tag.IsLocallySet():
                updates.append((tag_name, ETagEvent.Set))

            self._make_tag_cloud(tag)
        else:
            logging.error("Tag %s is cloud, but must not be" % tag_name)

    if not updates:
        return False

    logging.info("before conversion %d tags to CloudTag's" % len(updates))

    cloud = self._create_cloud_client(lambda ev: None)

    try:
        for bucket in split_in_groups(updates, 100000): # TODO Fix cloud_client.update
            cloud.update(bucket).get()
    finally:
        try:
            cloud.stop()
        except:
            logging.exception("Failed to stop temporary cloud client")

    logging.info("after conversion %d tags to CloudTag's" % len(updates))

    return True
def convert_to_v2(self):
    for job in self.jobs.values():
        d = job.__dict__
        d.pop('packetRef', None)
        d.pop('callbacks', None)
        d.pop('nonpersistent_callbacks', None)
        job.max_try_count = d.pop('maxTryCount')
        job.pck_id = self.id

    pckd = self.__dict__

    state = pckd.pop('state')

    if state == ReprState.NONINITIALIZED:
        #self._recover_noninitialized(ctx)
        logging.error("Packet %s in NONINITIALIZED state" % self)

    self.do_not_run = bool(self.flags & self.PacketFlag.USER_SUSPEND)
    self.is_broken = bool(self.flags & self.PacketFlag.RCVR_ERROR)
    pckd.pop('flags')

    if state == ReprState.SUCCESSFULL and self.do_not_run:
        #logging.warning("SUCCESSFULL and USER_SUSPEND in %s" % self.id)
        self.do_not_run = False

    pckd.pop('streams') # FIXME Overhead: will re-concat multi-deps
    pckd.pop('_active_jobs', None)
    pckd.pop('edges') # constant graph

    succeed_jobs = pckd.pop('done')
    jobs_to_run = pckd.pop('leafs')

    #active_jobs_cache = set()
    pckd.pop('as_in_queue_working')

    child_to_parents = pckd.pop('waitJobs')

    def pop_failed_job():
        if not jobs_to_run:
            raise ValueError("jobs_to_run is empty to pop")

        for job_id in jobs_to_run:
            result = self.jobs[job_id].last_result()
            if not result:
                continue

            if not result.IsSuccessfull():
                jobs_to_run.remove(job_id)
                return job_id

    jobs_to_retry = {}
    if state == ReprState.WAITING:
        if jobs_to_run:
            if self.waitingDeadline:
                job_id = pop_failed_job() or jobs_to_run.pop()
                jobs_to_retry[1] = (job_id, None, self.waitingDeadline)
            else:
                logging.error("No waitingDeadline: %s" % self)
        else:
            logging.error("WAITING && !jobs_to_run: %s" % self)

    pckd.pop('waitingDeadline', None)

    failed_jobs = set()
    if state == ReprState.ERROR:
        job_id = pop_failed_job() if jobs_to_run else None
        if job_id:
            failed_jobs.add(job_id)
        elif not self.is_broken:
            logging.error("ERROR && !broken && !failed_jobs: %s" % self)

    working_jobs = {jid for jid, deps in child_to_parents.items() if not deps} \
        - (succeed_jobs | jobs_to_run \
            | set(descr[0] for descr in jobs_to_retry.values()) \
            | failed_jobs)

    jobs_to_run |= working_jobs

    if working_jobs:
        logging.debug('working_jobs for %s in %s: %s' % (self.id, state, working_jobs))

    self.done_tag = pckd.pop('done_indicator')
    self.job_done_tag = pckd.pop('job_done_indicator')
    self.all_dep_tags = pckd.pop('allTags')
    self.bin_links = pckd.pop('binLinks')
    self.is_resetable = pckd.pop('isResetable')
    self.wait_dep_tags = pckd.pop('waitTags')

    # if we are in SUSPENDED (RCVR_ERROR or not) and len(self.wait_dep_tags)
    # -- we will wait tags (in previous packet.py impl)
    self.tags_awaited = not self.wait_dep_tags or state in _TAGS_AWAITED_STATES

    clean_state = pckd.pop('_clean_state') # TODO apply to _graph_executor

    queues = self._get_listeners_by_type((LocalQueue, LegacyQueue)) # FIXME Select one type
    queue = queues[0] if queues else None
    self.queue = queue
    if queue:
        self.DropCallbackListener(queue)

    self.__class__ = LocalPacket

    self.files_modified = False
    self.resources_modified = False
    self.files_sharing = None
    self.shared_files_resource_id = None
    self.resolved_releases = {}
    self.unresolved_release_count = 0
    self.destroying = state == ReprState.HISTORIED
    self.sbx_files = {}
    self._repr_state = None
    self.state = None

    self.finish_status = True if state == ReprState.SUCCESSFULL else \
        (False if state == ReprState.ERROR and not self.is_broken else None)

    self._saved_jobs_status = None
    self.last_sandbox_task_id = None
    self._graph_executor = DUMMY_GRAPH_EXECUTOR
    self._repr_state = state # to avoid duplicates in pck.history
    self.req_sandbox_host = None

    if state == ReprState.SUCCESSFULL:
        #pass
        g = self._create_job_graph_executor()
        self._saved_jobs_status = g.produce_detailed_status()
        #self._saved_jobs_status = self._produce_compressed_job_status(g)
        del g

    elif state == ReprState.HISTORIED:
        pass

    elif self.queue and (failed_jobs or succeed_jobs or jobs_to_retry):
        g = self._graph_executor = self._create_job_graph_executor()

        g.failed_jobs = failed_jobs
        g.succeed_jobs = succeed_jobs
        g.jobs_to_run = jobs_to_run
        g.jobs_to_retry = jobs_to_retry
        g.child_to_parents = child_to_parents
        g._clean_state = clean_state

        g.state = g._calc_state()

        # FIXME bug? waitJobs may not contain all jobs-with-parents
        _complete_waitJobs(self.id, g)

        try:
            _check_graph_consistence(g)
        except Exception as e:
            raise AssertionError("Inconsistent job graph in %s: %s" % (self.id, e))

    self.state = self._calc_state()
    self._update_repr_state()

    if self.queue:
        if self.has_pending_jobs():
            self.queue.packets_with_pending_jobs.add(self)
        self.queue.relocate_packet(self)

    if self._repr_state != state and not(state == ReprState.WORKABLE and self._repr_state == ReprState.PENDING):
        logging.warning("ReprState mismatch for %s: %s -> %s" % (self, state, self._repr_state))
def log_fail(error):
    logging.error('Failed to fetch %s for %s: %s' % (pck._final_update_url, pck, error))
def _status(self):
    history = self.History()
    total_time = history[-1][1] - history[0][1]
    wait_time = 0

    for ((state, start_time), (_, end_time)) in zip(history, history[1:] + [("", time.time())]):
        if state in (ReprState.SUSPENDED, ReprState.WAITING):
            wait_time += end_time - start_time

    result_tag = self.done_tag.name if self.done_tag else None

    waiting_time = None
    if self.state == ImplState.TIME_WAIT:
        deadline = self._graph_executor.get_nearest_retry_deadline()
        if deadline:
            waiting_time = max(int(deadline - time.time()), 0)
        else:
            logging.error("Packet %s in WAITING but has no get_nearest_retry_deadline" % self.id)

    all_tags = list(self.all_dep_tags)

    status = dict(name=self.name,
                  is_sandbox=isinstance(self, SandboxPacket),
                  last_sandbox_task_id=self.last_sandbox_task_id, # TODO History of tasks
                  last_global_error=self._graph_executor.get_global_error(),
                  resolved_releases=self.resolved_releases,
                  state=self._repr_state,
                  extended_state=self._get_extended_state(),
                  wait=list(self.wait_dep_tags),
                  all_tags=all_tags,
                  result_tag=result_tag,
                  priority=self.priority,
                  notify_emails=self.notify_emails,
                  history=history,
                  total_time=total_time,
                  wait_time=wait_time,
                  last_modified=history[-1][1],
                  waiting_time=waiting_time,
                  queue=self.queue.name if self.queue else None,
                  labels=self.user_labels,
                  oauth_login=self.oauth_login,
                  )

    extra_flags = set()

    if self.is_broken:
        extra_flags.add("can't-be-recovered")

    if self.do_not_run:
        extra_flags.add("manually-suspended")

    if self.is_too_old:
        extra_flags.add("too-old")

    if extra_flags:
        status["extra_flags"] = ";".join(extra_flags)

    if not self._is_dummy_graph_executor():
        jobs = self._graph_executor.produce_detailed_status() \
            or self._produce_clean_jobs_status()
    elif self._saved_jobs_status:
        jobs = self._saved_jobs_status
    else:
        jobs = self._produce_clean_jobs_status()

    for job in jobs:
        tag = self.job_done_tag.get(int(job['id'])) # id is str already
        if tag:
            job['result_tag'] = tag.name

    status["jobs"] = jobs

    return status