def _try_fetch_upload_task_resource_id(self, job):
    """Look up the id of the resource produced by the job's upload task.

    Lists the resources of ``job.upload_task_id`` through ``self.sandbox``
    and selects those whose ``type`` equals ``job.resource_type``.  When
    several resources match, the last one in the listing is used.

    Returns the resource id, or ``None`` when the listing request itself
    failed (the failure is logged, not raised).

    Raises ``RuntimeError`` when the listing succeeded but contains no
    resource of the requested type (or its id is falsy).
    """
    upload_task_id = job.upload_task_id

    try:
        listing = self.sandbox.list_task_resources(upload_task_id)
    except Exception as err:
        # XXX TODO: listing occasionally fails with errors like this one:
        # 2016-04-21 19:28:15,834 27491 WARNING resource_sharing:
        #   Failed to list resources of task 56706912:
        #   HTTPSConnectionPool(host='sandbox.yandex-team.ru', port=443):
        #   Max retries exceeded with url: /api/v1.0//resource?task_id=56706912&limit=100
        #   (Caused by <class 'httplib.BadStatusLine'>: '')
        logging.warning("Failed to list resources of task %d: %s" % (upload_task_id, err))
        return None

    matching_ids = [
        item['id']
        for item in listing['items']
        if item['type'] == job.resource_type
    ]
    # No early exit on purpose: the last matching resource wins.
    found_id = matching_ids[-1] if matching_ids else None

    if found_id:
        return found_id

    raise RuntimeError("Resource of type %s not found in task %d resources"
        % (job.resource_type, upload_task_id))
def _sandbox_wait2_loop(self):
    """Worker loop that resolves resource ids for jobs from ``wait2_queue``.

    Repeats until ``self.should_stop`` is set:

    1. Under ``self.lock``, waits on ``wait2_queue_not_empty`` until a job
       is queued or a stop is requested, then pops one job.
    2. Outside the lock, asks ``_try_fetch_upload_task_resource_id`` for
       the job's resource id.
    3. If that raises, the job's promise is failed with the exception;
       a falsy result schedules a ``FETCH_RESOURCE_ID`` retry in 3 seconds;
       otherwise the promise is fulfilled with the resource id.
    """
    while not self.should_stop:
        T('begin_wait2_loop')

        with self.lock:
            while not (self.should_stop or self.wait2_queue):
                with Timing('wait2_queue_wait'):
                    self.wait2_queue_not_empty.wait()

            if self.should_stop:  # TODO
                return

            job = self.wait2_queue.popleft()

        try:
            with Timing('sbx_wait2_fetch_task_resource_id'):
                resource_id = self._try_fetch_upload_task_resource_id(job)
        except Exception as e:
            logging.warning("Failed to get resource %s from task %d: %s"
                % (job.resource_type, job.upload_task_id, e))
            # Pass the cause along so the promise carries the real error,
            # as the other _set_promise_error call sites do.
            self._set_promise_error(job, e)
            del job
            continue

        if not resource_id:
            # Listing failed without raising: try again a bit later.
            self._schedule_retry(job, self.Action.FETCH_RESOURCE_ID, delay=3.0)
        else:
            logging.debug('Done with %s, resource_id = %d' % (job, resource_id))
            self._set_promise_value(job, resource_id)

        # Drop the reference before blocking on the queue again.
        del job
def _finished(job_id, f):
    """Handle completion of the sky share started for job ``job_id``.

    ``f`` must provide ``get()``, which returns the torrent id or raises
    (presumably a future-like object -- confirm against the caller).

    On failure: if any of the job's files is missing on disk the job's
    promise is failed with ``OSError(ENOENT)``; otherwise the job is handed
    to ``schedule_retry``.  On success the torrent id is stored on the job
    and the job is appended to ``upload_queue``.
    """
    T('enter_sky_share_finished %d' % job_id)

    job = self.running[job_id]
    in_progress.remove(job)

    try:
        with Timing('sky_share_future_get %d' % job_id):
            torrent_id = f.get()
    except Exception as err:
        logging.warning('sky share for %s faled: %s' % (job, err))

        # TODO better checks or collect STDERR
        some_file_missing = any(
            not os.path.exists(job.directory + '/' + name)
            for name in job.files
        )
        if some_file_missing:
            # Retrying cannot help when the data is not there.
            msg = 'Failed to share %s/%s: %s' % (job.directory, job.files, err)
            self._set_promise_error(job, OSError(errno.ENOENT, msg))
        else:
            schedule_retry(job)
        return
    else:
        logging.debug('sky share successfully done for %s: %s' % (job, torrent_id))

        with self.lock:
            job.torrent_id = torrent_id
            self.upload_queue.append(job)
            self.upload_queue_not_empty.notify()
def send_update(update, is_final):
    """Send a graph update to the REM server via ``rem_proxy.update_graph``.

    Returns whatever ``update_graph`` returns.

    Raises:
        RuntimeError: the server rejected the task id (``WrongTaskIdError``);
            ``on_notifier_fail`` is invoked first on a best-effort basis.
        RemNotifier.RetriableError: any other failure.
    """
    try:
        return rem_proxy.update_graph(
            opts.task_id,
            rpc_server_addr,
            update,
            is_final
        )
    except Exception as e:
        # Socket errors are logged briefly; anything else gets a full
        # traceback.
        if isinstance(e, socket.error):
            logging.warning("on_notifier_fail: %s" % e)
        else:
            logging.exception("on_notifier_fail")

        if is_xmlrpc_exception(e, WrongTaskIdError):
            try:
                on_notifier_fail()
            except Exception:
                # Best effort only, but do not swallow KeyboardInterrupt or
                # SystemExit the way a bare ``except:`` would.
                logging.exception("on_notifier_fail")

            raise RuntimeError("Failed to send data to rem server: %s" % e.faultString)
        else:
            # FIXME Actually if isinstance(e, xmlrpclib.Fault) then not retriable
            # but not fatal as WrongTaskIdError
            raise RemNotifier.RetriableError(str(e))
def _try_create_upload_task(self, job):
    """Create, optionally describe, and start a Sandbox upload task for ``job``.

    The task is built by ``self._create_resource_upload_task(job)``; when
    ``job.description`` is truthy the task's description is updated before
    the task is started.

    Returns the started task, or ``None`` if any of the steps raised (the
    failure is logged as a warning and not propagated).
    """
    # TODO raise on non-recoverabl errors
    create_failed = 'Failed to create upload task %s to Sandbox: %s'
    update_failed = 'Failed to update task %s: %s'
    start_failed = 'Failed to start upload task %s to Sandbox: %s'

    try:
        upload_task = self._create_resource_upload_task(job)
    except Exception as err:
        logging.warning(create_failed % (job, err))
        return None

    if job.description:
        try:
            upload_task.update(description=job.description)
        except Exception as err:
            logging.warning(update_failed % (job, err))
            return None

    try:
        upload_task.start()
    except Exception as err:
        logging.warning(start_failed % (job, err))
        return None
    else:
        return upload_task
def _sandbox_wait1_loop(self):
    """Worker loop that polls Sandbox for the status of running upload tasks.

    Jobs arrive through ``wait1_queue`` and are tracked locally by
    ``upload_task_id``.  At most once per ``poll_interval`` seconds the
    statuses of all tracked tasks are requested from ``self.sandbox``:

    * statuses in ``noop_sandbox_statuses`` keep the job tracked;
    * ``SUCCESS`` moves the job on to ``wait2_queue``;
    * ``FAILURE``/``EXCEPTION``, or a task absent from the answer,
      schedules a ``CREATE_UPLOAD_TASK`` retry in 5 seconds;
    * any other status fails the job's promise with ``RuntimeError``.

    The loop returns when ``self.should_stop`` becomes true.
    """
    poll_interval = 3.0
    in_progress = {}  # upload_task_id -> job

    noop_sandbox_statuses = {
        'DRAFT',
        'ENQUEUING', 'ENQUEUED', 'PREPARING', 'EXECUTING', 'TEMPORARY',
        'FINISHING', 'STOPPING', 'WAIT_RES', 'WAIT_TASK', 'WAIT_TIME',
    }

    next_poll_time = time.time()

    while not self.should_stop:
        T('begin_wait1_loop')

        with self.lock:
            # With nothing tracked there is nothing to poll, so block until
            # a job arrives (timeout=None); otherwise sleep at most until
            # the next poll is due.
            timeout = None
            if in_progress:
                timeout = max(0.0, next_poll_time - time.time())

            T('before_before_wait1_queue_not_empty_sleep %s'
                % ((timeout, self.should_stop, len(self.wait1_queue)),))

            # timeout == 0.0 means a poll is already due: do not wait.
            if (timeout is None or timeout) and not (self.should_stop or self.wait1_queue):
                T('before_wait1_queue_not_empty_sleep %s' % timeout)
                self.wait1_queue_not_empty.wait(timeout)

            if self.should_stop:  # TODO
                return

            job = None
            if self.wait1_queue:
                job = self.wait1_queue.popleft()
                in_progress[job.upload_task_id] = job
            del job

        if time.time() < next_poll_time:
            logging.debug('continue_wait1_sleep')
            continue

        try:
            with Timing('sbx_list_task_statuses'):
                # Pass a plain list of ids rather than a live dict view.
                statuses = self.sandbox.list_task_statuses(list(in_progress))
        except Exception as e:
            logging.warning("Failed to get sandbox tasks' statuses: %s" % e)
            continue
        finally:
            # Runs on success and on failure alike, so a failing Sandbox is
            # not polled more often than once per poll_interval.
            next_poll_time = max(next_poll_time + poll_interval, time.time())
            T('wait1_next_poll_time=%s' % next_poll_time)

        logging.debug("Task statuses: %s" % statuses)  # TODO Comment out

        done = []
        # Iterate over a snapshot of the ids: entries are popped from
        # in_progress inside the loop, and mutating a dict while iterating
        # its live key view raises RuntimeError.
        for task_id in list(in_progress):
            status = statuses.get(task_id)

            if status in noop_sandbox_statuses:
                continue

            job = in_progress.pop(task_id)

            if status == 'SUCCESS':
                done.append(job)
                logging.debug('Upload task=%d in SUCCESS for %s' % (task_id, job))

            elif status in ['FAILURE', 'EXCEPTION'] or status is None:
                # status is None when the task is missing from the answer.
                logging.warning("Task %d in FAILURE. Will create new task" % task_id)
                self._schedule_retry(job, self.Action.CREATE_UPLOAD_TASK, delay=5.0)

            else:
                logging.error("Unknown task status %s for task=%d, %s" % (status, task_id, job))
                self._set_promise_error(job, RuntimeError("Unknown task status %s for task_id=%d" % (status, task_id)))

        T('after_process_all_wait1_statuses')

        with self.lock:
            self.wait2_queue.extend(done)
            self.wait2_queue_not_empty.notify()
def get_token_info(token):
    """Ask blackbox about an OAuth token and return the parsed answer.

    The request is retried on network errors and on 5xx answers, sleeping
    between attempts, until ``MAX_TOTAL_FETCH_TIME`` seconds have passed
    since the call started.

    Returns an ``OAuthInfo`` with ``token``, ``error`` and ``token_status``
    set; ``login`` and ``client_id`` are set only when the status is
    ``TokenStatus.VALID``.

    Raises:
        NetworkError: the deadline passed and the last attempt failed on
            the network level.
        BlackboxServerError: the deadline passed and the last attempt got
            a 5xx answer.
        RuntimeError: blackbox answered with a non-200 status, or the
            answer carries no "status" field.
    """
    # FIXME use https (it's slower: 20ms vs 5ms)
    # NOTE(review): the token is interpolated into the query string without
    # URL-escaping -- confirm tokens can never contain reserved characters.
    url = "http://blackbox.yandex-team.ru/blackbox?method=oauth" \
          "&oauth_token={token}&userip={addr}&format=json"
    url = url.format(token=token, addr="127.0.0.1")

    last_error = None
    delay = MIN_FETCH_INTERVAL
    deadline = time.time() + MAX_TOTAL_FETCH_TIME

    while True:
        ok = False

        now = time.time()
        if now > deadline:
            if last_error:
                raise last_error
            else:
                fetch_timeout = 0.0
        else:
            # Never let a single attempt run past the overall deadline.
            fetch_timeout = min(FETCH_TIMEOUT, deadline - now)

        try:
            resp = requests.get(url, timeout=fetch_timeout)
            ok = True
        except (exceptions.Timeout, exceptions.SSLError, exceptions.ConnectionError) as e:
            logging.warning("Failed to get answer from blackbox: %s" % e)
            last_error = NetworkError(str(e))
        else:
            # Floor division: with true division only status 500 would
            # compare equal to 5, so 502/503/... would never be retried.
            if resp.status_code // 100 == 5:
                # doc says that blackbox always returns 200...
                ok = False
                msg = "Got %d http status from blackbox" % resp.status_code
                last_error = BlackboxServerError(msg)
                logging.warning(msg)

        if ok:
            break

        now = time.time()
        if now > deadline:
            raise last_error

        # logging.debug('RETRY: %s' % last_error)
        # Back off, but never sleep past the deadline.
        delay = min(delay * 2, MAX_FETCH_INTERVAL, deadline - now)
        time.sleep(delay)

    if resp.status_code != 200:
        raise RuntimeError("blackbox answered with %s status code" % resp.status_code)

    data = resp.json()

    ret = OAuthInfo()
    ret.token = token
    ret.error = data.get("error")

    if "status" in data:
        ret.token_status = data["status"]["value"]

        if ret.token_status == TokenStatus.VALID:
            ret.login = data["login"]
            oauth = data["oauth"]
            ret.client_id = oauth["client_id"]
    else:
        # No status at all: report blackbox's own error text.
        raise RuntimeError(ret.error)

    return ret