def __init__(self, art_name, art_dict, logger=None):
    self.name = art_name
    self.key: str = None
    self.local_path: str = None
    self.remote_path: str = None
    self.credentials = None
    self.hash = None

    self.logger = logger
    if self.logger is None:
        self.logger = logs.get_logger(self.__class__.__name__)
        self.logger.setLevel(storage_setup.get_storage_verbose_level())

    self.storage_handler: StorageHandler = None
    self.unpack: bool = art_dict.get('unpack')
    self.is_mutable: bool = art_dict.get('mutable')

    if 'key' in art_dict:
        self.key = art_dict['key']
    if 'local' in art_dict:
        self.local_path = art_dict['local']
    if 'qualified' in art_dict:
        self.remote_path = art_dict['qualified']
    if 'url' in art_dict:
        self.remote_path = art_dict['url']
    if 'hash' in art_dict:
        self.hash = art_dict['hash']

    self.credentials = credentials.Credentials.get_credentials(art_dict)
    self._setup_storage_handler(art_dict)
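For reference, a minimal sketch of the artifact dictionary this constructor consumes, inferred only from the keys it reads above; all values are hypothetical placeholders:

# Hypothetical art_dict, based solely on the keys read by Artifact.__init__.
art_dict = {
    'key': 'experiments/my_exp/workspace.tar.gz',   # storage key
    'local': '/tmp/workspace',                      # local path
    'url': 'https://example.com/workspace.tar.gz',  # remote path
    'unpack': True,                                 # unpack after download
    'mutable': False,                               # immutable artifact
    'hash': 'deadbeef',                             # content hash
}
artifact = Artifact('workspace', art_dict)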
def test_experiment_lifetime(self):
    my_path = os.path.dirname(os.path.realpath(__file__))

    logger = logs.get_logger('test_experiment_lifetime')
    logger.setLevel(10)

    config_name = os.path.join(my_path, 'test_config.yaml')
    key = 'test_experiment_lifetime' + str(uuid.uuid4())

    with model.get_db_provider(model.get_config(config_name)) as db:
        try:
            db.delete_experiment(key)
        except Exception:
            pass

        p = subprocess.Popen(
            ['studio', 'run',
             '--config=' + config_name,
             '--experiment=' + key,
             '--force-git',
             '--verbose=' + EXPERIMENT_VERBOSE_LEVEL,
             '--lifetime=-10m',
             'stop_experiment.py'],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            cwd=my_path)

        pout, _ = p.communicate()
        if pout:
            logger.debug("studio run output: \n" + pout.decode())

        db.delete_experiment(key)
def __init__(self, db_config, measure_timestamp_diff=False,
             blocking_auth=True, compression=None):
    verbose = get_storage_verbose_level()
    self.logger = logs.get_logger(self.__class__.__name__)
    self.logger.setLevel(verbose)

    guest = db_config.get('guest')
    self.app = pyrebase.initialize_app(db_config)

    if compression is None:
        compression = db_config.get('compression')

    self.auth = None
    if not guest and 'serviceAccount' not in db_config:
        self.auth = get_auth(db_config['type'], blocking_auth,
                             verbose=verbose)

    super().__init__(StorageType.storageFirebase,
                     self.logger,
                     measure_timestamp_diff=measure_timestamp_diff,
                     compression=compression)
def __init__(self, auth_cookie=None, verbose=10, branch=None,
             user_startup_script=None):
    self.startup_script_file = os.path.join(
        os.path.dirname(__file__), 'scripts/ec2_worker_startup.sh')
    self.install_studio_script = os.path.join(
        os.path.dirname(__file__), 'scripts/install_studio.sh')

    self.client = boto3.client('ec2')
    self.asclient = boto3.client('autoscaling')
    self.cwclient = boto3.client('cloudwatch')
    self.region = self.client._client_config.region_name

    self.logger = logs.get_logger('EC2WorkerManager')
    self.logger.setLevel(verbose)
    self.auth_cookie = auth_cookie
    self.prices = self._get_ondemand_prices(_instance_specs.keys())

    self.repo_url = git_util.get_my_repo_url()
    self.branch = branch if branch else git_util.get_my_checkout_target()
    self.user_startup_script = user_startup_script
    if user_startup_script:
        self.logger.warning('User startup script argument is deprecated')
def __init__(self, func=lambda x: x, parent=None, q_in=None, q_out=None,
             num_workers=0, q_size=None, batch_size=1,
             filterf=lambda x: x is not None,
             batcher=lambda x: x, timeout=1):
    min_q_size = 10
    self.func = func
    self.parent = parent
    self.num_workers = num_workers
    self.filterf = filterf
    self.batch_size = batch_size
    self.batcher = batcher

    self.q_in = q_in
    self.q_out = q_out
    if num_workers > 0:
        self.q_size = q_size if q_size else 2 * num_workers
    else:
        self.q_size = q_size if q_size else min_q_size
    # Clamp from below so small worker counts still get
    # a reasonably sized buffer.
    self.q_size = max(min_q_size, self.q_size)

    self.logger = logs.get_logger('BufferedPipe')
    self.logger.setLevel(10)
    self.timeout = timeout
    self.worker_frame = Thread
def __init__(self, config, verbose=10, blocking_auth=True, compression=None):
    # TODO: implement connection
    self.url = config.get('serverUrl', None)
    self.verbose = get_storage_verbose_level()
    self.logger = logs.get_logger('HTTPProvider')
    self.logger.setLevel(self.verbose)

    self.credentials: Credentials = \
        Credentials.get_credentials(config)
    self.storage_handler = HTTPStorageHandler(
        self.url,
        self.credentials.to_dict() if self.credentials else None,
        compression=compression)

    self.auth = None
    guest = config.get('guest', None)
    if not guest and 'serviceAccount' not in config:
        self.auth = get_auth(config.get('authentication', None),
                             blocking_auth)

    self.compression = compression
    if self.compression is None:
        self.compression = config.get('compression', None)
def __init__(self, zone='us-east1-c', auth_cookie=None, verbose=10,
             branch=None, user_startup_script=None):
    assert 'GOOGLE_APPLICATION_CREDENTIALS' in os.environ
    with open(os.environ['GOOGLE_APPLICATION_CREDENTIALS'], 'r') as f:
        credentials_dict = json.loads(f.read())

    self.compute = googleapiclient.discovery.build('compute', 'v1')

    self.startup_script_file = os.path.join(
        os.path.dirname(__file__), 'scripts/gcloud_worker_startup.sh')
    self.install_studio_script = os.path.join(
        os.path.dirname(__file__), 'scripts/install_studio.sh')

    self.zone = zone
    self.projectid = credentials_dict['project_id']
    self.logger = logs.get_logger("GCloudWorkerManager")
    self.logger.setLevel(verbose)
    self.auth_cookie = auth_cookie
    self.user_startup_script = user_startup_script

    self.repo_url = git_util.get_my_repo_url()
    self.branch = branch if branch else git_util.get_my_checkout_target()
    self.log_bucket = "studioml-logs"
    if user_startup_script:
        self.logger.warning('User startup script argument is deprecated')
def get_worker_manager(config, cloud=None, verbose=10):
    if cloud is None:
        return None

    assert cloud in ['gcloud', 'gcspot', 'ec2', 'ec2spot']
    logger = logs.get_logger('runner.get_worker_manager')
    logger.setLevel(verbose)

    auth = get_auth(config['database']['authentication'])
    auth_cookie = auth.get_token_file() if auth else None

    branch = config['cloud'].get('branch')
    logger.info('using branch {}'.format(branch))

    if cloud in ['gcloud', 'gcspot']:
        cloudconfig = config['cloud']['gcloud']
        worker_manager = GCloudWorkerManager(
            auth_cookie=auth_cookie,
            zone=cloudconfig['zone'],
            branch=branch,
            user_startup_script=config['cloud'].get('user_startup_script'))
    elif cloud in ['ec2', 'ec2spot']:
        worker_manager = EC2WorkerManager(
            auth_cookie=auth_cookie,
            branch=branch,
            user_startup_script=config['cloud'].get('user_startup_script'))

    return worker_manager
def getlogger():
    global logger
    if logger is None:
        logger = logs.get_logger('studio_server')
        logger.setLevel(10)
    return logger
def __init__(self, queue, route, amqp_url='', config=None, logger=None):
    """Set up the publisher object, passing in the URL we will
    use to connect to RabbitMQ.
    """
    self._rmq_lock = threading.RLock()
    self._connection = None
    self._channel = None
    self._consumer = None
    self._consume_ready = False

    self._msg_tracking_lock = threading.RLock()
    self._deliveries = []
    self._acked = 0
    self._nacked = 0
    self._message_number = 0

    self._rmq_msg = None
    self._rmq_id = None

    self._stopping = False
    self._exchange = 'StudioML.topic'
    self._exchange_type = 'topic'
    self._routing_key = route
    self._url = amqp_url
    self._is_persistent: bool = False

    if logger is not None:
        self._logger = logger
    else:
        self._logger = logs.get_logger('RabbitMQ')
        self._logger.setLevel(get_storage_verbose_level())

    if config is not None:
        # Extract from the config data structure any settings related to
        # queue messaging for RabbitMQ.
        if 'cloud' in config:
            if 'queue' in config['cloud']:
                if 'rmq' in config['cloud']['queue']:
                    self._url = config['cloud']['queue']['rmq']
                    self._logger.warning('use queue url %s', self._url)
                flag_persistent = config['cloud']['queue']\
                    .get('persistent', False)
                if isinstance(flag_persistent, str):
                    flag_persistent = flag_persistent.lower() == 'true'
                self._is_persistent = flag_persistent

    self._queue = queue
    self._queue_deleted = True
    self._connection_failed = False
    self._connection_failure_reason = None

    # The pika library for RabbitMQ has an asynchronous run method
    # that needs to run forever and will do reconnections etc.
    # automatically for us.
    thr = threading.Thread(target=self._run, args=(), kwargs={})
    thr.daemon = True
    thr.start()

    self._wait_queue_created(600)
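A hedged sketch of the config fragment the constructor above inspects; only the cloud.queue.rmq and cloud.queue.persistent keys are consulted, and the URL value is a placeholder:

# Hypothetical config fragment for the RabbitMQ queue constructor.
config = {
    'cloud': {
        'queue': {
            'rmq': 'amqp://guest:guest@localhost:5672/%2f',  # placeholder URL
            'persistent': 'true',  # strings are coerced to bool above
        }
    }
}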
def __init__(self, name: str,
             receiver_keypath: str,
             sender_keypath: str = None):
    """
    param: name - payload builder name
    param: receiver_keypath - file path to .pem file
                              with recipient public key
    param: sender_keypath - file path to .pem file
                            with sender private key
    """
    super(EncryptedPayloadBuilder, self).__init__(name)

    # XXX Set logger verbosity level here
    self.logger = logs.get_logger(self.__class__.__name__)

    self.recipient_key_path = receiver_keypath
    self.recipient_key = None
    try:
        self.recipient_key = \
            RSA.import_key(open(self.recipient_key_path).read())
    except BaseException:
        check_for_kb_interrupt()
        msg = "FAILED to import recipient public key from: {0}"\
            .format(self.recipient_key_path)
        self.logger.error(msg)
        raise ValueError(msg)

    self.sender_key_path = sender_keypath
    self.sender_key: SigningKey = None
    self.verify_key: VerifyKey = None
    self.sender_fingerprint = None

    if self.sender_key_path is None:
        self.logger.error("Signing key path must be specified "
                          "for encrypted payloads. ABORTING.")
        raise ValueError()

    # We expect an ed25519 signing key in "openssh private key" format.
    try:
        public_key_data, private_key_data = \
            Ed25519KeyUtil.parse_private_key_file(
                self.sender_key_path, self.logger)
        if public_key_data is None or private_key_data is None:
            self._raise_error(
                "Failed to import private signing key from {0}. ABORTING."
                .format(self.sender_key_path))
        self.sender_key = SigningKey(private_key_data)
        self.verify_key = VerifyKey(public_key_data)
    except Exception:
        self._raise_error(
            "FAILED to open/read private signing key file: {0}"
            .format(self.sender_key_path))

    self.sender_fingerprint = \
        self._get_fingerprint(public_key_data)

    self.simple_builder = \
        UnencryptedPayloadBuilder("simple-builder-for-encryptor")
def __init__(self, queue, args):
    self.config = args.config
    if args.guest:
        self.config['database']['guest'] = True

    self.task_queue = queue
    self.logger = logs.get_logger('LocalExecutor')
    self.logger.setLevel(model.parse_verbosity(self.config.get('verbose')))
    self.logger.debug("Config: ")
    self.logger.debug(self.config)
def __init__(self, db_config,
             handler: StorageHandler,
             compression=None):
    self.logger = logs.get_logger(self.__class__.__name__)
    self.logger.setLevel(get_storage_verbose_level())

    self.compression = compression
    if self.compression is None:
        self.compression = db_config.get('compression', None)

    self.auth = None
    self.storage_handler = handler
    self.max_keys = db_config.get('max_keys', 100)
def __init__(self, path=None, verbose=10):
    if path is None:
        self.path = self._get_queue_directory()
    else:
        self.path = path

    self.logger = logs.get_logger(self.__class__.__name__)
    self.logger.setLevel(verbose)

    self.status_marker = os.path.join(self.path, 'is_active.queue')
    try:
        # Touch the status marker file to signal that the queue is active.
        with open(self.status_marker, "w"):
            pass
    except IOError:
        self.logger.error('FAILED to create %s for LocalQueue. ABORTING.',
                          self.status_marker)
        sys.exit(-1)
def allocate_resources(experiment, config=None, verbose=10):
    logger = logs.get_logger('allocate_resources')
    logger.setLevel(verbose)
    logger.info('Allocating resources {} for experiment {}'.format(
        experiment.resources_needed, experiment.key))

    ret_val = True
    gpus_needed = int(experiment.resources_needed.get('gpus')) \
        if experiment.resources_needed else 0

    if gpus_needed > 0:
        ret_val = ret_val and allocate_gpus(
            gpus_needed, experiment.resources_needed, config)
    else:
        allocate_gpus(0)

    return ret_val
def get_db_provider(config=None, blocking_auth=True):
    db_provider = get_storage_db_provider()
    if db_provider is not None:
        return db_provider

    if config is None:
        config = get_config()
    verbose = parse_verbosity(config.get('verbose', None))

    # Save this verbosity level as global for the whole experiment job:
    set_storage_verbose_level(verbose)

    logger = logs.get_logger("get_db_provider")
    logger.setLevel(verbose)
    logger.debug('Choosing db provider with config:')
    logger.debug(config)

    if 'storage' in config:
        artifact_store = get_artifact_store(config['storage'])
    else:
        artifact_store = None

    assert 'database' in config
    db_config = config['database']

    if db_config['type'].lower() == 's3':
        db_provider = S3Provider(db_config, blocking_auth=blocking_auth)
        if artifact_store is None:
            artifact_store = db_provider.get_storage_handler()
    elif db_config['type'].lower() == 'gs':
        raise NotImplementedError("GS is not supported.")
    elif db_config['type'].lower() == 'local':
        db_provider = LocalDbProvider(db_config,
                                      blocking_auth=blocking_auth)
        if artifact_store is None:
            artifact_store = db_provider.get_storage_handler()
    else:
        raise ValueError('Unknown type of the database ' +
                         db_config['type'])

    setup_storage(db_provider, artifact_store)
    return db_provider
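A minimal, assumed config shape that would route through the 's3' branch above; the bucket and endpoint values are placeholders, and the exact keys S3Provider requires may differ:

# Hypothetical config exercising the 's3' branch of get_db_provider.
config = {
    'verbose': 'debug',
    'database': {
        'type': 's3',
        'bucket': 'my-studioml-db',        # placeholder
        'endpoint': 'https://s3.example',  # placeholder
    },
}
db = get_db_provider(config)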
def __init__(self, name: str, path: str = None, logger=None):
    if logger is not None:
        self._logger = logger
    else:
        self._logger = logs.get_logger('LocalQueue')
        self._logger.setLevel(get_storage_verbose_level())

    self.name = name
    if path is None:
        self.path = self._get_queue_directory()
    else:
        self.path = path
    self.path = os.path.join(self.path, name)
    os.makedirs(self.path, exist_ok=True)

    # Local queue is considered active iff its directory exists.
    self._lock_path = os.path.join(self.path, LOCK_FILE_NAME)
    self._lock = filelock.SoftFileLock(self._lock_path)
def __init__(self, config, blocking=True, verbose=logs.DEBUG):
    self.logger = logs.get_logger(self.__class__.__name__)
    self.logger.setLevel(verbose)

    if isinstance(config, dict):
        self.config = config
    else:
        self.config = {'type': config}

    self.tokendir = os.path.abspath(
        os.path.expanduser(self.config.get('token_directory', TOKEN_DIR)))
    if not os.path.exists(self.tokendir):
        os.makedirs(self.tokendir)

    self.token = self._load_token()[0]
    if self.token is None and blocking:
        self._sign_in()
def __init__(self, cred_dict):
    self.logger = logs.get_logger(self.__class__.__name__)
    self.logger.setLevel(storage_setup.get_storage_verbose_level())

    self.type = None
    self.key = None
    self.secret_key = None
    self.session_token = None
    self.region = None
    self.profile = None

    if cred_dict is None:
        return
    if isinstance(cred_dict, str) and cred_dict == 'none':
        return
    if not isinstance(cred_dict, dict):
        msg: str = \
            "NOT SUPPORTED credentials format {0}".format(repr(cred_dict))
        util.report_fatal(msg, self.logger)
    if len(cred_dict) == 0:
        # An empty credentials dictionary is treated like None:
        return

    if len(cred_dict) == 1 and AWS_TYPE in cred_dict:
        aws_creds = cred_dict[AWS_TYPE]
        self.type = AWS_TYPE
        self.key = aws_creds.get(AWS_KEY, None)
        self.secret_key = aws_creds.get(AWS_SECRET_KEY, None)
        self.session_token = aws_creds.get(AWS_SESSION_TOKEN, None)
        self.region = self._get_named(AWS_REGION, aws_creds)
        self.profile = self._get_named(AWS_PROFILE, aws_creds)
        if self.key is None or self.secret_key is None:
            msg: str = \
                "INVALID aws credentials format {0}".format(repr(cred_dict))
            util.report_fatal(msg, self.logger)
    else:
        msg: str = \
            "NOT SUPPORTED credentials format {0}".format(repr(cred_dict))
        util.report_fatal(msg, self.logger)
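The accepted credentials dictionary is a single-entry mapping keyed by AWS_TYPE; below is a sketch written in terms of the module's own constants. The literal values are placeholders, and the exact shape expected by _get_named for region/profile is an assumption:

# Hypothetical cred_dict shape accepted by the Credentials constructor.
cred_dict = {
    AWS_TYPE: {
        AWS_KEY: 'AKIA...',           # access key id (placeholder)
        AWS_SECRET_KEY: 'secret...',  # secret access key (placeholder)
        AWS_SESSION_TOKEN: None,      # optional
        AWS_REGION: 'us-east-1',      # optional (placeholder)
    }
}
creds = Credentials(cred_dict)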
def test_stop_experiment(self):
    my_path = os.path.dirname(os.path.realpath(__file__))

    logger = logs.get_logger('test_stop_experiment')
    logger.setLevel(10)

    config_name = os.path.join(my_path, 'test_config.yaml')
    key = 'test_stop_experiment' + str(uuid.uuid4())

    with model.get_db_provider(model.get_config(config_name)) as db:
        try:
            db.delete_experiment(key)
        except Exception:
            pass

        p = subprocess.Popen(
            ['studio', 'run',
             '--config=' + config_name,
             '--experiment=' + key,
             '--force-git',
             '--verbose=' + EXPERIMENT_VERBOSE_LEVEL,
             'stop_experiment.py'],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            cwd=my_path)

        # Wait till the experiment spins up.
        experiment = None
        while experiment is None or experiment.status == 'waiting':
            time.sleep(1)
            try:
                experiment = db.get_experiment(key)
            except BaseException:
                pass

        logger.info('Stopping experiment')
        db.stop_experiment(key)

        pout, _ = p.communicate()
        if pout:
            logger.debug("studio run output: \n" + pout.decode())

        db.delete_experiment(key)
def __init__(self, config, blocking=True, verbose=logs.DEBUG):
    if not os.path.exists(TOKEN_DIR):
        try:
            os.makedirs(TOKEN_DIR)
        except OSError:
            pass

    self.logger = logs.get_logger(self.__class__.__name__)
    self.logger.setLevel(logs.DEBUG)

    self.firebase = pyrebase.initialize_app(config)
    self.user = {}
    self.use_email_auth = config.get('use_email_auth', False)
    if self.use_email_auth:
        self.email = config.get('email')
        self.password = config.get('password')
        if not self.password or not self.email:
            self.email = input('Firebase token is not found or expired! '
                               'You need to re-login. (Or re-run '
                               'studio/studio-runner '
                               'with the --guest option.)'
                               '\nemail:')
            self.password = getpass.getpass('password:')

    self.expired = True
    self._update_user()
    if self.expired and blocking:
        print('Authentication required! Either specify '
              'use_email_auth in config file, or run '
              'studio and go to webui '
              '(localhost:5000 by default) '
              'to authenticate using google credentials')
        while self.expired:
            time.sleep(1)
            self._update_user()

    self.sched = BackgroundScheduler()
    self.sched.start()
    self.sched.add_job(self._update_user, 'interval', minutes=31)
    atexit.register(self.sched.shutdown)
def __init__(self, queue_name, sub_name=None, verbose=10):
    from google.cloud import pubsub

    assert 'GOOGLE_APPLICATION_CREDENTIALS' in os.environ
    with open(os.environ['GOOGLE_APPLICATION_CREDENTIALS']) as f:
        credentials = json.loads(f.read())

    project_name = credentials['project_id']

    self.logger = logs.get_logger(self.__class__.__name__)
    if verbose is not None:
        self.logger.setLevel(parse_verbosity(verbose))

    self.pubclient = pubsub.PublisherClient()
    self.subclient = pubsub.SubscriberClient()

    self.project = project_name
    self.topic_name = self.pubclient.topic_path(project_name, queue_name)
    self.logger.info("Topic name = {}".format(self.topic_name))
    try:
        self.pubtopic = self.pubclient.get_topic(self.topic_name)
    except BaseException:
        check_for_kb_interrupt()
        self.pubtopic = self.pubclient.create_topic(self.topic_name)
        self.logger.info('topic {} created'.format(self.topic_name))

    sub_name = sub_name if sub_name else queue_name + "_sub"
    self.logger.info("Topic name = {}".format(queue_name))
    self.logger.info("Subscription name = {}".format(sub_name))

    self.sub_name = self.subclient.subscription_path(
        project_name, sub_name)
    try:
        self.subclient.get_subscription(self.sub_name)
    except BaseException as e:
        check_for_kb_interrupt()
        self.logger.warning(e)
        self.subclient.create_subscription(self.sub_name, self.topic_name,
                                           ack_deadline_seconds=20)
        self.logger.info('subscription {} created'.format(sub_name))
def __init__(self, remote_path, credentials_dict,
             timestamp=None, compression=None):
    self.logger = logs.get_logger(self.__class__.__name__)
    self.logger.setLevel(get_storage_verbose_level())

    self.url = remote_path
    self.timestamp = timestamp

    parsed_url = urlparse(self.url)
    self.scheme = parsed_url.scheme
    self.endpoint = parsed_url.netloc
    self.path = parsed_url.path
    self.credentials = Credentials(credentials_dict)

    super().__init__(StorageType.storageHTTP,
                     self.logger, False, compression=compression)
def get_db_provider(config=None, blocking_auth=True):
    db_provider = get_storage_db_provider()
    if db_provider is not None:
        return db_provider

    if config is None:
        config = get_config()
    verbose = parse_verbosity(config.get('verbose'))

    # Save this verbosity level as global for the whole experiment job:
    set_storage_verbose_level(verbose)

    logger = logs.get_logger("get_db_provider")
    logger.setLevel(verbose)
    logger.debug('Choosing db provider with config:')
    logger.debug(config)

    if 'storage' in config:
        artifact_store = db_provider_setup.get_artifact_store(
            config['storage'])
    else:
        artifact_store = None

    assert 'database' in config
    db_config = config['database']

    if db_config['type'].lower() == 'firebase':
        db_provider = FirebaseProvider(db_config,
                                       blocking_auth=blocking_auth)
    elif db_config['type'].lower() == 'http':
        db_provider = HTTPProvider(db_config,
                                   verbose=verbose,
                                   blocking_auth=blocking_auth)
    else:
        db_provider = db_provider_setup.get_db_provider(
            config=config, blocking_auth=blocking_auth)

    setup_storage(db_provider, artifact_store)
    return db_provider
def __init__(self, name, config=None, logger=None):
    if logger is not None:
        self.logger = logger
    else:
        self.logger = logs.get_logger('SQSQueue')
        self.logger.setLevel(get_storage_verbose_level())

    self.name = name
    self.is_persistent = False

    self.credentials = self._setup_from_config(config)
    aws_access_key_id = self.credentials.get_key()
    aws_secret_access_key = self.credentials.get_secret_key()
    if self.credentials.get_profile() is not None:
        # If profile name is specified, for whatever reason
        # boto3 API will barf if (key, secret key) pair
        # is also defined.
        aws_access_key_id = None
        aws_secret_access_key = None

    self._session = boto3.session.Session(
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        aws_session_token=None,
        region_name=self.credentials.get_region(),
        profile_name=self.credentials.get_profile())

    self._client = self._session.client('sqs')

    create_q_response = self._client.create_queue(QueueName=name)
    self.queue_url = create_q_response['QueueUrl']
    self.logger.info('Creating SQS queue with name %s', name)
    self.logger.info('Queue url = %s', self.queue_url)
def test_two_receivers(self):
    logger = logs.get_logger('test_two_receivers')
    logger.setLevel(10)

    q1 = self.get_queue()
    q1.clean()
    q2 = self.get_queue(q1.get_name())

    data1 = str(uuid.uuid4())
    data2 = str(uuid.uuid4())
    logger.debug('data1 = ' + data1)
    logger.debug('data2 = ' + data2)

    q1.enqueue(data1)
    recv_data1 = self.get_queue_data(
        q2.dequeue(timeout=self.get_timeout()))
    self.assertEqual(data1, recv_data1)

    q1.enqueue(data1)
    q1.enqueue(data2)

    recv_data1 = q1.dequeue(timeout=self.get_timeout())
    recv_data2 = q2.dequeue(timeout=self.get_timeout())
    recv1 = self.get_queue_data(recv_data1)
    recv2 = self.get_queue_data(recv_data2)
    logger.debug('recv1 = ' + recv1)
    logger.debug('recv2 = ' + recv2)

    self.assertTrue(data1 == recv1 or data2 == recv1)
    self.assertTrue(data1 == recv2 or data2 == recv2)
    self.assertFalse(recv1 == recv2)

    self.assertTrue(q1.dequeue() is None)
    self.assertTrue(q2.dequeue() is None)
def __init__(self, config,
             measure_timestamp_diff=False,
             compression=None):
    self.logger = logs.get_logger(self.__class__.__name__)
    self.logger.setLevel(get_storage_verbose_level())

    if compression is None:
        compression = config.get('compression', None)

    self.endpoint = config.get('endpoint', '~')
    self.endpoint = os.path.realpath(os.path.expanduser(self.endpoint))
    if not os.path.exists(self.endpoint) \
            or not os.path.isdir(self.endpoint):
        msg: str = "Store root {0} doesn't exist or is not a directory." \
            " Aborting.".format(self.endpoint)
        self._report_fatal(msg)

    self.bucket = config.get('bucket', 'storage')
    self.store_root = os.path.join(self.endpoint, self.bucket)
    self._ensure_path_dirs_exist(self.store_root)

    super().__init__(StorageType.storageLocal,
                     self.logger,
                     measure_timestamp_diff,
                     compression=compression)
def worker_loop(queue, parsed_args,
                single_experiment=False,
                timeout=0,
                verbose=None):
    fetch_artifacts = True
    logger = logs.get_logger('worker_loop')

    hold_period = 4
    retval = 0
    while True:
        msg = queue.dequeue(acknowledge=False, timeout=timeout)
        if not msg:
            break

        first_exp, ack_key = msg

        data_dict = json.loads(sixdecode(first_exp))
        experiment_key = data_dict['experiment']['key']
        config = data_dict['config']
        parsed_args.config = config
        if verbose:
            config['verbose'] = verbose
        else:
            verbose = model.parse_verbosity(config.get('verbose', None))
        logger.setLevel(verbose)

        logger.debug('Received message: \n{}'.format(data_dict))

        executor = LocalExecutor(queue, parsed_args)
        with model.get_db_provider(config) as db:
            # experiment = experiment_from_dict(data_dict['experiment'])
            def try_get_experiment():
                experiment = db.get_experiment(experiment_key)
                if experiment is None:
                    raise ValueError(
                        'experiment is not found - '
                        'indicates storage failure')
                return experiment

            experiment = retry(try_get_experiment,
                               sleep_time=10, logger=logger)

            if config.get('experimentLifetime', None) and \
                    int(str2duration(config['experimentLifetime'])
                        .total_seconds()) + experiment.time_added < \
                    time.time():
                logger.info(
                    'Experiment expired (max lifetime of {0} was exceeded)'
                    .format(config.get('experimentLifetime', None)))
                queue.acknowledge(ack_key)
                continue

            if allocate_resources(experiment, config, verbose=verbose):
                def hold_job():
                    queue.hold(ack_key, hold_period)

                hold_job()
                sched = BackgroundScheduler()
                sched.add_job(hold_job, 'interval', minutes=hold_period / 2)
                sched.start()

                try:
                    python = 'python'
                    if experiment.pythonver[0] == '3':
                        python = 'python3'

                    if '_singularity' not in experiment.artifacts:
                        pip_diff = pip_needed_packages(
                            experiment.pythonenv, python)
                        if any(pip_diff):
                            logger.info(
                                'Setting up python packages for experiment')
                            if pip_install_packages(
                                    pip_diff, python, logger) != 0:
                                logger.info(
                                    "Installation of all packages together "
                                    "failed, trying one package at a time")
                                for pkg in pip_diff:
                                    pip_install_packages(
                                        [pkg], python, logger)

                    for tag, item in experiment.artifacts.items():
                        art: Artifact = item
                        if fetch_artifacts or art.local_path is None:
                            get_only_newer: bool = True
                            if tag == 'workspace':
                                get_only_newer = False
                            if not art.is_mutable:
                                logger.info('Fetching artifact ' + tag)
                                art.local_path = retry(
                                    lambda: db.get_artifact(
                                        art, only_newer=get_only_newer),
                                    sleep_time=10,
                                    logger=logger)
                            else:
                                logger.info(
                                    'Skipping mutable artifact ' + tag)

                    returncode = executor.run(experiment)
                    if returncode != 0:
                        retval = returncode
                finally:
                    sched.shutdown()
                    queue.acknowledge(ack_key)

                if single_experiment:
                    logger.info('single_experiment is True, quitting')
                    return retval
            else:
                logger.info('Cannot run experiment ' + experiment.key +
                            ' due to lack of resources. Will retry')
                # Debounce failed requests we cannot service yet.
                time.sleep(config.get('sleep_time', 5))

    logger.info("Queue in {0} is empty, quitting".format(
        fs_tracker.get_queue_directory()))
    return retval
import six
import signal
import pdb

from apscheduler.schedulers.background import BackgroundScheduler

from studio import fs_tracker, model
from studio.util import logs
from studio.queues.local_queue import LocalQueue
from studio.util.gpu_util import get_available_gpus, get_gpu_mapping, \
    get_gpus_summary
from studio.artifacts.artifact import Artifact
from studio.experiments.experiment import Experiment
from studio.util.util import sixdecode, str2duration, retry, \
    parse_verbosity, check_for_kb_interrupt

logs.get_logger('apscheduler.scheduler').setLevel(logs.ERROR)


class LocalExecutor(object):
    """Runs job while capturing environment and logs results."""

    def __init__(self, queue, args):
        self.config = args.config
        if args.guest:
            self.config['database']['guest'] = True

        self.task_queue = queue
        self.logger = logs.get_logger('LocalExecutor')
        self.logger.setLevel(
            model.parse_verbosity(self.config.get('verbose', None)))
def __init__(self, config,
             measure_timestamp_diff=False,
             compression=None):
    self.logger = logs.get_logger(self.__class__.__name__)
    self.logger.setLevel(get_storage_verbose_level())

    self.credentials: Credentials = \
        Credentials.get_credentials(config)
    self.endpoint = config.get('endpoint', None)

    if self.credentials is None:
        msg: str = "NO CREDENTIALS provided for {0}."\
            .format(self.endpoint)
        self._report_fatal(msg)
    if self.credentials.get_type() != AWS_TYPE:
        msg: str = "EXPECTED aws credentials for {0}: {1}"\
            .format(self.endpoint, repr(self.credentials.to_dict()))
        self._report_fatal(msg)

    aws_key: str = self.credentials.get_key()
    aws_secret_key = self.credentials.get_secret_key()
    region_name = self.credentials.get_region()
    profile_name = self.credentials.get_profile()

    if profile_name is not None:
        # It seems that an explicitly specified profile name
        # should not be used with explicitly specified credentials:
        aws_key = None
        aws_secret_key = None

    session = Session(aws_access_key_id=aws_key,
                      aws_secret_access_key=aws_secret_key,
                      region_name=region_name,
                      profile_name=profile_name)
    session.events.unregister('before-parameter-build.s3.ListObjects',
                              set_list_objects_encoding_type_url)

    self.client = session.client(
        's3',
        endpoint_url=self.endpoint,
        config=Config(signature_version='s3v4'))

    if compression is None:
        compression = config.get('compression', None)

    self.cleanup_bucket = config.get('cleanup_bucket', False)
    if isinstance(self.cleanup_bucket, str):
        self.cleanup_bucket = self.cleanup_bucket.lower() == 'true'
    self.bucket_cleaned_up: bool = False

    self.endpoint = self.client._endpoint.host
    self.bucket = config['bucket']
    try:
        buckets = self.client.list_buckets()
    except Exception as exc:
        msg: str = "FAILED to list buckets for {0}: {1}"\
            .format(self.endpoint, exc)
        self._report_fatal(msg)

    if self.bucket not in [b['Name'] for b in buckets['Buckets']]:
        try:
            if region_name is not None:
                self.client.create_bucket(
                    Bucket=self.bucket,
                    CreateBucketConfiguration={
                        'LocationConstraint': region_name})
            else:
                self.client.create_bucket(Bucket=self.bucket)
        except Exception as exc:
            msg: str = "FAILED to create bucket {0} for {1}: {2}"\
                .format(self.bucket, self.endpoint, exc)
            self._report_fatal(msg)

    super().__init__(StorageType.storageS3,
                     self.logger,
                     measure_timestamp_diff,
                     compression=compression)