def __init__(
    self,
    database,
    sql_expr=None,
    table=None,
    schema=None,
    geometry=None,
    crs=None,
    api_key=None,
    civis_kwargs=None,
    metadata=None,
):
    """
    Create the Civis Source.

    Parameters
    ----------
    database: str
        The name of the database in the platform.
    sql_expr: str
        The SQL expression to pass to the database backend. Either this
        or table must be given.
    table: str
        The table name to pass to the database backend. Either this or
        sql_expr must be given.
    schema:
        The schema for the table. Stored as given; no default is applied
        in this constructor.
    geometry: str or list of str
        A column or list of columns that should be interpreted as
        geometries. A single string is wrapped in a list.
    crs: str or dict
        A coordinate reference string of the format that GeoPandas can
        understand. Only relevant if geometry columns are given.
    api_key: str
        An optional API key, passed to ``civis.APIClient``.
    civis_kwargs: dict
        Optional kwargs to pass to the civis.io functions. The dict is
        copied, so the caller's object is never modified.
    metadata: dict
        Optional metadata passed through to the parent constructor.

    Raises
    ------
    ValueError
        If both or neither of ``table`` and ``sql_expr`` are provided.
    """
    self._database = database
    self._table = table
    self._dbschema = schema
    self._sql_expr = sql_expr
    self._geom = [geometry] if isinstance(geometry, str) else geometry
    self._crs = crs
    self._client = civis.APIClient(api_key)
    # Work on a private copy: the keys injected below (including the API
    # client) must not leak into the caller's dict or into a default
    # shared by every instance.
    self._civis_kwargs = dict(civis_kwargs) if civis_kwargs else {}
    self._dataframe = None

    if crs and not geometry:
        warnings.warn("A CRS was provided but no geometry columns")

    # Only support reading with pandas
    self._civis_kwargs["use_pandas"] = True
    self._civis_kwargs["client"] = self._client

    # Enforce that exactly one of table or sql_expr are provided
    if bool(table) == bool(sql_expr):
        raise ValueError("Must provide a table OR a sql_expr")

    super(CivisSource, self).__init__(
        metadata={} if metadata is None else metadata
    )
def get_most_recent_run(scriptid, client=None):
    """Return the ID of the latest run of a container script.

    Parameters
    ----------
    scriptid : int
        The ID of the container script.
    client : None or civis.APIClient, optional
        A Civis API client to use. If not given, one will be instantiated.

    Returns
    -------
    runid : int or None
        The ID of the latest run, or None when the script has no runs.
    """
    if not client:
        client = civis.APIClient(resources='all')

    # Fetch every run recorded for the script.
    all_runs = client.scripts.list_containers_runs(scriptid)
    if len(all_runs) == 0:
        return None

    def _timestamp(run):
        # Rank a run by its finish time, falling back to its start time.
        return dateutil.parser.parse(run['finished_at'] or run['started_at'])

    # max() keeps the first run among equal timestamps.
    latest = max(all_runs, key=_timestamp)
    return int(latest['id'])
def magic(line, cell=None):
    """Civis query magic.

    Usable as a line magic (``line`` is ``"<database> <sql>"``; the query
    is run and returned as a DataFrame) or as a cell magic (``line`` is the
    database, ``cell`` is the SQL; a preview of up to 100 rows is returned).

    Returns
    -------
    pandas.DataFrame or None
        The query result, or None when the query produced no rows.
    """
    client = civis.APIClient()

    if cell is not None:
        # Cell magic: preview query.
        database = line.strip()
        future = civis.io.query_civis(
            cell, database, client=client, preview_rows=100)
        preview = future.result()
        if len(preview['result_rows']) > 0:
            return pd.DataFrame.from_records(
                preview['result_rows'], columns=preview['result_columns'])
        return None

    # Line magic: the first space separates the database from the SQL.
    # Not using maxsplit kwarg b/c it is not compatible w/ Python 2
    database, sql = line.split(' ', 1)
    frame = civis.io.read_civis_sql(
        sql, database.strip(), use_pandas=True, client=client)
    return None if len(frame) == 0 else frame
def _print_script_runs(scriptid):
    """Print a table of the runs of a container script.

    On a Civis API error, a message is written to stderr and the process
    exits with status -1.

    Parameters
    ----------
    scriptid : int
        The container script to print the runs for.
    """
    client = civis.APIClient(resources='all')
    row_fmt = "%- 25s %- 25s %- 25s %- 25s %s"

    try:
        # Fetch every run before printing anything.
        all_runs = client.scripts.list_containers_runs(scriptid)

        print(row_fmt % ("run id", "started", "finished", "status",
                         "error msg"))
        for entry in all_runs:
            # Missing / empty fields are shown as a dash.
            columns = (
                entry['id'],
                entry['started_at'] or '-',
                entry['finished_at'] or '-',
                entry['state'],
                entry['error'] or '-',
            )
            print(row_fmt % columns)
    except civis.base.CivisAPIError as e:
        details = (scriptid, e.__module__, e.__class__.__name__, e)
        print("Could not get script %s runs: %s.%s: %s" % details,
              file=sys.stderr)
        sys.exit(-1)
def worker_func(func_file_id):
    """Download a pickled callable, run it, and upload its result.

    The callable and its backend description are fetched from the Civis
    File ``func_file_id``. The return value (or, on failure, a
    ``TransportableException`` describing the error) is pickled, uploaded
    as a Civis File and attached to the current run as a run output.
    Any exception raised by the callable is re-raised after recording.

    Raises
    ------
    RuntimeError
        If ``CIVIS_JOB_ID`` or ``CIVIS_RUN_ID`` is not set.
    """
    # Have the output File expire in 7 days.
    expires_at = (datetime.now() + timedelta(days=7)).isoformat()

    client = civis.APIClient()
    job_id = os.environ.get('CIVIS_JOB_ID')
    run_id = os.environ.get('CIVIS_RUN_ID')
    if not (job_id and run_id):
        raise RuntimeError("This function must be run inside a "
                           "Civis container job.")

    # Run the function.
    result = None
    try:
        func, remote_backend = _robust_pickle_download(
            func_file_id, client=client, n_retries=5, delay=0.5)
        _backend = _setup_remote_backend(remote_backend)

        # graceful nested context managers are ~hard across python
        # versions, this just works...
        if not NO_SKLEARN:
            # joblib keeps the current backend as global state in the
            # package, and sklearn ships its own copy of joblib at
            # `sklearn.externals.joblib`. With two copies in play the
            # requested backend has to be set in both of them.
            with _sklearn_para_backend(_backend):
                with _joblib_para_backend(_backend):
                    result = func()
        else:
            with _joblib_para_backend(_backend):
                result = func()
    except Exception:
        print("Error! Attempting to record exception.")
        # Wrap the exception in joblib's TransportableException
        # so that joblib can properly display the results.
        exc_type, exc_value, exc_tb = sys.exc_info()
        report = format_exc(exc_type, exc_value, exc_tb,
                            context=10, tb_offset=1)
        result = TransportableException(report, exc_type)
        raise
    finally:
        # Serialize the result and upload it to the Files API.
        # If the function exits without erroring, we may not have a result.
        if result is not None:
            payload = BytesIO()
            cloudpickle.dump(result, payload, pickle.HIGHEST_PROTOCOL)
            payload.seek(0)

            output_name = "Results from Joblib job {} / run {}".format(
                job_id, run_id)
            output_file_id = _robust_file_to_civis(
                payload, output_name, n_retries=5, delay=0.5,
                expires_at=expires_at, client=client)
            client.scripts.post_containers_runs_outputs(
                job_id, run_id, 'File', output_file_id)
            print("Results output to file ID: {}".format(output_file_id))
def notebooks_up(notebook_id, mem=None, cpu=None):
    """Start an existing notebook and open it in the browser.

    ``mem`` and ``cpu``, when given, are applied to the notebook (as
    ``memory`` / ``cpu``) before it is started.
    """
    client = civis.APIClient()

    # Only send the resource settings that were actually provided.
    overrides = {}
    if mem is not None:
        overrides['memory'] = mem
    if cpu is not None:
        overrides['cpu'] = cpu

    client.notebooks.patch(notebook_id, **overrides)
    _notebooks_up(notebook_id)
    _notebooks_open(notebook_id)
def jobs_follow_log(id):
    """Follow the log of the most recent run of job ``id``.

    Raises
    ------
    click.ClickException
        If the job has no runs.
    """
    client = civis.APIClient()
    # Ask for a single run, highest run ID first.
    latest_runs = client.jobs.list_runs(
        id, limit=1, order='id', order_dir='desc')
    if not latest_runs:
        raise click.ClickException('No runs found for that job ID.')

    run_id = latest_runs[0].id
    print('Run ID: ' + str(run_id))
    _jobs_follow_run_log(id, run_id)
def _print_scripts(user_ids, state, hidden):
    """Print a table of container scripts, optionally filtered by author.

    Only scripts of type ``containers`` are listed.

    Parameters
    ----------
    user_ids : list of ints
        List of user IDs to get jobs for. An empty list corresponds to
        all scripts visible to the user making the API call.
    state : str
        List scripts only in this state. Use None to get scripts in all
        states.
    hidden : bool
        If True, display hidden scripts in addition to non-hidden ones.
    """
    client = civis.APIClient(resources='all')

    # The API takes the authors as one comma-separated string.
    author_filter = ','.join(str(u) for u in user_ids) if user_ids else None

    # Non-hidden scripts are always fetched; hidden ones only on request.
    visibility_flags = [False, True] if hidden else [False]
    found = []
    for flag in visibility_flags:
        found.extend(client.scripts.list(
            limit=50,
            hidden=flag,
            status=state,
            order='updated_at',
            type='containers',
            author=author_filter))

    # Print them out.
    row_fmt = "%- 20s %- 20s %- 20s %- 25s %- 25s %- 25s %s"
    print(row_fmt % ("id", "author", "status", "created",
                     "started", "finished", "name"))

    for script in found:
        # Re-check the author client-side when a filter was requested.
        if user_ids and script['author']['id'] not in user_ids:
            continue

        last_run = script['last_run']
        if last_run is None:
            started = finished = '-'
        else:
            started = last_run['startedAt'] or '-'
            finished = last_run['finishedAt'] or '-'
        created = script['created_at'] or '-'

        print(row_fmt % (script['id'],
                         script['author']['username'],
                         script['state'],
                         created,
                         started,
                         finished,
                         script['name']))
def notebooks_down(notebook_id):
    """Shut down a running notebook.

    The notebook's most recent deployment is deleted when it is in the
    ``running`` or ``pending`` state. In any other state a message is
    printed and nothing is deleted.
    """
    client = civis.APIClient()
    nb = client.notebooks.get(notebook_id)
    deployment = nb['most_recent_deployment']
    state = deployment['state']
    if state not in ['running', 'pending']:
        print('Notebook is in state "{}" and can\'t be stopped.'.format(state))
        # Nothing to stop: do not go on to delete the deployment.
        return

    client.notebooks.delete_deployments(notebook_id,
                                        deployment['deploymentId'])
def unshare_service(args):
    """Delete the share token named ``args.name`` from service ``args.id``.

    Prints a confirmation on success, or a notice when no token with that
    name exists.
    """
    client = civis.APIClient()
    all_tokens = client.services.list_tokens(args.id)
    # Lazily filter down to the tokens carrying the requested name.
    matching = (tok for tok in all_tokens if tok["name"] == args.name)
    try:
        found = next(matching)
        client.services.delete_tokens(args.id, found["id"])
        print(f"Successfully unshared {args.name}")
    except StopIteration:
        print(f"Could not find share token with the name {args.name}")
def _robust_file_to_civis(buf, name, client=None, n_retries=5,
                          delay=0.0, **kwargs):
    """Upload the contents of an input file-like buffer

    Call :func:`~civis.io.file_to_civis`, and retry a specified number of
    times before giving up. Civis files created for failed uploads are
    abandoned. Those files may be partially filled; a new file is created
    on each attempt so that the contents are exactly as requested.

    .. note:: This function starts by calling ``.seek(0)`` on the
              buffer, and will do so before every retry.

    Parameters
    ----------
    buf : File
        File-like bytes object to send to a Civis File
    name : str
        Name of the new Civis File
    client : civis.APIClient, optional
    n_retries : int, optional
        Retry the upload this many times before raising an error.
    delay : float, optional
        If provided, wait this many seconds between retries.
    kwargs :
        Extra keyword arguments will be passed to ``io.file_to_civis``

    Returns
    -------
    int
        ID of the new Civis File

    See Also
    --------
    civis.io.file_to_civis
    """
    client = client or civis.APIClient()
    retryable = (requests.HTTPError,
                 requests.ConnectionError,
                 requests.ConnectTimeout)

    failures = 0
    while True:
        # Rewind so every attempt uploads the buffer from the start.
        buf.seek(0)
        try:
            return civis.io.file_to_civis(buf, name=name,
                                          client=client, **kwargs)
        except retryable as exc:
            if failures >= n_retries:
                # Out of retries: surface the last error.
                raise
            failures += 1
            log.debug("Upload failure %s due to %s; retrying.",
                      failures, str(exc))
            time.sleep(delay)
def _print_script_logs(scriptid, runid=None):
    """Print a script's details followed by the log of one of its runs.

    On a Civis API error, a message is written to stderr and the process
    exits with status -1.

    Parameters
    ----------
    scriptid : int
        The script ID to print out.
    runid : int or None, optional
        An optional run ID. If None, then the most recent run will be
        used.
    """
    client = civis.APIClient(resources='all')

    try:
        # Resolve which run to show.
        runid = runid or get_most_recent_run(scriptid, client=client)

        # Fetch the script details and flatten nested responses into
        # pure dicts so they can be dumped to yaml.
        raw = client.scripts.get_containers(scriptid)
        mapping_types = (civis.response.Response, dict)
        deets = {}
        for key, value in raw.items():
            deets[key] = (dict(value) if isinstance(value, mapping_types)
                          else value)

        # Put the name and id at the top so they are easy to see
        print('name:', deets['name'])
        print('id:', deets['id'])
        del deets['name']
        del deets['id']

        # Make the docker command more grep-able.
        print('docker_command:')
        for cmd_line in deets['docker_command'].split('\n'):
            print(' ' + cmd_line.rstrip())
        del deets['docker_command']

        # now dump the rest.
        print(yaml.dump(deets, default_flow_style=False).strip())

        if runid is None:
            print('log file: -')
        else:
            # Get and print the logs for this run, in reverse of the
            # order the API returned them.
            entries = client.scripts.list_containers_runs_logs(
                scriptid, runid)
            print("log file:")
            for entry in entries[::-1]:
                print(" [%s] %s" % (entry['created_at'], entry['message']))
    except civis.base.CivisAPIError as e:
        details = (scriptid, e.__module__, e.__class__.__name__, e)
        print("Could not print script %s logs: %s.%s: %s" % details,
              file=sys.stderr)
        sys.exit(-1)
def get_pword(login):
    """Return the stored ``pword`` value for ``login`` (case-insensitive).

    Reads ``users.creds`` in the ``HRC`` database using the API key from
    the ``CIVIS_API_KEY`` environment variable.

    Raises
    ------
    IndexError
        If ``login`` is the empty string (also raised by the final
        indexing when the result holds no data row).
    """
    if login == '':
        raise IndexError("login must not be empty")

    client = civis.APIClient(api_key=os.environ['CIVIS_API_KEY'])

    # The query is sent as a raw SQL string, so the value has to be
    # escaped by hand before it is embedded in the literal: backslashes
    # first, then single quotes.
    # NOTE(review): backslash doubling assumes the database treats
    # backslash as an escape character in string literals -- confirm.
    safe_login = login.lower().replace('\\', '\\\\').replace("'", "''")

    rows = civis.io.read_civis_sql(
        sql=f"SELECT pword FROM users.creds WHERE LOWER(login) = '{safe_login}'",
        database='HRC',
        client=client,
        hidden=True)
    # Row 0 is skipped; the value is the first column of row 1.
    return rows[1][0]
def notebooks_new_cmd(language='python3', mem=None, cpu=None):
    """Create a new notebook and open it in the browser.

    ``mem`` and ``cpu``, when given, are sent as the notebook's
    ``memory`` / ``cpu`` settings.
    """
    client = civis.APIClient()

    # Only send the resource settings that were actually provided.
    resources = {}
    if mem is not None:
        resources['memory'] = mem
    if cpu is not None:
        resources['cpu'] = cpu

    created = client.notebooks.post(language=language, **resources)
    message = ("Created new {language} notebook with ID {id} in Civis Platform"
               " (https://platform.civisanalytics.com/#/notebooks/{id}).")
    print(message.format(language=language, id=created.id))

    _notebooks_up(created.id)
    _notebooks_open(created.id)
def notebooks_download_cmd(notebook_id, path):
    """Download a notebook to a specified local path.

    The notebook is fetched from its ``notebook_url`` and streamed to
    ``path`` in 32 KiB chunks. An HTTP error status raises via
    ``raise_for_status``.
    """
    client = civis.APIClient()
    notebook = client.notebooks.get(notebook_id)

    resp = requests.get(notebook['notebook_url'], stream=True)
    resp.raise_for_status()

    # Set up the chunk iterator before the output file is created.
    pieces = resp.iter_content(32 * 1024)
    with open(path, 'wb') as out:
        for piece in pieces:
            out.write(piece)
def __init__(self, *args, civis_api_key=None,
             civis_api_key_env_var="CIVIS_API_KEY", database=None,
             schema=None, existing_table_rows="append",
             include_columns=None, dummy_run=False, block=False,
             max_errors=0, table=None, via_staging_table=False,
             columns=None, staging_table=None, remap=None,
             recorded_tables=TimedDict(timeout=30), **kwargs):
    """Configure the Civis sender and start its future-monitoring thread.

    Parameters
    ----------
    civis_api_key : str, optional
        API key. When falsy, it is read from the environment variable
        named by ``civis_api_key_env_var`` (``KeyError`` if unset).
    civis_api_key_env_var : str
        Name of the environment variable holding the API key.
    max_errors
        Stored after conversion with ``int()``.
    via_staging_table : bool
        When true, a staging table name is generated from ``table`` and a
        random suffix; otherwise ``staging_table`` is used as given.
    recorded_tables
        NOTE(review): the default object is created once, at definition
        time, so it is shared by every instance that does not pass its
        own -- presumably intentional; confirm.
    **kwargs
        Forwarded to the parent constructor. ``*args`` is accepted but
        not forwarded.

    The remaining parameters are stored unchanged on the instance.

    Raises
    ------
    Exception
        If no non-empty API key could be resolved.
    """
    self.civis_api_key = civis_api_key or os.environ[civis_api_key_env_var]
    # Fail fast on a missing/empty key. The previous
    # `is None and len(...) == 0` test could never report this.
    if not self.civis_api_key:
        raise Exception("Could not get a Civis API key.")

    self.include_columns = include_columns
    self.table = table
    self.dummy_run = dummy_run
    self.schema = schema
    self.max_errors = int(max_errors)
    self.existing_table_rows = existing_table_rows
    self.database = database
    self.via_staging_table = via_staging_table
    self.block = block
    self.remap = remap
    # Use the resolved key, so a key passed explicitly or read from a
    # custom environment variable is the one the client uses.
    self.api_client = civis.APIClient(api_key=self.civis_api_key)
    self.recorded_tables = recorded_tables
    self.columns = columns
    super(SendToCivis, self).__init__(**kwargs)

    if self.via_staging_table:
        # Random suffix keeps concurrent staging tables from colliding.
        suffix = hashlib.md5(
            bytes(str(random.random()), "ascii")
        ).hexdigest()[:HASH_SUFFIX_LENGTH]
        self.staging_table = "_".join([table, "staging", suffix])
        self.log_info("staging table for: " + self.name + " " +
                      str(self.staging_table))
    else:
        self.staging_table = staging_table

    self.monitor_futures_thread = threading.Thread(
        target=SendToCivis.monitor_futures, args=(self, ), daemon=True)
    self.monitor_futures_thread.start()
def post_json_run_output(json_value_dict):
    """Attach ``json_value_dict`` to the current run as a JSONValue output.

    The dict is serialized to JSON, stored as a JSON value named
    ``email_outputs``, and registered as an output of the python3 script
    run identified by ``CIVIS_JOB_ID`` / ``CIVIS_RUN_ID``.
    """
    client = civis.APIClient()
    serialized = json.dumps(json_value_dict)
    stored = client.json_values.post(serialized, name='email_outputs')

    job_id = os.environ['CIVIS_JOB_ID']
    run_id = os.environ['CIVIS_RUN_ID']
    client.scripts.post_python3_runs_outputs(
        job_id, run_id, 'JSONValue', stored.id)
def share_service(args):
    """Create a share token for service ``args.id`` and print its URL.

    If the token name is already in use, a notice is printed instead.
    Any other Civis API error is re-raised.
    """
    client = civis.APIClient()
    service = client.services.get(args.id)
    try:
        token_info = client.services.post_tokens(args.id, args.name)
        url = f"{service['current_url']}/civis-platform-auth?token={token_info['token']}"
        print(f"Share service id {args.id} with the following URL: {url}")
    except civis.base.CivisAPIError as e:
        # Only the duplicate-name failure is handled here.
        if "Name has already been taken" not in str(e):
            raise e
        print(f"The share name {args.name} is already in use. "
              "Please choose another")
def _get(scriptid, runid=None, path=None):
    """Download job data for a script.

    During jobs, civis-compute automatically saves any outputs written
    to the directory given by the environment variable
    `${CIVIS_JOB_DATA}`. The `get` command downloads these outputs if
    they exist.

    The outputs are stored as a run output with the name

        civis_job_data_${CIVIS_JOB_ID}_${CIVIS_RUN_ID}

    Only the first matching output is downloaded. On a Civis API error, a
    message is written to stderr and the process exits with status -1.

    Parameters
    ----------
    scriptid : int
        The container script ID.
    runid : int or None, optional
        The run ID to get outputs for. If None, the most recent run is
        used.
    path : str or None
        The path to download the data to. If None, the current working
        directory is used.
    """
    client = civis.APIClient(resources='all')

    try:
        runid = runid or get_most_recent_run(scriptid, client=client)
        run_outputs = client.scripts.list_containers_runs_outputs(
            scriptid, runid)

        # First File output whose name marks it as job data, if any.
        job_data = next(
            (out for out in run_outputs
             if out['object_type'] == 'File'
             and 'civis_job_data_' in out['name']),
            None)

        if job_data is not None:
            target = (os.path.join(path, job_data['name']) if path
                      else job_data['name'])
            print(target)
            with open(target, 'wb') as fp:
                civis.io.civis_to_file(job_data['object_id'], fp)
    except civis.base.CivisAPIError as e:
        details = (scriptid, e.__module__, e.__class__.__name__, e)
        print("Could not get script %s outputs: %s.%s: %s" % details,
              file=sys.stderr)
        sys.exit(-1)
def test_get_table_id(schema_tablename):
    """Check that get_table_id handles quoted schema.tablename correctly."""
    client = civis.APIClient(local_api_spec=TEST_SPEC, api_key='none')
    client.get_database_id = mock.Mock(return_value=123)

    # Fake table listing: only index 0 can be looked up.
    single_entry = {0: mock.Mock()}
    listing = mock.MagicMock()
    listing.__getitem__.side_effect = single_entry.__getitem__
    client.tables.list = mock.Mock(return_value=listing)

    client.get_table_id(table=schema_tablename, database=123)

    # The name must have been split into schema 'foo' and table 'bar'.
    client.tables.list.assert_called_once_with(
        database_id=123, schema='foo', name='bar')
def _robust_pickle_download(output_file_id, client=None,
                            n_retries=5, delay=0.0):
    """Download and deserialize the result from output_file_id

    Retry network errors `n_retries` times with `delay` seconds between
    calls. Only the download is retried; errors raised while unpickling
    propagate immediately.

    Parameters
    ----------
    output_file_id : int
        ID of the file to download
    client : civis.APIClient, optional
    n_retries : int, optional
        Retry the download this many times before raising an error.
    delay : float, optional
        If provided, wait this many seconds between retries.

    Returns
    -------
    obj
        Any Python object; the result of calling ``cloudpickle.load``
        on the downloaded file

    See Also
    --------
    cloudpickle.load
    """
    client = client or civis.APIClient()
    retryable = (requests.HTTPError,
                 requests.ConnectionError,
                 requests.ConnectTimeout)

    failures = 0
    while True:
        # Each attempt downloads into a fresh buffer.
        buffer = BytesIO()
        try:
            civis.io.civis_to_file(output_file_id, buffer, client=client)
        except retryable as exc:
            buffer.close()
            if failures >= n_retries:
                # Out of retries: surface the last error.
                raise
            failures += 1
            log.debug("Download failure %s due to %s; retrying.",
                      failures, str(exc))
            time.sleep(delay)
            continue

        buffer.seek(0)
        # NOTE(review): unpickling executes arbitrary code; the file
        # behind output_file_id must come from a trusted source.
        return cloudpickle.load(buffer)
def worker_func(func_file_id):
    """Download a joblib-pickled callable, run it, and upload its result.

    The callable is loaded from the Civis File ``func_file_id``. Its
    return value (or, on failure, a ``TransportableException`` describing
    the error) is dumped with joblib, uploaded as a Civis File that
    expires in 7 days, and attached to the current run as a run output.
    Any exception raised by the callable is re-raised after recording.

    Raises
    ------
    RuntimeError
        If ``CIVIS_JOB_ID`` or ``CIVIS_RUN_ID`` is not set.
    """
    # Have the output File expire in 7 days.
    expires_at = (datetime.now() + timedelta(days=7)).isoformat()

    client = civis.APIClient()
    job_id = os.environ.get('CIVIS_JOB_ID')
    run_id = os.environ.get('CIVIS_RUN_ID')
    if not job_id or not run_id:
        raise RuntimeError("This function must be run inside a "
                           "Civis container job.")

    # NOTE(review): joblib.load unpickles, which executes arbitrary code;
    # func_file_id must refer to a trusted file.
    func_buffer = BytesIO()
    civis.io.civis_to_file(func_file_id, func_buffer)
    func_buffer.seek(0)
    func = joblib.load(func_buffer)

    # Run the function.
    result = None
    try:
        result = func()
    except Exception:
        print("Error! Attempting to record exception.")
        # Wrap the exception in joblib's TransportableException
        # so that joblib can properly display the results.
        e_type, e_value, e_tb = sys.exc_info()
        text = format_exc(e_type, e_value, e_tb, context=10, tb_offset=1)
        result = TransportableException(text, e_type)
        raise
    finally:
        # Serialize the result and upload it to the Files API.
        # Note that if compress is 0, joblib will output multiple files.
        # compress=3 is a good compromise between space and read/write times
        # (https://github.com/joblib/joblib/blob/18f9b4ce95e8788cc0e9b5106fc22573d768c44b/joblib/numpy_pickle.py#L358).
        if result is not None:
            # If the function exits without erroring, we may not have a
            # result.
            result_buffer = BytesIO()
            joblib.dump(result, result_buffer, compress=3)
            result_buffer.seek(0)
            output_name = "Results from Joblib job {} / run {}".format(
                job_id, run_id)
            output_file_id = civis.io.file_to_civis(result_buffer,
                                                    output_name,
                                                    expires_at=expires_at)
            client.scripts.post_containers_runs_outputs(
                job_id, run_id, 'File', output_file_id)
            # Report the file ID. Passing output_name first used to fill
            # the single placeholder with the name instead.
            print("Results output to file ID: {}".format(output_file_id))
def _init_civis_backend(self):
    """init the Civis API client and the executors

    Sets ``using_template``, makes sure ``client`` is populated, and
    builds ``executor``: a ``CustomScriptExecutor`` when a template ID is
    set, otherwise a ``_ContainerShellExecutor``.

    Raises
    ------
    ValueError
        If ``max_submit_retries`` is negative.
    """
    self.using_template = self.from_template_id is not None

    if self.max_submit_retries < 0:
        raise ValueError(
            "max_submit_retries cannot be negative (value = %d)" %
            self.max_submit_retries)

    # Fall back to a default client when none was supplied.
    self.client = self.client or civis.APIClient()

    if not self.from_template_id:
        self.executor = _ContainerShellExecutor(client=self.client,
                                                **self.executor_kwargs)
        return

    self.executor = CustomScriptExecutor(self.from_template_id,
                                         client=self.client,
                                         **self.executor_kwargs)
def create_user(login, pword):
    """Insert a new ``(login, hashed password)`` row into ``users.creds``.

    The login is first looked up case-insensitively. The row is inserted
    only when that lookup raises ``civis.base.EmptyResultError``.

    Returns
    -------
    The value returned by ``civis.io.query_civis`` for the INSERT.

    Raises
    ------
    ValueError
        If a row for ``login`` already exists.
    """
    client = civis.APIClient(api_key=os.environ['CIVIS_API_KEY'])

    def _literal(value):
        # Render a value as a quoted SQL string literal. The queries are
        # sent as raw SQL strings, so escaping is done by hand:
        # backslashes first, then single quotes.
        # NOTE(review): backslash doubling assumes the database treats
        # backslash as an escape character in literals -- confirm.
        text = f"{value}".replace('\\', '\\\\').replace("'", "''")
        return f"'{text}'"

    try:
        civis.io.read_civis_sql(
            sql=("SELECT login FROM users.creds "
                 f"WHERE LOWER(login) = {_literal(login.lower())}"),
            database='HRC',
            client=client,
            hidden=True)[1][0]
    except civis.base.EmptyResultError:
        # No such login yet. The values were previously interpolated
        # without quotes, which is not valid SQL for string values.
        return civis.io.query_civis(
            sql=("INSERT INTO users.creds "
                 f"VALUES ({_literal(login)}, {_literal(hash_pword(pword))})"),
            database='HRC',
            client=client,
            hidden=True)

    raise ValueError("login already exists")
def __init__(self, future, callback):
    """Wrap ``future`` and register a callback that fetches its result.

    The API client is taken from ``future.client`` when the future has
    one; otherwise a new ``civis.APIClient`` is created.
    """
    self._future = future
    self._callback = callback
    self.result = None

    try:
        self._client = future.client
    except AttributeError:
        self._client = civis.APIClient()

    # `get` reads results from these attributes on the future.
    self._future.remote_func_output = None
    self._future.result_fetched = False  # Did we get the result?

    # Download results and trigger the next job as a callback
    # so that we don't have to wait for `get` to be called.
    # Note that the callback of a `concurrent.futures.Future`
    # (which self._future is a subclass of) is called with a
    # single argument, the Future itself.
    on_done = self._make_fetch_callback(self._callback, self._client)
    self._future.add_done_callback(on_done)
def run_container(dropbox_path):
    """Create and run a container script that processes ``dropbox_path``.

    A container script is created from the chicago-police-data repository,
    started, and waited on. The final run state is logged at INFO level
    on success, or at WARNING level when the job fails.
    """
    client = civis.APIClient()

    # One shell command per line. Written on a single line, the three
    # commands would be passed to `cd` as extra arguments and never run.
    docker_command = '\n'.join([
        'cd app',
        'pip install -r requirements.txt',
        f'python -m get_data.run --path_to_execute {dropbox_path}',
    ])

    script_id = client.scripts.post_containers(
        name=f'Invisible Institute Data Run {dropbox_path}',
        docker_image_name='civisanalytics/datascience-python',
        docker_image_tag='5.0.0',
        required_resources={
            'cpu': 256,
            'memory': 4096,
            'disk_space': 5,
        },
        repo_http_uri='https://github.com/invinst/chicago-police-data.git',
        repo_ref='master',
        docker_command=docker_command,
        params=[{
            'allowed_values': [],
            'default': None,
            'description': None,
            'label': 'Dropbox Credential',
            'name': 'DROPBOX_OAUTH',
            'required': True,
            'type': 'credential_custom',
            'value': None
        }],
        # NOTE(review): 6644 looks like a hard-coded credential ID --
        # confirm it is valid wherever this runs.
        arguments={'DROPBOX_OAUTH': 6644})['id']

    run_id = client.scripts.post_containers_runs(script_id)['id']
    LOG.info(f'Analyzing {dropbox_path} in container script {script_id}'
             f' at run {run_id}')

    # Block until the run reaches a final state.
    future = civis.futures.CivisFuture(client.scripts.get_containers_runs,
                                       (script_id, run_id))
    try:
        result = future.result()
        state = result['state']
        LOG.info(f'Script {script_id} run {run_id} {state}')
    except CivisJobFailure:
        # The future raised; fetch the run directly to report its state.
        result = client.scripts.get_containers_runs(script_id, run_id)
        state = result['state']
        LOG.warning(f'Error: script {script_id} run {run_id} {state}')
def open_postgres_catalog(api_key=None):
    """
    Top-level function to create a PostgreSQL CivisCatalog object.

    The first remote host whose type equals ``POSTGRES_KIND`` is used.

    Parameters
    ==========
    api_key: Optional[str]
        An API key, passed to ``civis.APIClient`` and to the catalog.

    Returns
    =======
    A CivisCatalog targeting PostgreSQL.

    Raises
    ======
    RuntimeError
        If no remote host of the PostgreSQL kind is found.
    """
    client = civis.APIClient(api_key)
    for host in client.remote_hosts.list():
        if host["type"] == POSTGRES_KIND:
            return CivisCatalog(host["name"], api_key=api_key)
    raise RuntimeError("Unable to find PostgreSQL database")
def _cancel(scriptids):
    """Cancel running scripts.

    Every ID is attempted even if an earlier one fails. Failures are
    reported on stderr, and the process exits with status -1 afterwards
    if any cancellation failed.

    Parameters
    ----------
    scriptids : tuple of ints
        The IDs of the scripts to cancel.
    """
    client = civis.APIClient(resources='all')

    any_failed = False
    for sid in scriptids:
        try:
            client.scripts.post_cancel(sid)
        except civis.base.CivisAPIError as e:
            any_failed = True
            details = (sid, e.__module__, e.__class__.__name__, e)
            print("Could not cancel script %s: %s.%s: %s" % details,
                  file=sys.stderr)

    if any_failed:
        sys.exit(-1)
def test_get_storage_host_id():
    """Check get_storage_host_id for a name, an integer ID and a bad name."""
    client = civis.APIClient(local_api_spec=TEST_SPEC, api_key='none')

    class StorageHost:
        """Minimal stand-in readable by attribute or by key."""

        def __init__(self, id, name):
            self.id = id
            self.name = name

        def __getitem__(self, key):
            return getattr(self, key)

    fake_hosts = [
        StorageHost(1234, 'test'),
        StorageHost(5678, 'othertest'),
    ]
    client.storage_hosts.list = mock.Mock(return_value=fake_hosts)

    # A name is resolved through one listing call.
    assert client.get_storage_host_id('test') == 1234
    client.storage_hosts.list.assert_called_once_with()

    # An integer is returned unchanged.
    assert client.get_storage_host_id(4732) == 4732

    # An unknown name is an error.
    with pytest.raises(ValueError, match="Storage Host invalidname not found"):
        client.get_storage_host_id('invalidname')
def __init__(
    self,
    database,
    schema="public",
    api_key=None,
    civis_kwargs=None,
    has_geometry_column_table=None,
    **kwargs,
):
    """
    Construct the Civis Schema.

    Parameters
    ----------
    database: str
        The name of the database.
    schema: str
        The schema to list (defaults to "public").
    api_key: str
        An optional API key, passed to ``civis.APIClient``.
    civis_kwargs: dict
        Optional kwargs to pass to the sources. The dict is copied, so
        neither the caller's object nor a shared default is reused.
    has_geometry_column_table: bool
        Whether the database has a "geometry_columns" table, which can
        be used to query for SRID information for a given column. When
        None, it is assumed to exist unless the database name contains
        "redshift" (case-insensitive).
    **kwargs
        Forwarded to the parent constructor. ``ttl`` is set to 100 when
        missing or falsy.
    """
    # Private copy: this mapping must not be an object shared with the
    # caller or with other instances.
    self._civis_kwargs = dict(civis_kwargs) if civis_kwargs else {}
    self._database = database
    self._api_key = api_key
    self._client = civis.APIClient(api_key)
    if has_geometry_column_table is not None:
        self._has_geom = has_geometry_column_table
    else:
        self._has_geom = "redshift" not in self._database.lower()
    self._dbschema = schema  # Don't shadow self._schema upstream
    # Bump TTL so as not to load too often.
    kwargs["ttl"] = kwargs.get("ttl") or 100
    super(CivisSchema, self).__init__(**kwargs)