def __init__(self, *args, **kwargs):
    conn = self.get_connection(kwargs['qubole_conn_id'])
    Qubole.configure(api_token=conn.password, api_url=conn.host)
    self.task_id = kwargs['task_id']
    self.dag_id = kwargs['dag'].dag_id
    self.kwargs = kwargs
    self.cls = COMMAND_CLASSES[self.kwargs['command_type']]
    self.cmd = None
def _configure_qubole(self):
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('qds_connection')
    logger.propagate = False
    qdslog = logging.getLogger('qds')
    if not self.config.API_TOKEN:
        raise Exception("You didn't specify your QUBOLE_API_TOKEN in your "
                        "environment before running commands on Qubole!\n"
                        "It can be found at http://api.qubole.com/users/edit")
    Qubole.configure(api_token=self.config.API_TOKEN,
                     api_url=self.config.API_URL,
                     version=self.config.API_VERSION,
                     poll_interval=self.config.POLL_INTERVAL_SEC)
    return qdslog
def update(cls, cluster_id_label, cluster_info):
    """
    Update the cluster with id/label `cluster_id_label` using information
    provided in `cluster_info`.
    """
    conn = Qubole.agent(version="v2")
    return conn.put(cls.element_path(cluster_id_label), data=cluster_info)
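# A minimal usage sketch for update() above. The import paths mirror the
# qds-sdk-py layout, '<api-token>' and the label 'my-cluster' are
# placeholders, and the payload keys are illustrative assumptions rather
# than the full v2 schema.
from qds_sdk.qubole import Qubole
from qds_sdk.cluster import Cluster

Qubole.configure(api_token='<api-token>')
# Raise the autoscaling ceiling of an existing cluster (hypothetical payload).
print(Cluster.update('my-cluster', {'cluster_info': {'max_nodes': 10}}))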
def index(cls):
    """
    Shows a list of all available reports by issuing a GET request to the
    /reports endpoint.
    """
    conn = Qubole.agent()
    return conn.get(cls.rest_entity_path)
def poke(self, context):
    conn = BaseHook.get_connection(self.qubole_conn_id)
    Qubole.configure(api_token=conn.password, api_url=conn.host)
    self.log.info('Poking: %s', self.data)
    status = False
    try:
        status = self.sensor_class.check(self.data)
    except Exception as e:
        logging.exception(e)
        status = False
    self.log.info('Status of this Poke: %s', status)
    return status
def find_by_name(name):
    conn = Qubole.agent()
    if name is not None:
        schedjson = conn.get(Scheduler.rest_entity_path, params={"name": name})
        if schedjson["schedules"]:
            return Scheduler(schedjson["schedules"][0])
    return None
def terminate(cls, cluster_id_label):
    """
    Terminate the cluster with id/label `cluster_id_label`.
    """
    conn = Qubole.agent()
    data = {"state": "terminate"}
    return conn.put(cls.element_path(cluster_id_label) + "/state", data)
def clone(cls, cluster_id_label, cluster_info):
    """
    Clone the cluster with id/label `cluster_id_label` using information
    provided in `cluster_info`.
    """
    conn = Qubole.agent()
    return conn.post(cls.element_path(cluster_id_label) + '/clone', data=cluster_info)
def add_node(cls, cluster_id_label, parameters=None):
    """
    Add a node to an existing cluster
    """
    conn = Qubole.agent()
    parameters = {} if not parameters else parameters
    return conn.post(cls.element_path(cluster_id_label) + "/nodes",
                     data={"parameters": parameters})
def start(cls, cluster_id_label):
    """
    Start the cluster with id/label `cluster_id_label`.
    """
    conn = Qubole.agent()
    data = {"state": "start"}
    return conn.put(cls.element_path(cluster_id_label) + "/state", data)
def get_results(self, fp=sys.stdout, inline=True, delim=None):
    """
    Fetches the result for the command represented by this object

    Args:
        `fp`: a file object to write the results to directly
    """
    result_path = self.meta_data['results_resource']
    conn = Qubole.agent()
    r = conn.get(result_path, {'inline': inline})
    if r.get('inline'):
        if sys.version_info < (3, 0, 0):
            fp.write(r['results'].encode('utf8'))
        else:
            import io
            if isinstance(fp, io.TextIOBase):
                fp.buffer.write(r['results'].encode('utf8'))
            elif isinstance(fp, (io.BufferedIOBase, io.RawIOBase)):
                fp.write(r['results'].encode('utf8'))
            else:
                # Can this happen? Don't know what's the right thing to do in this case.
                pass
    else:
        acc = Account.find()
        boto_conn = boto.connect_s3(aws_access_key_id=acc.storage_access_key,
                                    aws_secret_access_key=acc.storage_secret_key)
        log.info("Starting download from result locations: [%s]" % ",".join(r['result_location']))
        # fetch latest value of num_result_dir
        num_result_dir = Command.find(self.id).num_result_dir
        for s3_path in r['result_location']:
            # In Python 3, in this case, `fp` should always be in binary mode.
            _download_to_local(boto_conn, s3_path, fp, num_result_dir, delim=delim)
def list(cls, label=None, cluster_id=None, state=None):
    """
    List existing clusters present in your account.

    Kwargs:
        `state`: list only those clusters which are in this state

    Returns:
        List of clusters satisfying the given criteria
    """
    if cluster_id is not None:
        return cls.show(cluster_id)
    if label is not None:
        return cls.show(label)
    conn = Qubole.agent(version="v2")
    cluster_list = conn.get(cls.rest_entity_path)
    if state is None:
        # no state filter given: return the complete list already fetched
        return cluster_list
    # filter clusters based on state
    result = []
    if 'clusters' in cluster_list:
        for cluster in cluster_list['clusters']:
            if state.lower() == cluster['state'].lower():
                result.append(cluster)
    return result
def find(cls, name="default", **kwargs): if (name is None) or (name == "default"): conn = Qubole.agent() return cls(conn.get(cls.rest_entity_path)) else: raise ParseError("Bad name %s" % name, "Hadoop Clusters can only be named 'default' currently")
def __init__(self, access=None, secret=None, testmode=False, db_parallelism=None,
             mode=None, db_table=None, db_where=None, db_columns=None,
             db_boundary_query=None, db_extract_query=None, db_split_column=None,
             hive_table=None, part_spec=None, db_user=None, db_passwd=None,
             db_host=None, db_port=None, db_type=None, db_name=None,
             api_token=None, api_url=None, fetch_size=None):
    self.temp_location = "/tmp/sqoop/" + uuid.uuid1().hex
    self.tmp_dir = tempfile.mkdtemp(prefix="/media/ephemeral0/logs" + "/sqoop")
    logger.info("Temp Directory is: " + self.tmp_dir)
    self.access = access
    self.secret = secret
    self.api_token = api_token
    self.api_url = api_url
    self.fetch_size = fetch_size
    self.redshift_sink = False
    self.__loadImportParamsFromCid(testmode, db_parallelism, mode, db_table,
                                   db_where, db_columns, db_boundary_query,
                                   db_extract_query, db_split_column, hive_table,
                                   part_spec, db_user, db_passwd, db_host,
                                   db_port, db_type, db_name)
    self.sqoop_cmd = ["/usr/lib/sqoop-h2/bin/sqoop"]
    self.sqoop_cmd.extend(["import"])
    self.__addBasicOptions()
    self.__extendCmdSpecificOptions()
    Qubole.configure(api_token=api_token, api_url=api_url)
    self.cluster_label = Cluster.show(
        os.popen("cat /usr/lib/hustler/bin/nodeinfo_src.sh | grep cluster_id")
        .read().split("=")[1].strip().replace('"', ''))['cluster']['label'][0]
def update_node(cls, cluster_id_label, command, private_dns, parameters=None):
    """
    Update a node in an existing cluster
    """
    conn = Qubole.agent()
    parameters = {} if not parameters else parameters
    data = {"command": command, "private_dns": private_dns, "parameters": parameters}
    return conn.put(cls.element_path(cluster_id_label) + "/nodes", data)
def check(cls, data):
    """
    Method to call the sensors api with json payload

    :param data: valid json object
    :return: True or False
    """
    conn = Qubole.agent()
    return conn.post(cls.rest_entity_path, data=data)['status']
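# A hedged usage sketch for check() above. The FileSensor import path and
# the payload shape follow the qds-sdk sensor API as I understand it; treat
# both as assumptions and verify against your SDK version.
from qds_sdk.qubole import Qubole
from qds_sdk.sensors import FileSensor

Qubole.configure(api_token='<api-token>')
payload = {'files': ['s3://my-bucket/path/to/marker-file']}  # placeholder path
print(FileSensor.check(payload))  # True/False from the response's 'status' field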
def create_update_clone_parser(subparser, action=None):
    # cloud config parser
    cloud = Qubole.get_cloud()
    cloud.create_parser(subparser)

    # cluster info parser
    ClusterInfoV2.cluster_info_parser(subparser, action)

    # engine config parser
    Engine.engine_parser(subparser)
def snapshot(cls, cluster_id_label, s3_location, backup_type):
    """
    Create hbase snapshot full/incremental
    """
    conn = Qubole.agent()
    parameters = {}
    parameters['s3_location'] = s3_location
    if backup_type:
        parameters['backup_type'] = backup_type
    return conn.post(cls.element_path(cluster_id_label) + "/snapshots", data=parameters)
def get_log_id(cls, id):
    """
    Fetches log for the command represented by this id

    Args:
        `id`: command id
    """
    conn = Qubole.agent()
    r = conn.get_raw(cls.element_path(id) + "/logs")
    return r.text
def cancel_id(cls, id):
    """
    Cancels command denoted by this id

    Args:
        `id`: command id
    """
    conn = Qubole.agent()
    data = {"status": "kill"}
    return conn.put(cls.element_path(id), data)
def execute(self):
    logger.info("Running DbImportCommand " + str(self.sqoop_cmd))
    if self.api_url is None:
        Qubole.configure(api_token=self.api_token)
    else:
        Qubole.configure(api_token=self.api_token, api_url=self.api_url)
    p = Popen(self.sqoop_cmd, cwd=self.tmp_dir)
    retCode = p.wait()
    a = os.popen("grep s3_default_db_location /usr/lib/hustler/bin/nodeinfo_src.sh").read()
    print(self.temp_location)
    print(self.get_s3_loc())
    p = Popen(["hadoop", "dfs", "-cp", self.temp_location,
               self.get_s3_loc() + self.temp_location])
    retCode1 = p.wait()
    if retCode != 0 or retCode1 != 0:
        logger.warning("sqoop retCode = " + str(retCode))
        self.__runCleanupScript()
        self.__runDfsCleanup()
        return retCode or retCode1
    else:
        logger.debug("sqoop retCode = " + str(retCode))
    retCode = 1
    if self.cmd_row['test_mode']:
        logger.debug("Not running hive in test mode.")
        retCode = 0
    else:
        logger.info("Running hive script.")
        self.fixHiveQuery()
        q = open(self.tmp_dir + "/hive_query.q").read()
        logger.info("Query is: " + q)
        cmd = HiveCommand.create(query=q, label=self.cluster_label)
        while not Command.is_done(cmd.status):
            time.sleep(5)
            cmd = Command.find(cmd.id)
            logger.info("Hive command id: " + str(cmd.id) + " status: " + str(cmd.status))
        logger.info(cmd.status)
        if cmd.status == "done":
            retCode = 0
    if retCode != 0:
        self.__runCleanupScript()
    self.__runDfsCleanup()
    return retCode
def createTemplate(data):
    """
    Create a new template.

    Args:
        `data`: json data required for creating a template

    Returns:
        Dictionary containing the details of the template with its ID.
    """
    conn = Qubole.agent()
    return conn.post(Template.rest_entity_path, data)
def get_log(self):
    """
    Fetches log for the command represented by this object

    Returns:
        The log as a string
    """
    log_path = self.meta_data['logs_resource']
    conn = Qubole.agent()
    r = conn.get_raw(log_path)
    return r.text
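# A hedged sketch tying get_log()/get_log_id() above to a real command run;
# '<api-token>' and the query are placeholders.
from qds_sdk.qubole import Qubole
from qds_sdk.commands import HiveCommand

Qubole.configure(api_token='<api-token>')
cmd = HiveCommand.run(query='show tables;')  # run() polls until the command finishes
print(cmd.get_log())                    # fetch the log via the command object
print(HiveCommand.get_log_id(cmd.id))   # or fetch the same log by id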
def main():
    logging.basicConfig(level=logging.INFO)
    if len(sys.argv) < 3:
        usage()
    if len(sys.argv) >= 2 and sys.argv[1] == "-h":
        usage(0)
    api_token = sys.argv[1]
    output_path = sys.argv[2]
    Qubole.configure(api_token=api_token)
    args = HadoopCommand.parse(
        ("streaming -files s3n://paid-qubole/HadoopAPIExamples/WordCountPython/mapper.py,"
         "s3n://paid-qubole/HadoopAPIExamples/WordCountPython/reducer.py "
         "-mapper mapper.py -reducer reducer.py -numReduceTasks 1 "
         "-input s3n://paid-qubole/default-datasets/gutenberg "
         "-output %s" % output_path).split())
    cmd = HadoopCommand.run(**args)
    print("Streaming Job run via command id: %s, finished with status %s" % (cmd.id, cmd.status))
def editTemplate(id, data):
    """
    Edit an existing template.

    Args:
        `id`: ID of the template to edit
        `data`: json data to be updated

    Returns:
        Dictionary containing the updated details of the template.
    """
    conn = Qubole.agent()
    return conn.put(Template.element_path(id), data)
def viewTemplate(id):
    """
    View the details of an existing template.

    Args:
        `id`: ID of the template to fetch

    Returns:
        Dictionary containing the details of the template.
    """
    conn = Qubole.agent()
    return conn.get(Template.element_path(id))
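# A hedged workflow sketch combining createTemplate/editTemplate/viewTemplate
# above; the payload keys ('name', 'command_type') and the 'id' field of the
# response are illustrative assumptions, not a documented schema.
tpl = createTemplate({'name': 'daily-report', 'command_type': 'HiveCommand'})
editTemplate(tpl['id'], {'name': 'daily-report-v2'})
print(viewTemplate(tpl['id']))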
def restore_point(cls, cluster_id_label, s3_location, backup_id, table_names,
                  overwrite=True, automatic=True):
    """
    Restore the cluster from a given hbase snapshot id
    """
    conn = Qubole.agent()
    parameters = {}
    parameters['s3_location'] = s3_location
    parameters['backup_id'] = backup_id
    parameters['table_names'] = table_names
    parameters['overwrite'] = overwrite
    parameters['automatic'] = automatic
    return conn.post(cls.element_path(cluster_id_label) + "/restore_point", data=parameters)
def get_results(self, fp=sys.stdout, inline=True, delim=None, fetch=True):
    """
    Fetches the result for the command represented by this object.

    get_results will retrieve results of the command and write to stdout by
    default. Optionally one can write to a filestream specified in `fp`. The
    `inline` argument decides whether the result can be returned as a CRLF
    separated string. In cases where the results are greater than 20MB,
    get_results will attempt to read from s3 and write to fp. The retrieval
    of results from s3 can be turned off by the `fetch` argument.

    Args:
        `fp`: a file object to write the results to directly
        `inline`: whether or not results are returned inline as a CRLF separated string
        `fetch`: True to fetch the result even if it is greater than 20MB,
                 False to only get the result location on s3
    """
    result_path = self.meta_data["results_resource"]
    conn = Qubole.agent()
    r = conn.get(result_path, {"inline": inline})
    if r.get("inline"):
        if sys.version_info < (3, 0, 0):
            fp.write(r["results"].encode("utf8"))
        else:
            import io
            if isinstance(fp, io.TextIOBase):
                fp.buffer.write(r["results"].encode("utf8"))
            elif isinstance(fp, (io.BufferedIOBase, io.RawIOBase)):
                fp.write(r["results"].encode("utf8"))
            else:
                # Can this happen? Don't know what's the right thing to do in this case.
                pass
    else:
        if fetch:
            acc = Account.find()
            boto_conn = boto.connect_s3(aws_access_key_id=acc.storage_access_key,
                                        aws_secret_access_key=acc.storage_secret_key)
            log.info("Starting download from result locations: [%s]" % ",".join(r["result_location"]))
            # fetch latest value of num_result_dir
            num_result_dir = Command.find(self.id).num_result_dir
            for s3_path in r["result_location"]:
                # In Python 3: if delim is None, fp should be in binary mode
                # because boto expects it to be; if delim is not None, both
                # text and binary modes work.
                _download_to_local(boto_conn, s3_path, fp, num_result_dir, delim=delim)
        else:
            fp.write(",".join(r["result_location"]))
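# A small usage sketch for get_results() above; assumes `cmd` is a finished
# command object (e.g. from HiveCommand.run earlier). Binary mode matters
# because inline results are written as utf-8 bytes (see the io checks above).
with open('results.tsv', 'wb') as fp:
    cmd.get_results(fp, inline=True, delim='\t')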
def show(cls, report_name, data):
    """
    Shows a report by issuing a GET request to the /reports/report_name
    endpoint.

    Args:
        `report_name`: the name of the report to show
        `data`: the parameters for the report
    """
    conn = Qubole.agent()
    return conn.get(cls.element_path(report_name), data)
def main():
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(module)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    root.addHandler(ch)

    # I am using this slightly complicated trick to pass config in the
    # constructor of other packages. Is there a better way to do this?
    config_parser, argparser = setup_parsers()
    config_args, remaining_argv = config_parser.parse_known_args()
    config = load_config(config_args)
    args = argparser.parse_args(remaining_argv)

    if args.debug:
        ch.setLevel(logging.DEBUG)
        root.setLevel(logging.DEBUG)
        logging.debug("Debug is ON!")
    if args.log_file is not None:
        fh = logging.FileHandler(args.log_file, mode='w')
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(formatter)
        root.setLevel(logging.DEBUG)
        root.addHandler(fh)

    try:
        Qubole.configure(
            api_token=config.get("default", "auth_token"),
            api_url=config.get("default", "api_url"),
            skip_ssl_cert_check=True
        )
        args.func(config, args)
    finally:
        logging.debug("Cleaning up")
def find(cls, **kwargs):
    if cls.cached_resource is None:
        conn = Qubole.agent()
        cls.cached_resource = cls(conn.get(cls.rest_entity_path))
    return cls.cached_resource
def main():
    optparser = OptionParser(usage=usage_str)
    optparser.add_option("--token", dest="api_token",
                         default=os.getenv('QDS_API_TOKEN'),
                         help="api token for accessing Qubole. must be specified via "
                              "command line or passed in via environment variable QDS_API_TOKEN")
    optparser.add_option("--url", dest="api_url",
                         default=os.getenv('QDS_API_URL'),
                         help="base url for QDS REST API. defaults to https://api.qubole.com/api")
    optparser.add_option("--version", dest="api_version",
                         default=os.getenv('QDS_API_VERSION'),
                         help="version of REST API to access. defaults to v1.2")
    optparser.add_option("--poll_interval", dest="poll_interval",
                         default=os.getenv('QDS_POLL_INTERVAL'),
                         help="interval for polling API for completion and other events. defaults to 5s")
    optparser.add_option("-v", dest="verbose", action="store_true", default=False,
                         help="verbose mode - info level logging")
    optparser.add_option("--vv", dest="chatty", action="store_true", default=False,
                         help="very verbose mode - debug level logging")
    optparser.disable_interspersed_args()
    (options, args) = optparser.parse_args()

    if options.chatty:
        logging.basicConfig(level=logging.DEBUG)
    elif options.verbose:
        logging.basicConfig(level=logging.INFO)
    else:
        # whatever is dictated by logging config
        pass

    if options.api_token is None:
        raise Exception("No API Token provided")
    if options.api_url is None:
        options.api_url = "https://api.qubole.com/api/"
    if options.api_version is None:
        options.api_version = "v1.2"
    if options.poll_interval is None:
        options.poll_interval = 5

    Qubole.configure(api_token=options.api_token,
                     api_url=options.api_url,
                     version=options.api_version,
                     poll_interval=options.poll_interval)

    if len(args) < 1:
        sys.stderr.write("Missing first argument containing command type\n")
        usage()

    cmdset = set(["hive", "pig", "hadoop"])
    cmdsuffix = "cmd"
    cmd = args.pop(0)
    if ((cmd.find(cmdsuffix) != len(cmd) - 3) or
            (cmd[:cmd.find(cmdsuffix)] not in cmdset)):
        sys.stderr.write("First command must be one of <%s>\n" % "|".join(cmdset))
        usage()

    return cmdmain(cmd[:cmd.find(cmdsuffix)], args)
def find(cls, id, **kwargs):
    conn = Qubole.agent()
    if id is not None:
        return cls(conn.get(cls.element_path(id)))
def update(cls, id, **kwargs):
    conn = Qubole.agent()
    return conn.put(cls.element_path(id), data=kwargs)
def status(cls, cluster_id_label):
    """
    Show the status of the cluster with id/label `cluster_id_label`.
    """
    conn = Qubole.agent()
    return conn.get(cls.element_path(cluster_id_label) + "/state")
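# A hedged polling sketch built on status() above. wait_until_up is a
# hypothetical helper; the 'state' key and 'UP' value are assumptions about
# the response shape, and Cluster is assumed to be the class defining status().
import time

def wait_until_up(cluster_id_label, poll_secs=30, max_polls=60):
    for _ in range(max_polls):
        state = Cluster.status(cluster_id_label).get('state')
        if state == 'UP':
            return True
        time.sleep(poll_secs)
    return False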
def __init__(self, name, context, **kwargs):
    super(QuboleCluster, self).__init__(name, context, kwargs=kwargs)
    self._filesystem = S3Filesystem(self.logger, context, **kwargs)
    Qubole.configure(api_token=context.settings['qds_api_token'])
def rerun(args):
    conn = Qubole.agent()
    ret_val = conn.post(Action.element_path(args.id) + "/rerun", data=None)
    return json.dumps(ret_val, sort_keys=True, indent=4)
def create(cls, cluster_info):
    """
    Create a new cluster using information provided in `cluster_info`.
    """
    conn = Qubole.agent()
    return conn.post(cls.rest_entity_path, data=cluster_info)
def create(cls, **kwargs):
    conn = Qubole.agent()
    return cls(conn.post(cls.rest_entity_path, data=kwargs))
def rerun(self, instance_id):
    conn = Qubole.agent()
    url_path = self.element_path(self.id) + "/instances/" + instance_id + "/rerun"
    return conn.post(url_path)['status']
def show(cls, cluster_id_label):
    """
    Show information about the cluster with id/label `cluster_id_label`.
    """
    conn = Qubole.agent()
    return conn.get(cls.element_path(cluster_id_label))
def kill(self): conn = Qubole.agent() data = {"status": "kill"} return conn.put(self.element_path(self.id), data)
def resume(self): conn = Qubole.agent() data = {"status": "resume"} return conn.put(self.element_path(self.id), data)
def suspend(self): conn = Qubole.agent() data = {"status": "suspend"} return conn.put(self.element_path(self.id), data)
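# A hedged lifecycle sketch for the suspend()/resume() methods above; it
# assumes they live on a Scheduler-style resource and that Scheduler.find(id)
# returns the schedule object (the id 123 is a placeholder).
sched = Scheduler.find(123)
sched.suspend()   # pause the schedule
# ... later ...
sched.resume()    # pick it back up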
def delete(cls, id):
    conn = Qubole.agent()
    return conn.delete(cls.element_path(id))
def list_roles(group_id):
    conn = Qubole.agent()
    url_path = "groups/%s/roles" % group_id
    return conn.get(url_path)
def rerun(self):
    conn = Qubole.agent()
    return conn.post(self.element_path(self.id) + "/rerun", data=None)
def __init__(self, api_token=None):
    self._check_qubole_api_token_is_assigned(api_token=api_token)
    Qubole.configure(api_token=api_token)
    print('Connected to Qubole')
    self.old_std_out = []
    self.status = None
def delete(cls, cluster_id_label):
    """
    Delete the cluster with id/label `cluster_id_label`.
    """
    conn = Qubole.agent()
    return conn.delete(cls.element_path(cluster_id_label))
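# An end-to-end lifecycle sketch combining the cluster calls in this file
# (create/start/status/terminate/delete). The cluster_info payload is a
# minimal assumption, not the full schema; 'demo-cluster' is a placeholder.
info = {'cluster_info': {'label': ['demo-cluster'], 'master_instance_type': 'm1.large'}}
Cluster.create(info)
Cluster.start('demo-cluster')
print(Cluster.status('demo-cluster'))
Cluster.terminate('demo-cluster')
Cluster.delete('demo-cluster')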
def main():
    optparser = OptionParser(usage=usage_str)
    optparser.add_option("--token", dest="api_token",
                         default=os.getenv('QDS_API_TOKEN'),
                         help="api token for accessing Qubole. must be specified via "
                              "command line or passed in via environment variable QDS_API_TOKEN")
    optparser.add_option("--url", dest="api_url",
                         default=os.getenv('QDS_API_URL'),
                         help="base url for QDS REST API. defaults to https://api.qubole.com/api")
    optparser.add_option("--version", dest="api_version",
                         default=os.getenv('QDS_API_VERSION'),
                         help="version of REST API to access. defaults to v1.2")
    optparser.add_option("--poll_interval", dest="poll_interval",
                         default=os.getenv('QDS_POLL_INTERVAL'),
                         help="interval for polling API for completion and other events. defaults to 5s")
    optparser.add_option("--skip_ssl_cert_check", dest="skip_ssl_cert_check",
                         action="store_true", default=False,
                         help="skip verification of server SSL certificate. Insecure: use with caution.")
    optparser.add_option("-v", dest="verbose", action="store_true", default=False,
                         help="verbose mode - info level logging")
    optparser.add_option("--vv", dest="chatty", action="store_true", default=False,
                         help="very verbose mode - debug level logging")
    optparser.disable_interspersed_args()
    (options, args) = optparser.parse_args()

    if options.chatty:
        logging.basicConfig(level=logging.DEBUG)
    elif options.verbose:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.WARN)

    if options.api_token is None:
        sys.stderr.write("No API Token provided\n")
        usage(optparser)
    if options.api_url is None:
        options.api_url = "https://api.qubole.com/api/"
    if options.api_version is None:
        options.api_version = "v1.2"
    if options.poll_interval is None:
        options.poll_interval = 5
    if options.skip_ssl_cert_check is None:
        options.skip_ssl_cert_check = False
    elif options.skip_ssl_cert_check:
        sys.stderr.write("[WARN] Insecure mode enabled: skipping SSL cert verification\n")

    Qubole.configure(api_token=options.api_token,
                     api_url=options.api_url,
                     version=options.api_version,
                     poll_interval=options.poll_interval,
                     skip_ssl_cert_check=options.skip_ssl_cert_check)

    if len(args) < 1:
        sys.stderr.write("Missing first argument containing command type\n")
        usage(optparser)

    cmdsuffix = "cmd"
    cmdset = set([x + cmdsuffix for x in
                  ["hive", "pig", "hadoop", "shell", "dbexport", "presto"]])
    a0 = args.pop(0)
    if a0 in cmdset:
        return cmdmain(a0[:a0.find(cmdsuffix)], args)
    if a0 == "hadoop_cluster":
        return clustermain(a0, args)

    sys.stderr.write("First command must be one of <%s>\n" %
                     "|".join(cmdset.union(["hadoop_cluster"])))
    usage(optparser)
def kill(self):
    conn = Qubole.agent()
    return conn.put(self.element_path(self.id) + "/kill", data=None)
def save_code(cls, pipeline_id, code=None, file_path=None, language=None,
              jar_path=None, main_class_name=None, user_arguments=None):
    """
    :param file_path:
    :param code:
    :param language:
    :param user_arguments:
    :param pipeline_id:
    :param jar_path:
    :param main_class_name:
    :return:
    """
    data = None
    if cls.create_type == 2:
        if jar_path is None or main_class_name is None:
            raise ParseError("Provide Jar path for BYOJ mode.")
        cls.jar_path = jar_path
        data = {
            "data": {
                "attributes": {
                    "create_type": cls.create_type,
                    "user_arguments": str(user_arguments),
                    "jar_path": str(jar_path),
                    "main_class_name": str(main_class_name)
                }
            }
        }
    elif cls.create_type == 3:
        if code or file_path:
            try:
                if file_path:
                    with open(file_path, 'r') as f:
                        code = f.read()
            except IOError as e:
                raise ParseError("Unable to open script location, or script "
                                 "location and code are both empty.", str(e))
            cls.pipeline_code = code
            data = {
                "data": {
                    "attributes": {
                        "create_type": cls.create_type,
                        "user_arguments": str(user_arguments),
                        "code": str(code),
                        "language": str(language)
                    }
                }
            }
        else:
            raise ParseError("Provide code or file location for BYOC mode.")
    conn = Qubole.agent()
    url = cls.rest_entity_path + "/" + str(pipeline_id) + "/save_code"
    response = conn.put(url, data)
    log.debug(response)
    return response
def list_users(group_id):
    conn = Qubole.agent()
    url_path = "groups/%s/qbol_users" % group_id
    return conn.get(url_path)
def set_composition_for_cluster(self, **kwargs):
    cloud = Qubole.get_cloud()
    composition = cloud.get_composition(**kwargs)
    if composition is not None:
        self.cluster_info["composition"] = composition
def duplicate(group_id, **kwargs):
    conn = Qubole.agent()
    url_path = "groups/%s/duplicate" % group_id
    return conn.post(url_path, data=kwargs)
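# A hedged usage sketch for the group helpers above (list_roles, list_users,
# duplicate); the group id 42 and the 'name' kwarg are placeholders, and the
# accepted duplicate() kwargs are an assumption.
print(list_roles(42))
print(list_users(42))
duplicate(42, name='copy-of-group')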
def set_cluster_info_from_arguments(self, arguments):
    customer_ssh_key = util._read_file(arguments.customer_ssh_key_file)
    self.set_cluster_info(
        disallow_cluster_termination=arguments.disallow_cluster_termination,
        enable_ganglia_monitoring=arguments.enable_ganglia_monitoring,
        datadog_api_token=arguments.datadog_api_token,
        datadog_app_token=arguments.datadog_app_token,
        node_bootstrap=arguments.node_bootstrap_file,
        master_instance_type=arguments.master_instance_type,
        slave_instance_type=arguments.slave_instance_type,
        min_nodes=arguments.initial_nodes,
        max_nodes=arguments.max_nodes,
        node_base_cooldown_period=arguments.node_base_cooldown_period,
        node_spot_cooldown_period=arguments.node_spot_cooldown_period,
        custom_tags=arguments.custom_tags,
        heterogeneous_config=arguments.heterogeneous_config,
        idle_cluster_timeout=arguments.idle_cluster_timeout,
        disk_count=arguments.count,
        disk_type=arguments.disk_type,
        disk_size=arguments.size,
        root_disk_size=arguments.root_disk_size,
        upscaling_config=arguments.upscaling_config,
        enable_encryption=arguments.encrypted_ephemerals,
        customer_ssh_key=customer_ssh_key,
        image_uri_overrides=arguments.image_uri_overrides,
        env_name=arguments.env_name,
        python_version=arguments.python_version,
        r_version=arguments.r_version,
        disable_cluster_pause=arguments.disable_cluster_pause,
        paused_cluster_timeout_mins=arguments.paused_cluster_timeout_mins,
        disable_autoscale_node_pause=arguments.disable_autoscale_node_pause,
        paused_autoscale_node_timeout_mins=arguments.paused_autoscale_node_timeout_mins,
        parent_cluster_id=arguments.parent_cluster_id,
        image_version=arguments.image_version)

    if Qubole.get_cloud_name() == "aws":
        # Need to move to aws cloud.
        self.set_composition(
            master_type=arguments.master_type,
            master_spot_block_duration=arguments.master_spot_block_duration,
            master_maximum_bid_price_percentage=arguments.master_maximum_bid_price_percentage,
            master_timeout_for_request=arguments.master_timeout_for_request,
            master_spot_fallback=arguments.master_spot_fallback,
            min_ondemand_percentage=arguments.min_ondemand_percentage,
            min_spot_block_percentage=arguments.min_spot_block_percentage,
            min_spot_block_duration=arguments.min_spot_block_duration,
            min_spot_percentage=arguments.min_spot_percentage,
            min_maximum_bid_price_percentage=arguments.min_maximum_bid_price_percentage,
            min_timeout_for_request=arguments.min_timeout_for_request,
            min_spot_allocation_strategy=arguments.min_spot_allocation_strategy,
            min_spot_fallback=arguments.min_spot_fallback,
            autoscaling_ondemand_percentage=arguments.autoscaling_ondemand_percentage,
            autoscaling_spot_block_percentage=arguments.autoscaling_spot_block_percentage,
            autoscaling_spot_percentage=arguments.autoscaling_spot_percentage,
            autoscaling_spot_block_duration=arguments.autoscaling_spot_block_duration,
            autoscaling_maximum_bid_price_percentage=arguments.autoscaling_maximum_bid_price_percentage,
            autoscaling_timeout_for_request=arguments.autoscaling_timeout_for_request,
            autoscaling_spot_allocation_strategy=arguments.autoscaling_spot_allocation_strategy,
            autoscaling_spot_fallback=arguments.autoscaling_spot_fallback,
            autoscaling_spot_block_fallback=arguments.autoscaling_spot_block_fallback)
    else:
        self.set_composition_from_cloud_using_parser(arguments)
def cluster_info_parser(argparser, action):
    create_required = False
    label_required = False
    if action == "create":
        create_required = True
    elif action == "update":
        argparser.add_argument("cluster_id_label",
                               help="id/label of the cluster to update")
    elif action == "clone":
        argparser.add_argument("cluster_id_label",
                               help="id/label of the cluster to update")
        label_required = True

    argparser.add_argument("--label", dest="label", nargs="+",
                           required=(create_required or label_required),
                           help="list of labels for the cluster"
                                " (at least one label is required)")

    cluster_info = argparser.add_argument_group("cluster_info")
    cluster_info.add_argument("--master-instance-type", dest="master_instance_type",
                              help="instance type to use for the hadoop master node")
    cluster_info.add_argument("--slave-instance-type", dest="slave_instance_type",
                              help="instance type to use for the hadoop slave nodes")
    cluster_info.add_argument("--min-nodes", dest="initial_nodes", type=int,
                              help="number of nodes to start the cluster with")
    cluster_info.add_argument("--max-nodes", dest="max_nodes", type=int,
                              help="maximum number of nodes the cluster may be auto-scaled up to")
    cluster_info.add_argument("--idle-cluster-timeout", dest="idle_cluster_timeout",
                              help="cluster termination timeout for idle cluster")
    cluster_info.add_argument("--node-bootstrap-file", dest="node_bootstrap_file",
                              help="""name of the node bootstrap file for this cluster. It
                              should be stored in S3 at
                              <account-default-location>/scripts/hadoop/NODE_BOOTSTRAP_FILE
                              """)
    cluster_info.add_argument("--root-disk-size", dest="root_disk_size", type=int,
                              help="size of the root volume in GB")
    cluster_info.add_argument("--parent-cluster-id", dest="parent_cluster_id", type=int,
                              help="Id of the parent cluster this hs2 cluster is attached to")
    cluster_info.add_argument("--image-version", dest="image_version",
                              help="cluster image version")

    termination = cluster_info.add_mutually_exclusive_group()
    termination.add_argument("--disallow-cluster-termination",
                             dest="disallow_cluster_termination",
                             action="store_true", default=None,
                             help="don't auto-terminate idle clusters,"
                                  " use this with extreme caution")
    termination.add_argument("--allow-cluster-termination",
                             dest="disallow_cluster_termination",
                             action="store_false", default=None,
                             help="auto-terminate idle clusters")

    node_cooldown_period_group = argparser.add_argument_group("node cooldown period settings")
    node_cooldown_period_group.add_argument("--node-base-cooldown-period",
                                            dest="node_base_cooldown_period", type=int,
                                            help="Cooldown period for on-demand nodes, unit: minutes")
    node_cooldown_period_group.add_argument("--node-spot-cooldown-period",
                                            dest="node_spot_cooldown_period", type=int,
                                            help="Cooldown period for spot nodes, unit: minutes")

    cluster_info.add_argument("--customer-ssh-key", dest="customer_ssh_key_file",
                              help="location for ssh key to use to login to the instance")
    cluster_info.add_argument("--custom-tags", dest="custom_tags",
                              help="""Custom tags to be set on all instances of the cluster.
                              Specified as a JSON object (key-value pairs), e.g.
                              --custom-ec2-tags '{"key1":"value1", "key2":"value2"}'
                              """)

    # datadisk settings
    datadisk_group = argparser.add_argument_group("data disk settings")
    datadisk_group.add_argument("--count", dest="count", type=int,
                                help="Number of EBS volumes to attach to each instance of the cluster")
    datadisk_group.add_argument("--disk-type", dest="disk_type",
                                choices=["standard", "gp2"],
                                help="Type of the volume attached to the instances. Valid values "
                                     "are 'standard' (magnetic) and 'gp2' (ssd).")
    datadisk_group.add_argument("--size", dest="size", type=int,
                                help="Size of each EBS volume, in GB")
    datadisk_group.add_argument("--upscaling-config", dest="upscaling_config",
                                help="Upscaling config to be attached with the instances.")
    ephemerals = datadisk_group.add_mutually_exclusive_group()
    ephemerals.add_argument("--encrypted-ephemerals", dest="encrypted_ephemerals",
                            action="store_true", default=None,
                            help="encrypt the ephemeral drives on the instance")
    ephemerals.add_argument("--no-encrypted-ephemerals", dest="encrypted_ephemerals",
                            action="store_false", default=None,
                            help="don't encrypt the ephemeral drives on the instance")

    cluster_info.add_argument("--heterogeneous-config", dest="heterogeneous_config",
                              help="heterogeneous config for the cluster")

    composition_group = argparser.add_argument_group("Cluster composition settings")
    Qubole.get_cloud().set_composition_arguments(composition_group)

    # monitoring settings
    monitoring_group = argparser.add_argument_group("monitoring settings")
    ganglia = monitoring_group.add_mutually_exclusive_group()
    ganglia.add_argument("--enable-ganglia-monitoring", dest="enable_ganglia_monitoring",
                         action="store_true", default=None,
                         help="enable ganglia monitoring for the cluster")
    ganglia.add_argument("--disable-ganglia-monitoring", dest="enable_ganglia_monitoring",
                         action="store_false", default=None,
                         help="disable ganglia monitoring for the cluster")

    datadog_group = argparser.add_argument_group("datadog settings")
    datadog_group.add_argument("--datadog-api-token", dest="datadog_api_token",
                               default=None, help="fernet key for airflow cluster")
    datadog_group.add_argument("--datadog-app-token", dest="datadog_app_token",
                               default=None, help="overrides for airflow cluster")

    internal_group = argparser.add_argument_group("internal settings")
    internal_group.add_argument("--image-overrides", dest="image_uri_overrides",
                                default=None, help="overrides for image")

    env_group = argparser.add_argument_group("environment settings")
    env_group.add_argument("--env-name", dest="env_name", default=None,
                           help="name of Python and R environment")
    env_group.add_argument("--python-version", dest="python_version", default=None,
                           help="version of Python in environment")
    env_group.add_argument("--r-version", dest="r_version", default=None,
                           help="version of R in environment")

    start_stop_group = argparser.add_argument_group("start stop settings")
    start_stop_group.add_argument("--disable-cluster-pause", dest="disable_cluster_pause",
                                  action='store_true', default=None,
                                  help="disable cluster pause")
    start_stop_group.add_argument("--no-disable-cluster-pause", dest="disable_cluster_pause",
                                  action='store_false', default=None,
                                  help="enable cluster pause")
    start_stop_group.add_argument("--paused-cluster-timeout", dest="paused_cluster_timeout_mins",
                                  default=None, type=int,
                                  help="paused cluster timeout in min")
    start_stop_group.add_argument("--disable-autoscale-node-pause",
                                  dest="disable_autoscale_node_pause",
                                  action='store_true', default=None,
                                  help="disable autoscale node pause")
    start_stop_group.add_argument("--no-disable-autoscale-node-pause",
                                  dest="disable_autoscale_node_pause",
                                  action='store_false', default=None,
                                  help="enable autoscale node pause")
    start_stop_group.add_argument("--paused-autoscale-node-timeout",
                                  dest="paused_autoscale_node_timeout_mins",
                                  default=None, type=int,
                                  help="paused autoscale node timeout in min")
def qb_configure(api_token, api_url):
    return Qubole.configure(api_token=api_token, api_url=api_url)
def main(): optparser = OptionParser(usage=usage_str) optparser.add_option("--token", dest="api_token", default=os.getenv('QDS_API_TOKEN'), help="api token for accessing Qubole. must be specified via command line or passed in via environment variable QDS_API_TOKEN") optparser.add_option("--url", dest="api_url", default=os.getenv('QDS_API_URL'), help="base url for QDS REST API. defaults to https://api.qubole.com/api ") optparser.add_option("--version", dest="api_version", default=os.getenv('QDS_API_VERSION'), help="version of REST API to access. defaults to v1.2") optparser.add_option("--poll_interval", dest="poll_interval", type=int, default=os.getenv('QDS_POLL_INTERVAL'), help="interval for polling API for completion and other events. defaults to 5s") optparser.add_option("--skip_ssl_cert_check", dest="skip_ssl_cert_check", action="store_true", default=False, help="skip verification of server SSL certificate. Insecure: use with caution.") optparser.add_option("-v", dest="verbose", action="store_true", default=False, help="verbose mode - info level logging") optparser.add_option("--vv", dest="chatty", action="store_true", default=False, help="very verbose mode - debug level logging") optparser.disable_interspersed_args() (options, args) = optparser.parse_args() if options.chatty: logging.basicConfig(level=logging.DEBUG) elif options.verbose: logging.basicConfig(level=logging.INFO) else: logging.basicConfig(level=logging.WARN) if options.api_token is None: sys.stderr.write("No API Token provided\n") usage(optparser) if options.api_url is None: options.api_url = "https://api.qubole.com/api/" if options.api_version is None: options.api_version = "v1.2" if options.poll_interval is None: options.poll_interval = 5 if options.skip_ssl_cert_check is None: options.skip_ssl_cert_check = False elif options.skip_ssl_cert_check: log.warn("Insecure mode enabled: skipping SSL cert verification\n") Qubole.configure(api_token=options.api_token, api_url=options.api_url, version=options.api_version, poll_interval=options.poll_interval, skip_ssl_cert_check=options.skip_ssl_cert_check) if len(args) < 1: sys.stderr.write("Missing first argument containing subcommand\n") usage(optparser) a0 = args.pop(0) if a0 in CommandClasses: return cmdmain(a0, args) if a0 in SensorClasses: return sensormain(a0, args) if a0 == "account": return accountmain(args) if a0 == "cluster": api_version_number = float(options.api_version[1:]) return clustermain(args, api_version_number) if a0 == "action": return actionmain(args) if a0 == "scheduler": return schedulermain(args) if a0 == "report": return reportmain(args) if a0 == "dbtap": return dbtapmain(args) if a0 == "group": return groupmain(args) if a0 == "role": return rolemain(args) if a0 == "app": return appmain(args) if a0 == "nezha": return nezhamain(args) if a0 == "user": return usermain(args) if a0 == "template": return templatemain(args) cmdset = set(CommandClasses.keys()) sys.stderr.write("First command must be one of <%s>\n" % "|".join(cmdset.union(["cluster", "action", "scheduler", "report", "dbtap", "role", "group", "app", "account", "nezha", "user", "template"]))) usage(optparser)
# Downloading the result
def get_results(command):
    if command is None:
        return None
    filename = get_random_filename(10)
    print(filename)
    fp = open(filename, 'w')
    command.get_results(fp, delim="\n")
    print("Starting result fetch with command id: " + str(command.id) + "\nProgress: =>", end="")
    while not SparkCommand.is_done(command.status):
        print("\b=>", end="")
        time.sleep(5)
    if SparkCommand.is_success(command.status):
        print("\nCommand executed: result fetch completed successfully")
    else:
        print("\nCommand executed: result fetch for original command " + str(command.id) +
              " failed. The status returned is: " + str(command.status))
    fp.close()
    content = get_content(filename)
    return content


if __name__ == '__main__':
    # Setting the API token
    Qubole.configure(api_token='<qubole-api-token>',
                     api_url='https://<env>.qubole.com/api')
    get_results(execute_query('select * from default.customer limit 100'))