def run_to_pandas(connection, commands, date_col=None, index_col=None):
    result = None

    # Specify the MediaWiki date format for each of the date_cols, if any
    if date_col:
        date_col = ensure_list(date_col)
        date_format = "%Y%m%d%H%M%S"
        date_col = {col: date_format for col in date_col}

    # To-do: SQL syntax errors cause a chain of multiple Python errors.
    # The simplest way to fix this is probably to get the raw results and
    # then turn them into a data frame; this would let us avoid using
    # Pandas's complex SQL machinery.
    for command in commands:
        try:
            result = pd.read_sql_query(
                command, connection,
                index_col=index_col, parse_dates=date_col)
        # Pandas will encounter a TypeError with DDL (e.g. CREATE TABLE) or
        # DML (e.g. INSERT) statements
        except TypeError:
            pass

    return result
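
# Usage sketch (added for illustration, not part of the original module):
# run_to_pandas expects an already-open DB-API connection and a list of SQL
# strings. The "enwiki" database and the `user` table columns below are
# assumptions used only as an example.
def _example_run_to_pandas():
    connection = connect("enwiki", False)
    try:
        return run_to_pandas(
            connection,
            ["SELECT user_id, user_registration FROM user LIMIT 5"],
            date_col="user_registration",
            index_col="user_id")
    finally:
        connection.close()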

def run(commands, format="pandas", session_type="regular", extra_settings={}):
    """
    Runs SQL commands against the Hive tables in the Data Lake using the
    PySpark SQL interface.

    Arguments:
    * `commands`: the SQL to run. A string for a single command or a list
      of strings for multiple commands within the same session (useful for
      things like setting session variables). Passing more than one query
      is *not* supported; only results from the last will be returned.
    * `format`: the format in which to return data
        * "pandas": a Pandas data frame
        * "raw": a list of tuples, as returned by the Spark SQL interface.
    * `session_type`: the type of Spark session to create.
        * "regular": the default; able to use up to 15% of Hadoop cluster
          resources
        * "large": for queries which require more processing (e.g. joins)
          or which access more data; able to use up to 30% of Hadoop
          cluster resources.
    * `extra_settings`: A dict of additional settings to use when creating
      the Spark session. These will override the defaults specified by
      `session_type`.
    """
    if format not in ["pandas", "raw"]:
        raise ValueError("The `format` should be either `pandas` or `raw`.")
    if session_type not in ["regular", "large"]:
        raise ValueError(
            "'{}' is not a valid Spark session type.".format(session_type))

    commands = ensure_list(commands)

    result = None
    uncollected_result = None

    # TODO: Switching the Spark session type has no effect if the previous
    # session is still running.
    spark_session = get_session(
        type=session_type, extra_settings=extra_settings)

    for cmd in commands:
        cmd_result = spark_session.sql(cmd)
        # If the result has columns, the command was a query and therefore
        # results-producing. If not, it was a DDL or DML command and not
        # results-producing.
        if len(cmd_result.columns) > 0:
            uncollected_result = cmd_result

    if uncollected_result is not None:
        if format == "pandas":
            result = uncollected_result.toPandas()
        elif format == "raw":
            result = uncollected_result.collect()

    start_session_timeout(spark_session)

    return result
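
# Usage sketch (added for illustration, not part of the original module):
# a single query through the Spark-backed run() above, using the "large"
# session type and one extra Spark setting. The table name "wmf.webrequest"
# and the setting value are assumptions used only as an example.
def _example_spark_run():
    return run(
        "SELECT COUNT(*) AS requests FROM wmf.webrequest WHERE year = 2021",
        format="pandas",
        session_type="large",
        extra_settings={"spark.sql.shuffle.partitions": "256"})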

def run(commands, format="pandas", engine="cli"):
    """
    Runs SQL commands against the Hive tables in the Data Lake.

    Currently, this simply passes the commands to the `run_cli` function.
    """
    if format not in ["pandas", "raw"]:
        raise ValueError("The `format` should be either `pandas` or `raw`.")
    if engine not in ["cli"]:
        raise ValueError("'{}' is not a valid engine.".format(engine))

    commands = ensure_list(commands)
    result = None

    if engine == "cli":
        return run_cli(commands, format)
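
# Usage sketch (added for illustration, not part of the original module):
# the dispatcher above simply forwards to run_cli, so this call is
# equivalent to run_cli(commands, "pandas"). The table name is an assumption
# used only as an example.
def _example_hive_run_dispatch():
    return run(
        "SELECT * FROM wmf.pageview_hourly LIMIT 10",
        format="pandas",
        engine="cli")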

def run(commands, format="pandas", heap_size="deprecated", engine="deprecated"):
    """
    Runs SQL commands against the Hive tables in the Data Lake.

    Arguments:
    * `commands`: the SQL to run. A string for a single command or a list
      of strings for multiple commands within the same session (useful for
      things like setting session variables). Passing more than one query
      is *not* supported, and will usually result in an error.
    * `format`: what format to return the results in
        * "pandas": a Pandas data frame
        * "raw": a list of tuples, as returned by the database cursor.
          [Deprecated]
    * `heap_size`: [Deprecated]
    * `engine`: [Deprecated]
    """
    if format not in ["pandas", "raw"]:
        raise ValueError("The `format` should be either `pandas` or `raw`.")
    if format == "raw":
        warnings.warn(
            "The 'raw' format is deprecated. It will be removed in the next major release.",
            category=FutureWarning)
    if heap_size != "deprecated":
        warnings.warn(
            "'heap_size' is deprecated. It will be removed in the next major release.",
            category=FutureWarning)
    if engine != "deprecated":
        warnings.warn(
            "'engine' is deprecated. It will be removed in the next major release.",
            category=FutureWarning)

    check_kerberos_auth()

    connect_kwargs = {
        "host": HIVE_URL,
        "auth": "KERBEROS",
        "username": pwd.getpwuid(os.getuid()).pw_name,
        "kerberos_service_name": KERBEROS_SERVICE_NAME,
    }

    commands = ensure_list(commands)
    response = None

    with hive.connect(**connect_kwargs) as conn:
        for command in commands:
            if format == "pandas":
                try:
                    # This will work when the command is a SQL query,
                    # so the last query in `commands` will return its results
                    response = pd.read_sql(command, conn)
                except TypeError:
                    # The command actually runs; Pandas just has trouble
                    # reading the result. So when we pass here, we don't
                    # need to re-run the command.
                    pass
            elif format == "raw":
                cursor = conn.cursor()
                cursor.execute(command)
                response = cursor.fetchall()

    return response
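
# Usage sketch (added for illustration, not part of the original module):
# passing a list of commands so that a session variable is set before the
# query runs, which is the intended use of multiple commands. The setting
# and the table name are assumptions used only as an example.
def _example_hive_run():
    return run([
        "SET hive.exec.parallel = true",
        "SELECT year, month, SUM(view_count) AS views "
        "FROM wmf.pageview_hourly GROUP BY year, month",
    ])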

def run_cli(commands, format="pandas", heap_size=1024, use_nice=True,
            use_ionice=True):
    """
    Runs SQL commands against the Hive tables in the Data Lake using Hive's
    command line interface.

    Arguments:
    * `commands`: the SQL to run. A string for a single command or a list
      of strings for multiple commands within the same session (useful for
      things like setting session variables). Passing more than one query
      is *not* supported, and will usually result in an error.
    * `format`: what format to return the results in
        * "pandas": a Pandas data frame
        * "raw": a TSV string, as returned by the command line interface.
    * `heap_size`: the amount of memory available to the Hive client.
      Increase this if a command experiences an out of memory error.
    * `use_nice`: Run with a lower priority for processor usage.
    * `use_ionice`: Run with a lower priority for disk access.
    """
    commands = ensure_list(commands)

    if format not in ["pandas", "raw"]:
        raise ValueError("'{}' is not a valid format.".format(format))

    check_kerberos_auth()

    shell_command = "export HADOOP_HEAPSIZE={0} && "
    if use_nice:
        shell_command += "/usr/bin/nice "
    if use_ionice:
        shell_command += "/usr/bin/ionice "
    shell_command += "/usr/bin/hive -S -f {1}"

    result = None

    # Support multiple commands by concatenating them in one file. If the
    # user has passed more than one query, this will result in an error when
    # Pandas tries to read the resulting concatenated output (unless the
    # queries happen to produce the same number of columns).
    #
    # Ideally, we would return only the last query's results or throw a
    # clearer error ourselves. However, there's no simple way to determine
    # if multiple queries have been passed or to separate their output, so
    # it's not worth the effort.
    merged_commands = ";\n".join(commands)

    try:
        # Create temporary files in the current working directory to write to:
        cwd = os.getcwd()
        query_fd, query_path = tempfile.mkstemp(suffix=".hql", dir=cwd)
        results_fd, results_path = tempfile.mkstemp(suffix=".tsv", dir=cwd)

        # Write the Hive query:
        with os.fdopen(query_fd, 'w') as fp:
            fp.write(merged_commands)

        # Execute the Hive query:
        shell_command = shell_command.format(heap_size, query_path)
        hive_call = subprocess.run(
            shell_command, shell=True,
            stdout=results_fd, stderr=subprocess.PIPE)

        if hive_call.returncode == 0:
            # Read the results upon successful execution of the command:
            if format == "pandas":
                try:
                    result = pd.read_csv(results_path, sep='\t')
                except pd.errors.EmptyDataError:
                    # The command had no output
                    pass
            else:
                # If the user requested "raw" results, read the text file as-is:
                with open(results_path, 'r') as file:
                    content = file.read()
                # If the statement had output:
                if content:
                    result = content
        # If the Hive call did not complete successfully
        else:
            # Remove logspam from the standard error so it's easier to see
            # the actual error
            stderr = iter(hive_call.stderr.decode().splitlines())
            cleaned_stderr = ""
            for line in stderr:
                filter = r"JAVA_TOOL_OPTIONS|parquet\.hadoop|WARN:|:WARN|SLF4J"
                if re.search(filter, line) is None:
                    cleaned_stderr += line + "\n"

            raise ChildProcessError(
                "The Hive command line client encountered the following "
                "error:\n{}".format(cleaned_stderr))
    finally:
        # Remove temporary files:
        os.unlink(query_path)
        os.unlink(results_path)

    return result
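
# Usage sketch (added for illustration, not part of the original module):
# calling run_cli directly with a larger heap for a query that previously
# hit an out-of-memory error, and keeping the raw TSV output. The table name
# is an assumption used only as an example.
def _example_run_cli():
    return run_cli(
        "SELECT year, month, COUNT(*) AS row_count "
        "FROM wmf.pageview_hourly GROUP BY year, month",
        format="raw",
        heap_size=2048)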

def run(commands, format="pandas", session_type="yarn-regular", extra_settings={}):
    """
    Runs SQL commands against the Hive tables in the Data Lake using the
    PySpark SQL interface.

    Note: The session_type and extra_settings will only be applied the first
    time this function is called. The SparkSession is only instantiated once
    per process, and each subsequent call will re-use the previously created
    SparkSession.

    Arguments:
    * `commands`: the SQL to run. A string for a single command or a list
      of strings for multiple commands within the same session (useful for
      things like setting session variables). Passing more than one query
      is *not* supported; only results from the last will be returned.
    * `format`: the format in which to return data
        * "pandas": a Pandas data frame
        * "raw": a list of tuples, as returned by the Spark SQL interface.
    * `session_type`: the type of Spark session to create.
        * "local": Run the command in a local Spark process. Use this for
          prototyping or querying small-ish data (less than a couple of GB).
        * "yarn-regular": the default; able to use up to 15% of Hadoop
          cluster resources
        * "yarn-large": for queries which require more processing (e.g.
          joins) or which access more data; able to use up to 30% of Hadoop
          cluster resources.
    * `extra_settings`: A dict of additional settings to use when creating
      the Spark session. These will override the defaults specified by
      `session_type`.
    """
    if format not in ["pandas", "raw"]:
        raise ValueError("The `format` should be either `pandas` or `raw`.")
    if format == "raw":
        warnings.warn(
            "The 'raw' format is deprecated. It will be removed in the next major release.",
            category=FutureWarning)

    commands = ensure_list(commands)

    # TODO: Switching the Spark session type has no effect if the previous
    # session is still running.
    spark_session = get_session(
        type=session_type, extra_settings=extra_settings)

    overall_result = None

    for cmd in commands:
        cmd_result = spark_session.sql(cmd)
        # If the result has columns, the command was a query and therefore
        # results-producing. If not, it was a DDL or DML command and not
        # results-producing.
        if len(cmd_result.columns) > 0:
            overall_result = cmd_result

    if overall_result is not None:
        if format == "pandas":
            overall_result = overall_result.toPandas()
        elif format == "raw":
            overall_result = overall_result.collect()

    # (Re)start a timeout on SparkSessions running in YARN after the result
    # is collected. A SparkSession used by this run function will time out
    # after the given interval unless it is used again.
    if PREDEFINED_SPARK_SESSIONS[session_type]["master"] == "yarn":
        start_session_timeout(spark_session, 3600)

    return overall_result
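
# Usage sketch (added for illustration, not part of the original module):
# prototyping against a small slice of data in a local session. Note that,
# per the docstring above, a later call with a different session_type will
# re-use whatever SparkSession was created first. The table name is an
# assumption used only as an example.
def _example_spark_run_local():
    return run(
        "SELECT * FROM wmf.pageview_hourly LIMIT 100",
        session_type="local")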

def run(commands, dbs, use_x1=False, format="pandas", date_col=None, index_col=None):
    """
    Run SQL queries or commands on the Analytics MediaWiki replicas.

    Arguments:
    * `commands`: the SQL to run. A string for a single command or a list
      of strings for multiple commands within the same session (useful for
      things like setting session variables).
    * `dbs`: a string for one database or a list to run the commands on
      multiple databases and concatenate the results. Possible values:
        * a wiki's database code (e.g. "enwiki", "arwiktionary",
          "wikidatawiki") for its MediaWiki database (or its
          ExtensionStorage database if `use_x1` is passed)
        * "logs" for the EventLogging data
        * "centralauth" for global accounts
        * "wikishared" for cross-wiki ExtensionStorage
        * "staging" for user-writable ad-hoc tests and analysis
    * `use_x1`: whether to connect to the given database on the
      ExtensionStorage replica (only works for wiki databases or
      "wikishared"). Default false.
    * `format`: which format to return the data in. "pandas" (the default)
      means a Pandas DataFrame, "raw" means a named tuple consisting of (1)
      the column names and (2) the records as a list of tuples, the raw
      format specified by Python's database API specification v2.0.
    * `date_col`: if using Pandas format, this parses the specified column
      or columns from MediaWiki datetimes into Pandas datetimes. If using
      raw format, has no effect.
    * `index_col`: if using Pandas format, passed to pandas.read_sql_query
      to set a column or columns as the index. If using raw format, has no
      effect.
    """
    if format == "raw":
        warnings.warn(
            "The 'raw' format is deprecated. It will be removed in the next major release.",
            category=FutureWarning)

    # Make single command and database parameters into lists
    commands = ensure_list(commands)
    dbs = ensure_list(dbs)

    results = []

    if format == "pandas":
        for db in dbs:
            connection = connect(db, use_x1)
            result = run_to_pandas(connection, commands, date_col, index_col)
            connection.close()
            results.append(result)

        if len(dbs) > 1:
            # Ignore the indexes on the partial results unless a custom
            # index column was designated
            if not index_col:
                ignore_index = True
            else:
                ignore_index = False

            return pd.concat(results, ignore_index=ignore_index)
        else:
            return results[0]

    elif format == "raw":
        for db in dbs:
            connection = connect(db, use_x1)
            result = run_to_tuples(connection, commands)
            connection.close()
            results.append(result)

        if len(dbs) > 1:
            # Take the first set of column names, since they'll all be the same
            column_names = results[0].column_names
            record_sets = [result.records for result in results]
            records = [x for x in chain(*record_sets)]

            return ResultSet(column_names, records)
        else:
            return results[0]

    else:
        raise ValueError("The format you specified is not supported.")
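
# Usage sketch (added for illustration, not part of the original module):
# running one query against several wikis and getting a single concatenated
# DataFrame, with a MediaWiki timestamp column parsed into Pandas datetimes.
# The wiki list and the `user` table columns are assumptions used only as an
# example.
def _example_mariadb_run():
    return run(
        "SELECT user_id, user_registration FROM user LIMIT 10",
        dbs=["enwiki", "dewiki"],
        date_col="user_registration",
        index_col="user_id")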

def run(commands, catalog="analytics_hive"):
    """
    Runs one or more SQL commands using the Presto SQL engine and returns
    the last result in a Pandas DataFrame.

    Presto can be connected to many different backend data stores, or
    catalogs. Currently, it is only connected to the Data Lake, which has
    the catalog name "analytics_hive".
    """
    commands = ensure_list(commands)
    check_kerberos_auth()

    USER_NAME = os.getenv("USER")
    PRESTO_AUTH = prestodb.auth.KerberosAuthentication(
        config="/etc/krb5.conf",
        service_name="presto",
        principal=f"{USER_NAME}@WIKIMEDIA",
        ca_bundle="/etc/ssl/certs/Puppet_Internal_CA.pem")

    connection = prestodb.dbapi.connect(
        catalog=catalog,
        host="an-coord1001.eqiad.wmnet",
        port=8281,
        http_scheme="https",
        user=USER_NAME,
        auth=PRESTO_AUTH,
        source=f"{USER_NAME}, wmfdata-python")

    cursor = connection.cursor()
    final_result = None

    for command in commands:
        cursor.execute(command)
        result = cursor.fetchall()
        description = cursor.description

        # Weirdly, this happens after running a command that doesn't produce
        # results (like a CREATE TABLE or INSERT). Most users can't run
        # those, though.
        # TO-DO: report this as a bug upstream
        if result == [[True]] and description[0][0] == "results":
            pass
        else:
            # Based on
            # https://github.com/prestodb/presto-python-client/issues/56#issuecomment-367432438
            colnames = [col[0] for col in description]
            dtypes = [col[1] for col in description]

            def setup_transform(col, desired_dtype):
                # Only Hive dates/times need special handling
                if desired_dtype in ("timestamp", "date"):
                    return lambda df: pd.to_datetime(df[col])
                else:
                    return lambda df: df[col]

            transformations = {
                col: setup_transform(col, dtype)
                for col, dtype in zip(colnames, dtypes)
            }

            final_result = (
                pd.DataFrame(result, columns=colnames)
                .assign(**transformations))

    cursor.cancel()
    connection.close()

    return final_result
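
# Usage sketch (added for illustration, not part of the original module):
# a single query through the Presto-backed run() above; timestamp and date
# columns come back as Pandas datetimes thanks to the transformations built
# in the function. The table name and columns are assumptions used only as
# an example.
def _example_presto_run():
    return run(
        "SELECT dt, COUNT(*) AS requests FROM wmf.webrequest "
        "WHERE year = 2021 AND month = 1 AND day = 1 "
        "GROUP BY dt LIMIT 10")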