Example #1
def multirun(cmds, wikis=None):
    print_err(
        "The multirun function has been deprecated. Please pass a list of databases to the run function instead."
    )

    if not wikis:
        raise NotImplementedError(
            "The default set of wikis to run the command on have been removed. Please explicitly specify a list of wikis."
        )

    return run(cmds, wikis)
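For reference, a minimal sketch of the replacement call; the query and wiki database codes below are illustrative examples, not values taken from the library.

# Hypothetical example: the same SQL previously passed to multirun is now passed
# to run() along with an explicit list of wiki databases.
results = run(
    "SELECT COUNT(*) AS pages FROM page",  # example query
    ["enwiki", "dewiki"]                   # explicit list of wikis (no default any more)
)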
Example #2
def pack(**conda_pack_kwargs):
    """
    Calls conda_pack.pack.
    If the packed output file already exists, this will not repackage
    it unless conda_pack_kwargs["force"] == True.

    Returns the path to the output packed env file.

    Arguments:
    * `conda_pack_kwargs`: args to pass to conda_pack.pack().
    """
    kwargs = conda_pack_defaults.copy()
    kwargs.update(conda_pack_kwargs)

    # Make sure output is set to something, so we can return it if it already exists.
    if "output" not in kwargs:
        conda_env_name = "env"
        if "prefix" in kwargs:
            conda_env_name = os.path.basename(kwargs["prefix"])
        elif "name" in kwargs:
            conda_env_name = kwargs["name"]
        elif is_active():
            conda_env_name = active_name()

        kwargs["output"] = "conda-{}.{}".format(conda_env_name,
                                                kwargs["format"])

    conda_packed_file = kwargs["output"]
    if os.path.isfile(conda_packed_file) and not kwargs["force"]:
        print_err(
            f"A conda environment is already packed at {conda_packed_file}. "
            "If you have recently installed new packages into your conda env, set "
            "force=True in conda_pack_kwargs and it will be repacked for you.")
        return conda_packed_file
    else:
        # Isolate the import here so that we don't get import errors
        # if conda_pack is not installed (e.g. in a virtualenv).
        import conda_pack
        # NOTE: If no conda env is currently active, and kwargs
        # doesn't contain information about what env to pack (i.e. no name or prefix),
        # then this raises an error.
        return conda_pack.pack(**kwargs)
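A hedged usage sketch of pack(): the environment name "my-analysis" is a hypothetical example, and the exact output filename depends on the configured format.

# Illustrative only; assumes conda_pack is installed and an env named "my-analysis" exists.
packed_path = pack(name="my-analysis")             # packs to something like conda-my-analysis.<format>
cached_path = pack(name="my-analysis")             # returns the existing archive without repacking
fresh_path = pack(name="my-analysis", force=True)  # repacks, e.g. after installing new packages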
Example #3
# Import all submodules so all are accessible after `import wmfdata`
from wmfdata import charting, hive, metadata, utils  # mariadb,

welcome_message = """{0}

You can find the source for `wmfdata` at {1}"""

remote = utils.check_remote_version(metadata.version)
if remote['is_newer']:
    update_message = """You are using wmfdata {0}. A newer version is available.
You can update to {1} by running `pip install --upgrade git+{2}/wmfdata.git@release`
To see what has changed, refer to https://github.com/neilpquinn/wmfdata/CHANGELOG.md"""
    update_message = update_message.format(metadata.version, remote['version'],
                                           metadata.source)
else:
    update_message = "You are using wmfdata {0} (latest).".format(
        metadata.version)

welcome_message = welcome_message.format(update_message, metadata.source)
utils.print_err(welcome_message)
Example #4
# Import all submodules so they are accessible after `import wmfdata`. utils must go
# first to prevent circular import issues. Other submodules can depend on utils and/or conda ONLY.
from wmfdata import utils, conda
from wmfdata import (charting, hive, mariadb, metadata, presto, spark)

try:
    remote = utils.check_remote_version(metadata.source, metadata.version)
    if remote['is_newer']:
        update_message = (
            "You are using wmfdata v{0}, but v{1} is available.\n\n" +
            "To update, run `pip install --upgrade git+{2}.git@release --ignore-installed`.\n\n"
            + "To see the changes, refer to {2}/blob/release/CHANGELOG.md"
        ).format(metadata.version, remote['version'], metadata.source)
        utils.print_err(update_message)

# If the file with the version info is ever moved, the code hosting changes, or the
# like, every previous version of this check will fail, so we turn any errors into an
# understandable warning.
except Exception:
    utils.print_err((
        "The check for a newer release of wmfdata failed to complete. Consider "
        "checking manually."))
Example #5
def log_test_passed(test_name):
    print_err(f"TEST PASSED: {test_name}")
Example #6
def get_custom_session(master="local[2]",
                       app_name="wmfdata-custom",
                       spark_config=None,
                       ship_python_env=False,
                       conda_pack_kwargs=None):
    """
    Returns an existing SparkSession, or a new one if one hasn't yet been created.

    Use this instead of get_session if you'd rather have manual control over
    your SparkSession configuration.

    Note: master, app_name and spark_config are only applied the first time
    this function is called.  All subsequent calls will return the first created SparkSession.

    Arguments:
    * `master`: passed to SparkSession.builder.master()
      If this is "yarn", a conda env is active, and ship_python_env=False,
      remote executors will be configured to use conda.conda_base_env_prefix(),
      which defaults to anaconda-wmf. This should usually work as anaconda-wmf
      is installed on all WMF YARN worker nodes.  If your conda environment
      has required packages installed that are not in anaconda-wmf, set
      ship_python_env=True.
    * `app_name`: passed to SparkSession.builder.appName().
    * `spark_config`: passed to SparkSession.builder.config()
    * `ship_python_env`: If master='yarn' and this is True, a conda env will be packed
      and shipped to remote Spark executors.  This is useful if your conda env
      has Python or other packages that the executors will need to do their work.
    * `conda_pack_kwargs`: Args to pass to conda_pack.pack(). If none are given, this will
      call conda_pack.pack() with no args, causing the default currently active
      conda environment to be packed.
      You can pack and ship any conda environment by setting appropriate args here.
      See https://conda.github.io/conda-pack/api.html#pack
      Note: if ship_python_env=True, this will fail if conda and conda_pack are not installed.
    """
    # Avoid sharing mutable default arguments between calls; spark_config may be
    # modified below when shipping a conda env.
    if spark_config is None:
        spark_config = {}
    if conda_pack_kwargs is None:
        conda_pack_kwargs = {}

    check_kerberos_auth()

    if master == "yarn":
        if ship_python_env:
            # The path to our packed conda environment.
            conda_packed_file = conda.pack(**conda_pack_kwargs)
            # This will be used as the unpacked directory name in the YARN working directory.
            conda_packed_name = os.path.splitext(
                os.path.basename(conda_packed_file))[0]

            # Ship conda_packed_file to each YARN worker.
            conda_spark_archive = f"{conda_packed_file}#{conda_packed_name}"
            if "spark.yarn.dist.archives" in spark_config:
                spark_config[
                    "spark.yarn.dist.archives"] += f",{conda_spark_archive}"
            else:
                spark_config["spark.yarn.dist.archives"] = conda_spark_archive
            print_err(
                f"Will ship {conda_packed_file} to remote Spark executors.")

            # Workers should use python from the unpacked conda env.
            os.environ["PYSPARK_PYTHON"] = f"{conda_packed_name}/bin/python3"
        # Else if conda is active, use the conda_base_env_prefix (anaconda-wmf)
        # environment, as this should exist on all worker nodes.
        elif conda.is_active():
            os.environ["PYSPARK_PYTHON"] = os.path.join(
                conda.conda_base_env_prefix(), "bin", "python3")
        # Else use the system python.  We can't use any current conda or virtualenv python
        # as these won't be present on the remote YARN workers.
        # The python version workers should use must be the same as the currently
        # running python version, so only set this if that version of python
        # (e.g. python3.7) is installed in the system.
        elif os.path.isfile(f"/usr/bin/python{python_version()}"):
            os.environ["PYSPARK_PYTHON"] = f"/usr/bin/python{python_version()}"

        if "PYSPARK_PYTHON" in os.environ:
            print_err("PySpark executors will use {}.".format(
                os.environ["PYSPARK_PYTHON"]))

    # NOTE: We don't need to touch PYSPARK_PYTHON if master != yarn.
    # The default set by findspark will be fine.

    # Call findspark.init after PYSPARK_PYTHON has possibly been set.
    # This is needed because findspark will set PYSPARK_PYTHON path
    # to sys.executable if it isn't yet set, which will likely not
    # work in YARN mode if sys.executable is a local conda or virtualenv
    # (as is the case in WMF Jupyter Notebooks).
    findspark.init(SPARK_HOME)
    from pyspark.sql import SparkSession

    # NOTE: if there's an existing session, it will be returned with its
    # existing settings even if the user has specified a different set of
    # settings in this function call. There will be no indication that
    # this has happened.
    builder = (SparkSession.builder.master(master).appName(app_name))

    # All ENV_VARS_TO_PROPAGATE should be set in all Spark processes.
    for var in ENV_VARS_TO_PROPAGATE:
        if var in os.environ:
            builder.config(f"spark.executorEnv.{var}", os.environ[var])
            # NOTE: Setting the var in appMasterEnv will only have an effect if
            # running in yarn cluster mode.
            builder.config(f"spark.yarn.appMasterEnv.{var}", os.environ[var])

    # Apply any provided spark configs.
    for k, v in spark_config.items():
        builder.config(k, v)

    return builder.getOrCreate()
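A hedged usage sketch of get_custom_session() on YARN with the active conda environment shipped to the executors; the app name and Spark settings are illustrative values, not defaults of this module.

# Illustrative only; assumes an active conda env and that conda_pack is installed.
spark = get_custom_session(
    master="yarn",
    app_name="my-analysis",                        # example application name
    spark_config={
        "spark.executor.memory": "4g",             # example tuning values
        "spark.dynamicAllocation.maxExecutors": 32,
    },
    ship_python_env=True,                          # pack and ship the active conda env
)
spark.sql("SELECT 1 AS ok").show()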
Example #7
def load_csv(
    path, field_spec, db_name, table_name,
    create_db=False, sep=",", headers=True
):
    """
    Upload a CSV (or other delimiter-separated value file) to the Data Lake's HDFS, for use with Hive
    and other utilities.

    `field_spec` specifies the field names and their formats, for the
    `CREATE TABLE` statement; for example, `name string, age int, graduated bool`.

    To prevent errors caused by typos, the function will not try to create the database first unless
    `create_db=True` is passed.

    `headers` specifies whether the file has a header row; if it does, the function strips it before
    uploading, because Hive treats all rows as data rows.
    """
    if headers:
        new_path = "/tmp/wmfdata-" + mediawiki_dt(dt.datetime.now())
        # From rbtsbg at https://stackoverflow.com/a/39791546
        with open(path, 'r') as source, open(new_path, 'w') as target:
            source.readline()
            copyfileobj(source, target)

        path = new_path

    create_db_cmd = """
    create database if not exists {db_name}
    """.format(
        db_name=db_name
    )

    # To do: Passing a new field spec cannot change an existing table's format
    create_table_cmd = """
    create table if not exists {db_name}.{table_name} ({field_spec})
    row format delimited fields terminated by "{sep}"
    """.format(
        db_name=db_name, table_name=table_name,
        field_spec=field_spec, sep=sep
    )

    load_table_cmd = """
        load data local inpath "{path}"
        overwrite into table {db_name}.{table_name}
    """.format(
        # To do: Convert relative paths (e.g. "~/data.csv") into absolute paths
        path=path, db_name=db_name,
        table_name=table_name
    )

    if create_db:
        run(create_db_cmd)

    run(create_table_cmd)

    proc = subprocess.Popen(
        ["hive", "-e", load_table_cmd],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT
    )

    try:
        outs, _ = proc.communicate(timeout=15)
        for line in outs.decode().split("\n"):
            print_err(line)
    except TimeoutExpired:
        proc.kill()
        outs, _ = proc.communicate()
        for line in outs.decode().split("\n"):
            print_err(line)
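An illustrative call to load_csv(), assuming a local file with a header row; the file, database, and table names are hypothetical.

# Illustrative only: "graduates.csv", "my_database", and "graduates" are example names.
load_csv(
    "graduates.csv",                         # local file with a header row (stripped before upload)
    "name string, age int, graduated bool",  # field_spec, as in the docstring example
    db_name="my_database",
    table_name="graduates",
    create_db=True                           # create my_database if it doesn't already exist
)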
Example #8
def run(commands,
        dbs,
        use_x1=False,
        format="pandas",
        date_col=None,
        index_col=None):
    """
    Run SQL queries or commands on the Analytics MediaWiki replicas.
    
    Arguments:
    * `commands`: the SQL to run. A string for a single command or a list of strings for multiple commands within the same session (useful for things like setting session variables).
    * `dbs`: a string for one database or a list to run the commands on multiple databases and concatenate the results. Possible values:
        * a wiki's database code (e.g. "enwiki", "arwiktionary", "wikidatawiki") for its MediaWiki database (or its ExtensionStorage database if `use_x1` is passed)
        * "logs" for the EventLogging database
        * "centralauth" for global accounts
        * "wikishared" for cross-wiki ExtensionStorage
        * "staging" for user-writable ad-hoc tests and analysis
    * `use_x1`: whether to connect to the given database on the ExtensionStorage replica (only works for wiki databases or "wikishared"). Default false.
    * `format`: which format to return the data in. "pandas" (the default) means a Pandas DataFrame; "tuples" means a named tuple consisting of (1) the column names and (2) the records as a list of tuples, the raw format specified by Python's database API specification v2.0.
    * `date_col`: if using Pandas format, this parses the specified column or columns from MediaWiki datetimes into Pandas datetimes. If using tuples format, has no effect.
    * `index_col`: if using Pandas format, passed to pandas.read_sql_query to set a column or columns as the index. If using tuples format, has no effect.
    """

    # Make single command and database parameters lists
    commands = ensure_list(commands)
    dbs = ensure_list(dbs)

    results = []

    if format == "pandas":
        for db in dbs:
            connection = connect(db, use_x1)
            result = run_to_pandas(connection, commands, date_col, index_col)
            connection.close()
            results.append(result)

        if len(dbs) > 1:
            # Ignore the indexes on the partial results unless a custom index column was designated
            if not index_col:
                ignore_index = True
            else:
                ignore_index = False

            return pd.concat(results, ignore_index=ignore_index)
        else:
            return results[0]

    # Allow "raw" as a synonym of "tuples" for temporary back-compatibility (July 2019)
    elif format == "tuples" or format == "raw":
        if format == "raw":
            print_err(
                """The "raw" format has been renamed "tuples". Please use the new name instead."""
            )

        for db in dbs:
            connection = connect(db, use_x1)
            result = run_to_tuples(connection, commands)
            connection.close()
            results.append(result)

        if len(dbs) > 1:
            # Take the first set of column names since they'll all be the same
            column_names = results[0].column_names

            record_sets = [result.records for result in results]
            # Flatten the per-database record lists into a single list of records
            records = list(chain(*record_sets))

            return ResultSet(column_names, records)
        else:
            return results[0]

    else:
        raise ValueError("The format you specified is not supported.")
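A hedged usage sketch of run(); the SQL and wiki database codes are illustrative examples.

# Illustrative only: the query and wiki database codes are example values.
edit_counts = run(
    "SELECT COUNT(*) AS edits FROM revision",  # example query
    ["enwiki", "dewiki"]                       # results from both wikis are concatenated
)

# The same query returned as a ResultSet of column names and raw tuples:
column_names, records = run(
    "SELECT COUNT(*) AS edits FROM revision",
    "enwiki",
    format="tuples"
)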