Example #1
    def _port_forward_start(self, local_port, remote_host, remote_port):
        logger.debug('Now forwarding port {} to {}:{} ...'.format(local_port, remote_host, remote_port))

        try:
            server = forward_tunnel(local_port, remote_host, remote_port, self.__client.get_transport())
        except KeyboardInterrupt:
            print('C-c: Port forwarding stopped.')
        return server
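For orientation, a hedged usage sketch of the helper above. It assumes, as the snippet implies, that `forward_tunnel` starts relaying connections in the background and returns the server object; the hostnames, ports, and credentials below are placeholders.

import paramiko

# Hypothetical sketch: assumes `forward_tunnel` (as used above) returns a
# running forwarding server bound to the local port.
client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect('gateway.example.com', username='me')

server = forward_tunnel(8080, 'db.internal.example.com', 5432,
                        client.get_transport())
# ... traffic to localhost:8080 is now relayed to db.internal.example.com:5432 ...
server.shutdown()  # stop forwarding when finished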
Example #2
File: local.py  Project: artwr/omniduct
    def get(self, id_duct, id_str, deserializer=pickle.load):
        cache_path = self.get_path(id_duct, id_str)
        if not os.path.exists(cache_path):
            return None
        with open(cache_path, 'rb') as f:
            logger.debug(
                "Loading local cache entry from '{}'.".format(cache_path))
            return deserializer(f)
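For context, a minimal standalone sketch of the same read/write cache pattern; the `put` helper and the paths below are hypothetical, not part of the project snippet above.

import os
import pickle

# Hypothetical counterpart to the `get` above: serialize an entry to disk,
# then read it back with a pluggable deserializer.
def put(cache_path, value, serializer=pickle.dump):
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(cache_path, 'wb') as f:
        serializer(value, f)

def get(cache_path, deserializer=pickle.load):
    if not os.path.exists(cache_path):
        return None
    with open(cache_path, 'rb') as f:
        return deserializer(f)

put('/tmp/omniduct_cache/demo/entry', {'rows': 42})
assert get('/tmp/omniduct_cache/demo/entry') == {'rows': 42}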
Example #3
def run_in_subprocess(cmd, check_output=False, **kwargs):
    """
    Execute command using default subprocess configuration.

    Parameters
    ----------
    cmd : string
        Command to be executed in subprocess.
    check_output : bool
        Whether to capture and return the process output. If False, output is
        passed through (at DEBUG logging level) or discarded.
    kwargs : keywords
        Options to pass to subprocess.Popen.

    Returns
    -------
    results : SubprocessResults
        The return code, stdout, and stderr of the completed subprocess.
    """

    logger.debug('Executing command: {0}'.format(cmd))
    config = DEFAULT_SUBPROCESS_CONFIG.copy()
    config.update(kwargs)
    if not check_output:
        if omniduct_config.logging_level < 20:
            config['stdout'] = None
            config['stderr'] = None
        else:
            config['stdout'] = open(os.devnull, 'w')
            config['stderr'] = open(os.devnull, 'w')
    timeout = config.pop('timeout', None)

    process = subprocess.Popen(cmd, **config)
    try:
        stdout, stderr = process.communicate(None, timeout=timeout)
    except subprocess.TimeoutExpired:
        os.killpg(
            os.getpgid(process.pid), signal.SIGINT
        )  # send signal to the process group, recursively killing all children
        output, unused_err = process.communicate()
        raise subprocess.TimeoutExpired(process.args, timeout, output=output)
    return SubprocessResults(returncode=process.returncode,
                             stdout=stdout or '',
                             stderr=stderr or '')
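A hedged usage sketch of the function above. The snippet does not show `DEFAULT_SUBPROCESS_CONFIG`, so the call below pipes stdout/stderr explicitly; every keyword argument is forwarded to `subprocess.Popen`, except `timeout`, which is popped off and passed to `communicate`.

# Hypothetical usage; assumes the module-level names the function relies on
# (logger, DEFAULT_SUBPROCESS_CONFIG, omniduct_config, SubprocessResults).
result = run_in_subprocess(
    'ls -la /tmp',
    check_output=True,
    shell=True,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    timeout=30,
)
if result.returncode == 0:
    print(result.stdout)
else:
    print('Command failed:', result.stderr)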
Example #4
def run_in_subprocess(cmd, check_output=False, **kwargs):
    """
    Execute command using default subprocess configuration.

    Parameters
    ----------
    cmd : string
        Command to be executed in subprocess.
    check_output : bool
        Whether to capture and return the process output. If False, output is
        passed through (at DEBUG logging level) or discarded.
    kwargs : keywords
        Options to pass to subprocess.Popen.

    Returns
    -------
    results : SubprocessResults
        The return code, stdout, and stderr of the completed subprocess.
    """

    logger.debug('Executing command: {0}'.format(cmd))
    config = DEFAULT_SUBPROCESS_CONFIG.copy()
    config.update(kwargs)
    if not check_output:
        if omniduct_config.logging_level < 20:
            config['stdout'] = None
            config['stderr'] = None
        else:
            config['stdout'] = open(os.devnull, 'w')
            config['stderr'] = open(os.devnull, 'w')
    timeout = config.pop('timeout', None)

    process = subprocess.Popen(cmd, **config)
    try:
        stdout, stderr = process.communicate(None, timeout=timeout)
    except subprocess.TimeoutExpired:
        os.killpg(os.getpgid(process.pid), signal.SIGINT)  # send signal to the process group, recursively killing all children
        output, unused_err = process.communicate()
        raise subprocess.TimeoutExpired(process.args, timeout, output=output)
    return SubprocessResults(returncode=process.returncode, stdout=stdout or b'', stderr=stderr or b'')
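The timeout branch above only kills the whole child tree if the child leads its own process group; presumably `DEFAULT_SUBPROCESS_CONFIG` arranges this. A minimal, POSIX-only standalone sketch of that pattern (independent of the project's actual configuration):

import os
import signal
import subprocess

# Start the child in its own session so it leads a fresh process group; on
# timeout, signal the whole group, then collect whatever output was produced.
proc = subprocess.Popen(
    'sleep 60', shell=True,
    start_new_session=True,  # same effect as preexec_fn=os.setsid
    stdout=subprocess.PIPE, stderr=subprocess.PIPE,
)
try:
    stdout, stderr = proc.communicate(timeout=1)
except subprocess.TimeoutExpired:
    os.killpg(os.getpgid(proc.pid), signal.SIGINT)  # signal the whole group
    stdout, stderr = proc.communicate()
print(proc.returncode)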
Example #5
    def _dataframe_to_table(
        self, df, table, if_exists='fail', use_hive_cli=None,
        partition=None, sep=chr(1), table_props=None, dtype_overrides=None, **kwargs
    ):
        """
        If `use_hive_cli` (or if not specified `.push_using_hive_cli`) is
        `True`, a `CREATE TABLE` statement will be automatically generated based
        on the datatypes of the DataFrame (unless overwritten by
        `dtype_overrides`). The `DataFrame` will then be exported to a CSV
        compatible with Hive and uploaded (if necessary) to the remote host,
        before being loaded into Hive via a `LOAD DATA LOCAL INPATH ...`
        statement run through the `hive` CLI executable. Note that if a table
        is not partitioned, you cannot convert it to a partitioned table
        without deleting it first.

        If `use_hive_cli` (or if not specified `.push_using_hive_cli`) is
        `False`, an attempt will be made to push the `DataFrame` to Hive using
        `pandas.DataFrame.to_sql` and the SQLAlchemy binding provided by
        `pyhive` and `impyla`. This may be slower, does not support older
        versions of Hive, and does not support table properties or partitioning.

        If the schema namespace is not specified, `table.schema` defaults to
        your username.

        Additional Args:
            use_hive_cli (bool, None): A local override for the global
                `.push_using_hive_cli` attribute. If not specified, the global
                default is used. If True, then pushes are performed using the
                `hive` CLI executable on the local/remote PATH.
            **kwargs (dict): Additional arguments to send to `pandas.DataFrame.to_sql`.

        Further parameters for the CLI method (specifying any of these for the
        pandas method will raise a `RuntimeError`):
            partition (dict): A mapping of column names to values that specify
                the partition into which the provided data should be uploaded,
                as well as providing the fields by which new tables should be
                partitioned.
            sep (str): Field delimiter for data (defaults to CTRL-A, or `chr(1)`).
            table_props (dict): Properties to set on any newly created tables
                (extends `.default_table_props`).
            dtype_overrides (dict): Mapping of column names to Hive datatypes to
                use instead of default mapping.
        """
        table = self._parse_namespaces(table, defaults={'schema': self.username})
        use_hive_cli = use_hive_cli or self.push_using_hive_cli
        partition = partition or {}
        table_props = table_props or {}
        dtype_overrides = dtype_overrides or {}

        # Try using the SQLAlchemy method
        if not use_hive_cli:
            if partition or table_props or dtype_overrides:
                raise RuntimeError(
                    "At least one of `partition` or `table_props` or "
                    "`dtype_overrides` has been specified. Setting table "
                    "properties or partition information is not supported "
                    "via the SQLAlchemy backend. If this is important, please "
                    "pass `use_hive_cli=True`, otherwise remove these values "
                    "and try again."
                )
            try:
                return _pandas.to_sql(
                    df=df, name=table.table, schema=table.schema, con=self._sqlalchemy_engine,
                    index=False, if_exists=if_exists, **kwargs
                )
            except Exception as e:
                raise RuntimeError(
                    "Push unsuccessful. Your version of Hive may be too old to "
                    "support the `INSERT` keyword. You might want to try setting "
                    "`.push_using_hive_cli = True` if your local or remote "
                    "machine has access to the `hive` CLI executable. The "
                    "original exception was: {}".format(e.args[0])
                )

        # Try using Hive CLI

        # If `partition` is specified, the associated columns must not be
        # present in the dataframe.
        assert len(set(partition).intersection(df.columns)) == 0, (
            "The dataframe to be uploaded must not have any partitioned "
            "fields. Please remove the field(s): {}."
            .format(','.join(set(partition).intersection(df.columns)))
        )

        # Save dataframe to file and send it to the remote server if necessary
        temp_dir = tempfile.mkdtemp(prefix='omniduct_hiveserver2')
        tmp_fname = os.path.join(temp_dir, 'data_{}.csv'.format(time.time()))
        logger.info('Saving dataframe to file... {}'.format(tmp_fname))
        df.fillna(r'\N').to_csv(tmp_fname, index=False, header=False,
                                sep=sep, encoding='utf-8')

        if self.remote:
            logger.info("Uploading data to remote host...")
            self.remote.upload(tmp_fname)

        # Generate create table statement.
        auto_table_props = set(self.default_table_props).difference(table_props)
        if len(auto_table_props) > 0:
            logger.warning(
                "In addition to any specified table properties, this "
                "HiveServer2Client has added the following default table "
                "properties:\n{default_props}\nTo override them, please "
                "specify overrides using: `.push(..., table_props={{...}}).`"
                .format(default_props=json.dumps({
                    prop: value for prop, value in self.default_table_props.items()
                    if prop in auto_table_props
                }, indent=True))
            )

        tblprops = self.default_table_props.copy()
        tblprops.update(table_props or {})
        cts = self._create_table_statement_from_df(
            df=df,
            table=table,
            drop=(if_exists == 'replace') and not partition,
            text=True,
            sep=sep,
            table_props=tblprops,
            partition_cols=list(partition),
            dtype_overrides=dtype_overrides
        )

        # Generate load data statement.
        partition_clause = (
            ''
            if not partition
            else 'PARTITION ({})'.format(
                ','.join("{key} = '{value}'".format(key=key, value=value) for key, value in partition.items())
            )
        )
        lds = '\nLOAD DATA LOCAL INPATH "{path}" {overwrite} INTO TABLE {table} {partition_clause};'.format(
            path=os.path.basename(tmp_fname) if self.remote else tmp_fname,
            overwrite="OVERWRITE" if if_exists == "replace" else "",
            table=table,
            partition_clause=partition_clause
        )

        # Run the create table and load data statements
        logger.info(
            "Creating hive table `{table}` if it does not "
            "already exist, and inserting the provided data{partition}."
            .format(
                table=table,
                partition=" into {}".format(partition_clause) if partition_clause else ""
            )
        )
        try:
            stmts = '\n'.join([cts, lds])
            logger.debug(stmts)
            proc = self._run_in_hivecli(stmts)
            if proc.returncode != 0:
                raise RuntimeError(proc.stderr.decode('utf-8'))
        finally:
            # Clean up files
            if self.remote:
                self.remote.execute('rm -rf {}'.format(tmp_fname))
            shutil.rmtree(temp_dir, ignore_errors=True)

        logger.info("Successfully uploaded dataframe {partition}`{table}`.".format(
            table=table,
            partition="into {} of ".format(partition_clause) if partition_clause else ""
        ))
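For orientation, a hypothetical call into this method via a public push path; the `.push(...)` entry point and the already-connected `client` are assumptions about how this private method is reached, not part of the snippet above.

import pandas as pd

# Hypothetical sketch; assumes `client` is a connected HiveServer2Client whose
# public `push` method dispatches to `_dataframe_to_table`.
df = pd.DataFrame({'user_id': [1, 2], 'country': ['US', 'KR']})

client.push(
    df,
    'analytics.user_dims',           # namespace parsed by _parse_namespaces
    if_exists='replace',
    use_hive_cli=True,               # take the `hive` CLI path
    partition={'ds': '2024-01-01'},  # partition columns must not appear in df
    table_props={'comment': 'demo'},
    dtype_overrides={'user_id': 'INT'},
)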
Example #6
def _create_table_statement_from_df(df,
                                    table,
                                    schema='default',
                                    drop=False,
                                    text=True,
                                    sep=None,
                                    loc=None):
    """
    Return create table statement for new hive table based on pandas dataframe.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Used to determine column names and types for create table statement.
    table : str
        Table name for create table statement.
    schema : str
        Schema for create table statement
    drop : bool
        Whether to include a drop table statement along with create table statement.
    text : bool
        Whether data will be stored as a text file.
    sep : str
        Field delimiter for text file (only used if text==True).
    loc : str, optional
        Desired hdfs location.

    Returns
    -------
    cmd : str
        A create table statement.
    """
    # dtype kind to hive type mapping dict.
    DTYPE_KIND_HIVE_TYPE = {
        'b': 'BOOLEAN',  # boolean
        'i': 'BIGINT',  # signed integer
        'u': 'BIGINT',  # unsigned integer
        'f': 'DOUBLE',  # floating-point
        'c': 'STRING',  # complex floating-point
        'O': 'STRING',  # object
        'S': 'STRING',  # (byte-)string
        'U': 'STRING',  # Unicode
        'V': 'STRING'  # void
    }
    sep = sep or "\t"

    # Sanitize column names and map pandas dtype kinds to Hive types.
    columns = []
    for col, dtype in df.dtypes.items():
        col_sanitized = re.sub(r'\W', '', col.lower().replace(' ', '_'))
        hive_type = DTYPE_KIND_HIVE_TYPE[dtype.kind]
        columns.append('  {column}  {type}'.format(column=col_sanitized,
                                                   type=hive_type))

    cmd = Template("""
    {% if drop %}
    DROP TABLE IF EXISTS {{ schema }}.{{ table }};
    {% endif -%}
    CREATE TABLE IF NOT EXISTS {{ schema }}.{{ table }} (
    {%- for col in columns %}
     {{ col }} {% if not loop.last %}, {% endif %}
    {%- endfor %}
    )
    {%- if text %}
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY "{{ sep }}"
    STORED AS TEXTFILE
    {% endif %}
    {%- if loc %}
    LOCATION "{{ loc }}"
    {%- endif %}
    ;
    """).render(drop=drop,
                table=table,
                schema=schema,
                columns=columns,
                text=text,
                sep=sep)

    logger.debug('Create Table Statement: {}'.format(cmd))
    return cmd
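A small illustration of the dtype-kind mapping and column-name sanitization this helper performs. The helper also relies on module-level `re`, `logger`, and jinja2's `Template`, which this sketch imports or assumes; the DataFrame is a toy example.

import pandas as pd
from jinja2 import Template  # assumed by the helper above

df = pd.DataFrame({
    'user id': [1, 2, 3],           # int64   -> BIGINT, name -> user_id
    'score': [0.5, 1.25, 3.0],      # float64 -> DOUBLE
    'country': ['US', 'KR', 'DE'],  # object  -> STRING
})

print(_create_table_statement_from_df(df, table='scores', schema='sandbox',
                                      drop=True, text=True, sep='\t'))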
Example #7
File: _schemas.py  Project: naoyak/omniduct
                logger.warn("Did not recognize type '%s' of column '%s'" %
                            (row.Type, row.Column))
                coltype = sql_types.NullType
            result.append({
                'name': row.Column,
                'type': coltype,
                # newer Presto no longer includes this column
                'nullable': getattr(row, 'Null', True),
                'default': None,
            })
        return result

    PrestoDialect.get_columns = get_columns
except ImportError:
    logger.debug(
        "Not monkey patching pyhive's PrestoDialect.get_columns due to missing dependencies."
    )


class SchemasMixin(object):
    """
    Attaches a tab-completable `.schemas` attribute to a `DatabaseClient` instance.

    It is currently implemented as a mixin rather than directly provided on the
    base class because it requires that the host `DatabaseClient` instance have a
    `sqlalchemy` metadata object handle, and not all backends support this.

    If we are willing to forgo the ability to actually make queries using the
    SQLAlchemy ORM, we could instead use an SQL-agnostic version.
    """
    @property
Example #8
                coltype = type_map[row.Type]
            except KeyError:
                logger.warn("Did not recognize type '%s' of column '%s'" % (row.Type, row.Column))
                coltype = sql_types.NullType
            result.append({
                'name': row.Column,
                'type': coltype,
                # newer Presto no longer includes this column
                'nullable': getattr(row, 'Null', True),
                'default': None,
            })
        return result

    PrestoDialect.get_columns = get_columns
except ImportError:
    logger.debug("Not monkey patching pyhive's PrestoDialect.get_columns due to missing dependencies.")


class SchemasMixin(object):

    @property
    def schemas(self):
        """
        This object has as attributes the schemas on the current catalog. These
        schema objects in turn have the tables as SQLAlchemy `Table` objects.
        This allows tab completion and interactive exploration of databases.
        """
        from werkzeug import LocalProxy

        def get_schemas():
            if not getattr(self, '_schemas', None):
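The snippet is truncated here, but the visible lines set up a lazily-evaluated attribute. As a rough, hypothetical sketch of that pattern (simplified, not the project's actual implementation): a property builds the expensive schemas object on first access, caches it on the instance, and hands back a `werkzeug.local.LocalProxy` so the work is deferred until the attribute is actually used.

from werkzeug.local import LocalProxy

class LazySchemasSketch(object):
    # Hypothetical, simplified version of the pattern above.
    @property
    def schemas(self):
        def get_schemas():
            if not getattr(self, '_schemas', None):
                self._schemas = self._load_schemas()  # hypothetical expensive call
            return self._schemas
        return LocalProxy(get_schemas)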