def _port_forward_start(self, local_port, remote_host, remote_port):
    """
    Start forwarding a local port to a remote host/port over this client's
    SSH transport.

    Parameters
    ----------
    local_port : int
        The local port to bind the forwarding server to.
    remote_host : str
        The host (as resolvable from the SSH server) to forward traffic to.
    remote_port : int
        The port on `remote_host` to forward traffic to.

    Returns
    -------
    The forwarding server returned by `forward_tunnel`, or None if setup
    was interrupted with Ctrl-C.
    """
    logger.debug('Now forwarding port {} to {}:{} ...'.format(local_port, remote_host, remote_port))
    # BUGFIX: `server` was previously unbound when a KeyboardInterrupt
    # occurred, causing `return server` to raise a NameError.
    server = None
    try:
        server = forward_tunnel(local_port, remote_host, remote_port, self.__client.get_transport())
    except KeyboardInterrupt:
        print('C-c: Port forwarding stopped.')
    return server
def get(self, id_duct, id_str, deserializer=pickle.load):
    """
    Fetch a locally cached value, or None when no entry exists.

    Parameters
    ----------
    id_duct : str
        Identifier of the duct whose cache is being queried.
    id_str : str
        Key of the cache entry within that duct's cache.
    deserializer : callable
        Called with the open binary file handle to reconstruct the cached
        object; defaults to `pickle.load`.

    Returns
    -------
    The deserialized cache entry, or None if the cache file is absent.
    """
    entry_path = self.get_path(id_duct, id_str)
    if os.path.exists(entry_path):
        with open(entry_path, 'rb') as fh:
            logger.debug("Loading local cache entry from '{}'.".format(entry_path))
            return deserializer(fh)
    return None
def run_in_subprocess(cmd, check_output=False, **kwargs):
    """
    Execute command using default subprocess configuration.

    Parameters
    ----------
    cmd : string
        Command to be executed in subprocess.
    check_output : bool
        Whether stdout/stderr are being captured by the caller's
        configuration; if False, output is inherited (at verbose logging
        levels) or discarded.
    kwargs : keywords
        Options to pass to subprocess.Popen.

    Returns
    -------
    proc : SubprocessResults
        Results of the subprocess run (returncode, stdout, stderr as bytes).
    """
    logger.debug('Executing command: {0}'.format(cmd))
    config = DEFAULT_SUBPROCESS_CONFIG.copy()
    config.update(kwargs)
    if not check_output:
        if omniduct_config.logging_level < 20:
            # Verbose logging: let the child inherit our stdout/stderr.
            config['stdout'] = None
            config['stderr'] = None
        else:
            # BUGFIX: use subprocess.DEVNULL instead of open(os.devnull, 'w'),
            # which leaked a file handle per call (never closed).
            config['stdout'] = subprocess.DEVNULL
            config['stderr'] = subprocess.DEVNULL
    timeout = config.pop('timeout', None)

    process = subprocess.Popen(cmd, **config)
    try:
        stdout, stderr = process.communicate(None, timeout=timeout)
    except subprocess.TimeoutExpired:
        # Signal the process group, recursively killing all children.
        os.killpg(os.getpgid(process.pid), signal.SIGINT)
        output, unused_err = process.communicate()
        raise subprocess.TimeoutExpired(process.args, timeout, output=output)
    # BUGFIX: communicate() returns bytes (no text mode set), so the empty
    # fallbacks must be b'' rather than '' to keep the result type uniform
    # (consistent with the other copy of this function in this file).
    return SubprocessResults(returncode=process.returncode, stdout=stdout or b'', stderr=stderr or b'')
def run_in_subprocess(cmd, check_output=False, **kwargs):
    """
    Execute command using default subprocess configuration.

    Parameters
    ----------
    cmd : string
        Command to be executed in subprocess.
    check_output : bool
        Whether stdout/stderr are being captured by the caller's
        configuration; if False, output is inherited (at verbose logging
        levels) or discarded.
    kwargs : keywords
        Options to pass to subprocess.Popen.

    Returns
    -------
    proc : SubprocessResults
        Results of the subprocess run (returncode, stdout, stderr as bytes).
    """
    logger.debug('Executing command: {0}'.format(cmd))
    config = DEFAULT_SUBPROCESS_CONFIG.copy()
    config.update(kwargs)
    if not check_output:
        if omniduct_config.logging_level < 20:
            # Verbose logging: let the child inherit our stdout/stderr.
            config['stdout'] = None
            config['stderr'] = None
        else:
            # BUGFIX: use subprocess.DEVNULL instead of open(os.devnull, 'w'),
            # which leaked a file handle per call (never closed).
            config['stdout'] = subprocess.DEVNULL
            config['stderr'] = subprocess.DEVNULL
    timeout = config.pop('timeout', None)

    process = subprocess.Popen(cmd, **config)
    try:
        stdout, stderr = process.communicate(None, timeout=timeout)
    except subprocess.TimeoutExpired:
        # Send signal to the process group, recursively killing all children.
        os.killpg(os.getpgid(process.pid), signal.SIGINT)
        output, unused_err = process.communicate()
        raise subprocess.TimeoutExpired(process.args, timeout, output=output)
    return SubprocessResults(returncode=process.returncode, stdout=stdout or b'', stderr=stderr or b'')
def _dataframe_to_table(
        self, df, table, if_exists='fail', use_hive_cli=None,
        partition=None, sep=chr(1), table_props=None,
        dtype_overrides=None, **kwargs
):
    """
    Push a pandas DataFrame into a Hive table, via the `hive` CLI or
    SQLAlchemy.

    If `use_hive_cli` (or if not specified `.push_using_hive_cli`) is
    `True`, a `CREATE TABLE` statement will be automatically generated based
    on the datatypes of the DataFrame (unless overwritten by
    `dtype_overrides`). The `DataFrame` will then be exported to a CSV
    compatible with Hive and uploaded (if necessary) to the remote, before
    being loaded into Hive using a `LOAD DATA LOCAL INPATH ...` query using
    the `hive` cli executable.

    Note that if a table is not partitioned, you cannot convert it to a
    partitioned table without deleting it first.

    If `use_hive_cli` (or if not specified `.push_using_hive_cli`) is
    `False`, an attempt will be made to push the `DataFrame` to Hive using
    `pandas.DataFrame.to_sql` and the SQLAlchemy binding provided by
    `pyhive` and `impyla`. This may be slower, does not support older
    versions of Hive, and does not support table properties or partitioning.

    If the schema namespace is not specified, `table.schema` will be
    defaulted to your username.

    Additional Args:
        use_hive_cli (bool, None): A local override for the global
            `.push_using_hive_cli` attribute. If not specified, the global
            default is used. If True, then pushes are performed using the
            `hive` CLI executable on the local/remote PATH.
        **kwargs (dict): Additional arguments to send to
            `pandas.DataFrame.to_sql`.

    Further Parameters for CLI method (specifying these for the pandas
    method will cause a `RuntimeError` exception):
        partition (dict): A mapping of column names to values that specify
            the partition into which the provided data should be uploaded,
            as well as providing the fields by which new tables should be
            partitioned.
        sep (str): Field delimiter for data (defaults to CTRL-A, or
            `chr(1)`).
        table_props (dict): Properties to set on any newly created tables
            (extends `.default_table_props`).
        dtype_overrides (dict): Mapping of column names to Hive datatypes
            to use instead of default mapping.
    """
    # Default the schema namespace to the current username when unspecified.
    table = self._parse_namespaces(table, defaults={'schema': self.username})
    use_hive_cli = use_hive_cli or self.push_using_hive_cli
    partition = partition or {}
    table_props = table_props or {}
    dtype_overrides = dtype_overrides or {}

    # Try using SQLALchemy method
    if not use_hive_cli:
        # The SQLAlchemy path cannot express partitions, table properties
        # or dtype overrides, so reject those up front with a clear error.
        if partition or table_props or dtype_overrides:
            raise RuntimeError(
                "At least one of `partition` or `table_props` or "
                "`dtype_overrides` has been specified. Setting table "
                "properties or partition information is not supported "
                "via the SQLAlchemy backend. If this is important, please "
                "pass `use_hive_cli=True`, otherwise remove these values "
                "and try again."
            )
        try:
            return _pandas.to_sql(
                df=df, name=table.table, schema=table.schema,
                con=self._sqlalchemy_engine, index=False,
                if_exists=if_exists, **kwargs
            )
        except Exception as e:
            raise RuntimeError(
                "Push unsuccessful. Your version of Hive may be too old to "
                "support the `INSERT` keyword. You might want to try setting "
                "`.push_using_hive_cli = True` if your local or remote "
                "machine has access to the `hive` CLI executable. The "
                "original exception was: {}".format(e.args[0])
            )

    # Try using Hive CLI

    # If `partition` is specified, the associated columns must not be
    # present in the dataframe.
    assert len(set(partition).intersection(df.columns)) == 0, "The dataframe to be uploaded must not have any partitioned fields. Please remove the field(s): {}.".format(','.join(set(partition).intersection(df.columns)))

    # Save dataframe to file and send it to the remote server if necessary
    temp_dir = tempfile.mkdtemp(prefix='omniduct_hiveserver2')
    tmp_fname = os.path.join(temp_dir, 'data_{}.csv'.format(time.time()))
    logger.info('Saving dataframe to file... {}'.format(tmp_fname))
    # NaNs are written as \N, Hive's default NULL marker for text tables.
    df.fillna(r'\N').to_csv(tmp_fname, index=False, header=False, sep=sep, encoding='utf-8')

    if self.remote:
        logger.info("Uploading data to remote host...")
        self.remote.upload(tmp_fname)

    # Generate create table statement.
    auto_table_props = set(self.default_table_props).difference(table_props)
    if len(auto_table_props) > 0:
        # Make implicit defaults visible so the user knows how to override.
        logger.warning(
            "In addition to any specified table properties, this "
            "HiveServer2Client has added the following default table "
            "properties:\n{default_props}\nTo override them, please "
            "specify overrides using: `.push(..., table_props={{...}}).`"
            .format(default_props=json.dumps({
                prop: value
                for prop, value in self.default_table_props.items()
                if prop in auto_table_props
            }, indent=True))
        )

    tblprops = self.default_table_props.copy()
    tblprops.update(table_props or {})
    cts = self._create_table_statement_from_df(
        df=df, table=table,
        # Only drop when replacing an unpartitioned table; partitioned
        # replaces overwrite the target partition instead.
        drop=(if_exists == 'replace') and not partition,
        text=True, sep=sep, table_props=tblprops,
        partition_cols=list(partition),
        dtype_overrides=dtype_overrides
    )

    # Generate load data statement.
    partition_clause = (
        '' if not partition
        else 'PARTITION ({})'.format(
            ','.join("{key} = '{value}'".format(key=key, value=value) for key, value in partition.items())
        )
    )
    # When a remote is in play, the upload lands in the remote working
    # directory, so only the basename is used in the LOAD DATA path.
    lds = '\nLOAD DATA LOCAL INPATH "{path}" {overwrite} INTO TABLE {table} {partition_clause};'.format(
        path=os.path.basename(tmp_fname) if self.remote else tmp_fname,
        overwrite="OVERWRITE" if if_exists == "replace" else "",
        table=table,
        partition_clause=partition_clause
    )

    # Run create table statement and load data statments
    logger.info(
        "Creating hive table `{table}` if it does not "
        "already exist, and inserting the provided data{partition}."
        .format(
            table=table,
            partition=" into {}".format(partition_clause) if partition_clause else ""
        )
    )
    try:
        stmts = '\n'.join([cts, lds])
        logger.debug(stmts)
        proc = self._run_in_hivecli(stmts)
        if proc.returncode != 0:
            raise RuntimeError(proc.stderr.decode('utf-8'))
    finally:
        # Clean up files
        if self.remote:
            self.remote.execute('rm -rf {}'.format(tmp_fname))
        shutil.rmtree(temp_dir, ignore_errors=True)

    logger.info("Successfully uploaded dataframe {partition}`{table}`.".format(
        table=table,
        partition="into {} of ".format(partition_clause) if partition_clause else ""
    ))
def _create_table_statement_from_df(df, table, schema='default', drop=False,
                                    text=True, sep=None, loc=None):
    """
    Return create table statement for new hive table based on pandas dataframe.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Used to determine column names and types for create table statement.
    table : str
        Table name for create table statement.
    schema : str
        Schema for create table statement
    drop : bool
        Whether to include a drop table statement along with create table statement.
    text : bool
        Whether data will be stored as a text file.
    sep : str
        Field delimiter for text file (only used if text==True).
    loc : str, optional
        Desired hdfs location.

    Returns
    -------
    cmd : str
        A create table statement.
    """
    # dtype kind to hive type mapping dict.
    DTYPE_KIND_HIVE_TYPE = {
        'b': 'BOOLEAN',  # boolean
        'i': 'BIGINT',   # signed integer
        'u': 'BIGINT',   # unsigned integer
        'f': 'DOUBLE',   # floating-point
        'c': 'STRING',   # complex floating-point
        'O': 'STRING',   # object
        'S': 'STRING',   # (byte-)string
        'U': 'STRING',   # Unicode
        'V': 'STRING'    # void
    }

    sep = sep or "\t"

    # Sanitize column names and map data types to hive types.
    columns = []
    # BUGFIX: `.iteritems()` was removed from pandas Series in pandas 2.x;
    # `.items()` is the equivalent, long-supported API.
    for col, dtype in df.dtypes.items():
        # BUGFIX: use a raw string for the regex; '\W' in a plain string is
        # an invalid escape sequence (DeprecationWarning, future error).
        col_sanitized = re.sub(r'\W', '', col.lower().replace(' ', '_'))
        hive_type = DTYPE_KIND_HIVE_TYPE[dtype.kind]
        columns.append('  {column}  {type}'.format(column=col_sanitized, type=hive_type))

    cmd = Template("""
    {% if drop %}
    DROP TABLE IF EXISTS {{ schema }}.{{ table }};
    {% endif -%}
    CREATE TABLE IF NOT EXISTS {{ schema }}.{{ table }} (
        {%- for col in columns %}
        {{ col }} {% if not loop.last %}, {% endif %}
        {%- endfor %}
    )
    {%- if text %}
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY "{{ sep }}"
    STORED AS TEXTFILE
    {% endif %}
    {%- if loc %}
    LOCATION "{{ loc }}"
    {%- endif %}
    ;
    """).render(drop=drop, table=table, schema=schema, columns=columns, text=text, sep=sep)

    logger.debug('Create Table Statement: {}'.format(cmd))
    return cmd
logger.warn("Did not recognize type '%s' of column '%s'" % (row.Type, row.Column)) coltype = sql_types.NullType result.append({ 'name': row.Column, 'type': coltype, # newer Presto no longer includes this column 'nullable': getattr(row, 'Null', True), 'default': None, }) return result PrestoDialect.get_columns = get_columns except ImportError: logger.debug( "Not monkey patching pyhive's PrestoDialect.get_columns due to missing dependencies." ) class SchemasMixin(object): """ Attaches a tab-completable `.schemas` attribute to a `DatabaseClient` instance. It is currently implemented as a mixin rather than directly provided on the base class because it requires that the host `DatabaseClient` instance have a `sqlalchemy` metadata object handle, and not all backends support this. If we are willing to forgo the ability to actually make queries using the SQLAlchemy ORM, we could instead use an SQL agnostic version. """ @property
coltype = type_map[row.Type] except KeyError: logger.warn("Did not recognize type '%s' of column '%s'" % (row.Type, row.Column)) coltype = sql_types.NullType result.append({ 'name': row.Column, 'type': coltype, # newer Presto no longer includes this column 'nullable': getattr(row, 'Null', True), 'default': None, }) return result PrestoDialect.get_columns = get_columns except ImportError: logger.debug("Not monkey patching pyhive's PrestoDialect.get_columns due to missing dependencies.") class SchemasMixin(object): @property def schemas(self): """ This object has as attributes the schemas on the current catalog. These schema objects in turn have the tables as SQLAlchemy `Table` objects. This allows tab completion and exploration of Databases. """ from werkzeug import LocalProxy def get_schemas(): if not getattr(self, '_schemas', None):