def render_statement(method, self, statement, *args, **kwargs):
    """
    Normalise `statement` and then hand it to the wrapped `method`.

    SQLAlchemy executable expressions (when SQLAlchemy is importable) are
    compiled to a plain string with literal binds. If the caller passes a
    truthy `template` keyword, the statement is additionally rendered through
    `self.render_template`, using the default template context of `self`
    overlaid with the optional `template_context` keyword.

    Parameters:
        method (callable): The wrapped method, invoked as
            `method(self, statement, *args, **kwargs)`.
        self (object): The instance on which `method` is being called.
        statement (object): The statement to pre-process.
        *args: Forwarded unchanged to `method`.
        **kwargs: Forwarded to `method`, minus `template` (always consumed)
            and `template_context` (consumed only when templating is on).

    Returns:
        object: Whatever `method` returns.
    """
    # SQLAlchemy is optional; without it the statement is used as passed.
    try:
        from sqlalchemy.sql.base import Executable
        if isinstance(statement, Executable):
            compiled = statement.compile(compile_kwargs={"literal_binds": True})
            statement = str(compiled)
    except ImportError:
        pass

    # Templating disabled: nothing more to do.
    if not kwargs.pop('template', False):
        return method(self, statement, *args, **kwargs)

    defaults = self._template_context
    merged_context = dict(defaults)
    overrides = kwargs.pop('template_context', {})
    merged_context.update(overrides)

    # Let the user know when a local key shadows a default one.
    clashes = set(defaults.keys()) & set(overrides.keys())
    if clashes:
        logger.warning(
            "The following default template context keys have been overridden "
            "by the local context: {}.".format(clashes))

    statement = self.render_template(statement, merged_context)
    return method(self, statement, *args, **kwargs)
def _connect(self):
    """
    Open a `pydruid` DB-API connection to the Druid SQL endpoint.

    The connection targets `self.host`:`self.port` over plain HTTP at the
    path `/druid/v2/sql/` and is stored on the instance. Any configured
    username/password is not forwarded; a warning is logged instead.
    """
    from pydruid.db import connect as open_druid_connection

    logger.info('Connecting to Druid database ...')

    endpoint_options = {'path': '/druid/v2/sql/', 'scheme': 'http'}
    self.__druid = open_druid_connection(self.host, self.port, **endpoint_options)

    has_credentials = bool(self.username or self.password)
    if has_credentials:
        logger.warning(
            'Duct username and password not passed to pydruid connection. '
            'pydruid connection currently does not allow these fields to be passed.'
        )
def _connect(self):
    """
    Open a `pydruid` DB-API connection to the Druid SQL endpoint.

    Connects to `self.host`:`self.port` over HTTP using the SQL path
    `/druid/v2/sql/` and stores the resulting connection on the instance.
    If a username or password has been configured, a warning is logged
    because these credentials are not passed on to `pydruid`.
    """
    from pydruid.db import connect

    logger.info('Connecting to Druid database ...')
    self.__druid = connect(self.host, self.port, path='/druid/v2/sql/', scheme='http')

    if self.username or self.password:
        # Message typo fixed ("passowrd" -> "password").
        logger.warning(
            'Duct username and password not passed to pydruid connection. '
            'pydruid connection currently does not allow these fields to be passed.'
        )
def _connect(self):
    """
    Create the SQLAlchemy engine and metadata objects for this client.

    The engine is built from `self.db_uri` with `self.engine_opts` as extra
    keyword arguments. For any protocol other than `mysql`, a warning is
    logged that some client operations may not behave as expected.
    """
    import sqlalchemy

    fully_supported_protocols = ('mysql',)
    if self.protocol not in fully_supported_protocols:
        logger.warning("While querying and executing should work as "
                       "expected, some operations on this database client "
                       "(such as listing tables, querying to tables, etc) "
                       "may not function as expected due to the backend "
                       "not supporting ANSI SQL.")

    engine = sqlalchemy.create_engine(self.db_uri, **self.engine_opts)
    self.engine = engine
    self._sqlalchemy_metadata = sqlalchemy.MetaData(self.engine)
def get_schemas():
    """
    Lazily build and return the `Schemas` helper for the client.

    NOTE(review): `self` is a free variable here, presumably bound by an
    enclosing method that is not visible in this chunk.

    On first use (or while `_schemas` is unset/falsy) the client is connected
    and a `Schemas` object is created from `self._sqlalchemy_metadata` and
    cached on `self._schemas`.

    Returns:
        Schemas or None: The cached `Schemas` object, or `None` if `Schemas`
            could not be imported (a warning is logged in that case).
    """
    if not getattr(self, '_schemas', None):
        self.connect()
        try:
            from .schemas import Schemas
            self._schemas = Schemas(self._sqlalchemy_metadata)
        except ImportError:
            logger.warning(
                'cannot import Schemas, perhaps sqlalchemy is not up to date'
            )
    # `_schemas` is never assigned when the import above fails, so a plain
    # attribute access would raise AttributeError right after the warning.
    return getattr(self, '_schemas', None)
def format_dump(self, data):
    """
    Convert raw row data into a `pandas.DataFrame`.

    Parameters:
        data (iterable): Row data accepted by the `pandas.DataFrame`
            constructor; columns are labelled with `self.column_names`.

    Returns:
        pandas.DataFrame: The data as a dataframe, with `self.date_fields`
            parsed as dates (best effort) and `self.index_fields` set as the
            index, whenever those attributes are not `None`.
    """
    import pandas as pd

    df = pd.DataFrame(data=data, columns=self.column_names)

    if self.date_fields is not None:
        try:
            # Private pandas helper: may be missing or behave differently
            # between pandas versions, hence the best-effort handling.
            df = pd.io.sql._parse_date_columns(df, self.date_fields)
        except Exception as e:
            # A space was missing between the two sentences of this message.
            logger.warning(
                'Unable to parse date columns. Perhaps your version of pandas is outdated. '
                'Original error message was: {}: {}'.format(e.__class__.__name__, str(e))
            )

    if self.index_fields is not None:
        df = df.set_index(self.index_fields)

    return df
def wrapped(method, self, *args, **kwargs):
    """
    Call `method` through the configured cache.

    All call arguments are first normalised into keyword arguments. The cache
    settings (key, namespace, cache object, whether to use it, whether to
    renew, serializer and metadata) are then resolved from `self` and those
    keyword arguments by the resolver callables of the enclosing scope.

    Returns:
        object: The direct result of `method` when caching is disabled; the
            value read back from the cache on a hit or after a successful
            store; the freshly computed value if storing failed and
            `config.cache_fail_hard` is falsy; or `None` (nothing is cached)
            if `method` returned `None`.

    Raises:
        Exception: Whatever `_cache.set` raised, if storing failed and
            `config.cache_fail_hard` is truthy.
    """
    kwargs = function_args_as_kwargs(method, self, *args, **kwargs)
    kwargs.pop('self')

    _key = key(self, kwargs)
    _namespace = namespace(self, kwargs)
    _cache = cache(self, kwargs)
    _use_cache = use_cache(self, kwargs)
    _renew = renew(self, kwargs)
    _serializer = serializer(self, kwargs)
    _metadata = metadata(self, kwargs)

    if _cache is None or not _use_cache:
        return method(self, **kwargs)

    if _renew or not _cache.has_key(_key, namespace=_namespace):  # noqa: has_key is not of a dictionary here
        value = method(self, **kwargs)
        if value is None:
            logger.warning("Method value returned None. Not saving to cache.")
            return None

        try:
            _cache.set(
                _key,
                value=value,
                namespace=_namespace,
                serializer=_serializer,
                metadata=_metadata
            )
        except Exception:
            # `Exception` rather than a bare `except` so that
            # KeyboardInterrupt/SystemExit are never swallowed here.
            logger.warning("Failed to save results to cache. If needed, please save them manually.")
            if config.cache_fail_hard:
                raise
            # Nothing was stored, so reading back from the cache cannot work.
            # As a last resort return the value itself (which may have been
            # mutated by the attempted serialization).
            return value
    else:
        logger.caveat('Loaded from cache')

    # Return from cache every time, just in case serialization operation was
    # destructive (e.g. reading from cursors)
    return _cache.get(
        _key,
        namespace=_namespace,
        serializer=_serializer
    )
def naive_load_balancer(hosts, port):
    """
    Pick a random reachable host from `hosts`.

    Hosts are tried in random order; the first one whose port accepts a
    connection (as reported by `is_port_bound`, with a 1 second timeout) is
    returned. Unreachable hosts are logged and skipped.

    Parameters:
        hosts (iterable of str): Candidate hosts, each either `host` or
            `host:port`. The input is not modified.
        port (int): The port to probe for hosts that do not specify one.

    Returns:
        str: The selected entry of `hosts`, exactly as it was passed in.

    Raises:
        RuntimeError: If no host is reachable (including when `hosts` is
            empty).
    """
    # Shuffle a copy of the hosts; `list(...)` also works on Python 2 lists
    # (which have no `.copy()`) and on other iterables.
    hosts = list(hosts)
    random.shuffle(hosts)

    # Check if host is available and if so return it
    pattern = re.compile(r'(?P<host>[^\:]+)(?::(?P<port>[0-9]{1,5}))?')
    for host in hosts:
        m = pattern.match(host)
        if is_port_bound(m.group('host'), int(m.group('port') or port), timeout=1):
            return host
        logger.warning("Avoiding down or inaccessible host: '{}'.".format(host))

    # No `.format(host)` here: the message has no placeholder, and `host` is
    # unbound when `hosts` is empty (which used to raise UnboundLocalError).
    raise RuntimeError(
        "Unable to connect to any of the hosts associated with this service. "
        "This may be due to networking issues, such as not being connected to "
        "the internet or your company's VPN."
    )
def naive_load_balancer(hosts, port):
    """
    Return the first reachable host out of a randomly ordered copy of `hosts`.

    Parameters:
        hosts (list of str): Candidate hosts, each either `host` or
            `host:port`. The list itself is left untouched.
        port (int): Fallback port for entries that do not carry their own.

    Returns:
        str: The chosen entry of `hosts`, as originally passed in.

    Raises:
        RuntimeError: If none of the hosts could be reached.
    """
    host_spec = re.compile(r'(?P<host>[^\:]+)(?::(?P<port>[0-9]{1,5}))?')

    # Randomise the probing order on a copy of the caller's list.
    candidates = hosts.copy()
    random.shuffle(candidates)

    for candidate in candidates:
        parts = host_spec.match(candidate)
        hostname = parts.group('host')
        target_port = int(parts.group('port') or port)
        if not is_port_bound(hostname, target_port, timeout=1):
            logger.warning("Avoiding down or inaccessible host: '{}'.".format(candidate))
            continue
        return candidate

    raise RuntimeError(
        "Unable to connect to any of the hosts associated with this service. "
        "This may be due to networking issues, such as not being connected to "
        "the internet or your company's VPN."
    )
def check_dependencies(protocols, message=None):
    """
    Verify that the optional dependencies of the given protocols are present.

    Requirements are looked up per protocol in `__optional_dependencies__`.
    Requirements that are installed in a conflicting version, or that are
    importable without being installed as a distribution while a version was
    requested, only produce a logged warning. Requirements that can neither
    be found nor imported cause a `RuntimeError`.

    Parameters:
        protocols (iterable of str, None): The protocols to check. If `None`,
            nothing is checked.
        message (str, None): Custom first paragraph for the error raised when
            dependencies are missing. A default is used if not specified.

    Raises:
        RuntimeError: If at least one dependency is missing. The message
            includes a suggested `pip install` command.
    """
    if protocols is None:
        return

    dependencies = []
    for protocol in protocols:
        dependencies.extend(__optional_dependencies__.get(protocol, []))

    missing_deps = []
    warning_deps = {}

    for dep in dependencies:
        m = re.match('^[a-z_][a-z0-9]*', dep)
        if not m:
            logger.warning('Invalid dependency requested: {}'.format(dep))
            # Without a package name nothing below can be checked.
            continue

        package_name = m.group(0)
        accept_any_version = package_name == dep

        try:
            pkg_resources.get_distribution(dep)
        except VersionConflict:
            warning_deps[dep] = "{}=={}".format(
                package_name, pkg_resources.get_distribution(package_name).version)
        except Exception:
            # Some packages may be available, but not installed. If so, we
            # should accept them with warnings (if version specified in dep).
            try:
                importlib.import_module(package_name)
            except Exception:  # ImportError in python 2, ModuleNotFoundError in Python 3
                missing_deps.append(dep)
            else:
                if not accept_any_version:
                    # `warning_deps` is a dict; the former `.append` call
                    # raised and got the package reported as missing.
                    warning_deps[dep] = '{}==<not installed>'.format(package_name)

    if warning_deps:
        # Kept separate from `message` so a caller-supplied message is not
        # overwritten; one line per package.
        outdated_lines = [
            '\t- Want {}, found {}'.format(key, warning_deps[key])
            for key in sorted(warning_deps)
        ]
        logger.warning("You may have some outdated packages:\n" + '\n'.join(outdated_lines))

    if missing_deps:
        message = message or "Whoops! You do not seem to have all the dependencies required."
        fix = ("You can fix this by running:\n\n"
               "\t{install_command}\n\n"
               "Note: Depending on your system's installation of Python, you may "
               "need to use `pip2` or `pip3` instead of `pip`.").format(
                   install_command='pip install --upgrade ' + ' '.join(missing_deps))
        raise RuntimeError('\n\n'.join([message, fix]))
def wrapped(method, self, *args, **kwargs):
    """
    Call `method` through the configured cache.

    Positional arguments are first mapped onto the parameter names of
    `method` (skipping the leading `self`) so that the call is described
    entirely by keyword arguments. Cache settings are then resolved from
    `self` and those keyword arguments by the resolver callables of the
    enclosing scope.

    Returns:
        object: The direct result of `method` when caching is disabled or on
            a cache miss/renewal; otherwise the value loaded from the cache.

    Raises:
        Exception: Whatever `_cache.set` raised, if storing the value failed
            and `config.cache_fail_hard` is truthy.
    """
    # Resolve the parameter names of `method` to convert args into kwargs.
    if six.PY3 and not hasattr(sys, 'pypy_version_info'):
        param_names = list(inspect.signature(method).parameters.keys())
    else:
        param_names = list(inspect.getargspec(method).args)
    for param_name, positional_value in zip(param_names[1:], args):
        kwargs[param_name] = positional_value

    _cache = cache(self)
    _use_cache = use_cache(self, kwargs)
    _renew = renew(self, kwargs)
    _format = format(self, kwargs)

    caching_enabled = _cache is not None and _use_cache
    if not caching_enabled:
        return method(self, **kwargs)

    _id_duct = id_duct(self, kwargs)
    _id_str = id_str(self, kwargs)

    cache_hit = not _renew and _cache.has_key(_id_duct, _id_str)  # noqa: has_key is not of a dictionary here
    if cache_hit:
        logger.caveat('Loaded from cache')
        return _cache.get(
            id_duct=_id_duct,
            id_str=_id_str,
            deserializer=deserializer(_format)
        )

    # Cache miss (or forced renewal): compute, then try to store the result.
    value = method(self, **kwargs)
    try:
        _cache.set(
            id_duct=_id_duct,
            id_str=_id_str,
            value=value,
            serializer=serializer(_format)
        )
    except Exception:
        # Remove any lingering (perhaps partial) cache files
        _cache.clear(id_duct=_id_duct, id_str=_id_str)
        logger.warning(
            "Failed to save results to cache. If needed, please save them manually."
        )
        if config.cache_fail_hard:
            raise
    return value
def check_dependencies(protocols, message=None):
    """
    Verify that the optional dependencies of the given protocols are present.

    Requirements are looked up per protocol in `__optional_dependencies__`.
    Requirements installed in a conflicting version only produce a logged
    warning; requirements that cannot be resolved at all cause a
    `RuntimeError`.

    Parameters:
        protocols (iterable of str, None): The protocols to check. If `None`,
            nothing is checked.
        message (str, None): Custom first paragraph for the error raised when
            dependencies are missing. A default is used if not specified.

    Raises:
        RuntimeError: If at least one dependency is missing. The message
            includes a suggested `pip install` command.
    """
    if protocols is None:
        return

    dependencies = []
    for protocol in protocols:
        dependencies.extend(__optional_dependencies__.get(protocol, []))

    missing_deps = []
    warning_deps = {}

    for dep in dependencies:
        try:
            pkg_resources.get_distribution(dep)
        except VersionConflict:
            m = re.match('^[a-z_][a-z0-9]*', dep)
            if m:
                warning_deps[dep] = "{}=={}".format(
                    m.group(0),
                    pkg_resources.get_distribution(m.group(0)).version)
            else:
                # `m` is None in this branch, so report the raw requirement
                # (the former `m.group(0)` raised AttributeError here).
                logger.warning("Could not find distribution for '{}'.".format(dep))
        except Exception:
            missing_deps.append(dep)

    if warning_deps:
        # Kept separate from `message` so a caller-supplied message is not
        # overwritten; one line per package.
        outdated_lines = [
            '\t- Want {}, found {}'.format(key, warning_deps[key])
            for key in sorted(warning_deps)
        ]
        logger.warning("You may have some outdated packages:\n" + '\n'.join(outdated_lines))

    if missing_deps:
        message = message or "Whoops! You do not seem to have all the dependencies required."
        fix = (
            "You can fix this by running:\n\n"
            "\t{install_command}\n\n"
            "Note: Depending on your system's installation of Python, you may "
            "need to use `pip2` or `pip3` instead of `pip`.").format(
                install_command='pip install --upgrade ' + ' '.join(missing_deps))
        raise RuntimeError('\n\n'.join([message, fix]))
def template_render(self, name_or_statement, context=None, by_name=False):
    """
    Render a template, given either its registered name or its body.

    On top of the regular `jinja2` templating syntax (see the official
    `jinja2` documentation), a meta-templating extension lets a template
    reference other named templates. For example, with two SQL templates
    named 'template_a' and 'template_b', both can be combined into a single
    query using (for example):

    ```
    .template_render('''
    WITH
        a AS (
            {{{template_a}}}
        ),
        b AS (
            {{{template_b}}}
        )
    SELECT *
    FROM a
    JOIN b ON a.x = b.x
    ''')
    ```

    Substitution of named templates is iterative, so embedded templates may
    themselves embed further templates, provided that the embedding is not
    recursive.

    Parameters:
        name_or_statement (str): The name of a template (if `by_name` is
            True) or else a string representation of a `jinja2` template.
        context (dict, None): A dictionary to use as the template context.
            If not specified, an empty dictionary is used.
        by_name (bool): `True` if `name_or_statement` should be interpreted
            as a template name, or `False` (default) if it should be
            interpreted as a template body.

    Returns:
        str: The rendered template.

    Raises:
        ValueError: If `by_name` is True and no such template is registered.
    """
    # Resolve the template body.
    if not by_name:
        statement = name_or_statement
    elif name_or_statement in self._templates:
        statement = self._templates[name_or_statement]
    else:
        raise ValueError("No such template of name: '{}'.".format(
            name_or_statement))

    # SQLAlchemy is optional; expressions are compiled to plain SQL text.
    try:
        from sqlalchemy.sql.base import Executable
        if isinstance(statement, Executable):
            compiled = statement.compile(compile_kwargs={"literal_binds": True})
            statement = str(compiled)
    except ImportError:
        pass

    if context is None or context is False:
        context = {}

    # Local context takes precedence over the default context.
    defaults = self._template_context
    template_context = dict(defaults)
    template_context.update(context)

    overridden = set(defaults.keys()) & set(context.keys())
    if overridden:
        logger.warning(
            "The following default template context keys have been overridden "
            "by the local context: {}.".format(overridden))

    # Substitute in any other named statements until none are left.
    meta_syntax = dict(
        block_start_string='{{%',
        block_end_string='%}}',
        variable_start_string='{{{',
        variable_end_string='}}}',
        comment_start_string='{{#',
        comment_end_string='#}}',
        undefined=StrictUndefined,
    )
    while '{{{' in statement or '{{%' in statement:
        meta_template = Template(statement, **meta_syntax)
        statement = meta_template.render(getattr(self, '_templates', {}))

    final_template = Template(statement, undefined=StrictUndefined)
    return final_template.render(template_context)
def _create_table_statement_from_df(cls, df, table, drop=False, text=True, sep=chr(1), loc=None,
                                    table_props=None, partition_cols=None, dtype_overrides=None):
    """
    Return create table statement for new hive table based on pandas dataframe.

    Args:
        df (pandas.DataFrame): Used to determine column names and types for
            the create table statement.
        table (ParsedNamespaces): The parsed name of the target table.
        drop (bool): Whether to include a drop table statement before the
            create table statement.
        text (bool): Whether data will be stored as a textfile.
        sep (str): The separator used by the text data store (defaults to
            CTRL-A, i.e. `chr(1)`, which is the default Hive separator).
        loc (str): Desired HDFS location (if not the default).
        table_props (dict): The table properties (if any) to set on the table.
        partition_cols (list): The columns by which the created table should
            be partitioned. Each is declared with type `STRING`.
        dtype_overrides (dict): Mapping of (original, unsanitised) column
            names to the Hive types to use instead of the default mapping.

    Returns:
        str: The Hive SQL required to create the table with the above
            configuration.
    """
    table_props = table_props or {}
    partition_cols = partition_cols or []
    dtype_overrides = dtype_overrides or {}

    # dtype kind to hive type mapping dict.
    DTYPE_KIND_HIVE_TYPE = {
        'b': 'BOOLEAN',  # boolean
        'i': 'BIGINT',   # signed integer
        'u': 'BIGINT',   # unsigned integer
        'f': 'DOUBLE',   # floating-point
        'c': 'STRING',   # complex floating-point
        'O': 'STRING',   # object
        'S': 'STRING',   # (byte-)string
        'U': 'STRING',   # Unicode
        'V': 'STRING'    # void
    }

    # Sanitise column names and map numpy/pandas data-types to hive types.
    columns = []
    # `.items()` rather than `.iteritems()`, which newer pandas removed.
    for col, dtype in df.dtypes.items():
        col_sanitized = re.sub(r'\W', '', col.lower().replace(' ', '_'))
        hive_type = dtype_overrides.get(col) or DTYPE_KIND_HIVE_TYPE.get(dtype.kind)
        if hive_type is None:
            hive_type = DTYPE_KIND_HIVE_TYPE['O']
            logger.warning(
                'Unable to determine hive type for dataframe column {col} of pandas dtype {dtype}. '
                'Defaulting to hive type {hive_type}. If other column type is desired, '
                'please specify via `dtype_overrides`'
                .format(col=col, dtype=dtype, hive_type=hive_type)
            )
        columns.append(
            ' {column} {type}'.format(column=col_sanitized, type=hive_type)
        )

    partition_columns = ['{} STRING'.format(col) for col in partition_cols]

    tblprops = ["'{key}' = '{value}'".format(key=key, value=value) for key, value in table_props.items()]
    tblprops = "TBLPROPERTIES({})".format(",".join(tblprops)) if tblprops else ""

    # Only the names the template actually uses are passed to `render`
    # (previously every local was exposed via `**locals()`).
    cmd = Template("""
    {% if drop %}
    DROP TABLE IF EXISTS {{ table }};
    {% endif -%}
    CREATE TABLE IF NOT EXISTS {{ table }} (
        {%- for col in columns %}
        {{ col }} {% if not loop.last %}, {% endif %}
        {%- endfor %}
    )
    {%- if partition_columns %}
    PARTITIONED BY (
        {%- for col in partition_columns %}
        {{ col }} {% if not loop.last %}, {% endif %}
        {%- endfor %}
    )
    {%- endif %}
    {%- if text %}
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY "{{ sep }}"
    STORED AS TEXTFILE
    {% endif %}
    {%- if loc %}
    LOCATION "{{ loc }}"
    {%- endif %}
    {{ tblprops }}
    ;
    """).render(
        drop=drop,
        table=table,
        columns=columns,
        partition_columns=partition_columns,
        text=text,
        sep=sep,
        loc=loc,
        tblprops=tblprops,
    )

    return cmd
def _dataframe_to_table( self, df, table, if_exists='fail', use_hive_cli=None, partition=None, sep=chr(1), table_props=None, dtype_overrides=None, **kwargs ): """ If `use_hive_cli` (or if not specified `.push_using_hive_cli`) is `True`, a `CREATE TABLE` statement will be automatically generated based on the datatypes of the DataFrame (unless overwritten by `dtype_overrides`). The `DataFrame` will then be exported to a CSV compatible with Hive and uploaded (if necessary) to the remote, before being loaded into Hive using a `LOAD DATA LOCAL INFILE ...` query using the `hive` cli executable. Note that if a table is not partitioned, you cannot convert it to a parititioned table without deleting it first. If `use_hive_cli` (or if not specified `.push_using_hive_cli`) is `False`, an attempt will be made to push the `DataFrame` to Hive using `pandas.DataFrame.to_sql` and the SQLAlchemy binding provided by `pyhive` and `impyla`. This may be slower, does not support older versions of Hive, and does not support table properties or partitioning. If if the schema namespace is not specified, `table.schema` will be defaulted to your username. Additional Args: use_hive_cli (bool, None): A local override for the global `.push_using_hive_cli` attribute. If not specified, the global default is used. If True, then pushes are performed using the `hive` CLI executable on the local/remote PATH. **kwargs (dict): Additional arguments to send to `pandas.DataFrame.to_sql`. Further Parameters for CLI method (specifying these for the pandas method will cause a `RuntimeError` exception): partition (dict): A mapping of column names to values that specify the partition into which the provided data should be uploaded, as well as providing the fields by which new tables should be partitioned. sep (str): Field delimiter for data (defaults to CTRL-A, or `chr(1)`). table_props (dict): Properties to set on any newly created tables (extends `.default_table_props`). 
dtype_overrides (dict): Mapping of column names to Hive datatypes to use instead of default mapping. """ table = self._parse_namespaces(table, defaults={'schema': self.username}) use_hive_cli = use_hive_cli or self.push_using_hive_cli partition = partition or {} table_props = table_props or {} dtype_overrides = dtype_overrides or {} # Try using SQLALchemy method if not use_hive_cli: if partition or table_props or dtype_overrides: raise RuntimeError( "At least one of `partition` or `table_props` or " "`dtype_overrides` has been specified. Setting table " "properties or partition information is not supported " "via the SQLAlchemy backend. If this is important, please " "pass `use_hive_cli=True`, otherwise remove these values " "and try again." ) try: return _pandas.to_sql( df=df, name=table.table, schema=table.schema, con=self._sqlalchemy_engine, index=False, if_exists=if_exists, **kwargs ) except Exception as e: raise RuntimeError( "Push unsuccessful. Your version of Hive may be too old to " "support the `INSERT` keyword. You might want to try setting " "`.push_using_hive_cli = True` if your local or remote " "machine has access to the `hive` CLI executable. The " "original exception was: {}".format(e.args[0]) ) # Try using Hive CLI # If `partition` is specified, the associated columns must not be # present in the dataframe. assert len(set(partition).intersection(df.columns)) == 0, "The dataframe to be uploaded must not have any partitioned fields. Please remove the field(s): {}.".format(','.join(set(partition).intersection(df.columns))) # Save dataframe to file and send it to the remote server if necessary temp_dir = tempfile.mkdtemp(prefix='omniduct_hiveserver2') tmp_fname = os.path.join(temp_dir, 'data_{}.csv'.format(time.time())) logger.info('Saving dataframe to file... 
{}'.format(tmp_fname)) df.fillna(r'\N').to_csv(tmp_fname, index=False, header=False, sep=sep, encoding='utf-8') if self.remote: logger.info("Uploading data to remote host...") self.remote.upload(tmp_fname) # Generate create table statement. auto_table_props = set(self.default_table_props).difference(table_props) if len(auto_table_props) > 0: logger.warning( "In addition to any specified table properties, this " "HiveServer2Client has added the following default table " "properties:\n{default_props}\nTo override them, please " "specify overrides using: `.push(..., table_props={{...}}).`" .format(default_props=json.dumps({ prop: value for prop, value in self.default_table_props.items() if prop in auto_table_props }, indent=True)) ) tblprops = self.default_table_props.copy() tblprops.update(table_props or {}) cts = self._create_table_statement_from_df( df=df, table=table, drop=(if_exists == 'replace') and not partition, text=True, sep=sep, table_props=tblprops, partition_cols=list(partition), dtype_overrides=dtype_overrides ) # Generate load data statement. partition_clause = ( '' if not partition else 'PARTITION ({})'.format( ','.join("{key} = '{value}'".format(key=key, value=value) for key, value in partition.items()) ) ) lds = '\nLOAD DATA LOCAL INPATH "{path}" {overwrite} INTO TABLE {table} {partition_clause};'.format( path=os.path.basename(tmp_fname) if self.remote else tmp_fname, overwrite="OVERWRITE" if if_exists == "replace" else "", table=table, partition_clause=partition_clause ) # Run create table statement and load data statments logger.info( "Creating hive table `{table}` if it does not " "already exist, and inserting the provided data{partition}." 
.format( table=table, partition=" into {}".format(partition_clause) if partition_clause else "" ) ) try: stmts = '\n'.join([cts, lds]) logger.debug(stmts) proc = self._run_in_hivecli(stmts) if proc.returncode != 0: raise RuntimeError(proc.stderr.decode('utf-8')) finally: # Clean up files if self.remote: self.remote.execute('rm -rf {}'.format(tmp_fname)) shutil.rmtree(temp_dir, ignore_errors=True) logger.info("Successfully uploaded dataframe {partition}`{table}`.".format( table=table, partition="into {} of ".format(partition_clause) if partition_clause else "" ))
def template_render(self, name_or_statement, context=None, by_name=False):
    """
    Render a template identified by name, or passed directly as a body.

    Besides the standard `jinja2` templating syntax (documented in the
    official `jinja2` documentation), a meta-templating extension is
    available which allows templates to reference other templates. For
    example, given two SQL templates named 'template_a' and 'template_b',
    they can be rendered into one SQL query using (for example):

    ```
    .template_render('''
    WITH
        a AS (
            {{{template_a}}}
        ),
        b AS (
            {{{template_b}}}
        )
    SELECT *
    FROM a
    JOIN b ON a.x = b.x
    ''')
    ```

    Note that template substitution in this way is iterative, so template
    embedding can be chained, provided that such embedding is not recursive.

    Parameters:
        name_or_statement (str): The name of a template (if `by_name` is
            True) or else a string representation of a `jinja2` template.
        context (dict, None): A dictionary to use as the template context.
            If not specified, an empty dictionary is used.
        by_name (bool): `True` if `name_or_statement` should be interpreted
            as a template name, or `False` (default) if `name_or_statement`
            should be interpreted as a template body.

    Returns:
        str: The rendered template.

    Raises:
        ValueError: If `by_name` is True and no such template is registered.
    """
    statement = name_or_statement
    if by_name:
        if name_or_statement not in self._templates:
            error = "No such template of name: '{}'.".format(name_or_statement)
            raise ValueError(error)
        statement = self._templates[name_or_statement]

    # Compile SQLAlchemy expressions (if SQLAlchemy is available) to text.
    try:
        from sqlalchemy.sql.base import Executable
        if isinstance(statement, Executable):
            literal_sql = statement.compile(compile_kwargs={"literal_binds": True})
            statement = str(literal_sql)
    except ImportError:
        pass

    if context is None or context is False:
        context = {}

    # Start from the default context, then overlay the context passed in.
    default_context = self._template_context
    template_context = {}
    template_context.update(default_context)
    template_context.update(context)

    shadowed_keys = set(default_context.keys()) & set(context.keys())
    if shadowed_keys:
        logger.warning(
            "The following default template context keys have been overridden "
            "by the local context: {}."
            .format(shadowed_keys)
        )

    # Keep expanding references to named templates while any remain.
    def has_meta_markup(text):
        return '{{{' in text or '{{%' in text

    while has_meta_markup(statement):
        expander = Template(
            statement,
            block_start_string='{{%',
            block_end_string='%}}',
            variable_start_string='{{{',
            variable_end_string='}}}',
            comment_start_string='{{#',
            comment_end_string='#}}',
            undefined=StrictUndefined
        )
        statement = expander.render(getattr(self, '_templates', {}))

    renderer = Template(statement, undefined=StrictUndefined)
    return renderer.render(template_context)
def _init(self):
    """
    Warn that the Paramiko-based SSH client is not yet production ready.
    """
    # Adjacent string literals instead of a backslash continuation inside
    # the literal, which leaked stray characters into the logged message.
    logger.warning(
        "The Paramiko SSH client is still under development, "
        "and is not ready for use as a daily driver."
    )