def _execute(self, statement, cursor, wait, session_properties):
    """
    Execute `statement` on a Presto cursor and return that cursor.

    If something goes wrong, `PrestoClient` will attempt to parse the error log
    and present the user with useful debugging information. If that fails, the
    full traceback will be raised instead.

    Args:
        statement (str): The statement to execute.
        cursor: An existing cursor to reuse. If falsy, a new `presto.Cursor`
            is created from this client's connection attributes.
        wait (bool): Whether to poll the cursor, reporting progress through
            `logger.progress`, until the query is finished.
        session_properties: Passed as `session_props` to any newly created
            `presto.Cursor`.

    Returns:
        The cursor on which `statement` was executed.

    Raises:
        DatabaseError: A `pyhive` or `pandas` `DatabaseError` raised while
            executing or polling is re-raised through `raise_with_traceback`
            once an attempt has been made to log the error context.
    """
    from pyhive import presto  # Imported here due to slow import performance in Python 3
    from pyhive.exc import DatabaseError  # Imported here due to slow import performance in Python 3

    try:
        cursor = cursor or presto.Cursor(
            host=self.host, port=self.port, username=self.username, password=self.password,
            catalog=self.catalog, schema=self.schema, session_props=session_properties,
            poll_interval=1, source=self.source, protocol=self.server_protocol
        )
        cursor.execute(statement)
        status = cursor.poll()
        if wait:
            logger.progress(0)
            # status None means command executed successfully
            # See https://github.com/dropbox/PyHive/blob/master/pyhive/presto.py#L234
            while status is not None and status['stats']['state'] != "FINISHED":
                stats = status['stats']
                if stats.get('totalSplits', 0) > 0:
                    pct_complete = round(stats['completedSplits'] / float(stats['totalSplits']), 4)
                    logger.progress(pct_complete * 100)
                status = cursor.poll()
            logger.progress(100, complete=True)
        return cursor
    except (DatabaseError, pandas.io.sql.DatabaseError) as e:
        # Attempt to parse database error, before ultimately reraising the same
        # exception, maintaining the full stacktrace.
        # `sys.exc_info()` yields (class, instance, traceback) for the error
        # currently being handled.
        exc_type, exc_value, exc_tb = sys.exc_info()

        try:
            message = e.args[0]
            if isinstance(message, six.string_types):
                # The error payload is embedded in the message as a dict literal.
                message = ast.literal_eval(re.match("[^{]*({.*})[^}]*$", message).group(1))

            linenumber = message['errorLocation']['lineNumber'] - 1
            splt = statement.splitlines()
            splt[linenumber] += ' <-- {errorType} ({errorName}) occurred. {message} '.format(**message)
            # Show the offending line with one line of context on either side.
            context = '\n\n[Error Context]\n{}\n'.format(
                '\n'.join(splt[max(linenumber - 1, 0):min(linenumber + 2, len(splt))])
            )

            class ErrContext(object):

                def __repr__(self):
                    return context

            # logged twice so that both notebook and console users see the error context
            # NOTE(review): the first element is the exception instance itself,
            # which makes `args` self-referential -- confirm this is intended.
            exc_value.args = [exc_value, ErrContext()]
            logger.error(context)
        except Exception:
            # Parsing is best-effort. This was a bare `except:`; it is narrowed
            # so that KeyboardInterrupt/SystemExit are no longer swallowed here.
            logger.warn(("Omniduct was unable to parse the database error messages. Refer to the "
                         "traceback below for full error details."))

        to_raise = exc_type(exc_value) if isinstance(exc_type, type) else exc_type
        raise_with_traceback(to_raise, exc_tb)
def register_from_config(self, config, override=False):
    """
    Register a collection of Duct service configurations.

    The configuration format must be one of the following:
    - An iterable sequence of dictionaries containing a mapping between the
      keyword arguments required to instantiate the `Duct` subclass.
    - A dictionary mapping names of `Duct` instances to dictionaries of
      keyword arguments.
    - A dictionary mapping Duct types ('databases', 'filesystems', etc) to
      mappings like those immediately above.
    - A string YAML representation of one of the above (with at least one
      newline character).
    - A string filename containing such a YAML representation.

    There are three special keyword arguments that are required by the
    `DuctRegistry` instance:
    - name: Should be present only in the configuration dictionary when
      config is provided as an iterable sequence of dictionaries.
    - protocol: Which specifies which `Duct` subclass to fetch. Failure to
      correctly set this will result in a warning and an ignoring of this
      configuration.
    - register_magics (optional): A boolean flag indicating whether to
      register any magics defined by this Duct class (default: True).

    Args:
        config (iterable, dict, str, None): A configuration specified in one
            of the above described formats.
        override (bool): Whether to override any existing `Duct` instance
            of the same name(s). If `False`, any overrides will result in an
            exception.

    Returns:
        `self`.
    """
    if isinstance(config, six.string_types):
        if '\n' not in config:
            # A string without a newline is taken to be a filename; swap it
            # for the file's contents before parsing.
            with open(config) as f:
                config = f.read()
        config = yaml.safe_load(config)

    for entry in self._process_config(config):
        # Remove the keys consumed by the registry itself; whatever is left
        # is forwarded as keyword arguments.
        names = entry.pop('name')
        protocol = entry.pop('protocol')
        register_magics = entry.pop('register_magics', True)
        try:
            self.new(names, protocol, register_magics=register_magics, override=override, **entry)
        except DuctProtocolUnknown as e:
            quoted_names = "', '".join(names.split(','))
            logger.error("Failed to configure `Duct` instance(s) '{}'. {}".format(quoted_names, str(e)))

    return self
def _execute(self, statement, query=True, cursor=None, wait=False):
    """
    Execute `statement` on a Presto cursor and return that cursor.

    If a database error occurs, an attempt is made to parse the error payload
    and log the offending line of `statement` with surrounding context. The
    error is then re-raised with its original traceback.

    Args:
        statement (str): The statement to execute.
        query (bool): If true (or if `wait` is true), poll the cursor and
            report progress through `logger.progress` until the query is
            finished.
        cursor: An existing cursor to reuse. If falsy, a new one is obtained
            from `self.__presto.cursor()`.
        wait (bool): See `query`.

    Returns:
        The cursor on which `statement` was executed.

    Raises:
        DatabaseError: A `pyhive` or `pandas` `DatabaseError` raised while
            executing or polling is re-raised through `raise_with_traceback`.
    """
    from pyhive.exc import DatabaseError  # Imported here due to slow import performance in Python 3

    try:
        cursor = cursor or self.__presto.cursor()
        cursor.execute(statement)
        status = cursor.poll()
        if wait or query:
            logger.progress(0)
            # `poll()` returns None once the command has completed, so guard
            # before subscripting to avoid a TypeError on finished commands.
            while status is not None and status['stats']['state'] != "FINISHED":
                stats = status['stats']
                if stats.get('totalSplits', 0) > 0:
                    pct_complete = round(stats['completedSplits'] / float(stats['totalSplits']), 4)
                    logger.progress(pct_complete * 100)
                status = cursor.poll()
            logger.progress(100, complete=True)
        return cursor
    except (DatabaseError, pandas.io.sql.DatabaseError) as e:
        # Attempt to parse database error, before ultimately reraising the same
        # exception, maintaining the full stacktrace.
        # `sys.exc_info()` yields (class, instance, traceback) for the error
        # currently being handled.
        exc_type, exc_value, exc_tb = sys.exc_info()

        try:
            message = e.args[0]
            if isinstance(message, str):
                # Parse the already-extracted `message`; exceptions have no
                # `.message` attribute on Python 3, so reading `e.message`
                # made this parsing step fail unconditionally there.
                message = ast.literal_eval(re.match("[^{]*({.*})[^}]*$", message).group(1))

            linenumber = message['errorLocation']['lineNumber'] - 1
            splt = statement.splitlines()
            splt[linenumber] += ' <-- {errorType} ({errorName}) occurred. {message} '.format(**message)
            # Show the offending line with one line of context on either side.
            context = '\n\n[Error Context]\n{}\n'.format(
                '\n'.join(splt[max(linenumber - 1, 0):min(linenumber + 2, len(splt))])
            )

            class ErrContext(object):

                def __repr__(self):
                    return context

            # logged twice so that both notebook and console users see the error context
            # NOTE(review): the first element is the exception instance itself,
            # which makes `args` self-referential -- confirm this is intended.
            exc_value.args = [exc_value, ErrContext()]
            logger.error(context)
        except Exception:
            # Parsing is best-effort. This was a bare `except:`; it is narrowed
            # so that KeyboardInterrupt/SystemExit are no longer swallowed here.
            logger.warn(("Omniduct was unable to parse the database error messages. Refer to the "
                         "traceback below for full error details."))

        to_raise = exc_type(exc_value) if isinstance(exc_type, type) else exc_type
        raise_with_traceback(to_raise, exc_tb)
def import_from_config(self, config):
    """
    Create `Duct` instances from a configuration and return `self`.

    The configuration is first passed through `self._process_config`. For
    every member of `Duct.Type`, the mapping stored under that member's value
    (if any) is read as `names -> options`. `protocol` and the optional
    `register_magics` (default: True) are removed from `options`, and the
    remainder is forwarded to `self.new` as keyword arguments.

    A `DuctProtocolUnknown` raised by `self.new` is logged as an error and
    the entry is skipped.
    """
    processed = self._process_config(config)

    for duct_type in Duct.Type:
        for names, options in processed.get(duct_type.value, {}).items():
            protocol = options.pop('protocol')
            register_magics = options.pop('register_magics', True)
            try:
                self.new(names, protocol, register_magics=register_magics, **options)
            except DuctProtocolUnknown as e:
                quoted_names = "', '".join(names.split(','))
                logger.error("Failed to configure `Duct` instance(s) '{}'. {}".format(quoted_names, str(e)))

    return self
def _execute(self, statement, cursor, wait, session_properties):
    """
    Execute `statement` on a Presto cursor and return that cursor.

    If something goes wrong, `PrestoClient` will attempt to parse the error log
    and present the user with useful debugging information. If that fails, the
    full traceback will be raised instead.

    Args:
        statement (str): The statement to execute.
        cursor: An existing cursor to reuse. If falsy, a new `presto.Cursor`
            is created from this client's connection attributes.
        wait (bool): Whether to poll the cursor, reporting progress through
            `logger.progress`, until the query is finished.
        session_properties: Passed as `session_props` to any newly created
            `presto.Cursor`.

    Returns:
        The cursor on which `statement` was executed.

    Raises:
        DatabaseError: A `pyhive` or `pandas` `DatabaseError` raised while
            executing or polling is re-raised through `raise_with_traceback`
            once an attempt has been made to log the error context.
    """
    from pyhive import presto  # Imported here due to slow import performance in Python 3
    from pyhive.exc import DatabaseError  # Imported here due to slow import performance in Python 3

    try:
        cursor = cursor or presto.Cursor(host=self.host,
                                         port=self.port,
                                         username=self.username,
                                         password=self.password,
                                         catalog=self.catalog,
                                         schema=self.schema,
                                         session_props=session_properties,
                                         poll_interval=1,
                                         source=self.source,
                                         protocol=self.server_protocol)
        cursor.execute(statement)
        status = cursor.poll()
        if wait:
            logger.progress(0)
            # status None means command executed successfully
            # See https://github.com/dropbox/PyHive/blob/master/pyhive/presto.py#L234
            while status is not None and status['stats']['state'] != "FINISHED":
                stats = status['stats']
                if stats.get('totalSplits', 0) > 0:
                    pct_complete = round(stats['completedSplits'] / float(stats['totalSplits']), 4)
                    logger.progress(pct_complete * 100)
                status = cursor.poll()
            logger.progress(100, complete=True)
        return cursor
    except (DatabaseError, pandas.io.sql.DatabaseError) as e:
        # Attempt to parse database error, before ultimately reraising the same
        # exception, maintaining the full stacktrace.
        # `sys.exc_info()` yields (class, instance, traceback) for the error
        # currently being handled.
        exc_type, exc_value, exc_tb = sys.exc_info()

        try:
            message = e.args[0]
            if isinstance(message, six.string_types):
                # The error payload is embedded in the message as a dict literal.
                message = ast.literal_eval(re.match("[^{]*({.*})[^}]*$", message).group(1))

            linenumber = message['errorLocation']['lineNumber'] - 1
            splt = statement.splitlines()
            splt[linenumber] += ' <-- {errorType} ({errorName}) occurred. {message} '.format(**message)
            # Show the offending line with one line of context on either side.
            first = max(linenumber - 1, 0)
            last = min(linenumber + 2, len(splt))
            context = '\n\n[Error Context]\n{}\n'.format('\n'.join(splt[first:last]))

            class ErrContext(object):

                def __repr__(self):
                    return context

            # logged twice so that both notebook and console users see the error context
            # NOTE(review): the first element is the exception instance itself,
            # which makes `args` self-referential -- confirm this is intended.
            exc_value.args = [exc_value, ErrContext()]
            logger.error(context)
        except Exception:
            # Parsing is best-effort. This was a bare `except:`; it is narrowed
            # so that KeyboardInterrupt/SystemExit are no longer swallowed here.
            logger.warn(("Omniduct was unable to parse the database error messages. Refer to the "
                         "traceback below for full error details."))

        to_raise = exc_type(exc_value) if isinstance(exc_type, type) else exc_type
        raise_with_traceback(to_raise, exc_tb)
def execute(self, statement, query=False, parse=True, index_field=None, date_fields=None,
            cleanup_statement=True, render_only=False, **kwargs):
    '''
    Execute a statement against the data source.

    Parameters
    ----------
    statement : The statement to be executed by the query client. It is split
        into individual statements with `statements_split`; all but the last
        are executed with `query=False`.
    query : Whether this statement should return data, in which case `query`
        should be `True`; and `False` otherwise.
    parse : Whether the results of this query should be converted to a pandas
        DataFrame.
    index_field : The field to use as an index in the dataframe, or None.
    date_fields: List of fields to be converted to datetime objects, or None.
        When None, `config.date_fields` is used, restricted to the fields
        present in the result.
    cleanup_statement : Whether each split statement should be passed through
        `statement_cleanup` before being used.
    render_only : If `True`, nothing is executed; the prepared statements are
        returned joined by a semicolon and a newline.
    kwargs : Extra keyword arguments to be passed on to _execute, as
        implemented by subclasses.

    Returns
    -------
    The prepared statements as a single string if `render_only` is `True`.
    A pandas.DataFrame object if `query` and `parse` are both `True`.
    A DBAPI2 cursor object if `query` is `True`, and `parse` is `False`.
    `None` otherwise (including when `_cursor_empty` reports an empty cursor).
    '''
    self.connect()

    statements = self.statements_split(statement)
    statements = [
        self.statement_cleanup(stmt) if cleanup_statement else stmt
        for stmt in statements
    ]
    assert len(statements) > 0, "No non-empty statements were provided."

    if render_only:
        return ';\n'.join(statements)

    # Run every statement on the same cursor; only the final statement is
    # executed with the caller's `query` flag.
    cursor = None
    for stmt in statements[:-1]:
        cursor = self.connect()._execute(stmt, query=False, cursor=cursor, **kwargs)
    cursor = self.connect()._execute(statements[-1], query, cursor=cursor, **kwargs)

    if not query or self._cursor_empty(cursor):
        return None

    if not parse:
        return cursor

    df = self._cursor_to_dataframe(cursor)
    cursor.close()

    if date_fields is None:  # if user supplied, use as is
        date_fields = config.date_fields or []
        date_fields = [field for field in date_fields if field in df]

    if date_fields:
        try:
            df = pandas.io.sql._parse_date_columns(df, date_fields)
        except Exception:
            # Date parsing is best-effort. This was a bare `except:`; it is
            # narrowed so KeyboardInterrupt/SystemExit are not swallowed.
            logger.error(
                'Unable to parse date columns. Perhaps your version of pandas is outdated.'
            )

    if index_field is not None:
        df.set_index(index_field, inplace=True)

    return df
def _connect(self):
    """
    Establish a persistent SSH master connection for this client.

    The workflow to handle passwords and host keys used by this method is
    inspired by the `pxssh` module of `pexpect` (https://github.com/pexpect/pexpect).
    We have adjusted this workflow to our purposes.

    An `ssh` master process is spawned on the control socket at
    `self._socket_path`, and its prompts are answered in two phases: the
    first phase responds to host-key, password and terminal-type prompts;
    the second phase treats any further prompt as a failure.

    Raises:
        RuntimeError: If the host identification changed (and was not
            updated interactively), if a prompt is repeated unexpectedly,
            if the login times out, if the remote closes the connection, or
            if the hostname cannot be resolved.
        DuctAuthenticationError: If the credentials are rejected or asked
            for a second time.
        AssertionError: If `self.is_connected()` is false after the login
            procedure has finished.
    """
    import pexpect

    # Create socket directory if it doesn't exist.
    socket_dir = os.path.dirname(self._socket_path)
    if not os.path.exists(socket_dir):
        os.makedirs(socket_dir)

    # Create persistent master connection and exit.
    cmd = ''.join([
        "ssh {login} -MT ",
        "-S {socket} ",
        "-o ControlPersist=yes ",
        "-o StrictHostKeyChecking=no ",
        "-o UserKnownHostsFile=/dev/null " if not self.check_known_hosts else "",
        "-o NoHostAuthenticationForLocalhost=yes ",
        "-o ServerAliveInterval=60 ",
        "-o ServerAliveCountMax=2 ",
        "'exit'",
    ]).format(login=self._login_info, socket=self._socket_path)

    expected = [
        "WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED!",    # 0
        "(?i)are you sure you want to continue connecting",    # 1
        "(?i)(?:(?:password)|(?:passphrase for key)):",       # 2
        "(?i)permission denied",                               # 3
        "(?i)terminal type",                                   # 4
        pexpect.TIMEOUT,                                       # 5
        "(?i)connection closed by remote host",                # 6
        "(?i)could not resolve hostname",                      # 7
        pexpect.EOF                                            # 8
    ]

    # Spawn before entering the `try` so that a failure to spawn is not
    # masked by `expect.close()` hitting an unbound name in `finally`.
    expect = pexpect.spawn(cmd)
    try:
        i = expect.expect(expected, timeout=10)

        # First phase
        if i == 0:  # If host identification changed, arrest any further attempts to connect
            error_message = (
                'Host identification for {} has changed! This is most likely '
                'due to the the server being redeployed or reconfigured but '
                'may also be due to a man-in-the-middle attack. If you trust '
                'your network connection, you should be safe to update the '
                'host keys for this host. To do this manually, please remove '
                'the line corresponding to this host in ~/.ssh/known_hosts; '
                'or call the `update_host_keys` method of this client.'.format(self._host)
            )
            if self.interactive:
                logger.error(error_message)
                auto_fix = input('Would you like this client to do this for you? (y/n)')
                if auto_fix == 'y':
                    self.update_host_keys()
                    return self.connect()
                else:
                    raise RuntimeError("Host keys not updated. Please update keys manually.")
            else:
                raise RuntimeError(error_message)
        # NOTE(review): the follow-up reads below originally called
        # `self.expect(expected)`; the spawned process only exists as the
        # local `expect`, so they now read from it directly -- confirm.
        if i == 1:  # Request to authorize host certificate (i.e. host not in the 'known_hosts' file)
            expect.sendline("yes")
            i = expect.expect(expected)
        if i == 2:  # Request for password/passphrase
            expect.sendline(self.password or getpass.getpass('Password: '))
            i = expect.expect(expected)
        # NOTE(review): the end of the password step and the start of this
        # terminal-type step were reconstructed from a redacted span in the
        # source -- confirm against version control.
        if i == 4:  # Request for terminal type
            expect.sendline('ascii')
            i = expect.expect(expected)

        # Second phase
        if i == 1:  # Another request to authorize host certificate (i.e. host not in the 'known_hosts' file)
            raise RuntimeError('Received a second request to authorize host key. This should not have happened!')
        elif i in (2, 3):  # Second request for password/passphrase or rejection of credentials. For now, give up.
            raise DuctAuthenticationError('Invalid username and/or password, or private key is not unlocked.')
        elif i == 4:  # Another request for terminal type.
            raise RuntimeError('Received a second request for terminal type. This should not have happened!')
        elif i == 5:  # Timeout
            # In our instance, this means that we have not handled some or another aspect of the login procedure.
            # Since we are expecting an EOF when we have successfully logged in, hanging means that the SSH login
            # procedure is waiting for more information. Since we have no more to give, this means our login
            # was unsuccessful.
            raise RuntimeError('SSH client seems to be awaiting more information, but we have no more to give. The '
                               'messages received so far are:\n{}'.format(expect.before))
        elif i == 6:  # Connection closed by remote host
            raise RuntimeError("Remote closed SSH connection")
        elif i == 7:  # Hostname could not be resolved
            raise RuntimeError("Cannot connect to {} on your current network connection".format(self.host))
    finally:
        expect.close()

    # We should be logged in at this point, but let us make doubly sure
    assert self.is_connected(), 'Unexpected failure to establish a connection with the remote host with command: \n ' \
                                '{}\n\n Please report this!'.format(cmd)
def _push(self, df, table, partition_clause='', overwrite=False, schema='omniduct', sep='\t'):
    """
    Create a new table in hive from pandas DataFrame.

    The dataframe is written to a temporary delimited file in the current
    directory, copied to the remote host when `self.remote` is set, and then
    loaded through the hive CLI with a `LOAD DATA LOCAL INPATH` statement.
    The temporary file is removed afterwards.

    Parameters
    ----------
    df : pandas.DataFrame or Series
        Data to be push into a hive table.
    table : str
        Table name for new hive table.
    schema : str
        Schema (or database) for new hive table.
    partition_clause : str
        The hive partition clause specifying which partitions to load data into.
    overwrite : bool, optional
        Whether to overwrite the table data if it exists. Default: False.
    sep : str
        Field delimiter for data.

    Returns
    -------
    The object returned by `self._run_in_hivecli` for the combined create
    table and load data statements. A non-zero `returncode` is logged as an
    error rather than raised.

    See Also
    --------
    https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DML
    """
    # Save dataframe to file.
    fd, tmp_path = tempfile.mkstemp(dir='.')
    # `mkstemp` hands back an already-open OS-level descriptor; close it
    # immediately (the file is written by name below) so it is not leaked.
    os.close(fd)
    tmp_fname = os.path.basename(tmp_path)
    logger.info('Saving dataframe to file... {}'.format(tmp_fname))
    df.to_csv(tmp_fname, index=False, header=False, sep=sep, encoding='utf-8')

    # Create table statement.
    cts = _create_table_statement_from_df(
        df=df,
        table=table,
        schema=schema,
        # Only drop an existing table when overwriting it as a whole, i.e.
        # when no partition clause was given.
        drop=overwrite and not partition_clause,
        text=True,
        sep=sep
    )

    # Load data statement.
    lds = '\nLOAD DATA LOCAL INPATH "{path}" {overwrite} INTO TABLE {schema}.{table} {partition_clause};'.format(
        path=tmp_fname,
        overwrite="OVERWRITE" if overwrite else "",
        schema=schema,
        table=table,
        partition_clause=partition_clause
    )

    # SCP data if SSHClient is set.
    if self.remote:
        logger.info('Uploading data to remote host...')
        self.remote.copy_from_local(tmp_fname, tmp_fname)

    # Run create table statement and load data statement.
    logger.info('Creating hive table and loading data...')
    proc = self._run_in_hivecli('\n'.join([cts, lds]))
    if proc.returncode != 0:
        logger.error(proc.stderr)

    # Clean up files.
    logger.info('Cleaning up files...')
    rm_cmd = 'rm -rf {0}'.format(tmp_fname)
    run_in_subprocess(rm_cmd)
    if self.remote:
        self.remote.execute(rm_cmd)

    return proc