def convert_to_parquet(self, table, order_by=[]):
  "Convert table to parquet format"
  sql = '''use {db};
  drop table if exists {table}z;
  create table {table}z stored as parquet as select * from {table} {ord};
  drop table {table};
  '''.format(
    db=table.split('.')[0],
    table=table,
    ord='order by ' + ', '.join(order_by) if order_by else '',
  )

  # get table path
  data = self.select('show create table ' + table, echo=False)
  for i, r in enumerate(data):
    if data[i][0].strip() == 'LOCATION':
      table_hdfs_path = data[i + 1][0].strip().replace("'", '')
      log('HDFS Path: ' + table_hdfs_path)
      break

  # delete z folder, create z temp table
  os.system('hdfs dfs -rm -r -f -skipTrash {}z'.format(table_hdfs_path))
  self.execute(sql, echo=False)

  # delete original folder, create final table
  # When renaming, error keeps occurring
  os.system('hdfs dfs -rm -r -f -skipTrash {}'.format(table_hdfs_path))
  sql = '''use {db};
  alter table {table}z rename to {table}'''.format(
    db=table.split('.')[0],
    table=table,
  )
  self.execute(sql, echo=False)
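# Usage sketch (hypothetical; assumes `conn` is an instance of this Hive/HDFS
# connection class and `my_db.my_table` already exists in a non-parquet format):
#
#   conn.convert_to_parquet('my_db.my_table', order_by=['event_date', 'id'])
#
# The table is rebuilt as `my_db.my_tablez` stored as parquet, the original
# HDFS folder is removed, and the temp table is renamed back to the original name.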
def get_primary_keys(self, table_name, echo=False):
  "Get PK metadata for table"
  Rec = namedtuple('PKs', 'schema table pk_name column_name column_order')
  self._fields = Rec._fields
  schema, table = self._split_schema_table(table_name)

  # def get_rec(col, pk_name, column_order):
  #   r_dict = {}
  #   r_dict['schema'] = schema
  #   r_dict['table'] = table
  #   r_dict['pk_name'] = pk_name
  #   r_dict['column_name'] = col
  #   r_dict['column_order'] = column_order
  #   return Rec(**r_dict)

  # sql_tmpl = self._template('metadata.primary_keys')
  # if sql_tmpl:
  #   rows = self.select(sql_tmpl.format(table=table, schema=schema))
  # else:
  #   self.get_engine(echo=echo)
  #   r_dict = self.engine_inspect.get_pk_constraint(table, schema=schema)
  #   rows = [
  #     get_rec(col, r_dict['name'], i + 1)
  #     for i, col in enumerate(r_dict['constrained_columns'])
  #   ]
  # return rows

  # getPrimaryKeys(String catalog, String schema, String table)
  catalog = None  # JDBC accepts a null catalog; `catalog` was undefined in the original
  rs_rows = rs_to_rows(self.meta.getPrimaryKeys(catalog, schema, table))
  log(Exception('getPrimaryKeys not implemented!'))
  print(rs_rows)
def replace(self,
            table,
            data,
            pk_filter_dict,
            field_types=None,
            commit=True,
            echo=True):
  "Replace list of dicts; pk_filter_dict needs to be a condition dict"
  # http://api.mongodb.com/python/current/api/pymongo/collection.html#pymongo.collection.Collection.bulk_write
  from pymongo import InsertOne, DeleteOne, ReplaceOne

  s_t = datetime.datetime.now()
  collection = self.get_coll(table)

  requests = [
    ReplaceOne(pk_filter_dict, row_dict, upsert=True) for row_dict in data
  ]
  result = collection.bulk_write(requests)

  secs = (datetime.datetime.now() - s_t).total_seconds()
  mins = round(secs / 60, 1)
  rate = round(len(data) / secs, 1)
  if echo:
    log("Replaced {} records into table '{}' in {} mins [{} r/s].".format(
      len(data), table, mins, rate))

  return len(data)
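# Usage sketch (hypothetical; assumes `mdb` is an instance of this Mongo
# connection class):
#
#   rows = [dict(_id=1, name='alice'), dict(_id=2, name='bob')]
#   mdb.replace('users', rows, pk_filter_dict={'_id': {'$exists': True}})
#
# Note: the same pk_filter_dict is applied to every ReplaceOne request, so it
# should express the match condition shared by all rows being upserted.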
def write_csv(file_path,
              headers,
              data,
              footer_text=None,
              append=False,
              log=log,
              echo=True,
              file_obj=None,
              encoding="utf8"):
  "Write to CSV, python3 compatible. 'data' must be list of iterables"
  s_t = now()
  mode = 'a' if append else 'w'
  f = file_obj if file_obj else open(
    file_path, mode, newline='', encoding=encoding)
  w = csv.writer(
    f, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')

  if not append:
    w.writerow(headers)

  for row in data:
    w.writerow(row)

  if footer_text:
    f.write(footer_text)

  if not file_obj:
    f.close()

  secs = (now() - s_t).total_seconds()
  rate = round(len(data) / secs, 1)
  if echo:
    log("Wrote {} rows to {} [{} r/s].".format(len(data), file_path, rate))
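# Usage sketch (hypothetical path and rows):
#
#   headers = ['id', 'name']
#   rows = [(1, 'alice'), (2, 'bob')]
#   write_csv('/tmp/people.csv', headers, rows)
#
#   # appending a second batch to the same file (headers are skipped):
#   write_csv('/tmp/people.csv', headers, [(3, 'carol')], append=True)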
def select(self,
           sql,
           rec_name='Record',
           dtype='namedtuple',
           limit=None,
           echo=True,
           log=log):
  "Select from SQL, return list of namedtuples"
  s_t = datetime.datetime.now()

  df1 = self.sparko.sql(sql)
  self.df_id = self.df_id if self.df_id else 'df_' + str(int(time.time()))
  df1.registerTempTable(self.df_id)
  self.df_ids.append(self.df_id)

  df1 = df1.limit(limit) if limit else df1

  if dtype == 'dataframe':
    data = df1.toPandas()
  else:
    data = df1.collect()

  self._fields = df1.columns

  secs = (datetime.datetime.now() - s_t).total_seconds()
  rate = round(len(data) / secs, 1)
  if echo:
    log(" >>> Got {} rows in {} secs [{} r/s].".format(len(data), secs, rate))

  return data
def transfer_progress(self, transferred, total, unit='B'):
  "Display transfer progress"
  prct = int(100.0 * transferred / total)
  divide = lambda x, y: round(1.0 * x / (y), 1)

  if self.last_stat:
    secs = (datetime.datetime.now() - self.last_stat['time']).total_seconds()
    if secs > 2:
      rate = round((transferred - self.last_stat['transferred']) / secs, 1)
      self.last_stat = dict(time=now(), transferred=transferred, rate=rate)
    else:
      rate = self.last_stat['rate']
  else:
    rate = 0
    self.last_stat = dict(time=now(), transferred=transferred, rate=rate)

  if total > 1024**3:
    transferred = divide(transferred, 1024**3)
    total = divide(total, 1024**3)
    unit = 'GB'
    rate = '{} {} / sec'.format(divide(rate, 1024**2), 'MB')
  elif total > 1024**2:
    transferred = divide(transferred, 1024**2)
    total = divide(total, 1024**2)
    unit = 'MB'
    rate = '{} {} / sec'.format(divide(rate, 1024**2), unit)
  elif total > 1024**1:
    transferred = divide(transferred, 1024**1)
    total = divide(total, 1024**1)
    unit = 'KB'
    rate = '{} {} / sec'.format(divide(rate, 1024**1), unit)

  log('+{}% Complete: {} / {} {} @ {}'.format(prct, transferred, total, unit,
                                              rate))
def read_csv(file_path, delimiter=',', quotechar='"', mode='r',
             encoding="utf8"):
  """Read CSV from File"""
  s_t = now()

  with open(file_path, mode, encoding=encoding) as f:
    reader = csv.reader(f, delimiter=delimiter, quotechar=quotechar)
    Data = namedtuple("Data", [h.replace(' ', '_') for h in next(reader)])

    # data_rows = [row for row in reader]
    try:
      i = 0
      data_rows = []
      # use a plain loop so `i` reflects the failing line on error
      # (a list comprehension does not leak `i` in python3)
      for i, row in enumerate(map(Data._make, reader)):
        data_rows.append(row)
    except Exception as e:
      print('ERROR at line ' + str(i + 1))
      raise e

  secs = (now() - s_t).total_seconds()
  rate = round(len(data_rows) / secs, 1)
  log("Imported {} rows from {} [{} r/s].".format(len(data_rows), file_path,
                                                  rate))
  return data_rows
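# Usage sketch (hypothetical CSV path and column names):
#
#   rows = read_csv('/tmp/people.csv')
#   for r in rows:
#     print(r.id, r.name)   # fields come from the header row, spaces -> '_'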
def start_worker_mon():
  """Starts the Monitoring worker"""
  worker_name = '{}-mon'.format(WORKER_PREFIX)
  worker = Worker(
    worker_name,
    'monitor',
    fn=mon_worker.run,
    kwargs={},
    log=log,
    kill_if_running=True,
    pid_folder=DBNET_FOLDER)
  worker.start()

  log('Monitor Loop PID is {}'.format(worker.pid))

  workers['mon'] = worker
  workers['mon'].put_child_q(dict(name=worker_name, pid=worker.pid))  # add to monitor

  store.sqlx('workers').replace_rec(
    hostname=worker.hostname,
    worker_name=worker.name,
    worker_type=worker.type,
    worker_pid=worker.pid,
    status='RUNNING',
    task_id=-1,
    task_function=worker.fn.__name__,
    task_start_date=now(),
    task_args=jdumps(worker.args),
    task_kwargs=jdumps(worker.kwargs),
    progress=None,
    queue_length=0,
    last_updated=epoch(),
  )

  return worker
def run(self, port, host='0.0.0.0', debug=False, url_suffix='', **kwargs):
  import eventlet, socketio
  import eventlet.wsgi

  if 'worker' in kwargs:
    self.worker: Worker = kwargs['worker']
    self.pipe: Pipe = self.worker.pipe
    self.log = self.worker.log

  if 'pipe' in kwargs:
    self.pipe: Pipe = kwargs['pipe']

  self.log = get_kw('log', self.log, kwargs)
  self.port = int(port)
  hostname = socket.gethostname() if host == '0.0.0.0' else host
  self.base_url = 'http://{}:{}'.format(host, self.port)

  # remember to use DEBUG mode for templates auto reload
  # https://github.com/lepture/python-livereload/issues/144
  self.flask_app.debug = debug
  app = socketio.Middleware(self.sio, self.flask_app)

  log('*Web Server PID is {}'.format(os.getpid()))
  log("*URL -> " + self.base_url + url_suffix)
  eventlet.wsgi.server(eventlet.listen((host, self.port)), app)
def write_jsonls(file_path, data, log=log):
  "Stream-write to JSON Lines. 'data' must be namedtuple rows."
  import jsonlines

  s_t = now()
  l_t = now()
  msg_dlt = 10000
  counter = 0
  counter2 = 0

  with open(file_path, 'wb') as f:
    w = jsonlines.Writer(f)
    for row in data:
      w.write(row)
      counter += 1
      counter2 += 1

      # progress message
      if counter2 % msg_dlt == 0:
        secs_l = (now() - l_t).total_seconds()
        if secs_l >= 20:
          secs = (now() - s_t).total_seconds()
          rate = round(counter2 / secs_l, 1)
          mins = round(secs / 60, 1)
          log("{} min ## Writing to JSON: {} rows @ {} r/s.".format(
            mins, counter, rate))
          l_t = now()
          counter2 = 0

  secs = (now() - s_t).total_seconds()
  rate = round(counter / secs, 1)
  log("Wrote {} rows to {} [{} r/s].".format(counter, file_path, rate))
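# Usage sketch (hypothetical path; namedtuple rows are serialized by jsonlines
# as JSON arrays):
#
#   Row = namedtuple('Row', 'id name')
#   write_jsonls('/tmp/people.jsonl', [Row(1, 'alice'), Row(2, 'bob')])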
def write_pq(
    file_path,
    dataf,
    partition_cols=None,
    flavor='spark',
    filesystem=None,
    append=False,
    log=log,
):
  "Write to Parquet, python3 compatible. 'dataf' must be a pandas DataFrame"
  s_t = now()

  if not append and os.path.exists(file_path):
    shutil.rmtree(file_path, ignore_errors=True)

  table = pa.Table.from_pandas(dataf, nthreads=psutil.cpu_count())
  counter = table.num_rows

  pq.write_to_dataset(
    table,
    root_path=file_path,
    partition_cols=partition_cols,
    flavor=flavor,
    preserve_index=False,
    filesystem=filesystem,
    use_deprecated_int96_timestamps=True,
    compression='snappy')  # will append. delete folder for overwrite

  secs = (now() - s_t).total_seconds()
  rate = round(counter / secs, 1)
  log("Wrote: {} rows to {} [{} r/s].".format(counter, file_path, rate))

  return counter
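# Usage sketch (hypothetical data; assumes pandas is available):
#
#   import pandas
#   df = pandas.DataFrame({'id': [1, 2], 'grp': ['a', 'b']})
#   write_pq('/tmp/people_pq', df, partition_cols=['grp'])
#
# Successive calls with append=True add new parquet files under the same
# dataset folder; the default overwrites by deleting the folder first.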
def write_jsonl(df,
                file_path,
                hdfs_folder,
                mode='overwrite',
                timestampFormat='yyyy-MM-dd HH:mm:ss',
                dateFormat='yyyy-MM-dd',
                partitions=1,
                log=log):
  "Write df to a local JSON Lines file correctly, by writing to HDFS then merging to local"
  file_name = file_path.split('/')[-1]

  # Write schema
  schema_file_path = file_path + '.schema'
  schema_data = df.dtypes
  write_jsonls(schema_file_path, schema_data)  # stream schema rows to JSON lines

  # Write data
  df.repartition(partitions).write.json(
    path='{}/{}'.format(hdfs_folder, file_name),
    mode=mode,
    timestampFormat=timestampFormat,
    dateFormat=dateFormat,
  )

  os.system('hdfs dfs -getmerge {}/{} {}'.format(hdfs_folder, file_name,
                                                 file_path))
  crc_file_path = '{}/.{}.crc'.format('/'.join(file_path.split('/')[:-1]),
                                      file_name)
  os.system('rm -f {}'.format(crc_file_path))

  log("Wrote data to file {}.".format(file_path))
def reconnect(self, min_tresh=0):
  """Re-Connect to Database if minute threshold reached"""
  if (now() - self.last_connect).total_seconds() > min_tresh * 60:
    log('Reconnecting to {}...'.format(self.name))
    if self.cursor is not None:
      self.cursor.close()
    self.connect()
    self.last_connect = now()
def _do_execute(self, sql, cursor=None):
  cursor = cursor if cursor else self.get_cursor()
  try:
    cursor.execute(sql)
  except Exception as E:
    log(Exception('Error for SQL:\n' + sql))
    raise E
  self._fields = self.get_cursor_fields(cursor=cursor)
def select(self,
           sql,
           rec_name='Record',
           dtype='namedtuple',
           limit=None,
           echo=True,
           retrying=False,
           log=log):
  "Select from SQL, return list of namedtuples"
  # if echo: log("Running SQL for '{}'.".format(rec_name))

  self.reconnect(min_tresh=10)

  s_t = datetime.datetime.now()
  cursor = self.get_cursor()

  def get_rows(cursor):
    counter = 0
    row = cursor.fetchone()
    while row:
      counter += 1
      yield row
      if limit and counter == limit:
        break
      row = cursor.fetchone()

  _data = list(self.stream(sql, dtype=dtype, echo=False, limit=limit))

  fields = self._fields
  if not fields:
    return []

  # the limit is already applied inside self.stream
  if dtype == 'namedtuple':
    Record = namedtuple(rec_name.replace(' ', '_').replace('.', '_'), fields)
    data = [Record(*row) for row in _data]
  elif dtype == 'tuple':
    data = [tuple(row) for row in _data]
  elif dtype == 'dataframe':
    data = pandas.DataFrame([row for row in _data], columns=fields)
  else:
    raise (Exception('{} is not recognized.'.format(dtype)))

  secs = (datetime.datetime.now() - s_t).total_seconds()
  rate = round(len(data) / secs, 1)
  if echo:
    log(" >>> Got {} rows in {} secs [{} r/s].".format(len(data), secs, rate))

  return data
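# Usage sketch (hypothetical; assumes `db` is a connected instance of this
# database class and the table exists):
#
#   rows = db.select('select * from my_schema.my_table', limit=100)
#   print(rows[0]._fields)                 # namedtuple field names
#   df = db.select('select count(1) cnt from my_schema.my_table',
#                  dtype='dataframe')      # pandas DataFrame instead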
def get_spark_version(self, spark_home):
  try:
    line = read_file(spark_home + '/RELEASE', read_lines=True)[0]
    version = line.split()[1]
  except Exception as E:
    log(E)
    log('-Unable to determine Spark Version.')
    version = 'x.x'
  return version
def send_email_exchange(to_address,
                        subject,
                        body_text,
                        sender=None,
                        attachments=[],
                        image_paths=[],
                        html=False):
  import smtplib
  from os.path import basename
  from email.mime.application import MIMEApplication
  from email.mime.multipart import MIMEMultipart
  from email.mime.text import MIMEText
  from email.mime.image import MIMEImage
  from email.utils import COMMASPACE, formatdate

  msg = MIMEMultipart('related') if html else MIMEMultipart()
  sender = sender if sender else os.getenv("SMTP_USER")
  to_address = to_address if isinstance(to_address, list) else [to_address]

  msg['From'] = sender
  msg['To'] = ','.join(to_address)
  msg['Subject'] = subject

  if html:
    msg.preamble = 'This is a multi-part message in MIME format.'
    msgAlternative = MIMEMultipart('alternative')
    msg.attach(msgAlternative)

    msgText = MIMEText(extract_text_from_html(body_text))
    msgAlternative.attach(msgText)

    msgText = MIMEText(body_text, 'html')
    msgAlternative.attach(msgText)
  else:
    msg.attach(MIMEText(body_text, 'plain'))

  for i, img_path in enumerate(image_paths):
    with open(img_path, 'rb') as fp:
      msgImage = MIMEImage(fp.read())
    msgImage.add_header('Content-ID', '<image' + str(i + 1) + '>')
    msg.attach(msgImage)

  for f in attachments:
    with open(f, "rb") as file:
      part = MIMEApplication(file.read(), Name=basename(f))
    part['Content-Disposition'] = 'attachment; filename="%s"' % basename(f)
    msg.attach(part)

  # Send the message via our SMTP server
  SMTP_SERVER = os.getenv("SMTP_SERVER")
  if not SMTP_SERVER:
    raise Exception('Env SMTP_SERVER is not defined!')

  s = smtplib.SMTP(SMTP_SERVER)
  s.sendmail(sender, to_address, msg.as_string())
  s.quit()

  log('Sent Email "{}" successfully!'.format(subject))
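# Usage sketch (requires SMTP_SERVER, and optionally SMTP_USER, in the
# environment; the addresses and paths below are hypothetical):
#
#   send_email_exchange(
#     ['ops@example.com'],
#     'Nightly load complete',
#     '<b>All tables refreshed.</b>',
#     attachments=['/tmp/load_report.csv'],
#     html=True,
#   )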
def handle_worker_req(worker: Worker, data_dict):
  """A handler for an unhandled worker request.

  Args:
    worker: the respective worker
    data_dict: the request payload dictionary
  """
  log('Received unhandled worker ({}) data: {}'.format(worker.name, data_dict))
def stream(self,
           sql,
           rec_name='Record',
           dtype='namedtuple',
           yield_chuncks=False,
           chunk_size=None,
           limit=None,
           echo=True):
  "Stream Select from SQL, yield records as they come in"
  self.reconnect(min_tresh=10)

  if echo:
    log("Streaming SQL for '{}'.".format(rec_name))

  self.get_cursor()

  fetch_size = limit if limit else self.fetch_size
  fetch_size = chunk_size if chunk_size else fetch_size
  self.cursor.arraysize = fetch_size
  # self.cursor.itersize = fetch_size

  try:
    self._do_execute(sql)
  except Exception as e:
    raise e

  if dtype == 'tuple':
    make_rec = lambda row: row
    make_batch = lambda rows: rows
  elif dtype == 'dataframe':
    yield_chuncks = True
    make_batch = lambda rows: pandas.DataFrame(rows, columns=self._fields)
  else:
    Record = namedtuple(
      rec_name.replace(' ', '_').replace('.', '_'), self._fields)
    make_rec = lambda row: Record(*row)
    make_batch = lambda rows: [make_rec(r) for r in rows]

  self._stream_counter = 0

  while True:
    if not self._fields:
      break

    rows = self.cursor.fetchmany(fetch_size)
    if rows:
      if yield_chuncks:
        batch = make_batch(rows)
        self._stream_counter += len(batch)
        if len(batch):
          yield batch
      else:
        for row in rows:
          self._stream_counter += 1
          yield make_rec(row)
    else:
      break

    if limit:
      break
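# Usage sketch (hypothetical; assumes `db` is a connected instance of this
# database class):
#
#   # row-by-row streaming as namedtuples
#   for rec in db.stream('select * from my_schema.big_table'):
#     process(rec)                       # `process` is illustrative
#
#   # chunked streaming as pandas DataFrames of ~50k rows each
#   for df in db.stream('select * from my_schema.big_table',
#                       dtype='dataframe', chunk_size=50000):
#     process_df(df)                     # `process_df` is illustrative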
def read_csvD(file_path,
              delimiter=',',
              quotechar='"',
              date_cols=[],
              date_format=None,
              echo=True,
              recarray=False,
              detect_date=True):
  "Use Pandas DataFrame"
  # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
  import pandas

  s_t = now()

  # https://stackoverflow.com/questions/17465045/can-pandas-automatically-recognize-dates
  def date_parser(x):
    dd = x
    try:
      dd = pandas.datetime.strptime(str(x), date_format)
    except ValueError:
      pass
    return dd

  date_parser = date_parser if date_format else None

  df = pandas.read_csv(
    file_path,
    delimiter=delimiter,
    parse_dates=date_cols,
    date_parser=date_parser,
    # quoting=csv.QUOTE_MINIMAL,
    infer_datetime_format=detect_date,
  )

  if recarray:
    df = df.to_records()

  for col in df.columns:
    if not detect_date:
      continue
    if col in date_cols:
      continue
    if df[col].dtype == 'object':
      try:
        df[col] = pandas.to_datetime(df[col])
      except ValueError:
        pass

  replace_func = lambda col: re.sub(r'_+', '_', re.sub(r'[\]\[. ]', '_', col))
  df = df.rename(columns={col: replace_func(col) for col in df.columns})

  secs = (now() - s_t).total_seconds()
  rate = round(len(df) / secs, 1)
  if echo:
    log("Imported {} rows from {} [{} r/s].".format(len(df), file_path, rate))

  return df
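# Usage sketch (hypothetical file and columns; a column named 'Created Date'
# is renamed to 'Created_Date' by the column-cleanup step):
#
#   df = read_csvD('/tmp/orders.csv', date_cols=['Created Date'],
#                  date_format='%Y-%m-%d %H:%M:%S')
#   print(df.dtypes)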
def on_response(data):
  if data == 'OK':
    return

  res = '+OK' if data['completed'] else '~NOT OK'
  res = '+Queued' if 'queued' in data and data['queued'] else res
  log('{} for {}: {}'.format(
    res,
    data['orig_req']['req_type'],
    # data['req_type'],
    data,
  ))
def make_df(rows, _fields):
  global buf_df, buf_rows, making_df
  buf_rows += rows
  if len(buf_rows) > 100000:
    with th_lock:
      log('making buf_df')
      making_df = True
      buf_df = pandas.DataFrame(buf_rows, columns=_fields)
      making_df = False
      buf_rows = []
def _get_jar_paths(self, profile):
  from xutil.database.jdbc import get_jar_path

  if 'drivers' not in profile:
    log(Exception('"drivers" key not in profile!'))
    return

  jar_paths = []
  for db_type in profile['drivers']:
    jar_path = get_jar_path(db_type, profile)
    jar_paths.append(jar_path)

  return ':'.join(jar_paths)
def __init__(self, conn_dict, profile=None, echo=False):
  "Initiate connection"
  self._cred = struct(conn_dict)
  self._cred.kwargs = conn_dict.get('kwargs', {})
  self.name = self._cred.get('name', None)
  self.username = self._cred.get('username', None)
  self.type = self._cred.type
  self.engine = None
  self._cursor_description = None
  self.profile = profile
  self.batch_size = 10000
  self.fetch_size = 20000
  self.echo = echo
  self.connect()
  self.last_connect = now()

  # Base Template
  template_base_path = '{}/database/templates/base.yaml'.format(
    get_dir_path())
  self.template_dict = read_yaml(template_base_path)

  # Specific Type Template
  template_path = '{}/database/templates/{}.yaml'.format(
    get_dir_path(), self.type)
  temp_dict = read_yaml(template_path)

  for key1 in temp_dict:
    # Level 1
    if isinstance(temp_dict[key1], dict):
      if key1 not in self.template_dict:
        self.template_dict[key1] = temp_dict[key1]

      # Level 2
      for key2 in temp_dict[key1]:
        # Always Overwrite
        self.template_dict[key1][key2] = temp_dict[key1][key2]
    else:
      # Level 1 Non-Dict Overwrite
      self.template_dict[key1] = temp_dict[key1]

  self.variables = self._template('variables')

  if os.getenv('PROFILE_YAML'):
    other_vars = get_variables()
    for key in other_vars:
      self.variables[key] = other_vars[key]

  self.tmp_folder = self.variables['tmp_folder']
  self.set_variables()

  if echo:
    log("Connected to {} as {}".format(self._cred.name, self._cred.user))
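# Usage sketch (hypothetical connection dict; only name, username and type are
# read directly by __init__ above, the remaining keys are whatever connect()
# expects for the given database type):
#
#   conn_dict = dict(
#     name='PG1',
#     type='postgresql',
#     host='db.example.com',
#     port=5432,
#     username='user',
#     password='password',
#     database='analytics',
#   )
#   db = DBConn(conn_dict, echo=True)   # class name here is illustrative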
def read_csv2(self,
              file_path=None,
              hdfs_file_path=None,
              hdfs_folder=None,
              timestampFormat='yyyy-MM-dd HH:mm:ss',
              dateFormat='yyyy-MM-dd',
              delimeter=',',
              date_cols=[],
              timestamp_cols=[],
              log=log,
              escape='"',
              hdfs_put=True,
              ignoreTrailingWhiteSpace=False,
              schema=None,
              hdfs_delete_local=False):
  "Read from csv file initially on local"
  # https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html
  # The dateFormat field does not parse/assign dates correctly... not sure why.
  # see https://stackoverflow.com/questions/40878243/reading-csv-into-a-spark-dataframe-with-timestamp-and-date-types
  # seems the only way is to assign the schema manually or to cast manually
  # (which is being done here).

  if self.hive_enabled and file_path:
    if not hdfs_folder:
      raise Exception('No value provided for hdfs_folder!')
    hdfs_file_path = hdfs_file_path if hdfs_file_path else self.hdfs_put(
      file_path, hdfs_folder, put=self.hive_enabled, log=log)
    if hdfs_delete_local:
      os.system('rm -rf ' + file_path)
    f_path = hdfs_file_path
  else:
    f_path = file_path

  params = dict(
    path=f_path,
    header=True,
    timestampFormat=timestampFormat,
    # dateFormat=dateFormat,
    inferSchema=False if schema else True,
    schema=schema,
    sep=delimeter,
    escape=escape,
    ignoreTrailingWhiteSpace=ignoreTrailingWhiteSpace,
  )

  if self.version >= '2.2':
    params['multiLine'] = True

  df = self.spark.read.csv(**params)
  df = self.process_df_fields(df, date_cols, timestamp_cols, dateFormat,
                              timestampFormat)

  log('Finished reading file: ' + f_path)
  return df
def start_listener(self, channel, queue, echo=True):
  log('Running Listener on channel: ' + channel, color='green')

  def run_redis_listener(redb, channel, queue):
    for payload in redb.listen(channel):
      queue.put(payload)
      if payload == 'exit':
        return

  self.listener_proc = Process(
    target=run_redis_listener, args=(self, channel, queue))
  self.listener_proc.start()

  return self.listener_proc
def start_worker_db(db_name, start=False):
  """Create and start a database worker

  Args:
    db_name: the name of the database
    start: Whether to automatically start the worker or not

  Returns:
    The worker object.
  """
  db_prof = get_db_profile(db_name)
  db_workers_map[db_name] = db_workers_map.get(db_name, [])

  # multiple workers for same database
  index = 0
  worker_name = '{}-{}-{}'.format(WORKER_PREFIX, db_name, index)

  while worker_name in workers:
    # in case worker name is already in use
    index += 1
    worker_name = '{}-{}-{}'.format(WORKER_PREFIX, db_name, index)

  worker = Worker(
    worker_name,
    'database-client',
    fn=db_worker.run,
    log=log,
    kill_if_running=True,
    args=(db_prof, conf_queue),
    kwargs={},
    pid_folder=DBNET_FOLDER)
  worker.status = 'IDLE'

  if start:
    worker.start()
    log('*Started worker {} with PID {}'.format(worker.name, worker.pid))

    workers['mon'].put_child_q(dict(name=worker_name, pid=worker.pid))  # add to monitor

    store.sqlx('workers').replace_rec(
      hostname=worker.hostname,
      worker_name=worker.name,
      worker_type=worker.type,
      worker_pid=worker.pid,
      queue_length=0,
      status='IDLE',
      last_updated=epoch(),
    )

  workers[worker_name] = worker
  db_workers_map[db_name].append(worker)

  return worker
def drop_table(self, table, log=log):
  "Drop table"
  try:
    sql = self._template('core.drop_table').format(table)
    self._do_execute(sql)
  except Exception as E:
    message = get_exception_message().lower()
    if self._template('error_filter.table_not_exist') in message:
      if self.echo:
        log('Table "{}" already dropped.'.format(table))
    else:
      raise E
def apprise_notify(sid, data):
  """Send Notification on Apprise"""
  url = os.getenv("DBNET_APPRISE_URL")
  apobj = apprise.Apprise()
  apobj.add(url)
  apobj.notify(
    title=data['title'],
    body=data['body'],
  )
  log(f'''Sent notification: "{data['title']}"''')
  return 'OK'
def _do_execute(self, sql):
  try:
    self._cursor_description = None
    self.fields = None
    self.result = self.connection.execute(sql)
    self._cursor_description = self.result._cursor_description()
    self._fields = self._get_cursor_fields()
  except Exception as E:
    if 'not open' in get_error_str(E):
      pass  # error when Oracle doesn't have a cursor open
    else:
      log(Exception('Error for SQL:\n' + sql))
      raise E