Example #1
def write_csv(file_path,
              headers,
              data,
              footer_text=None,
              append=False,
              log=log,
              echo=True,
              file_obj=None,
              encoding="utf8"):
    "Write to CSV, python3 compatible. 'data' must be list of iterables"
    s_t = now()
    mode = 'a' if append else 'w'
    f = file_obj if file_obj else open(
        file_path, mode, newline='', encoding=encoding)
    w = csv.writer(f,
                   delimiter=',',
                   quoting=csv.QUOTE_MINIMAL,
                   lineterminator='\n')
    if not append: w.writerow(headers)
    for row in data:
        w.writerow(row)

    if footer_text:
        f.write(footer_text)

    if not file_obj: f.close()

    secs = (now() - s_t).total_seconds()
    rate = round(len(data) / secs, 1)
    if echo:
        log("Wrote {} rows to {} [{} r/s].".format(len(data), file_path, rate))
Example #2
def read_csv(file_path,
             delimiter=',',
             quotechar='"',
             mode='r',
             encoding="utf8"):
    """Read CSV from File"""
    s_t = now()

    with open(file_path, mode, encoding=encoding) as f:
        reader = csv.reader(f, delimiter=delimiter, quotechar=quotechar)
        headers = next(reader)
        Data = namedtuple("Data", [h.replace(' ', '_') for h in headers])
        data_rows = []
        try:
            i = 0
            # 'map' works on both Python 2 and 3 (itertools.imap does not exist in 3)
            for i, row in enumerate(map(Data._make, reader)):
                data_rows.append(row)
        except Exception:
            log('ERROR at line ' + str(i + 1))
            raise

    secs = (now() - s_t).total_seconds()
    rate = round(len(data_rows) / secs, 1)
    log("Imported {} rows from {} [{} r/s].".format(len(data_rows), file_path,
                                                    rate))

    return data_rows
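
A minimal round-trip sketch for the two CSV helpers above. It assumes write_csv and read_csv are importable from the surrounding module (along with its log/now helpers); the path and field names are made up for illustration.

# hypothetical usage of write_csv / read_csv
from collections import namedtuple

Row = namedtuple('Row', ['id', 'name'])
rows = [Row(1, 'alice'), Row(2, 'bob')]

write_csv('/tmp/people.csv', Row._fields, rows)  # header row + 2 data rows
people = read_csv('/tmp/people.csv')             # rows come back as namedtuples of strings
assert people[0].name == 'alice'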
Example #3
  def transfer_progress(self, transferred, total, unit='B'):
    "Display transfer progress"
    prct = int(100.0 * transferred / total)
    divide = lambda x, y: round(1.0 * x / y, 1)

    if self.last_stat:
      secs = (datetime.datetime.now() - self.last_stat['time']).total_seconds()
      if secs > 2:
        rate = round((transferred - self.last_stat['transferred']) / secs, 1)
        self.last_stat = dict(time=now(), transferred=transferred, rate=rate)
      else:
        rate = self.last_stat['rate']
    else:
      rate = 0
      self.last_stat = dict(time=now(), transferred=transferred, rate=rate)

    if total > 1024**3:
      transferred = divide(transferred, 1024**3)
      total = divide(total, 1024**3)
      unit = 'GB'
      rate = '{} {} / sec'.format(divide(rate, 1024**2), 'MB')
    elif total > 1024**2:
      transferred = divide(transferred, 1024**2)
      total = divide(total, 1024**2)
      unit = 'MB'
      rate = '{} {} / sec'.format(divide(rate, 1024**2), unit)
    elif total > 1024**1:
      transferred = divide(transferred, 1024**1)
      total = divide(total, 1024**1)
      unit = 'KB'
      rate = '{} {} / sec'.format(divide(rate, 1024**1), unit)
    else:
      # keep the log format consistent for transfers under 1 KB
      rate = '{} {} / sec'.format(rate, unit)
    log('+{}% Complete: {} / {} {} @ {}'.format(
      prct, transferred, total, unit, rate))
Example #4
def write_pq(
    file_path,
    dataf,
    partition_cols=None,
    flavor='spark',
    filesystem=None,
    append=False,
    log=log,
):
    "Write to Parquet, python3 compatible. 'data' must be list of interables"
    s_t = now()

    if not append and os.path.exists(file_path):
        shutil.rmtree(file_path, ignore_errors=True)

    table = pa.Table.from_pandas(dataf, nthreads=psutil.cpu_count())
    counter = table.num_rows
    pq.write_to_dataset(
        table,
        root_path=file_path,
        partition_cols=partition_cols,
        flavor=flavor,
        preserve_index=False,
        filesystem=filesystem,
        use_deprecated_int96_timestamps=True,
        compression='snappy')  # will append. delete folder for overwrite

    secs = (now() - s_t).total_seconds()
    rate = round(counter / secs, 1)
    log("Wrote: {} rows to {} [{} r/s].".format(counter, file_path, rate))
    return counter
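
A short usage sketch for write_pq, assuming pandas, pyarrow and the module-level names used above (pa, pq, psutil, log) are available; the DataFrame and path are illustrative.

# hypothetical usage of write_pq
import pandas as pd

df = pd.DataFrame({'country': ['US', 'FR', 'US'], 'amount': [10, 20, 30]})
write_pq('/tmp/amounts.parquet', df, partition_cols=['country'])  # one sub-folder per country value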
Example #5
 def reconnect(self, min_tresh=0):
     """Re-Connect to Database if minute threshold reached"""
     if (now() - self.last_connect).total_seconds() > min_tresh * 60:
         log('Reconnecting to {}...'.format(self.name))
         if self.cursor is not None:
             self.cursor.close()
         self.connect()
         self.last_connect = now()
Example #6
def read_csvD(file_path,
              delimiter=',',
              quotechar='"',
              date_cols=[],
              date_format=None,
              echo=True,
              recarray=False,
              detect_date=True):
    "Use Pandas DataFrame"
    # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
    import pandas
    s_t = now()

    # https://stackoverflow.com/questions/17465045/can-pandas-automatically-recognize-dates
    def date_parser(x):
        dd = x
        try:
            dd = pandas.datetime.strptime(str(x), date_format)
        except ValueError:
            pass
        return dd

    date_parser = date_parser if date_format else None

    df = pandas.read_csv(
        file_path,
        delimiter=delimiter,
        parse_dates=date_cols,
        date_parser=date_parser,
        # quoting=csv.QUOTE_MINIMAL ,
        infer_datetime_format=detect_date,
    )

    if detect_date:
        # try to parse remaining 'object' columns as dates
        for col in df.columns:
            if col in date_cols: continue
            if df[col].dtype == 'object':
                try:
                    df[col] = pandas.to_datetime(df[col])
                except ValueError:
                    pass

    # normalize column names: brackets, dots and spaces become underscores
    replace_func = lambda col: re.sub(r'_+', '_', re.sub(
        r'[\]\[. ]', '_', col))
    df = df.rename(columns={col: replace_func(col) for col in df.columns})

    # convert last: a recarray has no .columns / .rename
    if recarray:
        df = df.to_records()

    secs = (now() - s_t).total_seconds()
    rate = round(len(df) / secs, 1)
    if echo:
        log("Imported {} rows from {} [{} r/s].".format(
            len(df), file_path, rate))

    return df
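
A brief usage sketch for read_csvD; the file path, column name and date format are illustrative. Note that a column such as 'order date' is renamed to 'order_date' by the column clean-up step above.

# hypothetical usage of read_csvD
df = read_csvD('/tmp/orders.csv',
               date_cols=['order date'],
               date_format='%Y-%m-%d')
print(df['order_date'].dtype)  # datetime64[ns] if parsing succeeded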
Example #7
def write_jsonl(file_path, data, log=log, encoding="utf8"):
    "Write JSONL to File"
    import jsonlines
    s_t = now()

    with open(file_path, 'w', encoding=encoding) as f:
        writer = jsonlines.Writer(f)
        writer.write_all(data)

    counter = len(data)
    secs = (now() - s_t).total_seconds()
    rate = round(counter / secs, 1)
    log("Wrote {} rows to {} [{} r/s].".format(counter, file_path, rate))
Example #8
def read_jsonl(file_path, log=log, encoding="utf8"):
    "Read from JSONL File"
    import jsonlines
    s_t = now()

    with open(file_path, 'r', encoding=encoding) as f:
        reader = jsonlines.Reader(f)
        data = list(reader.iter())

    counter = len(data)
    secs = (now() - s_t).total_seconds()
    rate = round(counter / secs, 1)
    log("Read {} rows from {} [{} r/s].".format(counter, file_path, rate))

    return data
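
A quick round-trip sketch for the JSONL helpers above, assuming the jsonlines package is installed; the records are illustrative.

# hypothetical usage of write_jsonl / read_jsonl
records = [{'id': 1, 'ok': True}, {'id': 2, 'ok': False}]
write_jsonl('/tmp/events.jsonl', records)  # one JSON object per line
events = read_jsonl('/tmp/events.jsonl')   # -> list of dicts, same shape as 'records'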
Example #9
def start_worker_mon():
    """Starts the Monitoring worker"""
    worker_name = '{}-mon'.format(WORKER_PREFIX)
    worker = Worker(worker_name,
                    'monitor',
                    fn=mon_worker.run,
                    kwargs={},
                    log=log,
                    kill_if_running=True,
                    pid_folder=DBNET_FOLDER)

    worker.start()
    log('Monitor Loop PID is {}'.format(worker.pid))

    workers['mon'] = worker
    workers['mon'].put_child_q(dict(name=worker_name,
                                    pid=worker.pid))  # add to monitor
    store.sqlx('workers').replace_rec(
        hostname=worker.hostname,
        worker_name=worker.name,
        worker_type=worker.type,
        worker_pid=worker.pid,
        status='RUNNING',
        task_id=-1,
        task_function=worker.fn.__name__,
        task_start_date=now(),
        task_args=jdumps(worker.args),
        task_kwargs=jdumps(worker.kwargs),
        progress=None,
        queue_length=0,
        last_updated=epoch(),
    )
    return worker
Example #10
def start_worker_webapp():
    """Starts the WebApp worker"""
    worker_name = '{}-webapp'.format(WORKER_PREFIX)

    worker = Worker(worker_name,
                    'web-app',
                    fn=webapp_worker.run,
                    log=log,
                    kill_if_running=True,
                    args=(WEBAPP_HOST, WEBAPP_PORT),
                    kwargs={'mon_worker': workers['mon']},
                    pid_folder=DBNET_FOLDER)
    worker.start()
    workers['mon'].put_child_q(dict(name=worker_name,
                                    pid=worker.pid))  # add to monitor
    workers['webapp'] = worker
    store.sqlx('workers').replace_rec(
        hostname=worker.hostname,
        worker_name=worker.name,
        worker_type=worker.type,
        worker_pid=worker.pid,
        status='RUNNING',
        task_id=-1,
        task_function=worker.fn.__name__,
        task_start_date=now(),
        task_args=jdumps(worker.args),
        task_kwargs=jdumps(worker.kwargs),
        progress=None,
        queue_length=0,
        last_updated=epoch(),
    )
    return worker
Example #11
def hdfs_put(file_path,
             hdfs_folder,
             put=True,
             rm=True,
             log=log):
  "Move file from local to HDFS"
  file_size = get_path_size(file_path)
  file_name = file_path.split('/')[-1]
  hdfs_file_path = '{}/{}'.format(hdfs_folder, file_name)
  s_t = now()
  if rm:
    os.system('hdfs dfs -rm -r -f -skipTrash {}'.format(hdfs_file_path))
  if put:
    os.system('hdfs dfs -put -f {} {}'.format(file_path, hdfs_folder))
    secs = (now() - s_t).total_seconds()
    mins = round(secs / 60, 1)
    file_size_mb = round(file_size / 1024**2, 1)  # file_size is in bytes, matching the MB/s rate below
    rate = round(file_size / 1024**2 / secs, 1)
    log("Moved data into HDFS in {} mins [{} MB -> {} MB/s]".format(
      mins, file_size_mb, rate))
  return hdfs_file_path
Example #12
    def __init__(self, conn_dict, profile=None, echo=False):
        "Inititate connection"
        self._cred = struct(conn_dict)
        self._cred.kwargs = conn_dict.get('kwargs', {})
        self.name = self._cred.get('name', None)
        self.username = self._cred.get('username', None)
        self.type = self._cred.type
        self.engine = None
        self._cursor_description = None
        self.profile = profile
        self.batch_size = 10000
        self.fetch_size = 20000
        self.echo = echo
        self.connect()
        self.last_connect = now()

        # Base Template
        template_base_path = '{}/database/templates/base.yaml'.format(
            get_dir_path())
        self.template_dict = read_yaml(template_base_path)

        # Specific Type Template
        template_path = '{}/database/templates/{}.yaml'.format(
            get_dir_path(), self.type)
        temp_dict = read_yaml(template_path)

        for key1 in temp_dict:
            # Level 1
            if isinstance(temp_dict[key1], dict):
                if key1 not in self.template_dict:
                    self.template_dict[key1] = temp_dict[key1]

                # Level 2
                for key2 in temp_dict[key1]:
                    # Always Overwrite
                    self.template_dict[key1][key2] = temp_dict[key1][key2]
            else:
                # Level 1 Non-Dict Overwrite
                self.template_dict[key1] = temp_dict[key1]

        self.variables = self._template('variables')

        if os.getenv('PROFILE_YAML'):
            other_vars = get_variables()
            for key in other_vars:
                self.variables[key] = other_vars[key]

        self.tmp_folder = self.variables['tmp_folder']
        self.set_variables()

        if echo:
            log("Connected to {} as {}".format(self._cred.name,
                                               self.username))
Example #13
def write_jsonls(file_path, data, log=log):
    "Sream Write to JSON Lines. 'data' must be namedtuple. schema is a dict of field to data-type"
    import jsonlines
    s_t = now()
    l_t = now()
    msg_dlt = 10000
    counter = 0
    counter2 = 0

    with open(file_path, 'wb') as f:
        w = jsonlines.Writer(f)
        for row in data:
            w.write(row)
            counter += 1
            counter2 += 1

            # progress message
            if counter2 % msg_dlt == 0:
                secs_l = (now() - l_t).total_seconds()
                if secs_l >= 20:
                    secs = (now() - s_t).total_seconds()
                    rate = round(counter2 / secs_l, 1)
                    mins = round(secs / 60, 1)
                    log("{} min ## Writing to JSON: {} rows @ {} r/s.".format(
                        mins, counter, rate))
                    l_t = now()
                    counter2 = 0

    secs = (now() - s_t).total_seconds()
    rate = round(counter / secs, 1)
    log("Wrote {} rows to {} [{} r/s].".format(counter, file_path, rate))
Example #14
def write_csvs(file_path,
               data,
               fields=None,
               lineterminator='\n',
               deli=',',
               log=log,
               secs_d=10,
               gzip=False,
               to_str=False,
               encoding="utf8"):
    "Write to CSV, python3 compatible. 'data' must be list of interables"
    s_t = now()
    l_t = now()
    log_dlt = 100
    counter2 = 0
    f1 = open(file_path, 'w', newline='', encoding=encoding)
    w = csv.writer(f1,
                   delimiter=deli,
                   quoting=csv.QUOTE_MINIMAL,
                   lineterminator=lineterminator)

    file_name = file_path.split('/')[-1]
    if fields:
        w.writerow(fields)
    else:
        row = next(data)
        w.writerow(row._fields)  # write headers
        w.writerow(row)  # write 1st row

    counter = 0
    for row in data:
        row = [str(v) for v in row] if to_str else row
        w.writerow(row)
        counter += 1
        counter2 += 1

        # progress message
        if counter2 % log_dlt == 0:
            secs_l = (now() - l_t).total_seconds()
            if secs_l >= secs_d:
                secs = (now() - s_t).total_seconds()
                rate = round(counter2 / secs_l, 1)
                mins = round(secs / 60, 1)
                log("{} min ## Writing to {}: {} rows @ {} r/s.".format(
                    mins, file_name, counter, rate))
                l_t = now()
                counter2 = 0

    f1.close()

    if gzip:
        log('Gzipping ' + file_path)
        os.system('gzip -f ' + file_path)
        file_path = file_path + '.gz'

    secs = (now() - s_t).total_seconds()
    rate = round(counter / secs, 1)
    log("Wrote: {} rows to {} [{} r/s].".format(counter, file_path, rate))
    return counter
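
A streaming usage sketch for write_csvs: when 'fields' is omitted, the first row is pulled with next() to derive the headers, so the data source should be a generator (or other iterator) of namedtuples. All names below are illustrative.

# hypothetical usage of write_csvs with a row generator
from collections import namedtuple

Event = namedtuple('Event', ['ts', 'value'])

def gen_events(n):
    for i in range(n):
        yield Event(i, i * 2)

count = write_csvs('/tmp/events.csv', gen_events(100000))  # headers come from Event._fields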
Example #15
def write_pqs(
    file_path,
    dataf_chunks,
    partition_cols=None,
    flavor='spark',
    append=False,
    filesystem=None,
    log=log,
    secs_d=10,
):
    "Stream-Write to Parquet, python3 compatible. 'dataf_chunks' must be list of dataframes"
    s_t = now()
    l_t = now()
    log_dlt = 100
    counter = 0
    counter2 = 0

    if not append and os.path.exists(file_path):
        shutil.rmtree(file_path, ignore_errors=True)

    file_name = file_path.split('/')[-1]
    for dataf in dataf_chunks:
        table = pa.Table.from_pandas(dataf, nthreads=psutil.cpu_count())
        counter += table.num_rows
        counter2 += table.num_rows
        pq.write_to_dataset(
            table,
            root_path=file_path,
            partition_cols=partition_cols,
            flavor=flavor,
            preserve_index=False,
            filesystem=filesystem,
            use_deprecated_int96_timestamps=True,
            compression='snappy')  # will append. delete folder for overwrite

        secs_l = (now() - l_t).total_seconds()
        if secs_l >= secs_d:
            secs = (now() - s_t).total_seconds()
            rate = round(counter2 / secs_l, 1)
            mins = round(secs / 60, 1)
            log("{} min ## Writing to {}: {} rows @ {} r/s.".format(
                mins, file_name, counter, rate))
            l_t = now()
            counter2 = 0

    secs = (now() - s_t).total_seconds()
    rate = round(counter / secs, 1)
    log("Wrote: {} rows to {} [{} r/s].".format(counter, file_path, rate))
    return counter
Example #16
            def exec_sql(sql, limit_def=5000):
                log('\n------------SQL-START------------\n{}\n------------SQL-END------------ \n'
                    .format(sql),
                    color='blue')
                log('LIMIT: ' + str(limit), color='blue')
                cache_used = False
                if sql in worker_sql_cache:
                    for fields, rows in list(worker_sql_cache[sql]['results']):
                        # if limit above limit_def, then refresh
                        if limit > limit_def: break

                        # if limit is same and not a csv call, then refresh
                        if (limit == worker_sql_cache[sql]['limit']
                                and 'csv' not in options):
                            break

                        # if ran more than 10 minutes ago, then refresh
                        if now_minus(minutes=10) > worker_sql_cache[sql]['timestamp']:
                            del worker_sql_cache[sql]
                            break

                        if len(fields) > 0:
                            cache_used = True  # must return data/fields
                            worker_sql_cache[sql]['limit'] = limit
                            log('+Cache Used')

                        yield fields, rows, cache_used

                if not cache_used:
                    worker_sql_cache[sql] = dict(timestamp=now(),
                                                 results=[],
                                                 limit=limit)
                    rows = conn.query(
                        sql.replace('%', '%%'),
                        dtype='tuple',
                        limit=limit if limit > limit_def else limit_def)
                    fields = conn._fields
                    worker_sql_cache[sql]['results'].append((fields, rows))
                    yield fields, rows, cache_used
Example #17
def run(db_prof, conf_queue: Queue, worker: Worker):
    """Launch the database worker and await requests.
  
  Args:
    db_prof: the db profile
    conf_queue: a multiprocessing Queue
    worker: the respective worker.
  """

    global worker_name, worker_status
    log = worker.log
    worker_name = worker.name
    worker_status = 'IDLE'
    set_worker_idle()
    worker_db_prof = db_prof

    while True:
        try:
            time.sleep(0.05)  # brings down CPU loop usage
        except (KeyboardInterrupt, SystemExit):
            return
        # data_dict = worker.pipe.recv_from_parent(timeout=0)
        data_dict = worker.get_child_q()
        if data_dict:
            conf_data = {'payload_type': 'confirmation'}
            if data_dict['req_type'] in func_map:
                worker_queue.append(data_dict)
                sync_queue()
                conf_data['queued'] = True

                # Add task
                store.sqlx('tasks').add(
                    task_id=data_dict['id'],
                    function=func_map[data_dict['req_type']].__name__,
                    queue_date=now(),
                    start_date=None,
                    end_date=None,
                    args=jdumps([]),
                    kwargs=jdumps(data_dict),
                    error=None,
                    worker_name=worker_name,
                    worker_pid=worker_pid,
                    last_updated=epoch(),
                )

                log('+({}) Queued task: {}'.format(len(worker_queue),
                                                   data_dict))

            # Send receipt confirmation?
            # with worker.lock:
            #   worker.pipe.send_to_parent(conf_data)

        if len(worker_queue) and worker_status == 'IDLE':
            data_dict = worker_queue.popleft()
            sync_queue()
            worker_status = 'BUSY'
            func = func_map[data_dict['req_type']]

            # Sync worker
            store.sqlx('workers').update_rec(
                hostname=worker.hostname,
                worker_name=worker.name,
                status=worker_status,
                task_id=data_dict['id'],
                task_function=func.__name__,
                task_start_date=now(),
                task_args=jdumps([]),
                task_kwargs=jdumps(data_dict),
                last_updated=epoch(),
            )

            # Sync task
            store.sqlx('tasks').update_rec(
                task_id=data_dict['id'],
                start_date=now(),
                last_updated=epoch(),
            )

            try:
                error_data = None
                func(worker, data_dict)
            except Exception as E:
                log(E)
                error_data = dict(
                    id=data_dict['id'],
                    sid=data_dict['sid'],
                    payload_type='task-error',
                    error=get_error_str(E),
                )
                # worker.pipe.send_to_parent(error_data)
                worker.put_parent_q(error_data)
            finally:

                # Sync worker
                worker_status = 'IDLE'
                set_worker_idle()

                # Sync task
                store.sqlx('tasks').update_rec(
                    task_id=data_dict['id'],
                    end_date=now(),
                    error=jdumps(error_data) if error_data else None,
                    last_updated=epoch(),
                )
Example #18
def get_conn(db,
             dbs=None,
             echo=True,
             reconnect=False,
             use_jdbc=False,
             conn_expire_min=10,
             spark_hive=False) -> DBConn:
    global conns

    dbs = dbs if dbs else get_databases()
    profile = get_profile()
    db_dict = struct(dbs[db])

    if db_dict.type.lower() == 'hive' and spark_hive:
        db_dict.type = 'spark'

    use_jdbc = use_jdbc or ('use_jdbc' in db_dict and db_dict['use_jdbc'])

    if db in conns and not reconnect:
        if (now() -
                conns[db].last_connect).total_seconds() / 60 < conn_expire_min:
            return conns[db]

    if use_jdbc:
        log('*USING JDBC for ' + db)
        from .jdbc import JdbcConn
        conn = JdbcConn(db_dict, profile=profile)

    elif db_dict.type.lower() == 'oracle':
        from .oracle import OracleConn
        conn = OracleConn(db_dict, echo=echo)

    elif db_dict.type.lower() == 'spark':
        from .spark import SparkConn
        conn = SparkConn(db_dict, echo=echo)

    elif db_dict.type.lower() == 'hive':
        from .hive import HiveConn, Beeline
        if 'use_beeline' in db_dict and db_dict.use_beeline:
            conn = Beeline(db_dict, echo=echo)
        else:
            conn = HiveConn(db_dict, echo=echo)

    elif db_dict.type.lower() in ('postgresql', 'redshift'):
        from .postgresql import PostgreSQLConn
        conn = PostgreSQLConn(db_dict, echo=echo)

    elif db_dict.type.lower() == 'sqlserver':
        from .sqlserver import SQLServerConn
        conn = SQLServerConn(db_dict, echo=echo)

    elif db_dict.type.lower() == 'sqlite':
        from .sqlite import SQLiteConn
        conn = SQLiteConn(db_dict, echo=echo)
    else:
        raise Exception(f'Type {db_dict.type} not handled!')

    conns[db] = conn
    return conn
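
A hedged sketch of how get_conn might be called, assuming the databases YAML known to get_databases() defines an entry named 'PG1' of type postgresql; the name is illustrative.

# hypothetical usage of get_conn
conn = get_conn('PG1')                                  # cached DBConn (PostgreSQLConn here)
conn = get_conn('PG1', use_jdbc=True, reconnect=True)   # force a fresh JDBC connection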