示例#1
0
def upload_file(fname, cmd, rmcmd,snow_conn):
	"""Compress a dump file, run the upload command, bulk-copy it, clean up.

	Steps, in order:
	  1. ``compress_file(fname)`` is called (defined outside this block).
	  2. ``cmd`` is executed through the shell.
	  3. ``Snowflake.bulk_copy`` is invoked with query name ``copyStmt`` and
	     a file list holding the single name ``<fname>.gz``.
	  4. The local ``<fname>.gz`` is deleted and ``rmcmd`` is executed
	     through the shell.

	Args:
		fname: path of the uncompressed dump file.
		cmd: shell command line run before the bulk copy
			(presumably the upload of the compressed file -- confirm with caller).
		rmcmd: shell command line run after the bulk copy
			(presumably removes the uploaded copy -- confirm with caller).
		snow_conn: transaction/connection object handed to ``bulk_copy``.

	The exit codes of both shell commands are ignored.
	"""
	print('Starting %s' % fname)
	compress_file(fname)
	subprocess.call([cmd], shell=True)

	gz_name = '%s.gz' % fname
	staged = InOut()
	staged.file_names = [gz_name]
	Snowflake.bulk_copy(
		trans=snow_conn,
		file_names=staged,
		target=cli.tcfg,
		qname='copyStmt',
	)

	# Local archive first, then whatever rmcmd cleans up.
	os.remove(gz_name)
	subprocess.call([rmcmd], shell=True)
示例#2
0
def run():
	"""Bulk-insert the ``*.out`` dump files of every source into its targets.

	For each entry of ``cli.cfg['dump']`` the configured ``dumpDir`` is
	globbed for ``*.out`` files.  For every target configured for that
	source the files are inserted inside one transaction on the
	module-level ``lite_conn`` (``create_table=True``,
	``strip_line_term=True``).  The parsed ``targetTable`` of the last
	target handled is recorded per source, and the record is pretty-printed
	after each source.
	"""
	loaded_tables = {}
	for _source, dump_cfg in cli.cfg['dump'].items():
		cli.set_source(_source)
		_src_class = list(dump_cfg.keys())[0]
		DirReader = create_reader(aname=_src_class, app_init=app_init)

		# Locate the dump files of this source.
		cli.set_source(_source)
		dir_scfg = cli.get_dcfg(_src_class)
		dump_dir = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg)
		out_files = InOut(file_names=[])
		print(dump_dir)
		DirReader.glob_dir(path=dump_dir, out=out_files, ext='*.out')
		pp(out_files.file_names)

		# Load them into each configured target.
		for _trg_class, _unused in cli.cfg['target'][_source].items():
			tcfg = cli.get_tcfg(_trg_class)
			cli.tcfg = tcfg
			target_db = tcfg["targetDb"]
			toDB = create_writer(aname=_trg_class, app_init=app_init)

			toDB.begin_transaction(env=target_db, out=lite_conn)
			toDB.bulk_insert(
				trans=lite_conn,
				file_names=out_files,
				qname='insertStmt',
				cfg=(dir_scfg, tcfg),
				create_table=True,
				strip_line_term=True,
			)
			toDB.commit_transaction(trans=lite_conn)

			loaded_tables[_source] = cli.get_parsed(ckey='targetTable', cfg=tcfg)

		pp(loaded_tables)
示例#3
0
文件: dir_iq.py 项目: pie-crust/etl
def run():
    """Describe and bulk-load the ``*.csv`` dump files of every source.

    For each entry of ``cli.cfg['dump']`` the ``dumpDir`` is globbed for
    ``*.csv`` files into the module-level ``data_files``.  For every target
    of that source each file is described through a ``File`` reader, then
    the target table is described and all files are loaded with
    ``bulk_load_file`` inside one transaction.  Statistics checks and the
    e-mail notification are switched off (``if 0``).
    """
    stats = {}
    for _source, dump_cfg in cli.cfg['dump'].items():
        cli.set_source(_source)
        _src_class = list(dump_cfg.keys())[0]
        DirReader = create_reader(aname=_src_class, app_init=app_init)

        # Step 1: collect the file names.
        cli.set_source(_source)
        dir_scfg = cli.get_dcfg(_src_class)
        dump_dir = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg)
        DirReader.glob_dir(path=dump_dir, out=data_files, ext='*.csv')

        # Step 2: load into each target database.
        to_conn = InOut()
        for _trg_class, _unused in cli.cfg['target'][_source].items() or []:
            tcfg = cli.get_tcfg(_trg_class)
            cli.tcfg = tcfg
            _dbname = tcfg["targetDb"]

            for csv_name in data_files.file_names:
                csv_reader = create_reader(aname='File',
                                           app_init=app_init,
                                           file_name=csv_name,
                                           scfg=dir_scfg)
                csv_reader.describe()

            toDB = create_writer(aname=_trg_class, app_init=app_init)
            toDB.begin_transaction(env=tcfg['targetDb'], out=to_conn)

            schema_name = tcfg['targetSchema']
            table_name = cli.get_parsed(ckey='targetTable', cfg=tcfg)
            toDB.desc_table(schema=schema_name, tbl=table_name, col_ord=False)

            toDB.bulk_load_file(trans=to_conn,
                                file_names=data_files,
                                qname='insertStmt',
                                cfg=(dir_scfg, tcfg),
                                out=insert_stats)
            toDB.commit_transaction(trans=to_conn)

        if 0:  # per-source statistics are disabled
            stats['Dir->%s' % (_dbname)] = st = OrderedDict()
            st['source_cnt'] = cli.get_src_row_count(
                DB) if not cli.lame_duck else cli.lame_duck
            st['total_extracted'] = insert_stats.inserted_cnt
            st['total_inserted'] = insert_stats.inserted_cnt

    if 0:  # consistency checks are disabled
        for k, v in stats.items():
            assert v['source_cnt'] == v['total_extracted']
            assert v['source_cnt'] == v['total_inserted']

    if 0:  # e-mail notification is disabled
        email_args.update(dict(cli_stats=None))
        Email.send_email(**email_args)
示例#4
0
文件: db_file.py 项目: pie-crust/etl
def run():
    """Dump every configured source query into the shared ``dump_file``.

    For each entry of ``cli.cfg['source']`` a reader for its ``sourceDb``
    and a ``File`` writer are created.  Rows are fetched in chunks of
    ``file_size_rows`` and appended to the file.  The header is written
    before the first chunk while no rows have been counted yet, and once
    after the loop when the query produced no rows at all.  An e-mail is
    sent when all sources are done.
    """
    for _source, src_cfg in cli.cfg['source'].items():
        _dbname = src_cfg["sourceDb"]
        DB = create_reader(_dbname, app_init=app_init)
        FileWriter = create_writer('File', app_init=app_init)

        def write_header():
            # Header columns come from the reader; format from the dump cfg.
            FileWriter.create_header(file=dump_file,
                                     header=DB.get_header(),
                                     cfg=cli.get_dcfg())

        cli.set_source(_source)
        DB.set_loader(FileWriter)

        rows_written = 0
        FileWriter.open_file(out=dump_file)

        chunks = DB.fetch_many(chunk_size=file_size_rows,
                               source=cli.get_scfg(),
                               qname='sourceStmt',
                               out=InOut(),
                               skip_header=0)
        for chunk in chunks:
            if rows_written == 0:
                write_header()
            FileWriter.append_data(file=dump_file,
                                   data=chunk,
                                   cfg=cli.get_dcfg())
            rows_written += len(chunk.data)

        if rows_written == 0:
            # Nothing was fetched: still emit a header-only file.
            write_header()
        FileWriter.close_file(file=dump_file)

    Email.send_email(**email_args)
示例#5
0
文件: IQ.py 项目: pie-crust/etl
 def fetch_stream(self, chunk_size, source, qname, out, skip_header):
     """Open a stream for ``qname`` on ``source``; always returns ``None``.

     ``chunk_size`` must be truthy.  When ``self.cli.lame_duck`` is set and
     smaller than ``chunk_size`` the chunk size is clamped to it, but the
     clamped value is not used further in this method.  ``out`` and
     ``skip_header`` are accepted but not read here.
     """
     assert chunk_size
     row_cap = self.cli.lame_duck
     if row_cap and chunk_size > row_cap:
         chunk_size = row_cap
     assert chunk_size

     # Timestamp and row id are computed but not used in this method.
     ts_format = "%Y-%m-%d.%H_%M_%S"
     started_at = time.strftime(ts_format)
     row_id = 0

     stream_cursor = InOut()
     self.open_stream(source, qname, out=stream_cursor)
     return None
示例#6
0
def run():
    """Extract every source query into the dump files configured for it.

    For each source a DB reader and a ``Dir`` writer are created.  For each
    dump configuration of that source, rows are fetched in chunks of
    ``file_size_rows`` and appended to the shared ``dump_file``; the header
    is written before the first chunk, or alone when the query yields no
    rows.  The total number of saved rows is logged and an e-mail is sent.
    """
    grand_total = 0
    for _source, src_cfg in cli.cfg['source'].items():
        cli.set_source(_source)
        _src_class = list(src_cfg.keys())[0]
        cli.scfg = cli.get_scfg(_src_class)

        _dbname = cli.scfg["sourceDb"]
        DB = create_reader(_dbname, app_init=app_init)
        FileWriter = create_writer('Dir', app_init=app_init)
        DB.set_loader(FileWriter)

        # Extract to Dir.
        for _dmp_class, _unused in cli.cfg['dump'][_source].items() or []:
            cli.dcfg = cli.get_dcfg(_dmp_class)
            pp(cli.dcfg)

            rows_in_file = 0
            FileWriter.open_file(out=dump_file)

            if 0:  # alternative streaming path, disabled
                IQ.open_stream(dbcfg=cli.scfg,
                               qname='sourceStmt',
                               out=IQ_cursor)
                S3StreamLoader.load_stream(source=IQ_cursor,
                                           skip_header=0,
                                           out=s3_file_names)

            chunks = DB.fetch_many(chunk_size=file_size_rows,
                                   source=cli.scfg,
                                   qname='sourceStmt',
                                   out=InOut(),
                                   skip_header=0)
            for chunk in chunks:
                if rows_in_file == 0:
                    FileWriter.create_header(file=dump_file,
                                             header=DB.get_header(),
                                             cfg=cli.dcfg)
                FileWriter.append_data(file=dump_file,
                                       data=chunk,
                                       cfg=cli.dcfg)
                rows_in_file += len(chunk.data)

            if rows_in_file == 0:
                # No data at all: the file still gets its header.
                FileWriter.create_header(file=dump_file,
                                         header=DB.get_header(),
                                         cfg=cli.dcfg)
            FileWriter.close_file(file=dump_file)
            grand_total += rows_in_file

    log.info('Total records saved: %d' % grand_total)
    Email.send_email(**email_args)
示例#7
0
def run():
    """Feed dump files from ``producer`` into every target and verify counts.

    For each source, every configured target writer receives
    ``insert_files`` with ``(producer, (cli, _source))`` as the file
    producer.  Afterwards the ``extracted_cnt`` values of
    ``file_stats.dump_files`` are summed and a statistics entry keyed
    ``'<sourceDb>-><targetDb>'`` is stored, using the writer and target name
    of the last target processed.  After all sources the extracted and
    inserted counts are asserted equal to the source count, and the
    statistics are e-mailed.
    """
    stats = {}

    for _source, src_cfg in cli.cfg['source'].items():
        _dbname = src_cfg["sourceDb"]
        # Reader is instantiated as in the original flow; not used below.
        DB = create_reader(_dbname, app_init=app_init)

        cli.set_source(_source)
        file_scfg = cli.cfg['dump'][_source]

        to_conn = InOut()
        for _target, tcfg in cli.cfg['target'][_source].items() or []:
            _todbname = tcfg["targetDb"]
            toDB = create_writer(_target, app_init=app_init)
            toDB.insert_files(producer=(producer, (cli, _source)),
                              out=file_stats,
                              skip_header=0,
                              rec_delim='\n',
                              cfg=(file_scfg, tcfg),
                              return_dict=return_dict)

        pp(file_stats.dump_files)
        extracted_cnt = 0
        for dumped in file_stats.dump_files:
            extracted_cnt += dumped.extracted_cnt
            print(toDB.counter.value())
        pp(return_dict.values())

        st = OrderedDict()
        stats['%s->%s' % (_dbname, _todbname)] = st
        # In lame-duck mode the row cap stands in for the source count.
        st['source_cnt'] = cli.lame_duck or insert_stats.source_cnt
        st['total_extracted'] = extracted_cnt
        st['total_inserted'] = toDB.total_ins

    pp(stats)
    for _key, counts in stats.items():
        assert counts['source_cnt'] == counts['total_extracted']
        assert counts['source_cnt'] == counts['total_inserted']

    email_args.update(cli_stats=stats)
    Email.send_email(**email_args)
示例#8
0
def run():
    """Load the ``*.out`` files of every ``*.ok``-marked snapshot directory.

    For each source the ``dumpDir`` is globbed for ``*.ok`` files.  The
    name of each ``.ok`` file without its extension must be an existing
    directory; its ``*.out`` files are loaded into every configured target,
    one transaction per file.  The target table name is the second
    dot-separated component of the file's base name.  Finally an e-mail is
    sent and ``etl.done()`` is called.
    """
    for _source, dump_cfg in cli.cfg['dump'].items():
        cli.set_source(_source)
        _src_class = list(dump_cfg.keys())[0]
        DirReader = create_reader(_src_class, app_init=app_init)

        # Find the ".ok" marker files.
        cli.set_source(_source)
        dir_scfg = cli.get_dcfg(_src_class)
        dump_dir = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg)
        DirReader.glob_dir(path=dump_dir, out=ok_files, ext='*.ok')

        for okfn in ok_files.file_names:
            okdir = os.path.splitext(okfn)[0]
            assert os.path.isdir(okdir)
            # Instantiated as in the original flow; not used afterwards.
            OkReader = create_reader("Dir", app_init=app_init)
            DirReader.glob_dir(path=okdir, out=out_files, ext='*.out')
            pp(out_files.file_names)

            for _trg_class, _unused in cli.cfg['target'][_source].items():
                tcfg = cli.get_tcfg(_trg_class)
                cli.tcfg = tcfg
                target_db = tcfg["targetDb"]
                toDB = create_writer(_trg_class, app_init=app_init)
                to_conn = InOut()

                for out_fn in out_files.file_names:
                    table_name = os.path.basename(out_fn).split('.')[1]
                    print(table_name)
                    toDB.begin_transaction(env=target_db, out=to_conn)
                    toDB.load_file(trans=to_conn,
                                   file_name=out_fn,
                                   table_name=table_name,
                                   qname='insertStmt',
                                   cfg=(dir_scfg, tcfg),
                                   create_table=True)
                    toDB.commit_transaction(trans=to_conn)

    email_args.update(cli_stats=None)
    Email.send_email(**email_args)
    etl.done()
示例#9
0
def run():
    """Bulk-load the dump files of every source into its targets.

    For each entry of ``cli.cfg['dump']`` the files under ``dumpDir`` are
    listed into the module-level ``data_files`` and loaded with
    ``bulk_load`` into every configured target, one transaction per target.
    Statistics collection and checks are switched off (``if 0``); an e-mail
    is sent at the end.
    """
    stats = {}
    for _source, dump_cfg in cli.cfg['dump'].items():
        cli.set_source(_source)
        _src_class = list(dump_cfg.keys())[0]
        DirReader = create_reader(_src_class, app_init=app_init)

        # Step 1: collect the file names.
        cli.set_source(_source)
        dir_scfg = cli.get_dcfg(_src_class)
        dump_dir = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg)
        DirReader.get_files(path=dump_dir, out=data_files)

        # Step 2: load into each target database.
        to_conn = InOut()
        for _trg_class, _unused in cli.cfg['target'][_source].items() or []:
            tcfg = cli.get_tcfg(_trg_class)
            cli.tcfg = tcfg
            _dbname = tcfg["targetDb"]

            toDB = create_writer(_trg_class, app_init=app_init)
            toDB.begin_transaction(env=tcfg['targetDb'], out=to_conn)
            toDB.bulk_load(trans=to_conn,
                           file_names=data_files,
                           qname='insertStmt',
                           cfg=(dir_scfg, tcfg),
                           out=insert_stats)
            toDB.commit_transaction(trans=to_conn)

        if 0:  # per-source statistics are disabled
            stats['Dir->%s' % (_dbname)] = st = OrderedDict()
            st['source_cnt'] = cli.get_src_row_count(
                DB) if not cli.lame_duck else cli.lame_duck
            st['total_extracted'] = insert_stats.inserted_cnt
            st['total_inserted'] = insert_stats.inserted_cnt

    if 0:  # consistency checks are disabled
        for k, v in stats.items():
            assert v['source_cnt'] == v['total_extracted']
            assert v['source_cnt'] == v['total_inserted']

    email_args.update(cli_stats=None)
    Email.send_email(**email_args)
示例#10
0
def run():
    """Extract every source query to its dump files inside one transaction.

    For each source a transaction is opened on the module-level
    ``from_conn``.  For each dump configuration rows are fetched in chunks
    of ``file_size_rows`` and appended to the shared ``dump_file``; the
    header is written before the first chunk, or alone when the query
    yields no rows.  The transaction is committed per source and the total
    number of saved rows is logged.  The e-mail notification is disabled.
    """
    grand_total = 0
    for _source, src_cfg in cli.cfg['source'].items():
        cli.set_source(_source)
        _src_class = list(src_cfg.keys())[0]
        cli.scfg = cli.get_scfg(_src_class)

        _dbname = cli.scfg["sourceDb"]
        fromDB = create_reader(_dbname, app_init=app_init)
        FileWriter = create_writer('Dir', app_init=app_init)
        fromDB.set_loader(FileWriter)
        fromDB.begin_transaction(env=cli.scfg['sourceDb'], out=from_conn)

        # Extract to Dir.
        for _dmp_class, _unused in cli.cfg['dump'][_source].items() or []:
            cli.dcfg = cli.get_dcfg(_dmp_class)

            rows_in_file = 0
            FileWriter.open_file(out=dump_file)

            chunks = fromDB.fetch_many(chunk_size=file_size_rows,
                                       source=cli.scfg,
                                       qname='sourceStmt',
                                       out=InOut(),
                                       skip_header=0)
            for chunk in chunks:
                if rows_in_file == 0:
                    FileWriter.create_header(file=dump_file,
                                             header=fromDB.get_header(),
                                             cfg=cli.dcfg)
                FileWriter.append_data(file=dump_file,
                                       data=chunk,
                                       cfg=cli.dcfg)
                rows_in_file += len(chunk.data)

            if rows_in_file == 0:
                # No data at all: the file still gets its header.
                FileWriter.create_header(file=dump_file,
                                         header=fromDB.get_header(),
                                         cfg=cli.dcfg)
            FileWriter.close_file(file=dump_file)
            grand_total += rows_in_file

        fromDB.commit_transaction(trans=from_conn)

    log.info('Total records saved: %d' % grand_total)
    if 0:  # e-mail notification is disabled
        Email.send_email(**email_args)
示例#11
0
def run():
    """Dump the ``sourceStmt`` query of ``SQL`` into the shared ``dump_file``.

    Rows are fetched in chunks of ``file_size_rows`` and appended to the
    file; no header is written.  The row count is accumulated but not
    reported.  The e-mail notification is disabled.
    """
    SQL.set_loader(FileWriter)

    rows_written = 0
    FileWriter.open_file(out=dump_file)

    chunks = SQL.fetch_many(chunk_size=file_size_rows,
                            source=cli.scfg,
                            qname='sourceStmt',
                            out=InOut(),
                            skip_header=0)
    for chunk in chunks:
        FileWriter.append_data(file=dump_file, data=chunk)
        rows_written += len(chunk.data)

    FileWriter.close_file(file=dump_file)

    if 0:  # e-mail notification is disabled
        Email.send_email(**email_args)
示例#12
0
def run():
    """Copy the ``sourceStmt`` result from ``IQ`` into ``SQLServer``.

    Rows are fetched in chunks of ``file_size_rows``; each chunk is
    inserted with the ``insertStmt`` statement and committed on
    ``sql_conn`` right away.  After the last chunk the total is logged, a
    final commit is issued with ``close_conn=True`` and an e-mail is sent.
    """
    IQ.set_loader(SQLServer)

    inserted = 0
    chunks = IQ.fetch_many(chunk_size=file_size_rows,
                           source=cli.scfg,
                           qname='sourceStmt',
                           out=InOut(),
                           skip_header=0)
    for chunk in chunks:
        SQLServer.insert_array(trans=sql_conn,
                               target=cli.tcfg,
                               data=chunk,
                               stmt='insertStmt')
        # Commit per chunk.
        SQLServer.commit_transaction(trans=sql_conn)
        inserted += len(chunk.data)

    log.info('SQLServer: Inserted:%d' % inserted)
    SQLServer.commit_transaction(trans=sql_conn, close_conn=True)
    Email.send_email(**email_args)
示例#13
0
def run():
    """Extract every source query of ``IQ`` into its configured dump files.

    For each source and each dump configuration of that source, the dump
    configuration is applied (``cli.exec_config()``), the shared
    ``dump_file`` is opened, and rows are fetched in chunks of
    ``file_size_rows`` and appended to it.  The header is written before
    the first chunk of *each* file, or alone when the query yields no rows.

    Fix: the "first chunk" test used the running ``total_ins`` across all
    files, so every file after the first one that contained rows was
    written without a header.  A per-file counter is used instead, matching
    the sibling extract scripts; ``total_ins`` keeps the overall count.
    """
    IQ.set_loader(FileWriter)

    total_ins = 0
    for _source, val in cli.cfg['source'].items():
        cli.set_source(_source)
        # list() keeps this working when keys() returns a view (Python 3).
        _src_class = list(val.keys())[0]
        cli.scfg = cli.get_scfg(_src_class)

        for _dmp_class, _unused in cli.cfg['dump'][_source].items() or []:

            cli.dcfg = cli.get_dcfg(_dmp_class)
            pp(cli.dcfg)
            cli.exec_config()
            FileWriter.open_file(out=dump_file)

            # Rows written to the current file only; drives header creation.
            file_ins_cnt = 0
            for iq_data in IQ.fetch_many(chunk_size=file_size_rows,
                                         source=cli.scfg,
                                         qname='sourceStmt',
                                         out=InOut(),
                                         skip_header=0):
                if not file_ins_cnt:
                    FileWriter.create_header(file=dump_file,
                                             header=IQ.get_header(),
                                             cfg=cli.dcfg)
                FileWriter.append_data(file=dump_file,
                                       data=iq_data,
                                       cfg=cli.dcfg)
                file_ins_cnt += len(iq_data.data)
            if not file_ins_cnt:  # in case there's no data
                FileWriter.create_header(file=dump_file,
                                         header=IQ.get_header(),
                                         cfg=cli.dcfg)
            FileWriter.close_file(file=dump_file)
            total_ins += file_ins_cnt

    if 0:  # e-mail notification is disabled
        Email.send_email(**email_args)
示例#14
0
"""

"""
import sys
import threading
import subprocess
from collections import OrderedDict

from pprint import pprint as pp
from include.utils import create_reader, create_writer, create_actor, InOut
# Short alias for sys.exit (used as a debugging stop: ``e()``).
e = sys.exit
# NOTE(review): ``app_init`` is not defined in the visible part of this
# module -- presumably injected by the script loader; confirm.
cli, conn_pool = app_init

# E-mail actor used by run() for notifications.
Email = create_actor('Email', app_init=app_init)

# Shared in/out containers passed to readers and writers.
data_files = InOut()
lite_conn = InOut()

# List that readers fill with the discovered file names.
data_files.file_names = []

dump_file = InOut()

# Chunk size (rows) for fetch_many-style extraction.
file_size_rows = 250000
# Keyword arguments for Email.send_email().
email_args = {'email_subject': 'Dir->IQ'}
# Load statistics container; -1 is the initial value of both counters.
insert_stats = InOut(source_cnt=-1, inserted_cnt=-1)


def run():
    lite_tbl = {}
    stats = {}
    for _source, val in cli.cfg['dump'].items():
示例#15
0
def producer(cli, _source):
    """Generate dump files for one source, yielding a descriptor per file.

    The source row count is read with ``cli.get_src_row_count`` and the
    query ``sourceStmt`` is fetched chunk by chunk.  Every chunk is written
    to its own file (open, header, data, close) and the ``InOut``
    descriptor of that file is yielded.

    Two fetch modes exist:
      * ``fileSkewPct`` is non-zero and ``cli.dop >= 2``: a chunk map with
        one entry per file is built, with sizes decreasing from chunk 0 to
        the last chunk, and rows are fetched with ``DB.fetch_many_async``.
      * otherwise: rows are fetched with ``DB.fetch_many`` in chunks of the
        configured ``sourceChunkSize``.

    Args:
        cli: application/config object (provides cfg, dop, lame_duck, ...).
        _source: key of the source in ``cli.cfg['source']``.

    Yields:
        ``InOut`` objects created with ``source_cnt=<row count>``; their
        ``extracted_cnt`` attribute holds the running total of rows read so
        far (cumulative, not the row count of that single file).

    Side effects: sets ``cli.src_chunk_size`` and resets the module-level
    ``data_files.file_names`` to an empty list.
    """
    val = cli.cfg['source'][_source]
    _dbname = val["sourceDb"]
    DB = create_reader(_dbname, app_init=app_init)
    cnt = cli.get_src_row_count(DB)

    # In lame-duck mode the lame_duck value itself is used as chunk size.
    if not cli.lame_duck:
        assert cli.dop > 0
        cli.src_chunk_size = round(cnt / cli.dop) + 1
    else:
        cli.src_chunk_size = cli.lame_duck

    FileWriter = create_writer('File', app_init=app_init)
    data_files.file_names = []
    #uploaded_files.file_names=[]
    #ext_files=[]
    if 1:
        cli.set_source(_source)
        DB.set_loader(FileWriter)

        # Running total of rows read across all chunks.
        total_read = 0

        scfg = cli.get_scfg()
        # Config values are parsed via float() first, so "1e5"-style or
        # "100.0" strings are accepted before truncating to int.
        source_chunk_size = int(
            float(cli.get_parsed(ckey='sourceChunkSize', cfg=scfg)))

        # Sequential id of the dump file being written.
        cid = 0

        skew_pct = int(float(cli.get_parsed(ckey='fileSkewPct', cfg=scfg)))
        log.debug('Skew percentile = %s' % skew_pct)
        if skew_pct and cli.dop >= 2:
            # Skewed mode: chunk i gets `increment * (num_of_files - i)`
            # extra rows on top of an equal share, so earlier chunks are
            # larger than later ones.
            delta = source_chunk_size * (skew_pct / 100.0)
            num_of_files = cli.dop
            increment = int(delta / num_of_files)
            chunk_map = {}
            accum_skew = sum(
                [increment * (num_of_files - i) for i in range(num_of_files)])
            for i in range(num_of_files):

                skew = ((cnt - accum_skew) /
                        num_of_files) + increment * (num_of_files - i)
                # "+ 1" pads each chunk; the assert below checks coverage.
                chunk_map[i] = skew + 1 if not cli.lame_duck else cli.lame_duck
            pp(chunk_map)
            #e()
            if not cli.lame_duck:
                assert sum(
                    chunk_map.values()
                ) >= cnt, 'Chunk map has to cover all source records [%s <> %s]' % (
                    sum(chunk_map.values()), cnt)
            #dfiles=[]
            # NOTE(review): ``counter`` is not defined in this block --
            # presumably a module-level shared counter; confirm.
            for iq_data in DB.fetch_many_async(chunk_map=chunk_map,
                                               counter=counter,
                                               source=scfg,
                                               qname='sourceStmt',
                                               out=InOut(),
                                               skip_header=0):
                dump_file = InOut(source_cnt=cnt)
                FileWriter.open_file(id=cid, out=dump_file)
                if 1:  #not total_ins:
                    # Every file gets its own header.
                    dump_cfg = cli.get_dcfg()
                    FileWriter.create_header(file=dump_file,
                                             header=DB.get_header(),
                                             cfg=dump_cfg)
                FileWriter.append_data(file=dump_file,
                                       data=iq_data,
                                       cfg=dump_cfg)
                total_read += len(iq_data.data)
                FileWriter.close_file(file=dump_file)
                #ext_files.append(dump_file.fpath)

                #dfiles.append(dump_file)
                # Cumulative number of rows read so far.
                dump_file.extracted_cnt = total_read

                yield dump_file
                cid += 1

        else:  # no skew configured or dop < 2: fixed-size chunks
            print source_chunk_size
            #e()
            assert source_chunk_size
            for iq_data in DB.fetch_many(chunk_size=source_chunk_size,
                                         source=scfg,
                                         qname='sourceStmt',
                                         out=InOut(),
                                         skip_header=0):
                dump_file = InOut(source_cnt=cnt)
                FileWriter.open_file(id=cid, out=dump_file)
                if 1:  #not total_ins:
                    # Every file gets its own header.
                    dump_cfg = cli.get_dcfg()
                    FileWriter.create_header(file=dump_file,
                                             header=DB.get_header(),
                                             cfg=dump_cfg)
                FileWriter.append_data(file=dump_file,
                                       data=iq_data,
                                       cfg=dump_cfg)
                total_read += len(iq_data.data)
                FileWriter.close_file(file=dump_file)
                #ext_files.append(dump_file.fpath)
                log.debug('File %d created:file: %d,  %d records' %
                          (cid, len(iq_data.data), source_chunk_size))
                cid += 1
                # Cumulative number of rows read so far.
                dump_file.extracted_cnt = total_read
                yield dump_file

        log.debug('Done extracting.....')
示例#16
0
"""

import sys
from collections import OrderedDict
from multiprocessing import Process, Value, Lock
import multiprocessing
from pprint import pprint as pp
from include.utils import create_reader, create_writer, create_actor, InOut
# Short alias for sys.exit (used as a debugging stop: ``e()``).
e = sys.exit
# NOTE(review): ``app_init`` is not defined in the visible part of this
# module -- presumably injected by the script loader; confirm.
cli, conn_pool = app_init

# E-mail actor used for notifications.
Email = create_actor('Email', app_init=app_init)

# Directory reader instance.
Dir = create_reader('Dir', app_init=app_init)

# Shared statistics container: produced dump files and insert statistics.
file_stats = InOut(dump_files=[], ins_stats={})

# Shared container for file names.
data_files = InOut()
data_files.file_names = []
#uploaded_files.file_names=[]

# Keyword arguments for Email.send_email().
email_args = {'email_subject': 'DB->file'}


class Counter(object):
    def __init__(self, initval=0):
        self.val = Value('i', initval)
        self.cnt = Value('i', initval)
        self.lock = Lock()

    def increment(self):
示例#17
0
def run():
    skip = 2
    serviceName = 'gfin'
    #deleted = {}

    #loaded  = {}
    #not_loaded  = {}
    #masterTbl = 'gtxMasterPKData'
    #do_not_delete = ['TxFinancingRateHist', masterTbl]
    do_not_load = ['TxFinancingRate',
                   'TxFinancingRateHist']  #'TxFinancingRate',
    for _source, val in cli.cfg['dump'].items():
        cli.set_source(_source)
        _src_class = list(val.keys())[0]

        DirReader = create_reader(aname=_src_class, app_init=app_init)

        if 1:
            cli.set_source(_source)
            dir_scfg = cli.get_dcfg(_src_class)
            path = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg)
            ok_files = InOut(file_names=[])
            DirReader.glob_dir(path=path, out=ok_files, ext='*.ok')

            if 1:
                for _trg_class, val in cli.cfg['target'][_source].items():

                    cli.tcfg = tcfg = cli.get_tcfg(_trg_class)

                    _dbname = tcfg["targetDb"]
                    toDB = create_writer(aname=_trg_class, app_init=app_init)
                    masterTabTag = tcfg['masterTableTag']
                    masterTbl = tcfg['targetTables'][masterTabTag][
                        'table_name']
                    masterTblCol = tcfg['targetTables'][masterTabTag][
                        'column_name']
                    do_not_delete = tcfg['doNotDeleteTables'] + [masterTbl]
                    do_not_load = tcfg['doNotLoadTables']
                    to_conn = InOut()
                    toDB.begin_transaction(env=tcfg['targetDb'], out=to_conn)
                    to_conn.cur.execute('set search_path to CIGRpt')
                    if ok_files.file_names:  # Master first
                        try:

                            stmt = 'drop table %s' % masterTbl
                            to_conn.cur.execute(stmt)
                        except Exception as ex:
                            #raise
                            if not 'Table "%s" does not exist' % masterTbl in str(
                                    ex):
                                raise

                        stmt = 'create local temporary table %s ( %s bigint not null, MartModifiedDate timestamp)\n ON COMMIT PRESERVE ROWS' % (
                            masterTbl, masterTblCol)
                        pfmt([[stmt]], ['Create master temp PK'])
                        to_conn.cur.execute(stmt)
                        #e()
                    stats = {}
                    deleted = {}
                    processed = []
                    not_processed = []
                    for okfn in ok_files.file_names:
                        okFile = create_reader(aname='File',
                                               app_init=app_init,
                                               file_name=okfn,
                                               scfg=dir_scfg)
                        okdir, _ = os.path.splitext(okfn)
                        okbn = os.path.basename(okdir)
                        #e()
                        assert os.path.isdir(okdir)
                        snap_df = cli.get_dest_folder(okdir)
                        if os.path.isdir(snap_df):
                            log.warning('[%s]Destination folder exists: [%s]' %
                                        (okdir, snap_df))
                            not_processed.append(okfn)
                            continue
                        OkReader = create_reader(aname="Dir",
                                                 app_init=app_init)
                        out_files = InOut(file_names=[])

                        DirReader.glob_dir(path=okdir,
                                           out=out_files,
                                           ext='*.out')

                        apx = dict(MartModifiedDate=okFile.get_value(
                            coords=(0, 0), skip=skip))
                        ftlist = []

                        for out_fn in out_files.file_names:
                            print(out_fn)
                            ftlist.append(
                                os.path.basename(out_fn).split('.')[1])

                        pfmt([[x] for x in ftlist], ['Files->Tables'])
                        #e()
                        if 1:
                            ctables = cli.tcfg['targetTables'].keys()

                            extra_file_tables = list(
                                set(ftlist) - set(ctables))

                            pfmt([[x] for x in extra_file_tables],
                                 ['Tables not in config.'])

                            extra_config_tables = list(
                                set(ctables) - set(ftlist))

                            pfmt([[x] for x in extra_config_tables],
                                 ['Tables in config but not in file names.'])

                            assert not extra_file_tables, 'Tables %s are not listed in config["targetTables"].' % extra_file_tables

                        if 0:
                            g = raw_input("Continue?")
                        if 1:  #//create PK file

                            fromFile = create_reader(
                                aname='File',
                                app_init=app_init,
                                file_name=os.path.join(okdir,
                                                       'gfin.Instrument.out'),
                                scfg=dir_scfg)
                            toFile = create_reader(aname='File',
                                                   app_init=app_init,
                                                   file_name=os.path.join(
                                                       okdir, '%s.PK.out' %
                                                       serviceName),
                                                   scfg=dir_scfg,
                                                   parse=False)
                            rowcnt = cli.createPrimaryKeyFile(
                                ffObj=fromFile,
                                pkfn=os.path.join(okdir,
                                                  '%s.PK.out' % serviceName))

                        assert masterTabTag in ftlist, '"%s" file is missing' % masterTabTag

                        if 1:
                            stmt = 'TRUNCATE TABLE %s' % (masterTbl)
                            toDB.exec_dml(stmt, trans=to_conn, commit=False)
                            deleted[masterTbl] = -1
                            #e()
                        #e()
                        loaded = {}
                        not_loaded = {}
                        if 1:
                            pkfn = [
                                x for x in out_files.file_names
                                if os.path.basename(x).split('.')[1] in
                                [masterTabTag]
                            ][0]

                            schema = tcfg['targetSchema']
                            outFile = create_reader(aname="File",
                                                    app_init=app_init,
                                                    file_name=pkfn,
                                                    scfg=dir_scfg)
                            fmt_cols = tcfg['targetTables'][masterTabTag].get(
                                'formatColumns', [])
                            outFile.set_alt_cols()
                            toDB.load_gfin_file(trans=to_conn,
                                                file_obj=outFile,
                                                schema=schema,
                                                table_name=masterTbl,
                                                qname='insertStmt',
                                                fmt_cols=fmt_cols,
                                                cfg=(dir_scfg, tcfg),
                                                skip=skip,
                                                apx=apx,
                                                stats=stats)
                            loaded[out_fn] = masterTbl
                            #e()
                        if 1:
                            stmt = 'SELECT count(*) FROM %s t' % masterTbl
                            pkcnt = toDB.exec_query(stmt).fetchall()[0][0]

                            assert pkcnt == (rowcnt - skip)

                        for out_fn in [
                                x for x in out_files.file_names
                                if not os.path.basename(x).split('.')[1] in
                            [masterTabTag]
                        ]:
                            outFile = create_reader(aname="File",
                                                    app_init=app_init,
                                                    file_name=out_fn,
                                                    scfg=dir_scfg)
                            outCols = [
                                col[0] for col in outFile.get_header_cols()
                            ]
                            tbl = os.path.basename(out_fn).split('.')[1]
                            assert tbl

                            if tbl not in [masterTabTag] + do_not_load:

                                if tbl not in do_not_delete:
                                    stmt = 'DELETE FROM %s WHERE %s in (SELECT t.%s FROM %s t)' % (
                                        tbl, masterTblCol, masterTblCol,
                                        masterTbl)
                                    deleted[tbl] = toDB.exec_dml(stmt,
                                                                 trans=to_conn,
                                                                 commit=False)
                                    pfmt([[deleted[tbl]]],
                                         ['Deleted from %s' % tbl])
                                else:
                                    deleted[tbl] = -1

                                tblCols = toDB.get_columns(tbl).values()
                                pfmt([[x] for x in list(
                                    set(tblCols) - set(outCols) -
                                    set(['MartModifiedDate']))],
                                     ['Columns in Source, but not Target'])

                                missing_cols = list(
                                    set(outCols) - set(tblCols))
                                pfmt([(tbl, x) for x in missing_cols],
                                     ['Table', 'Missing columns'])

                                if missing_cols:
                                    to_conn.conn.rollback()

                                    schema = tcfg["targetSchema"]
                                    toDB.desc_table(schema, tbl)

                                    raise Exception(
                                        'File column %s missing in table "%s".'
                                        % (missing_cols, tbl))

                                if 1:
                                    schema = tcfg['targetSchema']
                                    fmt_cols = tcfg['targetTables'][tbl].get(
                                        'formatColumns', [])
                                    outFile.set_alt_cols()
                                    toDB.load_gfin_file(trans=to_conn,
                                                        file_obj=outFile,
                                                        schema=schema,
                                                        table_name=tbl,
                                                        qname='insertStmt',
                                                        fmt_cols=fmt_cols,
                                                        cfg=(dir_scfg, tcfg),
                                                        skip=skip,
                                                        apx=apx,
                                                        stats=stats)
                                    loaded[out_fn] = tbl
                            else:
                                not_loaded[out_fn] = tbl

                        else:
                            toDB.commit_transaction(trans=to_conn)
                            #pfmt([[k]+[deleted [k]]+list(v)[1:]  for k,v in stats.items() if deleted [k]>=0], ['Table','Deleted', 'Accepted', 'Rejected','Line count','Skip', 'Diff'],'Load completed (deleted)'.upper())
                            #pfmt([(k,v) for k, v in loaded.items()], ['Loaded Files','Loaded Tables'])
                            #pfmt([(k,v) for k, v in not_loaded.items()], ['Not loaded Files','Not loaded Tables'])
                            pfmt(
                                [[k] + [deleted[k]] + list(v.values())[1:]
                                 for k, v in stats.items() if deleted[k] >= 0],
                                [
                                    'Table', 'Deleted', 'Accepted', 'Rejected',
                                    'Line count', 'Skip', 'Diff'
                                ], 'Load completed/deleted'.upper())
                            pfmt([(k, v) for k, v in loaded.items()],
                                 ['Loaded Files', 'Loaded Tables'])
                            pfmt([(k, v) for k, v in not_loaded.items()],
                                 ['Not loaded Files', 'Not loaded Tables'])

                            assert os.path.isdir(okdir)
                            if 0:
                                cli.MoveSnapFolder(okdir)
                            processed.append(okfn)
                        #break;

                if not ok_files.file_names:
                    counter = itertools.count(1)
                    pfmt([['No OK files at working dir: [ %s ]' % cli.pa[0]]],
                         ['No files'])
                if processed:

                    counter = itertools.count(1)
                    pfmt([[next(counter), x] for x in processed],
                         ['##', 'Processed'])
                if not_processed:

                    counter = itertools.count(1)
                    pfmt([[next(counter), x] for x in not_processed],
                         ['##', 'Not processed (backup exists)'])

    if 0:
        email_args.update(dict(cli_stats=None))
        Email.send_email(**email_args)
        cli.done()
示例#18
0
import threading
import subprocess
from collections import OrderedDict

from pprint import pprint as pp
from include.utils import create_reader, create_writer, create_actor, InOut
from include.fmt import pfmt

# Short alias for sys.exit.
e = sys.exit
# app_init is not defined in this module's visible code -- presumably injected
# by whatever executes this script (TODO confirm). It unpacks into the CLI
# object and a connection pool.
cli, conn_pool = app_init

# Email actor created once at import time.
Email = create_actor(aname='Email', app_init=app_init)

#ok_files	= InOut()

# Module-level InOut holder; presumably a connection handle (TODO confirm).
lite_conn = InOut()

#ok_files.file_names=[]

# Module-level InOut holder; presumably the current dump file (TODO confirm).
dump_file = InOut()

# presumably rows per extract chunk -- not referenced in the visible code.
file_size_rows = 250000
# Keyword arguments for the e-mail actor; only the subject is preset here.
email_args = {'email_subject': 'GTX->Vertica'}
# Counters start at -1.
insert_stats = InOut(source_cnt=-1, inserted_cnt=-1)


def run():
    """Entry point of the GTX->Vertica job.

    NOTE(review): only the initial local setup is visible in this excerpt.
    """
    # presumably the number of lines to skip per input file -- TODO confirm.
    skip = 2
    serviceName = 'gfin'
    #deleted = {}
示例#19
0
from include.utils import create_reader, create_writer, create_actor, InOut

# app_init is not defined in the visible code -- presumably injected by the
# script runner (TODO confirm). It unpacks into the CLI object and a
# connection pool.
cli, conn_pool = app_init

#IQ 				= create_reader('IQ',  	app_init=app_init )
#FileWriter 	= create_writer('File',	app_init=app_init )
# Email actor created once at import time.
Email = create_actor('Email', app_init=app_init)

# Module-level InOut holders shared across the job.
IQ_cursor = InOut()
s3_file_names = InOut()

dump_file = InOut()

# presumably rows per extract chunk -- not referenced in the visible code.
file_size_rows = 250000

# Keyword arguments for the e-mail actor; only the subject is preset here.
email_args = {'email_subject': 'IQ.procedure->IQ'}

from_conn = InOut()


def run():
    """Set up a reader/writer pair for every configured source.

    For each entry in ``cli.cfg['source']`` the first key of the entry is
    taken as the source class, its config is stored on ``cli.scfg``, a reader
    is created for the configured ``sourceDb`` and a 'Dir' writer is attached
    to it as loader.

    NOTE(review): only the setup part of the loop is visible in this excerpt.
    """
    total_ins = 0
    for _source, val in cli.cfg['source'].items():
        cli.set_source(_source)
        # list(...) keeps this working on Python 3, where dict.keys() returns
        # a non-subscriptable view; on Python 2 it is a harmless copy.
        _src_class = list(val.keys())[0]
        cli.scfg = cli.get_scfg(_src_class)

        _dbname = cli.scfg["sourceDb"]
        fromDB = create_reader(_dbname, app_init=app_init)
        FileWriter = create_writer('Dir', app_init=app_init)
        fromDB.set_loader(FileWriter)
示例#20
0
def run():
    """Dump every configured source to chunk files, then load them.

    Per source: each chunk yielded by ``fetch_many`` is written to its own
    file (header + data) and the file path is collected; afterwards the dump
    directory is listed into ``data_files`` and every configured target
    writer is asked to ``insert_files``.
    """
    ext_files = []

    for _source, src_cfg in cli.cfg['source'].items():
        reader = create_reader(src_cfg["sourceDb"], app_init=app_init)
        writer = create_writer('File', app_init=app_init)
        data_files.file_names = []
        uploaded_files.file_names = []

        # --- extract: one dump file per fetched chunk ---
        cli.set_source(_source)
        reader.set_loader(writer)

        rows_written = 0
        scfg = cli.get_scfg()
        chunks = reader.fetch_many(chunk_size=scfg['sourceChunkSize'],
                                   source=scfg,
                                   qname='sourceStmt',
                                   out=InOut(),
                                   skip_header=0)
        for chunk_no, chunk in enumerate(chunks):
            part = InOut()
            writer.open_file(id=chunk_no, out=part)
            # Every chunk file gets its own header.
            dump_cfg = cli.get_dcfg()
            writer.create_header(file=part,
                                 header=reader.get_header(),
                                 cfg=dump_cfg)
            writer.append_data(file=part, data=chunk, cfg=dump_cfg)
            rows_written += len(chunk.data)
            writer.close_file(file=part)
            ext_files.append(part.fpath)

        pp(ext_files)

        # --- load: hand the dumped files to each target writer ---
        cli.set_source(_source)
        file_scfg = cli.cfg['dump'][_source]
        dump_dir = cli.get_parsed(ckey='dumpDir', cfg=file_scfg)

        Dir.get_files(path=dump_dir, out=data_files)
        pp(data_files.file_names)

        # Not used below; construction retained as in the original flow.
        to_conn = InOut()
        for _target, trg_cfg in cli.cfg['target'][_source].items() or []:
            tcfg = cli.cfg['target'][_source][_target]
            # Value is unused, but a missing 'targetDb' key must still raise.
            _todbname = trg_cfg["targetDb"]

            target_writer = create_writer(_target, app_init=app_init)
            target_writer.insert_files(file_names=data_files,
                                       out=uploaded_files,
                                       skip_header=0,
                                       rec_delim='\n',
                                       cfg=(file_scfg, tcfg))

    if 0:
        Email.send_email(**email_args)
示例#21
0
# app_init is not defined in the visible code -- presumably injected by the
# script runner (TODO confirm). It unpacks into the CLI object and a
# connection pool.
cli, conn_pool=app_init
import sys
from collections import OrderedDict
from pprint import pprint as pp
from include.utils import create_reader, create_writer, create_actor, InOut
from include.fmt import ppe
# Short alias for sys.exit.
e=sys.exit


	


# Email actor created once at import time.
Email 		= create_actor (aname ='Email',app_init=app_init )


# Counters start at -1.
insert_stats= InOut(inserted_cnt=-1)
read_stats= InOut(total_read=-1)

# Keyword arguments for the e-mail actor; only the subject is preset here.
email_args={'email_subject':'IQ->REST->IQ'}




# Module-level InOut holders shared across the job.
from_conn  = InOut()
trans_ids = InOut()
rest_pipe = InOut()


to_conn	= InOut()

示例#22
0
import sys
from collections import OrderedDict
from pprint import pprint as pp
from include.utils import create_reader, create_writer, create_actor, InOut
# Short alias for sys.exit.
e = sys.exit
# app_init is not defined in the visible code -- presumably injected by the
# script runner (TODO confirm). It unpacks into the CLI object and a
# connection pool.
cli, conn_pool = app_init

# Actors/readers created once at import time.
Email = create_actor(aname='Email', app_init=app_init)

Dir = create_reader(aname='Dir', app_init=app_init)

# Module-level InOut holders shared across the job.
dump_file = InOut()

data_files = InOut()
data_files.file_names = []

# Counter starts at -1.
insert_stats = InOut(inserted_cnt=-1)

# presumably rows per extract chunk -- not referenced in the visible code.
file_size_rows = 250000
# Keyword arguments for the e-mail actor; only the subject is preset here.
email_args = {'email_subject': 'IQ->file->SQL'}
from_conn = InOut()
# presumably forwarded as terminate_line to reader/writer calls -- TODO confirm.
term_line = False


def run():
    """Entry point of the IQ->file->SQL job.

    Iterates ``cli.cfg['source']``, selects each source on ``cli`` and takes
    the first key of the entry as the source class.

    NOTE(review): only the start of the loop is visible in this excerpt.
    """
    lite_tbl = {}
    stats = {}

    for _source, val in cli.cfg['source'].items():
        cli.set_source(_source)
        # First key of the source entry names the source class.
        _src_class = list(val.keys())[0]
示例#23
0
文件: IQ.py 项目: pie-crust/etl
    def open_stream(self, dbcfg, qname, out):
        """Open a sliced row stream for query ``qname`` and attach it to ``out``.

        Unless the loader is a 'Dir' loader, the configured column mappings
        are first checked against the loader's columns (count, names and
        positions). The stream is then built either from the mock file
        ``cli.mf`` (when set) or from a cursor that executes the statement
        returned by ``self.get_query(dbcfg, qname)``.

        Args:
            dbcfg: source config handed to ``self.get_query``.
            qname: name of the query to look up in ``dbcfg``.
            out: object that receives ``pipe``, ``actor`` and ``col_map``.

        Returns:
            The ``out`` object that was passed in, populated.

        Raises:
            AssertionError: if no loader was set, the column validation
                fails, there is no connection, the statement is empty or the
                mock file does not exist.
        """
        cli = self.cli
        alt_cols = {}
        from_cols = {}
        for idx, col in enumerate(cli.scfg["columnMappings"]):
            from_cols[idx] = col['columnName'].upper().encode()
            if col.get('altColName'):
                # NOTE(review): this stores columnName, not altColName --
                # confirm that is intended.
                alt_cols[idx] = col['columnName'].upper().encode()

        assert hasattr(self, 'loader'), 'You must call "set_loader" first'

        if self.loader.cln not in ['Dir']:
            # Validate config columns against the loader's target columns.
            to_cols = self.loader.get_columns()
            assert to_cols
            assert len(from_cols) == len(
                to_cols
            ), 'Config vs Target column count mismatch (%d != %d)' % (
                len(from_cols), len(to_cols))
            miss = 0
            for idx, col in from_cols.items():
                assert col in to_cols, 'Config column "%s" does not exists in Target table "%s"' % (
                    col, cli.tcfg['targetTable'])
                if not int(idx) == int(to_cols[col]):
                    log.error(
                        'Config column "%s" order is wrong (Config# %d != Target# %d)'
                        % (col, idx, to_cols[col]))
                    miss += 1
            assert miss == 0
        else:
            to_cols = {}

        col_map = None
        apx = self.apx
        mock_file = cli.mf
        if not self.conn:
            self.begin_transaction(env=cli.scfg['sourceDb'], out=InOut())
        assert self.conn
        stmt = self.get_query(dbcfg, qname)

        assert stmt
        from collections import OrderedDict
        # From here on from_cols describes the actual stream columns.
        from_cols = OrderedDict()
        if mock_file:
            log.info('%s: Using mock file: %s' % (self.cln, mock_file))
            assert os.path.isfile(mock_file)
            import codecs
            mfh = codecs.open(mock_file, encoding='latin-1')

            header = mfh.readline().strip().split(
                str(self.cli.csep.decode()))

            for idx, column in enumerate(header):
                cname = column.encode().upper()
                from_cols[idx] = cname
                # Was `to_cols[to_cols] = id`: a dict used as its own key,
                # which always raises TypeError (unhashable type).
                to_cols[cname] = idx
            col_map = self.get_col_map(from_cols, to_cols)

            pipe = FileStreamer(self.cli, fh=mfh)

        else:
            pyodbc.pooling = False

            cur = self.conn.cursor()
            start_time = time.time()
            log.debug(stmt)
            cur.execute(stmt)

            for idx, column in enumerate(cur.description):
                from_cols[idx] = column[0].upper().encode()
                if self.loader.cln in ['Dir']:
                    # A 'Dir' loader has no target table, so the target
                    # column order is derived from the cursor itself.
                    if idx in alt_cols:
                        cname = alt_cols[idx]
                    else:
                        cname = column[0].upper().encode()
                    to_cols[cname] = idx

            col_map = self.get_col_map(from_cols, to_cols)
            pipe = DbStreamer(self.cli, cur=cur, start_time=start_time)

        # NOTE(review): returning from inside the `with` runs
        # StreamSlicer.__exit__ before the caller consumes out.pipe --
        # confirm that is intended.
        with StreamSlicer(cli,
                          pipe,
                          apx,
                          max_rows_to_read=self.cli.max_rows_to_read,
                          col_map=col_map,
                          stmt=stmt) as pipe:
            out.pipe, out.actor, out.col_map = pipe, self.cln, col_map
            return out
示例#24
0
文件: db_file.py 项目: pie-crust/etl
"""
"""

import sys
from pprint import pprint as pp
from include.utils import create_reader, create_writer, create_actor, InOut
# Short alias for sys.exit.
e = sys.exit
# app_init is not defined in the visible code -- presumably injected by the
# script runner (TODO confirm). It unpacks into the CLI object and a
# connection pool.
cli, conn_pool = app_init

# Email actor created once at import time.
Email = create_actor('Email', app_init=app_init)

# Module-level InOut holder passed to FileWriter.open_file in run().
dump_file = InOut()

# presumably rows per extract chunk -- not referenced in the visible code.
file_size_rows = 250000
# Keyword arguments for the e-mail actor; only the subject is preset here.
email_args = {'email_subject': 'DB->file'}


def run():
    """Entry point of the DB->file job.

    For every entry in ``cli.cfg['source']`` a reader for its ``sourceDb``
    and a 'File' writer are created, the writer is attached to the reader as
    loader, and the dump file is opened into the module-level ``dump_file``.

    NOTE(review): only the setup part of the loop is visible in this excerpt.
    """
    lite_tbl = {}
    for _source, val in cli.cfg['source'].items():
        _dbname = val["sourceDb"]
        DB = create_reader(_dbname, app_init=app_init)
        FileWriter = create_writer('File', app_init=app_init)
        #data_files.file_names=[]
        if 1:
            cli.set_source(_source)
            DB.set_loader(FileWriter)

            total_ins = 0
            FileWriter.open_file(out=dump_file)
示例#25
0
from include.utils import create_reader, create_writer, create_actor, InOut

# app_init is not defined in the visible code -- presumably injected by the
# script runner (TODO confirm). It unpacks into the CLI object and a
# connection pool.
cli, conn_pool=app_init


# Reader, writer and e-mail actor created once at import time.
IQ 				= create_reader('IQ',  	app_init=app_init )
IQ_Writer		= create_writer('IQ', 	app_init=app_init )
Email 	  		= create_actor ('Email',app_init=app_init )


# Module-level InOut holders passed between the calls in run().
IQ_cursor		= InOut()
s3_file_names	= InOut()
snow_conn		= InOut()

##
##
# Keyword arguments for the e-mail actor; only the subject is preset here.
email_args={'email_subject':'IQ.procedure->IQ'}
##
##
##
##

def run():
    """Stream from IQ and load into the IQ writer inside one transaction.

    Sequence: attach the writer as loader, open the source stream, then on
    the writer: begin transaction, purge, bulk copy, commit, delete files.
    """
    writer = IQ_Writer
    trans = snow_conn

    IQ.set_loader(writer)
    IQ.open_stream(dbcfg=cli.scfg, qname='sourceStmt', out=IQ_cursor)

    writer.begin_transaction(out=trans)
    writer.purge_data(trans=trans, stmt='purgeStmt')
    writer.bulk_copy(trans=trans,
                     file_names=s3_file_names,
                     target=cli.tcfg,
                     qname='copyStmt')
    writer.commit_transaction(trans=trans)
    writer.delete_files(file_names=s3_file_names)

    # E-mail notification is switched off.
    if 0:
        Email.send_email(**email_args)
示例#26
0
from include.utils import create_reader, create_writer, create_actor, InOut
from include.fmt import pfmtd, psql
from collections import OrderedDict

# app_init is not defined in the visible code -- presumably injected by the
# script runner (TODO confirm). It unpacks into the CLI object and a
# connection pool.
cli, conn_pool=app_init



# Email actor created once at import time.
Email 	  		= create_actor (aname = 'Email', app_init=app_init )


# Module-level InOut holders shared across the job.
IQ_cursor		= InOut()
s3_file_names	= InOut()



dump_file		= InOut()

# presumably rows per extract chunk -- not referenced in the visible code.
file_size_rows=25000

# Keyword arguments for the e-mail actor; only the subject is preset here.
email_args={'email_subject':'IQ.procedure->IQ'}

data_files	= InOut()

data_files.file_names=[]
# Plain dicts here (other job modules in this listing use InOut for stats).
insert_stats= {} 
file_stats= {}
from_conn	= InOut()
to_conn	= InOut()
def run():
	"""Entry point of the job.

	NOTE(review): only the initialisation of ``stats`` is visible in this excerpt.
	"""
	stats={}
示例#27
0
def run():
    """Extract each source to a dump file, load it, and verify row counts.

    For every source / dump class / target class combination:
      1. rows are fetched in chunks and appended to ``dump_file`` (a header
         is always written, even when no rows come back);
      2. the dump file is checked to contain at least one data line;
      3. the dump directory is globbed into ``data_files`` and bulk-inserted
         through the target writer inside its own transaction.
    After the targets of a dump class are processed, the dump is deleted and
    a stats record is stored. At the end the counts are asserted to match
    and the stats are e-mailed.

    Raises:
        AssertionError: if a dump file has no data lines or the
            source/extracted/inserted counts differ.
    """
    stats = {}

    for _source, val in cli.cfg['source'].items():
        cli.set_source(_source)
        _src_class = list(val.keys())[0]
        cli.scfg = scfg = cli.get_scfg(_src_class)

        _dbname = cli.scfg["sourceDb"]
        fromDB = create_reader(aname=_src_class, app_init=app_init)
        data_files.file_names = []

        cli.set_source(_source)

        fromDB.begin_transaction(env=cli.scfg['sourceDb'], out=from_conn)
        for _dmp_class, val in cli.cfg['dump'][_source].items() or []:
            FileWriter = create_writer(aname=_dmp_class, app_init=app_init)
            fromDB.set_loader(FileWriter)

            cli.dcfg = dcfg = cli.get_dcfg(_dmp_class)

            for _trg_class, val in cli.cfg['target'][_source].items(
            ) or []:

                cli.tcfg = tcfg = cli.get_tcfg(_trg_class)
                file_ins_cnt = 0
                total_ins = 0
                FileWriter.open_file(out=dump_file)
                print(dump_file.fpath)

                # --- extract to the dump file ---
                for iq_data in fromDB.fetch_many(
                        chunk_size=file_size_rows,
                        source=cli.scfg,
                        qname='sourceStmt',
                        out=InOut(),
                        skip_header=0,
                        terminate_line=term_line):
                    if not file_ins_cnt:
                        # Header goes in once, before the first chunk.
                        FileWriter.create_header(
                            file=dump_file,
                            header=fromDB.get_header(),
                            cfg=cli.dcfg,
                            terminate_line=term_line)
                    FileWriter.append_data(file=dump_file,
                                           data=iq_data,
                                           cfg=cli.dcfg)
                    file_ins_cnt += len(iq_data.data)
                if not file_ins_cnt:  # in case there's no data
                    FileWriter.create_header(
                        file=dump_file,
                        header=fromDB.get_header(),
                        cfg=cli.dcfg,
                        terminate_line=term_line)
                FileWriter.close_file(file=dump_file)
                total_ins += file_ins_cnt

                # --- check that the dump file has data lines ---
                dataFile = create_reader(aname='File',
                                         app_init=app_init,
                                         file_name=dump_file.fpath,
                                         scfg=dcfg)
                dataFile.describe()
                lcnt = dataFile.line_count() - cli.header_size(dcfg)
                assert lcnt, 'Dump file is empty\n%s' % dump_file.fpath

                # --- load to DB ---
                cli.set_source(_source)
                dir_scfg = cli.get_dcfg(_dmp_class)
                path = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg)
                DirReader = create_reader(aname=_dmp_class,
                                          app_init=app_init)
                DirReader.glob_dir(path=path,
                                   out=data_files,
                                   ext='*.*')

                to_conn = InOut()
                _todbname = tcfg["targetDb"]
                toDB = create_writer(aname=_todbname,
                                     app_init=app_init)
                # NOTE(review): the target transaction is opened with the
                # *source* env (cli.scfg['sourceDb']); sibling jobs use
                # tcfg['targetDb'] -- confirm this is intended.
                toDB.begin_transaction(env=cli.scfg['sourceDb'],
                                       out=to_conn)
                toDB.bulk_insert(trans=to_conn,
                                 file_names=data_files,
                                 qname='insertStmt',
                                 cfg=(dir_scfg, tcfg),
                                 out=insert_stats)
                toDB.commit_transaction(trans=to_conn)

            FileWriter.delete_dump(data_files)

            # NOTE(review): _todbname / total_ins come from the last target
            # iteration above and are unbound if there were no targets.
            stats['%s->%s' % (_dbname, _todbname)] = st = OrderedDict()
            # Was `cli.get_src_row_count(DB)`: `DB` is never bound in this
            # function; the reader object here is `fromDB`.
            st['source_cnt'] = cli.get_src_row_count(
                fromDB) if not cli.lame_duck else cli.lame_duck
            st['total_extracted'] = total_ins
            st['total_inserted'] = insert_stats.inserted_cnt

    # Counts must agree end to end.
    for k, v in stats.items():
        assert v['source_cnt'] == v['total_extracted'], " %s <> %s" % (
            v['source_cnt'], v['total_extracted'])
        assert v['source_cnt'] == v['total_inserted']

    email_args.update(dict(cli_stats=stats))
    Email.send_email(**email_args)
示例#28
0
def run():
	"""Validate columns, extract sources to dump files, and load them.

	Phases, in order:
	  1. Validate: for every source/target pair, compare the target table
	     columns (``toDB.get_cols()``) with the configured column mappings
	     and raise on any difference.
	  2. Transfer: fetch each source in chunks into the dump file, adding
	     the row counts to ``total_ins``.
	  3. Load: for every dump entry, glob the dump directory, check each
	     file's header against config and target table, bulk-load, compare
	     per-file counts, then run the rollback statement (on a count
	     mismatch) or the purge statement.

	Raises:
		Exception: when config, dump-file and target-table columns differ.
		AssertionError: when a dump file is empty or counts do not match.
	"""
	stats={}
	total_ins = 0
	term_line = True
	#//validate cols
	for _source, val in cli.cfg['source'].items():
		cli.set_source(_source)
		_src_class = list(val.keys())[0]
		cli.scfg= scfg=cli.get_scfg(_src_class)
		for _trg_class, val in cli.cfg['target'][_source].items() or []:
			cli.tcfg= tcfg =  cli.get_tcfg(_trg_class)

			if tcfg.get('accountingDate', None): #//set acct_year, acct_mon for new target table naming
				fmt=cli.get_parsed(ckey='accountingDateFmt', cfg=tcfg) 
				cli.set_target_table(tcfg=tcfg, acct_date=cli.get_parsed(ckey='accountingDate', cfg=tcfg), fmt=fmt)
				
			_dbname = tcfg["targetDb"]
			toDB 	= create_writer (aname =_trg_class,	app_init=app_init )
			
			toDB.begin_transaction  ( env =tcfg['targetDb'] , out 	= to_conn )
			table='%s.%s' % (tcfg['targetSchema'], tcfg['targetTable'])
			toDB.desc_table(schema=tcfg['targetSchema'], tbl=tcfg['targetTable'], col_ord=False)
			#// validate cols
			cfg_cols=[x[u'columnName'] for x in cli.scfg[u'columnMappings']]
			tcols=toDB.get_cols()
			# t_vs_c: in target table but not in config; c_vs_t: the reverse.
			t_vs_c  = set(tcols) -set(cfg_cols)
			c_vs_t  = set(cfg_cols) -set(tcols)
			

			# Each branch prints the opposite difference for context, then raises.
			if t_vs_c: 
				pfmtd([dict(c_vs_t = c_vs_t)], 'Config has columns missing in target table.')
				
				raise Exception('Target table has columns missing in config: %s' % t_vs_c)
			
			if c_vs_t: 
				pfmtd([dict(t_vs_c = t_vs_c)], 'Target table has columns missing in config.')
				raise Exception('Config has columns missing in target table: %s' % c_vs_t)
			toDB.commit_transaction ( trans	= to_conn)
	#//transfer
	for _source, val in cli.cfg['source'].items():
		cli.set_source(_source)
		_src_class = list(val.keys())[0]
		cli.scfg= scfg=cli.get_scfg(_src_class)

		_dbname=cli.scfg["sourceDb"]
		#// in include/extractor
		
		fromDB 			= create_reader(aname = _src_class,	app_init=app_init )


		fromDB.begin_transaction  ( env =cli.scfg['sourceDb'] , out = from_conn )
		if 1: #//Extract to File
			
			for _dmp_class, val in cli.cfg['dump'][_source].items() or []:
				FileWriter 	= create_writer(aname =_dmp_class,	app_init=app_init ) 
				fromDB.set_loader(FileWriter)
				cli.dcfg= cli.get_dcfg(_dmp_class)
				for _trg_class, val in cli.cfg['target'][_source].items() or []:

					cli.tcfg= tcfg =  cli.get_tcfg(_trg_class)
					file_ins_cnt= 0
					FileWriter.open_file( out = dump_file )
					start_time = time.time()
					# //if fetch_many is not in IQ - it's in include/extractor/common/Extractor.py
					for iq_data in fromDB.fetch_many ( chunk_size=file_size_rows,  source = cli.scfg, qname = 'sourceStmt', out=InOut(), skip_header=0, terminate_line= term_line):
						if 1:
							# Header goes in once, before the first chunk.
							if not file_ins_cnt:
								FileWriter.create_header(file = dump_file, header = fromDB.get_header(), cfg=cli.dcfg, terminate_line= term_line)
							FileWriter.append_data ( file = dump_file,  data = iq_data, cfg=cli.dcfg)
							file_ins_cnt+=len(iq_data.data)
							FileWriter.terminate(file = dump_file)
						print (len(iq_data.data))
						print ('Elapsed read/write: %s' % (time.time() - start_time))
						start_time = time.time()
						
						
					if not file_ins_cnt: #in case there's no data
						FileWriter.create_header(file = dump_file, header = fromDB.get_header(), cfg=cli.dcfg, terminate_line= term_line)
					#else:
					#	FileWriter.terminate(file = dump_file)
					
					FileWriter.close_file(file = dump_file)

					total_ins +=file_ins_cnt
		fromDB.desc_cur(cur = from_conn.cur, colord=False)
		
		fromDB.commit_transaction ( trans	= from_conn)
	log.info('Total records saved: %d' % total_ins)
	#// Load to IQ
	
	for _source, val in cli.cfg['dump'].items():
		cli.set_source(_source)
		_src_class = list(val.keys())[0]


		DirReader 	= create_reader(aname = _src_class,	app_init=app_init )

			
		if 1: #//Get the file names
			cli.set_source(_source)
			dir_scfg = cli.get_dcfg(_src_class)
			path = cli.get_parsed(ckey='dumpDir', cfg=dir_scfg)

			DirReader.glob_dir(path=path,  out = data_files, ext='*.*')
			
		if 1: #//Load to DB
			
			for _trg_class, val in cli.cfg['target'][_source].items() or []:

				cli.tcfg= tcfg =  cli.get_tcfg(_trg_class)

				_dbname = tcfg["targetDb"]
				toDB 	= create_writer (aname =_trg_class,	app_init=app_init )
				
				toDB.begin_transaction  ( env =tcfg['targetDb'] , out 	= to_conn )
				
				table='%s.%s' % (tcfg['targetSchema'], tcfg['targetTable'])
				toDB.desc_table(schema=tcfg['targetSchema'], tbl=tcfg['targetTable'], col_ord=False)

				
				#// validate cols
				# NOTE(review): cli.scfg / scfg still hold the config of the last
				# source processed by the transfer loop above -- confirm intended.
				cfg_cols=[x[u'columnName'] for x in cli.scfg[u'columnMappings']]
				
				acols= cli.get_alt_cols(scfg)
				tcols=toDB.get_cols()
				fcols_alt=[]
				for data_file in data_files.file_names:
					dataFile 	= create_reader(aname = 'File',	app_init=app_init, file_name=data_file, scfg=dir_scfg)
					dataFile.describe()
					file_stats[data_file] =  dataFile.line_count() - cli.header_size(dir_scfg)
					# File header names, replaced by their alternative name where one is configured.
					fcols_alt=[acols.get(x.decode(),x.decode()) for x in  dataFile.get_header(data_file, dir_scfg)]
					# f = dump file, c = config, t = target table; x_vs_y means "in x but not in y".
					f_vs_c  = set(fcols_alt) -set(cfg_cols)
					c_vs_f  = set(cfg_cols) -set(fcols_alt)
					f_vs_t = set(fcols_alt) -set(tcols)
					t_vs_f = set(tcols) -set(fcols_alt)
					if f_vs_c: 
						pfmtd([dict(c_vs_f = c_vs_f)], 'Config has columns missing in dump file.')
						pfmtd([dict(f_vs_t = f_vs_t)], 'Dump file has columns missing in target table.')
						pfmtd([dict(t_vs_f = t_vs_f)], 'Target table has columns missing in dump file.')
						# Message fixed: f_vs_c compares dump file and config, not the target table.
						raise Exception('Dump file has columns missing in config: %s' % f_vs_c)
					
					if c_vs_f: 
						pfmtd([dict(f_vs_c = f_vs_c)], 'Dump file has columns missing in config.')
						pfmtd([dict(f_vs_t = f_vs_t)], 'Dump file has columns missing in target table.')
						pfmtd([dict(t_vs_f = t_vs_f)], 'Target table has columns missing in dump file.')
						# Message fixed: c_vs_f compares config and dump file, not the target table.
						raise Exception('Config has columns missing in dump file: %s' % c_vs_f)

					if f_vs_t: 
						pfmtd([dict(f_vs_c = f_vs_c)], 'Dump file has columns missing in config.')
						pfmtd([dict(c_vs_f = c_vs_f)], 'Config has columns missing in dump file.')
						pfmtd([dict(t_vs_f = t_vs_f)], 'Target table has columns missing in dump file.')
						raise Exception('Dump file has columns missing in target table: %s' % f_vs_t)
					if t_vs_f: 
						pfmtd([dict(f_vs_c = f_vs_c)], 'Dump file has columns missing in config.')
						pfmtd([dict(c_vs_f = c_vs_f)], 'Config has columns missing in dump file.')
						pfmtd([dict(f_vs_t = f_vs_t)], 'Dump file has columns missing in target table.')
						raise Exception('Target table has columns missing in dump file: %s' % t_vs_f)


				#toDB.truncate_table		( table = table )
				toDB.bulk_load			( trans	= to_conn, file_names = data_files,  qname = 'insertStmt', cfg = (dir_scfg, tcfg), out=insert_stats, header=fcols_alt)
				toDB.commit_transaction ( trans	= to_conn)
				
				# Per-file check: inserted row count must equal the file's data line count.
				for k in file_stats.keys():
					assert file_stats[k], 'Dump file is empty'
					assert insert_stats[k] not in [-1], 'Insert failed'
					assert insert_stats[k] == file_stats[k], 'Insert vs file count diff: %s<>%s for file \n%s' % (insert_stats[k] , file_stats[k], k)
				
				if 1:
					stmt = cli.get_parsed(ckey='afterCountStmt', cfg=tcfg)
					
					cur = toDB.exec_query(stmt)
					after_cnt= cur.fetchall()[0][0]
					print(after_cnt)

				stats['%s->%s' % (_source, _trg_class)] =st=  OrderedDict()
				st['source_cnt']		= total_ins 

				st['total_inserted'] 	= sum(insert_stats.values())
				st['after_count'] 		= after_cnt
				st['rollback']			= cli.get_parsed(ckey='rollbackStmt', cfg=tcfg)
				st['purge']				= cli.get_parsed(ckey='purgeStmt', cfg=tcfg)

				if 1: #//validate
		
					try:
						assert st['source_cnt'] == st['total_inserted'],  "source_cnt %s <> total_inserted %s" 	% ( st['source_cnt'], st['total_inserted'])
						assert st['source_cnt'] == st['after_count'] , 	"source_cnt %s <> after_count %s" 		% ( st['source_cnt'], st['after_count'])
					except Exception as ex:
						# Counts disagree: run the configured rollback statement, then re-raise.
						del_cnt = toDB.exec_dml( dml=st['rollback'], trans=to_conn, commit=True) 
						log.info('Rolled back recs: %d' % del_cnt)
						raise 
				if 1: #//purge
					purge_cnt = toDB.exec_dml( dml=st['purge'], trans=to_conn, commit=True) 
					log.info('Purged old recs: %d' % purge_cnt)
				toDB.commit_transaction( trans	= to_conn )
	
	if 0:
		Email.send_email( **email_args )
示例#29
0
"""

"""
import sys
import threading
import subprocess
from pprint import pprint as pp
from include.utils import create_reader, create_writer, create_actor, InOut
# Short alias for sys.exit.
e = sys.exit
# app_init is not defined in the visible code -- presumably injected by the
# script runner (TODO confirm). It unpacks into the CLI object and a
# connection pool.
cli, conn_pool = app_init

# Reader, writer and e-mail actor created once at import time.
Dir = create_reader('Dir', app_init=app_init)
SQLite = create_writer('SQLite', app_init=app_init)
Email = create_actor('Email', app_init=app_init)

# Module-level InOut holders shared across the job.
data_files = InOut()
lite_conn = InOut()

##
##
# Keyword arguments for the e-mail actor; only the subject is preset here.
email_args = {'email_subject': 'File->SQLite'}
##
##

# Starts empty; passed as `out` to Dir.get_files in run().
data_files.file_names = []


def run():
    """Entry point of the File->SQLite job.

    Starts by asking the Dir reader to fill the module-level ``data_files``.

    NOTE(review): only this first step is visible in this excerpt.
    """

    Dir.get_files(out=data_files)
示例#30
0
"""
 time python cli.py -nopp 1 -dcf config/db_config.json -pcf config/proc/file_s3_snow/DY_Position_SD.json --proc_params  \
 test.csv.gz -ld 100\
 2>&1| tee DY_Position_SD.log
"""
import threading
import subprocess
from include.utils import create_reader, create_writer, create_actor, InOut

# app_init is not defined in the visible code -- presumably injected by the
# script runner (TODO confirm). It unpacks into the CLI object and a
# connection pool.
cli, conn_pool = app_init

# Writers and e-mail actor created once at import time.
Snowflake = create_writer('Snowflake', app_init=app_init)
S3 = create_writer('S3', app_init=app_init)
Email = create_actor('Email', app_init=app_init)

# Module-level InOut holders shared across the job.
s3_file_names = InOut()
snow_conn = InOut()

##
##
# Keyword arguments for the e-mail actor; only the subject is preset here.
email_args = {'email_subject': 'File->Snowflake'}
##
##
# presumably maps a key to a worker thread -- not used in the visible code.
threads = {}
s3_file_names.file_names = []


def run():

    Snowflake.begin_transaction(out=snow_conn)
    assert isinstance(