Example #1
    def testRepeat(self):
        """ Testing repeatability without CSS backend """

        workers = ['worker1', 'worker2']
        database = 'TESTDB'
        table = 'TABLE'
        mapper = ChunkMapping(workers, database, table)

        # repeat the same thing twice, must get identical output
        wmap1 = [mapper.worker(w) for w in range(10)]
        wmap2 = [mapper.worker(w) for w in range(10)]
        self.assertEqual(wmap1, wmap2)
Example #2
    def testRR(self):
        """ Testing round-robin without CSS backend """

        workers = ['worker1', 'worker2']
        database = 'TESTDB'
        table = 'TABLE'
        mapper = ChunkMapping(workers, database, table)

        # current implementation works in round-robin mode
        wmap = [mapper.worker(w) for w in range(10)]
        for i, worker in enumerate(wmap):
            expect = workers[i % len(workers)]
            self.assertEqual(worker, expect)
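
The two tests above (Examples #1 and #2) pin down ChunkMapping's fallback behavior:
without a CSS backend, worker() must be deterministic and hand out workers
round-robin. A minimal sketch consistent with these tests (the real ChunkMapping
also consults CSS and persists assignments; all names here are illustrative):

    class ChunkMappingSketch(object):
        """Chunk-to-worker mapping with round-robin fallback (sketch only)."""

        def __init__(self, workers, database, table, css=None):
            self.workers = list(workers)
            self.database = database
            self.table = table
            self.css = css              # ignored in this sketch
            self._assigned = {}         # chunkId -> worker name

        def worker(self, chunk):
            # reuse an earlier assignment so repeated calls are identical
            if chunk in self._assigned:
                return self._assigned[chunk]
            # otherwise assign workers round-robin in order of first request
            name = self.workers[len(self._assigned) % len(self.workers)]
            self._assigned[chunk] = name
            return name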
Example #3
    def _run(self, database, table, schema, data):
        """
        Do loading only, cleanup is done in _cleanup()
        """

        # see if database is already defined in CSS and get its partitioning info
        if self.css is not None:
            self._checkCss(database, table)

        # make chunk mapper
        self.chunkMap = ChunkMapping(list(self.workerWmgrMap.keys()), database,
                                     table, self.css)

        # make chunks directory or check that there are usable data there already
        self._makeOrCheckChunksDir(data)

        # uncompress data files that are compressed, this is only needed if
        # table is not partitioned or if we are not reusing existing chunks
        files = data
        if not (self.partitioned and self.skipPart and not self.oneTable):
            files = self._gunzip(data)

        # run partitioner if necessary
        if files and self.callPartitioner:
            self._runPartitioner(files)

        # drop existing tables
        if self.deleteTables:
            self._deleteTable(database, table)

        # create table
        self._createTable(database, table, schema)

        # load data
        self._loadData(database, table, files)

        # create special dummy chunk
        self._createDummyChunk(database, table)

        # create index on czar side
        self._makeIndex(database, table)

        # update CSS with info for this table
        if self.css is not None:
            self._updateCss(database, table)

        # optionally make emptyChunks file
        self._makeEmptyChunks()
Example #4
    def _run(self, database, table, schema, data):
        """
        Do loading only, cleanup is done in _cleanup()
        """

        # see if database is already defined in CSS and get its partitioning info
        if self.css is not None:
            self._checkCss(database, table)

        # make chunk mapper
        self.chunkMap = ChunkMapping(list(self.workerWmgrMap.keys()), database, table, self.css)

        # make chunks directory or check that there are usable data there already
        self._makeOrCheckChunksDir(data)

        # uncompress data files that are compressed, this is only needed if
        # table is not partitioned or if we are not reusing existing chunks
        files = data
        if not (self.partitioned and self.skipPart and not self.oneTable):
            files = self._gunzip(data)

        # run partitioner if necessary
        if files and self.callPartitioner:
            self._runPartitioner(files)

        # drop existing tables
        if self.deleteTables:
            self._deleteTable(database, table)

        # create table
        self._createTable(database, table, schema)

        # load data
        self._loadData(database, table, files)

        # create special dummy chunk
        self._createDummyChunk(database, table)

        # create index on czar side
        self._makeIndex(database, table)

        # update CSS with info for this table
        if self.css is not None:
            self._updateCss(database, table)

        # optionally make or update emptyChunks file
        if not self.doNotResetEmptyChunks:
            self._log.info('Creating empty chunks file')
            self._makeEmptyChunks()
        else:
            self._log.info('Updating existing empty chunks file')
            self._updateEmptyChunks()
Example #5
    def testCss1(self):
        """ Test for reading data from CSS """

        # instantiate kvI with some initial data
        initData = """\
/\t\\N
/css_meta\t\\N
/css_meta/version\t{version}
/DBS\t\\N
/DBS/{db}\t\\N
/DBS/{db}/TABLES\t\\N
/DBS/{db}/TABLES/{table}\t\\N
/DBS/{db}/TABLES/{table}/CHUNKS\t\\N
/DBS/{db}/TABLES/{table}/CHUNKS/333\t\\N
/DBS/{db}/TABLES/{table}/CHUNKS/333/REPLICAS\t\\N
/DBS/{db}/TABLES/{table}/CHUNKS/333/REPLICAS/1\t\\N
/DBS/{db}/TABLES/{table}/CHUNKS/333/REPLICAS/1/.packed.json\t{{"nodeName": "worker333"}}
/DBS/{db}/TABLES/{table}/CHUNKS/765\t\\N
/DBS/{db}/TABLES/{table}/CHUNKS/765/REPLICAS\t\\N
/DBS/{db}/TABLES/{table}/CHUNKS/765/REPLICAS/1\t\\N
/DBS/{db}/TABLES/{table}/CHUNKS/765/REPLICAS/1/.packed.json\t{{"nodeName": "worker765"}}
"""

        workers = ['worker1', 'worker2']
        database = 'TESTDB'
        table = 'TABLE'

        initData = initData.format(version=css.VERSION, db=database, table=table)
        css_inst = _makeCss(initData)

        mapper = ChunkMapping(workers, database, table, css_inst)

        # check that pre-defined chunks work
        worker = mapper.worker(333)
        self.assertEqual(worker, 'worker333')
        worker = mapper.worker(765)
        self.assertEqual(worker, 'worker765')

        # chunks that are not in CSS should return workers from the list
        worker = mapper.worker(1)
        self.assertEqual(worker, 'worker1')
        worker = mapper.worker(2)
        self.assertEqual(worker, 'worker2')
Example #6
    def testCss1(self):
        """ Test for reading data from CSS """

        # instantiate kvI with some initial data
        initData = """\
/\t\\N
/css_meta\t\\N
/css_meta/version\t{version}
/DBS\t\\N
/DBS/{db}\t\\N
/DBS/{db}/TABLES\t\\N
/DBS/{db}/TABLES/{table}\t\\N
/DBS/{db}/TABLES/{table}/CHUNKS\t\\N
/DBS/{db}/TABLES/{table}/CHUNKS/333\t\\N
/DBS/{db}/TABLES/{table}/CHUNKS/333/REPLICAS\t\\N
/DBS/{db}/TABLES/{table}/CHUNKS/333/REPLICAS/1\t\\N
/DBS/{db}/TABLES/{table}/CHUNKS/333/REPLICAS/1/.packed.json\t{{"nodeName": "worker333"}}
/DBS/{db}/TABLES/{table}/CHUNKS/765\t\\N
/DBS/{db}/TABLES/{table}/CHUNKS/765/REPLICAS\t\\N
/DBS/{db}/TABLES/{table}/CHUNKS/765/REPLICAS/1\t\\N
/DBS/{db}/TABLES/{table}/CHUNKS/765/REPLICAS/1/.packed.json\t{{"nodeName": "worker765"}}
"""

        workers = ['worker1', 'worker2']
        database = 'TESTDB'
        table = 'TABLE'

        initData = initData.format(version=css.VERSION,
                                   db=database,
                                   table=table)
        css_inst = _makeCss(initData)

        mapper = ChunkMapping(workers, database, table, css_inst)

        # check that pre-defined chunks work
        worker = mapper.worker(333)
        self.assertEqual(worker, 'worker333')
        worker = mapper.worker(765)
        self.assertEqual(worker, 'worker765')

        # chunks that are not in CSS should return workers from the list
        worker = mapper.worker(1)
        self.assertEqual(worker, 'worker1')
        worker = mapper.worker(2)
        self.assertEqual(worker, 'worker2')
Example #7
class DataLoader(object):
    """
    DataLoader class defines all logic for loading data, including data
    partitioning, CSS updating, etc. It is driven by a set of configuration
    files which are passed to constructor.
    """

    def __init__(self, configFiles, czarWmgr, workerWmgrMap={}, chunksDir="./loader_chunks",
                 chunkPrefix='chunk', keepChunks=False, skipPart=False, oneTable=False,
                 css=None, cssClear=False, indexDb='qservMeta', tmpDir=None,
                 emptyChunks=None, deleteTables=False, loggerName=None,
                 doNotResetEmptyChunks=None, doNotRegisterXrootdDb=None, doNotResetCSSTable=None):
        """
        Constructor parses all arguments and prepares for execution.

        @param configFiles:  Sequence of the files defining all partitioning options.
        @param czarWmgr:     WmgrClient instance for czar node.
        @param workerWmgrMap: Dictionary mapping worker host name to corresponding
                             WmgrClient instance. May be empty, in which case czar
                             server will be used for all data.
        @param chunksDir:    Temporary directory to store chunks files, will be created
                             if does not exist.
        @param chunkPrefix:  File name prefix for generated chunk files.
        @param keepChunks:   Chunks will not be deleted if this argument is set to True.
        @param skipPart:     If set to True then partitioning will not be performed
                             (chunks should exist already).
        @param oneTable:     If set to True then load all data into one table, do not
                             create chunk tables.
        @param css:          Instance of CssAccess class, None if CSS operations are disabled.
        @param cssClear:     If true then CSS info for a table will be deleted first.
        @param indexDb:      Name of database for object indices, index is generated for director
                             table when it is partitioned, use empty string to disable index.
        @param tmpDir:       Temporary directory to store uncompressed files. If None then directory
                             inside chunksDir will be used. Will be created if does not exist.
        @param emptyChunks:  Path name for "empty chunks" file, may be None.
        @param deleteTables: If True then existing tables in database will be deleted.
        @param loggerName:   Logger name used for logging all messages from loader.
        @param doNotResetEmptyChunks: If True then update the existing "empty chunks"
                             file instead of re-creating it.
        @param doNotRegisterXrootdDb: If True then do not register database with xrootd.
        @param doNotResetCSSTable: If True then keep an existing CSS table definition
                             instead of raising an error.
        """

        if not loggerName:
            loggerName = __name__
        self._log = logging.getLogger(loggerName)

        self.configFiles = configFiles
        self.czarWmgr = czarWmgr
        self.workerWmgrMap = workerWmgrMap.copy()
        self.chunksDir = chunksDir
        self.tmpDir = tmpDir
        self.chunkPrefix = chunkPrefix
        self.keepChunks = keepChunks
        self.skipPart = skipPart
        self.oneTable = oneTable
        self.css = css
        self.cssClear = cssClear
        self.indexDb = None if oneTable else indexDb
        self.emptyChunks = emptyChunks
        self.doNotResetEmptyChunks = doNotResetEmptyChunks
        self.doNotRegisterXrootdDb = doNotRegisterXrootdDb
        self.doNotResetCSSTable = doNotResetCSSTable
        self.deleteTables = deleteTables

        self.chunkRe = re.compile('^' + self.chunkPrefix + '_(?P<id>[0-9]+)(?P<ov>_overlap)?[.]txt$')
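        # matches e.g. 'chunk_1234.txt' and 'chunk_1234_overlap.txt' with the default prefix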
        self.cleanupDirs = []
        self.cleanupFiles = []
        self.unzipDir = None   # directory used for uncompressed data
        self.schema = None     # "CREATE TABLE" statement
        self.chunks = set()    # set of chunks that were loaded
        self.chunkMap = None
        self.createdChunks = set()

        # parse all config files, this can raise an exception
        self.partOptions = PartConfig(configFiles)

        # Logic is slightly complicated here, so pre-calculate options that we need below:
        # 1. If self.skipPart and self.oneTable are both true then we skip partitioning
        #    even for partitioned tables and load original data. So if self.skipPart and
        #    self.oneTable are both true then we say table is not partitioned
        # 2. Partitioning is done only for partitioned table, if self.skipPart is true then
        #    pre-partitioned data must exist already and we skip calling partitioner
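        #
        # Summary of resulting behavior:
        #   partitioned skipPart oneTable | partitioner | data loaded into
        #   True        False    False    | runs        | per-chunk tables
        #   True        False    True     | runs        | one table (chunk files)
        #   True        True     False    | skipped     | per-chunk tables (existing chunks)
        #   True        True     True     | skipped     | one table (original input files)
        #   False       -        -        | skipped     | one plain table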

        # is table partitioned (or pre-partitioned)?
        self.partitioned = self.partOptions.partitioned

        # do we need to run partitioner?
        self.callPartitioner = self.partitioned and not self.skipPart

    def load(self, database, table, schema, data):
        """
        Do actual loading based on parameters defined in constructor.
        This will throw exception if anything goes wrong.

        @param database:     Database name.
        @param table:        Table name.
        @param schema:       File name which contains SQL with CREATE TABLE/VIEW.
        @param data:         List of file names with data, may be empty (e.g. when
                             defining views instead of tables).
        """

        if not _mysql_identifier_validator(table):
            raise ValueError('MySQL table name not allowed: ' + table)
        if not _mysql_identifier_validator(database):
            raise ValueError('MySQL database name not allowed: ' + database)

        try:
            return self._run(database, table, schema, data)
        finally:
            self._cleanup()

    def _run(self, database, table, schema, data):
        """
        Do loading only, cleanup is done in _cleanup()
        """

        # see if database is already defined in CSS and get its partitioning info
        if self.css is not None:
            self._checkCss(database, table)

        # make chunk mapper
        self.chunkMap = ChunkMapping(list(self.workerWmgrMap.keys()), database, table, self.css)

        # make chunks directory or check that there are usable data there already
        self._makeOrCheckChunksDir(data)

        # uncompress data files that are compressed, this is only needed if
        # table is not partitioned or if we are not reusing existing chunks
        files = data
        if not (self.partitioned and self.skipPart and not self.oneTable):
            files = self._gunzip(data)

        # run partitioner if necessary
        if files and self.callPartitioner:
            self._runPartitioner(files)

        # drop existing tables
        if self.deleteTables:
            self._deleteTable(database, table)

        # create table
        self._createTable(database, table, schema)

        # load data
        self._loadData(database, table, files)

        # create special dummy chunk
        self._createDummyChunk(database, table)

        # create index on czar side
        self._makeIndex(database, table)

        # update CSS with info for this table
        if self.css is not None:
            self._updateCss(database, table)

        # optionally make or update emptyChunks file
        if not self.doNotResetEmptyChunks:
            self._log.info('Creating empty chunks file')
            self._makeEmptyChunks()
        else:
            self._log.info('Updating existing empty chunks file')
            self._updateEmptyChunks()

    def _cleanup(self):
        """
        Do cleanup, remove all temporary files, this should not throw.
        """

        # remove tmp files
        for fName in self.cleanupFiles:
            try:
                self._log.debug('Deleting temporary file: %r', fName)
                os.unlink(fName)
            except Exception as exc:
                self._log.error('Failed to remove temporary file: %r', exc)

        # remove directories
        for dirName in self.cleanupDirs:
            try:
                self._log.debug('Deleting directory: %r', dirName)
                shutil.rmtree(dirName)
            except Exception as exc:
                self._log.error('Failed to remove directory: %r', exc)

    def _checkCss(self, database, table):
        """
        Check CSS for existing configuration and see if it matches ours.
        Throws exception if any irregularities are observed.
        """

        self._log.info('Verifying CSS info for table %r', table)

        # get striping info
        try:
            striping = self.css.getDbStriping(database)
            self._log.debug('CSS database striping info: %r', striping)
        except css.NoSuchDb:
            # we'll create it later
            return

        # check parameters
        self._checkPartParam(self.partOptions, 'part.num-stripes', striping.stripes, int)
        self._checkPartParam(self.partOptions, 'part.num-sub-stripes', striping.subStripes, int)
        self._checkPartParam(self.partOptions, 'part.default-overlap', striping.overlap, float)

        # also check that table does not exist in CSS, or optionally remove it
        cssTableExists = self.css.containsTable(database, table)
        if cssTableExists:
            if self.cssClear:
                # try to remove it
                self.css.dropTable(database, table)
            else:
                if not self.doNotResetCSSTable:
                    self._log.error('Table is already defined in CSS')
                    raise RuntimeError('table exists in CSS')
                self._log.warning('Table is already defined in CSS, keeping existing definition')

    @staticmethod
    def _checkPartParam(partOptions, partKey, cssValue, optType=str):
        """
        Check that partitioning parameters are compatible. Throws exception
        if there is a mismatch.
        """
        optValue = optType(partOptions[partKey])
        if optValue != cssValue:
            raise ValueError('Option %r does not match CSS: %r != %r' %
                             (partKey, optValue, cssValue))

    def _makeOrCheckChunksDir(self, data):
        '''Create directory for chunk data or check that it exists, throws in case of errors.'''

        # only need it for partitioned table
        if not self.partitioned:
            return

        # in case we do skip-part but load into one table then we just take
        # data from command line if it is specified
        if self.oneTable and self.skipPart and data:
            return

        chunks_dir = self.chunksDir

        # if it exists it must be directory
        exists = False
        if os.path.exists(chunks_dir):
            exists = True
            if not os.path.isdir(chunks_dir):
                self._log.error('Path for chunks exists but is not a directory: %r', chunks_dir)
                raise IOError('chunk path is not directory: ' + chunks_dir)

        if self.skipPart:
            # directory must exist and have some files (chunk_index.bin at least)
            if not exists:
                self._log.error('Chunks directory does not exist: %r', chunks_dir)
                raise RuntimeError('chunk directory is missing')
            path = os.path.join(chunks_dir, 'chunk_index.bin')
            if not os.path.exists(path):
                self._log.error('Could not find required file (chunk_index.bin) in chunks directory')
                raise RuntimeError('chunk_index.bin is missing')
        else:
            if exists:
                # must be empty, we do not want any extraneous stuff there
                if os.listdir(chunks_dir):
                    self._log.error('Chunks directory is not empty: %r', chunks_dir)
                    raise RuntimeError('chunks directory is not empty: ' + chunks_dir)
            else:
                try:
                    self._log.debug('Creating chunks directory %r', chunks_dir)
                    os.makedirs(chunks_dir)
                    # will remove it later
                    if not self.keepChunks:
                        self.cleanupDirs.append(chunks_dir)
                except Exception as exc:
                    self._log.error('Failed to create chunks directory: %r', exc)
                    raise

    def _runPartitioner(self, files):
        '''Run partitioner to fill chunks directory with data, returns 0 on success.'''

        def fileList(dirName):
            '''Generate a sequence of file names in directory, exclude directories'''
            for fName in os.listdir(dirName):
                path = os.path.join(dirName, fName)
                if os.path.isfile(path):
                    yield path

        # build arguments list
        partitioner = 'sph-partition-matches' if self.partOptions.isRefMatch else 'sph-partition'
        args = [partitioner, '--out.dir', self.chunksDir, '--part.prefix', self.chunkPrefix]
        for config in self.configFiles:
            args += ['--config-file', config]
        for data in files:
            args += ['--in', data]

        try:
            # run partitioner
            self._log.info('run partitioner on files: %r', ' '.join(files))
            self._log.debug('Run shell command: %r', ' '.join(args))
            subprocess.check_output(args=args)
        except Exception as exc:
            self._log.error('Failed to run partitioner: %r', exc)
            raise
        finally:
            # some chunk files may have been created, add them to cleanup list
            if not self.keepChunks:
                self.cleanupFiles += list(fileList(self.chunksDir))

    def _gunzip(self, data):
        """
        Uncompress compressed input files to a temporary directory.
        Returns list of input file names with compressed files replaced by
        uncompressed file location. Throws exception in case of errors.
        """

        result = []
        for infile in data:

            # we rely on file extension to decide whether it is compressed or not,
            # for more reliable way we could use something like "magic" module
            if infile.endswith('.gz'):

                if self.tmpDir is None:

                    # use chunks directory for that
                    if os.path.exists(self.chunksDir):
                        if not os.path.isdir(self.chunksDir):
                            self._log.error('Path for chunks is not a directory: %r', self.chunksDir)
                            raise IOError('chunk path is not directory: ' + self.chunksDir)
                    else:
                        # create it, but don't forget to delete it later
                        self._log.debug('Creating chunks directory %r', self.chunksDir)
                        os.makedirs(self.chunksDir)
                        if not self.keepChunks:
                            self.cleanupDirs.append(self.chunksDir)

                    try:
                        self.tmpDir = tempfile.mkdtemp(dir=self.chunksDir)
                        # need to remove it later, before chunks dir
                        self.cleanupDirs.insert(0, self.tmpDir)
                    except Exception as exc:
                        self._log.critical('Failed to create temp directory for uncompressed files: %r', exc)
                        raise
                    self._log.debug('Created temporary directory %r', self.tmpDir)
                else:
                    # check and create if needed
                    if os.path.exists(self.tmpDir):
                        if not os.path.isdir(self.tmpDir):
                            self._log.critical('Temporary location is not a directory: %r', self.tmpDir)
                            raise IOError('Temporary location is not a directory: ' + self.tmpDir)
                    else:
                        try:
                            os.mkdir(self.tmpDir)
                            self._log.debug('Created temporary directory %r', self.tmpDir)
                            # need to remove it later
                            self.cleanupDirs.append(self.tmpDir)
                        except Exception as exc:
                            self._log.critical('Failed to create temp directory: %r', exc)
                            raise

                # construct output file name
                outfile = os.path.basename(infile)
                outfile = os.path.splitext(outfile)[0]
                outfile = os.path.join(self.tmpDir, outfile)

                # will clean it up later
                self.cleanupFiles.append(outfile)

                self._log.info('Uncompressing %r to %r', infile, outfile)
                try:
                    with open(infile, 'rb') as finput, open(outfile, 'wb') as foutput:
                        cmd = ['gzip', '-d', '-c']
                        subprocess.check_call(args=cmd, stdin=finput, stdout=foutput)
                except Exception as exc:
                    self._log.critical('Failed to uncompress data file: %r', exc)
                    raise

            else:

                # file is already uncompressed
                self._log.debug('Using input file which is not compressed: %r', infile)
                outfile = infile

            result.append(outfile)

        return result

    def _connections(self, useCzar, useWorkers):
        """
        Returns a list of wmgr "connections"; for each connection there is a
        tuple (name, connection) where name is something like "czar" or
        "worker lsst-dbdev2". If the czar connection is included then it
        is always first in the list.

        @param useCzar:     if True then include czar in the list
        @param useWorkers:  if True then include all workers in the list
        """
        res = []
        if useCzar:
            res += [("czar", self.czarWmgr)]
        if useWorkers:
            for worker, wmgr in self.workerWmgrMap.items():
                res += [('worker ' + worker, wmgr)]
        return res

    def _deleteTable(self, database, table):
        """
        Drop existing table and all chunks.
        """

        self._log.info('Deleting existing table %r (and chunks)', table)

        for name, wmgr in self._connections(useCzar=True, useWorkers=True):
            self._log.info('Deleting table from %r', name)
            wmgr.dropTable(database, table, dropChunks=True, mustExist=False)

    def _createTable(self, database, table, schema):
        """
        Create table in the database. Just executes whatever SQL was given to
        us in a schema file. Additionally applies fixes to a schema after loading.
        """

        # read table schema
        try:
            self.schema = open(schema).read()
        except Exception as exc:
            self._log.critical('Failed to read table schema file: %r', exc)
            raise

        # create table on czar and every worker
        for name, wmgr in self._connections(useCzar=True, useWorkers=True):
            self._log.info('Creating table %r in %r', table, name)
            chunkColumns = bool(self.partitioned)
            try:
                wmgr.createTable(database, table, schema=self.schema, chunkColumns=chunkColumns)
            except ServerError as exc:
                if exc.code == 409:
                    self._log.info('Table %r exists in %r', table, name)
                else:
                    self._log.critical('Failed to create table %r in %r', table, name)
                    raise

    def _loadData(self, database, table, files):
        """
        Load data into existing table.
        """
        if not self.partitioned or (self.oneTable and self.skipPart and files):
            # load files given on command line
            self._loadNonChunkedData(database, table, files)
        else:
            # load data from chunk directory
            self._loadChunkedData(database, table)

    def _chunkFiles(self):
        """
        Generator method which returns list of all chunk files. For each chunk returns
        a triplet (path:string, chunkId:int, overlap:bool).
        """
        for dirpath, _, filenames in os.walk(self.chunksDir, followlinks=True):
            for fileName in filenames:
                match = self.chunkRe.match(fileName)
                if match is not None:
                    path = os.path.join(dirpath, fileName)
                    chunkId = int(match.group('id'))
                    overlap = match.group('ov') is not None
                    yield (path, chunkId, overlap)

    def _loadChunkedData(self, database, table):
        """
        Load chunked data into mysql table, if one-table option is specified then all chunks
        are loaded into a single table with original name, otherwise we create one table per chunk.
        """

        # As we read from partitioner output files we use "out.csv" option for that.
        csvPrefix = "out.csv"

        for path, chunkId, overlap in self._chunkFiles():

            # remember all chunks that we loaded
            if not overlap:
                self.chunks.add(chunkId)

            if self.oneTable:

                # just load everything into existing table, do not load overlaps
                if not overlap:
                    self._loadOneFile(self.czarWmgr, database, table, path, csvPrefix)
                else:
                    self._log.info('Ignore overlap file %r', path)

            else:

                # Partitioner may potentially produce empty overlap files even
                # in cases when we should not make overlap tables. Check and
                # filter out empty files or complain about non-empty.
                if overlap and not self.partOptions.isSubChunked:
                    # check contents, try to read some data and strip spaces
                    with open(path) as overlapFile:
                        data = overlapFile.read(1024).strip()
                    if data:
                        raise RuntimeError('Found non-empty overlap file for non-subchunked table: ' + path)
                    else:
                        self._log.info('Ignore empty overlap file %r', path)
                        continue

                if self.workerWmgrMap:
                    # find database for this chunk
                    worker = self.chunkMap.worker(chunkId)
                    wmgr = self.workerWmgrMap.get(worker)
                    if wmgr is None:
                        raise RuntimeError('Existing chunk mapping is not in the list of workers: ' + worker)
                    self._log.info('load chunk %r to worker %r', chunkId, worker)
                else:
                    # all goes to single node
                    self._log.info('load chunk %r to czar', chunkId)
                    wmgr = self.czarWmgr

                # make tables if needed
                if chunkId not in self.createdChunks:
                    try:
                        wmgr.createChunk(database, table, chunkId, overlap=self.partOptions.isSubChunked)
                        self.createdChunks.add(chunkId)
                    except ServerError as exc:
                        if exc.code == 409:
                            self._log.info('Chunk %r exists for table %r', chunkId, table)
                        else:
                            self._log.critical('Failed to create chunk %r for table %r', chunkId, table)
                            raise

                # load data into chunk table
                self._loadOneFile(wmgr, database, table, path, csvPrefix, chunkId=chunkId, overlap=overlap)

    @staticmethod
    def _chunkTableName(table, chunkId, overlap):
        """
        Return full chunk table name or overlap table name.
        """
        ctable = table
        if overlap:
            ctable += 'FullOverlap'
        ctable += '_'
        ctable += str(chunkId)
        return ctable

    def _createDummyChunk(self, database, table):
        """
        Make special dummy chunk in case of partitioned data.
        """

        if not self.partitioned or self.oneTable:
            # only do it for true partitioned stuff
            return

        # this is only needed on worker (or czar if there are no workers)
        connections = self._connections(useCzar=False, useWorkers=True)
        if not connections:
            connections = self._connections(useCzar=True, useWorkers=False)

        for name, wmgr in connections:

            self._log.info('Make dummy chunk table for %r', table)

            # just make regular chunk with special ID, do not load any data
            try:
                wmgr.createChunk(database, table, 1234567890, overlap=self.partOptions.isSubChunked)
            except ServerError as exc:
                if exc.code == 409:
                    self._log.info('Dummy chunk 1234567890 exists for table %r', table)
                else:
                    self._log.critical('Failed to create dummy chunk 1234567890 for table %r', table)
                    raise

    def _loadNonChunkedData(self, database, table, files):
        """
        Load non-chunked data into mysql table. We use (unzipped) files that
        we got for input.
        """

        # As we read from input files (which are also input files for partitioner)
        # we use "in.csv" option for that.
        csvPrefix = "in.csv"

        # this is only needed on workers (or czar if there are no workers)
        connections = self._connections(useCzar=False, useWorkers=True)
        if not connections:
            connections = self._connections(useCzar=True, useWorkers=False)

        for name, wmgr in connections:
            self._log.info('load non-chunked data to %r', name)
            for file in files:
                self._loadOneFile(wmgr, database, table, file, csvPrefix)

    def _loadOneFile(self, wmgr, database, table, path, csvPrefix, chunkId=None, overlap=None):
        """Load data from a single file into existing table"""

        self._log.info('load table %r.%r from file %r', database, table, path)

        # need to know special characters used in csv
        # default delimiter is the same as in partitioner
        special_chars = {'delimiter': '\t',
                         'enclose': '',
                         'escape': '\\',
                         'newline': '\n'}

        data = {}
        for name, default in special_chars.items():
            data[name] = self.partOptions.get(csvPrefix + '.' + name, default)

        try:
            file = open(path, "rb")
        except IOError as exc:
            self._log.error('failed to open file %r: %r', path, exc)
            raise

        with file:
            wmgr.loadData(database, table, file, fileName=path, chunkId=chunkId, overlap=overlap,
                          delimiter=data['delimiter'], enclose=data['enclose'],
                          escape=data['escape'], terminate=data['newline'])

    def _updateCss(self, database, table):
        """
        Update CSS with information about loaded table and database.
        """

        # create database in CSS if not there yet
        if not self.css.containsDb(database):
            self._log.info('Creating database CSS info')
            options = self.partOptions.cssDbOptions()
            striping = css.StripingParams(options['nStripes'], options['nSubStripes'],
                                          0, options['overlap'])
            self.css.createDb(database, striping, options['storageClass'], 'RELEASED')

        # define options for table
        options = self.partOptions.cssTableOptions()
        schema = self._schemaForCSS(database, table)

        if options.get('match', False):
            matchParams = css.MatchTableParams(options['dirTable1'], options['dirColName1'],
                                               options['dirTable2'], options['dirColName2'],
                                               options['flagColName'])
            self._log.info('Creating table CSS info for match table')
            self.css.createMatchTable(database, table, schema, matchParams)
        else:
            if 'dirTable' in options:
                # partitioned table
                pParams = css.PartTableParams(options['dirDb'], options['dirTable'], options['dirColName'],
                                              options['latColName'], options['lonColName'],
                                              options['overlap'], True, options['subChunks'])
                sParams = css.ScanTableParams(options['lockInMem'], options['scanRating'])
            else:
                pParams = css.PartTableParams()
                sParams = css.ScanTableParams()

            # optionally keep existing CSS table definition
            if not self.doNotResetCSSTable:
                self.css.createTable(database, table, schema, pParams, sParams)

        # save chunk mapping too
        self._log.info('Saving updated chunk map to CSS')
        self.chunkMap.save()

    def _schemaForCSS(self, database, table):
        """
        Returns schema string for CSS, which is a create table only without
        create table, only column definitions
        """

        schema = self.czarWmgr.tableSchema(database, table)
        # strip CREATE TABLE
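        # e.g. 'CREATE TABLE Object (id BIGINT, ...)' -> '(id BIGINT, ...)'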
        i = schema.find('(')
        return schema[i:]

    def _makeEmptyChunks(self):
        """
        Generate empty chunks file, should be called after loading is complete.
        """

        if not self.emptyChunks:
            # need a file name
            return

        # only makes sense for true partitioned tables
        if not self.partitioned:
            self._log.info('Table is not partitioned, will not make empty chunks file %r', self.emptyChunks)
            return

        # max possible number of chunks
        nStripes = int(self.partOptions['part.num-stripes'])
        maxChunks = 2 * nStripes ** 2
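        # partitioner chunk IDs are assumed to fall in [0, maxChunks)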

        self._log.info('Making empty chunk list (max.chunk=%d) %r', maxChunks, self.emptyChunks)

        emptyChunkDir = os.path.dirname(self.emptyChunks)
        if emptyChunkDir:
            try:
                os.makedirs(emptyChunkDir)
            except OSError:
                if not os.path.isdir(emptyChunkDir):
                    raise

        with open(self.emptyChunks, 'w') as out:
            for chunk in range(maxChunks):
                if chunk not in self.chunks:
                    print(chunk, file=out)

    def _updateEmptyChunks(self):
        """
        Update existing empty chunks file: a chunk stays listed as empty only
        if it was listed before and was not loaded in this session. Should be
        called after loading is complete.
        """

        if not self.emptyChunks:
            # need a file name
            return

        # only makes sense for true partitioned tables
        if not self.partitioned:
            self._log.info('Table is not partitioned, will not update empty chunks file %r', self.emptyChunks)
            return

        # max possible number of chunks
        nStripes = int(self.partOptions['part.num-stripes'])
        maxChunks = 2 * nStripes ** 2

        # read chunks currently marked as empty
        with open(self.emptyChunks, 'r') as in_file:
            existingChunks = set(int(line) for line in in_file if line.strip())

        with open(self.emptyChunks, 'w') as out:
            for chunk in range(maxChunks):
                if chunk not in self.chunks and chunk in existingChunks:
                    print(chunk, file=out)

    def _makeIndex(self, database, table):
        """
        Generate object index in czar meta database.
        """

        # only makes sense for director table
        if not self.partitioned or \
           not self.partOptions.isDirector(database, table) or \
           not self.indexDb:
            self._log.info("*** SES *** : non index")
            return

        metaTable = database + '__' + table
        self._log.info('Generating index %r.%r', self.indexDb, metaTable)

        # deleting existing index table is disabled on purpose, index table is
        # created with IF NOT EXISTS and updated incrementally
        # self.czarWmgr.dropTable(self.indexDb, metaTable, mustExist=False)

        # index column
        idxCol = self.partOptions['id']

        # get index column type from original table
        idxColType = 'BIGINT'
        for col in self.czarWmgr.tableColumns(database, table):
            if col['name'] == idxCol:
                idxColType = col['type']
                break

        # make a table, InnoDB engine is required for scalability
        schema = "CREATE TABLE IF NOT EXISTS {table} ({column} {type} NOT NULL PRIMARY KEY, chunkId INT, subChunkId INT)"
        schema += " ENGINE = INNODB"
        schema = schema.format(table=metaTable, column=idxCol, type=idxColType)
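        # e.g. CREATE TABLE IF NOT EXISTS TESTDB__Object
        #      (objectId BIGINT NOT NULL PRIMARY KEY, chunkId INT, subChunkId INT) ENGINE = INNODB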
        self.czarWmgr.createTable(self.indexDb, metaTable, schema=schema)

        # call one of the two methods
        if self.workerWmgrMap:
            self._makeIndexMultiNode(database, table, metaTable, idxCol)
        else:
            self._makeIndexSingleNode(database, table, metaTable, idxCol)

    def _makeIndexMultiNode(self, database, table, metaTable, idxCol):
        """
        Generate object index in czar meta database in case chunks are on a separate
        server from index database. It needs to copy all index data over network,
        may need special optimization or parameters.
        """

        # load data from all chunks
        for chunk in self.chunks:

            # get worker name for this chunk
            wname = self.chunkMap.worker(chunk)
            wmgr = self.workerWmgrMap[wname]

            self._loadChunkIndex(wmgr, database, table, chunk, metaTable, idxCol)

    def _makeIndexSingleNode(self, database, table, metaTable, idxCol):
        """
        Generate object index in czar meta database in case all chunks are also on czar.
        """

        # TODO: there is for sure more efficient method than copying data locally

        # load data from all chunks
        for chunk in self.chunks:

            self._loadChunkIndex(self.czarWmgr, database, table, chunk, metaTable, idxCol)

    def _loadChunkIndex(self, wmgr, database, table, chunk, metaTable, idxCol):
        """
        Load secondary index with data from a single chunk.
        """

        # get index data from worker (or czar)
        columns = (idxCol, 'chunkId', 'subChunkId')
        indexData = wmgr.getIndex(database, table, chunkId=chunk, columns=columns)

        # dump it into an in-memory file, loadData expects binary mode
        data = BytesIO()
        for row in indexData:
            data.write(b"%d\t%d\t%d\n" % tuple(row))
        data.seek(0)

        # send that file to czar
        self.czarWmgr.loadData(self.indexDb, metaTable, data)
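
For orientation, a minimal driver for the class above might look like this
(hypothetical sketch: wmgr client construction, config file names, and paths
are all assumptions, not part of the listing):

    import logging
    logging.basicConfig(level=logging.INFO)

    # one wmgr client for the czar plus one per worker (helper is hypothetical)
    czarWmgr = makeWmgrClient('czar-host')
    workerMap = {w: makeWmgrClient(w) for w in ('worker1', 'worker2')}

    loader = DataLoader(['common.cfg', 'Object.cfg'],   # partitioning configs
                        czarWmgr, workerMap,
                        chunksDir='/tmp/loader_chunks',
                        css=None)                       # CSS disabled here
    loader.load('TESTDB', 'Object', 'Object.sql', ['Object.csv.gz'])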
Example #8
    def testCss3(self):
        """ Test for saving data to CSS """

        # instantiate kvI with some initial data
        initData = """\
/\t\\N
/css_meta\t\\N
/css_meta/version\t{version}
/DBS\t\\N
/DBS/{db}\t\\N
/DBS/{db}/TABLES\t\\N
/DBS/{db}/TABLES/{table}\t\\N
/DBS/{db}/TABLES/{table}/CHUNKS\t\\N
"""

        workers = ['worker1', 'worker2']
        database = 'TESTDB'
        table = 'TABLE'

        initData = initData.format(version=css.VERSION, db=database, table=table)
        css_inst = _makeCss(initData)

        mapper = ChunkMapping(workers, database, table, css_inst)

        # chunks that are not in CSS should return workers from the list
        worker = mapper.worker(1)
        self.assertEqual(worker, 'worker1')
        worker = mapper.worker(2)
        self.assertEqual(worker, 'worker2')

        # save stuff to CSS
        mapper.save()

        # save all CSS to string
        data = css_inst.getKvI().dumpKV()

        # build another CSS instance from saved data
        css_inst = _makeCss(data)

        # new mapper, use different worker set to avoid confusion
        workers = ['worker1000', 'worker2000']
        mapper = ChunkMapping(workers, database, table, css_inst)

        # get workers for chunks from css
        worker = mapper.worker(1)
        self.assertEqual(worker, 'worker1')
        worker = mapper.worker(2)
        self.assertEqual(worker, 'worker2')
        worker = mapper.worker(3)
        self.assertEqual(worker, 'worker1000')
        worker = mapper.worker(4)
        self.assertEqual(worker, 'worker2000')
Example #9
class DataLoader(object):
    """
    DataLoader class defines all logic for loading data, including data
    partitioning, CSS updating, etc. It is driven by a set of configuration
    files which are passed to constructor.
    """
    def __init__(self,
                 configFiles,
                 czarWmgr,
                 workerWmgrMap={},
                 chunksDir="./loader_chunks",
                 chunkPrefix='chunk',
                 keepChunks=False,
                 skipPart=False,
                 oneTable=False,
                 css=None,
                 cssClear=False,
                 indexDb='qservMeta',
                 tmpDir=None,
                 emptyChunks=None,
                 deleteTables=False,
                 loggerName=None):
        """
        Constructor parses all arguments and prepares for execution.

        @param configFiles:  Sequence of the files defining all partitioning options.
        @param czarWmgr:     WmgrClient instance for czar node.
        @param workerWmgrMap: Dictionary mapping worker host name to corresponding
                             WmgrClient instance. May be empty, in which case czar
                             server will be used for all data.
        @param chunksDir:    Temporary directory to store chunks files, will be created
                             if does not exist.
        @param chunkPrefix:  File name prefix for generated chunk files.
        @param keepChunks:   Chunks will not be deleted if this argument is set to True.
        @param skipPart:     If set to True then partitioning will not be performed
                             (chunks should exist already).
        @param oneTable:     If set to True then load all data into one table, do not
                             create chunk tables.
        @param css:          Instance of CssAccess class, None if CSS operations are disabled.
        @param cssClear:     If true then CSS info for a table will be deleted first.
        @param indexDb:      Name of database for object indices, index is generated for director
                             table when it is partitioned, use empty string to disable index.
        @param tmpDir:       Temporary directory to store uncompressed files. If None then directory
                             inside chunksDir will be used. Will be created if does not exist.
        @param emptyChunks:  Path name for "empty chunks" file, may be None.
        @param deleteTables: If True then existing tables in database will be deleted.
        @param loggerName:   Logger name used for logging all messages from loader.
        """

        if not loggerName:
            loggerName = __name__
        self._log = logging.getLogger(loggerName)

        self.configFiles = configFiles
        self.czarWmgr = czarWmgr
        self.workerWmgrMap = workerWmgrMap.copy()
        self.chunksDir = chunksDir
        self.tmpDir = tmpDir
        self.chunkPrefix = chunkPrefix
        self.keepChunks = keepChunks
        self.skipPart = skipPart
        self.oneTable = oneTable
        self.css = css
        self.cssClear = cssClear
        self.indexDb = None if oneTable else indexDb
        self.emptyChunks = emptyChunks
        self.deleteTables = deleteTables

        self.chunkRe = re.compile('^' + self.chunkPrefix +
                                  '_(?P<id>[0-9]+)(?P<ov>_overlap)?[.]txt$')
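        # matches e.g. 'chunk_1234.txt' and 'chunk_1234_overlap.txt' with the default prefix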
        self.cleanupDirs = []
        self.cleanupFiles = []
        self.unzipDir = None  # directory used for uncompressed data
        self.schema = None  # "CREATE TABLE" statement
        self.chunks = set()  # set of chunks that were loaded
        self.chunkMap = None
        self.createdChunks = set()

        # parse all config files, this can raise an exception
        self.partOptions = PartConfig(configFiles)

        # Logic is slightly complicated here, so pre-calculate options that we need below:
        # 1. If self.skipPart and self.oneTable are both true then we skip partitioning
        #    even for partitioned tables and load original data. So if self.skipPart and
        #    self.oneTable are both true then we say table is not partitioned
        # 2. Partitioning is done only for partitioned table, if self.skipPart is true then
        #    pre-partitioned data must exist already and we skip calling partitioner
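        #
        # Summary of resulting behavior:
        #   partitioned skipPart oneTable | partitioner | data loaded into
        #   True        False    False    | runs        | per-chunk tables
        #   True        False    True     | runs        | one table (chunk files)
        #   True        True     False    | skipped     | per-chunk tables (existing chunks)
        #   True        True     True     | skipped     | one table (original input files)
        #   False       -        -        | skipped     | one plain table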

        # is table partitioned (or pre-partitioned)?
        self.partitioned = self.partOptions.partitioned

        # do we need to run partitioner?
        self.callPartitioner = self.partitioned and not self.skipPart

    def load(self, database, table, schema, data):
        """
        Do actual loading based on parameters defined in constructor.
        This will throw exception if anything goes wrong.

        @param database:     Database name.
        @param table:        Table name.
        @param schema:       File name which contains SQL with CREATE TABLE/VIEW.
        @param data:         List of file names with data, may be empty (e.g. when
                             defining views instead of tables).
        """

        if not _mysql_identifier_validator(table):
            raise ValueError('MySQL table name not allowed: ' + table)
        if not _mysql_identifier_validator(database):
            raise ValueError('MySQL database name not allowed: ' + database)

        try:
            return self._run(database, table, schema, data)
        finally:
            self._cleanup()

    def _run(self, database, table, schema, data):
        """
        Do loading only, cleanup is done in _cleanup()
        """

        # see if database is already defined in CSS and get its partitioning info
        if self.css is not None:
            self._checkCss(database, table)

        # make chunk mapper
        self.chunkMap = ChunkMapping(list(self.workerWmgrMap.keys()), database,
                                     table, self.css)

        # make chunks directory or check that there are usable data there already
        self._makeOrCheckChunksDir(data)

        # uncompress data files that are compressed, this is only needed if
        # table is not partitioned or if we are not reusing existing chunks
        files = data
        if not (self.partitioned and self.skipPart and not self.oneTable):
            files = self._gunzip(data)

        # run partitioner if necessary
        if files and self.callPartitioner:
            self._runPartitioner(files)

        # drop existing tables
        if self.deleteTables:
            self._deleteTable(database, table)

        # create table
        self._createTable(database, table, schema)

        # load data
        self._loadData(database, table, files)

        # create special dummy chunk
        self._createDummyChunk(database, table)

        # create index on czar side
        self._makeIndex(database, table)

        # update CSS with info for this table
        if self.css is not None:
            self._updateCss(database, table)

        # optionally make emptyChunks file
        self._makeEmptyChunks()

    def _cleanup(self):
        """
        Do cleanup, remove all temporary files, this should not throw.
        """

        # remove tmp files
        for fName in self.cleanupFiles:
            try:
                self._log.debug('Deleting temporary file: %r', fName)
                os.unlink(fName)
            except Exception as exc:
                self._log.error('Failed to remove temporary file: %r', exc)

        # remove directories
        for dirName in self.cleanupDirs:
            try:
                self._log.debug('Deleting directory: %r', dirName)
                shutil.rmtree(dirName)
            except Exception as exc:
                self._log.error('Failed to remove directory: %r', exc)

    def _checkCss(self, database, table):
        """
        Check CSS for existing configuration and see if it matches ours.
        Throws exception if any irregularities are observed.
        """

        self._log.info('Verifying CSS info for table %r', table)

        # get striping info
        try:
            striping = self.css.getDbStriping(database)
            self._log.debug('CSS database striping info: %r', striping)
        except css.NoSuchDb:
            # we'll create it later
            return

        # check parameters
        self._checkPartParam(self.partOptions, 'part.num-stripes',
                             striping.stripes, int)
        self._checkPartParam(self.partOptions, 'part.num-sub-stripes',
                             striping.subStripes, int)
        self._checkPartParam(self.partOptions, 'part.default-overlap',
                             striping.overlap, float)

        # also check that table does not exist in CSS, or optionally remove it
        cssTableExists = self.css.containsTable(database, table)
        if cssTableExists:
            if self.cssClear:
                # try to remove it
                self.css.dropTable(database, table)
            else:
                self._log.error('Table is already defined in CSS')
                raise RuntimeError('table exists in CSS')

    @staticmethod
    def _checkPartParam(partOptions, partKey, cssValue, optType=str):
        """
        Check that partitioning parameters are compatible. Throws exception
        if there is a mismatch.
        """
        optValue = optType(partOptions[partKey])
        if optValue != cssValue:
            raise ValueError('Option %r does not match CSS: %r != %r' %
                             (partKey, optValue, cssValue))

    def _makeOrCheckChunksDir(self, data):
        '''Create directory for chunk data or check that it exists, throws in case of errors.'''

        # only need it for partitioned table
        if not self.partitioned:
            return

        # in case we do skip-part but load into one table then we just take
        # data from command line if it is specified
        if self.oneTable and self.skipPart and data:
            return

        chunks_dir = self.chunksDir

        # if it exists it must be directory
        exists = False
        if os.path.exists(chunks_dir):
            exists = True
            if not os.path.isdir(chunks_dir):
                self._log.error(
                    'Path for chunks exists but is not a directory: %r',
                    chunks_dir)
                raise IOError('chunk path is not directory: ' + chunks_dir)

        if self.skipPart:
            # directory must exist and have some files (chunk_index.bin at least)
            if not exists:
                self._log.error('Chunks directory does not exist: %r',
                                chunks_dir)
                raise RuntimeError('chunk directory is missing')
            path = os.path.join(chunks_dir, 'chunk_index.bin')
            if not os.path.exists(path):
                self._log.error(
                    'Could not find required file (chunk_index.bin) in chunks directory'
                )
                raise RuntimeError('chunk_index.bin is missing')
        else:
            if exists:
                # must be empty, we do not want any extraneous stuff there
                if os.listdir(chunks_dir):
                    self._log.error('Chunks directory is not empty: %r',
                                    chunks_dir)
                    raise RuntimeError('chunks directory is not empty: ' +
                                       chunks_dir)
            else:
                try:
                    self._log.debug('Creating chunks directory %r', chunks_dir)
                    os.makedirs(chunks_dir)
                    # will remove it later
                    if not self.keepChunks:
                        self.cleanupDirs.append(chunks_dir)
                except Exception as exc:
                    self._log.error('Failed to create chunks directory: %r',
                                    exc)
                    raise

    def _runPartitioner(self, files):
        '''Run partitioner to fill chunks directory with data, returns 0 on success.'''
        def fileList(dirName):
            '''Generate a sequence of file names in directory, exclude directories'''
            for fName in os.listdir(dirName):
                path = os.path.join(dirName, fName)
                if os.path.isfile(path):
                    yield path

        # build arguments list
        partitioner = 'sph-partition-matches' if self.partOptions.isRefMatch else 'sph-partition'
        args = [
            partitioner, '--out.dir', self.chunksDir, '--part.prefix',
            self.chunkPrefix
        ]
        for config in self.configFiles:
            args += ['--config-file', config]
        for data in files:
            args += ['--in', data]

        try:
            # run partitioner
            self._log.info('run partitioner on files: %r', ' '.join(files))
            self._log.debug('Run shell command: %r', ' '.join(args))
            subprocess.check_output(args=args)
        except Exception as exc:
            self._log.error('Failed to run partitioner: %r', exc)
            raise
        finally:
            # some chunk files may have been created, add them to cleanup list
            if not self.keepChunks:
                self.cleanupFiles += list(fileList(self.chunksDir))

    def _gunzip(self, data):
        """
        Uncompress compressed input files to a temporary directory.
        Returns the list of input file names, with compressed files replaced
        by their uncompressed locations. Throws exception in case of errors.
        """

        result = []
        for infile in data:

            # we rely on the file extension to decide whether a file is
            # compressed; a more reliable way would be something like the
            # "magic" module
            if infile.endswith('.gz'):

                if self.tmpDir is None:

                    # use the chunks directory for the temporary files
                    if os.path.exists(self.chunksDir):
                        if not os.path.isdir(self.chunksDir):
                            self._log.error(
                                'Path for chunks is not a directory: %r',
                                self.chunksDir)
                            raise IOError('chunk path is not directory: ' +
                                          self.chunksDir)
                    else:
                        # create it, but don't forget to delete it later
                        self._log.debug('Creating chunks directory %r',
                                        self.chunksDir)
                        os.makedirs(self.chunksDir)
                        if not self.keepChunks:
                            self.cleanupDirs.append(self.chunksDir)

                    try:
                        self.tmpDir = tempfile.mkdtemp(dir=self.chunksDir)
                        # need to remove it later, before chunks dir
                        self.cleanupDirs.insert(0, self.tmpDir)
                    except Exception as exc:
                        self._log.critical(
                            'Failed to create temp directory for uncompressed files: %r',
                            exc)
                        raise
                    self._log.debug('Created temporary directory %r',
                                    self.tmpDir)
                else:
                    # check and create if needed
                    if os.path.exists(self.tmpDir):
                        if not os.path.isdir(self.tmpDir):
                            self._log.critical(
                                'Temporary location is not a directory: %r',
                                self.tmpDir)
                            raise IOError(
                                'Temporary location is not a directory: ' +
                                self.tmpDir)
                    else:
                        try:
                            os.mkdir(self.tmpDir)
                            self._log.debug('Created temporary directory %r',
                                            self.tmpDir)
                            # need to remove it later
                            self.cleanupDirs.append(self.tmpDir)
                        except Exception as exc:
                            self._log.critical(
                                'Failed to create temp directory: %r', exc)
                            raise

                # construct output file name
                outfile = os.path.basename(infile)
                outfile = os.path.splitext(outfile)[0]
                outfile = os.path.join(self.tmpDir, outfile)
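                # e.g. /some/path/Object.csv.gz -> <tmpDir>/Object.csv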

                # will clean it up later
                self.cleanupFiles.append(outfile)

                self._log.info('Uncompressing %r to %r', infile, outfile)
                try:
                    # gzip works on raw bytes; open both files in binary mode
                    # and use "with" so they are closed even on error
                    with open(infile, 'rb') as finput, \
                            open(outfile, 'wb') as foutput:
                        cmd = ['gzip', '-d', '-c']
                        subprocess.check_call(args=cmd,
                                              stdin=finput,
                                              stdout=foutput)
                except Exception as exc:
                    self._log.critical('Failed to uncompress data file: %r',
                                       exc)
                    raise

            else:

                # file is already uncompressed
                self._log.debug('Using input file which is not compressed: %r',
                                infile)
                outfile = infile

            result.append(outfile)

        return result

    def _connections(self, useCzar, useWorkers):
        """
        Returns a list of wmgr "connections"; for each connection there is a
        tuple (name, connection) where name is something like "czar" or
        "worker lsst-dbdev2". If the czar connection is included it is
        always first in the list.

        @param useCzar:     if True then include czar in the list
        @param useWorkers:  if True then include all workers in the list
        """
        res = []
        if useCzar:
            res += [("czar", self.czarWmgr)]
        if useWorkers:
            for worker, wmgr in self.workerWmgrMap.items():
                res += [('worker ' + worker, wmgr)]
        return res

    def _deleteTable(self, database, table):
        """
        Drop existing table and all chunks.
        """

        self._log.info('Deleting existing table %r (and chunks)', table)

        for name, wmgr in self._connections(useCzar=True, useWorkers=True):
            self._log.info('Deleting table from %r', name)
            wmgr.dropTable(database, table, dropChunks=True, mustExist=False)

    def _createTable(self, database, table, schema):
        """
        Create table in the database. Executes whatever SQL was given to us
        in the schema file, and additionally applies fixes to the schema
        after loading.
        """

        # read table schema
        try:
            with open(schema) as schemaFile:
                self.schema = schemaFile.read()
        except Exception as exc:
            self._log.critical('Failed to read table schema file: %r', exc)
            raise

        # create table on czar and every worker
        for name, wmgr in self._connections(useCzar=True, useWorkers=True):
            self._log.info('Creating table %r in %r', table, name)
            chunkColumns = bool(self.partitioned)
            try:
                wmgr.createTable(database,
                                 table,
                                 schema=self.schema,
                                 chunkColumns=chunkColumns)
            except ServerError as exc:
                if exc.code == 409:
                    self._log.info('Table %r exists in %r', table, name)
                else:
                    self._log.critical('Failed to create table %r in %r',
                                       table, name)
                    raise

    def _loadData(self, database, table, files):
        """
        Load data into existing table.
        """
        if not self.partitioned or (self.oneTable and self.skipPart and files):
            # load files given on command line
            self._loadNonChunkedData(database, table, files)
        else:
            # load data from chunk directory
            self._loadChunkedData(database, table)

    def _chunkFiles(self):
        """
        Generator which yields all chunk files. For each chunk file yields a
        triplet (path: str, chunkId: int, overlap: bool).
        """
        for dirpath, _, filenames in os.walk(self.chunksDir, followlinks=True):
            for fileName in filenames:
                match = self.chunkRe.match(fileName)
                if match is not None:
                    path = os.path.join(dirpath, fileName)
                    chunkId = int(match.group('id'))
                    overlap = match.group('ov') is not None
                    yield (path, chunkId, overlap)

    def _loadChunkedData(self, database, table):
        """
        Load chunked data into a MySQL table. If the one-table option is
        specified then all chunks are loaded into a single table with the
        original name; otherwise one table per chunk is created.
        """

        # we read partitioner output files, so use the "out.csv" option prefix
        csvPrefix = "out.csv"

        for path, chunkId, overlap in self._chunkFiles():

            # remember all chunks that we loaded
            if not overlap:
                self.chunks.add(chunkId)

            if self.oneTable:

                # just load everything into existing table, do not load overlaps
                if not overlap:
                    self._loadOneFile(self.czarWmgr, database, table, path,
                                      csvPrefix)
                else:
                    self._log.info('Ignore overlap file %r', path)

            else:

                # The partitioner may produce empty overlap files even when
                # we should not make overlap tables. Filter out empty files
                # and complain about non-empty ones.
                if overlap and not self.partOptions.isSubChunked:
                    # check contents: read some data and strip whitespace
                    with open(path) as overlapFile:
                        data = overlapFile.read(1024).strip()
                    if data:
                        raise RuntimeError(
                            'Found non-empty overlap file for non-subchunked table: '
                            + path)
                    else:
                        self._log.info('Ignore empty overlap file %r', path)
                        continue

                if self.workerWmgrMap:
                    # find database for this chunk
                    worker = self.chunkMap.worker(chunkId)
                    wmgr = self.workerWmgrMap.get(worker)
                    if wmgr is None:
                        raise RuntimeError(
                            'Chunk mapping refers to a worker that is not in '
                            'the worker list: ' + worker)
                    self._log.info('load chunk %r to worker %r', chunkId,
                                   worker)
                else:
                    # all goes to single node
                    self._log.info('load chunk %r to czar', chunkId)
                    wmgr = self.czarWmgr

                # make tables if needed
                if chunkId not in self.createdChunks:
                    try:
                        wmgr.createChunk(database,
                                         table,
                                         chunkId,
                                         overlap=self.partOptions.isSubChunked)
                        self.createdChunks.add(chunkId)
                    except ServerError as exc:
                        if exc.code == 409:
                            self._log.info('Chunk %r exists for table %r',
                                           chunkId, table)
                        else:
                            self._log.critical(
                                'Failed to create chunk %r for table %r',
                                chunkId, table)
                            raise

                # load data into chunk table
                self._loadOneFile(wmgr,
                                  database,
                                  table,
                                  path,
                                  csvPrefix,
                                  chunkId=chunkId,
                                  overlap=overlap)

    @staticmethod
    def _chunkTableName(table, chunkId, overlap):
        """
        Return full chunk table name or overlap table name.
        """
        ctable = table
        if overlap:
            ctable += 'FullOverlap'
        ctable += '_'
        ctable += str(chunkId)
        return ctable

    def _createDummyChunk(self, database, table):
        """
        Make special dummy chunk in case of partitioned data.
        """

        if not self.partitioned or self.oneTable:
            # only needed for truly partitioned tables
            return

        # this is only needed on worker (or czar if there are no workers)
        connections = self._connections(useCzar=False, useWorkers=True)
        if not connections:
            connections = self._connections(useCzar=True, useWorkers=False)

        for name, wmgr in connections:

            self._log.info('Make dummy chunk table for %r', table)

            # just make regular chunk with special ID, do not load any data
            try:
                wmgr.createChunk(database,
                                 table,
                                 1234567890,
                                 overlap=self.partOptions.isSubChunked)
            except ServerError as exc:
                if exc.code == 409:
                    self._log.info(
                        'Dummy chunk 1234567890 exists for table %r', table)
                else:
                    self._log.critical(
                        'Failed to create dummy chunk 1234567890 for table %r',
                        table)
                    raise

    def _loadNonChunkedData(self, database, table, files):
        """
        Load non-chunked data into a MySQL table, using the (uncompressed)
        input files we were given.
        """

        # we read the original input files (the same files that feed the
        # partitioner), so use the "in.csv" option prefix
        csvPrefix = "in.csv"

        # this is only needed on workers (or czar if there are no workers)
        connections = self._connections(useCzar=False, useWorkers=True)
        if not connections:
            connections = self._connections(useCzar=True, useWorkers=False)

        for name, wmgr in connections:
            self._log.info('load non-chunked data to %r', name)
            for file in files:
                self._loadOneFile(wmgr, database, table, file, csvPrefix)

    def _loadOneFile(self,
                     wmgr,
                     database,
                     table,
                     path,
                     csvPrefix,
                     chunkId=None,
                     overlap=None):
        """Load data from a single file into existing table"""

        self._log.info('load table %r.%r from file %r', database, table, path)

        # need to know the special characters used in the CSV files;
        # the default delimiter is the same as the partitioner's
        special_chars = {
            'delimiter': '\t',
            'enclose': '',
            'escape': '\\',
            'newline': '\n'
        }

        data = {}
        for name, default in special_chars.items():
            data[name] = self.partOptions.get(csvPrefix + '.' + name, default)
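        # e.g. for csvPrefix='out.csv' this looks up the partitioner options
        # 'out.csv.delimiter', 'out.csv.enclose', 'out.csv.escape' and
        # 'out.csv.newline', falling back to the defaults above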

        try:
            dataFile = open(path, 'rb')
        except IOError as exc:
            self._log.error('Failed to open file %r: %r', path, exc)
            raise

        # "with" makes sure the file is closed even if loadData fails
        with dataFile:
            wmgr.loadData(database,
                          table,
                          dataFile,
                          fileName=path,
                          chunkId=chunkId,
                          overlap=overlap,
                          delimiter=data['delimiter'],
                          enclose=data['enclose'],
                          escape=data['escape'],
                          terminate=data['newline'])

    def _updateCss(self, database, table):
        """
        Update CSS with information about loaded table and database.
        """

        # create database in CSS if not there yet
        if not self.css.containsDb(database):
            self._log.info('Creating database CSS info')
            options = self.partOptions.cssDbOptions()
            striping = css.StripingParams(options['nStripes'],
                                          options['nSubStripes'], 0,
                                          options['overlap'])
            self.css.createDb(database, striping, options['storageClass'],
                              'RELEASED')

        # define options for table
        options = self.partOptions.cssTableOptions()
        schema = self._schemaForCSS(database, table)

        if options.get('match', False):
            matchParams = css.MatchTableParams(options['dirTable1'],
                                               options['dirColName1'],
                                               options['dirTable2'],
                                               options['dirColName2'],
                                               options['flagColName'])
            self._log.info('Creating table CSS info for match table')
            self.css.createMatchTable(database, table, schema, matchParams)
        else:
            if 'dirTable' in options:
                # partitioned table
                pParams = css.PartTableParams(
                    options['dirDb'], options['dirTable'],
                    options['dirColName'], options['latColName'],
                    options['lonColName'], options['overlap'], True,
                    options['subChunks'])
                sParams = css.ScanTableParams(options['lockInMem'],
                                              options['scanRating'])
            else:
                pParams = css.PartTableParams()
                sParams = css.ScanTableParams()
            self.css.createTable(database, table, schema, pParams, sParams)

        # save chunk mapping too
        self._log.info('Saving updated chunk map to CSS')
        self.chunkMap.save()

    def _schemaForCSS(self, database, table):
        """
        Returns the schema string for CSS: only the column definitions, with
        the leading CREATE TABLE clause stripped.
        """

        schema = self.czarWmgr.tableSchema(database, table)
        # strip CREATE TABLE
        i = schema.find('(')
        return schema[i:]

    def _makeEmptyChunks(self):
        """
        Generate empty chunks file, should be called after loading is complete.
        """

        if not self.emptyChunks:
            # no file name given, nothing to do
            return

        # only makes sense for true partitioned tables
        if not self.partitioned:
            self._log.info(
                'Table is not partitioned, will not make empty chunks file %r',
                self.emptyChunks)
            return

        # max possible number of chunks
        nStripes = int(self.partOptions['part.num-stripes'])
        maxChunks = 2 * nStripes**2
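        # e.g. with part.num-stripes = 85 (hypothetical) this gives
        # 2 * 85**2 = 14450 possible chunk IDs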

        self._log.info('Making empty chunk list (max.chunk=%d) %r', maxChunks,
                       self.emptyChunks)

        emptyChunkDir = os.path.dirname(self.emptyChunks)
        try:
            os.makedirs(emptyChunkDir)
        except OSError:
            if not os.path.isdir(emptyChunkDir):
                raise

        with open(self.emptyChunks, 'w') as out:
            for chunk in range(maxChunks):
                if chunk not in self.chunks:
                    print(chunk, file=out)

    def _makeIndex(self, database, table):
        """
        Generate object index in czar meta database.
        """

        # only makes sense for director table
        if (not self.partitioned
                or not self.partOptions.isDirector(database, table)
                or not self.indexDb):
            return

        metaTable = database + '__' + table
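        # e.g. database 'LSST' and table 'Object' (hypothetical names) give
        # the index table name 'LSST__Object'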
        self._log.info('Generating index %r.%r', self.indexDb, metaTable)

        # dropping any existing index table first is currently disabled
        # self.czarWmgr.dropTable(self.indexDb, metaTable, mustExist=False)

        # index column
        idxCol = self.partOptions['id']

        # get index column type from original table
        idxColType = 'BIGINT'
        for col in self.czarWmgr.tableColumns(database, table):
            if col['name'] == idxCol:
                idxColType = col['type']
                break

        # make a table, InnoDB engine is required for scalability
        schema = "CREATE TABLE IF NOT EXISTS {table} ({column} {type} NOT NULL PRIMARY KEY, chunkId INT, subChunkId INT)"
        schema += " ENGINE = INNODB"
        schema = schema.format(table=metaTable, column=idxCol, type=idxColType)
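        # e.g. for a BIGINT 'objectId' column this expands to (hypothetical):
        #   CREATE TABLE IF NOT EXISTS LSST__Object (objectId BIGINT NOT NULL
        #   PRIMARY KEY, chunkId INT, subChunkId INT) ENGINE = INNODB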
        self.czarWmgr.createTable(self.indexDb, metaTable, schema=schema)

        # multi-node and single-node setups need different index generation
        if self.workerWmgrMap:
            self._makeIndexMultiNode(database, table, metaTable, idxCol)
        else:
            self._makeIndexSingleNode(database, table, metaTable, idxCol)

    def _makeIndexMultiNode(self, database, table, metaTable, idxCol):
        """
        Generate object index in czar meta database for the case where chunks
        live on a separate server from the index database. This copies all
        index data over the network and may need special optimization or
        parameters.
        """

        # load data from all chunks
        for chunk in self.chunks:

            # get worker name for this chunk
            wname = self.chunkMap.worker(chunk)
            wmgr = self.workerWmgrMap[wname]

            self._loadChunkIndex(wmgr, database, table, chunk, metaTable,
                                 idxCol)

    def _makeIndexSingleNode(self, database, table, metaTable, idxCol):
        """
        Generate object index in czar meta database in case all chunks are also on czar.
        """

        # TODO: there is surely a more efficient method than copying data locally

        # load data from all chunks
        for chunk in self.chunks:

            self._loadChunkIndex(self.czarWmgr, database, table, chunk,
                                 metaTable, idxCol)

    def _loadChunkIndex(self, wmgr, database, table, chunk, metaTable, idxCol):
        """
        Load secondary index with data from a single chunk.
        """

        # get index data from worker (or czar)
        columns = (idxCol, 'chunkId', 'subChunkId')
        indexData = wmgr.getIndex(database,
                                  table,
                                  chunkId=chunk,
                                  columns=columns)

        # dump it into an in-memory file; loadData expects binary mode
        data = BytesIO()
        for row in indexData:
            data.write(b"%d\t%d\t%d\n" % tuple(row))
        data.seek(0)
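        # the stream now holds tab-separated rows, e.g. b'12345\t195\t7\n'
        # (hypothetical id, chunkId and subChunkId values)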

        # send that file to czar
        self.czarWmgr.loadData(self.indexDb, metaTable, data)