import os
import sys
import time

# HBaseHive, schema, map_hbase, milestones, in_between, diff_datasets,
# KEYDIGITS and __compressed_historize__ are assumed to be provided by
# companion modules of this project.

class HBaseParse:

    _create_tbl = None
    _values_tbl = None
    _connection = None  # bare connection, no reference to the namespace
    _hbase_time = 0

    def __init__(self, connection, create_tbl, values_tbl):
        self._connection = connection
        self._create_tbl = create_tbl
        self._values_tbl = values_tbl
        self._hive = HBaseHive()

    def __del__(self):
        self._connection.close()

    def create_stmt(self, sqldump_key, tbl_name):
        row = self._create_tbl.row(sqldump_key, ["tables:"+tbl_name])
        return row["tables:"+tbl_name]

    def view_stmt(self, sqldump_key, view_name):
        row = self._create_tbl.row(sqldump_key, ["views:"+view_name])
        return row["views:"+view_name]

    def inserts(self, sqldump_key, tbl_name):
        row = self._create_tbl.row(sqldump_key, ["hashes:"+tbl_name])
        if "hashes:"+tbl_name not in row:
            print >> sys.stderr, " - MISSING DATA",
            return []
        return eval(row["hashes:"+tbl_name])

    def all_inserts(self, tbl_name):
        s = self._create_tbl.scan(columns=["hashes:"+tbl_name])
        return sorted([ (k, eval(v["hashes:"+tbl_name])) for k,v in s])

    def get_tables(self, sqldump_key):
        row = self._create_tbl.row(sqldump_key, ["tables"])
        # by fetching only the list of CREATE statements here, a later
        # access to the corresponding HASHES should raise if anything is
        # inconsistent -- which serves as a rough consistency check
        return sorted([k.split(":")[1] for k in row.keys()])

    def get_views(self, sqldump_key):
        row = self._create_tbl.row(sqldump_key, ["views"])
        return sorted([k.split(":")[1] for k in row.keys()])

    def values(self, insert_key, tbl_name):
        row = self._values_tbl.row(insert_key, ["values:"+tbl_name])
        return eval("["+row["values:"+tbl_name]+"]") # a list of tuples

    def desired_tables(self, sqldump_key, include):
        for tbl_name in self.get_tables(sqldump_key):
            if tbl_name in include:
                self.ingest(sqldump_key, tbl_name)

    def all_except_some(self, sqldump_key, exclude):
        for tbl_name in self.get_tables(sqldump_key):
            if tbl_name not in exclude:
                self.ingest(sqldump_key, tbl_name)

    def ingest(self, sqldump_key, tbl_name):
        tbl_def = schema(self.create_stmt(sqldump_key, tbl_name))

        self._hive.create_hive(self._connection.table_prefix, tbl_name, tbl_def)
        if os.environ.get('ONLYHIVE') is not None: return

        if tbl_name not in self._connection.tables():
            print "OVER-WRITING>", tbl_name, sqldump_key; sys.stdout.flush()
            self.drop_create(tbl_name)  # FIXME! we need to handle this differently

        if not self.has_data(tbl_name):
            # there is no data yet* => full RE-INGEST, due to HBASE-5241
            # * we are only looking into the past here; there might be
            # data if we look ahead (at a more recent timestamp),
            # but we want to PRESERVE those data...
            # an alternative would be to WIPE the table
            self.bulk_intake(sqldump_key, tbl_name, tbl_def)

        else: self.incremental_intake(sqldump_key, tbl_name, tbl_def)

    def bulk_intake(self, sqldump_key, tbl_name, tbl_def):
        print " --- each . (dot) might be a 16Mb chunk of data",
        print "<", "%s[%s]" % (tbl_name, sqldump_key)

        for mysql_insert_md5 in self.inserts(sqldump_key, tbl_name):
            print ".",
            sys.stdout.flush()
            b = self._connection.table(tbl_name).batch(timestamp=int(sqldump_key))
            mysql_insert_data = self.values(mysql_insert_md5, tbl_name)
            data = [map_hbase(tbl_def, row) for row in mysql_insert_data]
            self.batch(b, data)
        print ""

    def batch(self, hbase_batch, data, deletes={}):
        for row_key, columns in data:
            hbase_batch.put(row_key.zfill(KEYDIGITS), columns)

        # here are separate batches for the deletes
        #for timestamp_, list_ in deletes.iteritems():
        #    stone_batch = self._connection.table(
        #        hbase_batch.table.name, use_prefix=False
        #            ).batch(timestamp=int(timestamp_))
        #    for row in list_:
        #        row_key, columns_ = row
        #        stone_batch.delete(row_key, columns=columns_)
        #    if os.environ.get('NOSEND') is None: stone_batch.send()
        # FIXME! disabled until HBASE-5241 gets fixed

        # when moving data towards the past, we'd better delete first, then write
        if os.environ.get('NOSEND') is not None: print "- NOT sent! -" ; return
        else: hbase_batch.send()

    def incremental_intake(self, sqldump_key, tbl_name, tbl_def):
        stones = milestones(self.all_inserts(tbl_name))
        timestamps = [ stone[0] for stone in stones ]

        i = timestamps.index(sqldump_key)
        prev_, next_ = in_between(timestamps, i)
        print tbl_name, (i, len(timestamps)-1), (prev_, next_), stones[i]
        sys.stdout.flush()  # weird, on Jenkins stdout gets buffered

        # k, let's operate...
        table = self._connection.table(tbl_name)

        for mysql_insert_md5 in stones[i][1]:
            mysql_insert_data = self.values(mysql_insert_md5, tbl_name)
            data = [map_hbase(tbl_def, row) for row in mysql_insert_data]
            id_first, id_last = self.id_tuples(data)
            hbase_time = time.time()

            scan_prev = table.scan(row_start=str(id_first).zfill(KEYDIGITS),
                                   row_stop=str(id_last+1).zfill(KEYDIGITS),
                                   timestamp=int(prev_)+1,
                                   include_timestamp=True)
            data_prev = [(k,v) for k,v in scan_prev]

            scan_next = table.scan(row_start=str(id_first).zfill(KEYDIGITS),
                                   row_stop=str(id_last+1).zfill(KEYDIGITS),
                                   timestamp=int(next_)+1,
                                   include_timestamp=True)
            data_next = [(k,v) for k,v in scan_next]

            delta_time = int(time.time() - hbase_time)
            self._hbase_time += delta_time
            if os.environ.get('DEBUG') is not None:
                print >> sys.stderr, "\t - spent at HBASE (delta, overall) seconds:",
                print >> sys.stderr, (delta_time, self._hbase_time)
            data_diff, deletes = diff_datasets((sqldump_key,data),
                                               (prev_, data_prev),
                                               (next_, data_next))
            b = self._connection.table(tbl_name).batch(timestamp=int(sqldump_key))
            self.batch(b, data_diff, deletes)

            # weird, on Jenkins stdout gets buffered
            sys.stdout.flush()
            sys.stderr.flush()

    def id_tuples(self, data):
        if len(data) == 0: raise RuntimeError("NO DATA???")
        sorted_ids = tuple(sorted(int(d[0]) for d in data))
        return (sorted_ids[0], sorted_ids[-1])

    def has_data(self, tbl_name):
        data_scanner = self._connection.table(tbl_name).scan(
            filter="KeyOnlyFilter()",
            include_timestamp=True,
            limit=1)
        data = [(k,v) for k,v in data_scanner]
        if len(data) != 0: return True
        return False

    def drop_create(self, tbl_name):
        try: self.create(tbl_name)
        except Exception:
            if os.environ.get('DEBUG') is not None:
                print >> sys.stderr, " - DROP/CREATE",
            self.drop(tbl_name)
            self.create(tbl_name)

    def create(self, tbl_name):
        self._connection.create_table(tbl_name,
            {'mysql': __compressed_historize__}
        )

    def drop(self, tbl_name):
        try: self._connection.disable_table(tbl_name)
        except Exception: print tbl_name, "... already disabled, deleting"
        finally: self._connection.delete_table(tbl_name)
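
# The sketch below is a hypothetical usage example, not part of the original
# code: it only illustrates how HBaseParse might be wired up with happybase.
# The host, the table prefix, the 'sqldump_create' / 'sqldump_values' table
# names and the '1400000000' dump key are placeholders, and the companion
# modules (HBaseHive, schema, map_hbase, ...) must be importable.
if __name__ == "__main__":
    import happybase

    connection = happybase.Connection('localhost', table_prefix='myapp')
    parser = HBaseParse(connection,
                        connection.table('sqldump_create', use_prefix=False),
                        connection.table('sqldump_values', use_prefix=False))
    # ingest only the listed tables from the dump identified by this key
    parser.desired_tables('1400000000', include=['users', 'orders'])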