class MySQLDump:
    """State machine that reads a mysqldump stream line by line.

    Every input line is wrapped in a ``MySQLRow``. Completed CREATE
    statements, views and INSERT rows are forwarded to an ``HBaseIntake``
    instance, while an MD5 digest and a line counter are maintained over the
    whole stream. Constructing the object performs the entire import:
    connect, parse, record the statistics and commit.
    """

    _md5 = None  # md5 object associated to the mysql input stream
    _row_counter = 0  # number of input lines read so far
    _forced_timestamp = ""  # "%Y-%m-%d" override; empty string means unset
    # Timestamp taken from the last parsed row. Defaulting to None lets
    # timestamp() raise its RuntimeError instead of an AttributeError when
    # nothing has been extracted yet.
    _timestamp = None

    def __init__(self, f_descriptor, namespace, skiptables=None, forced_timestamp=""):
        """Parse ``f_descriptor`` and push its content to HBase.

        Args:
            f_descriptor: iterable yielding the lines of the dump.
            namespace: value handed to ``HBaseIntake`` and kept on the
                instance.
            skiptables: table names whose CREATE and INSERT statements are
                not forwarded. ``None`` (the default) skips nothing.
            forced_timestamp: optional ``"%Y-%m-%d"`` date that overrides the
                timestamp found in the dump.
        """
        # A None sentinel avoids sharing one mutable default list between
        # every instance created without an explicit skip list.
        self._skip_tables = [] if skiptables is None else skiptables
        self._forced_timestamp = forced_timestamp
        self._hbase = HBaseIntake(namespace)
        self._hbase.connect()
        self._md5 = hashlib.md5()
        self._namespace = namespace

        start_time = time.time()
        self.run(f_descriptor)

        self._hbase.set_row_count(self._row_counter)
        self._hbase.set_md5(self._md5.hexdigest())
        self._hbase.set_parse_time(time.time() - start_time)
        self._hbase.commit(self)

    def __str__(self):
        """Return the stream digest and the number of lines read."""
        return "MD5> " + self._md5.hexdigest() + "\nROWs> " + str(self._row_counter)

    @staticmethod
    def _debug(*parts):
        """Write ``parts`` to stderr, space separated, as a single line."""
        sys.stderr.write(" ".join(str(part) for part in parts) + "\n")

    def timestamp(self):
        """Return the timestamp associated with this dump.

        When a forced timestamp was supplied it is parsed as ``"%Y-%m-%d"``
        and converted to epoch seconds (local time), returned as a string.
        Otherwise the value stored by ``run()`` is returned unchanged.

        Raises:
            RuntimeError: if no timestamp is available (``None`` or ``0``).
            ValueError: if the forced timestamp does not match
                ``"%Y-%m-%d"`` (raised by ``datetime.strptime``).
        """
        if self._forced_timestamp != "":
            forced_day = datetime.strptime(self._forced_timestamp, "%Y-%m-%d")
            # time.mktime is the portable spelling of the platform-specific
            # strftime("%s"): local-time epoch seconds, kept as a string.
            self._timestamp = str(int(time.mktime(forced_day.timetuple())))
        if self._timestamp is None or self._timestamp == 0:
            raise RuntimeError("we've not extracted the timestamp of the mysqldump")
        return self._timestamp

    def run(self, f_descriptor):
        """Consume every line of ``f_descriptor`` and dispatch it to HBase.

        Each line increments the row counter and feeds the MD5 digest. After
        the loop, the timestamp of the last row handled is stored in
        ``self._timestamp``. Setting the ``DEBUG`` environment variable (to
        any value) enables tracing on stderr.
        """
        # Read once per run rather than several times per input line.
        debug = os.environ.get("DEBUG") is not None

        past_row = MySQLRow("")  # placeholder row so the first line has a predecessor
        for line in f_descriptor:
            self._row_counter += 1
            curr_row = MySQLRow(line)

            # NOTE(review): the continuation branch below is assumed to be
            # nested under the CREATE check (multi-line CREATE accumulation)
            # -- confirm against the original layout.
            if past_row.is_create():
                if curr_row.is_useless():
                    # most likely the CREATE is complete
                    if debug:
                        self._debug(past_row, past_row.stmt())
                    if past_row.tbl_name() not in self._skip_tables:
                        self._hbase.set_create_tbl(past_row.tbl_name(), past_row.payload())
                    elif debug:
                        self._debug("SKIP> CREATE", past_row.tbl_name())
                elif not curr_row.is_create() and not curr_row.is_insert():
                    # Continuation of the pending CREATE: fold the line into
                    # it and keep treating the CREATE as the current row.
                    past_row.append(line)
                    curr_row = past_row

            if curr_row.is_view():
                if debug:
                    self._debug(curr_row.raw())
                self._hbase.set_view(curr_row.tbl_name(), curr_row.payload())

            if curr_row.is_insert():
                if curr_row.tbl_name() not in self._skip_tables:
                    self._hbase.send(curr_row)
                elif debug:
                    # Report the table of the row actually being skipped.
                    self._debug("SKIP> INSERT", curr_row.tbl_name())

            if debug:
                self._debug(self._row_counter, curr_row, curr_row.tbl_name())
                sys.stderr.flush()

            self._md5.update(line)
            past_row = curr_row  # for next line's parsing

        # past_row (not curr_row) is used because it is always bound, even
        # when the input is empty; the last parsed line (hopefully) carries
        # the timestamp.
        self._timestamp = past_row.timestamp()