Example #1
File: order_by.py Project: PayasR/paralite
    def start(self):
        try:
            # start socket server to listen for all connections
            ch = self.iom.create_server_socket(AF_INET, SOCK_STREAM, 100, ("", self.my_port))
            n, self.my_port = ch.ss.getsockname()
            ParaLiteLog.debug("listen on port : %s ..." % str(self.my_port))

            # start socket server for local connections
            self.local_addr = "/tmp/paralite-local-addr-orderby-%s-%s-%s" % (gethostname(), self.cqid, self.opid)
            if os.path.exists(self.local_addr):
                os.remove(self.local_addr)
            self.iom.create_server_socket(AF_UNIX, SOCK_STREAM, 10, self.local_addr)

            # register local port to the master
            self.register_to_master(self.cqid, self.opid, gethostname(), self.my_port)
            ParaLiteLog.debug("reg to master: FINISH")

            while self.is_running:
                ev = self.next_event(None)
                if isinstance(ev, ioman_base.event_accept):
                    self.handle_accept(ev)
                if isinstance(ev, ioman_base.event_read):
                    if ev.data != "":
                        self.handle_read(ev)

            ParaLiteLog.info("--orderby node %s on %s is finished--" % (self.opid, gethostname()))

        except KeyboardInterrupt, e:
            self.report_error("ParaLite receives an interrupt signal and will close the process\n")
            ParaLiteLog.info("--orderby node %s on %s is finished--" % (self.opid, gethostname()))
            sys.exit(1)
Example #2
File: sql.py Project: PayasR/paralite
    def sql_proc(self):
        try:
            ParaLiteLog.debug("sql proc : START")
            # start socket server to listen for all connections
            ch = self.iom.create_server_socket(AF_INET,
                                               SOCK_STREAM, 100, ("", self.my_port)) 
            n, self.my_port = ch.ss.getsockname()
            ParaLiteLog.debug("listen on port : %s ..." % str(self.my_port))
            
            # register local port to the master
            self.register_to_master(self.cqid, self.opid, gethostname(), self.my_port)
            ParaLiteLog.debug("reg to master: FINISH")
            
            while self.is_running:
                s_time = time.time()
                ev = self.next_event(None)
                if isinstance(ev, ioman_base.event_accept):
                    self.handle_accept(ev)
                if isinstance(ev, ioman_base.event_read):
                    if ev.data != "":
                        e_time = time.time()
                        self.handle_read(ev)

            for thd in self.threads:
                thd.join()
            for proc in self.processes:
                proc.join()
            ParaLiteLog.info("--sql node %s on %s is finished--" % (self.opid,
                                                                    gethostname()))
            #self.notifier.join()
        except KeyboardInterrupt, e:
            self.report_error("ParaLite receives an interrupt signal and will close the process\n")
            ParaLiteLog.info("--sql node %s on %s is finished--" % (self.opid,
                                                                    gethostname()))
            sys.exit(1)
Example #3
 def send_to_node(self, db, table, data, addr, row_sep, col_sep, is_replace):
     sep = conf.SEP_IN_MSG
     req_info = "%s%s%s%s%s%s%s%s%s%s%s%s%s" % (conf.INFO, sep, db, sep, table, sep, self.db_col_sep, sep, col_sep, sep, row_sep, sep,is_replace)
     ParaLiteLog.info("sending %s  --> %s" % (req_info, addr[0]))
     self.really_send(addr, req_info)
     # use the first 10 characters to carry the length of the database name
     self.really_send(addr, "%10s%s%s" % (len(db), db, data))
     ParaLiteLog.info("sending data : %s --> %s" % (len(data), repr(addr)))
Example #4
File: distinct.py Project: PayasR/paralite
    def distribute_data(self):
        whole_data = cStringIO.StringIO()
        for i in self.result:
            for csio in self.result[i]:
                d = string.strip(csio.getvalue())
                if len(d) == 0:
                    continue
                whole_data.write(d)
                whole_data.write("\n")
                del csio
                
        if self.limit != -1:
            data_list = whole_data.getvalue().split(self.db_row_sep)[:self.limit]
            del whole_data
            data = cStringIO.StringIO()
            data.write(self.db_row_sep.join(str(s) for s in data_list))
            del data_list
        else:
            data = whole_data

        if self.dest == conf.DATA_TO_ONE_CLIENT:
            # send data to a random client
            random_num = random.randint(0, len(self.client_sock) - 1)
            addr = self.client_sock[random_num]
            sock = socket(AF_INET, SOCK_STREAM)
            sock.connect(addr)
            data_s = data.getvalue()
            sock.send("%10s%s" % (len(data_s), data_s))
            re = sock.recv(10)
            assert re == "OK"
            sock.close()

        elif self.dest == conf.DATA_TO_DB:
            self.data = data            
            col_sep = self.db_col_sep
            row_sep = self.db_row_sep
            master = (self.master_name, self.master_port)

            ParaLiteLog.info("proc_select: load data start")
            # send request to the master
            t_size = len(data.getvalue())
            sep = conf.SEP_IN_MSG
            tag = conf.LOAD_FROM_API
            if row_sep is None or row_sep == "\n":
                temp_sep = "NULL"
            else:
                temp_sep = row_sep
            msg = sep.join(
                str(s) for s in [conf.REQ, self.cqid, gethostname(), 
                                 self.my_port, self.dest_db, self.dest_table,
                                 t_size, tag, self.fashion, temp_sep, "0"])
            so_master = socket(AF_INET, SOCK_STREAM)
            so_master.connect(master)
            so_master.send("%10s%s" % (len(msg),msg))
            so_master.close()
Example #5
File: sql.py Project: PayasR/paralite
 def proc_drop(self, exp, target_db):
     try:
         for db in target_db:
             conn = sqlite3.connect(db)
             c = conn.cursor()
             c.execute(exp)
             conn.commit()
             conn.close()
     except sqlite3.OperationalError, e:
         es("%s: %s" % (gethostname(), " ".join(e.args)))
         ParaLiteLog.info(traceback.format_exc())
Example #6
File: sql.py Project: PayasR/paralite
 def step(self, value):
     try:
         newvalue = value
         if isinstance(value, unicode):
             newvalue = value.encode("ascii")
         if isinstance(newvalue, str):
             newvalue = string.atoi(newvalue)
         self.product *= newvalue
     except:
         ParaLiteLog.info(traceback.format_exc())
         raise(Exception(traceback.format_exc()))
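step() above is one callback of a sqlite3 user-defined aggregate; Example #19 registers the class with conn.create_aggregate("mul", 1, mul). A minimal, self-contained sketch of what such an aggregate class looks like (the exact mul class in sql.py may differ):

import sqlite3

class mul:
    # product aggregate: multiply all values of one column
    def __init__(self):
        self.product = 1
    def step(self, value):
        self.product *= int(value)
    def finalize(self):
        return self.product

conn = sqlite3.connect(":memory:")
conn.create_aggregate("mul", 1, mul)
c = conn.cursor()
c.execute("create table t(x)")
c.executemany("insert into t values(?)", [(2,), (3,), (4,)])
print c.execute("select mul(x) from t").fetchone()[0]  # prints 24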
Example #7
File: sql.py Project: PayasR/paralite
 def proc_create(self, exp, target_db):
     try:
         # first of all, check whether the directory holding the database exists
         for db in target_db:
             parent = db[0:db.rfind(os.sep)]
             if not os.path.exists(parent):
                 os.makedirs(parent)
             conn = sqlite3.connect(db)
             c = conn.cursor()
             c.execute(exp)
             conn.commit()
             conn.close()
     except sqlite3.OperationalError, e:
         ParaLiteLog.info(traceback.format_exc())
         raise(Exception("ERROR: in proc_create: %s: %s" % (gethostname(),
                                                            " ".join(e.args))))
Example #8
File: order_by.py Project: PayasR/paralite
def main():
    if len(sys.argv) != 7:
        sys.exit(1)
    proc = OrderbyOp()
    proc.master_name = sys.argv[1]
    proc.master_port = string.atoi(sys.argv[2])
    proc.cqid = sys.argv[3]
    proc.opid = sys.argv[4]
    proc.my_port = string.atoi(sys.argv[5])
    proc.log_dir = sys.argv[6]
    if not os.path.exists(proc.log_dir):
        os.makedirs(proc.log_dir)
    cur_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime(time.time()))
    ParaLiteLog.init("%s/orderby-%s-%s.log" % (proc.log_dir, gethostname(), cur_time), logging.DEBUG)
    ParaLiteLog.info("--orderby node %s on %s is started" % (proc.opid, gethostname()))
    proc.start()
Example #9
File: sql.py Project: PayasR/paralite
def main():
    if len(sys.argv) != 7:
        sys.exit(1)
    proc = SqlOp()
    proc.master_name = sys.argv[1]
    proc.master_port = string.atoi(sys.argv[2])
    proc.cqid = sys.argv[3]
    proc.opid = sys.argv[4]
    proc.my_port = string.atoi(sys.argv[5])
    proc.log_dir = sys.argv[6]
    
    if not os.path.exists(proc.log_dir): os.makedirs(proc.log_dir)
    ParaLiteLog.init("%s/sql-%s-%s-%s.log" % (
        proc.log_dir, gethostname(), proc.cqid, proc.opid),
                     logging.DEBUG)
    ParaLiteLog.info("--sql node %s on %s is started" % (proc.opid, gethostname()))
    proc.sql_proc()
Example #10
File: sql.py Project: PayasR/paralite
    def get_data_by_blocksize(self, jobid, bksize):
        if self.reader is None:
            return None
        data = self.reader.read(bksize)
        if not data:
            read_size = 0
        else:
            read_size = len(data)

        if read_size < bksize or (read_size == bksize and read_size == self.job_data[jobid]):
            # while True:
            #     if self.reader is not None:
            #         self.reader.close()
            #     self.reader = None
            #     self.reader = self.get_next_reader()
            #     if self.reader is None:
            #         break
            #     new_data = self.reader.read(bksize - read_size)
            #     data += new_data
            #     read_size = len(data)
            #     if read_size >= bksize:
            #         break
            if self.reader is not None:
                self.reader.close()
            self.reader = None
            self.reader = self.get_next_reader()
            
            return data
            
        if self.db_row_sep == "\n":
            if not data.endswith("\n"):
                extra_data = self.reader.readline()
                if extra_data:
                    data += extra_data
            return data
        else:
            if data:
                pos = data.rfind(self.db_row_sep)
                ParaLiteLog.info(pos)
                send_ds = self.left_ds + data[0:pos]
                self.left_ds = data[pos+len(self.db_row_sep):]
                return send_ds
            else:
                return None
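The else branch above is the row-boundary trick used throughout these block readers: cut each fixed-size block at the last complete row separator (rfind) and carry the tail (left_ds) into the next block so no row is ever split. A standalone sketch of that technique, assuming any file-like reader (iter_complete_rows is an illustrative helper):

def iter_complete_rows(reader, row_sep, bksize):
    # yield only whole rows: cut each block at the last separator
    # and prepend the leftover tail to the next block
    left = ""
    while True:
        block = reader.read(bksize)
        if not block:
            if left:
                yield left
            return
        block = left + block
        pos = block.rfind(row_sep)
        if pos == -1:
            left = block                    # no complete row yet
            continue
        yield block[:pos]
        left = block[pos + len(row_sep):]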
Example #11
 def scan_data_queue(self):
     while True:
         data = self.queue.get()
         if data == conf.END_TAG:
             ParaLiteLog.info("SCAN DATA QUEUE : END")
             break
         try:
             pos = 10+string.atoi(data[0:10].strip())
             target_db = data[10:pos]
             data = data[pos:]
             """
             thd = threading.Thread(target=self.write_to_db, args=(data, len(data)))
             thd.setDaemon(True)
             thd.start()
             self.threads.append(thd)
             """
             self.write_to_db(target_db, data, len(data))
             del(data)
         except Exception, e:
             ParaLiteLog.info(traceback.format_exc())
             es("in write_to_db: %s" % (traceback.format_exc()))
             sys.exit(1)
Example #12
 def handle_read(self, ev):
     data = ev.data
     if data == conf.END_TAG:
         ParaLiteLog.info("receive: END_TAG")
         self.is_running = False
         self.queue.put(conf.END_TAG)
     elif data.startswith(conf.INFO):
         m = data.split(conf.SEP_IN_MSG)
         assert len(m) == 7
         if self.table is None: self.table = m[2]
         if self.db_col_sep is None: self.db_col_sep = m[3]
         if self.cmd_col_sep is None: self.cmd_col_sep = m[4]
         if self.cmd_row_sep is None: self.cmd_row_sep = m[5]
         if self.is_replace is None:
             self.is_replace = m[6]
             ParaLiteLog.info("DB_COL_SEP = %s CMD_COL_SEP = %s  CMD_ROW_SEP = %s is_replace = %s" % (self.db_col_sep, self.cmd_col_sep, self.cmd_row_sep, self.is_replace))
     else:
         # TODO: we can control the buffer size here.
         self.queue.put(data)
Example #13
    def start(self, argument):
        self.parse(argument)
        
        cur_time = time.strftime('%Y-%m-%d-%H-%M-%S',time.localtime(time.time()))
        ParaLiteLog.init("%s/dload-server-%s-%s.log" % (self.log_dir,
                                                        gethostname(), cur_time),
                         logging.DEBUG)

        ParaLiteLog.info("START")
        ParaLiteLog.info("parse the argumens sucessfully")
        ss = time.time()
        scan_thd = threading.Thread(target=self.scan_data_queue)
        scan_thd.setDaemon(True)
        scan_thd.start()
        
        t = time.strftime('%Y-%m-%d-%H-%M-%S',time.localtime(time.time()))
        self.local_socket = "%s%s%s-%s-%s" % (self.log_dir, os.sep, gethostname(), t, "UNIX.d")
        self.iom.create_server_socket(AF_UNIX, SOCK_STREAM, 5, self.local_socket)
        ch = self.iom.create_server_socket(AF_INET, SOCK_STREAM, 5, ("", self.port))
        n, self.port = ch.ss.getsockname()
        ParaLiteLog.info("global socket addr = %s" % (repr(ch.ss.getsockname())))
        self.register_to_master()
        
        try:
            while self.is_running:
                ev = self.next_event(None)
                if isinstance(ev, ioman_base.event_accept):
                    self.handle_accept(ev)
                elif isinstance(ev, ioman_base.event_read):
                    if ev.data != "":
                        self.handle_read(ev)
                        
        except Exception, e:
            es("in dload_server.py : %s" % traceback.format_exc())
            ParaLiteLog.info(traceback.format_exc())
            sys.exit(1)
Example #14
 def range_data(self):
     ParaLiteLog.info("Now RANGE FASHION is not supported...")
Example #15
 def write_to_db(self, db, data, size):
     ss = time.time()
     ParaLiteLog.info("%s: START: size = %s" % (self.write_to_db.__name__, size))
     record_num = 0
     if self.is_replace == "True":
         ParaLiteLog.info("LOAD: when is_replace = True")
         con = sqlite3.connect(db)
         con.text_factory = str
         cr = con.cursor()
         if self.cmd_row_sep == "None" or self.cmd_row_sep is None:
             lines = data.split("\n")
         else: lines = data.split(self.cmd_row_sep)
         ParaLiteLog.info(len(lines))
         template = None
         for line in lines:
             if line == "": continue
             #x = tuple([ unicode_or_buffer(s.replace("\\n", "\n").replace("\\t", "\t")) for s in line.split("\t")])
             x = tuple([ s.replace("\\n", "\n").replace("\\t", "\t") for s in line.split("\t")])
             if template is None: 
                 questions = ",".join([ "?" ] * len(x))
                 template = "insert into %s values(%s);" % (self.table, questions)
             try:
                 cr.execute(template, x)
                 record_num += 1
             except sqlite3.OperationalError,e:
                 es("sqlite3.OperationalError: %s" % traceback.format_exc())
                 ParaLiteLog.info(traceback.format_exc())
                 sys.exit(1)
         ParaLiteLog.info("record_num is %s" % (record_num))
         con.commit()
         cr.close()
         con.close()
         ParaLiteLog.info("%s: FINISH" % (self.write_to_db.__name__))
         self.cur_db.table_added_record += record_num
         self.cur_db.table_added_size += size
         self.cur_db.size += size
         return
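The loop above builds one tuple per line and calls cr.execute once per row; sqlite3 can also take the whole batch through executemany, which reuses the prepared statement. A hedged sketch of the same load with batching (bulk_insert is an illustrative helper, not part of ParaLite):

import sqlite3

def bulk_insert(db, table, lines):
    # same unescaping as write_to_db above, but batched with executemany
    rows = [tuple(s.replace("\\n", "\n").replace("\\t", "\t")
                  for s in line.split("\t"))
            for line in lines if line != ""]
    if not rows:
        return 0
    con = sqlite3.connect(db)
    con.text_factory = str
    template = "insert into %s values(%s);" % (
        table, ",".join(["?"] * len(rows[0])))
    con.executemany(template, rows)
    con.commit()
    con.close()
    return len(rows)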
Example #16
File: order_by.py Project: PayasR/paralite
    def handle_read(self, event):
        message = event.data[10:]

        m = message.split(conf.SEP_IN_MSG)
        try:
            if m[0] == conf.JOB_ARGUMENT:
                self.parse_args(m[1])
                ParaLiteLog.info("parse arguments: FINISH")

            elif m[0] == conf.JOB:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                self.cur_jobid = m[1]

            elif m[0] == conf.DATA:
                data_id = string.strip(m[1][0:2])
                data = m[1][2:]
                self.source_data.append(data)

                # sort data
                if not self.is_data_ready(self.source_data, self.num_of_children):
                    return

                ParaLiteLog.debug("****SORT DATA****: start")
                s = 0
                for data in self.source_data:
                    s += len(data)
                ParaLiteLog.debug("source data size: %s" % s)
                s_time = time.time()
                rs_type, rs, t_size = self.sort(self.source_data)
                del self.source_data
                ParaLiteLog.debug("****SORT DATA****: finish")

                if rs_type is None:
                    self.send_status_to_master(self.cur_jobid, conf.PENDING)
                    return

                self.total_size += t_size
                self.source_data = {}

                # store the result of one job to the final result
                for i in range(len(rs)):
                    if i not in self.result:
                        self.result[i] = [rs[i]]
                    else:
                        self.result[i].append(rs[i])

                if rs_type != conf.MULTI_FILE:
                    # check if the whole data exceeds the LIMITATION
                    if self.total_size > self.MAX_SIZE:
                        self.write_data_to_disk()
                        self.result_type = conf.MULTI_FILE

                e_time = time.time()
                self.total_time += e_time - s_time

                self.send_status_to_master(self.cur_jobid, conf.PENDING)

            elif m[0] == conf.JOB_END:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                # all jobs are finished
                self.send_rs_info_to_master(self.total_size, self.total_time)

                # distribute data
                if self.dest == conf.DATA_TO_ONE_CLIENT:
                    ParaLiteLog.debug("dest = %s" % self.dest)
                    self.distribute_data()
                    self.send_status_to_master(self.cur_jobid, conf.ACK)
                    self.is_running = False
                elif self.dest == conf.DATA_TO_DB:
                    self.distribute_data()

            elif m[0] == conf.DATA_PERSIST:
                # whether the data is required to be persisted or not
                if m[1] == conf.CHECKPOINT:
                    self.write_data_to_disk()

            elif m[0] == conf.DLOAD_REPLY:
                sep = conf.SEP_IN_MSG
                reply = sep.join(m[1:])
                ParaLiteLog.info("receive the information from the master")
                ParaLiteLog.debug(reply)

                if len(self.data.getvalue()) != 0:
                    dload_client.dload_client().load_internal_buffer(
                        reply,
                        self.dest_table,
                        self.data,
                        self.fashion,
                        self.hash_key,
                        self.hash_key_pos,
                        self.db_col_sep,
                        self.db_row_sep,
                        self.db_col_sep,
                        False,
                        "0",
                        self.log_dir,
                    )

                # send END_TAG to the master
                client_id = "0"
                msg = sep.join([conf.REQ, conf.END_TAG, gethostname(), client_id])
                so_master = socket(AF_INET, SOCK_STREAM)
                so_master.connect((self.master_name, self.master_port))
                so_master.send("%10s%s" % (len(msg), msg))
                so_master.close()
                ParaLiteLog.debug("sending to master: %s" % (conf.END_TAG))
                ParaLiteLog.debug("----- dload client finish -------")

            elif message == conf.DLOAD_END_TAG:
                ParaLiteLog.debug("---------import finish---------")
                self.send_status_to_master(" ".join(self.cur_jobid), conf.ACK)
                self.is_running = False

            elif m[0] == conf.EXIT:
                self.is_running = False

            elif m[0] == conf.NODE_FAIL:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                # message --> NODE_FAIL:FAILED_NODE:REPLICA_NODE
                failed_node, replica_node = m[1:3]
                self.failed_node.append(failed_node)
                if replica_node != "" and replica_node == gethostname():
                    # load replica data for the failed node
                    self.recovery_data(self.replica_result, replica_node)
                ParaLiteLog.debug("Finish to handle node failure message")

        except Exception, e:
            es(traceback.format_exc())
            ParaLiteLog.info(traceback.format_exc())
            self.is_running = False
            self.no_error = False
Example #17
File: group_by.py Project: PayasR/paralite
                msg = sep.join([conf.DATA, "%2s%s" % (self.opid, data)])
                if destnode == gethostname():
                    # use local socket
                    addr = self.p_node[destnode][1]
                    t = AF_UNIX
                else:
                    addr = (destnode, self.p_node[destnode][0])
                    t = AF_INET
                self.send_data_to_node(msg, t, addr)
                ParaLiteLog.debug(
                    "send data successfully %s %s --> %s" % (
                        self.opid, gethostname(), destnode))

            elif m[0] == conf.DLOAD_REPLY:
                reply = sep.join(m[1:])
                ParaLiteLog.info("receive the information from the master")
                ParaLiteLog.debug(reply)
                
                if len(self.data.getvalue()) != 0:
                    dload_client.dload_client().load_internal_buffer(
                        reply, self.dest_table, self.data, self.fashion, 
                        self.hash_key, self.hash_key_pos, self.db_col_sep, 
                        self.db_row_sep, self.db_col_sep, False, "0", self.log_dir)

                # send END_TAG to the master
                client_id = "0"
                msg = sep.join([conf.REQ, conf.END_TAG, gethostname(), client_id])
                so_master = socket(AF_INET, SOCK_STREAM)
                so_master.connect((self.master_name, self.master_port))
                so_master.send("%10s%s" % (len(msg), msg))
                so_master.close()
Example #18
File: sql.py Project: PayasR/paralite
    def distribute_data(self):
        # handle the limit condition: get the first N records
        # E.g. select ... limit 10, the master firstly decides the limit
        # number for each process and set the limit value for each process
        # to be the post-limit

        whole_data = cStringIO.StringIO()
        for i in self.result:
            for csio in self.result[i]:
                d = string.strip(csio.getvalue())
                if len(d) == 0:
                    continue
                whole_data.write(d)
                whole_data.write("\n")
                del csio

        if self.distinct or self.limit != -1:
            data_list = whole_data.getvalue().split(self.db_row_sep)
            del whole_data
        
            if self.distinct:
                # a set cannot be sliced; materialize it back to a list
                # so the limit slice below still works
                data_list = list(set(data_list))
            if self.limit != -1:
                data_list = data_list[:self.limit]

            data = cStringIO.StringIO()
            data.write(self.db_row_sep.join(str(s) for s in data_list))
            del data_list
        else:
            data = whole_data
        
        if self.dest == conf.DATA_TO_DB:
            self.data = data
            col_sep = self.db_col_sep
            row_sep = self.db_row_sep
            master = (self.master_name, self.master_port)
            
            ParaLiteLog.info("proc_select: load data start")
            # send request to the master
            t_size = len(data.getvalue())
            sep = conf.SEP_IN_MSG
            tag = conf.LOAD_FROM_API
            if row_sep is None or row_sep == "\n":
                temp_sep = "NULL"
            else:
                temp_sep = row_sep
            msg = sep.join(
                str(s) for s in [conf.REQ, self.cqid, gethostname(), 
                                 self.my_port, self.dest_db, self.dest_table,
                                 t_size, tag, self.fashion, temp_sep, "0"])
            so_master = socket(AF_INET, SOCK_STREAM)
            so_master.connect(master)
            so_master.send("%10s%s" % (len(msg),msg))
            so_master.close()

            # dload_client.dload_client().load_internal_buffer(
            #     master, self.cqid, gethostname(), self.my_port, self.dest_db,
            #     self.dest_table, data, conf.LOAD_FROM_API, self.fashion, 
            #     self.hash_key, self.hash_key_pos, self.db_col_sep, row_sep,
            #     col_sep, False, "0", self.log_dir)

        elif self.dest == conf.DATA_TO_ONE_CLIENT:
            random_num = random.randint(0, len(self.client_sock) - 1)
            addr = self.client_sock[random_num]
            sock = socket(AF_INET, SOCK_STREAM)
            sock.connect(addr)

            data_s = data.getvalue()
            ParaLiteLog.info("DATA SIZE = %s" % len(data_s))
            sock.send("%10s%s" % (len(data_s), data_s))
            re = sock.recv(10)
            assert re == "OK"
            sock.close()
Example #19
File: sql.py Project: PayasR/paralite
    def proc_select(self, jobid, exp, target_db):
        assert len(target_db) == 1
        cur_db = target_db[0]
        try:
            conn = sqlite3.connect(cur_db)
            conn.text_factory = str
            
            # register the user-defined aggregate
            conn.create_aggregate("mul", 1, mul)

            c = conn.cursor()
            """
            if self.temp_store != 0:
                c.execute('pragma temp_store=%s' % (self.temp_store))
            if self.cache_size != -1:
                c.execute('pragma cache_size=%s' % (self.cache_size))
            """

            # for test
            c.execute('pragma temp_store=memory')
            c.execute('pragma cache_size=2073741824')

            ParaLiteLog.info("start to execute sql: %s" % exp)
            
            col_sep = self.db_col_sep
            row_sep = self.db_row_sep
            num_of_dest = self.partition_num

            if self.dest == conf.DATA_TO_ANO_OP and num_of_dest > 1:
                columns = self.output
                split_key = self.split_key
                assert split_key is not None
                
                # partition data in hash fashion
                pos = []
                for key in split_key:
                    pos.append(columns.index(key))
                data_part_list = []
                for i in range(self.partition_num):
                    data_part_list.append(cStringIO.StringIO())
                size = 0
                t_size = 0
                for row in c.execute(exp):
                    part_id = abs(hash(self.db_col_sep.join(str(row[p]) for p in pos))) % num_of_dest
                    #part_id = abs(hash(row[pos[0]])) % num_of_dest
                    data = col_sep.join(str(s) for s in row)
                    """
                    size += len(data)
                    if size > self.MAX_SIZE:
                        for partid in data_part_list:
                            fs = self.write_data_to_disk(
                                partid, data_part_list[partid])
                            # delete all data in csio
                            data_part_list[partid].truncate(0)
                        t_size += size
                        size = 0
                        self.result_type = self.MULTI_FILE
                    """
                    data_part_list[part_id].write(data)
                    data_part_list[part_id].write(row_sep)

                for i in range(len(data_part_list)):
                    t_size += len(data_part_list[i].getvalue())
                    
                ParaLiteLog.debug("finish to retrieve the result: %s" % t_size)
                
                if self.result_type == self.MULTI_FILE:
                    # data_part_list is a list of cStringIO buffers, so
                    # iterate over indices rather than the buffers themselves
                    for partid in range(len(data_part_list)):
                        self.write_data_to_disk(
                            partid, data_part_list[partid].getvalue())
                    del data_part_list
                    return self.MULTI_FILE, None, t_size
                else:
                    ########################
                    # new_list = []
                    # for d in data_part_list:
                    #     new_list.append(d.getvalue())
                    # return self.MULTI_BUFFER, new_list, t_size
                    ###################
                    return self.MULTI_BUFFER, data_part_list, t_size
                
            else:
                csio = cStringIO.StringIO()
                t_size = 0
                size = 0 # record the size of current data
                data_pos = [] # the file name of data if persisted
                for row in c.execute(exp):
                    # NOTE:  For aggregation SQL, e.g. "select max(col) from T ..."
                    # if there is no record in T, (None,) will be returned
                    if row[0] is None:
                        continue
                    data = col_sep.join(str(s) for s in row)
                    size += len(data)
                    if size >= self.MAX_SIZE:
                        self.result_type = self.MULTI_FILE
                        self.write_data_to_disk(jobid, csio.getvalue())
                        # delete all data in csio
                        csio.truncate(0)
                        t_size += size
                        size = 0
                    csio.write(data)
                    csio.write(row_sep)

                t_size += len(csio.getvalue())
                ParaLiteLog.debug("finish to retrieve the result: %s" % t_size)

                if self.result_type == conf.MULTI_FILE:
                    self.write_data_to_disk(jobid, csio.getvalue())
                    del csio
                    return conf.MULTI_FILE, None, t_size
                else:
                    return self.SINGLE_BUFFER, [csio], t_size

        except sqlite3.OperationalError, e:
            ParaLiteLog.info(traceback.format_exc())
            raise(Exception("%s: QueryExecutionError: %s" % (gethostname(),
                                                             traceback.format_exc())))
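When the destination is another operator with several partitions, proc_select routes each row with part_id = abs(hash(key)) % num_of_dest. That partitioning step, distilled into a standalone sketch (partition_rows and its default separators are illustrative):

import cStringIO

def partition_rows(rows, key_pos, num_parts, col_sep="|", row_sep="\n"):
    # hash the key columns of each row to pick one of num_parts buffers,
    # mirroring the part_id computation in proc_select above
    parts = [cStringIO.StringIO() for _ in range(num_parts)]
    for row in rows:
        key = col_sep.join(str(row[p]) for p in key_pos)
        part_id = abs(hash(key)) % num_parts
        parts[part_id].write(col_sep.join(str(s) for s in row))
        parts[part_id].write(row_sep)
    return parts

The same key string always lands in the same partition, which is what keeps hash-based repartitioning consistent across operators.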
Example #20
File: group_by.py Project: PayasR/paralite
    def handle_read(self, event):
        message = event.data[10:]

        sep = conf.SEP_IN_MSG
        m = message.split(sep)
        try:        
            if m[0] == conf.JOB_ARGUMENT:
                self.parse_args(m[1])

                ParaLiteLog.info("parsed structure : \n%s" % str(self.expr))
                ParaLiteLog.info("parse arguments: FINISH")

                if self.is_checkpoint is not None and self.is_checkpoint == conf.CHECKPOINT:
                    # this is a recovery operator
                    # init the persisted result data
                    ParaLiteLog.debug("recovery data: START")
                    self.recovery_data(self.result, gethostname())
                    ParaLiteLog.debug("recovery data: FINISH")
                    self.send_rs_info_to_master(0, 0)

                else:
                    self.parse_func()
                    # delete all temporary files for this operator
                    os.system("rm -f %s/%s_%s" % (self.temp_dir, "groupby", self.opid))


            elif m[0] == conf.JOB:
                ParaLiteLog.debug("MESSAGE: %s" % message)                
                self.cur_jobid = m[1]
                self.job_list.append(m[1])
                
            elif m[0] == conf.DATA:
                data_id = string.strip(m[1][0:2])
                data = m[1][2:]
                self.source_data.append(data)

                # aggregate data
                if not self.is_data_ready(self.source_data, self.num_of_children):
                    return

                ParaLiteLog.debug("****GROUP DATA****: start")
                s = 0
                for data in self.source_data:
                    s += len(data)
                ParaLiteLog.debug("source data size : %s" % s)
                s_time = time.time()
                rs_type, rs, t_size = self.hash_based_aggregate(self.source_data)
                ParaLiteLog.debug("****GROUP DATA****: finish")
                
                del self.source_data
                self.total_size += t_size
                self.source_data = []
                
                # store the result of one job to the final result
                if len(rs) == 1:
                    if self.dest == conf.DATA_TO_ANO_OP or self.dest == conf.DATA_TO_DB:
                        # dest is AGGR op or ORDER op, use 0 as the key
                        if 0 not in self.result:
                            self.result[0] = rs
                        else:
                            self.result[0].append(rs[0])
                    else:
                        # dest is UDX op, use jobid as the key
                        self.result[string.atoi(self.cur_jobid)] = rs
                        if self.is_checkpoint == 1:
                            self.write_data_to_disk(self.cur_jobid, rs[0].getvalue())

                else:
                    # use partid as the key
                    for i in range(len(rs)):
                        if i not in self.result:
                            self.result[i] = [rs[i]]
                        else:
                            self.result[i].append(rs[i])

                # check if the whole data exceeds the LIMITATION
                if rs_type != conf.MULTI_FILE:
                    if self.total_size > self.MAX_SIZE:
                        for dataid in self.result:
                            data = ""
                            for d in self.result[dataid]:
                                # each d is a cStringIO buffer, not a string
                                data += d.getvalue()
                            self.write_data_to_disk(dataid, data)
                        self.result_type = conf.MULTI_FILE
                
                e_time = time.time()
                self.total_time += (e_time - s_time)
                self.send_status_to_master(self.cur_jobid, conf.PENDING)
                    
            elif m[0] == conf.JOB_END:
                ParaLiteLog.debug("MESSAGE: %s" % message)                
                # all jobs are finished
                self.send_rs_info_to_master(self.total_size, self.total_time)
                
                # distribute data
                if self.dest == conf.DATA_TO_ONE_CLIENT:
                    self.distribute_data()
                    self.send_status_to_master(" ".join(self.job_list), conf.ACK)
                    self.is_running = False
                elif self.dest == conf.DATA_TO_DB:
                    self.distribute_data()

            elif m[0] == conf.DATA_PERSIST:
                # whether the data is required to be persisted or not
                ParaLiteLog.debug("MESSAGE: %s" % message)
                self.process_ck_info(m)
Example #21
File: sql.py Project: PayasR/paralite
    def proc_drop(self, exp, target_db):
        try:
            for db in target_db:
                conn = sqlite3.connect(db)
                c = conn.cursor()
                c.execute(exp)
                conn.commit()
                conn.close()
        except sqlite3.OperationalError, e:
            es("%s: %s" % (gethostname(), " ".join(e.args)))
            ParaLiteLog.info(traceback.format_exc())
        except Exception, e:
            es(traceback.format_exc())
            ParaLiteLog.info(traceback.format_exc())

    def proc_create(self, exp, target_db):
        try:
            # first of all, check whether the directory holding the database exists
            for db in target_db:
                parent = db[0:db.rfind(os.sep)]
                if not os.path.exists(parent):
                    os.makedirs(parent)
                conn = sqlite3.connect(db)
                c = conn.cursor()
                c.execute(exp)
                conn.commit()
                conn.close()
        except sqlite3.OperationalError, e:
            ParaLiteLog.info(traceback.format_exc())
Example #22
    def load_internal_buffer(self, reply, table, buf, fashion, key, key_pos, 
                             db_col_sep, row_sep, col_sep, is_replace, client_id, 
                             LOG_DIR):
        ParaLiteLog.info("load_internal: START")
        ParaLiteLog.info("row separator = %s col separator = %s" % (row_sep, col_sep) )
        self.db_col_sep = db_col_sep
        total_size = len(buf.getvalue())
        try:
            """
            received message = nodes # sub_dbs # chunk_num # replica_info 
            
            nodes should be:
            n1:p1:l1 , n2:p2:l2 , ...               IF fashion = HASH_FASHION 
            n1:p1:l1:s1:num , n2:p2:l2:s2:num , ... IF fashion = ROUND_ROBIN
            TBD                                     IF fashion = RANGE_FASHION

            node_db_info: db_1_1 , db_1_2 , db_2_1, ...
            replica_info: db_1_1 db_1_1_r_1 node1 , db_1_2 db_1_2_r_1 node2 , ...
            """
            mm = reply.split("#")
            ParaLiteLog.info("receive the information from the master %s" % mm)
            nodes = mm[0].split(",")
            sub_dbs = mm[1].split(",")
            chunk_num = string.atoi(mm[2])
            replica = mm[3]
            
            replica_info = {} # {db_name : {replica_db_name:node}}
            if replica != "":
                for ll in replica.split(","):
                    lll = ll.split(" ")
                    if lll[0] not in replica_info:
                        replica_info[lll[0]] = {}
                    replica_info[lll[0]][lll[1]] = lll[2]
            ParaLiteLog.info(nodes)
            node_addr = {} # {node:addr}
            for node in nodes:
                m = node.split(conf.SEP_IN_MSG)
                if m[0] == gethostname(): addr = m[2]
                else: addr = (m[0], string.atoi(m[1]))
                node_addr[m[0]] = addr

            ss1 = time.time()
            if nodes == []:
                ParaLiteLog.info("there is no data to load")
            elif fashion == conf.HASH_FASHION:
                ParaLiteLog.info(fashion)
                # get the data for each sub db
                # db_buf = {db_name, buffer_of_data}
                db_buf = self.hash_data_buffer(buf, key_pos, nodes, row_sep, col_sep, chunk_num, sub_dbs)
                for db in db_buf:
                    data = db_buf[db].getvalue()
                    node = db.split("_")[-3]
                    self.send_to_node(db, table, data, node_addr[node], row_sep, col_sep, is_replace)
                    if db in replica_info:
                        for rdb in replica_info[db]:
                            node = replica_info[db][rdb]                            
                            self.send_to_node(rdb, table, data, node_addr[node], row_sep, col_sep, is_replace)
                """
                buf_scanner = threading.Thread(target=self.scan_buf,
                args=(table, node_buf, node_addr, row_sep, col_sep, is_replace))
                buf_scanner.setDaemon(True)
                buf_scanner.start()
                buf_scanner.join()
                """
            elif fashion == conf.REPLICATE_FASHION:
                # NOTE: 'files' is not defined in this buffer-based method
                self.replicate_data(table, files, total_size, nodes)
            elif fashion == conf.RANGE_FASHION:
                self.range_data()
            else:
                thds = []
                num_of_db = len(nodes) * chunk_num
                if row_sep is not None and row_sep != "\n":
                    whole_data = buf.getvalue()
                    lines = whole_data.split(row_sep)
                    if lines[-1] == "":
                        lines.pop()
                    l = len(lines)
                    if l % num_of_db == 0:
                        num_each = l / num_of_db
                    else:
                        num_each = l / num_of_db + 1
                    i = 0
                    while i < num_of_db:
                        db = sub_dbs[i]
                        node = db.split("_")[-3]
                        cur_num = i*num_each + num_each
                        if cur_num > l:
                            cur_num = l
                        ds = row_sep.join(lines[i*num_each:cur_num])
                        thd = threading.Thread(target=self.send_to_node,
                                               args=(db, table, ds, node_addr[node], row_sep,
                                                     col_sep, is_replace))
                        thd.setDaemon(True)
                        thd.start()
                        thds.append(thd)
                        if db in replica_info:
                            for rdb in replica_info[db]:
                                node = replica_info[db][rdb]
                                thd = threading.Thread(target=self.send_to_node,
                                                       args=(rdb, table, ds, node_addr[node],
                                                             row_sep, col_sep, is_replace))
                                thd.setDaemon(True)
                                thd.start()
                                thds.append(thd)
                        i += 1
                else:
                    buf.seek(0)
                    i = 0
                    while i < num_of_db:
                        db = sub_dbs[i]
                        node = db.split("_")[-3]
                        node_id = i / chunk_num
                        size = string.atoi(nodes[node_id].split(conf.SEP_IN_MSG)[3]) / chunk_num
                        ParaLiteLog.info("start to get data as bk: %s" % (size))
                        ds = buf.read(size)
                        if not ds:
                            # read() returns "" at EOF, never None
                            ParaLiteLog.info("really get data as bk: 0")
                            break
                        if not ds.endswith("\n"):
                            ds += buf.readline()

                        ParaLiteLog.info("really get data as bk: %s" % (len(ds)))
                        thd = threading.Thread(target=self.send_to_node,
                                               args=(db, table, ds, node_addr[node],
                                                     row_sep, col_sep, is_replace))
                        
                        thd.setDaemon(True)
                        thd.start()
                        thds.append(thd)
                        if db in replica_info:
                            for rdb in replica_info[db]:
                                node = replica_info[db][rdb]
                                thd = threading.Thread(target=self.send_to_node,
                                                       args=(rdb, table, ds, node_addr[node],
                                                             row_sep, col_sep, is_replace))
                                thd.setDaemon(True)
                                thd.start()
                                thds.append(thd)
                        i += 1
                for thd in thds:
                    thd.join()
        except Exception, e:
            raise(e)
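The docstring above pins down the master's reply layout: nodes # sub_dbs # chunk_num # replica_info, with replica entries of the form "db replica_db node" separated by commas. A small parser sketch for just that layout (parse_master_reply is an illustrative name, not a ParaLite function):

def parse_master_reply(reply):
    # reply = nodes # sub_dbs # chunk_num # replica_info
    nodes, sub_dbs, chunk_num, replica = reply.split("#")
    replica_info = {}   # {db_name: {replica_db_name: node}}
    if replica != "":
        for entry in replica.split(","):
            db, rdb, node = entry.split(" ")
            replica_info.setdefault(db, {})[rdb] = node
    return (nodes.split(","), sub_dbs.split(","),
            int(chunk_num), replica_info)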
Example #23
File: sql.py Project: PayasR/paralite
    def handle_read(self, event):
        message = event.data[10:]

        sep = conf.SEP_IN_MSG
        m = message.split(sep)
        try:
            if m[0] == conf.DATA_END:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                # all data is dispatched to the parent nodes
                self.send_status_to_master(" ".join(self.job_data), conf.ACK)
                ParaLiteLog.debug("notify ACK to master")                    
                self.is_running = False
                
            elif message == conf.END_TAG:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                self.send_status_to_master(" ".join(self.job_data), conf.ACK)
                self.is_running = False

            elif message == conf.DLOAD_END_TAG:
                ParaLiteLog.debug("---------import finish---------")
                self.send_status_to_master(" ".join(self.job_data), conf.ACK)
                self.is_running = False

            elif message == conf.EXIT:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                self.is_running = False
            
            elif m[0] == conf.JOB_ARGUMENT:
                self.parse_args(m[1])
                ParaLiteLog.info("parse arguments: FINISH")
                # init the persisted result data
                if self.is_checkpoint is not None and self.is_checkpoint == conf.CHECKPOINT:
                    ParaLiteLog.debug("recovery data: START")
                    # this is a recovery operator
                    self.recovery_data(self.result, gethostname())
                    ParaLiteLog.debug("recovery data: FINISH")
                    self.send_rs_info_to_master(0, 0)
                else:
                    # delete all temporary files for this operator
                    os.system("rm -f %s/%s_%s" % (self.temp_dir, "sql", self.opid))

                ###############################
                # scanner = threading.Thread(target=self.scan_process_queue, args=(self.process_queue, ))
                # scanner.setDaemon(True)
                # scanner.start()
                # self.threads.append(scanner)
                ##########################
            elif m[0] == conf.JOB:
                self.ex_s_time = time.time()
                self.ex_w_time = 0
                ParaLiteLog.debug("MESSAGE: %s" % message)
                s_time = time.time()
                jobid = m[1]
                target_db = m[2].split()
                exp = self.expression
                ParaLiteLog.debug("*****JOB %s******:start" % jobid)
                
                # FAULT TOLERANCE:
                if jobid in self.job_data:
                    # this is a failed job, we should first delete the old result value
                    if self.dest == conf.DATA_TO_ANO_OP and self.partition_num > 1:
                        for partid in self.result:
                            pos = self.job_list.index(jobid)
                            self.result[partid][pos] = ""
                    else:
                        self.result[jobid] = ""
                
                if exp.lower().startswith("select"):
                    """
                    selection task: (1), execute sql (2), notify the result to
                    the master (3), wait for the DATA_PERSIST message from the
                    master (4), persist data if so (5), notify ACK to the master
                    """
                    ParaLiteLog.info("proc_select: START")
                    st_time = time.time()
                    
                    rs_type, rs, t_size = self.proc_select(jobid, exp, target_db)

                    et_time = time.time()
                    ParaLiteLog.debug("Job %s cost time %s second" % (jobid, (et_time - st_time)))
                    # FAULT TOLERANCE:
                    if jobid in self.job_data:
                        # this is a failed job
                        if self.dest == conf.DATA_TO_ANO_OP and self.partition_num > 1:
                            for partid in self.result:
                                pos = self.job_list.index(jobid)
                                self.result[partid][pos] = rs[partid]
                        else:
                            self.result[jobid] = rs
                        self.send_status_to_master(jobid, conf.PENDING)
                        return
                        
                    self.job_data[jobid] = t_size
                    self.job_list.append(jobid)
                    self.total_size += t_size
                    
                    # store the result of one job to the final result
                    if len(rs) == 1:
                        if self.dest == conf.DATA_TO_ANO_OP:
                            # dest is AGGR op or ORDER op, use 0 as the key
                            if 0 not in self.result:
                                self.result[0] = rs
                            else:
                                self.result[0].append(rs[0])

                            if self.is_checkpoint == 1:
                                self.write_data_to_disk(0, rs[0].getvalue())
                        else:
                            # dest is UDX op, use jobid as the key
                            self.result[string.atoi(jobid)] = rs
                            if self.is_checkpoint == 1:
                                self.write_data_to_disk(0, rs[0].getvalue())
                        
                    else:
                        # use partid as the key
                        for i in range(len(rs)):
                            if i not in self.result:
                                self.result[i] = [rs[i]]
                            else:
                                self.result[i].append(rs[i])
                        if self.is_checkpoint == 1:
                            for i in range(len(rs)):
                                self.write_data_to_disk(i, rs[i].getvalue())
                        
                    # check if the whole data exceeds the LIMITATION
                    if rs_type != self.MULTI_FILE:
                        if (self.is_checkpoint is not None and self.is_checkpoint == conf.CHECKPOINT) or self.total_size > self.MAX_SIZE:
                            for dataid in self.result:
                                data = ""
                                for d in self.result[dataid]:
                                    data += d.getvalue()
                                self.write_data_to_disk(dataid, data)
                            self.result_type = self.MULTI_FILE
                            
                    e_time = time.time()
                    if self.total_time == 0:
                        self.total_time = (e_time - s_time)
                    self.send_status_to_master(jobid, conf.PENDING)

                elif exp.lower().startswith("create"):
                    ParaLiteLog.info("proc_create: START")
                    ParaLiteLog.info("SQL: %s" % exp)                    
                    self.proc_create(exp, target_db)
                    ParaLiteLog.info("proc_create: START")
                    self.send_status_to_master(jobid, conf.ACK)
                    self.is_running = False
                elif exp.lower().startswith("drop"):
                    ParaLiteLog.info("proc_drop: START")            
                    self.proc_drop(exp, target_db)
                    self.send_status_to_master(jobid, conf.ACK)
                    self.is_running = False
                ParaLiteLog.debug("*****JOB %s******:finish" % jobid)
                self.ex_w_time += (time.time() - self.ex_s_time)
                self.ex_s_time = 0

            elif m[0] == conf.JOB_END:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                # all jobs are finished
                # create a dictionary to store the status of each part of data
                data_status = {}  # {data_id:[(pos_in_result, status)]}
                for dataid in self.result:
                    if dataid not in data_status:
                        data_status[dataid] = []
                    for i in range(len(self.result[dataid])):
                        data_status[dataid].append((i, 1))
                self.data_status = data_status
                self.reader = self.get_next_reader()
                    
                self.send_rs_info_to_master(self.total_size, self.total_time)

                # distribute data
                if self.dest == conf.DATA_TO_ONE_CLIENT:
                    self.distribute_data()
                    self.send_status_to_master(" ".join(self.job_data), conf.ACK)
                    self.is_running = False
                elif self.dest == conf.DATA_TO_DB:
                    self.distribute_data()
 
            elif m[0] == conf.DLOAD_REPLY:
                reply = sep.join(m[1:])
                ParaLiteLog.info("receive the information from the master")
                ParaLiteLog.debug(reply)
                
                if len(self.data.getvalue()) != 0:
                    dload_client.dload_client().load_internal_buffer(
                        reply, self.dest_table, self.data, self.fashion, 
                        self.hash_key, self.hash_key_pos, self.db_col_sep, 
                        self.db_row_sep, self.db_col_sep, False, "0", self.log_dir)

                # send END_TAG to the master
                client_id = "0"
                msg = sep.join([conf.REQ, conf.END_TAG, gethostname(), client_id])
                so_master = socket(AF_INET, SOCK_STREAM)
                so_master.connect((self.master_name, self.master_port))
                so_master.send("%10s%s" % (len(msg), msg))
                so_master.close()
                ParaLiteLog.debug("sending to master: %s" % (conf.END_TAG))
                ParaLiteLog.debug("----- dload client finish -------")

            elif m[0] == conf.DATA_PERSIST:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                # whether the data is required to be persisted or not
                self.process_ck_info(m)
                
            elif m[0] == conf.DATA_DISTRIBUTE:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                # send a part of data to the next operator
                # DATA_DISTRIBUTE:partition_num:destnode
                part_id, destnode = m[1:]
                data = self.get_data_by_part_id(self.result, string.atoi(part_id))
                
                # DATA message includes: type:id+data
                # the first 2 chars represents the opid
                msg = sep.join([conf.DATA, "%2s%s" % (self.opid, data)])
                if destnode == gethostname():
                    # use local socket
                    addr = self.p_node[destnode][1]
                    t = AF_UNIX
                else:
                    addr = (destnode, self.p_node[destnode][0])
                    t = AF_INET
                self.send_data_to_node(msg, t, addr)
                ParaLiteLog.debug("send data susscufully   %s %s --> %s" % (self.opid, gethostname(), destnode))

            elif m[0] == conf.DATA_DISTRIBUTE_UDX:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                # send data to udx client
                # m[1:] = worker.id:jobid:(node:port | addr):size
                
                if len(m) == 6:
                    w_id, jobid = m[1:3]
                    addr = (m[3], string.atoi(m[4]))
                    t = AF_INET
                    bk = string.atoi(m[5])
                elif len(m) == 5:
                    w_id, jobid = m[1:3]
                    addr = m[3]
                    t = AF_UNIX
                    bk = string.atoi(m[4])
                data = self.get_data_by_blocksize(jobid, bk)
                if not data:
                    # if we don't send something here, udx will not send KAL
                    # again, will never receive more data, and the whole
                    # process will block forever
                    msg = sep.join([conf.DATA, "EMPTY"])
                else:
                    msg = sep.join([conf.DATA, data])
                self.send_data_to_node(msg, t, addr)
                
            elif m[0] == conf.DATA_REPLICA:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                # message --> DATA_REPLICA:DATANODE:DATAID:DATA
                datanode, dataid = m[1:3]
                f_name = self.get_file_name_by_data_id(gethostname(), dataid)
                fr = open(f_name, "wa")
                fr.write(m[4])
                fr.close()

            elif m[0] == conf.NODE_FAIL:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                # message --> NODE_FAIL:FAILED_NODE:REPLICA_NODE
                failed_node, replica_node = m[1:3]
                self.failed_node.append(failed_node)
                if replica_node == gethostname():
                    # load replica data for the failed node
                    self.recovery_data(self.replica_result, replica_node)
        except Exception, e:
            es("in sql_proc : %s" % traceback.format_exc())
            ParaLiteLog.info(traceback.format_exc())
            self.is_running = False
            self.no_error = False
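Every message these operators exchange with the master is framed the same way: a 10-character, space-padded decimal length ("%10s" % len(msg)) followed immediately by the payload, so the receiver knows exactly how many bytes belong to one message. A minimal sketch of both sides of this framing follows; the helper names send_framed, _recv_exact, and recv_framed are illustrative, not part of ParaLite.

def send_framed(so, msg):
    # 10-char space-padded length header, then the payload
    so.send("%10s%s" % (len(msg), msg))

def _recv_exact(so, n):
    # keep reading until exactly n bytes have arrived
    chunks = []
    while n > 0:
        chunk = so.recv(n)
        if not chunk:
            raise IOError("connection closed mid-message")
        chunks.append(chunk)
        n -= len(chunk)
    return "".join(chunks)

def recv_framed(so):
    # read the fixed-width header first, then exactly that many bytes of body
    length = int(_recv_exact(so, 10).strip())
    return _recv_exact(so, length)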
Example #24
    def load_internal_file(self, reply, opt, db_col_sep, LOG_DIR):
        ParaLiteLog.info("load_internal_file: START")
        table = opt.table
        files = opt.files
        col_sep = opt.col_sep
        row_sep = opt.row_sep
        fashion = opt.fashion
        key = opt.key
        key_pos = opt.key_pos
        is_replace = opt.replace

        self.db_col_sep = db_col_sep        
        for f in files:
            self.files[f] = 1
        self.file_reader = open(self.get_next_file(), "rb")

        try:
            """
            received message = nodes # sub_dbs # chunk_num # replica_info 
            
            nodes should be (| is SEP_IN_MSG):
            n1 : p1|l1 , n2 : p2|l2 , ...               IF fashion = HASH_FASHION 
            n1 : p1|l1|s1|num , n2 : p2|l2|s2|num , ... IF fashion = ROUND_ROBIN
            TBD                                         IF fashion = RANGE_FASHION

            node_db_info: node1:[db_1_1] , node2:[db_1_2] , node3:[db_2_1], ...
            replica_info: db_1_1 db_1_1_r_1 node1 , db_1_2 db_1_2_r_1 node2 , ...
            """
            mm = reply.split("#")

            nodes = mm[0].split(",") if mm[0] else []
            sub_dbs = mm[1].split(",")
            chunk_num = string.atoi(mm[2])
            replica = mm[3]
            
            replica_info = {} # {db_name : {replica_db_name:node}}
            if replica != "":
                for whole_re in replica.split(","):
                    lll = whole_re.split(" ")
                    if lll[0] not in replica_info:
                        replica_info[lll[0]] = {}
                    replica_info[lll[0]][lll[1]] = lll[2]

            node_addr = {} # {node:addr}
            for node in nodes:
                m = node.split(conf.SEP_IN_MSG)
                if m[0] == gethostname(): addr = m[2]
                else: addr = (m[0], string.atoi(m[1]))
                node_addr[m[0]] = addr
            
            thds = []
            if nodes == []:
                ParaLiteLog.info("there is no data to load")
            elif fashion == conf.HASH_FASHION:
                ParaLiteLog.info(fashion)
                if row_sep is not None and row_sep != "\n":
                    left_ds = ""
                    while True:
                        dst = self.get_data_as_bk(DATA_MAX_SIZE)
                        if dst is None:
                            ParaLiteLog.info("really get data as bk: 0")
                            break
                        ParaLiteLog.info("really get data as bk: %s" % (len(dst)))
                        pos = dst.rfind(row_sep)

                        ds = left_ds + dst[0:pos]
                        left_ds = dst[pos+len(row_sep):]
                        del dst
                        db_buf = self.hash_data_file(ds, key_pos, nodes,
                                                       row_sep, col_sep,
                                                       chunk_num, sub_dbs)
                        ParaLiteLog.debug("hash data finish %s" % len(ds))
                        del ds                        
                        for db in db_buf:
                            data = db_buf[db].getvalue()
                            node = db.split("_")[-3]
                            thd = threading.Thread(target=self.send_to_node,
                                                   args=(db, table, data, node_addr[node],
                                                         row_sep,col_sep,
                                                         is_replace))
                            thd.setDaemon(True)
                            thd.start()
                            thds.append(thd)
                            if db in replica_info:
                                for rdb in replica_info[db]:
                                    node = replica_info[db][rdb]
                                    self.send_to_node(rdb, table, data,
                                                      node_addr[node],
                                                      row_sep, col_sep, is_replace)

                else:
                    while True:
                        ds = self.get_data_as_bk(DATA_MAX_SIZE)
                        if ds is None:
                            ParaLiteLog.info("really get data as bk: 0")
                            break
                        ParaLiteLog.info("really get data as bk: %s" % (len(ds)))
                        db_buf = self.hash_data_file(ds, key_pos, nodes,
                                                       "\n", col_sep, chunk_num, sub_dbs)
                        for db in db_buf:
                            ParaLiteLog.debug(
                                "%s -- > %s" % (db, len(db_buf[db].getvalue())))
                            break
                        for db in db_buf:
                            data = db_buf[db].getvalue()
                            node = db.split("_")[-3]

                            thd = threading.Thread(target=self.send_to_node,
                                                   args=(db, table, data,
                                                         node_addr[node],
                                                         row_sep,col_sep,
                                                         is_replace))
                            thd.setDaemon(True)
                            thd.start()
                            thds.append(thd)
                            if db in replica_info:
                                for rdb in replica_info[db]:
                                    node = replica_info[db][rdb]
                                    self.send_to_node(rdb, table, data,
                                                      node_addr[node],
                                                      row_sep, col_sep, is_replace)
                        del db_buf        
                        del ds
                        
            elif fashion == conf.REPLICATE_FASHION:
                # assumption: total_size is the combined size of the input files
                total_size = sum(os.path.getsize(f) for f in files)
                self.replicate_data(table, files, total_size, nodes)
            elif fashion == conf.RANGE_FASHION:
                self.range_data()
            else:
                num_of_db = len(nodes) * chunk_num                
                if row_sep is not None and row_sep != "\n":
                    i = 0
                    left_ds = ""
                    while True:
                        db = sub_dbs[i % num_of_db]
                        # m[3] carries the data size assigned to this node
                        m = nodes[(i % num_of_db) / chunk_num].split(conf.SEP_IN_MSG)
                        node = db.split("_")[-3]

                        size = string.atoi(m[3]) / chunk_num + 1
                        if size > DATA_MAX_SIZE:
                            ParaLiteLog.info("start to get data as bk: %s" % (DATA_MAX_SIZE))  
                            ds = self.get_data_as_bk(DATA_MAX_SIZE)
                        else:
                            ParaLiteLog.info("start to get data as bk: %s" % (size))
                            ds = self.get_data_as_bk(size)
                        if ds is None:
                            ParaLiteLog.info("really get data as bk: 0")
                            break
                        ParaLiteLog.info("really get data as bk: %s" % (len(ds)))
                        pos = ds.rfind(row_sep)
                        send_ds =  left_ds + ds[0:pos]
                        left_ds = ds[pos+len(row_sep):]
                        thd = threading.Thread(
                            target=self.send_to_node,
                            args=(db, table, send_ds, node_addr[node],
                                  row_sep, col_sep, is_replace))
                        thd.setDaemon(True)
                        thd.start()
                        thds.append(thd)
                        if db in replica_info:
                            for rdb in replica_info[db]:
                                node = replica_info[db][rdb]
                                thd = threading.Thread(
                                    target=self.send_to_node,
                                    args=(rdb, table, ds, node_addr[node],
                                          row_sep, col_sep, is_replace))
                                thd.setDaemon(True)
                                thd.start()
                                thds.append(thd)
                        i += 1
                else:
                    i = 0
                    while True:
                        db = sub_dbs[i % num_of_db]
                        # m[3] carries the data size assigned to this node
                        m = nodes[(i % num_of_db) / chunk_num].split(conf.SEP_IN_MSG)
                        node = db.split("_")[-3]
                        size = string.atoi(m[3]) / chunk_num + 1
                        if size > DATA_MAX_SIZE:
                            ParaLiteLog.info(
                                "start to get data as bk: %s" % (DATA_MAX_SIZE))  
                            ds = self.get_data_as_bk(DATA_MAX_SIZE)
                        else:
                            ParaLiteLog.info("start to get data as bk: %s" % (size))
                            ds = self.get_data_as_bk(size)
                        if ds is None:
                            ParaLiteLog.info("really get data as bk: 0")
                            break
                        ParaLiteLog.info("really get data as bk: %s" % (len(ds)))
                        thd = threading.Thread(
                            target=self.send_to_node,
                            args=(db, table, ds, node_addr[node],
                                  row_sep,col_sep, is_replace))
                        thd.setDaemon(True)
                        thd.start()
                        thds.append(thd)
                        if db in replica_info:
                            for rdb in replica_info[db]:
                                ParaLiteLog.info(rdb)
                                node = replica_info[db][rdb]
                                thd = threading.Thread(
                                    target=self.send_to_node,
                                    args=(rdb, table, ds, node_addr[node],
                                          row_sep, col_sep, is_replace))
                                thd.setDaemon(True)
                                thd.start()
                                thds.append(thd)
                        i += 1
                        del ds
            for thd in thds:
                thd.join()
        except Exception, e:
            ParaLiteLog.debug(traceback.format_exc())
            raise(Exception(traceback.format_exc()))
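load_internal_file above delegates the actual bucketing to hash_data_file, which returns a map from sub-database name to a buffer of rows for that sub-database. A minimal sketch of such a partitioner, assuming the key column is simply hashed modulo the number of sub-databases (the real method additionally honors chunk_num and the node name embedded in each sub-database name), could look like:

def hash_rows_to_dbs(data, key_pos, sub_dbs, row_sep="\n", col_sep="|"):
    # bucket each row by the hash of its key column
    buffers = dict((db, []) for db in sub_dbs)
    for row in data.split(row_sep):
        if row == "":
            continue
        key = row.split(col_sep)[key_pos]
        buffers[sub_dbs[hash(key) % len(sub_dbs)]].append(row)
    # join each bucket back into one payload per sub-database
    return dict((db, row_sep.join(rows)) for db, rows in buffers.items())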
Example #25
        record_num += 1
    except sqlite3.OperationalError, e:
        es("sqlite3.OperationalError: %s" % traceback.format_exc())
        ParaLiteLog.info(traceback.format_exc())
        sys.exit(1)
    ParaLiteLog.info("record_num is %s" % (record_num))
    con.commit()
    cr.close()
    con.close()
    ParaLiteLog.info("%s: FINISH" % (self.write_to_db.__name__))
    self.cur_db.table_added_record += record_num
    self.cur_db.table_added_size += size
    self.cur_db.size += size
    return
if self.cmd_row_sep is not None and self.cmd_row_sep != "None" and self.cmd_row_sep != conf.NEW_LINE:
    ParaLiteLog.info("cmd_row_sep %s" % (self.cmd_row_sep))
    ParaLiteLog.info("db = %s" % (db))
    try:
        ParaLiteLog.info("LOAD: insert one by one")
        con = sqlite3.connect(db)
        con.text_factory = str
        cr = con.cursor()
        lines = data.split(self.cmd_row_sep)
        template = None
        for line in lines:
            if line == "": continue
            x = tuple(line.strip().split(self.cmd_col_sep))
            if template is None:
                questions = ",".join(["?"] * len(x))
                template = "insert into %s values(%s);" % (self.table, questions)
            cr.execute(template, x)
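The loop above prepares one parameterized INSERT template and calls execute once per row. sqlite3's executemany amortizes that cost by reusing the prepared statement across a whole sequence of rows. A minimal sketch of the same load written that way follows; the function name and its table/separator parameters are illustrative, not part of ParaLite.

import sqlite3

def bulk_insert(db_path, table, data, row_sep="\n", col_sep="|"):
    # split the payload into rows and columns, skipping empty lines
    rows = [tuple(line.strip().split(col_sep))
            for line in data.split(row_sep) if line != ""]
    con = sqlite3.connect(db_path)
    con.text_factory = str
    if rows:
        questions = ",".join(["?"] * len(rows[0]))
        # one prepared statement, executed for every row
        con.executemany("insert into %s values(%s);" % (table, questions), rows)
    con.commit()
    con.close()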