def start(self):
    """Run this group-by node end to end.

    Opens a per-host log file (rebinding the module-level ``log``),
    listens for the partial results of all child nodes, groups the
    collected rows, and forwards the grouped result downstream via
    ``distribute``.
    """
    global LOG_DIR, log
    t_begin = time.time()*1000
    self.parse_args(arguments)
    # One log file per host per run; the name encodes host and timestamp.
    log_name = "%s/groupby-%s-%s.log" % (LOG_DIR, gethostname(), getCurrentTime())
    log = open(log_name, "wb")
    # Accept connections on this node's address and pull one message
    # per child before continuing.
    listen_host = self.node[0]
    listen_port = self.node[1]
    in_queue = cqueue.cqueue(listen_host, listen_port)
    in_queue.listen()
    # Expected shape (from the original author's note):
    #   data_list = [data1, data2, data3...]
    #   data1: data1.id, data1.data(string)
    data_list = in_queue.get(self.num_of_children)
    in_queue.close()
    logging("GET SOURCE DATA")
    t_proc_begin = time.time()*1000
    # Sort-based grouping of the collected data.
    grouped = self.sort_base_proc(data_list)
    logging("GET GROUPED DATA")
    t_proc_end = time.time()*1000
    # Ship the grouped result to its destination (client, next operator,
    # database, ...) as selected by self.dest inside distribute().
    self.distribute(grouped)
    t_dis_end = time.time()*1000
    logging("--------gourp node %s is finished--------" % (self.id))
def distribute(self, data):
    """Route the grouped result to its destination, selected by self.dest.

    Branches:
      * conf.DATA_TO_CLIENTS    -- write rows to an output file, then ACK the master.
      * conf.DATA_TO_ANO_OP     -- pipeline data to one or more parent operators.
      * conf.DATA_TO_ONE_CLIENT -- push the whole buffer to a single client socket.
      * conf.DATA_TO_DB         -- bulk-load into a database table via dload_client.
      * otherwise               -- send to a randomly chosen registered client socket.

    NOTE(review): ``data`` is iterated as rows in some branches but read
    as a buffer via ``data.getvalue()`` in others -- confirm which shape
    each caller passes.
    """
    sep = conf.SEPERATOR
    if self.dest == conf.DATA_TO_CLIENTS:
        # This is final result, store them to a file
        # NOTE(review): dest_file concatenates OUTPUT_DIR with the
        # DATA_TO_CLIENTS marker itself -- verify self.dest is meant to
        # double as a file name here.
        dest_file = conf.OUTPUT_DIR + self.dest
        file_handler = io.FileIO(dest_file, 'wb', closefd=True)
        bw = io.BufferedWriter(file_handler, buffer_size=65536)
        for row in data:
            bw.write(row)
            bw.write('\n')
        bw.close()
        # Tell the master this data node finished writing its output.
        addr = (conf.MASTER_NAME, string.atoi(conf.MASTER_PORT))
        sock = socket(AF_INET, SOCK_STREAM)
        sock.connect(addr)
        msg = '%s:%s:%s:%s' % (conf.ACK, conf.DATA_NODE, self.cqid, gethostname())
        # Wire format: 10-byte right-aligned length header, then payload.
        sock.send('%10s%s' % (len(msg), msg))
    elif self.dest == conf.DATA_TO_ANO_OP:
        # This is intermediate data, pipeline it
        num_of_dest = len(self.p_node)
        if num_of_dest == 1:
            # Single parent: send everything to it over one queue.
            port = self.p_node.keys()[0]
            node = self.p_node[port]
            queue = cqueue.cqueue(node, port)
            queue.connect()
            # The first message is the output attrs of this sql
            msg = ''
            for out in self.output:
                msg += '%s%s' % (out, sep)
            queue.put(msg)
            queue.put(data.getvalue())
            queue.close()
        elif num_of_dest > 1:
            # Multiple parents: open one queue per parent, then partition
            # the rows across them.
            queue_list = []
            """
            for p in self.p_port:
                queue = cqueue(p.split(':')[0],string.atoi(p.split(':')[1]))
                queue.connect()
                # The first message is the output attrs of this sql
                msg = ''
                for out in self.output:
                    msg += '%s%s' % (out, sep)
                queue.put(msg)
                queue_list.append(queue)
            """
            for port in self.p_node:
                # NOTE(review): elsewhere in this file the class is
                # instantiated as cqueue.cqueue(...); here the bare module
                # name is called -- likely should be cqueue.cqueue(...).
                queue = cqueue(self.p_node[port],port)
                queue.connect()
                # The first message is the output attrs of this sql
                msg = ''
                for out in self.output:
                    msg += '%s%s' % (out, sep)
                queue.put(msg)
                queue_list.append(queue)
            if self.split_key != None:
                # partition data in hash fashion
                split_key = self.split_key
                # NOTE(review): ``columns`` is not defined in this method
                # or visible scope -- presumably the output column names
                # (self.output?); confirm before relying on this branch.
                if columns[0].find('.') == -1 and len(split_key.split('.')) == 2:
                    # Split key is qualified ("table.col") but columns are
                    # bare names: match on the column part only.
                    pos = columns.index(split_key.split('.')[1])
                else:
                    pos = columns.index(split_key)
                # NOTE(review): ``rs`` is not defined in this scope either
                # -- presumably the grouped rows (``data``?).
                for row in rs:
                    partition_num = abs(hash(row[pos])) % num_of_dest
                    queue = queue_list[partition_num]
                    # NOTE(review): this inner loop serializes ALL of
                    # ``data`` for every row, so each partition receives
                    # the full dataset -- probably meant to send only the
                    # current row's fields.
                    msg = ''
                    for r in data:
                        msg += '%s%s' % (str(r), sep)
                    queue.put(msg)
            else:
                # partitioning data in range fashion
                for i in range(len(data)):
                    # Round-robin assignment by row index.
                    partition_num = i % num_of_dest
                    queue = queue_list[partition_num]
                    # NOTE(review): as above, the whole dataset is sent to
                    # the selected partition on every iteration -- likely
                    # intended to send only data[i].
                    msg = ''
                    for r in data:
                        msg += '%s%s' % (str(r), sep)
                    queue.put(msg)
            for queue in queue_list:
                queue.close()
    elif self.dest == conf.DATA_TO_ONE_CLIENT:
        # Push the entire buffer to a single known client socket.
        sock = socket(AF_INET, SOCK_STREAM)
        sock.connect(self.client)
        # Header announcing a 4-byte "DATA" tag, then the framed payload.
        sock.send('%10sDATA' % (4))
        sock.send("%10s%s" % (len(data.getvalue()), data.getvalue()))
        sock.close()
        self.notify_to_master()
    elif self.dest == conf.DATA_TO_DB:
        """
        out_name = "%s%sgroup_%s" % (LOG_DIR, os.sep, self.cqid)
        f = open(out_name, "wb")
        f.write(data.getvalue())
        f.close()
        """
        # Bulk-load the result into the destination table through the
        # parallel-load client, coordinated by the master node.
        col_sep = self.db_col_sep
        row_sep = self.db_row_sep
        master = (self.master_name, self.master_port)
        dload_client.dload_client().load_internal(master, self.cqid, gethostname(),self.dest_db, self.dest_table, data, 1, self.fashion, self.hash_key, self.hash_key_pos, col_sep, row_sep, col_sep, False, "0", LOG_DIR)
        self.notify_to_master()
        #os.remove(out_name)
    else:
        # Fallback: send the stripped buffer to a randomly chosen
        # registered client socket.
        random_num = random.randint(0, len(self.client_sock) - 1)
        addr = self.client_sock[random_num]
        sock = socket(AF_INET, SOCK_STREAM)
        sock.connect(addr)
        d = string.strip(data.getvalue())
        sock.send("%10s%s" % (len(d), d))
        sock.close()
        self.notify_to_master()