def _configure(self, input_file, output_file, port, env):
    """Set up the connection, both input iterators and the collector chain.

    One twin-buffering connection is opened from ``input_file``,
    ``output_file`` and ``port`` and shared by everything created here.
    Two iterators are built on it (third constructor argument 0 and 1,
    presumably the input index -- confirm against ``Iterator.Iterator``) and
    combined, together with ``self._keys1`` / ``self._keys2``, into a
    co-group iterator.

    NOTE(review): the runtime context receives ``self._collector`` as it is
    *before* ``_configure_chain`` is called; confirm that this is intended.
    """
    connection = Connection.TwinBufferingTCPMappedFileConnection(
        input_file, output_file, port)
    self._connection = connection

    first_input = Iterator.Iterator(connection, env, 0)
    self._iterator = first_input
    second_input = Iterator.Iterator(connection, env, 1)
    self._iterator2 = second_input
    self._cgiter = Iterator.CoGroupIterator(
        first_input, second_input, self._keys1, self._keys2)

    self.context = RuntimeContext.RuntimeContext(first_input, self._collector)

    chain_collector = Collector.Collector(connection, env)
    self._configure_chain(chain_collector)
def _configure(self, input_file, output_file, port, env, info, subtask_index):
    """Set up connection, input iterators, collector and runtime context.

    A single twin-buffering connection backs two input iterators (third
    constructor argument 0 and 1), the co-group iterator built from them and
    the collector. When ``info.chained_info`` is set, the chained operator is
    configured with this operator's context and collector and then takes the
    collector's place in ``self._collector``.
    """
    connection = Connection.TwinBufferingTCPMappedFileConnection(
        input_file, output_file, port)
    self._connection = connection

    self._iterator = Iterator.Iterator(connection, env, 0)
    self._iterator2 = Iterator.Iterator(connection, env, 1)
    self._cgiter = Iterator.CoGroupIterator(
        self._iterator, self._iterator2, self._keys1, self._keys2)

    self._collector = Collector.Collector(connection, env, info)
    self.context = RuntimeContext.RuntimeContext(
        self._iterator, self._collector, subtask_index)

    if info.chained_info is None:
        return

    # The chained operator must be handed the real collector first; only
    # afterwards does it replace that collector as this operator's output.
    chained_operator = info.chained_info.operator
    chained_operator._configure_chain(
        self.context, self._collector, info.chained_info)
    self._collector = info.chained_info.operator
def _configure(self, input_file, output_file, port):
    """Open the buffering connection and attach iterator, context and chain.

    The runtime context is built from the new iterator and the current value
    of ``self._collector``; a fresh collector on the same connection is then
    passed to ``_configure_chain``.
    """
    connection = Connection.BufferingTCPMappedFileConnection(
        input_file, output_file, port)
    self._connection = connection

    source = Iterator.Iterator(connection)
    self._iterator = source
    self.context = RuntimeContext.RuntimeContext(source, self._collector)

    chain_collector = Collector.Collector(connection)
    self._configure_chain(chain_collector)
def _configure(self, input_file, output_file, port, env, info):
    """Run the base configuration, then pick the run strategy.

    Without a key (``info.key1 is None``) the whole input is handled by
    ``_run_all_group_reduce``. Otherwise ``_run_grouped_group_reduce`` is
    selected and a group iterator over ``self._iterator`` keyed by
    ``info.key1`` is created.
    """
    super(GroupReduceFunction, self)._configure(
        input_file, output_file, port, env, info)

    if info.key1 is not None:
        self._run = self._run_grouped_group_reduce
        self._group_iterator = Iterator.GroupIterator(
            self._iterator, info.key1)
        return

    self._run = self._run_all_group_reduce
def execute(self, local=False, debug=False):
    """
    Triggers the program execution.

    The environment will execute all parts of the program that have resulted in a "sink" operation.

    The mode is selected by the first line read from stdin. If it is
    ``"plan"``, the next line is the port; the plan is sent over a TCP
    connection and the received result is returned. Otherwise the port, the
    id of the set to run, and the input and output paths are read (one per
    line) and the matching set's operator is configured and run.

    If anything fails while running an operator, the packed integer ``-2``
    is sent to the peer before the exception is re-raised. It goes over the
    operator's own connection when one exists, otherwise over a fresh socket
    to the port on localhost, if the port was read successfully.

    :param local: stored in ``self._local_mode``; forced to True by ``debug``.
    :param debug: stored in ``self._debug_mode``.
    :return: the received result in plan mode, otherwise None.
    """
    if debug:
        local = True
    self._local_mode = local
    self._debug_mode = debug
    self._optimize_plan()

    plan_mode = sys.stdin.readline().rstrip('\n') == "plan"

    if plan_mode:
        port = int(sys.stdin.readline().rstrip('\n'))
        self._connection = Connection.PureTCPConnection(port)
        self._iterator = Iterator.PlanIterator(self._connection, self)
        self._collector = Collector.PlanCollector(self._connection, self)
        self._send_plan()
        result = self._receive_result()
        self._connection.close()
        return result

    import struct
    operator = None
    # Bound before the try block so the failure handler below can always
    # inspect it, even when reading or parsing the port line itself fails.
    port = None
    try:
        port = int(sys.stdin.readline().rstrip('\n'))
        set_id = int(sys.stdin.readline().rstrip('\n'))
        input_path = sys.stdin.readline().rstrip('\n')
        output_path = sys.stdin.readline().rstrip('\n')

        used_set = None
        # No early exit: if several sets share the id, the last one wins.
        for candidate in self._sets:
            if candidate.id == set_id:
                used_set = candidate
                operator = candidate.operator

        operator._configure(input_path, output_path, port, self, used_set)
        operator._go()
        operator._close()
        sys.stdout.flush()
        sys.stderr.flush()
    except:  # deliberately bare: the exception is always re-raised below
        sys.stdout.flush()
        sys.stderr.flush()
        failure_signal = struct.pack(">i", -2)
        # The operator may have failed before its connection was created.
        connection = getattr(operator, "_connection", None)
        if connection is not None:
            connection._socket.send(failure_signal)
        elif port is not None:
            sock = SOCKET.socket(family=SOCKET.AF_INET, type=SOCKET.SOCK_STREAM)
            try:
                sock.connect((SOCKET.gethostbyname("localhost"), port))
                sock.send(failure_signal)
            finally:
                sock.close()
        raise
def _configure(self, input_file, output_file, port, env, info, task_id):
    """Set up connection, iterator, collector, context and optional chain.

    Everything is built on one buffering connection. The environment is kept
    in ``self._env``. When ``info.chained_info`` is set, the chained operator
    is configured with this operator's context and collector and then
    replaces the collector in ``self._collector``.
    """
    connection = Connection.BufferingTCPMappedFileConnection(
        input_file, output_file, port)
    self._connection = connection

    self._iterator = Iterator.Iterator(connection, env)
    self._collector = Collector.Collector(connection, env, info)
    self.context = RuntimeContext.RuntimeContext(
        self._iterator, self._collector, task_id)
    self._env = env

    if info.chained_info is None:
        return

    # Configure the chained operator against the real collector before it
    # takes over as this operator's output target.
    chained_operator = info.chained_info.operator
    chained_operator._configure_chain(
        self.context, self._collector, info.chained_info)
    self._collector = info.chained_info.operator
def execute(self, local=False):
    """
    Triggers the program execution.

    The environment will execute all parts of the program that have resulted in a "sink" operation.

    When ``self._container.is_planning()`` is true, the port is read from
    stdin, the plan is sent over a TCP connection and the received result is
    returned. Otherwise, if ``self._container.should_execute(self)`` is true,
    the set id, port, subtask index, mmap size, input path and output path
    are read from stdin (one per line, in that order) and the matching set's
    operator is configured and run.

    If anything fails while running an operator, the packed integer ``-2``
    is sent to the peer before the exception is re-raised. It goes over the
    operator's own connection when one exists, otherwise over a fresh socket
    to the port on localhost, if the port was read successfully.

    :param local: not used by this method.
    :return: the received result when planning, otherwise None.
    """
    self._optimize_plan()

    if self._container.is_planning():
        port = int(sys.stdin.readline().rstrip('\n'))
        self._connection = Connection.PureTCPConnection(port)
        self._iterator = Iterator.PlanIterator(self._connection, self)
        self._collector = Collector.PlanCollector(self._connection, self)
        self._send_plan()
        result = self._receive_result()
        self._connection.close()
        return result

    import struct
    operator = None
    port = None
    try:
        if self._container.should_execute(self):
            set_id = int(sys.stdin.readline().rstrip('\n'))
            port = int(sys.stdin.readline().rstrip('\n'))
            subtask_index = int(sys.stdin.readline().rstrip('\n'))
            mmap_size = int(sys.stdin.readline().rstrip('\n'))
            input_path = sys.stdin.readline().rstrip('\n')
            output_path = sys.stdin.readline().rstrip('\n')

            used_set = None
            # No early exit: if several sets share the id, the last one wins.
            for candidate in self._sets:
                if candidate.id == set_id:
                    used_set = candidate
                    operator = candidate.operator

            operator._configure(input_path, output_path, mmap_size, port,
                                self, used_set, subtask_index)
            operator._go()
            operator._close()
            sys.stdout.flush()
            sys.stderr.flush()
    except:  # deliberately bare: the exception is always re-raised below
        sys.stdout.flush()
        sys.stderr.flush()
        failure_signal = struct.pack(">i", -2)
        # The operator may have failed before its connection was created.
        connection = getattr(operator, "_connection", None)
        if connection is not None:
            connection._socket.send(failure_signal)
        elif port is not None:
            sock = SOCKET.socket(family=SOCKET.AF_INET, type=SOCKET.SOCK_STREAM)
            try:
                sock.connect((SOCKET.gethostbyname("localhost"), port))
                sock.send(failure_signal)
            finally:
                sock.close()
        raise
def computeSplits(self, env, con):
    """Read the split request from ``con`` and emit the resulting splits.

    Two values are read from a plan iterator, first the minimum number of
    splits and then the path. They are passed to ``createInputSplits`` with
    a split collector on the same connection, which is closed afterwards.
    """
    request = Iterator.PlanIterator(con, env)
    split_sink = Collector.SplitCollector(con, env)

    # Tuple elements are evaluated left to right, so the read order is kept.
    min_num_splits, path = request.next(), request.next()

    self.createInputSplits(min_num_splits, path, split_sink)
    split_sink._close()
def _configure(self, input_file, output_file, port):
    """Open the connection and set the operator up for one of two modes.

    In both modes a buffering connection and an iterator on it are created.

    * Combine mode (``self._combine`` is truthy): a collector is created
      directly, the context is built from it, and ``_run`` is set to
      ``_run_combine``.
    * Otherwise: a group iterator keyed by ``self._keys`` is created, the
      context is built with the current ``self._collector``, a fresh
      collector is passed to ``_configure_chain``, and ``_open`` is called.
    """
    combining = bool(self._combine)

    connection = Connection.BufferingTCPMappedFileConnection(
        input_file, output_file, port)
    self._connection = connection
    self._iterator = Iterator.Iterator(connection)

    if not combining:
        self._group_iterator = Iterator.GroupIterator(
            self._iterator, self._keys)
        self.context = RuntimeContext.RuntimeContext(
            self._iterator, self._collector)
        self._configure_chain(Collector.Collector(connection))
        self._open()
        return

    self._collector = Collector.Collector(connection)
    self.context = RuntimeContext.RuntimeContext(
        self._iterator, self._collector)
    self._run = self._run_combine
def _configure(self, input_file, output_file, port, env, info, subtask_index):
    """Run the base configuration, then pick the run strategy.

    With an empty ``info.key1`` the whole input is handled by
    ``_run_all_reduce``. Otherwise ``_run_grouped_reduce`` is selected and a
    group iterator over ``self._iterator`` keyed by ``info.key1`` is created.
    """
    super(ReduceFunction, self)._configure(
        input_file, output_file, port, env, info, subtask_index)

    if len(info.key1) != 0:
        self._run = self._run_grouped_reduce
        self._group_iterator = Iterator.GroupIterator(
            self._iterator, info.key1)
    else:
        self._run = self._run_all_reduce
def _sort_and_combine(self):
    """Combine the buffered values per key and emit one result per key.

    Values in ``self._values`` are bucketed by ``self._extract_keys``. The
    keys are visited in ascending order; each bucket is folded pairwise,
    left to right, with ``self.combine`` and the result is collected. The
    buffer is replaced by an empty list afterwards.
    """
    pending = self._values
    combine = self.combine
    sink = self._collector
    key_of = self._extract_keys

    buckets = defaultdict(list)
    for record in pending:
        buckets[key_of(record)].append(record)

    for group_key in sorted(buckets):
        members = Iterator.ListIterator(buckets[group_key])
        # Every bucket holds at least one record, which seeds the fold.
        accumulated = members.next()
        while members.has_next():
            accumulated = combine(accumulated, members.next())
        sink.collect(accumulated)

    self._values = []
def _sort_and_combine(self):
    """Group, sort and combine the buffered values, then clear the buffer.

    Values in ``self._values`` are bucketed by ``self._extract_keys`` and the
    keys are visited in ascending order. Each bucket is sorted in place once
    per entry of ``self._sort_ops``, applied in reverse so that the first
    entry ends up as the primary ordering (``list.sort`` is stable). Each
    entry is indexed as ``(field, order)``. ``self.combine`` is then called
    with a list iterator over the bucket and the collector; if it returns
    something other than None, every returned item is collected.
    """
    pending = self._values
    combine = self.combine
    sink = self._collector
    key_of = self._extract_keys

    buckets = defaultdict(list)
    for record in pending:
        buckets[key_of(record)].append(record)

    for group_key in sorted(buckets):
        group = buckets[group_key]

        for sort_op in reversed(self._sort_ops):
            descending = sort_op[1] == Order.DESCENDING
            group.sort(key=lambda row: row[sort_op[0]], reverse=descending)

        produced = combine(Iterator.ListIterator(group), sink)
        if produced is None:
            continue
        for item in produced:
            sink.collect(item)

    self._values = []