def test_dictionaries(self):
    payload = dict(first="a", second="b")
    res = cwo(post_with_dict_param(payload))
    self.assertDictEqual(res, payload, "TEST FAILED: dict payload/return")

    inout = dict(first="a", second="b")
    update_inout_dict(inout)
    inout = cwo(inout)
    self.assertIn("third", inout, "TEST FAILED: inout dict")

def test_with_regular_tasks(self):
    inout = dict(first="a", second="b")
    update_inout_dict(inout)
    regular_task(inout)
    inout = cwo(inout)
    self.assertIn("third", inout, "TEST FAILED: http --> task")
    self.assertIn("greetings_from", inout, "TEST FAILED: http --> task")

def test_get_methods(self):
    dummy()
    print("GET: dummy works.")

    message = "holala"
    length = int(cwo(get_length(message)))
    self.assertEqual(length, len(message), "TEST FAILED: GET get_length")

    mes = cwo(return_message(message))
    self.assertEqual(mes, message, "TEST FAILED: GET return_message")

    mes = cwo(get_nested_produces(message))
    self.assertEqual(str(mes), message, "TEST FAILED: GET nested_produces")

    mes, length = cwo(multi_return(message))
    self.assertEqual(str(mes), message, "TEST FAILED: GET multi return 0")
    self.assertEqual(int(length), len(message),
                     "TEST FAILED: GET multi return 1")

def reduce(self, f, initial=marker, arity=-1):
    """
    Reduce the whole data set.

    :param f: A reduce function which should take two parameters as inputs
              and return a single result which will be sent to itself again.
    :param initial: Initial value for the reducer, which will be used to
                    reduce the first element with.
    :param arity: Number of partial results merged per reduction task (the
                  branching factor of the reduction tree). Defaults to the
                  number of partitions.
    :return: reduced result (inside a DDS if necessary).

    >>> DDS().load(range(10), 5).reduce((lambda b, a: b + a), 100)
    145
    """
    def local_reducer(partition):
        """
        Reduce a partition and return it as a partition containing a
        single element.
        :param partition: iterable of elements
        :return: a list with one reduced element, or an empty list
        """
        iterator = iter(partition)
        try:
            init = next(iterator)
        except StopIteration:
            return []
        import functools
        return [functools.reduce(f, iterator, init)]

    local_results = self.map_partitions(local_reducer)\
        .collect(future_objects=True)

    local_results = deque(local_results)
    # If an initial value is set, add it to the list as well
    if initial != marker:
        local_results.append([initial])

    arity = arity if arity > 0 else len(self.partitions)
    branch = list()
    while local_results:
        while local_results and len(branch) < arity:
            temp = local_results.popleft()
            branch.append(temp)

        if len(branch) == 1:
            branch = cwo(branch[0])
            break

        temp = reduce_multiple(f, branch)
        local_results.append(temp)
        branch = []

    return branch[0]

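# A minimal local sketch of the arity-limited tree reduction above, without
# the COMPSs runtime. tree_reduce is illustrative and not part of the DDS
# API; it assumes at least one non-empty partition.
from collections import deque as _deque
from functools import reduce as _freduce

def tree_reduce(f, partitions, arity=2):
    # Reduce each partition locally to a single-element list, then merge
    # the partial results arity-at-a-time, mirroring the loop above.
    results = _deque([_freduce(f, p)] for p in partitions if p)
    while True:
        branch = [results.popleft() for _ in range(min(arity, len(results)))]
        if len(branch) == 1 and not results:
            return branch[0][0]
        results.append([_freduce(f, (b[0] for b in branch))])

assert tree_reduce(lambda a, b: a + b,
                   [[0, 1, 2], [3, 4], [5, 6, 7, 8, 9]]) == 45
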
def test_post_methods(self):
    mes = cwo(dummy_post())
    self.assertEqual(mes, "post_works", "TEST FAILED: POST dummy")

    payload = "something"
    mes = cwo(post_with_param(payload))
    self.assertEqual(str(mes), payload, "TEST FAILED: POST param in payload")

    inner_param = "hello"
    res = cwo(post_with_inner_param(inner_param))
    self.assertEqual(res.get("first", None), inner_param,
                     "TEST FAILED: POST inner param")

    fayl, content = "payload_file", "payload_content"
    with open(fayl, 'w') as nm:
        nm.write(content)
    # Pass the file itself as the payload, not its content.
    ret = cwo(post_with_file_param(fayl))
    self.assertEqual(str(ret), content, "TEST FAILED: POST file as payload")

def test_serialization(self):
    payload = "something"
    ret = post_with_param(payload)
    res = cwo(post_with_inner_param(ret))
    self.assertEqual(res.get("first", ""), payload,
                     "TEST FAILED: POST inner param")

    inout = post_with_inner_param(ret)
    update_inout_dict(inout)
    regular_task(inout)
    inout = cwo(inout)
    self.assertIn("third", inout, "TEST FAILED: json serialization")
    self.assertIn("greetings_from", inout, "TEST FAILED: json serialization")

    length = get_length("holalaa")
    inout = post_with_inner_param(length)
    regular_task(inout)
    inout = cwo(inout)
    self.assertIn("greetings_from", inout, "TEST FAILED: json serialization")

def count_by_value(self, arity=2, as_dict=True, as_fo=False):
    """
    Count the occurrences of each element in this data set.

    :param arity: number of partial dictionaries merged per reduction task.
    :param as_dict: return the counts as a dictionary.
    :param as_fo: return the result as a future object, without
                  synchronizing.
    :return: a dict of {element: count} when as_dict is True; otherwise a
             new DDS of (element, count) pairs.

    >>> first = DDS().load([0, 1, 2], 2)
    >>> second = DDS().load([2, 3, 4], 3)
    >>> first.union(second).count_by_value(as_dict=True)
    {0: 1, 1: 1, 2: 2, 3: 1, 4: 1}
    """
    def count_partition(iterator):
        counts = defaultdict(int)
        for obj in iterator:
            counts[obj] += 1
        return counts

    # Count locally and create dictionary partitions
    local_results = self.map_partitions(count_partition) \
        .collect(future_objects=True)

    # Create a deque from partitions and start the reduction
    future_objects = deque(local_results)

    branch = list()
    while future_objects:
        branch = []
        while future_objects and len(branch) < arity:
            temp = future_objects.popleft()
            branch.append(temp)

        if len(branch) == 1:
            break

        first, branch = branch[0], branch[1:]
        reduce_dicts(first, branch)
        future_objects.append(first)

    if as_dict:
        if as_fo:
            return branch[0]
        branch[0] = cwo(branch[0])
        return dict(branch[0])

    length = self.num_of_partitions()
    new_partitions = list()
    for i in range(length):
        new_partitions.append(task_dict_to_list(branch[0], length, i))

    return DDS().load(new_partitions, -1)

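# A plain-Python sketch of the same pattern: count per partition, then merge
# the partial dictionaries. count_by_value_local is illustrative only; the
# real method runs count_partition and reduce_dicts as COMPSs tasks.
from collections import defaultdict

def count_by_value_local(partitions):
    partials = []
    for part in partitions:
        counts = defaultdict(int)
        for obj in part:
            counts[obj] += 1
        partials.append(counts)
    # Merge the per-partition dictionaries into one.
    merged = defaultdict(int)
    for partial in partials:
        for key, val in partial.items():
            merged[key] += val
    return dict(merged)

assert count_by_value_local([[0, 1, 2], [2, 3, 4]]) == \
    {0: 1, 1: 1, 2: 2, 3: 1, 4: 1}
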
def collect(self, keep_partitions=False, future_objects=False):
    """
    Returns all elements from all partitions. Elements can be grouped by
    partitions by setting the keep_partitions value to True.

    :param keep_partitions: keep the partition structure (list of lists).
    :param future_objects: return COMPSs future objects without
                           synchronizing.
    :return: a list of elements (or of partitions, or of future objects).

    >>> dds = DDS().load(range(10), 2)
    >>> dds.collect(True)
    [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
    >>> DDS().load(range(10), 2).collect()
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    """
    processed = list()
    if self.func:
        if self.paac:
            for col in self.partitions:
                processed.append(map_partition(self.func, None, col))
        else:
            for _p in self.partitions:
                processed.append(map_partition(self.func, _p))
        # Reset the function!
        self.func = None
    else:
        for _p in self.partitions:
            if isinstance(_p, IPartitionGenerator):
                processed.append(_p.retrieve_data())
            else:
                processed.append(_p)

    # Future objects cannot be extended for now...
    if future_objects:
        return processed

    processed = cwo(processed)

    ret = list()
    if not keep_partitions:
        for _pp in processed:
            ret.extend(_pp)
    else:
        for _pp in processed:
            ret.append(list(_pp))
    return ret

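# A minimal sketch of the lazy "store the function, apply it on collect"
# pattern used above. LazyPartitions is illustrative, not the DDS API: the
# pending function is applied to each partition only when collect() is
# called, then cleared, just like self.func above.
class LazyPartitions:
    def __init__(self, partitions):
        self.partitions = partitions
        self.func = None

    def map_partitions(self, f):
        # Record the function; nothing is computed yet.
        self.func = f
        return self

    def collect(self):
        if self.func:
            out = [list(self.func(p)) for p in self.partitions]
            self.func = None
        else:
            out = [list(p) for p in self.partitions]
        return [x for part in out for x in part]

assert LazyPartitions([[1, 2], [3]]) \
    .map_partitions(lambda p: (x * 2 for x in p)).collect() == [2, 4, 6]
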
def test_dir_in(self):
    """ Test DIRECTORY_IN """
    cur_path = "{}{}".format(os.getcwd(), os.sep)
    dir_in = "{}{}".format(cur_path, "some_dir_in")
    os.mkdir(dir_in)

    content = "this is some text to test directory_in".split(" ")
    for i, word in enumerate(content):
        temp = "{}{}{}".format(dir_in, os.sep, str(i))
        with open(temp, 'w') as f:
            f.write(word)

    res = self.dir_in_task(dir_in)
    cwod(dir_in)
    res = cwo(res)

    for word in content:
        self.assertTrue(word in res, "missing word: {}".format(word))

    shutil.rmtree(dir_in)

def take(self, num):
    """
    Return the first num elements of the DDS.
    :param num: number of elements to be retrieved.
    :return: a list of at most num elements.
    """
    items = []
    partitions = self.collect(future_objects=True)
    taken = 0
    for part in partitions:
        _p = iter(cwo(part))
        while taken < num:
            try:
                items.append(next(_p))
                taken += 1
            except StopIteration:
                break
        if taken >= num:
            break
    return items[:num]

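# Plain-Python sketch of the same early-stopping traversal: pull elements
# from partitions in order and stop as soon as num elements have been seen.
# take_local is illustrative only; the real method synchronizes one
# partition at a time with cwo so later partitions may never be fetched.
def take_local(partitions, num):
    items = []
    for part in partitions:
        for elem in part:
            items.append(elem)
            if len(items) == num:
                return items
    return items

assert take_local([[0, 1, 2], [3, 4]], 4) == [0, 1, 2, 3]
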
def sort_by_key(self, ascending=True, num_of_parts=None,
                key_func=lambda x: x):
    """
    Sort the data set by key.
    :param key_func: function to extract the comparison key from an
                     element's key.
    :param num_of_parts: number of resulting partitions (defaults to the
                         current number of partitions).
    :param ascending: sort order.
    :return: a new DDS, sorted by key.
    """
    if num_of_parts is None:
        num_of_parts = len(self.partitions)

    # Collect everything to take samples
    col_parts = self.collect(future_objects=True)
    samples = list()
    for _part in col_parts:
        samples.append(task_collect_samples(_part, 20, key_func))

    samples = sorted(list(itertools.chain.from_iterable(cwo(samples))))

    bounds = [samples[int(len(samples) * (i + 1) / num_of_parts)]
              for i in range(0, num_of_parts - 1)]

    def range_partitioner(key):
        p = bisect.bisect_left(bounds, key_func(key))
        if ascending:
            return p
        return num_of_parts - 1 - p

    def sort_partition(iterator):
        """
        Sort a partition locally in fixed-size chunks and merge the
        sorted chunks.
        :param iterator: (key, value) pairs of the partition.
        :return: an iterator over the sorted elements.
        """
        chunk_size = 500
        iterator = iter(iterator)
        chunks = list()
        while True:
            chunk = list(itertools.islice(iterator, chunk_size))
            chunk.sort(key=lambda kv: key_func(kv[0]),
                       reverse=not ascending)
            chunks.append(chunk)
            if len(chunk) < chunk_size:
                # A short chunk means the iterator is exhausted.
                break

        return heapq3.merge(chunks, key=lambda kv: key_func(kv[0]),
                            reverse=not ascending)

    partitioned = DDS().load(col_parts, -1).partition_by(range_partitioner)
    return partitioned.map_partitions(sort_partition)

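# Sketch of the sampling/range-partitioning idea used above: sort the
# samples, derive boundary keys, and route each element to a partition with
# bisect. make_range_partitioner is illustrative and not part of the DDS
# API.
import bisect

def make_range_partitioner(samples, num_of_parts, key_func=lambda x: x):
    samples = sorted(samples)
    bounds = [samples[int(len(samples) * (i + 1) / num_of_parts)]
              for i in range(num_of_parts - 1)]
    # Elements below the first bound go to partition 0, and so on.
    return lambda elem: bisect.bisect_left(bounds, key_func(elem))

part = make_range_partitioner([1, 3, 5, 7, 9], 2)
assert part(2) == 0 and part(8) == 1
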
def test_workflow(self):
    """ Test multiple tasks with directory in, out, and inout params. """
    cur_path = "{}{}".format(os.getcwd(), os.sep)
    dir_t = "{}{}".format(cur_path, "some_dir_t")
    os.mkdir(dir_t)

    # Expected: len(res_phase_0[i]) == i
    res_phase_0 = []
    for i in range(0, 5, 1):
        res_phase_0.append(self.dir_inout_task_i(dir_t, i))

    # Expected: len(res_phase_1[i]) == 5
    res_phase_1 = []
    for i in range(0, 5, 1):
        res_phase_1.append(self.dir_in_task_i(dir_t))

    # Expected: len(res_phase_2[i]) == i + 5
    res_phase_2 = []
    for i in range(5, 10, 1):
        res_phase_2.append(self.dir_inout_task_i(dir_t, i))

    # Expected: len(res_phase_3[i]) == 10
    res_phase_3 = []
    for i in range(0, 5, 1):
        res_phase_3.append(self.dir_in_task_i(dir_t))

    # The dir out should contain only the last file
    for i in range(0, 15, 1):
        self.dir_out_task_i(dir_t, i)

    res_phase_0 = cwo(res_phase_0)
    res_phase_1 = cwo(res_phase_1)
    res_phase_2 = cwo(res_phase_2)
    res_phase_3 = cwo(res_phase_3)
    cwod(dir_t)

    for i, res in enumerate(res_phase_0):
        self.assertEqual(
            len(res), i,
            "error in task #{} of phase 0: {} != {}".format(
                i, len(res), i))

    for i, res in enumerate(res_phase_1):
        self.assertEqual(
            len(res), 5,
            "error in task #{} of phase 1: {} != 5".format(i, len(res)))

    for i, res in enumerate(res_phase_2):
        self.assertEqual(
            len(res), i + 5,
            "error in task #{} of phase 2: {} != {}".format(
                i, len(res), i + 5))

    for i, res in enumerate(res_phase_3):
        self.assertEqual(
            len(res), 10,
            "error in task #{} of phase 3: {} != 10".format(i, len(res)))

    self.assertEqual(
        1, len(os.listdir(dir_t)),
        "directory has fewer or more files than 1: {}".format(
            len(os.listdir(dir_t))))

    shutil.rmtree(dir_t)

def wordcount_k_means(dim=742):
    """
    Cluster the text files in a directory by their word-count vectors with
    k-means, then report the most similar files within each cluster.
    """
    import numpy as np

    f_path = sys.argv[1]
    start_time = time.time()

    vocab = DDS().load_files_from_dir(f_path, num_of_parts=4)\
        .flat_map(lambda x: x[1].split()) \
        .map(lambda x: ''.join(e for e in x if e.isalnum())) \
        .count_by_value(arity=2, as_dict=True, as_fo=True)

    total = len(os.listdir(f_path))
    max_iter = 2
    frags = 4
    epsilon = 1e-10
    size = total / frags
    k = 4
    # The number of dimensions corresponds to: dim = len(vocabulary).
    # dim = 742 is a parameter to allow unit tests with different datasets.

    # To access file names by the indexes returned from the clusters
    # (load_files_from_list also sorts them alphabetically):
    indexes = [os.path.join(f_path, f) for f in sorted(os.listdir(f_path))]

    # step 2
    # wc_per_file = DDS().load_files_from_dir(files_path, num_of_parts=frags)\
    #     .map(__count_locally__, vocabulary)\
    #     .map(__gen_array__)
    wc_per_file = list()
    for fn in sorted(os.listdir(f_path)):
        wc_per_file.append(task_count_locally(os.path.join(f_path, fn),
                                              vocab))

    mu = [np.random.randint(1, 3, dim) for _ in range(frags)]
    old_mu = []
    clusters = []
    n = 0

    while n < max_iter and not has_converged(mu, old_mu, epsilon):
        old_mu = mu
        clusters = [
            cluster_points_partial([wc_per_file[f]], mu, int(f * size))
            for f in range(frags)
        ]
        partial_result = [
            partial_sum([wc_per_file[f]], clusters[f], int(f * size))
            for f in range(frags)
        ]
        mu = merge_reduce(reduce_centers, partial_result)
        mu = cwo(mu)
        mu = [mu[c][1] / mu[c][0] for c in mu]
        while len(mu) < k:
            # Add a new random center if one of the centers has no points.
            mu.append(np.random.randint(1, 3, dim))
        n += 1

    clusters_with_frag = cwo(clusters)

    from collections import defaultdict
    cluster_sets = defaultdict(list)

    for _d in clusters_with_frag:
        for _k in _d:
            cluster_sets[_k] += [indexes[i] for i in _d[_k]]

    # steps 4 and 5 combined
    sims_per_file = {}
    for k in cluster_sets:
        clus = cluster_sets[k]
        for fayl in clus:
            sims_per_file[fayl] = get_similar_files(fayl, clus)

    sims_per_file = cwo(sims_per_file)

    for k in list(sims_per_file.keys())[:10]:
        print(k, "-----------sims --------->", sims_per_file[k][:5])
        print("-----------------------------")

    print("Kmeans Timed {} (s)".format(time.time() - start_time))
    print("Iterations: ", n)

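# has_converged is referenced above but not defined in this listing. A
# plausible definition (an assumption, not necessarily the original one)
# compares successive centers against epsilon:
import numpy as np

def has_converged(mu, old_mu, epsilon):
    # Never converged before the first iteration.
    if not old_mu:
        return False
    # Converged when every center moved less than epsilon.
    return all(np.linalg.norm(np.asarray(m) - np.asarray(o)) < epsilon
               for m, o in zip(mu, old_mu))
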