def test_loader_functions():
    # Iterator loader
    dds = DDS().load(range(10)).collect()
    assert len(dds) == 10

    # Single file
    content = "Holala World!"
    desc, filename = tempfile.mkstemp()
    with open(filename, "wb") as tmp_f:
        tmp_f.write(bytearray(content, "utf8"))
    dds = DDS().load_file(filename).collect()
    os.remove(filename)
    assert dds[0] == content

    # Multiple files
    _dir = os.path.join(tempfile.gettempdir(), "multi")
    if not os.path.exists(_dir):
        os.mkdir(_dir)
    for _i in range(10):
        desc, filename = tempfile.mkstemp(dir=_dir)
        file_path = os.path.join(_dir, filename)
        with open(file_path, "wb") as tmp_f:
            tmp_f.write(bytearray(str(_i), "utf8"))
    dds = DDS().load_files_from_dir(_dir).collect()
    shutil.rmtree(_dir)

    dds = sorted(dds, key=lambda x: x[1])
    for _i in range(10):
        assert int(dds[_i][1]) == _i


def test_word_count():
    # Word Count
    vocabulary = ["Holala", "World", "COMPSs", "Lorem", "Ipsum", "_filter_"]
    _dir = os.path.join(tempfile.gettempdir(), "wc")
    if not os.path.exists(_dir):
        os.mkdir(_dir)
    for _i in range(5):
        desc, filename = tempfile.mkstemp(dir=_dir, suffix=".txt")
        file_path = os.path.join(_dir, filename)
        with open(file_path, "w") as tmp_f:
            for word in vocabulary:
                tmp_f.write(word + " ")

    dds = DDS().load_files_from_dir(_dir) \
        .flat_map(lambda x: x[1].split()) \
        .filter(lambda x: "_" not in x) \
        .count_by_value(as_dict=True)
    shutil.rmtree(_dir)

    for key, value in dds.items():
        assert "_" not in key
        assert value == 5


def inverted_indexing():
    path = sys.argv[1]
    start_time = time.time()
    result = DDS().load_files_from_dir(path).map_and_flatten(_invert_files) \
        .reduce_by_key(lambda a, b: a + b).collect()
    print(result[-1:])
    print("Elapsed Time {} (s)".format(time.time() - start_time))


def test_terasort():
    dataset = [gen_fragment() for _ in range(10)]
    dds = DDS().load(dataset, -1).sort_by_key().collect()

    prev = 0
    for key, _value in dds:
        assert key > prev
        prev = key


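# The test above relies on a `gen_fragment` helper that is not shown in this
# section. A minimal sketch, assuming a fragment is just a list of random
# (key, value) pairs with positive numeric keys (the name of the parameter and
# the value ranges are illustrative, not the project's actual generator):
def gen_fragment(num_pairs=10):
    """Generate one fragment: a list of random (key, value) pairs."""
    import random
    # Keys are drawn from a wide range so duplicate keys (which would break
    # the strictly-increasing check in test_terasort) are unlikely.
    return [(random.randint(1, 10 ** 9), random.randint(0, 1000))
            for _ in range(num_pairs)]

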
def inverted_indexing():
    """ Build an inverted index: map each word to the files that contain it. """
    path = sys.argv[1]
    start_time = time.time()
    result = DDS().load_files_from_dir(path).flat_map(_invert_files) \
        .reduce_by_key(lambda a, b: a + b).collect()
    print(result[-1:])
    print("Elapsed Time {} (s)".format(time.time() - start_time))


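# Both inverted_indexing() variants above call an `_invert_files` helper that
# is only defined inline in the test version further below. An equivalent
# module-level sketch, mirroring that inline definition (it receives the
# (file_name, content) pairs produced by load_files_from_dir):
def _invert_files(pair):
    """Map a (file_name, content) pair to (word, [file_name]) pairs."""
    res = dict()
    for word in pair[1].split():
        res[word] = [pair[0]]
    return res.items()

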
def word_count():
    path_file = sys.argv[1]
    start = time.time()
    results = DDS().load_files_from_dir(path_file) \
        .map_and_flatten(lambda x: x[1].split()) \
        .count_by_value(arity=4, as_dict=True)
    print("Elapsed Time: ", time.time() - start)


def pi_estimation():
    """
    Example is taken from: https://spark.apache.org/examples.html
    """
    print("Estimating Pi by 'throwing darts' algorithm.")
    tries = 100000
    print("Number of tries: {}".format(tries))

    count = DDS().load(range(0, tries), 10) \
        .filter(inside).count()
    print("Pi is roughly %f" % (4.0 * count / tries))


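# pi_estimation() filters on an `inside` predicate that is not shown in this
# section. A minimal sketch following the Spark example cited in the
# docstring (the argument is ignored; each call throws one random "dart"):
def inside(_):
    """Return True if a random point in the unit square lands inside the
    quarter circle of radius 1."""
    import random
    x, y = random.random(), random.random()
    return x * x + y * y < 1

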
def transitive_closure(partitions=None):
    """ Compute the transitive closure of a randomly generated directed graph. """
    if not partitions:
        partitions = int(sys.argv[2]) if len(sys.argv) > 2 else 2

    # Commented out code for unknown reason:
    # path = sys.argv[1]
    # od = DDS().load_text_file(path, partitions) \
    #     .map(lambda line: (int(line.split(",")[0]), int(line.split(",")[1]))) \
    #     .collect(future_objects=True)

    edges = _generate_graph()
    od = DDS().load(edges, partitions).collect(future_objects=True)

    # Because join() joins on keys, the edges are stored in reversed order.
    edges = DDS().load(od, -1).map(lambda x_y: (x_y[1], x_y[0]))

    next_count = DDS().load(od, -1).count()

    while True:
        old_count = next_count
        # Perform the join, obtaining an RDD of (y, (z, x)) pairs,
        # then project the result to obtain the new (x, z) paths.
        new_edges = DDS().load(od, -1).join(edges) \
            .map(lambda __a_b: (__a_b[1][1], __a_b[1][0]))
        od = DDS().load(od, -1).union(new_edges).distinct() \
            .collect(future_objects=True)

        next_count = DDS().load(od, -1).count()

        if next_count == old_count:
            break

    print("TC has %i edges" % next_count)


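# transitive_closure() starts from a `_generate_graph` helper that is not
# shown in this section. A minimal sketch, assuming the graph is a small
# random set of (source, destination) integer edges (vertex and edge counts
# are illustrative defaults, not the project's actual values):
def _generate_graph(num_edges=10, num_vertices=5):
    """Generate a random directed graph as a set of (src, dst) edges."""
    import random
    edges = set()
    while len(edges) < num_edges:
        src = random.randrange(0, num_vertices)
        dst = random.randrange(0, num_vertices)
        if src != dst:
            edges.add((src, dst))
    return edges

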
def word_count():
    """ Count the occurrences of each (alphanumeric-only) word in all files of a directory. """
    path_file = sys.argv[1]
    start = time.time()
    results = DDS().load_files_from_dir(path_file) \
        .flat_map(lambda x: x[1].split()) \
        .map(lambda x: ''.join(e for e in x if e.isalnum())) \
        .count_by_value(as_dict=True)
    print("Results: " + str(results))
    print("Elapsed Time: ", time.time() - start)


def terasort():
    """ Sort the (key, value) records found in a directory and save the result as text files. """
    dir_path = sys.argv[1]
    dest_path = sys.argv[2]
    # partitions = sys.argv[2] if len(sys.argv) > 2 else -1
    start_time = time.time()
    dds = DDS().load_files_from_dir(dir_path) \
        .map_and_flatten(files_to_pairs) \
        .sort_by_key().save_as_text_file(dest_path)

    # compss_barrier()
    # test = DDS().load_pickle_files(dest_path).map(lambda x: x).collect()
    # print(test[-1:])

    print("Elapsed Time {} (s)".format(time.time() - start_time))


def terasort():
    """ Sort the (key, value) records found in a directory and save the result as text files. """
    dir_path = sys.argv[1]
    dest_path = sys.argv[2]
    # Commented out code for unknown reason:
    # partitions = sys.argv[2] if len(sys.argv) > 2 else -1
    start_time = time.time()
    dds = DDS().load_files_from_dir(dir_path) \
        .flat_map(files_to_pairs) \
        .sort_by_key().save_as_text_file(dest_path)

    # Commented out code for unknown reason:
    # compss_barrier()
    # test = DDS().load_pickle_files(dest_path).map(lambda x: x).collect()
    # print(test[-1:])

    print("Result: " + str(dds))
    print("Elapsed Time {} (s)".format(time.time() - start_time))


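# Both terasort() variants above flatten each input file into (key, value)
# records via a `files_to_pairs` helper that is not shown in this section.
# A minimal sketch, assuming every non-empty line of a file holds one
# comma-separated "key,value" record (the record format is an assumption,
# not the project's actual terasort layout):
def files_to_pairs(pair):
    """Split a (file_name, content) pair into a list of (key, value) tuples."""
    tuples = []
    for line in pair[1].splitlines():
        if not line.strip():
            continue
        key, value = line.split(",", 1)
        tuples.append((key, value))
    return tuples

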
def inverted_indexing():

    def _invert_files(pair):
        res = dict()
        for word in pair[1].split():
            res[word] = [pair[0]]
        return res.items()

    vocabulary = ["Holala", "World", "COMPSs", "Lorem", "Ipsum"]
    files = list()
    pairs = defaultdict(list)
    _dir = os.path.join(tempfile.gettempdir(), "ii")
    if not os.path.exists(_dir):
        os.mkdir(_dir)

    # Create Files
    for _i in range(len(vocabulary) // 2):
        desc, filename = tempfile.mkstemp(dir=_dir, suffix=".txt")
        files.append(filename)

    for word in vocabulary:
        _files = random.sample(files, 2)
        for _f in _files:
            file_path = os.path.join(_dir, _f)
            with open(file_path, "a") as tmp_f:
                tmp_f.write(word + " ")
            pairs[word].append(file_path)

    result = DDS().load_files_from_dir(_dir).flat_map(_invert_files) \
        .reduce_by_key(lambda a, b: a + b).collect()
    shutil.rmtree(_dir)

    for word, files in result:
        assert set(pairs[word]).issubset(set(files))


def wordcount_k_means(dim=742):
    """ Cluster text files with k-means over per-file word-count vectors and
    report similar files within each cluster. """
    import numpy as np

    f_path = sys.argv[1]

    start_time = time.time()

    vocab = DDS().load_files_from_dir(f_path, num_of_parts=4) \
        .flat_map(lambda x: x[1].split()) \
        .map(lambda x: ''.join(e for e in x if e.isalnum())) \
        .count_by_value(arity=2, as_dict=True, as_fo=True)

    total = len(os.listdir(f_path))
    max_iter = 2
    frags = 4
    epsilon = 1e-10
    size = total / frags
    k = 4
    # The number of dimensions corresponds to: dim = len(vocabulary)
    # dim = 742  # added as parameter to allow unittests with different dataset

    # To access file names by index returned from the clusters.
    # load_files_from_list will also sort them alphabetically.
    indexes = [os.path.join(f_path, f) for f in sorted(os.listdir(f_path))]

    # step 2
    # wc_per_file = DDS().load_files_from_dir(files_path, num_of_parts=frags) \
    #     .map(__count_locally__, vocabulary) \
    #     .map(__gen_array__)

    wc_per_file = list()
    for fn in sorted(os.listdir(f_path)):
        wc_per_file.append(task_count_locally(os.path.join(f_path, fn), vocab))

    mu = [np.random.randint(1, 3, dim) for _ in range(frags)]

    old_mu = []
    clusters = []
    n = 0

    while n < max_iter and not has_converged(mu, old_mu, epsilon):
        old_mu = mu
        clusters = [
            cluster_points_partial([wc_per_file[f]], mu, int(f * size))
            for f in range(frags)
        ]
        partial_result = [
            partial_sum([wc_per_file[f]], clusters[f], int(f * size))
            for f in range(frags)
        ]

        mu = merge_reduce(reduce_centers, partial_result)
        mu = cwo(mu)
        mu = [mu[c][1] / mu[c][0] for c in mu]
        while len(mu) < k:
            # Add a new random center if one of the centers has no points.
            mu.append(np.random.randint(1, 3, dim))
        n += 1

    clusters_with_frag = cwo(clusters)

    from collections import defaultdict
    cluster_sets = defaultdict(list)

    for _d in clusters_with_frag:
        for _k in _d:
            cluster_sets[_k] += [indexes[i] for i in _d[_k]]

    # step 4 and 5 combined
    sims_per_file = {}
    for k in cluster_sets:
        clus = cluster_sets[k]
        for fayl in clus:
            sims_per_file[fayl] = get_similar_files(fayl, clus)

    sims_per_file = cwo(sims_per_file)

    for k in list(sims_per_file.keys())[:10]:
        print(k, "-----------sims --------->", sims_per_file[k][:5])
        print("-----------------------------")

    print("Kmeans Time {} (s)".format(time.time() - start_time))
    print("Iterations: ", n)


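# wordcount_k_means() depends on several helpers that are not part of this
# section (task_count_locally, cluster_points_partial, partial_sum,
# merge_reduce, reduce_centers, get_similar_files, has_converged; `cwo` is
# presumably an alias for PyCOMPSs' compss_wait_on). As one illustration, a
# minimal sketch of the convergence check, assuming `mu` and `old_mu` are
# lists of NumPy centroid vectors and convergence means the total centroid
# movement drops below `epsilon`:
def has_converged(mu, old_mu, epsilon):
    """Return True when the centroids moved less than epsilon in total."""
    import numpy as np
    if not old_mu:
        # First iteration: nothing to compare against yet.
        return False
    movement = sum(np.linalg.norm(np.asarray(m) - np.asarray(o))
                   for m, o in zip(mu, old_mu))
    return movement < epsilon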