def main(): data = {} G = gennum() with open("./southpark/All-seasons.csv","r") as file: r = csv.DictReader(file,delimiter=",") for row in r: data[G.next()] = (row["Character"], list(filter(lambda x: x != "", re.split(r"\W+", row["Line"].lower())) )) def mapfn(k, v): for w in v[1]: yield v[0], w def reducefn(k, vs): res = set() for word in vs: res.add(word) res = len(res) return res s = mincemeat.Server() s.datasource = data s.mapfn = mapfn s.reducefn = reducefn print("Server is running...") res = s.run_server(password="******") with open("./task1res.csv","wb") as file: w = csv.writer(file, delimiter=",") w.writerow(["Character","Number of words"]) for key in res: w.writerow([key, res[key]])
def run():
    data = {}
    k = 3
    # Start from k random centers in the unit square, rounded to 2 decimals.
    centers = [(round(random.uniform(0, 1), 2), round(random.uniform(0, 1), 2))
               for _ in range(k)]
    centers.sort()
    with open("kmeans_input.txt", "r") as f:
        lines = f.readlines()
        for line in lines:
            data[tuple([float(a) for a in line.split(" ")])] = centers

    # Iterate until the centers stop moving.
    while True:
        points = data.keys()
        server = mincemeat.Server()
        server.mapfn = map_kmeans
        server.reducefn = reduce_kmeans
        server.datasource = data
        res = server.run_server(password="******")

        new_centers = [a[0] for a in res.values()]
        new_centers.sort()
        if new_centers == centers:
            for cluster in res.values():
                for p in cluster[1]:
                    print("(%s %s)\t(%s %s)" % (p[0], p[1],
                                                cluster[0][0], cluster[0][1]))
            break
        else:
            centers = new_centers
            for p in points:
                data[p] = centers

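# run() above relies on map_kmeans and reduce_kmeans defined elsewhere.  A
# minimal sketch consistent with how the results are consumed (each reduce
# value is a (new_center, points) pair) might look like the following; the
# exact definitions and the 2-decimal rounding are assumptions.
def map_kmeans(point, centers):
    # Assign the point to its nearest current center.
    nearest = min(centers,
                  key=lambda c: (c[0] - point[0]) ** 2 + (c[1] - point[1]) ** 2)
    yield nearest, point

def reduce_kmeans(center, points):
    # Recompute the center as the mean of its points, rounded like the
    # initial centers so the convergence comparison can succeed.
    n = len(points)
    new_center = (round(sum(p[0] for p in points) / n, 2),
                  round(sum(p[1] for p in points) / n, 2))
    return new_center, points
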
def server(credentials):
    """
    Run a Map-Reduce Server, and process a single Map-Reduce
    """
    s = mincemeat.Server()
    s.datasource = datasource
    s.mapfn = mapfn
    s.collectfn = collectfn
    s.reducefn = reducefn
    s.finishfn = finishfn

    results = s.run_server(**credentials)

    # Map-Reduce over 'datasource' complete.  Enumerate results,
    # ordered both lexicographically and by count
    bycount = {}
    for k, v in results.items():
        if v in bycount:
            bycount[v].append(k)
        else:
            bycount[v] = [k]

    bycountlist = []
    for k, l in sorted(bycount.items()):
        for w in sorted(l):
            bycountlist.append((k, w))

    for k, lt in zip(sorted(results.keys()), bycountlist):
        print "%8d %-40.40s %8d %s" % (results[k], k, lt[0], lt[1])

def test_bind():
    """
    Tests that socket binding exclusion works.
    """
    global testcount
    testcount += 1
    port = unique_port(mincemeat.DEFAULT_PORT)
    cred = credentials.copy()
    cred.update({"port": port})

    s1 = mincemeat.Server_daemon(credentials=cred, timeout=5.)
    state = s1.state()
    assert state == "idle"
    s1.start()
    time.sleep(1)
    state = s1.state()
    assert state == "authenticated"

    try:
        s2 = mincemeat.Server()
        s2.conn(**cred)
        assert False == "Should have thrown Exception in bind()!"
    except Exception, e:
        assert "Only one usage of each socket address" in str(e) \
            or "Address already in use" in str(e)

def test_example():
    """
    Tests a scaled-up version of example.py.  Starts 1-5 Client
    threads, and scales up the text corpus a bit, proportional to the
    number of threads we choose.
    """
    global testcount
    testcount += 1
    port = unique_port(mincemeat.DEFAULT_PORT)
    clients = random.randint(1, 5)
    scale = clients * 73

    # Since we are running multiple asyncore-based Clients and a
    # Server in separate threads, we need to specify map={} for the
    # Clients, so they all don't use the (default) global asyncore
    # socket map as the Server...
    logging.info("Starting %d clients...", clients)
    for _ in xrange(clients):
        c = mincemeat.Client(map={})
        t = threading.Timer(1.0, c.conn, args=("", port),
                            kwargs={"password": "******"})
        t.daemon = True
        t.start()

    s = mincemeat.Server(map={})
    s.datasource = dict(enumerate(data * scale))
    s.mapfn = mapfn
    s.reducefn = reducefn

    now = mincemeat.timer()
    results = s.run_server(password="******", port=port)

    expected = dict(
        (k, v * scale)
        for k, v in {
            'All': 1, "Couldn't": 1, 'Dumpty': 2, 'Humpty': 3, "King's": 2,
            'a': 2, 'again': 1, 'all': 1, 'and': 1, 'fall': 1, 'great': 1,
            'had': 1, 'horses': 1, 'men': 1, 'on': 1, 'put': 1, 'sat': 1,
            'the': 2, 'together': 1, 'wall': 1
        }.iteritems())
    assert results == expected

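# test_example() relies on module-level data, mapfn and reducefn that are not
# shown in this excerpt.  A minimal sketch consistent with the expected word
# counts (essentially mincemeat's stock example.py) is given below; the exact
# definitions used by the test module are an assumption.
data = ["Humpty Dumpty sat on a wall",
        "Humpty Dumpty had a great fall",
        "All the King's horses and all the King's men",
        "Couldn't put Humpty together again"]

def mapfn(k, v):
    # Emit each whitespace-separated word with a count of 1.
    for w in v.split():
        yield w, 1

def reducefn(k, vs):
    # Total occurrences of the word across all lines.
    return sum(vs)
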
def start(self, display):
    # start the server
    s = mincemeat.Server()
    s.datasource = self.source
    s.mapfn = self.mapfn
    s.reducefn = self.reducefn
    results = s.run_server(password="******")
    display(results)

def run_server(csv_file):
    rdr = dict(enumerate(csv.DictReader(csv_file)))
    s = mm.Server()
    s.datasource = rdr
    s.mapfn = mapfn
    s.reducefn = reducefn
    return s.run_server()

def run_server(): print "Starting up" s = mincemeat.Server() print "Prep data" s.datasource = dict(enumerate(data)) s.mapfn = mapfn s.reducefn = reducefn print "starting server" results = s.run_server(password="******")
def main():
    server = mincemeat.Server()
    data = get_data(mapsize=10000000, nummaps=1000)
    log.info('data: %s', data)
    log.info('waiting for workers...')
    server.datasource = data
    server.mapfn = mapfn
    server.reducefn = reducefn
    results = server.run_server(password='******')
    inside, total = results['totals']
    print(results, inside, total)
    print('{0}: {1} inside, {2} total, pi ~= {3}'.format(
        'totals', inside, total, 4. * inside / total))

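# main() above estimates pi by Monte Carlo sampling, but get_data, mapfn and
# reducefn live elsewhere.  A plausible sketch, assuming each map task draws a
# fixed number of random points in the unit square and everything is combined
# under the single key 'totals' (the exact originals may differ):
def get_data(mapsize, nummaps):
    # One datum per map task: how many points that task should sample.
    return dict(enumerate([mapsize // nummaps] * nummaps))

def mapfn(k, samples):
    # Imports happen inside the function because mincemeat ships only the
    # function's code object to the workers.
    import random
    inside = 0
    for _ in range(samples):
        x, y = random.random(), random.random()
        if x * x + y * y <= 1.0:
            inside += 1
    yield 'totals', (inside, samples)

def reducefn(k, vs):
    # Sum the (inside, total) pairs from all map tasks.
    return (sum(v[0] for v in vs), sum(v[1] for v in vs))
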
def run_server(docs, docnames):
    rdr = {docname: readall(fname).translate(None, string.punctuation)
           for fname, docname in zip(docs, docnames)}
    s = mm.Server()
    s.datasource = rdr
    s.mapfn = mapfn
    s.reducefn = eval(reducefn_template % docnames)
    return s.run_server()

def run_server(mat_file):
    matrices = list(csv.DictReader(mat_file))
    matsize = max(int(x['row']) for x in matrices) + 1
    s = mm.Server()
    s.datasource = {(x['matrix'] == 'a', int(x['row']), int(x['col'])):
                    (matsize, int(x['value']))
                    for x in matrices}
    s.mapfn = mapfn
    s.reducefn = reducefn
    return s.run_server()

def main():
    # Matrix multiplication C = A * B, with each result cell reduced mod 97.
    data = {}
    G = gennum()
    m, k, n = sys.argv[1:4]
    with open("m1.csv", "r") as file:
        r = csv.DictReader(file, delimiter=",")
        for row in r:
            new_key = G.next()
            data[new_key] = row
            data[new_key]["m"] = int(m)
            data[new_key]["k"] = int(k)
            data[new_key]["n"] = int(n)

    def mapfn(k, v):
        # Emit each element of A and B under every output cell it contributes to.
        if v["matrix"] == "a":
            for i in range(v["n"]):
                yield (int(v["row"]), i), (int(v["col"]), int(v["val"]))
        else:
            for i in range(v["m"]):
                yield (i, int(v["col"])), (int(v["row"]), int(v["val"]))

    def reducefn(k, vs):
        # Pair the values that share an inner index and sum their products.
        d = {}
        for v in vs:
            if v[0] in d:
                d[v[0]].append(v[1])
            else:
                d[v[0]] = [v[1]]
        res = 0
        for key in d:
            res += d[key][0] * d[key][1]
        return res % 97

    s = mincemeat.Server()
    s.datasource = data
    s.mapfn = mapfn
    s.reducefn = reducefn
    print("Server is running...")
    res = s.run_server(password="******")

    with open("task3res.csv", "wb") as file:
        w = csv.writer(file, delimiter=",")
        w.writerow(["matrix", "row", "col", "val"])
        for key in res:
            w.writerow(["c", key[0], key[1], res[key]])

def main():
    s = mincemeat.Server()
    data = {f: d for f, d in read_all_files(False)}
    # print(data.keys())

    # The data source can be any dictionary-like object
    s.datasource = data
    s.mapfn = mapfn
    s.reducefn = reducefn

    results = s.run_server(password="******")
    results = sorted(results.items(), key=operator.itemgetter(1), reverse=True)
    print(results)
    with open('sorted.txt', 'w') as f:
        f.write('\n'.join('%s\t%d' % result for result in results))

def run(): data = {} with open("check_clique_input.txt", "r") as f: for line in f.readlines(): temp = line.split(" -> ") data[temp[0].strip()] = temp[1].strip().split(" ") server = mincemeat.Server() server.mapfn = clique_map server.reducefn = clique_reduce server.datasource = data res = server.run_server(password="******") edge_lists_len = res.values() vertex_num = len(set(res.keys())) print("YES" if len(set(edge_lists_len)) == 1 and edge_lists_len[0] == vertex_num - 1 else "NO")
def main():
    with open(DATA_FILE, "r") as f:
        data = map(str.strip, f.readlines())

    s = mincemeat.Server()
    s.mapfn = map_func_phase_one
    s.reducefn = reduce_func_phase_one
    s.datasource = dict(enumerate(data))
    results = s.run_server(password=SERVER_PASSWORD, port=SERVER_PORT)

    results = filter(None, results.values())
    s.mapfn = map_func_phase_two
    s.reducefn = reduce_func_phase_two
    s.datasource = dict(enumerate(results))
    results = s.run_server(password=SERVER_PASSWORD, port=SERVER_PORT)

    for key, num_of_ref in results.items():
        if num_of_ref:
            print("%s (%s)" % (key, num_of_ref))

def mapreduce_pageranks(self, clients=8):
    # The Clients are started in separate processes (not threads), so each
    # gets its own asyncore socket map and cannot interfere with the
    # Server's, unlike the threaded case where map={} must be passed.
    logging.info("Starting %d clients...", clients)
    for _ in xrange(clients):
        c = mincemeat.Client()
        c.password = "******"
        p = Process(target=c.conn, args=("", mincemeat.DEFAULT_PORT))
        p.start()

    s = mincemeat.Server()
    s.datasource = self.datasource()
    s.mapfn = self.mapfn
    s.reducefn = self.reducefn
    result = s.run_server(password="******")
    return result

def main():
    # Per-book word counts over every text in ./sherlock.
    data = {}
    directory = "./sherlock"
    allbooks = os.listdir(directory)
    numbooks = len(allbooks)
    for book in allbooks:
        name = os.path.join(directory, book)
        with open(name, "r") as file:
            data[book] = list(filter(lambda x: x != "",
                                     re.split(r"\W+", file.read().lower())))

    def mapfn(k, v):
        # Emit (word, (book, 1)) for every word of the book.
        for w in v:
            yield w, (k, 1)

    def reducefn(k, vs):
        # Build {book: count} for the word.
        res = {}
        for a, i in vs:
            if a in res:
                res[a] += i
            else:
                res[a] = i
        return res

    s = mincemeat.Server()
    s.datasource = data
    s.mapfn = mapfn
    s.reducefn = reducefn
    print("Server is running...")
    res = s.run_server(password="******")

    with open("./task2res.csv", "wb") as file:
        w = csv.DictWriter(file, fieldnames=["Word"] + allbooks,
                           delimiter=",", restval=0)
        w.writeheader()
        for key in res:
            res[key]["Word"] = key
            w.writerow(res[key])

def server(credentials, asynchronous=False, map=None):
    """
    Run a Map-Reduce Server, and process a single Map-Reduce task.
    Raises exception on failure to create and run a Server, or
    complete the task successfully.

    If asynchronous, does not initiate processing; use s.process().
    After processing, call s.results().
    """
    s = mincemeat.Server(map=map)
    s.datasource = datasource
    s.mapfn = mapfn
    s.collectfn = collectfn
    s.reducefn = reducefn
    s.finishfn = finishfn

    s.conn(asynchronous=asynchronous, **credentials)
    return s

def main():
    with open(CENTROIDS_FILE, "wb") as f:
        centroids = generate_centroids(K)
    with open(DATA_FILE, "rb") as f:
        data = [map(float, x.strip().split(',')) for x in f.readlines()]

    s = mincemeat.Server()
    s.mapfn = map_func
    s.reducefn = reduce_func

    old_results = list()
    results = list()
    while not old_results or old_results != results:
        plt.scatter(*np.array(data).T)
        plt.scatter(*np.array(centroids).T, color="g", s=500)
        plt.show()

        old_results = results
        s.datasource = dict(enumerate(zip((centroids, ) * len(data), data)))
        results = s.run_server(password=SERVER_PASSWORD, port=SERVER_PORT)
        centroids = results.values()
        print(results)

def run(): data = {} with open("pseudo_synonyms_input.txt", "r") as f: for idx, line in enumerate(f.readlines()): data[idx] = line.strip() server = mincemeat.Server() server.mapfn = ps1_map server.reducefn = ps1_reduce server.datasource = data res1 = server.run_server(password="******") server.mapfn = ps2_map server.reducefn = ps2_reduce server.datasource = res1 res2 = server.run_server(password="******") for key, val in res2.iteritems(): if val > 1: print("%s - %s (%s)" % (key[0], key[1], val))
def reducefn(k, v):
    # Function header and the initial total are reconstructed from context;
    # the excerpt begins mid-function.  Sums the 'Vendas' values and keeps
    # the branch ('Filial') name for the output.
    total = 0
    for index, item in enumerate(v):
        columns = item.split(':')
        if columns[0] == 'Vendas':
            total += int(columns[1])
        if columns[0] == 'Filial':
            NomeFilial = columns[1]
    L = list()
    L.append(NomeFilial + ' , ' + str(total))
    return L

# Turn every file into a "key/value" structure (file_name/file_content).
source = dict(
    (file_name, file_contents(file_name)) for file_name in text_files)

s = mincemeat.Server()
s.datasource = source
s.mapfn = mapfn
s.reducefn = reducefn

results = s.run_server(password="******")

# Write the result to a CSV file.
w = csv.writer(open(PATH + 'result.csv', 'w'))
for k, v in results.items():
    w.writerow([
        k,
        str(v).replace('[', '').replace(']', '').replace("'", '').replace(' ', '')
    ])