def pearson(X, Y, mx, my):
    """Build the reduction for a correlation-style coefficient of X and Y.

    Every fragment of ``X`` and ``Y`` is passed through ``_norm`` with the
    corresponding ``mx`` / ``my`` value, the results are squared with
    ``_pow``, and three merge reductions (all using ``reduce_add``) are
    combined by ``op_task``.

    :param X: list of fragments of the first variable.
    :param Y: list of fragments of the second variable; fragments are paired
        with those of ``X`` through ``zip``, so extra fragments on either
        side are ignored.
    :param mx: value handed to ``_norm`` for every fragment of ``X``
        (presumably the mean of X -- confirm against the caller).
    :param my: value handed to ``_norm`` for every fragment of ``Y``
        (presumably the mean of Y -- confirm against the caller).
    :return: the result of ``op_task(sum_x, sum_y, suma)``.
    """
    xs = [_norm(x, mx) for x in X]
    ys = [_norm(y, my) for y in Y]
    xxs = [_pow(x, 2) for x in xs]
    yys = [_pow(y, 2) for y in ys]

    # Reduction over the pairwise fragment products.
    suma = mergeReduce(reduce_add, [multFrag(a, b) for a, b in zip(xs, ys)])

    # Real lists are built here instead of map(): on Python 3 map() returns a
    # lazy iterator without len() or indexing, which a list-based merge
    # reduction cannot consume.
    sum_x = mergeReduce(reduce_add, [_add(x) for x in xxs])
    sum_y = mergeReduce(reduce_add, [_add(y) for y in yys])

    return op_task(sum_x, sum_y, suma)
def kmeans_frag(numV, k, dim, epsilon, maxIterations, numFrag):
    """Generate fragmented points and iterate k-means over them.

    ``numFrag`` fragments of ``int(numV / numFrag)`` points each are created
    with ``genFragment``; centers start from ``init_random(dim, k)`` and are
    refined until ``has_converged`` returns a true value. Generation time,
    the centers of every iteration and the total loop time are printed.

    :param numV: total number of points to generate.
    :param k: number of centers requested from ``init_random``.
    :param dim: dimension passed to ``genFragment`` and ``init_random``.
    :param epsilon: threshold forwarded to ``has_converged``.
    :param maxIterations: iteration limit forwarded to ``has_converged``.
    :param numFrag: number of fragments.
    :return: tuple ``(iterations_done, centers)``.
    """
    import time
    from pycompss.api.api import compss_wait_on

    points_per_frag = int(numV / numFrag)

    tic = time.time()
    fragments = [genFragment(points_per_frag, dim) for _ in range(numFrag)]
    print("Points generation Time {} (s)".format(time.time() - tic))

    mu = init_random(dim, k)
    previous_mu = []
    iterations = 0

    tic = time.time()
    while not has_converged(mu, previous_mu, epsilon, iterations, maxIterations):
        previous_mu = mu

        # All assignment calls are issued first, then all partial sums.
        assignments = []
        for idx in range(numFrag):
            assignments.append(
                cluster_points_partial(fragments[idx], mu, idx * points_per_frag)
            )
        partial_sums = []
        for idx in range(numFrag):
            partial_sums.append(
                partial_sum(fragments[idx], assignments[idx], idx * points_per_frag)
            )

        merged = compss_wait_on(mergeReduce(reduceCentersTask, partial_sums))
        # Each merged entry is indexable; element 1 is divided by element 0
        # (presumably sum / count -- confirm against reduceCentersTask).
        mu = [merged[key][1] / merged[key][0] for key in merged]
        print(mu)
        iterations += 1

    print("Kmeans Time {} (s)".format(time.time() - tic))
    return (iterations, mu)
def std(X, m, n, wait=False):
    """Reduce the squared, ``_norm``-adjusted fragments of X to one value.

    Each fragment goes through ``_norm(fragment, m)``, then ``_pow(..., 2)``,
    then ``_mean(..., n)``; the per-fragment results are merged with
    ``reduce_add``. No square root is applied in this function.

    :param X: list of data fragments.
    :param m: value handed to ``_norm`` (presumably the mean of X -- confirm).
    :param n: value handed to ``_mean`` (presumably the total element
        count -- confirm).
    :param wait: when true, synchronise with ``compss_wait_on`` before
        returning. Default False.
    :return: the merged result, synchronised only if ``wait`` is true.
    """
    centred = [_norm(fragment, m) for fragment in X]
    squared = [_pow(fragment, 2) for fragment in centred]
    partials = [_mean(fragment, n) for fragment in squared]
    merged = mergeReduce(reduce_add, partials)

    if not wait:
        return merged

    from pycompss.api.api import compss_wait_on
    return compss_wait_on(merged)
def waze_jams(trainfile, hypers, Ntrain, script, gridsList, grid, numFrag, output):
    """Run the GP forecasting step for every valid grid cell.

    Notes carried over from the original documentation:

    * ``prepare()`` gathers both the data used for hyperparameter learning
      and inference and the information about the GP prior distribution.
    * ``trainGP()`` outputs two items per cell: forecasts and hypers. The
      forecasts are a Tx2 matrix with predictive mean and variance, where T
      is the number of time intervals required for testing. Predictions lie
      in [-1, +1]: values closer to -1 indicate a greater probability of
      label -1, values closer to +1 the opposite. They can be turned into
      probabilities by mapping them onto [0, 1]. The hypers item is a vector
      with the learned hyperparameters.

    :param trainfile: training data, forwarded to ``prepare``.
    :param hypers: when non-empty, forwarded to ``load_hypers`` and ``GP`` is
        run for each fragment; when empty, ``GP_hyper`` is run instead and
        the merged results are written to ``<output>hypers_<timestamp>.txt``.
    :param Ntrain: forwarded to ``prepare``.
    :param script: forwarded to ``GP`` / ``GP_hyper``.
    :param gridsList: path of a comma-separated file with one header row;
        columns 4 and 5 are read and the column-4 values of the rows whose
        column 5 equals 1 are kept as the valid grids.
    :param grid: ``-1`` to process every valid grid (split into ``numFrag``
        parts) or a single grid id, which must be among the valid ones.
    :param numFrag: number of fragments used when ``grid == -1``.
    :param output: prefix of every output file name.
    :return: None. Returns early, after printing a message, when ``grid`` is
        not a valid grid.
    """
    import time
    timestr = str(time.strftime("%Y%m%d_%Hh"))

    gridsList = np.loadtxt(gridsList, delimiter=',', dtype=(int,int), skiprows=1, usecols = (4,5))
    gridsList = gridsList[:,0][gridsList[:, 1] == 1]
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("[INFO] - {} valid grids".format(len(gridsList)))

    config = prepare(trainfile, Ntrain)

    if grid == -1:
        frag_cells = np.array_split(gridsList, numFrag)
    elif grid in gridsList:
        frag_cells = np.array([[grid]])
    else:
        print("[INFO] - Grid #{} is not valid".format(grid))
        return

    # One forecast file per fragment.
    output_forecast = ['{}forecasts_part{}_{}.txt'.format(output, f, timestr)
                       for f in range(len(frag_cells))]

    if len(hypers) > 0:
        frag_cells = [load_hypers(hypers, cells) for cells in frag_cells]
        for cells, forecast_file in zip(frag_cells, output_forecast):
            GP(script, config, cells, forecast_file)
    else:
        output_hyper = [GP_hyper(script, config, cells, forecast_file)
                        for cells, forecast_file in zip(frag_cells, output_forecast)]
        results = mergeReduce(mergelists, output_hyper)

        from pycompss.api.api import compss_wait_on
        results = compss_wait_on(results)
        np.savetxt(
            '{}hypers_{}.txt'.format(output, timestr),
            np.asarray(results),
            delimiter=',',
            fmt="%i,%f,%f,%f,%f,%f,%f,%f")
def mean(X, wait=False):
    """Arithmetic mean of chunked data.

    The size obtained from ``_list_lenght(X)`` is passed, together with each
    chunk, to ``_mean``; the per-chunk results are merged with
    ``reduce_add``.

    :param X: chunked data.
    :param wait: when true, synchronise with ``compss_wait_on`` before
        returning. Default False.
    :return: mean of X, synchronised only if ``wait`` is true.
    """
    total = _list_lenght(X)
    partials = [_mean(chunk, total) for chunk in X]
    merged = mergeReduce(reduce_add, partials)

    if not wait:
        return merged

    from pycompss.api.api import compss_wait_on
    return compss_wait_on(merged)
def kmeans(data, k, numFrag=-1, maxIterations=10, epsilon=1e-4, initMode='random'):
    """Run Lloyd's k-means iteration over fragmented data.

    Starting from the initial centers produced by ``init``, each input point
    is repeatedly assigned to its nearest center and the centers are then
    recomputed from that assignment. This local search, called Lloyd's
    iteration, continues until the solution does not change between two
    consecutive rounds or the iteration count exceeds ``maxIterations``
    (the decision itself is taken by ``has_converged``).

    :param data: input data; a list of fragments when ``numFrag`` is -1,
        otherwise a sequence that is split with ``chunks``.
    :param k: number of centroids.
    :param numFrag: number of fragments; if -1 the data is considered
        already chunked.
    :param maxIterations: maximum number of iterations.
    :param epsilon: error threshold.
    :param initMode: initialisation mode forwarded to ``init``.
    :return: list of centroids.
    """
    from pycompss.api.api import compss_wait_on

    if numFrag == -1:
        # Data is already fragmented.
        numFrag = len(data)
    else:
        # Floor division keeps the chunk size an integer on Python 3 as well;
        # true division would hand a float to ``chunks``.
        data = list(chunks(data, len(data) // numFrag))

    mu = init(data, k, initMode)
    oldmu = []
    n = 0
    # Index offset handed to both helper tasks of a fragment; note that it is
    # derived from the number of fragments, not from the number of points.
    size = int(len(data) / numFrag)

    while not has_converged(mu, oldmu, epsilon, n, maxIterations):
        oldmu = list(mu)

        # Iterate over the fragments that actually exist: chunking may yield
        # more than ``numFrag`` pieces when the data does not divide evenly,
        # and indexing by range(numFrag) would silently drop the remainder.
        clusters = [
            cluster_points_partial(frag, mu, f * size)
            for f, frag in enumerate(data)
        ]
        partialResult = [
            partial_sum(frag, clusters[f], f * size)
            for f, frag in enumerate(data)
        ]

        mu = mergeReduce(reduceCentersTask, partialResult)
        mu = compss_wait_on(mu)
        # Element 1 of each merged entry divided by element 0
        # (presumably sum / count -- confirm against reduceCentersTask).
        mu = [mu[c][1] / mu[c][0] for c in mu]
        n += 1

    return mu
def mean(X, n, wait=False):
    """Merge the per-chunk ``_mean`` results of X with ``reduce_add``.

    :param X: chunked data.
    :param n: value handed to ``_mean`` with every chunk (presumably the
        total number of elements -- confirm against the caller).
    :param wait: when true, synchronise with ``compss_wait_on`` before
        returning. Default False.
    :return: the merged result, synchronised only if ``wait`` is true.
    """
    partials = [_mean(chunk, n) for chunk in X]
    merged = mergeReduce(reduce_add, partials)

    if not wait:
        return merged

    from pycompss.api.api import compss_wait_on
    return compss_wait_on(merged)
def test_mergeReduce(self):
    """mergeReduce with ``self.methodFunction`` must equal ``sum(self.data)``.

    The reduction result is synchronised with ``compss_wait_on`` before the
    comparison.
    """
    from pycompss.api.api import compss_wait_on

    reduced = compss_wait_on(mergeReduce(self.methodFunction, self.data))
    # The expected value is computed after the reduction has been issued,
    # so it sees self.data in whatever state the reduction left it.
    self.assertEqual(reduced, sum(self.data))
def test_mergeReduce_seq(self):
    """mergeReduce with ``self.lambdaFunction`` must equal ``sum(self.data)``.

    No synchronisation call is made; the result is compared directly.
    """
    reduced = mergeReduce(self.lambdaFunction, self.data)
    # The expected value is computed after the reduction has run,
    # so it sees self.data in whatever state the reduction left it.
    self.assertEqual(reduced, sum(self.data))
# NOTE(review): `parser`, the 'input' option read below and `compss_wait_on`
# are defined outside the visible part of this script.

# Command-line options: grids list file (mandatory), window time in seconds
# (default 3600) and number of fragments/workers (default 4).
parser.add_argument('-g','--grids', required=True, help='The input of the grids list file.')
parser.add_argument('-w','--window', required=False, help='The window time (in seconds) to take in count (default, 3600)', type=int, default=3600,)
parser.add_argument('-f','--numFrag',required=False, help='Number of workers (cores)', type=int, default=4)

# Parsed arguments as a plain dict.
arg = vars(parser.parse_args())
filename = arg['input']
grids = arg['grids']
window_time = arg['window']
numFrag = arg['numFrag']

# Banner echoing the effective parameters (Python 2 print statement).
print """ Running: Waze-jams's preprocessing script with the following parameters: - Input file: {} - Grids file: {} - Window time: {} seconds - Number of workers: {} """.format(filename,grids,window_time,numFrag)

# `grids` is rebound from the file path to the table read from that file
# (comma-separated, column names taken from the header row).
grids = np.genfromtxt(grids, delimiter=',', dtype=None, names=True)

# One preprocessing call per fragment; fragment f is identified by the
# string "<filename>_<f>".
partial_grid = [preprocessing(grids, window_time, "{}_{}".format(filename,f)) for f in range(numFrag)]

# Merge the partial results with mergeMatrix and wait for the final value.
jam_grids_p = mergeReduce(mergeMatrix, partial_grid)
jam_grids_p = compss_wait_on(jam_grids_p)

# Split the merged result into the two tables written below.
jam_grids, events = updateJamGrid(jam_grids_p)
jam_grids.to_csv("output_training.csv",sep=",",index=True,header=False)
events.to_csv("output_counts.csv",sep=",")