    normDists = data.map(lambda p: closestPoint((p - mean(p)) / norm(p), centers, "corr")[1])

    return labels, centers, dists, normDists


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="do kmeans clustering")
    parser.add_argument("master", type=str)
    parser.add_argument("dataFile", type=str)
    parser.add_argument("dataMode", choices=("raw", "dff", "sub"), help="form of data preprocessing")
    parser.add_argument("outputDir", type=str)
    parser.add_argument("k", type=int)
    parser.add_argument("dist", choices=("euclidean", "correlation"), help="distance metric for kmeans")
    args = parser.parse_args()

    egg = glob.glob(os.environ["THUNDER_EGG"] + "*.egg")
    sc = SparkContext(args.master, "kmeans", pyFiles=egg)
    lines = sc.textFile(args.dataFile)
    data = parse(lines, args.dataMode).cache()

    labels, centers, dists, normDists = kmeans(data, args.k, args.dist)

    outputDir = args.outputDir + "-kmeans"
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    saveout(labels, outputDir, "labels", "matlab")
    saveout(dists, outputDir, "dists", "matlab")
    saveout(centers, outputDir, "centers", "matlab")
    saveout(normDists, outputDir, "normDists", "matlab")
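# closestPoint is defined elsewhere in the original file; the following is a
# minimal sketch of the helper implied by the call sites above (signature and
# behavior inferred, not thunder's actual implementation). It returns the index
# of the nearest center and the distance to it, under either metric.
from numpy import argmin, corrcoef, sum

def closestPoint(p, centers, mode="euclidean"):
    if mode == "corr":
        # correlation distance: 1 minus the Pearson correlation with each center
        dists = [1 - corrcoef(p, c)[0, 1] for c in centers]
    else:
        # squared Euclidean distance to each center
        dists = [sum((p - c) ** 2) for c in centers]
    best = argmin(dists)
    return best, dists[best]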
import os
import glob
import argparse
from thunder.util.dataio import parse, saveout  # I/O helpers used below (module path assumed)
from thunder.factorization.util import svd1, svd3, svd4
from pyspark import SparkContext


def pca(data, k):
    comps, latent, scores = svd4(data, k, 0)
    return comps, latent, scores


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="do principal components analysis")
    parser.add_argument("master", type=str)
    parser.add_argument("dataFile", type=str)
    parser.add_argument("dataMode", choices=("raw", "dff", "sub"), help="form of data preprocessing")
    parser.add_argument("outputDir", type=str)
    parser.add_argument("k", type=int)
    args = parser.parse_args()

    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "pca", pyFiles=egg)
    lines = sc.textFile(args.dataFile)
    data = parse(lines, args.dataMode).cache()

    comps, latent, scores = pca(data, args.k)

    outputDir = args.outputDir + "-pca"
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    saveout(comps, outputDir, "comps", "matlab")
    saveout(latent, outputDir, "latent", "matlab")
    saveout(scores, outputDir, "scores", "matlab", args.k)
    method = SigProcessingMethod.load("stats", statistic=statistic)
    vals = method.calc(data)

    return vals


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="compute summary statistics on time series data")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("mode", choices=("mean", "median", "std", "norm"), help="which summary statistic")
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)
    args = parser.parse_args()

    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "stats", pyFiles=egg)
    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess).cache()

    vals = stats(data, args.mode)

    outputdir = args.outputdir + "-stats"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(vals, outputdir, "stats_" + args.mode, "matlab")
    method = SigProcessingMethod.load("fourier", freq=freq)
    out = method.calc(data).cache()
    co = out.map(lambda x: x[0])
    ph = out.map(lambda x: x[1])

    return co, ph


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="compute a fourier transform on each time series")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("freq", type=int)
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)
    args = parser.parse_args()

    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "fourier", pyFiles=egg)
    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess).cache()

    co, ph = fourier(data, args.freq)

    outputdir = args.outputdir + "-fourier"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(co, outputdir, "co", "matlab")
    saveout(ph, outputdir, "ph", "matlab")
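# For reference, a minimal local sketch of what the per-series fourier outputs
# plausibly contain: the amplitude ("co") and phase ("ph") of the component at
# the requested frequency index. The helper name is hypothetical and any
# normalization of co is omitted; thunder's SigProcessingMethod may define co
# as a properly normalized coherence.
from numpy import fft, angle, abs as npabs

def fourier_stats(x, freq):
    """Amplitude and phase at frequency index freq (mean removed first)."""
    y = fft.fft(x - x.mean())
    return npabs(y[freq]), angle(y[freq])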
    # do multiple independent component extraction
    B = orth(random.randn(k, c))
    Bold = zeros((k, c))
    iterNum = 0
    minAbsCos = 0
    termTol = 0.000001
    iterMax = 1000
    errVec = zeros(iterMax)

    while (iterNum < iterMax) and ((1 - minAbsCos) > termTol):
        iterNum += 1
        # update rule for the pow3 nonlinearity (TODO: add other nonlinearities)
        B = wht.map(lambda x: outer(x, dot(x, B) ** 3)).reduce(lambda x, y: x + y) / n - 3 * B
        # orthogonalize
        B = dot(B, real(sqrtm(inv(dot(transpose(B), B)))))
        # evaluate error
        minAbsCos = min(abs(diag(dot(transpose(B), Bold))))
        # store results
        Bold = B
        errVec[iterNum-1] = (1 - minAbsCos)

    # get unmixing matrix
    W = dot(transpose(B), whtMat)

    # get components
    sigs = data.map(lambda x: dot(W, x))

    # save output files
    saveout(W, outputDir, "W", "matlab")
    saveout(sigs, outputDir, "sigs", "matlab", c)
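# For reference, the distributed update above is equivalent to the following
# local FastICA step on a whitened n-by-k data matrix X: the map/reduce over
# outer products computes X' (XB)^3. A minimal sketch for clarity, not part of
# the script:
from numpy import dot, transpose, real
from numpy.linalg import inv
from scipy.linalg import sqrtm

def ica_pow3_step(X, B):
    """One fixed-point update with the cubic nonlinearity, then symmetric
    orthogonalization: B <- E[x (x'B)^3] - 3B, followed by B (B'B)^(-1/2)."""
    n = X.shape[0]
    B = dot(transpose(X), dot(X, B) ** 3) / n - 3 * B
    return dot(B, real(sqrtm(inv(dot(transpose(B), B)))))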
        iter += 1

    labels = data.map(lambda p: closestpoint(p, centers))

    return labels, centers


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="do kmeans clustering")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("k", type=int)
    parser.add_argument("--maxiter", type=float, default=20, required=False)
    parser.add_argument("--tol", type=float, default=0.001, required=False)
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)
    args = parser.parse_args()

    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "kmeans", pyFiles=egg)
    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess).cache()

    labels, centers = kmeans(data, k=args.k, maxiter=args.maxiter, tol=args.tol)

    outputdir = args.outputdir + "-kmeans"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(labels, outputdir, "labels", "matlab")
    saveout(centers, outputdir, "centers", "matlab")
    # get correlations and sort by key so the result is in the right order
    corr = result.map(lambda (k, v): (k, corrcoef(v[0], v[1])[0, 1])).sortByKey().map(
        lambda (k, v): v)

    return corr


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="correlate time series with neighbors")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("sz", type=int)
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)
    args = parser.parse_args()

    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "localcorr", pyFiles=egg)
    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess, "xyz").cache()

    corrs = localcorr(data, args.sz)

    outputdir = args.outputdir + "-localcorr"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(corrs, outputdir, "corr", "matlab")
    traj = model.fit(data, comps)

    return stats, comps, latent, scores, traj


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="fit a regression model")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("modelfile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("regressmode", choices=("linear", "bilinear"), help="form of regression")
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)
    args = parser.parse_args()

    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "regress", pyFiles=egg)
    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess).cache()

    stats, comps, latent, scores, traj = regress(data, args.modelfile, args.regressmode)

    outputdir = args.outputdir + "-regress"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(stats, outputdir, "stats", "matlab")
    saveout(comps, outputdir, "comps", "matlab")
    saveout(latent, outputdir, "latent", "matlab")
    saveout(scores, outputdir, "scores", "matlab", 2)
    saveout(traj, outputdir, "traj", "matlab")
        lambda (k, x): x).mean()

    return ts


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="query time series data by averaging values for given indices")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("indsfile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("mx_x", type=int)
    parser.add_argument("mx_y", type=int)
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)
    args = parser.parse_args()

    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "query", pyFiles=egg)

    # TODO: use sortByKey instead of specifying mx_x and mx_y
    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess, "linear", None, [args.mx_x, args.mx_y]).cache()

    ts = query(data, args.indsfile)

    outputdir = args.outputdir + "-query"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(ts, outputdir, "ts", "matlab")
    return w, sigs


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="do independent components analysis")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("k", type=int)
    parser.add_argument("c", type=int)
    parser.add_argument("--svdmethod", choices=("direct", "em"), default="direct", required=False)
    parser.add_argument("--maxiter", type=float, default=100, required=False)
    parser.add_argument("--tol", type=float, default=0.000001, required=False)
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)
    parser.add_argument("--seed", type=int, default=0, required=False)
    args = parser.parse_args()

    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "ica", pyFiles=egg)
    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess).cache()

    w, sigs = ica(data, args.k, args.c, svdmethod=args.svdmethod, maxiter=args.maxiter, tol=args.tol, seed=args.seed)

    outputdir = args.outputdir + "-ica"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(w, outputdir, "w", "matlab")
    saveout(sigs, outputdir, "sigs", "matlab", args.c)
    sigs = data.map(lambda x: dot(W, x))

    return W, sigs, whtMat, unwhtMat


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="do independent components analysis")
    parser.add_argument("master", type=str)
    parser.add_argument("dataFile", type=str)
    parser.add_argument("dataMode", choices=("raw", "dff", "sub"), help="form of data preprocessing")
    parser.add_argument("outputDir", type=str)
    parser.add_argument("k", type=int)
    parser.add_argument("c", type=int)
    args = parser.parse_args()

    egg = glob.glob(os.environ["THUNDER_EGG"] + "*.egg")
    sc = SparkContext(args.master, "ica", pyFiles=egg)
    lines = sc.textFile(args.dataFile)
    data = parse(lines, args.dataMode).cache()

    W, sigs, whtMat, unwhtMat = ica(data, args.k, args.c)

    outputDir = args.outputDir + "-ica"
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    saveout(W, outputDir, "W", "matlab")
    saveout(sigs, outputDir, "sigs", "matlab", args.c)
    saveout(whtMat, outputDir, "whtMat", "matlab")
    saveout(unwhtMat, outputDir, "unwhtMat", "matlab")
    else:
        # use data
        params = tuningmodel.fit(data)

    return params


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="fit a parametric tuning curve to regression results")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("tuningmodelfile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("tuningmode", choices=("circular", "gaussian"), help="form of tuning curve")
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)
    parser.add_argument("--regressmodelfile", type=str)
    parser.add_argument("--regressmode", choices=("linear", "bilinear"), help="form of regression")
    args = parser.parse_args()

    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "tuning", pyFiles=egg)
    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess).cache()

    params = tuning(data, args.tuningmodelfile, args.tuningmode, args.regressmodelfile, args.regressmode)

    outputdir = args.outputdir + "-tuning"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(params, outputdir, "params", "matlab")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="compute cross correlation of time series with a signal")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("sigfile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("lag", type=int)
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)
    args = parser.parse_args()

    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "crosscorr", pyFiles=egg)
    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess).cache()

    outputdir = args.outputdir + "-crosscorr"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    # post-process the results with pca when the lag is nonzero
    if args.lag != 0:
        betas, scores, latent, comps = crosscorr(data, args.sigfile, args.lag)
        saveout(comps, outputdir, "comps", "matlab")
        saveout(latent, outputdir, "latent", "matlab")
        saveout(scores, outputdir, "scores", "matlab", 2)
    else:
        betas = crosscorr(data, args.sigfile, args.lag)
        saveout(betas, outputdir, "stats", "matlab")
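# What crosscorr plausibly computes per time series: the correlation between
# the series and the signal shifted over lags -lag..lag. A minimal local sketch;
# the helper name and the circular shift are illustrative assumptions, not
# thunder's implementation, which may pad or truncate at the edges:
from numpy import array, corrcoef, roll

def lagged_corr(x, s, lag):
    """Correlation of x with s at each shift in [-lag, lag]."""
    return array([corrcoef(x, roll(s, shift))[0, 1] for shift in range(-lag, lag + 1)])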
        MSY = data.context.parallelize(M - S + (1/mu)*Y).cache()
        L = svdThreshold(MSY, 1/mu).collect()
        MLY = data.context.parallelize(M - L + (1/mu)*Y)
        S = shrinkage(MLY, lam/mu).collect()
        Y += mu * (M - L - S)

    return L, S


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="do robust pca")
    parser.add_argument("master", type=str)
    parser.add_argument("dataFile", type=str)
    parser.add_argument("dataMode", choices=("raw", "dff", "sub"), help="form of data preprocessing")
    parser.add_argument("outputDir", type=str)
    args = parser.parse_args()

    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "rpca", pyFiles=egg)
    lines = sc.textFile(args.dataFile)
    data = parse(lines, args.dataMode).cache()

    L, S = rpca(data)

    outputDir = args.outputDir + "-rpca"
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    saveout(L, outputDir, "L", "matlab")
    saveout(S, outputDir, "S", "matlab")
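# The svdThreshold and shrinkage helpers used above are defined elsewhere in
# the original file. For clarity, minimal local (non-RDD) sketches of the two
# standard RPCA operators they implement; the names come from the call sites
# and the distributed signatures are assumed:
from numpy import sign, maximum, abs as npabs, dot, diag
from numpy.linalg import svd

def shrink(X, tau):
    """Elementwise soft-thresholding (the shrinkage operator)."""
    return sign(X) * maximum(npabs(X) - tau, 0)

def svd_threshold(X, tau):
    """Singular value thresholding: soft-threshold the singular values of X."""
    U, s, V = svd(X, full_matrices=False)
    return dot(U, dot(diag(shrink(s, tau)), V))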