Example #1
File: kmeans.py Project: ChicoQ/thunder
    normDists = data.map(lambda p: closestPoint((p - mean(p)) / norm(p), centers, "corr")[1])

    return labels, centers, dists, normDists


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="do kmeans clustering")
    parser.add_argument("master", type=str)
    parser.add_argument("dataFile", type=str)
    parser.add_argument("dataMode", choices=("raw", "dff", "sub"), help="form of data preprocessing")
    parser.add_argument("outputDir", type=str)
    parser.add_argument("k", type=int)
    parser.add_argument("dist", choices=("euclidean", "correlation"), help="distance metric for kmeans")

    args = parser.parse_args()
    egg = glob.glob(os.environ["THUNDER_EGG"] + "*.egg")
    sc = SparkContext(args.master, "kmeans", pyFiles=egg)

    lines = sc.textFile(args.dataFile)
    data = parse(lines, args.dataMode).cache()

    labels, centers, dists, normDists = kmeans(data, args.k, args.dist)

    outputDir = args.outputDir + "-kmeans"
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)
    saveout(labels, outputDir, "labels", "matlab")
    saveout(dists, outputDir, "dists", "matlab")
    saveout(centers, outputDir, "centers", "matlab")
    saveout(normDists, outputDir, "normDists", "matlab")
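
The closestPoint helper referenced above is not part of this excerpt. A minimal sketch of what it might look like, assuming it returns an (index, distance) pair and that "corr" selects a correlation-based metric; the name matches the call site, but the body is a reconstruction, not the project's actual code:

from numpy import argmin, corrcoef
from numpy.linalg import norm

def closestPoint(p, centers, dist="euclidean"):
    # sketch: assumed signature/behavior of thunder's closestPoint
    # distance from point p to every center under the chosen metric
    if dist == "corr":
        dists = [1 - corrcoef(p, c)[0, 1] for c in centers]
    else:
        dists = [norm(p - c) for c in centers]
    best = int(argmin(dists))
    return best, dists[best]  # (index of nearest center, its distance)
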
Example #2
File: pca.py Project: ChicoQ/thunder
from thunder.factorization.util import svd1, svd3, svd4
from pyspark import SparkContext


def pca(data, k):
    comps, latent, scores = svd4(data, k, 0)
    return comps, latent, scores

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="do principal components analysis")
    parser.add_argument("master", type=str)
    parser.add_argument("dataFile", type=str)
    parser.add_argument("dataMode", choices=("raw", "dff", "sub"), help="form of data preprocessing")
    parser.add_argument("outputDir", type=str)
    parser.add_argument("k", type=int)

    args = parser.parse_args()
    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "pca", pyFiles=egg)
    lines = sc.textFile(args.dataFile)
    data = parse(lines, args.dataMode).cache()

    comps, latent, scores = pca(data, args.k)

    outputDir = args.outputDir + "-pca"
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    saveout(comps, outputDir, "comps", "matlab")
    saveout(latent, outputDir, "latent", "matlab")
    saveout(scores, outputDir, "scores", "matlab", args.k)
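
On a local matrix, the decomposition requested from svd4 corresponds to a truncated SVD of the centered data. A rough numpy sketch of the same three outputs, as an illustration of the quantities rather than thunder's distributed implementation:

import numpy as np

def pca_local(X, k):
    # sketch: local stand-in for the distributed svd4(data, k, 0) call
    Xc = X - X.mean(axis=0)
    U, s, Vt = np.linalg.svd(Xc, full_matrices=False)
    comps = Vt[:k]                             # principal directions
    latent = (s[:k] ** 2) / (X.shape[0] - 1)   # variance along each direction
    scores = U[:, :k] * s[:k]                  # projections of the data
    return comps, latent, scores
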
Example #3
File: stats.py Project: errord/thunder
    method = SigProcessingMethod.load("stats", statistic=statistic)
    vals = method.calc(data)

    return vals

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="compute summary statistics on time series data")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("mode", choices=("mean", "median", "std", "norm"),
                        help="which summary statistic")
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)

    args = parser.parse_args()
    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "ref", pyFiles=egg)

    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess).cache()

    vals = stats(data, args.mode)

    outputdir = args.outputdir + "-stats",

    outputdir = args.outputdir + "-stats"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(vals, outputdir, "stats_" + args.mode, "matlab")
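
The truncated stats function above delegates to SigProcessingMethod; a plausible per-record equivalent, assuming each record is a 1-D time series and each mode maps onto the obvious numpy function (a guess at the semantics, not the library's code):

from numpy import mean, median, std
from numpy.linalg import norm

def stats_local(data, mode):
    # sketch: apply the chosen summary statistic to each series in the RDD
    funcs = {"mean": mean, "median": median, "std": std, "norm": norm}
    f = funcs[mode]
    return data.map(lambda x: f(x))
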
Example #4
File: fourier.py Project: errord/thunder
    method = SigProcessingMethod.load("fourier", freq=freq)
    out = method.calc(data).cache()

    co = out.map(lambda x: x[0])
    ph = out.map(lambda x: x[1])

    return co, ph

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="compute a fourier transform on each time series")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("freq", type=int)
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)

    args = parser.parse_args()
    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "fourier", pyFiles=egg)

    lines = sc.textFile(args.datafile)
    data = parse(lines, "dff")

    co, ph = fourier(data, args.freq)

    outputdir = args.outputdir + "-fourier"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)
    saveout(co, outputdir, "co", "matlab")
    saveout(ph, outputdir, "ph", "matlab")
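
Here method.calc yields a (coherence, phase) pair per series. A single-series sketch of the two quantities, assuming coherence is the relative spectral amplitude at the requested frequency; this is an approximation of what the method might compute, not its actual code:

import numpy as np

def fourier_local(y, freq):
    # sketch: normalized FFT of one mean-subtracted time series
    fy = np.fft.fft(y - np.mean(y))
    nyquist = len(y) // 2
    amp = np.abs(fy[1:nyquist + 1])
    co = amp[freq - 1] / np.sqrt(np.sum(amp ** 2))  # coherence at freq
    ph = np.angle(fy[freq])                         # phase at freq
    return co, ph
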
Example #5
File: ica.py Project: liyanghua/thunder
# do multiple independent component extraction
B = orth(random.randn(k, c))
Bold = zeros((k, c))
iterNum = 0
minAbsCos = 0
termTol = 0.000001
iterMax = 1000
errVec = zeros(iterMax)

while (iterNum < iterMax) and ((1 - minAbsCos) > termTol):
    iterNum += 1
    # update rule for pow3 nonlinearity (TODO: add other nonlins)
    B = wht.map(lambda x: outer(x, dot(x, B) ** 3)).reduce(lambda x, y: x + y) / n - 3 * B
    # orthogonalize
    B = dot(B, real(sqrtm(inv(dot(transpose(B), B)))))
    # evaluate error
    minAbsCos = min(abs(diag(dot(transpose(B), Bold))))
    # store results
    Bold = B
    errVec[iterNum-1] = (1 - minAbsCos)

# get unmixing matrix
W = dot(transpose(B), whtMat)

# get components
sigs = data.map(lambda x: dot(W, x))

# save output files
saveout(W, outputDir, "W", "matlab")
saveout(sigs, outputDir, "sigs", "matlab", c)
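
A local numpy rendering of one pass of the pow3 fixed-point update in the loop above, with X standing in for the whitened samples that wht holds as an RDD; a sketch for readability, not the distributed code:

import numpy as np
from scipy.linalg import sqrtm, inv

def ica_step(X, B):
    # X: n x k whitened samples, B: k x c current estimate
    n = X.shape[0]
    # pow3 nonlinearity: E[x (x'B)^3] - 3B
    B = X.T.dot(X.dot(B) ** 3) / n - 3 * B
    # re-orthogonalize via the inverse matrix square root of B'B
    return B.dot(np.real(sqrtm(inv(B.T.dot(B)))))
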
Example #6
File: kmeans.py Project: errord/thunder
        iter += 1

    labels = data.map(lambda p: closestpoint(p, centers))

    return labels, centers

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="do kmeans clustering")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("k", type=int)
    parser.add_argument("--maxiter", type=float, default=20, required=False)
    parser.add_argument("--tol", type=float, default=0.001, required=False)
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)

    args = parser.parse_args()
    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "kmeans", pyFiles=egg)

    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess).cache()

    labels, centers = kmeans(data, k=args.k, maxiter=args.maxiter, tol=args.tol)

    outputdir = args.outputdir + "-kmeans"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)
    saveout(labels, outputdir, "labels", "matlab")
    saveout(centers, outputdir, "centers", "matlab")
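
The body of kmeans is cut off above. A minimal sketch of the Spark k-means loop it implements, assuming closestpoint returns the index of the nearest center and that records are numpy arrays; both functions are reconstructions, not the project's exact code:

from numpy import argmin
from numpy.linalg import norm

def closestpoint(p, centers):
    # sketch: index of the center nearest to p in euclidean distance
    return int(argmin([norm(p - c) for c in centers]))

def kmeans_sketch(data, k, maxiter=20, tol=0.001):
    centers = data.takeSample(False, k)
    for i in range(int(maxiter)):
        # assign every point to its nearest center, then average per cluster
        agg = data.map(lambda p: (closestpoint(p, centers), (p, 1))) \
                  .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])) \
                  .collectAsMap()
        new = [agg[j][0] / float(agg[j][1]) if j in agg else centers[j]
               for j in range(k)]
        change = sum(norm(new[j] - centers[j]) for j in range(k))
        centers = new
        if change < tol:
            break
    labels = data.map(lambda p: closestpoint(p, centers))
    return labels, centers
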
Example #7
File: localcorr.py Project: errord/thunder
    # get correlations and sort by key so the result is in the right order
    corr = result.map(lambda (k, v): (k, corrcoef(v[0], v[1])[0, 1])).sortByKey().values()

    return corr


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="correlate time series with neighbors")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("sz", type=int)
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)

    args = parser.parse_args()
    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "localcorr", pyFiles=egg)

    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess, "xyz").cache()

    corrs = localcorr(data, args.sz)

    outputdir = args.outputdir + "-localcorr"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(corrs, outputdir, "corr", "matlab")
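
Upstream of this excerpt, result pairs each (x, y) keyed series with the mean series of its neighborhood of radius sz. A sketch of that setup under that assumption (boundary voxels are not handled specially here, unlike a careful implementation):

def neighbors(key, sz):
    # all (x, y) keys in a square window of radius sz around key
    x, y = key
    return [(x + dx, y + dy)
            for dx in range(-sz, sz + 1) for dy in range(-sz, sz + 1)]

def localcorr_setup(data, sz):
    # sketch: data is an RDD of ((x, y), series); send each series to every
    # window it falls in, average per window, and re-join with the originals
    shifted = data.flatMap(lambda kv: [(n, kv[1]) for n in neighbors(kv[0], sz)])
    means = shifted.reduceByKey(lambda a, b: a + b) \
                   .mapValues(lambda v: v / float((2 * sz + 1) ** 2))
    return data.join(means)  # values become (series, neighborhood mean)
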
Example #8
File: regress.py Project: errord/thunder
    traj = model.fit(data, comps)

    return stats, comps, latent, scores, traj


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="fit a regression model")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("modelfile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("regressmode", choices=("linear", "bilinear"), help="form of regression")
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)

    args = parser.parse_args()
    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "regress", pyFiles=egg)
    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess).cache()

    stats, comps, latent, scores, traj = regress(data, args.modelfile, args.regressmode)

    outputdir = args.outputdir + "-regress"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(stats, outputdir, "stats", "matlab")
    saveout(comps, outputdir, "comps", "matlab")
    saveout(latent, outputdir, "latent", "matlab")
    saveout(scores, outputdir, "scores", "matlab", 2)
    saveout(traj, outputdir, "traj", "matlab")
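
model.fit is opaque in this excerpt; for the linear regressmode, the per-series fit reduces to ordinary least squares against a design matrix. A standalone sketch of that step, where X is assumed to be the design matrix loaded from modelfile (not the RegressionModel API itself):

import numpy as np

def fit_linear(X, y):
    # sketch: least-squares betas and r^2 for one series y against design X
    betas = np.linalg.lstsq(X, y)[0]
    pred = X.dot(betas)
    r2 = 1 - np.sum((y - pred) ** 2) / np.sum((y - np.mean(y)) ** 2)
    return betas, r2
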
Example #9
File: query.py Project: errord/thunder
            lambda (k, x): x).mean()

    return ts


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="query time series data by averaging values for given indices")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("indsfile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("mx_x", type=int)
    parser.add_argument("mx_y", type=int)
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)

    args = parser.parse_args()
    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "query", pyFiles=egg)

    # TODO: use sortByKey instead of specifying mx_x and mx_y
    lines = sc.textFile(args.datafile)
    data = parse(lines, "dff", "linear", None, [args.mx_x, args.mx_y]).cache()

    ts = query(data, args.indsfile)

    outputdir = args.outputdir + "-query"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(ts, outputdir, "ts", "matlab")
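
The truncated query function averages the series whose indices appear in each set read from indsfile. A sketch of that core step, assuming data is keyed by linear index (the .mean() at the end matches the excerpt's final line):

def query_sketch(data, inds):
    # sketch: data is an RDD of (linear index, series); average the series
    # whose index falls in the requested set
    keep = set(inds)
    return data.filter(lambda kv: kv[0] in keep).map(lambda kv: kv[1]).mean()
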
Example #10
File: ica.py Project: errord/thunder
    return w, sigs

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="do independent components analysis")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("k", type=int)
    parser.add_argument("c", type=int)
    parser.add_argument("--svdmethod", choices=("direct", "em"), default="direct", required=False)
    parser.add_argument("--maxiter", type=float, default=100, required=False)
    parser.add_argument("--tol", type=float, default=0.000001, required=False)
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)
    parser.add_argument("--seed", type=int, default=0, required=False)

    args = parser.parse_args()
    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "ica", pyFiles=egg)
    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess).cache()

    w, sigs = ica(data, args.k, args.c, svdmethod=args.svdmethod, maxiter=args.maxiter, tol=args.tol, seed=args.seed)

    outputdir = args.outputdir + "-ica"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(w, outputdir, "w", "matlab")
    saveout(sigs, outputdir, "sigs", "matlab", args.c)
Example #11
File: ica.py Project: ChicoQ/thunder
    sigs = data.map(lambda x: dot(W, x))

    return W, sigs, whtMat, unwhtMat


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="do independent components analysis")
    parser.add_argument("master", type=str)
    parser.add_argument("dataFile", type=str)
    parser.add_argument("dataMode", choices=("raw", "dff", "sub"), help="form of data preprocessing")
    parser.add_argument("outputDir", type=str)
    parser.add_argument("k", type=int)
    parser.add_argument("c", type=int)

    args = parser.parse_args()
    egg = glob.glob(os.environ["THUNDER_EGG"] + "*.egg")
    sc = SparkContext(args.master, "ica", pyFiles=egg)
    lines = sc.textFile(args.dataFile)
    data = parse(lines, args.dataMode).cache()

    W, sigs, whtMat, unwhtMat = ica(data, args.k, args.c)

    outputDir = args.outputDir + "-ica"
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    saveout(W, outputDir, "W", "matlab")
    saveout(sigs, outputDir, "sigs", "matlab", args.c)
    saveout(whtMat, outputDir, "whtMat", "matlab")
    saveout(unwhtMat, outputDir, "unwhtMat", "matlab")
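
The whitening pair returned here (whtMat, unwhtMat) can be built from the top-k SVD of the centered data. A local numpy sketch of that construction, as an illustration rather than the distributed version:

import numpy as np

def whitening_matrices(X, k):
    # sketch: whiten via the top-k SVD of the centered data matrix
    Xc = X - X.mean(axis=0)
    U, s, Vt = np.linalg.svd(Xc, full_matrices=False)
    scale = s[:k] / np.sqrt(X.shape[0])
    whtMat = (Vt[:k].T / scale).T  # k x p, maps data to unit-variance coords
    unwhtMat = Vt[:k].T * scale    # p x k, approximately inverts the whitening
    return whtMat, unwhtMat
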
Example #12
File: tuning.py Project: errord/thunder
    else:
        # use data
        params = tuningmodel.fit(data)

    return params

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="fit a parametric tuning curve to regression results")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("tuningmodelfile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("tuningmode", choices=("circular", "gaussian"), help="form of tuning curve")
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)
    parser.add_argument("--regressmodelfile", type=str)
    parser.add_argument("--regressmode", choices=("linear", "bilinear"), help="form of regression")

    args = parser.parse_args()
    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "tuning", pyFiles=egg)
    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess).cache()

    params = tuning(data, args.tuningmodelfile, args.tuningmode, args.regressmodelfile, args.regressmode)

    outputdir = args.outputdir + "-tuning"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    saveout(params, outputdir, "params", "matlab")
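
For the circular tuningmode, fitting a tuning curve to a set of regression weights amounts to a weighted circular mean and resultant length. A sketch under that interpretation (not the TuningModel API, whose details this excerpt does not show):

import numpy as np

def circular_tuning(angles, weights):
    # sketch: weighted resultant vector of the stimulus angles (radians)
    w = np.asarray(weights, dtype=float)
    w = w / w.sum()
    z = np.sum(w * np.exp(1j * np.asarray(angles)))
    return np.angle(z), np.abs(z)  # preferred angle, tuning strength in [0, 1]
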
Example #13
File: crosscorr.py Project: errord/thunder

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="fit a regression model")
    parser.add_argument("master", type=str)
    parser.add_argument("datafile", type=str)
    parser.add_argument("sigfile", type=str)
    parser.add_argument("outputdir", type=str)
    parser.add_argument("lag", type=int)
    parser.add_argument("--preprocess", choices=("raw", "dff", "sub"), default="raw", required=False)

    args = parser.parse_args()
    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "crosscorr", pyFiles=egg)
    lines = sc.textFile(args.datafile)
    data = parse(lines, args.preprocess).cache()

    outputdir = args.outputdir + "-crosscorr"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    # post-process data with pca if lag is nonzero
    if args.lag != 0:
        betas, scores, latent, comps = crosscorr(data, args.sigfile, args.lag)
        saveout(comps, outputdir, "comps", "matlab")
        saveout(latent, outputdir, "latent", "matlab")
        saveout(scores, outputdir, "scores", "matlab", 2)
    else:
        betas = crosscorr(data, args.sigfile, args.lag)
        saveout(betas, outputdir, "stats", "matlab")
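
The lag sweep that crosscorr performs on each series can be sketched locally as correlating the series with the signal shifted over every offset in [-lag, lag]; a reconstruction of the idea (circular shifts via np.roll, where a careful version would truncate the overlap):

import numpy as np

def crosscorr_local(y, s, lag):
    # sketch: correlation of series y with signal s at each shift
    betas = [np.corrcoef(y, np.roll(s, shift))[0, 1]
             for shift in range(-lag, lag + 1)]
    return np.array(betas)
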
Example #14
File: rpca.py Project: ChicoQ/thunder
        MSY = data.context.parallelize(M - S + (1/mu)*Y).cache()
        L = svdThreshold(MSY, 1/mu).collect()
        MLY = data.context.parallelize(M - L + (1/mu)*Y)
        S = shrinkage(MLY, lam/mu).collect()
        Y += mu * (M - L - S)

    return L, S

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="do independent components analysis")
    parser.add_argument("master", type=str)
    parser.add_argument("dataFile", type=str)
    parser.add_argument("dataMode", choices=("raw", "dff", "sub"), help="form of data preprocessing")
    parser.add_argument("outputDir", type=str)
    parser.add_argument("k", type=int)
    parser.add_argument("c", type=int)

    args = parser.parse_args()
    egg = glob.glob(os.environ['THUNDER_EGG'] + "*.egg")
    sc = SparkContext(args.master, "rpca", pyFiles=egg)
    lines = sc.textFile(args.dataFile)
    data = parse(lines, args.dataMode).cache()

    L, S = rpca(data)

    outputDir = args.outputDir + "-rpca"
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    saveout(L, outputDir, "L", "matlab")
    saveout(S, outputDir, "S", "matlab")
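
The two helpers this loop relies on are standard robust-PCA primitives: shrinkage is elementwise soft-thresholding and svdThreshold soft-thresholds singular values. Local sketches of both, assuming those semantics (the distributed versions map these over row blocks):

import numpy as np

def shrinkage_local(M, tau):
    # sketch: elementwise soft-thresholding toward zero
    return np.sign(M) * np.maximum(np.abs(M) - tau, 0)

def svd_threshold_local(M, tau):
    # sketch: soft-threshold the singular values, keeping a low-rank estimate
    U, s, Vt = np.linalg.svd(M, full_matrices=False)
    return U.dot(np.diag(np.maximum(s - tau, 0))).dot(Vt)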