Example #1
import numpy as np
from pyspark.mllib import linalg as lg  # assumed imports, matching the other examples


def subtract(v1, v2):
    # Union of the indices present in either vector
    indices = set(v1.indices).union(set(v2.indices))
    # index -> value lookups for both vectors
    v1d = dict(zip(v1.indices, v1.values))
    v2d = dict(zip(v2.indices, v2.values))
    zero = np.float64(0)
    # Keep only the indices whose difference is non-zero
    values = {i: v1d.get(i, zero) - v2d.get(i, zero)
              for i in indices
              if v1d.get(i, zero) - v2d.get(i, zero) != zero}
    return lg.SparseVector(v1.size, values)
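
A quick usage sketch (not part of the original example; it assumes lg is pyspark.mllib.linalg, as above):

a = lg.SparseVector(3, {0: 2.0, 1: 1.0})
b = lg.SparseVector(3, {1: 1.0, 2: 4.0})
subtract(a, b)   # SparseVector(3, {0: 2.0, 2: -4.0})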
Example #2
def add(v1, v2):
    """Add two sparse vectors
    >>> v1 = Vectors.sparse(3, {0: 1.0, 2: 1.0})
    >>> v2 = Vectors.sparse(3, {1: 1.0})
    >>> add(v1, v2)
    SparseVector(3, {0: 1.0, 1: 1.0, 2: 1.0})
    """
    assert isinstance(v1, lg.SparseVector) and isinstance(v2, lg.SparseVector)
    assert v1.size == v2.size
    # Compute union of indices
    indices = set(v1.indices).union(set(v2.indices))
    # Not particularly efficient but we are limited by SPARK-10973
    # Create index: value dicts
    v1d = dict(zip(v1.indices, v1.values))
    v2d = dict(zip(v2.indices, v2.values))
    zero = np.float64(0)
    # Create dictionary index: (v1[index] + v2[index])
    values = {i: v1d.get(i, zero) + v2d.get(i, zero)
              for i in indices
              if v1d.get(i, zero) + v2d.get(i, zero) != zero}

    return lg.SparseVector(v1.size, values)
Example #3
def as_old(v):
    """Convert a pyspark.ml.linalg vector into its pyspark.mllib.linalg equivalent."""
    if isinstance(v, ml_linalg.SparseVector):
        return mllib_linalg.SparseVector(v.size, v.indices, v.values)
    if isinstance(v, ml_linalg.DenseVector):
        return mllib_linalg.DenseVector(v.values)
    raise ValueError("Unsupported type {0}".format(type(v)))
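
A hedged usage sketch for as_old, assuming the aliases ml_linalg and mllib_linalg refer to pyspark.ml.linalg and pyspark.mllib.linalg:

from pyspark.ml import linalg as ml_linalg
from pyspark.mllib import linalg as mllib_linalg

v_new = ml_linalg.SparseVector(3, {0: 1.0, 2: 5.0})
v_old = as_old(v_new)   # mllib_linalg.SparseVector(3, {0: 1.0, 2: 5.0})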
Example #4
spark = SparkSession\
        .builder\
        .appName("PythonKMeans")\
        .getOrCreate()

lines = spark.read.text(sys.argv[1])\
        .rdd.map(lambda r: r[0])\
        .map(lambda x: x.split(" "))\
        .map(lambda x: [int(i) for i in x])
k = int(sys.argv[2])
header = lines.take(2)
N, V = header[0][0], header[1][0]

doc_tf = lines.filter(lambda x: len(x) > 1)\
        .map(lambda x: (x[0]-1, (x[1]-1, x[2])))\
        .groupByKey()\
        .map(lambda x: (x[0], lg.SparseVector(V, x[1])))\
        .sortByKey()

tf = doc_tf.map(lambda x: x[1])
idf = ft.IDF()
model = idf.fit(tf)
tfidf = model.transform(tf)
kPoints = tfidf.repartition(1).takeSample(False, k, 1)
kk = tf.repartition(1).takeSample(False, k, 1)
print(model.transform(kk[1]))
print(doc_tf.collect())
print(tfidf.collect())
print(kPoints)
Example #5
def divide(v, n):
    """Divide every stored entry of sparse vector v by the scalar n."""
    vd = dict(zip(v.indices, v.values))
    values = {i: vd[i] / n for i in v.indices}
    return lg.SparseVector(v.size, values)
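
A hedged usage sketch for divide, again assuming lg is pyspark.mllib.linalg:

v = lg.SparseVector(3, {0: 2.0, 2: 4.0})
divide(v, 2.0)   # SparseVector(3, {0: 1.0, 2: 2.0})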
Example #6
            .builder\
            .appName("PythonKMeans")\
            .getOrCreate()

    lines = spark.read.text(sys.argv[1])\
            .rdd.map(lambda r: r[0])\
            .map(lambda x: x.split(" "))\
            .map(lambda x: [int(i) for i in x])
    k = int(sys.argv[2])
    header = lines.take(2)
    N, V = header[0][0], header[1][0]

    doc_tf = lines.filter(lambda x: len(x) > 1)\
            .map(lambda x: (x[0]-1, (x[1]-1, x[2])))\
            .groupByKey()\
            .map(lambda x: (x[0], lg.SparseVector(V, x[1])))\
            .sortByKey()

    tf = doc_tf.map(lambda x: x[1])
    idf = ft.IDF()
    model = idf.fit(tf)
    tfidf = model.transform(tf)\
            .map(lambda x: lg.SparseVector(x.size, x.indices, x.values / x.norm(2)))\
            .cache()
    kPoints = tfidf.repartition(1).takeSample(False, k, 1)
    convergeDist = float(sys.argv[3])
    tempDist = 1.0

    while tempDist > convergeDist:
        closest = tfidf.map(lambda p: (closestPoint(p, kPoints), (p, 1)))
        pointStats = closest.reduceByKey(lambda p1_c1, p2_c2: (add(