예제 #1
0
def kmeansInitialClusters(dataset):
    """Cluster ``dataset`` with KMeans seeded from ``CENTER_VECTORS`` and plot it.

    Every row's ``features`` value is passed through ``Vectors.stringify`` and
    then ``Vectors.parse`` before it is used for training or prediction.  A
    4-cluster model is trained for at most 1000 iterations starting from
    ``KMeansModel(CENTER_VECTORS)``; each collected row is then tagged with
    its predicted cluster and the tagged rows are handed, together with
    ``CENTERS``, to ``plotDiversitySizeClustering`` and
    ``centroidArtistSongCount``.

    Args:
        dataset: object exposing ``.rdd`` and ``.collect()`` whose rows can be
            indexed by ``'features'`` and ``'label'`` (presumably a Spark
            DataFrame -- confirm against the caller).

    Returns:
        None.
    """
    seed_model = KMeansModel(CENTER_VECTORS)
    training_vectors = dataset.rdd.map(
        lambda row: Vectors.parse(Vectors.stringify(row['features'])))
    fitted = KMeans.train(
        training_vectors, 4, maxIterations=1000, initialModel=seed_model)

    # One dict per collected row: the original features and label plus the
    # cluster predicted by the trained model.
    labelled_rows = [
        {
            "features": row["features"],
            "prediction": fitted.predict(
                Vectors.parse(Vectors.stringify(row['features']))),
            "label": row['label'],
        }
        for row in dataset.collect()
    ]

    plotDiversitySizeClustering(
        labelled_rows,
        CENTERS,
        "Size",
        "Diversity",
        "Song Analysis by Size and Diversity with Initial Clusters")
    centroidArtistSongCount(labelled_rows, CENTERS)
def map_to_libsvm_format_descr(tp):
    """Format one ``(label, sparse vector)`` pair as a libsvm line.

    The result has the shape ``"<label> <index>:<value> <index>:<value> ..."``.
    The label and every value are truncated with ``int()`` and the indexes are
    shifted by one because the libsvm format counts from 1.  A vector with no
    stored entries yields ``"<label> "`` (the trailing space is kept).

    A debug trace of the vector, its indexes, its values and the generated
    entries is written to stdout on every call.

    Args:
        tp: two-item sequence; ``tp[0]`` is the label and ``tp[1]`` is a
            sparse vector exposing parallel ``indices`` and ``values``
            sequences (presumably a pyspark ``SparseVector`` -- confirm
            against the caller).

    Returns:
        str: the libsvm-formatted line.
    """
    vector = tp[1]
    # Read the stored entries straight from the vector rather than
    # stringifying it and re-parsing the text with ast.literal_eval.
    index_list = [int(i) for i in vector.indices]
    value_list = [float(v) for v in vector.values]

    # Debug trace: single-argument print() calls render the same on
    # Python 2 and Python 3.
    print("sparse vector-------------------------------")
    print(vector)
    print("indexes-------------------------------")
    print(index_list)
    print("vals-------------------------------")
    print(value_list)

    entries = []
    for position, (index, value) in enumerate(zip(index_list, value_list)):
        print(position)
        # for this format indexes start from 1
        entries.append("{0}:{1}".format(index + 1, int(value)))
    print("spVectorList is")
    print(entries)
    return "{0} {1}".format(int(tp[0]), " ".join(entries))
def map_to_libsvm_format_descr(tp):
    """Map one ``(label, sparse vector)`` training pair to libsvm format.

    Output looks like ``"<label> <index>:<value> <index>:<value> ..."``: the
    label and each value go through ``int()`` (truncation), and each index is
    incremented by one since libsvm indexes start from 1.  When the vector
    stores no entries the output is ``"<label> "`` with its trailing space.

    Each call also prints a debug trace (vector, indexes, values, generated
    entries) to stdout.

    Args:
        tp: two-item sequence; ``tp[0]`` is the label and ``tp[1]`` is a
            sparse vector exposing parallel ``indices`` and ``values``
            sequences (presumably a pyspark ``SparseVector`` -- confirm
            against the caller).

    Returns:
        str: the libsvm-formatted line.
    """
    sparse_vector = tp[1]
    # Take indexes and values directly from the vector; no string
    # round-trip through Vectors.stringify / ast.literal_eval is needed.
    indexes = [int(i) for i in sparse_vector.indices]
    values = [float(v) for v in sparse_vector.values]

    # Debug trace: one-argument print() works unchanged on Python 2 and 3.
    print("sparse vector-------------------------------")
    print(sparse_vector)
    print("indexes-------------------------------")
    print(indexes)
    print("vals-------------------------------")
    print(values)

    libsvm_entries = []
    for i, (index, value) in enumerate(zip(indexes, values)):
        print(i)
        # for this format indexes start from 1
        libsvm_entries.append("{0}:{1}".format(index + 1, int(value)))
    print("spVectorList is")
    print(libsvm_entries)
    return "{0} {1}".format(int(tp[0]), " ".join(libsvm_entries))
예제 #4
0
 def __str__(self):
     """Return ``(<label>,<features>)`` with features rendered by ``Vectors.stringify``."""
     label_text = str(self.label)
     features_text = Vectors.stringify(self.features)
     return "({0},{1})".format(label_text, features_text)