示例#1
0
def _feedback_membership(terms, userFeedbackTerm, k, M):
    """Build the (k, M) seed membership matrix from user-selected terms.

    Row ``i`` is filled from ``userFeedbackTerm[i]``: the term at rank ``j``
    in that list gets the value ``max(1 - 0.05 * j, 0.5)`` in the column where
    it first occurs in ``terms``. Terms not found in ``terms`` are ignored.
    """
    membership = numpy.zeros((k, M), float)
    step = 0.05  # lower-ranked terms receive a lower value, floored at 0.5
    for row, chosen_terms in enumerate(userFeedbackTerm):
        for rank, term in enumerate(chosen_terms):
            # ``terms`` is indexed as a 2-D array; axis 1 holds the term ids.
            columns = numpy.where(terms == term)[1]
            if columns.size > 0:
                membership[row, columns[0]] = max(1 - rank * step, 0.5)
    return membership


def _seed_centroid(data, Vars, term_membership, k):
    """Return one cluster centroid derived from a row of term memberships."""
    # Keep the terms whose membership exceeds 1/k. If none does, argmax
    # returns 0 and every term is kept.
    order = numpy.argsort(term_membership)
    ranked = numpy.sort(term_membership)
    first_above = numpy.argmax(ranked > (1.0 / k))
    cluster_terms = order[first_above:]

    # Among those, keep the terms with at least average variance.
    term_vars = Vars[cluster_terms]
    high_var = numpy.where(term_vars >= numpy.mean(term_vars))[0]
    key_terms = cluster_terms[high_var]

    # Split the documents in two by their mean weight over the key terms.
    doc_profile = numpy.mean(data[:, key_terms], axis=1)
    _, label = scipy.cluster.vq.kmeans2(doc_profile,
                                        2,
                                        iter=50,
                                        thresh=1e-03,
                                        minit='random',
                                        missing='warn')
    group0 = numpy.where(label == 0)[0]
    group1 = numpy.where(label == 1)[0]

    # Use the smaller non-empty group (ties go to group 1).
    if group0.size == 0:
        rel_docs = group1
    elif group1.size == 0:
        rel_docs = group0
    elif group0.size >= group1.size:
        rel_docs = group1
    else:
        rel_docs = group0
    return numpy.mean(data[rel_docs, :], axis=0)


def icluster(data, terms, userFeedbackTerm, k, userU=-1):
    """Cluster the rows of ``data`` into ``k`` groups and describe each group.

    Args:
        data: 2-D array with N rows and M columns (``N, M = data.shape``);
            presumably documents x terms -- TODO confirm with the caller.
        terms: 2-D array whose row 0 holds the term names, indexed by column.
        userFeedbackTerm: sequence of per-cluster term lists; only read when
            ``userU == 1``.
        k: requested number of clusters (must be >= 1).
        userU: ``1`` signals re-clustering, in which case a seed membership
            matrix is built from ``userFeedbackTerm``; any other value is
            passed to ``Fuzzy.FuzzyCMeans`` unchanged.

    Returns:
        A tuple ``(clusterDocs, clusterKeyterms, keyterms, silhouette_avg,
        scores)`` where ``clusterDocs`` is a list of ``k`` lists of 1-based
        row numbers, ``clusterKeyterms`` is a list of ``k`` lists of term
        strings, ``keyterms`` is a list of ``k`` arrays of term column
        indices, ``silhouette_avg`` is the value returned by
        ``silhouette_score`` and ``scores`` maps ``str(label)`` to
        ``scale_score`` of that label's mean silhouette value.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    N, M = data.shape
    if k < 1:
        # Without this the retry loop never runs and later code fails on
        # names that were never assigned.
        raise ValueError("k must be at least 1, got %r" % (k,))

    if userU == +1:  # re-clustering signal: seed from the user's feedback
        userU = _feedback_membership(terms, userFeedbackTerm, k, M)

    docs = numpy.arange(1, N + 1).reshape((1, N))  # 1-based row numbers
    Vars = numpy.var(data, axis=0).transpose()
    # NOTE(review): presumably fuzzifier, max iterations and tolerance for
    # Fuzzy.FuzzyCMeans -- confirm against that class; options[3] is unused.
    options = (1.1, 25, 0.01, 0)
    threshold = 0.95

    realK = 0
    # Repeat the whole procedure until every one of the k clusters is the
    # nearest centroid of at least one row. There is no retry limit.
    while realK < k:
        selectedCentroids = numpy.empty([k, M], dtype=float)
        fcm = Fuzzy.FuzzyCMeans(data.transpose(), k, options[0], 'cosine',
                                userU, options[1], options[2])
        fcm()
        bestU = fcm.mu
        for p in range(k):
            selectedCentroids[p, :] = _seed_centroid(data, Vars, bestU[p, :],
                                                     k)

        Y = cdist(data, selectedCentroids, 'cosine')

        # Rescale each row of distances to [0, 1] and flip it so that 1 means
        # "closest centroid". A row whose distances are all equal produces
        # NaN here and therefore never passes the threshold.
        row_min = numpy.min(Y, axis=1).reshape((N, 1))
        row_span = numpy.max(Y, axis=1).reshape((N, 1)) - row_min
        closeness = 1 - numpy.multiply(Y - row_min,
                                       numpy.power(row_span, -1.0))
        near = closeness > threshold
        clusters = [numpy.where(near[:, p])[0] for p in range(k)]

        # Hard assignment; count how many clusters actually received a row.
        IDX = numpy.argmin(Y, axis=1)
        realK = len(numpy.unique(IDX))

    # NOTE(review): assumes these are scikit-learn's silhouette functions,
    # where ``metric`` must be passed by keyword -- confirm the import.
    silhouette_avg = silhouette_score(data, IDX, metric='cosine')
    sample_silhouette_values = silhouette_samples(data, IDX, metric='cosine')

    # One score per distinct label, in order of first appearance.
    scores = dict()
    for label in dict.fromkeys(IDX.tolist()):
        cluster_mean = numpy.mean(sample_silhouette_values[IDX == label])
        scores[str(label)] = scale_score(cluster_mean)

    attrVals = numpy.empty([M, k], dtype=float)
    # computeX2 is expected to fill attrVals in place (its result is unused).
    computeX2(attrVals, clusters, data, N)

    # NOTE(review): ``f`` is not defined in this function; presumably a
    # module-level count of key terms per cluster -- confirm it exists.
    keyterms = []
    for p in range(k):
        ranked = numpy.argsort(attrVals[:, p])[::-1]  # highest value first
        keyterms.append(ranked[:f])

    clusterKeyterms = [[str(terms[0, j]) for j in cluster_terms]
                       for cluster_terms in keyterms]
    clusterDocs = [list(docs[0, members]) for members in clusters]

    return clusterDocs, clusterKeyterms, keyterms, silhouette_avg, scores
示例#2
0
# NOTE(review): fragment of a larger script -- `k`, `data`, `options`, `userU`
# and `Vars` used by the loop below are defined outside this view.
NMI = []  # not referenced in the visible part of the script
keyterms = []
# clusterKeyterms = numpy.empty([1,k], dtype=object)
clusterKeyterms = []
clusterDocs = []

realK = 0  # compared against k by the while loop that follows

while (realK < k):

    idp = []

    selectedCentroids = numpy.empty([k, M], dtype=float)
    attrVals = numpy.empty([M, k], dtype=float)

    fcm = Fuzzy.FuzzyCMeans(data.transpose(), k, options[0], 'cosine', userU)
    fcm()
    bestU = fcm.mu  #.transpose()

    for p in range(k):
        sortIDX = numpy.argsort(bestU[p, :])
        sortV = numpy.sort(bestU[p, :])
        tempIndex = numpy.argmax(sortV > (1.0 / k))
        idp.append(sortIDX[tempIndex:])

    for p in range(k):
        idx = []
        idpp = idp[p]

        Varsp = Vars[idpp]
        meanVarsp = numpy.mean(Varsp)