# Example 1
def rsnn(sampledData, remainedData, sampledIndex, remainedIndex, singleName):
    """Run one clustering-ensemble pass: cluster each sampled subset,
    label the remaining rows by their nearest sampled neighbour, then
    merge both label sets back into the original row order.

    Args:
        sampledData: per-run list of sampled data subsets.
        remainedData: per-run list of the complementary (non-sampled) rows.
        sampledIndex: per-run original row indices of the sampled rows.
        remainedIndex: per-run original row indices of the remained rows.
        singleName: base clusterer name: 'kmeans', 'ward', 'complete'
            or 'average'.

    Returns:
        A list (one entry per run) of predicted-label lists, ordered by
        original row index.

    Raises:
        ValueError: if singleName names an unsupported clusterer.
    """
    predicted_labelAll = []  # predicted labels for the sampled subsets
    for i in range(len(sampledData)):
        # Draw the cluster count uniformly from [2, 11] so that the
        # ensemble members differ from one another.
        clusters = random.randint(2, 11)
        if singleName == 'kmeans':
            predicted_label = KMeans(n_clusters=clusters).fit_predict(
                sampledData[i])
        elif singleName in ('ward', 'complete', 'average'):
            predicted_label = AgglomerativeClustering(
                linkage=singleName,
                n_clusters=clusters).fit_predict(sampledData[i])
        else:
            # Previously an unknown name left predicted_label unbound and
            # crashed later with a NameError; fail fast instead.
            raise ValueError('unknown single clusterer: %r' % (singleName,))
        predicted_labelAll.append(predicted_label.tolist())

    # Nearest-neighbour labels for the non-sampled rows: each remained
    # row takes the label of its closest sampled row (per distEclud).
    # remainedData and sampledData have the same number of runs, so j may
    # range over either.
    assinALLNnLabels = []
    for j in range(len(remainedData)):
        assinNnLabels = []
        for m in range(len(remainedData[j])):
            minDist = inf
            minindex = -1
            for s in range(len(sampledData[j])):
                distJI = distEclud(remainedData[j][m], sampledData[j][s])
                if distJI < minDist:
                    minDist = distJI
                    minindex = s
            assinNnLabels.append(predicted_labelAll[j][minindex])
        assinALLNnLabels.append(assinNnLabels)

    # Concatenate sampled + remained indices and labels for each run.
    combineIndex = []
    combinedLables = []
    for column in range(len(predicted_labelAll)):
        combineIndex.append(sampledIndex[column] + remainedIndex[column])
        combinedLables.append(
            predicted_labelAll[column] + assinALLNnLabels[column])

    # Recover ascending original order. The previous implementation
    # re-scanned the whole index list once per target value (O(n^2));
    # a value -> positions map yields the same output in O(n).
    total = len(sampledData[0]) + len(remainedData[0])
    seqIndexAll = []
    for combineIndex1 in combineIndex:
        positions = {}
        for elementIndex, value in enumerate(combineIndex1):
            positions.setdefault(value, []).append(elementIndex)
        seqIndex = []
        for seq in range(total):
            seqIndex.extend(positions.get(seq, []))
        seqIndexAll.append(seqIndex)

    # Emit the merged labels in original row order: the final clustering
    # result for each ensemble member.
    finalLabel = []
    for finalIndex in range(len(combinedLables)):
        finalLabel.append([combinedLables[finalIndex][index]
                           for index in seqIndexAll[finalIndex]])
    return finalLabel
# Example 2
def fsrsnn(sampledData, remainedData, sampledIndex, remainedIndex,
           sampledDataFs, k):
    """Feature-selected variant of rsnn: cluster each feature-selected
    sampled subset with KMeans, label the remaining rows by their
    nearest sampled neighbour, then merge both label sets back into the
    original row order.

    Args:
        sampledData: per-run list of sampled data subsets (full features),
            used for the nearest-neighbour assignment.
        remainedData: per-run list of the complementary (non-sampled) rows.
        sampledIndex: per-run original row indices of the sampled rows.
        remainedIndex: per-run original row indices of the remained rows.
        sampledDataFs: per-run feature-selected sampled subsets actually
            fed to KMeans.
        k: number of true class labels; k_range(k) derives the cluster
            count range used for the random restarts.

    Returns:
        A list (one entry per run) of predicted-label lists, ordered by
        original row index.
    """
    # Derive the experimental cluster-count range from the true label count.
    min_clusters, max_clusters = k_range(k)
    predicted_labelAll = []  # predicted labels for the sampled subsets
    for i in range(len(sampledData)):
        clusters = random.randint(min_clusters, max_clusters)
        predicted_label = KMeans(n_clusters=clusters).fit_predict(
            sampledDataFs[i])
        predicted_labelAll.append(predicted_label.tolist())

    # Nearest-neighbour labels for the non-sampled rows: each remained
    # row takes the label of its closest sampled row (per distEclud).
    # remainedData and sampledData have the same number of runs, so j may
    # range over either. NOTE: the inner loop variable was previously
    # named `k`, shadowing (and clobbering) the parameter `k`; renamed.
    assinALLNnLabels = []
    for j in range(len(remainedData)):
        assinNnLabels = []
        for m in range(len(remainedData[j])):
            minDist = inf
            minindex = -1
            for s in range(len(sampledData[j])):
                # Distance between the remained point and a sampled point.
                distJI = distEclud(remainedData[j][m], sampledData[j][s])
                if distJI < minDist:
                    minDist = distJI
                    minindex = s
            assinNnLabels.append(predicted_labelAll[j][minindex])
        assinALLNnLabels.append(assinNnLabels)

    # Concatenate sampled + remained indices and labels for each run.
    combineIndex = []
    combinedLables = []
    for column in range(len(predicted_labelAll)):
        combineIndex.append(sampledIndex[column] + remainedIndex[column])
        combinedLables.append(
            predicted_labelAll[column] + assinALLNnLabels[column])

    # Recover ascending original order. The previous implementation
    # re-scanned the whole index list once per target value (O(n^2));
    # a value -> positions map yields the same output in O(n).
    total = len(sampledData[0]) + len(remainedData[0])
    seqIndexAll = []
    for combineIndex1 in combineIndex:
        positions = {}
        for elementIndex, value in enumerate(combineIndex1):
            positions.setdefault(value, []).append(elementIndex)
        seqIndex = []
        for seq in range(total):
            seqIndex.extend(positions.get(seq, []))
        seqIndexAll.append(seqIndex)

    # Emit the merged labels in original row order: the final clustering
    # result for each ensemble member.
    finalLabel = []
    for finalIndex in range(len(combinedLables)):
        finalLabel.append([combinedLables[finalIndex][index]
                           for index in seqIndexAll[finalIndex]])
    return finalLabel
# Example 3
def initialMultiRun(data, times, singleName):
    """Produce `times` base clusterings of `data`, each with a random
    cluster count, using the named base clusterer.

    Args:
        data: dataset to cluster.
        times: number of independent clustering runs.
        singleName: 'kmeans', 'ward', 'average' or 'complete'.

    Returns:
        List of `times` predicted-label lists.

    Raises:
        ValueError: if singleName is not a supported clusterer.
    """
    predicted_labelAll = []
    for _ in range(times):
        # Random cluster count in [2, 11] to diversify the ensemble.
        clusters = random.randint(2, 11)
        if singleName == "kmeans":
            predicted_label = KMeans(n_clusters=clusters).fit_predict(data)
        elif singleName in ('ward', 'average', 'complete'):
            predicted_label = AgglomerativeClustering(
                linkage=singleName, n_clusters=clusters).fit_predict(data)
        else:
            # Previously an unknown name left predicted_label unbound
            # (NameError on first iteration, or silently reused the
            # previous run's labels afterwards); reject it explicitly.
            raise ValueError('unknown single clusterer: %r' % (singleName,))
        predicted_labelAll.append(predicted_label.tolist())
    return predicted_labelAll
# Example 4
def main():
    """Cluster the iris dataset 10 times with a random cluster count and
    report the best NMI / ARI scores against the true labels."""
    datamat, datalabels = loadDataset("../dataset/iris.data")
    # Was Python 2 `print 'data ready'` — a SyntaxError under the
    # Python 3 the rest of this file targets.
    print('data ready')
    nmi_max = -inf
    ari_max = -inf
    for _ in range(10):
        clusters = random.randint(2, 11)
        predicted_label = KMeans(n_clusters=clusters).fit_predict(datamat)
        predicted_label = predicted_label.tolist()
        nmi = normalized_mutual_info_score(datalabels, predicted_label)
        ari = adjusted_rand_score(datalabels, predicted_label)
        # Keep the best score seen across the random restarts.
        nmi_max = max(nmi_max, nmi)
        ari_max = max(ari_max, ari)
    print('nmi值为:')
    print(nmi_max)
    print('ari值为:')
    print(ari_max)