def load():
    data = genfromtxt('../dataset/' + dataFile + '.csv', delimiter=',')
    label = genfromtxt('../dataset/' + labelFile + '.csv', delimiter=',')
    res = pickle.load(open(FN, 'rb'))
    AE = res['autoencoder']
    encodedX = AE.encoder(AE.X)
    X = encodedX.data.numpy()
    # X = preprocessing.scale(encodedX.data.numpy())
    d_matrix = sklearn.metrics.pairwise.pairwise_distances(X, Y=None, metric='euclidean')
    s = np.median(d_matrix)
    Vgamma = 1 / (2 * s * s)
    spAlloc = SpectralClustering(2, gamma=Vgamma).fit_predict(X)
    nmi_sp = np.around(normalized_mutual_info_score(label, spAlloc), 3)
    kmAlloc = KMeans(2).fit_predict(X)
    nmi_km = np.around(normalized_mutual_info_score(label, kmAlloc), 3)
    print X
    print nmi_sp
    print nmi_km
    print res['loss']
    # print res['autoencoder']
    txt = dataFile + ' nmiSP : ' + str(nmi_sp) + ' , nmiKM : ' + str(nmi_km) + \
          ' , num_of_layers:' + str(num_of_layers) + ' , num_of_output:' + str(num_of_output) + '\n'
    fin = open('auto_out.txt', 'a')
    fin.write(txt)
    fin.close()
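# A minimal, hedged sketch of the pattern above: pick the RBF gamma for
# SpectralClustering from the median pairwise distance (gamma = 1 / (2 * s^2))
# and score the result with NMI. The synthetic data and cluster count below
# are illustrative assumptions, not taken from the snippet above.
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.cluster import SpectralClustering
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics.pairwise import pairwise_distances

X_demo, y_demo = make_blobs(n_samples=200, centers=2, random_state=0)
s = np.median(pairwise_distances(X_demo))      # median-distance heuristic for the bandwidth
gamma = 1.0 / (2.0 * s * s)
pred = SpectralClustering(n_clusters=2, gamma=gamma, random_state=0).fit_predict(X_demo)
print(normalized_mutual_info_score(y_demo, pred))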
def test_exactly_zero_info_score(): """Check numerical stability when information is exactly zero""" for i in np.logspace(1, 4, 4).astype(np.int): labels_a, labels_b = np.ones(i, dtype=np.int), np.arange(i, dtype=np.int) assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0) assert_equal(v_measure_score(labels_a, labels_b), 0.0) assert_equal(adjusted_mutual_info_score(labels_a, labels_b), 0.0) assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
def test_exactly_zero_info_score():
    # Check numerical stability when information is exactly zero
    for i in np.logspace(1, 4, 4).astype(np.int):
        labels_a, labels_b = (np.ones(i, dtype=np.int),
                              np.arange(i, dtype=np.int))
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(v_measure_score(labels_a, labels_b), 0.0)
        assert_equal(adjusted_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
        for method in ["min", "geometric", "arithmetic", "max"]:
            assert adjusted_mutual_info_score(labels_a, labels_b, method) == 0.0
            assert normalized_mutual_info_score(labels_a, labels_b, method) == 0.0
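# Hedged note on the averaging parameter swept in the loop above: in recent
# scikit-learn (0.20+) the averaging scheme is selected with the keyword
# argument average_method, and passing it positionally may be rejected in
# newer releases. A minimal sketch of the keyword form:
import numpy as np
from sklearn.metrics import (normalized_mutual_info_score,
                             adjusted_mutual_info_score)

a = np.array([0, 0, 1, 1])
b = np.array([0, 1, 0, 1])
for method in ("min", "geometric", "arithmetic", "max"):
    nmi = normalized_mutual_info_score(a, b, average_method=method)
    ami = adjusted_mutual_info_score(a, b, average_method=method)
    print(method, nmi, ami)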
def test_single_linkage_clustering():
    # Check that we get the correct result in two emblematic cases
    moons, moon_labels = make_moons(noise=0.05, random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(moons)
    assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                     moon_labels), 1)

    circles, circle_labels = make_circles(factor=0.5, noise=0.025,
                                          random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(circles)
    assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                     circle_labels), 1)
def evaluation(X_selected, n_clusters, y):
    """
    This function calculates ACC and NMI of clustering results

    Input
    -----
    X_selected: {numpy array}, shape (n_samples, n_selected_features)
        input data on the selected features
    n_clusters: {int}
        number of clusters
    y: {numpy array}, shape (n_samples,)
        true labels

    Output
    ------
    nmi: {float}
        Normalized Mutual Information
    acc: {float}
        Accuracy
    """
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10,
                     max_iter=300, tol=0.0001, precompute_distances=True,
                     verbose=0, random_state=None, copy_x=True, n_jobs=1)
    k_means.fit(X_selected)
    y_predict = k_means.labels_

    # calculate NMI
    nmi = normalized_mutual_info_score(y, y_predict)

    # calculate ACC
    y_permuted_predict = best_map(y, y_predict)
    acc = accuracy_score(y, y_permuted_predict)

    return nmi, acc
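# The evaluation above relies on a best_map helper that is not shown. A
# minimal sketch of one common implementation, assuming integer labels and
# that both labelings use the same number of clusters; it permutes predicted
# cluster ids onto the ground truth with the Hungarian assignment from SciPy
# (the function name is illustrative, not the original helper):
import numpy as np
from scipy.optimize import linear_sum_assignment

def best_map_sketch(y_true, y_pred):
    """Relabel y_pred so it overlaps y_true as much as possible."""
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    true_ids = np.unique(y_true)
    pred_ids = np.unique(y_pred)
    # contingency counts: rows = predicted clusters, columns = true classes
    overlap = np.zeros((len(pred_ids), len(true_ids)), dtype=np.int64)
    for i, p in enumerate(pred_ids):
        for j, t in enumerate(true_ids):
            overlap[i, j] = np.sum((y_pred == p) & (y_true == t))
    rows, cols = linear_sum_assignment(-overlap)   # maximize total overlap
    mapping = {pred_ids[r]: true_ids[c] for r, c in zip(rows, cols)}
    return np.array([mapping[p] for p in y_pred])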
def __eval_lda_clustering(lda_model, mm_corpus, gold_labels):
    # lda_model = gensim.models.ldamodel.LdaModel.load(model_file)
    sys_labels = list()
    for i, doc in enumerate(mm_corpus):
        topic_dist = lda_model[doc]
        # print topic_dist
        cluster_idx = 0
        max_dist = 0
        for tup in topic_dist:
            if tup[1] > max_dist:
                cluster_idx = tup[0]
                max_dist = tup[1]
        sys_labels.append(cluster_idx)
        if len(sys_labels) % 5000 == 0:
            print len(sys_labels)
        # if i > 10:
        #     break
    # print len(sys_labels)
    # print len(gold_labels)
    nmi_score = normalized_mutual_info_score(gold_labels, sys_labels)
    purity_score = purity(gold_labels, sys_labels)
    ri_score = rand_index(gold_labels, sys_labels)
    # print 'NMI: %f' % normalized_mutual_info_score(gold_labels, sys_labels)
    # print 'Purity: %f' % purity(gold_labels, sys_labels)
    # print 'Accuracy: %f' % cluster_accuracy(gold_labels, sys_labels)
    print 'NMI: %f Purity: %f Rand index: %f' % (nmi_score, purity_score, ri_score)
    return nmi_score, purity_score, ri_score
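# purity() and rand_index() are project helpers that are not shown here. A
# minimal sketch of cluster purity, assuming non-negative integer gold labels
# (the helper name is illustrative, not the original implementation):
import numpy as np

def purity_sketch(gold_labels, sys_labels):
    """Each cluster votes for its majority gold label; purity is the hit rate."""
    gold = np.asarray(gold_labels)
    sys = np.asarray(sys_labels)
    hits = 0
    for c in np.unique(sys):
        members = gold[sys == c]
        hits += np.bincount(members).max()   # size of the dominant class in cluster c
    return hits / float(gold.size)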
def NMI(groundTruth, predictionResult): oneListGT = _labelPreprocessing(groundTruth) oneListPR = _labelPreprocessing(predictionResult) return normalized_mutual_info_score(oneListGT, oneListPR) # This show how NMI can produce high result while it should not # I5SIM3DatasetTrueClassification = [[1, 22, 52, 67, 84, 88, 106, 124, 138, 156, 167, 172, 204, 228, 240, 245, 256, 283, 313, 322, 337, 355, 367, 375, 380, 382, 405, 421, 422, 449, 451, 452, 464, 468, 469, 519, 520, 539, 566, 596, 612, 627, 628, 642, 656, 683, 718, 780, 808, 817, 830, 831, 833, 835, 852, 853, 854, 870, 876, 878, 927, 948, 952, 958, 968, 972, 976, 1005, 1016, 1024, 1058, 1108, 1122, 1123, 1149,1152, 1190, 1217, 1236, 1243, 1244, 1257, 1260, 1325, 1331, 1346, 1348, 1375, 1382, 1390, 1393, 1416, 1433, 1445, 1451, 1491, 1521, 1557, 1580, 1588], # [36, 71,76, 80, 85, 155, 157, 171, 182, 211, 215, 224, 237, 238, 239, 292, 298, 311, 315, 329, 342, 361, 370, 384, 403, 415, 416, 419, 425, 437, 483, 485, 488, 497, 522, 528, 545, 561, 569, 571, 572, 574, 621, 645, 653, 664, 674, 699, 712, 732, 734, 740, 755, 773, 774, 802, 828, 841, 846, 872, 896, 903, 906, 940, 991, 1006, 1013, 1042, 1074, 1085, 1100, 1125, 1148, 1170, 1179, 1180, 1203, 1237, 1239, 1252, 1256, 1274, 1279, 1289, 1297, 1307, 1353, 1358, 1383, 1415, 1417, 1431, 1438,1449, 1457, 1506, 1542, 1544, 1547, 1579], # [2, 8, 18, 32, 45, 59, 64, 79, 86, 96, 103, 110, 140, 166, 174, 199, 201, 202, 214, 235, 242, 247, 250, 262, 273, 278, 279, 286, 290, 326, 344, 345, 350, 363, 364, 424, 426, 427, 447, 470, 493, 501, 523, 526, 548, 563, 586, 599, 601, 602, 605, 632, 643, 661, 662, 677, 694, 702, 706, 751, 768, 815, 832, 871, 892, 930, 932, 934, 973, 984, 994, 1010, 1064,1080, 1115, 1133, 1136, 1166, 1173, 1258, 1298, 1302, 1313, 1319, 1320, 1321, 1326, 1328, 1333, 1341, 1349, 1385, 1403, 1478, 1489, 1512, 1556, 1559, 1565, 1572], # [12, 25, 29, 97, 102, 107, 116, 119, 132, 141, 158, 178, 181, 252, 253, 309,332, 356, 359, 373, 374, 393, 402, 420, 433, 438, 448, 459, 472, 480, 482, 491,515, 517, 529, 530, 543, 581, 618, 623, 636, 650, 676, 687, 727, 728, 737, 742,787, 793, 807, 857, 858, 863, 882, 904, 907, 957, 969, 970, 971, 975, 1000, 1007, 1011, 1043, 1050, 1056, 1069, 1112, 1141, 1145, 1147, 1151, 1157, 1163, 1172,1246, 1316, 1335, 1352, 1360, 1376, 1387, 1399, 1412, 1468, 1505, 1509, 1510, 1513, 1514, 1517, 1529, 1567, 1576, 1578, 1581, 1583, 1591], # [6, 21, 23, 34, 42, 53, 77, 90, 154, 168, 195, 264, 267, 269, 282, 284, 335, 336, 352, 354, 358, 379,400, 434, 444, 453, 463, 509, 511, 554, 556, 559, 580, 591, 594, 595, 615, 640,707, 714, 749, 756, 763, 764, 765, 792, 821, 843, 855, 860, 875, 894, 897, 920,954, 1002, 1012, 1025, 1028, 1057, 1076, 1083, 1109, 1127, 1130, 1134, 1144, 1153, 1160, 1194, 1195, 1198, 1207, 1209, 1242, 1245, 1262, 1265, 1284, 1373, 1392, 1407, 1429, 1446, 1461, 1470, 1477, 1479, 1485, 1511, 1519, 1520, 1534, 1539,1554, 1560, 1566, 1577, 1590, 1599], # [14, 50, 55, 73, 94, 100, 109, 112, 144, 149, 183, 200, 207, 234, 249, 251, 258, 275, 297, 301, 305, 323, 346, 357, 386, 404, 409, 436, 440, 458, 496, 504, 525, 538, 570, 583, 608, 634, 635, 637, 665, 668, 673, 679, 682, 688, 689, 693, 711, 716, 752, 760, 783, 794, 819, 823, 836, 847, 881, 891, 893, 943, 946, 961, 974, 988, 995, 1001, 1003, 1040, 1075, 1155, 1171, 1228, 1259, 1264, 1269, 1291, 1304, 1305, 1308, 1332, 1343, 1356, 1357, 1398, 1401, 1427, 1428, 1435, 1482, 1483, 1484, 1496, 1533, 1535, 1555, 1574, 1586,1589], # [5, 19, 46, 63, 68, 82, 111, 120, 129, 131, 145, 151, 152, 170, 176, 184,203, 
208, 216, 220, 227, 246, 263, 302, 308, 331, 334, 351, 376, 378, 387, 396,408, 412, 428, 489, 500, 502, 555, 577, 622, 631, 651, 652, 686, 705, 745, 747,782, 798, 827, 874, 884, 885, 928, 937, 939, 956, 983, 1009, 1018, 1030, 1039,1062, 1063, 1070, 1072, 1073, 1084, 1093, 1103, 1107, 1128, 1158, 1168, 1169, 1224, 1227, 1230, 1272, 1315, 1327, 1336, 1388, 1394, 1395, 1400, 1409, 1432, 1447, 1448, 1452, 1453, 1465, 1476, 1503, 1515, 1523, 1525, 1582], # [9, 13, 15, 41, 44, 93, 113, 122, 134, 136, 139, 173, 185, 189, 205, 241, 259, 260, 268, 272, 299, 318, 327, 353, 389, 394, 413, 430, 479, 492, 499, 503, 516, 531, 535, 573, 579, 590, 609, 614, 638, 660, 666, 670, 691, 717, 724, 757, 762, 796, 801, 803, 809, 811, 838, 844, 873, 888, 899, 905, 908, 913, 918, 929, 953, 977, 982, 990, 992, 1020, 1021, 1051, 1094, 1116, 1124, 1164, 1183, 1184, 1199, 1208, 1219, 1226,1303, 1306, 1323, 1334, 1345, 1354, 1364, 1374, 1377, 1402, 1422, 1501, 1531, 1540, 1553, 1585, 1592, 1597], # [3, 20, 26, 39, 40, 57, 70, 78, 87, 92, 126, 133, 143, 147, 153, 160, 190, 198, 206, 210, 254, 270, 280, 295, 314, 321, 338, 339, 347, 360, 365, 399, 401, 429, 435, 471, 475, 487, 490, 505, 506, 507, 565, 600, 624, 625, 649, 654, 659, 675, 709, 713, 719, 722, 766, 775, 784, 790, 820, 849, 887, 914, 938, 941, 960, 962, 999, 1054, 1060, 1095, 1099, 1106, 1139, 1142, 1154, 1165, 1186, 1189, 1197, 1273, 1276, 1295, 1301, 1309, 1314, 1324, 1366, 1368,1379, 1436, 1440, 1450, 1459, 1473, 1500, 1528, 1532, 1558, 1563, 1568], # [0, 4,27, 56, 66, 91, 117, 118, 121, 142, 194, 209, 221, 236, 243, 244, 248, 277, 293,317, 333, 348, 371, 481, 510, 514, 532, 551, 568, 575, 585, 604, 620, 629, 644,681, 700, 704, 720, 726, 743, 748, 770, 779, 795, 850, 851, 859, 867, 889, 911,924, 933, 949, 955, 967, 989, 998, 1019, 1052, 1098, 1105, 1117, 1126, 1131, 1132, 1138, 1156, 1162, 1167, 1175, 1176, 1200, 1238, 1268, 1277, 1278, 1283, 1292, 1310, 1330, 1339, 1350, 1372, 1391, 1404, 1406, 1419, 1463, 1466, 1480, 1481,1490, 1498, 1526, 1538, 1549, 1562, 1570, 1598], # [24, 51, 54, 74, 108, 130, 148,186, 196, 226, 230, 261, 281, 294, 304, 307, 349, 362, 372, 383, 417, 465, 477,537, 546, 553, 560, 578, 592, 593, 611, 613, 616, 619, 630, 658, 669, 685, 692,696, 791, 805, 806, 834, 837, 840, 845, 856, 868, 869, 898, 915, 945, 951, 964,987, 1004, 1008, 1015, 1036, 1041, 1045, 1071, 1081, 1082, 1088, 1111, 1119, 1120, 1135, 1143, 1159, 1174, 1177, 1193, 1202, 1205, 1221, 1248, 1253, 1280, 1281, 1290, 1293, 1311, 1312, 1340, 1378, 1397, 1405, 1474, 1475, 1486, 1487, 1516,1518, 1537, 1552, 1573, 1596], # [16, 28, 37, 104, 128, 159, 164, 175, 187, 188, 212, 217, 223, 255, 312, 341, 343, 392, 397, 398, 406, 410, 418, 454, 455, 461, 462, 478, 494, 495, 512, 540, 550, 558, 597, 626, 633, 729, 735, 741, 750, 761, 781, 797, 799, 814, 822, 824, 879, 900, 910, 966, 979, 981, 993, 996, 1037, 1044,1046, 1053, 1061, 1065, 1077, 1079, 1096, 1097, 1113, 1121, 1146, 1150, 1182, 1185, 1196, 1210, 1235, 1241, 1254, 1263, 1275, 1285, 1287, 1338, 1355, 1359, 1363, 1380, 1381, 1396, 1437, 1441, 1467, 1493, 1494, 1495, 1522, 1541, 1546, 1551,1564, 1575], # [11, 17, 31, 48, 75, 89, 95, 98, 115, 123, 125, 135, 137, 161, 179, 191, 219, 222, 257, 266, 276, 291, 300, 310, 320, 324, 411, 473, 476, 498, 518, 534, 557, 576, 582, 587, 639, 646, 648, 663, 701, 710, 723, 731, 744, 759, 772, 776, 812, 813, 816, 818, 839, 861, 864, 866, 883, 886, 916, 921, 922, 965, 980, 985, 997, 1031, 1032, 1038, 1047, 1059, 1114, 1137, 1191, 1201, 1206, 1215, 1218, 1223, 1234, 1249, 1251, 1266, 1288, 1317, 
1318, 1370, 1371, 1414, 1420, 1430, 1439, 1444, 1460, 1464, 1507, 1508, 1524, 1536, 1543, 1548], # [7, 10, 35, 58, 61, 69, 99, 146, 163, 165, 192, 213, 231, 233, 274, 287, 328, 330, 366, 377, 390,395, 445, 446, 450, 456, 460, 508, 536, 541, 547, 549, 564, 567, 598, 606, 617,657, 671, 672, 695, 703, 725, 733, 736, 754, 769, 771, 778, 785, 786, 800, 810,829, 865, 877, 880, 909, 919, 931, 935, 942, 963, 986, 1023, 1026, 1055, 1068,1086, 1089, 1101, 1102, 1104, 1110, 1140, 1181, 1212, 1229, 1267, 1270, 1286, 1322, 1337, 1347, 1361, 1362, 1369, 1384, 1411, 1413, 1423, 1426, 1456, 1469, 1472, 1488, 1499, 1530, 1561, 1595], # [38, 43, 60, 62, 101, 225, 229, 232, 285, 288,289, 306, 316, 319, 340, 368, 381, 423, 439, 441, 457, 467, 474, 521, 533, 542,544, 552, 562, 588, 589, 603, 610, 641, 647, 698, 708, 721, 730, 746, 753, 767,788, 825, 862, 890, 895, 902, 912, 917, 925, 926, 947, 959, 1017, 1022, 1029, 1048, 1049, 1066, 1078, 1091, 1092, 1129, 1187, 1192, 1204, 1213, 1216, 1222, 1225, 1231, 1232, 1233, 1247, 1250, 1261, 1271, 1294, 1296, 1299, 1344, 1367, 1389,1410, 1421, 1424, 1425, 1434, 1442, 1443, 1454, 1455, 1458, 1471, 1492, 1550, 1584, 1587, 1593], # [30, 33, 47, 49, 65, 72, 81, 83, 105, 114, 127, 150, 162, 169,177, 180, 193, 197, 218, 265, 271, 296, 303, 325, 369, 385, 388, 391, 407, 414,431, 432, 442, 443, 466, 484, 486, 513, 524, 527, 584, 607, 655, 667, 678, 680,684, 690, 697, 715, 738, 739, 758, 777, 789, 804, 826, 842, 848, 901, 923, 936,944, 950, 978, 1014, 1027, 1033, 1034, 1035, 1067, 1087, 1090, 1118, 1161, 1178,1188, 1211, 1214, 1220, 1240, 1255, 1282, 1300, 1329, 1342, 1351, 1365, 1386, 1408, 1418, 1462, 1497, 1502, 1504, 1527, 1545, 1569, 1571, 1594]] # I5SIMTestDatasetTrueClassification = [[1, 22, 52, 67, 84, 88, 106, 124, 138, 156, 167, 172, 204, 228, 240, 245, 256, 283, 313, 322, 337, 355, 367, 375, 380, 382, 405, 421, 422, 449, 451, 452, 464, 468, 469, 519, 520, 539, 566, 596, 612, 627, 628, 642, 656, 683, 718, 780, 808, 817, 830, 831, 833, 835, 852, 853, 854, 870, 876, 878, 927, 948, 952, 958, 968, 972, 976, 1005, 1016, 1024, 1058, 1108, 1122, 1123, 1149,1152, 1190, 1217, 1236, 1243, 1244, 1257, 1260, 1325, 1331, 1346, 1348, 1375, 1382, 1390, 1393, 1416, 1433, 1445, 1451, 1491, 1521, 1557, 1580, 1588,36, 71,76, 80, 85, 155, 157, 171, 182, 211, 215, 224, 237, 238, 239, 292, 298, 311, 315, 329, 342, 361, 370, 384, 403, 415, 416, 419, 425, 437, 483, 485, 488, 497, 522, 528, 545, 561, 569, 571, 572, 574, 621, 645, 653, 664, 674, 699, 712, 732, 734, 740, 755, 773, 774, 802, 828, 841, 846, 872, 896, 903, 906, 940, 991, 1006, 1013, 1042, 1074, 1085, 1100, 1125, 1148, 1170, 1179, 1180, 1203, 1237, 1239, 1252, 1256, 1274, 1279, 1289, 1297, 1307, 1353, 1358, 1383, 1415, 1417, 1431, 1438,1449, 1457, 1506, 1542, 1544, 1547, 1579], # [2, 8, 18, 32, 45, 59, 64, 79, 86, 96, 103, 110, 140, 166, 174, 199, 201, 202, 214, 235, 242, 247, 250, 262, 273, 278, 279, 286, 290, 326, 344, 345, 350, 363, 364, 424, 426, 427, 447, 470, 493, 501, 523, 526, 548, 563, 586, 599, 601, 602, 605, 632, 643, 661, 662, 677, 694, 702, 706, 751, 768, 815, 832, 871, 892, 930, 932, 934, 973, 984, 994, 1010, 1064,1080, 1115, 1133, 1136, 1166, 1173, 1258, 1298, 1302, 1313, 1319, 1320, 1321, 1326, 1328, 1333, 1341, 1349, 1385, 1403, 1478, 1489, 1512, 1556, 1559, 1565, 1572, 12, 25, 29, 97, 102, 107, 116, 119, 132, 141, 158, 178, 181, 252, 253, 309,332, 356, 359, 373, 374, 393, 402, 420, 433, 438, 448, 459, 472, 480, 482, 491,515, 517, 529, 530, 543, 581, 618, 623, 636, 650, 676, 687, 727, 728, 737, 742,787, 793, 807, 
857, 858, 863, 882, 904, 907, 957, 969, 970, 971, 975, 1000, 1007, 1011, 1043, 1050, 1056, 1069, 1112, 1141, 1145, 1147, 1151, 1157, 1163, 1172,1246, 1316, 1335, 1352, 1360, 1376, 1387, 1399, 1412, 1468, 1505, 1509, 1510, 1513, 1514, 1517, 1529, 1567, 1576, 1578, 1581, 1583, 1591], # [6, 21, 23, 34, 42, 53, 77, 90, 154, 168, 195, 264, 267, 269, 282, 284, 335, 336, 352, 354, 358, 379,400, 434, 444, 453, 463, 509, 511, 554, 556, 559, 580, 591, 594, 595, 615, 640,707, 714, 749, 756, 763, 764, 765, 792, 821, 843, 855, 860, 875, 894, 897, 920,954, 1002, 1012, 1025, 1028, 1057, 1076, 1083, 1109, 1127, 1130, 1134, 1144, 1153, 1160, 1194, 1195, 1198, 1207, 1209, 1242, 1245, 1262, 1265, 1284, 1373, 1392, 1407, 1429, 1446, 1461, 1470, 1477, 1479, 1485, 1511, 1519, 1520, 1534, 1539,1554, 1560, 1566, 1577, 1590, 1599, 14, 50, 55, 73, 94, 100, 109, 112, 144, 149, 183, 200, 207, 234, 249, 251, 258, 275, 297, 301, 305, 323, 346, 357, 386, 404, 409, 436, 440, 458, 496, 504, 525, 538, 570, 583, 608, 634, 635, 637, 665, 668, 673, 679, 682, 688, 689, 693, 711, 716, 752, 760, 783, 794, 819, 823, 836, 847, 881, 891, 893, 943, 946, 961, 974, 988, 995, 1001, 1003, 1040, 1075, 1155, 1171, 1228, 1259, 1264, 1269, 1291, 1304, 1305, 1308, 1332, 1343, 1356, 1357, 1398, 1401, 1427, 1428, 1435, 1482, 1483, 1484, 1496, 1533, 1535, 1555, 1574, 1586,1589], # [5, 19, 46, 63, 68, 82, 111, 120, 129, 131, 145, 151, 152, 170, 176, 184,203, 208, 216, 220, 227, 246, 263, 302, 308, 331, 334, 351, 376, 378, 387, 396,408, 412, 428, 489, 500, 502, 555, 577, 622, 631, 651, 652, 686, 705, 745, 747,782, 798, 827, 874, 884, 885, 928, 937, 939, 956, 983, 1009, 1018, 1030, 1039,1062, 1063, 1070, 1072, 1073, 1084, 1093, 1103, 1107, 1128, 1158, 1168, 1169, 1224, 1227, 1230, 1272, 1315, 1327, 1336, 1388, 1394, 1395, 1400, 1409, 1432, 1447, 1448, 1452, 1453, 1465, 1476, 1503, 1515, 1523, 1525, 1582, 9, 13, 15, 41, 44, 93, 113, 122, 134, 136, 139, 173, 185, 189, 205, 241, 259, 260, 268, 272, 299, 318, 327, 353, 389, 394, 413, 430, 479, 492, 499, 503, 516, 531, 535, 573, 579, 590, 609, 614, 638, 660, 666, 670, 691, 717, 724, 757, 762, 796, 801, 803, 809, 811, 838, 844, 873, 888, 899, 905, 908, 913, 918, 929, 953, 977, 982, 990, 992, 1020, 1021, 1051, 1094, 1116, 1124, 1164, 1183, 1184, 1199, 1208, 1219, 1226,1303, 1306, 1323, 1334, 1345, 1354, 1364, 1374, 1377, 1402, 1422, 1501, 1531, 1540, 1553, 1585, 1592, 1597], # [3, 20, 26, 39, 40, 57, 70, 78, 87, 92, 126, 133, 143, 147, 153, 160, 190, 198, 206, 210, 254, 270, 280, 295, 314, 321, 338, 339, 347, 360, 365, 399, 401, 429, 435, 471, 475, 487, 490, 505, 506, 507, 565, 600, 624, 625, 649, 654, 659, 675, 709, 713, 719, 722, 766, 775, 784, 790, 820, 849, 887, 914, 938, 941, 960, 962, 999, 1054, 1060, 1095, 1099, 1106, 1139, 1142, 1154, 1165, 1186, 1189, 1197, 1273, 1276, 1295, 1301, 1309, 1314, 1324, 1366, 1368,1379, 1436, 1440, 1450, 1459, 1473, 1500, 1528, 1532, 1558, 1563, 1568, 0, 4,27, 56, 66, 91, 117, 118, 121, 142, 194, 209, 221, 236, 243, 244, 248, 277, 293,317, 333, 348, 371, 481, 510, 514, 532, 551, 568, 575, 585, 604, 620, 629, 644,681, 700, 704, 720, 726, 743, 748, 770, 779, 795, 850, 851, 859, 867, 889, 911,924, 933, 949, 955, 967, 989, 998, 1019, 1052, 1098, 1105, 1117, 1126, 1131, 1132, 1138, 1156, 1162, 1167, 1175, 1176, 1200, 1238, 1268, 1277, 1278, 1283, 1292, 1310, 1330, 1339, 1350, 1372, 1391, 1404, 1406, 1419, 1463, 1466, 1480, 1481,1490, 1498, 1526, 1538, 1549, 1562, 1570, 1598], # [24, 51, 54, 74, 108, 130, 148,186, 196, 226, 230, 261, 281, 294, 304, 307, 349, 362, 372, 383, 
417, 465, 477,537, 546, 553, 560, 578, 592, 593, 611, 613, 616, 619, 630, 658, 669, 685, 692,696, 791, 805, 806, 834, 837, 840, 845, 856, 868, 869, 898, 915, 945, 951, 964,987, 1004, 1008, 1015, 1036, 1041, 1045, 1071, 1081, 1082, 1088, 1111, 1119, 1120, 1135, 1143, 1159, 1174, 1177, 1193, 1202, 1205, 1221, 1248, 1253, 1280, 1281, 1290, 1293, 1311, 1312, 1340, 1378, 1397, 1405, 1474, 1475, 1486, 1487, 1516,1518, 1537, 1552, 1573, 1596, 16, 28, 37, 104, 128, 159, 164, 175, 187, 188, 212, 217, 223, 255, 312, 341, 343, 392, 397, 398, 406, 410, 418, 454, 455, 461, 462, 478, 494, 495, 512, 540, 550, 558, 597, 626, 633, 729, 735, 741, 750, 761, 781, 797, 799, 814, 822, 824, 879, 900, 910, 966, 979, 981, 993, 996, 1037, 1044,1046, 1053, 1061, 1065, 1077, 1079, 1096, 1097, 1113, 1121, 1146, 1150, 1182, 1185, 1196, 1210, 1235, 1241, 1254, 1263, 1275, 1285, 1287, 1338, 1355, 1359, 1363, 1380, 1381, 1396, 1437, 1441, 1467, 1493, 1494, 1495, 1522, 1541, 1546, 1551,1564, 1575], # [11, 17, 31, 48, 75, 89, 95, 98, 115, 123, 125, 135, 137, 161, 179, 191, 219, 222, 257, 266, 276, 291, 300, 310, 320, 324, 411, 473, 476, 498, 518, 534, 557, 576, 582, 587, 639, 646, 648, 663, 701, 710, 723, 731, 744, 759, 772, 776, 812, 813, 816, 818, 839, 861, 864, 866, 883, 886, 916, 921, 922, 965, 980, 985, 997, 1031, 1032, 1038, 1047, 1059, 1114, 1137, 1191, 1201, 1206, 1215, 1218, 1223, 1234, 1249, 1251, 1266, 1288, 1317, 1318, 1370, 1371, 1414, 1420, 1430, 1439, 1444, 1460, 1464, 1507, 1508, 1524, 1536, 1543, 1548, 7, 10, 35, 58, 61, 69, 99, 146, 163, 165, 192, 213, 231, 233, 274, 287, 328, 330, 366, 377, 390,395, 445, 446, 450, 456, 460, 508, 536, 541, 547, 549, 564, 567, 598, 606, 617,657, 671, 672, 695, 703, 725, 733, 736, 754, 769, 771, 778, 785, 786, 800, 810,829, 865, 877, 880, 909, 919, 931, 935, 942, 963, 986, 1023, 1026, 1055, 1068,1086, 1089, 1101, 1102, 1104, 1110, 1140, 1181, 1212, 1229, 1267, 1270, 1286, 1322, 1337, 1347, 1361, 1362, 1369, 1384, 1411, 1413, 1423, 1426, 1456, 1469, 1472, 1488, 1499, 1530, 1561, 1595], # [38, 43, 60, 62, 101, 225, 229, 232, 285, 288,289, 306, 316, 319, 340, 368, 381, 423, 439, 441, 457, 467, 474, 521, 533, 542,544, 552, 562, 588, 589, 603, 610, 641, 647, 698, 708, 721, 730, 746, 753, 767,788, 825, 862, 890, 895, 902, 912, 917, 925, 926, 947, 959, 1017, 1022, 1029, 1048, 1049, 1066, 1078, 1091, 1092, 1129, 1187, 1192, 1204, 1213, 1216, 1222, 1225, 1231, 1232, 1233, 1247, 1250, 1261, 1271, 1294, 1296, 1299, 1344, 1367, 1389,1410, 1421, 1424, 1425, 1434, 1442, 1443, 1454, 1455, 1458, 1471, 1492, 1550, 1584, 1587, 1593, 30, 33, 47, 49, 65, 72, 81, 83, 105, 114, 127, 150, 162, 169,177, 180, 193, 197, 218, 265, 271, 296, 303, 325, 369, 385, 388, 391, 407, 414,431, 432, 442, 443, 466, 484, 486, 513, 524, 527, 584, 607, 655, 667, 678, 680,684, 690, 697, 715, 738, 739, 758, 777, 789, 804, 826, 842, 848, 901, 923, 936,944, 950, 978, 1014, 1027, 1033, 1034, 1035, 1067, 1087, 1090, 1118, 1161, 1178,1188, 1211, 1214, 1220, 1240, 1255, 1282, 1300, 1329, 1342, 1351, 1365, 1386, 1408, 1418, 1462, 1497, 1502, 1504, 1527, 1545, 1569, 1571, 1594],[],[],[],[],[],[],[],[]] # print NMI(I5SIMTestDatasetTrueClassification,I5SIM3DatasetTrueClassification)
def calcNMI():
    dataset = readARFF()
    subSet = dataset[['class', 'cluster']]
    # print subSet
    NMI = normalized_mutual_info_score(subSet['class'], subSet['cluster'])
    print NMI
def pairwise_MI(data):
    columns = data.columns
    MI_df = pd.DataFrame(index=columns, columns=columns)
    for c1, c2 in combinations(columns, 2):
        cleaned = data[[c1, c2]].dropna()
        MI = normalized_mutual_info_score(cleaned[c1], cleaned[c2])
        MI_df.loc[c1, c2] = MI
        MI_df.loc[c2, c1] = MI
    return MI_df.astype(float)
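# A hedged usage sketch for pairwise_MI above, assuming its own imports
# (pandas as pd, itertools.combinations, normalized_mutual_info_score) are in
# scope; the toy frame below is purely illustrative:
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'a': [0, 0, 1, 1, 0, 1],
    'b': [1, 1, 0, 0, 1, 0],          # perfectly (anti-)aligned with 'a', so NMI = 1
    'c': [0, 1, 0, 1, np.nan, 1],     # NaNs are dropped pairwise
})
print(pairwise_MI(toy))               # symmetric matrix; the diagonal stays NaN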
def main():
    file1 = sys.argv[1]
    file2 = sys.argv[2]
    c_true = {}
    c_pred = {}
    # read data from file
    with open(file1) as fd1, open(file2) as fd2:
        c_true = eval(fd1.readline())
        c_pred = eval(fd2.readline())
    # order the data in dictionary data structure
    c_true_order = collections.OrderedDict(sorted(c_true.items()))
    c_pred_order = collections.OrderedDict(sorted(c_pred.items()))
    c_true_label = []
    c_pred_label = []
    print c_true_order
    # make list with community label
    for k, v in c_true_order.items():
        c_true_label.append(v)
    for k, v in c_pred_order.items():
        c_pred_label.append(v)
    simi = normalized_mutual_info_score(c_true_label, c_pred_label)

    DATA_FILE = sys.argv[3].split("/")
    FILE_LOG_NAME = "LOG_File_" + (DATA_FILE[-1]) + ".xlsx"
    Kcore_Value = int(sys.argv[4])
    if not os.path.exists(FILE_LOG_NAME):
        wb = openpyxl.Workbook()
        sheet = wb.active
        sheet.title = "Sheet1"
        sheet['A1'] = 'K/R Value'
        sheet['B1'] = 'NMI Similarity'
        sheet['A2'] = 'v=10%'
        sheet['A3'] = 'v=20%'
        sheet['A4'] = 'v=30%'
        sheet['A5'] = 'v=40%'
        sheet['A6'] = 'v=50%'
        sheet['A7'] = 'v=60%'
        sheet['A8'] = 'v=70%'
        sheet['A9'] = 'v=80%'
        sheet['A10'] = 'v=90%'
        sheet['A11'] = 'v=100%'
    else:
        wb = openpyxl.load_workbook(FILE_LOG_NAME)
        sheet = wb.get_sheet_by_name('Sheet1')
    sheet['B' + str(Kcore_Value + 1)] = simi
    wb.save(FILE_LOG_NAME)
def get_loss(ckernel_net, data_loader):
    # Compute final average loss
    for idx, (data, target) in enumerate(data_loader):
        data = Variable(data.type(db['dataType']))
        loss = ckernel_net.CAE_compute_loss(data)
        dataOut = ckernel_net(data)
        dataOut = dataOut.cpu().data.numpy()
        allocation = KMeans(10).fit_predict(dataOut)
        nmi = normalized_mutual_info_score(allocation, target.numpy())
        return [loss.cpu().data.numpy()[0], nmi]
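# The loop above scores a learned embedding by clustering it with k-means and
# comparing the assignment to the targets with NMI. A minimal, framework-free
# sketch of that evaluation step, assuming plain numpy arrays (the helper
# name is illustrative):
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score

def embedding_nmi(embedding, targets, n_clusters=10, seed=0):
    """k-means on the embedding, scored against ground-truth labels with NMI."""
    pred = KMeans(n_clusters=n_clusters, random_state=seed).fit_predict(embedding)
    return normalized_mutual_info_score(targets, pred)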
def __eval_lda_clustering_20ng():
    text_doc_file = 'e:/dc/20ng_bydate/twe/docs-nl.txt'
    dict_file = 'e:/dc/20ng_bydate/lda/all-docs.dict'
    mm_file = 'e:/dc/20ng_bydate/lda/all-docs.mm'
    lda_model_file = 'e:/dc/20ng_bydate/lda/lda-model'
    dataset_label_file = 'e:/dc/20ng_bydate/doc_split_labels.bin'
    test_label_file = 'e:/dc/20ng_bydate/test_labels.bin'

    __text_file_to_mm_corpus(text_doc_file, dict_file, mm_file)
    __train_lda_model(dict_file, mm_file, lda_model_file)

    dataset_labels = ioutils.load_labels_file(dataset_label_file)
    lda_model = gensim.models.ldamodel.LdaModel.load(lda_model_file)
    mm_corpus = gensim.corpora.MmCorpus(mm_file)
    sys_labels = list()
    for i, doc in enumerate(mm_corpus):
        if dataset_labels[i] == 0:
            continue
        topic_dist = lda_model[doc]
        # print topic_dist
        cluster_idx = 0
        max_dist = 0
        for tup in topic_dist:
            if tup[1] > max_dist:
                cluster_idx = tup[0]
                max_dist = tup[1]
        sys_labels.append(cluster_idx)
        if len(sys_labels) % 1000 == 0:
            print len(sys_labels)
        # if i > 10:
        #     break
    print len(sys_labels)

    gold_labels = ioutils.load_labels_file(test_label_file)
    print len(gold_labels)
    print normalized_mutual_info_score(gold_labels, sys_labels)
    print cluster_accuracy(gold_labels, sys_labels)
def main():
    file1 = sys.argv[1]
    file2 = sys.argv[2]
    c_true = {}
    c_pred = {}
    # read data from file
    with open(file1) as fd1, open(file2) as fd2:
        c_true = eval(fd1.readline())
        c_pred = eval(fd2.readline())
    # order the data in dictionary data structure
    c_true_order = collections.OrderedDict(sorted(c_true.items()))
    c_pred_order = collections.OrderedDict(sorted(c_pred.items()))
    c_true_label = []
    c_pred_label = []
    # make list with community label
    for k, v in c_true_order.items():
        c_true_label.append(v)
    for k, v in c_pred_order.items():
        c_pred_label.append(v)
    print normalized_mutual_info_score(c_true_label, c_pred_label)
def test_v_measure_and_mutual_information(seed=36):
    # Check relation between v_measure, entropy and mutual information
    for i in np.logspace(1, 4, 4).astype(np.int):
        random_state = np.random.RandomState(seed)
        labels_a, labels_b = (random_state.randint(0, 10, i),
                              random_state.randint(0, 10, i))
        assert_almost_equal(v_measure_score(labels_a, labels_b),
                            2.0 * mutual_info_score(labels_a, labels_b) /
                            (entropy(labels_a) + entropy(labels_b)), 0)
        avg = 'arithmetic'
        assert_almost_equal(v_measure_score(labels_a, labels_b),
                            normalized_mutual_info_score(labels_a, labels_b,
                                                         average_method=avg))
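# A small standalone check of the identity the test above asserts: V-measure
# equals NMI with arithmetic averaging of the two label entropies (assuming a
# scikit-learn version that supports the average_method keyword, 0.20+):
import numpy as np
from sklearn.metrics import v_measure_score, normalized_mutual_info_score

rng = np.random.RandomState(36)
labels_a = rng.randint(0, 10, 1000)
labels_b = rng.randint(0, 10, 1000)
assert np.isclose(v_measure_score(labels_a, labels_b),
                  normalized_mutual_info_score(labels_a, labels_b,
                                               average_method='arithmetic'))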
def evaluate(self, partition, clustered_ids):
    # no class info?
    if not self.has_class_info():
        return {}
    # get two clusterings that we can compare
    n = len(clustered_ids)
    classes_subset = np.zeros(n)
    for row in range(n):
        classes_subset[row] = self.class_map[clustered_ids[row]]
    scores = {}
    scores["external-nmi"] = normalized_mutual_info_score(classes_subset, partition)
    scores["external-ami"] = adjusted_mutual_info_score(classes_subset, partition)
    scores["external-ari"] = adjusted_rand_score(classes_subset, partition)
    return scores
def sklearn_measures(U, V):
    # http://scikit-learn.org/stable/modules/classes.html#clustering-metrics
    import sklearn.metrics.cluster as sym
    U_labels = np.nonzero(U)[1]
    V_labels = np.nonzero(V)[1]
    print U_labels, V_labels
    # V2_labels = np.nonzero(V2)[1]
    print 'entro(U)=', sym.entropy(U_labels), 'entro(V)=', sym.entropy(V_labels), 'entro(U,V)=', sym.mutual_info_score(U_labels, V_labels)
    res = [['ari', 'nmi', 'ami', 'vm'],
           [sym.adjusted_rand_score(U_labels, V_labels),
            sym.normalized_mutual_info_score(U_labels, V_labels),
            sym.adjusted_mutual_info_score(U_labels, V_labels),
            sym.v_measure_score(U_labels, V_labels)]]
    print res
    return res
def checkout_CAE():
    X = pickle.load(open('mnist_60000_validation.pk', "rb"))
    Y = pickle.load(open('mnist_60000_label_validation.pk', "rb"))
    Y = torch.from_numpy(Y)
    kinfo = pickle.load(open('kernel_mnist.p', "rb"))
    cnn = kinfo['kernel_net']

    X_var = Variable(X.type(db['dataType']))
    xout = cnn(X_var)
    xout = xout.cpu().data.numpy()
    allocation = KMeans(10).fit_predict(xout)
    nmi = normalized_mutual_info_score(allocation, Y.numpy())
    print('nmi : %.3f' % nmi)
def bow_kmeans(bow_vecs, gold_labels, num_clusters):
    print 'performing kmeans ...'
    model = KMeans(n_clusters=num_clusters, n_jobs=4, n_init=20)
    model.fit(bow_vecs)
    # print len(gold_labels), 'samples'
    nmi_score = normalized_mutual_info_score(gold_labels, model.labels_)
    purity_score = purity(gold_labels, model.labels_)
    ri_score = rand_index(gold_labels, model.labels_)
    # print 'NMI: %f' % normalized_mutual_info_score(gold_labels, model.labels_)
    # print 'Purity: %f' % purity(gold_labels, model.labels_)
    # print 'Accuracy: %f' % cluster_accuracy(gold_labels, model.labels_)
    print 'NMI: %f Purity: %f Rand index: %f' % (nmi_score, purity_score, ri_score)
    return nmi_score, purity_score, ri_score
def nimSimilarity(c_true, c_pred):
    '''Return the Normalized Mutual Information between two clusterings.

    Parameters:
        c_true, communities detected without kcore, a dictionary with the
            community node as the key and the community label as the value
        c_pred, communities detected with kcore, a dictionary with the
            community node as the key and the community label as the value
    Return:
        nmi
    Example:
        x = {1: 1, 2: 1, 3: 0, 4: 0}
        y = {1: 0, 2: 0, 3: 1, 4: 1}
        print nimSimilarity(x, y)'''
    # put community labels (labels might be duplicated) into lists
    c_true = list(c_true.values())
    # print sorted(c_true)
    c_pred = list(c_pred.values())
    # print sorted(c_pred)
    return normalized_mutual_info_score(c_true, c_pred)
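# The docstring example above is a useful reminder that NMI is invariant to
# relabeling: x and y assign every node to the "opposite" cluster, yet the
# score is 1.0 because the two partitions are identical up to a label swap.
from sklearn.metrics import normalized_mutual_info_score

x = {1: 1, 2: 1, 3: 0, 4: 0}
y = {1: 0, 2: 0, 3: 1, 4: 1}
print(normalized_mutual_info_score(list(x.values()), list(y.values())))  # 1.0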
def mutualinfo(df):
    dfin = df
    Label = dfin['L']
    VALUES = ['sentiment_polarity', 'sentiment_subjectivity', 'absPolarity',
              'Clean tweet', 'L']
    headers_names = list(dfin.columns.values)
    headers_names = [x for x in headers_names if x not in VALUES]
    mutualinfowords = []
    for header in headers_names:
        mutualcolumn = dfin[header]
        mutualvalue = normalized_mutual_info_score(mutualcolumn, Label)
        if mutualvalue > 0.02:
            # print 'mutual info', header, mutualvalue
            mutualinfowords.append(header)
    return mutualinfowords

# mutualinfo(test)
def crossvalidate(profiles, true_group_name, holdout_group_name=None, train=NNClassifier, distance='cosine'): profiles.assert_not_isnan() keys = profiles.keys() true_labels = profiles.regroup(true_group_name) profiles.data = np.array([d for k, d in zip(keys, profiles.data) if tuple(k) in true_labels]) profiles._keys = [k for k in keys if tuple(k) in true_labels] keys = profiles.keys() labels = list(set(true_labels.values())) if holdout_group_name: holdouts = profiles.regroup(holdout_group_name) else: holdouts = dict((k, k) for k in keys) true_indices = [] pred_indices = [] for ho in set(holdouts.values()): test_set_mask = np.array([tuple(holdouts[k]) == ho for k in keys], dtype=bool) training_features = profiles.data[~test_set_mask, :] training_labels = [labels.index(true_labels[tuple(k)]) for k, m in zip(keys, ~test_set_mask) if m] model = train(training_features, training_labels, distance=distance) for k, f, m in zip(keys, profiles.data, test_set_mask): if not m: continue true = true_labels[k] predicted = labels[model.classify(f)] true_indices.append(labels.index(true)) pred_indices.append(labels.index(predicted)) true_indices = np.array(true_indices) pred_indices = np.array(pred_indices) nmi_score = normalized_mutual_info_score(true_indices, pred_indices) ami_score = adjusted_mutual_info_score(true_indices, pred_indices) return nmi_score, ami_score
def process_evaluation(args, model):
    if args['true_row_labels']:
        try:
            with open(args['true_row_labels'], 'r') as f:
                labels = f.read().split()
            from sklearn.metrics.cluster import normalized_mutual_info_score
            from sklearn.metrics.cluster import adjusted_rand_score
            from sklearn.metrics import confusion_matrix
            n = normalized_mutual_info_score(labels, model.row_labels_)
            ari = adjusted_rand_score(labels, model.row_labels_)
            cm = confusion_matrix(labels, model.row_labels_)
            print("nmi ==>" + str(n))
            print("adjusted rand index ==>" + str(ari))
            print()
            print(cm)
        except Exception as e:
            logging.error("--true_row_labels option (evaluation) exception: %s" % e)
def test_identical_points():
    # Ensure identical points are handled correctly when using mst with
    # a sparse connectivity matrix
    X = np.array([[0, 0, 0], [0, 0, 0],
                  [1, 1, 1], [1, 1, 1],
                  [2, 2, 2], [2, 2, 2]])
    true_labels = np.array([0, 0, 1, 1, 2, 2])
    connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)
    connectivity, n_components = _fix_connectivity(X, connectivity, 'euclidean')

    for linkage in ('single', 'average', 'average', 'ward'):
        clustering = AgglomerativeClustering(n_clusters=3,
                                             linkage=linkage,
                                             connectivity=connectivity)
        clustering.fit(X)
        assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                         true_labels), 1)
def feature_selection():
    num_leading_progressions = 900
    num_selected_features = 50
    songs_list = Song.objects.filter(progressions__isnull=False).filter(tags__isnull=False).distinct()
    leading_progressions = get_leading_progressions(num_leading_progressions)
    feature_matrix = get_features_data(leading_progressions, songs_list)
    all_labels_lists = get_label_data(songs_list)
    num_labels = len(all_labels_lists)
    selected_features = [np.empty((num_selected_features), dtype=object)
                         for _i in xrange(num_labels)]
    for l in range(num_labels):
        mi_results = np.zeros(num_leading_progressions)
        for p in range(num_leading_progressions):
            mi = normalized_mutual_info_score(feature_matrix[:, p], all_labels_lists[l])
            mi_results[p] = mi
        highest_mi_feature_indices = np.argsort(mi_results)[num_leading_progressions - num_selected_features:]
        selected_features[l] = [leading_progressions[i] for i in highest_mi_feature_indices]
    pickle.dump(selected_features, open(path_to_selected_progressions, 'w'))
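# The inner loop above ranks candidate features by their NMI with one label
# list and keeps the top k. A minimal generic sketch of that ranking step,
# assuming a dense (n_samples, n_features) array of discrete feature values
# (the function name is illustrative):
import numpy as np
from sklearn.metrics import normalized_mutual_info_score

def top_k_features_by_nmi(feature_matrix, labels, k=50):
    """Indices of the k feature columns with the highest NMI against labels."""
    scores = np.array([
        normalized_mutual_info_score(feature_matrix[:, j], labels)
        for j in range(feature_matrix.shape[1])
    ])
    return np.argsort(scores)[-k:]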
def cluster_and_eval(vec_list, labels, num_clusters):
    if len(labels) < len(vec_list):
        vec_list = vec_list[-len(labels):]
    cl_data = np.asarray(vec_list)
    # print cl_data
    # model = sklearn.cluster.AgglomerativeClustering(n_clusters=5,
    #                                                 linkage="average", affinity="cosine")
    model = sklearn.cluster.KMeans(n_clusters=num_clusters, n_jobs=4, n_init=50)
    model.fit(cl_data)
    # print estimator.labels_
    # print labels[0:100]
    # print model.labels_
    nmi_score = normalized_mutual_info_score(labels, model.labels_)
    purity_score = purity(labels, model.labels_)
    ri_score = rand_index(labels, model.labels_)
    # print len(labels), 'samples'
    print 'NMI: %f Purity: %f Rand index: %f' % (nmi_score, purity_score, ri_score)
    # print 'Accuracy: %f' % cluster_accuracy(labels, model.labels_)
    return nmi_score, purity_score, ri_score
### for example on digits
import read_tree
from sklearn.metrics.classification import accuracy_score, log_loss
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn import datasets

iris = datasets.load_iris()
data = scale(iris.data)
labels = iris.target

# read_file, clusters and k are assumed to come from the read_tree module /
# surrounding script; they are not defined in this snippet.
tree = read_file('output.in')
result_of_alg = clusters(tree, k)

def convert(clusters, n):
    clustering_vect = [0] * n
    for i in range(len(clusters)):
        for p in clusters[i]:
            clustering_vect[p] = i
    return clustering_vect

# test = [[3, 4, 5], [1, 8, 0], [2, 6, 7]]
# print(convert(test, 9))

normalized_mutual_info_score(convert(result_of_alg, len(labels)), labels)
def mutual_info_plot(var_names_dict, df, name, nname, m_path): # setup cols = [col for col in df] col_names = [var_names_dict[col] for col in cols] ncols = len(cols) # compute mi's norm_mi = np.zeros((len(cols), len(cols))) for i, col1 in enumerate(cols): for j, col2 in enumerate(cols[:i]): raw_matrix = df.as_matrix([col1, col2]) norm_mi[i][j] = normalized_mutual_info_score( raw_matrix[:, 0], raw_matrix[:, 1]) # mask upper right duplicates mask = np.triu(np.ones(norm_mi.shape, dtype=int)) norm_mi_masked = np.ma.masked_array(norm_mi, mask=mask) # now plot figsize = 10 digit_size = 12.5 if ncols > 15: figsize = 15 digit_size = 11 fig = plt.figure(figsize=(figsize, figsize)) ax = fig.add_subplot(111) norm = mpl.colors.Normalize(vmin=0., vmax=1.) cmap = 'viridis' # create an axes on the right side of ax. The width of cax will be 5% # of ax and the padding between cax and ax will be fixed at 0.1 inch. divider = make_axes_locatable(ax) cax = divider.append_axes("right", size="5%", pad=0.1) img = ax.imshow(norm_mi_masked, cmap=cmap, norm=norm) cb = plt.colorbar(img, cmap=cmap, norm=norm, cax=cax) # annotate for (j, i), value in np.ndenumerate(norm_mi): if (i < j): # https://stackoverflow.com/questions/11010683/how-to-have-negative-zero-always-formatted-as-positive-zero-in-a-python-string/36604981#36604981 value_str = re.sub(r"^-(0\.?00*)$", r"\1", "%.2f" % value) ax.text(i, j, value_str, ha='center', va='center', color='fuchsia', size=digit_size) ax.set_xticks(np.arange(ncols)) ax.set_yticks(np.arange(ncols)) ax.set_xticklabels(col_names, rotation='vertical') ax.set_yticklabels(col_names) plt.figtext(0.5, 0.89, name, ha='center', va='center', size=18) plt.figtext(0.96, 0.8, "(Dependent)", rotation='vertical', ha='center', va='center', size=16) plt.figtext(0.96, 0.22, "(Independent)", rotation='vertical', ha='center', va='center', size=16) plt.figtext(0.96, 0.5, "NMI", rotation='vertical', ha='center', va='center', size=18) make_path(m_path) fig.savefig(m_path + '/mutual_information_' + nname + '.pdf')
def run(self): path = self.path #### Step 1: reading and sampling graphs ''' m_graph, nx_graphs, total_edges = Reader.multi_readG_with_Merg(path) print("%d total nodes"%len(m_graph.nodes())) r_list, m_graph_sampled, nx_graphs_sampled = Sampler.multi_sampling_with_Merg(path, self.s_p) print("%d edges before sampling, %d edges after sampling. sampled %d "%(len(m_graph.edges()), len(m_graph_sampled.edges()), len(r_list))) r_set = set([node for edge in r_list for node in edge]) ''' nx_graphs_sampled, _ = Reader.multi_readG(self.path) cluster_true = [] for i in range(29): if i < 12: cluster_true.append(0) else: cluster_true.append(1) for r in range(11): r_t = r / 10.0 if r_t == 0: w_dict = Reader.weight(self.path) #print(w_dict) MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled, self.p, self.q, 0.1) MK_G.preprocess_transition_probs(w_dict, 1) MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length) MK_words = [] for walk in MK_walks: MK_words.extend([str(step) for step in walk]) M_L = Word2Vec.Learn(MK_words) M_matrix, M_mapping = M_L.train() ''' eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs, r_set, self.e_p) precision, recall, F = eval_p.eval() print("*** MKII Biased: precision %f, accuracy %f, F %f"%(precision, recall, F)) eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs, nx_graphs_sampled) M_auc = eval_a.eval_auc(1) print("@@@ MKII Biased AUC:", M_auc) ''' else: w_dict = Reader.weight(self.path) #print(w_dict) MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled, self.p, self.q, r_t) MK_G.preprocess_transition_probs(w_dict, 3) MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length) MK_words = [] for walk in MK_walks: MK_words.extend([str(step) for step in walk]) M_L = Word2Vec.Learn(MK_words) M_matrix, M_mapping = M_L.train() ''' eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs, r_set, self.e_p) precision, recall, F = eval_p.eval() print("*** MKII Biased_ii with %f: precision %f, accuracy %f, F %f"%(r_t, precision, recall, F)) eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs, nx_graphs_sampled) M_auc = eval_a.eval_auc(1) print("@@@ MKII Biased_ii AUC:", M_auc) ''' cluster_trained = KMeans( n_clusters=2, random_state=0).fit_predict(M_matrix).tolist() length = min(len(cluster_true), len(cluster_trained)) r = normalized_mutual_info_score(cluster_true[0:length], cluster_trained[0:length]) mi_f = f1_score(cluster_true[0:length], cluster_trained[0:length], average='micro') ma_f = f1_score(cluster_true[0:length], cluster_trained[0:length], average='macro') print("r is %f: nmi %f, micro_f %f, macro_f %f" % (r_t, r, mi_f, ma_f)) print( "-----------------------DONE--------------------------------")
def main(): global args args = parser.parse_args() # fix random seeds torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) np.random.seed(args.seed) # CNN if args.verbose: print('Architecture: {}'.format(args.arch)) model = models.__dict__[args.arch](sobel=args.sobel) fd = int(model.top_layer.weight.size()[1]) model.top_layer = None model.features = torch.nn.DataParallel(model.features) model.cuda() cudnn.benchmark = True # create optimizer optimizer = torch.optim.SGD( filter(lambda x: x.requires_grad, model.parameters()), lr=args.lr, momentum=args.momentum, weight_decay=10**args.wd, ) # define loss function criterion = nn.CrossEntropyLoss().cuda() # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] # remove top_layer parameters from checkpoint for key in checkpoint['state_dict']: if 'top_layer' in key: del checkpoint['state_dict'][key] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) # creating checkpoint repo exp_check = os.path.join(args.exp, 'checkpoints') if not os.path.isdir(exp_check): os.makedirs(exp_check) # creating cluster assignments log cluster_log = Logger(os.path.join(args.exp, 'clusters')) # preprocessing of data normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) tra = [ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize ] # load the data end = time.time() dataset = datasets.ImageFolder(args.data, transform=transforms.Compose(tra)) if args.verbose: print('Load dataset: {0:.2f} s'.format(time.time() - end)) dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch, num_workers=args.workers, pin_memory=True) # clustering algorithm to use deepcluster = clustering.__dict__[args.clustering](args.nmb_cluster) # training convnet with DeepCluster for epoch in range(args.start_epoch, args.epochs): end = time.time() # remove head model.top_layer = None model.classifier = nn.Sequential( *list(model.classifier.children())[:-1]) # get the features for the whole dataset features = compute_features(dataloader, model, len(dataset)) # cluster the features clustering_loss = deepcluster.cluster(features, verbose=args.verbose) # assign pseudo-labels train_dataset = clustering.cluster_assign(deepcluster.images_lists, dataset.imgs) # uniformely sample per target sampler = UnifLabelSampler(int(args.reassign * len(train_dataset)), deepcluster.images_lists) train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch, num_workers=args.workers, sampler=sampler, pin_memory=True, ) # set last fully connected layer mlp = list(model.classifier.children()) mlp.append(nn.ReLU(inplace=True).cuda()) model.classifier = nn.Sequential(*mlp) model.top_layer = nn.Linear(fd, len(deepcluster.images_lists)) model.top_layer.weight.data.normal_(0, 0.01) model.top_layer.bias.data.zero_() model.top_layer.cuda() # train network with clusters as pseudo-labels end = time.time() loss = train(train_dataloader, model, criterion, optimizer, epoch) # print log if args.verbose: print('###### Epoch [{0}] ###### \n' 'Time: {1:.3f} s\n' 'Clustering loss: {2:.3f} \n' 'ConvNet loss: {3:.3f}'.format(epoch, time.time() - end, clustering_loss, loss)) 
try: nmi = normalized_mutual_info_score( clustering.arrange_clustering(deepcluster.images_lists), clustering.arrange_clustering(cluster_log.data[-1])) print('NMI against previous assignment: {0:.3f}'.format(nmi)) except IndexError: pass print('####################### \n') # save running checkpoint torch.save( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict() }, os.path.join(args.exp, 'checkpoint.pth.tar')) # save cluster assignments cluster_log.log(deepcluster.images_lists)
def fit(self, trainloader, validloader, lr=0.001, batch_size=128, num_epochs=10, visualize=False, anneal=False, optimizer="adam"): use_cuda = torch.cuda.is_available() if use_cuda: self.cuda() if optimizer == "adam": optimizer = optim.Adam(self.parameters(), lr=lr) elif optimizer == "sgd": optimizer = optim.SGD(self.parameters(), lr=lr, momentum=0.9) # validate self.eval() valid_loss = 0.0 for batch_idx, (inputs, _) in enumerate(validloader): inputs = inputs.view(inputs.size(0), -1).float() if use_cuda: inputs = inputs.cuda() inputs = Variable(inputs) z, outputs = self.forward(inputs) loss = self.loss_function(outputs, inputs) valid_loss += loss.data * len(inputs) # total_loss += valid_recon_loss.data[0] * inputs.size()[0] # total_num += inputs.size()[0] # valid_loss = total_loss / total_num print("#Epoch -1: Valid Loss: %.5f" % (valid_loss / len(validloader.dataset))) for epoch in range(num_epochs): # train 1 epoch self.train() if anneal: adjust_learning_rate(lr, optimizer, epoch) train_loss = 0 for batch_idx, (inputs, labels) in enumerate(trainloader): inputs = inputs.view(inputs.size(0), -1).float() if use_cuda: inputs = inputs.cuda() optimizer.zero_grad() inputs = Variable(inputs) z, outputs = self.forward(inputs) loss = self.loss_function(outputs, inputs) train_loss += loss.data * len(inputs) loss.backward() optimizer.step() # print(" #Iter %3d: Reconstruct Loss: %.3f" % ( # batch_idx, recon_loss.data[0])) # validate self.eval() valid_loss = 0.0 for batch_idx, (inputs, labels) in enumerate(validloader): inputs = inputs.view(inputs.size(0), -1).float() if use_cuda: inputs = inputs.cuda() inputs = Variable(inputs) z, outputs = self.forward(inputs) loss = self.loss_function(outputs, inputs) valid_loss += loss.data * len(inputs) print("#Epoch %3d: Train Loss: %.5f, Valid Loss: %.5f" % (epoch, train_loss / len(trainloader.dataset), valid_loss / len(validloader.dataset))) if epoch % int(num_epochs / 10) == 0 or epoch == num_epochs - 1: trainX, trainY = self.encodeBatch(trainloader, True) testX, testY = self.encodeBatch(validloader, True) trainX = trainX.cpu().numpy() trainY = trainY.cpu().numpy() testX = testX.cpu().numpy() testY = testY.cpu().numpy() n_components = len(np.unique(trainY)) km = KMeans(n_clusters=n_components, n_init=20).fit(trainX) y_pred = km.predict(testX) print("acc: %.5f, nmi: %.5f" % (acc(testY, y_pred), normalized_mutual_info_score(testY, y_pred))) gmm = GaussianMixture( n_components=n_components, covariance_type='diag', means_init=km.cluster_centers_).fit(trainX) y_pred = gmm.predict(testX) print("acc: %.5f, nmi: %.5f" % (acc(testY, y_pred), normalized_mutual_info_score(testY, y_pred)))
aaa = 1 / aaa
# aaa = (aaa - aaa.min()) / (aaa.max() - aaa.min())
aaa = np.concatenate([aaa] * length, axis=0)
aaa = aaa.reshape(length, length)
aaa = np.transpose(aaa)
# aaa = np.log(aaa + 1)
# aaa = (aaa - aaa.min()) / (aaa.max() - aaa.min())
p = count_percent(D3, D2)
p = p * aaa
D = getD(p)
L = getL(D, p)
eigvec = getEigen(L, n)
eigvec = np.real(eigvec)
clf = KMeans(n_clusters=n)
s = clf.fit(eigvec)
C = s.labels_
print('processed data using sc ARI:', metrics.adjusted_rand_score(y, C))
print('NMI:', normalized_mutual_info_score(y, C))
print('ACC:', acc(y, C))

from sklearn.cluster import SpectralClustering
sc1 = SpectralClustering(n_clusters=n, affinity='nearest_neighbors')
print('SC KNN ARI:', metrics.adjusted_rand_score(y, sc1.fit_predict(x1)))

c = 'ARI:' + str(metrics.adjusted_rand_score(y, C)) + '\n' + 'NMI:' + str(
    normalized_mutual_info_score(y, C)) + '\n'
c = c + 'ACC:' + str(acc(y, C)) + '\n' + 'SKARI' + str(
    metrics.adjusted_rand_score(y, sc1.fit_predict(x1)))
fh = open('performancegoolamimproved.txt', 'w', encoding='utf-8')
fh.write(c)
fh.close()
noOfDSPoints = 0
for key, value in discardStats.items():
    noOfDSPoints += value[0]
noOfCSPoints = 0
for key, value in compressStats.items():
    noOfCSPoints += value[0]
noOfCSCluster = len(compressStats)
noOfRSPoints = len(retainedSet)
if iteration == 4:
    interResult[iteration] = (noOfDSPoints, len(compressStats),
                              noOfCSPoints, len(retainedSet))

accuracy = normalized_mutual_info_score(totalIndex, originalIndex)
print("Accuracy", accuracy)

################################################# File Output ##########################################################
f = open('output.txt', 'w')
f.write("The intermediate results:")
f.write("\n")
for i in interResult:
    current = str(i + 1)
    s = str("Round " + current + ": " + str(interResult[i][0]) + "," +
            str(interResult[i][1]) + "," + str(interResult[i][2]) + "," +
            str(interResult[i][3]))
    f.write(str(s))
    f.write("\n")
f.write("\n")
def test_agglomerative_clustering(): """ Check that we obtain the correct number of clusters with agglomerative clustering. """ rng = np.random.RandomState(0) mask = np.ones([10, 10], dtype=np.bool) n_samples = 100 X = rng.randn(n_samples, 50) connectivity = grid_to_graph(*mask.shape) for linkage in ("ward", "complete", "average"): clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage=linkage) clustering.fit(X) # test caching try: tempdir = mkdtemp() clustering = AgglomerativeClustering( n_clusters=10, connectivity=connectivity, memory=tempdir, linkage=linkage) clustering.fit(X) labels = clustering.labels_ assert_true(np.size(np.unique(labels)) == 10) finally: shutil.rmtree(tempdir) # Turn caching off now clustering = AgglomerativeClustering( n_clusters=10, connectivity=connectivity, linkage=linkage) # Check that we obtain the same solution with early-stopping of the # tree building clustering.compute_full_tree = False clustering.fit(X) assert_almost_equal(normalized_mutual_info_score(clustering.labels_, labels), 1) clustering.connectivity = None clustering.fit(X) assert_true(np.size(np.unique(clustering.labels_)) == 10) # Check that we raise a TypeError on dense matrices clustering = AgglomerativeClustering( n_clusters=10, connectivity=sparse.lil_matrix( connectivity.toarray()[:10, :10]), linkage=linkage) assert_raises(ValueError, clustering.fit, X) # Test that using ward with another metric than euclidean raises an # exception clustering = AgglomerativeClustering( n_clusters=10, connectivity=connectivity.toarray(), affinity="manhattan", linkage="ward") assert_raises(ValueError, clustering.fit, X) # Test using another metric than euclidean works with linkage complete for affinity in PAIRED_DISTANCES.keys(): # Compare our (structured) implementation to scipy clustering = AgglomerativeClustering( n_clusters=10, connectivity=np.ones((n_samples, n_samples)), affinity=affinity, linkage="complete") clustering.fit(X) clustering2 = AgglomerativeClustering( n_clusters=10, connectivity=None, affinity=affinity, linkage="complete") clustering2.fit(X) assert_almost_equal(normalized_mutual_info_score(clustering2.labels_, clustering.labels_), 1) # Test that using a distance matrix (affinity = 'precomputed') has same # results (with connectivity constraints) clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage="complete") clustering.fit(X) X_dist = pairwise_distances(X) clustering2 = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, affinity='precomputed', linkage="complete") clustering2.fit(X_dist) assert_array_equal(clustering.labels_, clustering2.labels_)
#!/usr/bin/python
import numpy as np
from sklearn.metrics.cluster import normalized_mutual_info_score

x = np.array([1, 2, 4, 1, 1, 1, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 2, 3, 2, 3, 3, 2])
y = np.array([1, 1, 3, 2, 3, 4, 1, 1, 1, 1, 1, 3, 3, 3, 3, 1, 1, 1, 4, 2, 2, 2, 4, 2, 4, 4, 2, 2, 3])

# NOTE: numpy is imported as np above, and X is not defined in this script;
# this call needs an existing array to run.
np.savetxt('alternative_kernel.txt', X, fmt='%.18e', delimiter=',', newline='\n', header='', footer='')

print normalized_mutual_info_score(x, y)
### Comparison model (n=2)
### For downgrading
print("\nComparison model: ")
print("-------------------")

# Fit the model with the optimal settings, and use 4 cores to run it faster
km = KMeans(n_clusters=2, n_jobs=4)
km.fit(KMEANSdf)

# Predict the cluster of each datum
Kms2 = km.predict(KMEANSdf)

# Evaluate the model
print("Inertia: %0.3f" % km.inertia_)
silhouette_values = metrics.silhouette_score(KMEANSdf, Kms2)
print("Silhouette: %0.3f" % silhouette_values)
print("NMI for downgrading from 5 to 2 clusters is: %0.3f" % normalized_mutual_info_score(Kms1, Kms2))

n_clusters_cKMEANS = len(set(Kms2))
print("\nEstimated number of clusters: %d" % n_clusters_cKMEANS)

# Add the cluster columns to the dataset
KMEANSdf["Cluster_KMEANS"] = pd.DataFrame(Kms1)
KMEANSdf["Cluster(n=2)_KMEANS"] = pd.DataFrame(Kms2)

print("NMI: %0.3f" % normalized_mutual_info_score(Kms2, DBSCANdf["Cluster_DBSCAN"]))

####################################
##### KMEANS clustering plots #####
####################################

### Calculate the centroids for plotting
hdbscan_pca = clusterer_hdbscan.fit_predict(embedding_pca)
# clusterer.minimum_spanning_tree_.plot(edge_cmap='viridis', edge_alpha=0.6, node_size=80, edge_linewidth=2)
# clusterer.single_linkage_tree_.plot(cmap='viridis', colorbar=True)
# clusterer.condensed_tree_.plot()

cluster_result = [
    kmeans_umap, kmeans_tsne, kmeans_mds, kmeans_pca,
    hc_umap, hc_tsne, hc_mds, hc_pca,
    spc_umap, spc_tsne, spc_mds, spc_pca,
    gmm_umap, gmm_tsne, gmm_mds, gmm_pca,
    hdbscan_umap, hdbscan_tsne, hdbscan_mds, hdbscan_pca
]

file = open("clustering_accuracy_NMI.txt", "a")  # append mode
file.write(filename + "\n")
for _, cluster in enumerate(cluster_result):
    nmi = normalized_mutual_info_score(label_group, cluster)
    file.write(str(nmi) + "\n")
file.close()

file = open("clustering_accuracy_ARI.txt", "a")  # append mode
file.write(filename + "\n")
for _, cluster in enumerate(cluster_result):
    ari = adjusted_rand_score(label_group, cluster)
    file.write(str(ari) + "\n")
file.close()

#######################################################################
# Contour plot showing the visiting timestamp of each sample point
#######################################################################
label_days_cummulated = data[(data.shape[0] - 1):(data.shape[0])]
kValues = list()
scores = list()

for k in range(3, 22, 2):
    labels = bisecting_kmeans(denseMatrix, k, 10)
    # if (k == 7):
    #     # write result to output file
    #     outputFile = open("output.dat", "w")
    #     for index in labels:
    #         outputFile.write(str(index) + '\n')
    #     outputFile.close()
    score = normalized_mutual_info_score(denseMatrix, labels)
    kValues.append(k)
    scores.append(score)
    print ("For K= %d NMI is %f" % (k, score))


# In[25]:

labels = bisecting_kmeans(denseMatrix, 7, 10)


# In[22]:
def kmeans(encoder_val_clean, y, nClusters, y_pred_prev=None, weight_initilization='k-means++', seed=42, n_init=40, max_iter=300): # weight_initilization = { 'kmeans-pca', 'kmean++', 'random', None } if weight_initilization == 'kmeans-pca': start_time = timeit.default_timer() pca = PCA(n_components=nClusters).fit(encoder_val_clean) kmeans_model = KMeans(init=pca.components_, n_clusters=nClusters, n_init=1, max_iter=300, random_state=seed) y_pred = kmeans_model.fit_predict(encoder_val_clean) centroids = kmeans_model.cluster_centers_.T centroids = centroids / np.sqrt( np.diag(np.matmul(centroids.T, centroids))) end_time = timeit.default_timer() elif weight_initilization == 'k-means++': start_time = timeit.default_timer() kmeans_model = KMeans(init='k-means++', n_clusters=nClusters, n_init=n_init, max_iter=max_iter, n_jobs=15, random_state=seed) y_pred = kmeans_model.fit_predict(encoder_val_clean) D = 1.0 / euclidean_distances( encoder_val_clean, kmeans_model.cluster_centers_, squared=True) D **= 2.0 / (2 - 1) D /= np.sum(D, axis=1)[:, np.newaxis] centroids = kmeans_model.cluster_centers_.T centroids = centroids / np.sqrt( np.diag(np.matmul(centroids.T, centroids))) end_time = timeit.default_timer() print('k-means: \t nmi =', normalized_mutual_info_score(y, y_pred), '\t arc =', adjusted_rand_score(y, y_pred), '\t acc = {:.4f} '.format(bestMap(y, y_pred)), 'K-means objective = {:.1f} '.format(kmeans_model.inertia_), '\t runtime =', end_time - start_time) if y_pred_prev is not None: print( 'Different Assignments: ', sum(y_pred == y_pred_prev), '\tbestMap: ', bestMap(y_pred, y_pred_prev), '\tdatapoints-bestMap*datapoints: ', encoder_val_clean.shape[0] - bestMap(y_pred, y_pred_prev) * encoder_val_clean.shape[0]) return centroids, kmeans_model.inertia_, y_pred
def test_check_clusterings():
    noise = np.random.rand(500)
    wavelength = np.linspace(0.01, 1, 500) * 1e-6
    with pytest.raises(ValueError):
        normalized_mutual_info_score(wavelength, noise)
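# The test above expects continuous-valued inputs to be rejected; NMI is
# defined for discrete cluster assignments. A minimal sketch of the intended
# usage with integer labels (the arrays below are purely illustrative):
import numpy as np
from sklearn.metrics import normalized_mutual_info_score

labels_true = np.array([0, 0, 1, 1, 2, 2])
labels_pred = np.array([1, 1, 0, 0, 2, 2])
print(normalized_mutual_info_score(labels_true, labels_pred))  # 1.0: labels are only permuted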
def clustering(dataset, X, y, input_var, encoder, num_clusters, output_path, test_batch_size=100, seed=42, continue_training=False): encoder_clean = lasagne.layers.get_output(encoder, deterministic=True) encoder_clean_function = theano.function([input_var], encoder_clean) # Extract MdA features minibatch_flag = 1 for batch in iterate_minibatches(X, y, test_batch_size, shuffle=False): inputs, targets, idx = batch minibatch_x = encoder_clean_function(inputs) if minibatch_flag: encoder_val_clean = minibatch_x minibatch_flag = 0 else: encoder_val_clean = np.concatenate( (encoder_val_clean, minibatch_x), axis=0) # Check kmeans results kmeans(encoder_val_clean, y, num_clusters, seed=seed) initial_time = timeit.default_timer() if (dataset == 'MNIST-full') | (dataset == 'FRGC') | (dataset == 'YTF') | ( dataset == 'CMU-PIE'): # K-means on MdA Features centroids, inertia, y_pred = kmeans(encoder_val_clean, y, num_clusters, seed=seed) y_pred = (np.array(y_pred)).reshape(np.array(y_pred).shape[0], ) y_pred = y_pred - 1 else: # AC-PIC on MdA Features if os.path.isfile( os.path.join(output_path, '../params/pred' + dataset + '.pickle')) & continue_training: with open( os.path.join(output_path, '../params/pred' + dataset + '.pickle'), "rb") as input_file: y_pred = pickle.load(input_file, encoding='latin1') else: try: import matlab.engine eng = matlab.engine.start_matlab() eng.addpath(eng.genpath('matlab')) targets_init = eng.predict_ac_mpi( matlab.double( encoder_val_clean.reshape( encoder_val_clean.shape[0] * encoder_val_clean.shape[1]).tolist()), num_clusters, encoder_val_clean.shape[0], encoder_val_clean.shape[1]) y_pred = (np.array(targets_init)).reshape( np.array(targets_init).shape[0], ) eng.quit() y_pred = y_pred - 1 except: y_pred = predict_ac_mpi(encoder_val_clean, num_clusters, encoder_val_clean.shape[0], encoder_val_clean.shape[1]) with open( os.path.join(output_path, '../params/pred' + dataset + '.pickle'), "wb") as output_file: pickle.dump(y_pred, output_file) final_time = timeit.default_timer() print('AC-PIC: \t nmi = ', normalized_mutual_info_score(y, y_pred), '\t arc = ', adjusted_rand_score(y, y_pred), '\t acc = {:.4f} '.format(bestMap(y, y_pred)), '\t time taken = {:.4f}'.format(final_time - initial_time)) centroids_acpic = np.zeros(shape=(num_clusters, encoder_val_clean.shape[1])) for i in range(num_clusters): centroids_acpic[i] = encoder_val_clean[y_pred == i].mean(axis=0) centroids = centroids_acpic.T centroids = centroids_acpic / np.sqrt( np.diag(np.matmul(centroids.T, centroids))) return np.int32(y_pred), np.float32(centroids)
# Start log file, create log file and start.
FILE_PATH = sys.argv[1]
pre_clus = sys.argv[2]
eigv_path = sys.argv[3]

G = nx.read_edgelist(FILE_PATH)
Lap = gen_laplacian(G, 2)
w, v = LA.eig(Lap.todense())
save_eigen(w, v, FILE_PATH)

feature_matrix = np.transpose(v)
partition_orig = KMeans(n_clusters=int(pre_clus)).fit(feature_matrix)

ext_matrix = np.genfromtxt(eigv_path, delimiter=",")
feature_ext_matrix = np.transpose(ext_matrix)
partition_fast = KMeans(n_clusters=int(pre_clus)).fit(feature_ext_matrix)

print partition_orig.inertia_
print jaccard_similarity_score(partition_orig.labels_, partition_fast.labels_)
print normalized_mutual_info_score(partition_orig.labels_, partition_fast.labels_)

'''
spec_matrix = SpectralClustering(n_clusters=int(pre_clus)).fit(nx.to_numpy_matrix(G))
kmeans_matrix = KMeans(n_clusters=int(pre_clus)).fit(nx.to_numpy_matrix(G))
print kmeans_matrix.inertia_
print normalized_mutual_info_score(spec_matrix.labels_, kmeans_matrix.labels_)
'''
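`gen_laplacian` and `save_eigen` are project helpers that are not shown here. A minimal sketch of the spectral-embedding step they appear to implement, assuming a normalized Laplacian and a dense eigendecomposition (function name and interface are illustrative only):

import numpy as np
import networkx as nx

def spectral_embedding(G, k):
    """Return the k eigenvectors of the normalized Laplacian with the
    smallest eigenvalues, one row per node (illustrative sketch only)."""
    L = nx.normalized_laplacian_matrix(G).todense()
    w, v = np.linalg.eigh(L)            # eigh: L is symmetric
    order = np.argsort(w)[:k]
    return np.asarray(v[:, order])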
def train_depict(dataset, X, y, input_var, decoder, encoder, loss_recons, num_clusters, y_pred, output_path, batch_size=100, test_batch_size=100, num_epochs=1000, learning_rate=1e-4, prediction_status='soft', rec_mult=1, clus_mult=1, centroids=None, init_flag=1, continue_training=False): ###################### # ADD RLC TO MdA # ###################### initial_time = timeit.default_timer() rec_lambda = theano.shared(lasagne.utils.floatX(rec_mult)) clus_lambda = theano.shared(lasagne.utils.floatX(clus_mult)) pred_normalizition_flag = 1 num_batches = X.shape[0] // batch_size if prediction_status == 'soft': target_var = T.matrix('minibatch_out') target_init = T.ivector('kmeans_out') elif prediction_status == 'hard': target_var = T.ivector('minibatch_out') target_val = T.vector() network2 = build_eml(encoder, n_out=num_clusters, W_initial=centroids) network_prediction_noisy = lasagne.layers.get_output(network2, input_var, deterministic=False) network_prediction_clean = lasagne.layers.get_output(network2, input_var, deterministic=True) loss_clus_init = lasagne.objectives.categorical_crossentropy( network_prediction_noisy, target_init).mean() params_init = lasagne.layers.get_all_params([decoder, network2], trainable=True) if prediction_status == 'soft': loss_clus = lasagne.objectives.categorical_crossentropy( network_prediction_noisy, target_var) elif prediction_status == 'hard': loss_clus = target_val * lasagne.objectives.categorical_crossentropy( network_prediction_noisy, target_var) loss_clus = clus_lambda * loss_clus.mean() loss_recons = rec_lambda * loss_recons loss = loss_recons + loss_clus params2 = lasagne.layers.get_all_params([decoder, network2], trainable=True) updates = lasagne.updates.adam(loss, params2, learning_rate=learning_rate) train_fn = theano.function([input_var, target_var], [loss, loss_recons, loss_clus], updates=updates) loss_clus_init = clus_lambda * loss_clus_init loss_init = loss_clus_init + loss_recons updates_init = lasagne.updates.adam(loss_init, params_init, learning_rate=learning_rate) train_fn_init = theano.function([input_var, target_init], [loss_init, loss_recons, loss_clus_init], updates=updates_init) test_fn = theano.function([input_var], network_prediction_clean) final_time = timeit.default_timer() print("\n...Start DEPICT initialization") if init_flag: if os.path.isfile( os.path.join(output_path, '../params/weights' + dataset + '.pickle')) & continue_training: with open( os.path.join(output_path, '../params/weights' + dataset + '.pickle'), "rb") as input_file: weights = pickle.load(input_file, encoding='latin1') lasagne.layers.set_all_param_values([decoder, network2], weights) else: X_train, X_val, y_train, y_val, y_pred_train, y_pred_val = train_test_split( X, y, y_pred, stratify=y, test_size=0.10, random_state=42) last_update = 0 # Initilization y_targ_train = np.copy(y_pred_train) y_targ_val = np.copy(y_pred_val) y_val_prob = test_fn(X_val) y_val_pred = np.argmax(y_val_prob, axis=1) val_nmi = normalized_mutual_info_score(y_targ_val, y_val_pred) best_val = val_nmi print('initial val nmi: ', val_nmi) best_params_values = lasagne.layers.get_all_param_values( [decoder, network2]) for epoch in range(1000): train_err, val_err = 0, 0 lossre_train, lossre_val = 0, 0 losspre_train, losspre_val = 0, 0 num_batches_train = 0 for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True): minibatch_inputs, targets, idx = batch minibatch_error, lossrec, losspred = train_fn_init( minibatch_inputs, np.int32(y_targ_train[idx])) train_err += minibatch_error 
lossre_train += lossrec losspre_train += losspred num_batches_train += 1 y_val_prob = test_fn(X_val) y_val_pred = np.argmax(y_val_prob, axis=1) y_pred = np.zeros(X.shape[0]) for batch in iterate_minibatches(X, y, test_batch_size, shuffle=False): minibatch_inputs, targets, idx = batch minibatch_prob = test_fn(minibatch_inputs) minibatch_pred = np.argmax(minibatch_prob, axis=1) y_pred[idx] = minibatch_pred val_nmi = normalized_mutual_info_score(y_targ_val, y_val_pred) print( 'epoch:', epoch + 1, '\t nmi = {:.4f} '.format( normalized_mutual_info_score(y, y_pred)), '\t arc = {:.4f} '.format(adjusted_rand_score(y, y_pred)), '\t acc = {:.4f} '.format(bestMap(y, y_pred)), '\t loss= {:.10f}'.format(train_err / num_batches_train), '\t loss_reconstruction= {:.10f}'.format( lossre_train / num_batches_train), '\t loss_prediction= {:.10f}'.format(losspre_train / num_batches_train), '\t val nmi = {:.4f} '.format(val_nmi)) last_update += 1 if val_nmi > best_val: last_update = 0 print("new best val nmi: ", val_nmi) best_val = val_nmi best_params_values = lasagne.layers.get_all_param_values( [decoder, network2]) # if (losspre_val / num_batches_val) < 0.2: # break if last_update > 5: break lasagne.layers.set_all_param_values([decoder, network2], best_params_values) with open( os.path.join(output_path, '../params/weights' + dataset + '.pickle'), "wb") as output_file: pickle.dump( lasagne.layers.get_all_param_values([decoder, network2]), output_file) # Epoch 0 print("\n...Start DEPICT training") y_prob = np.zeros((X.shape[0], num_clusters)) y_prob_prev = np.zeros((X.shape[0], num_clusters)) for batch in iterate_minibatches(X, y, test_batch_size, shuffle=False): minibatch_inputs, targets, idx = batch minibatch_prob = test_fn(minibatch_inputs) y_prob[idx] = minibatch_prob y_prob_max = np.max(y_prob, axis=1) if pred_normalizition_flag: cluster_frequency = np.sum(y_prob, axis=0) y_prob = y_prob**2 / cluster_frequency y_prob = np.transpose(y_prob.T / np.sum(y_prob, axis=1)) y_pred = np.argmax(y_prob, axis=1) print('epoch: 0', '\t nmi = {:.4f} '.format(normalized_mutual_info_score(y, y_pred)), '\t arc = {:.4f} '.format(adjusted_rand_score(y, y_pred)), '\t acc = {:.4f} '.format(bestMap(y, y_pred))) if os.path.isfile( os.path.join(output_path, '../params/rlc' + dataset + '.pickle')) & continue_training: with open( os.path.join(output_path, '../params/rlc' + dataset + '.pickle'), "rb") as input_file: weights = pickle.load(input_file, encoding='latin1') lasagne.layers.set_all_param_values([decoder, network2], weights) else: for epoch in range(num_epochs): # In each epoch, we do a full pass over the training data: train_err = 0 lossre = 0 losspre = 0 for batch in iterate_minibatches(X, y, batch_size, shuffle=True): minibatch_inputs, targets, idx = batch # M_step if prediction_status == 'hard': minibatch_err, lossrec, losspred = train_fn( minibatch_inputs, np.ndarray.astype(y_pred[idx], 'int32'), np.ndarray.astype(y_prob_max[idx], 'float32')) elif prediction_status == 'soft': minibatch_err, lossrec, losspred = train_fn( minibatch_inputs, np.ndarray.astype(y_prob[idx], 'float32')) minibatch_prob = test_fn(minibatch_inputs) y_prob[idx] = minibatch_prob train_err += minibatch_err lossre += lossrec losspre += losspred y_prob_max = np.max(y_prob, axis=1) if pred_normalizition_flag: cluster_frequency = np.sum( y_prob, axis=0) # avoid unbalanced assignment y_prob = y_prob**2 / cluster_frequency # y_prob = y_prob / np.sqrt(cluster_frequency) y_prob = np.transpose(y_prob.T / np.sum(y_prob, axis=1)) y_pred = np.argmax(y_prob, 
axis=1) # print('mse: ', mean_squared_error(y_prob, y_prob_prev)) if mean_squared_error(y_prob, y_prob_prev) < 1e-7: with open( os.path.join(output_path, '../params/rlc' + dataset + '.pickle'), "wb") as output_file: pickle.dump( lasagne.layers.get_all_param_values( [decoder, network2]), output_file) break y_prob_prev = np.copy(y_prob) print( 'epoch:', epoch + 1, '\t nmi = {:.4f} '.format( normalized_mutual_info_score(y, y_pred)), '\t arc = {:.4f} '.format(adjusted_rand_score(y, y_pred)), '\t acc = {:.4f} '.format(bestMap(y, y_pred)), '\t loss= {:.10f}'.format(train_err / num_batches), '\t loss_recons= {:.10f}'.format(lossre / num_batches), '\t loss_pred= {:.10f}'.format(losspre / num_batches)) # test y_pred = np.zeros(X.shape[0]) for batch in iterate_minibatches(X, y, test_batch_size, shuffle=False): minibatch_inputs, targets, idx = batch minibatch_prob = test_fn(minibatch_inputs) minibatch_pred = np.argmax(minibatch_prob, axis=1) y_pred[idx] = minibatch_pred print('final: ', '\t nmi = {:.4f} '.format(normalized_mutual_info_score(y, y_pred)), '\t arc = {:.4f} '.format(adjusted_rand_score(y, y_pred)), '\t acc = {:.4f} '.format(bestMap(y, y_pred)))
def report_clustering(distance_file, biom_file, metadata_file, num_clusters, verbose, L=2, output_file=None): if not isinstance(distance_file, list): distance_matrix = CSV.read(distance_file) else: distance_matrix = distance_file if output_file is not None: f = open(output_file, 'w') output_matrix = [] AgglomerativeCluster = AgglomerativeClustering( n_clusters=num_clusters, affinity='precomputed', linkage='complete').fit_predict(distance_matrix) KMedoidsCluster = KMedoids(n_clusters=num_clusters, metric='precomputed', method='pam', init='heuristic').fit_predict(distance_matrix) PCoA_Samples = BW.extract_samples(biom_file) metadata = meta.extract_metadata(metadata_file) region_names = [] for i in range(len(PCoA_Samples)): if metadata[PCoA_Samples[i]]['body_site'] not in region_names: region_names.append(metadata[PCoA_Samples[i]]['body_site']) PCoA_Samples[i] = region_names.index( metadata[PCoA_Samples[i]]['body_site']) if verbose and L == 1: print('Printing results for L1-UniFrac:') elif verbose and L == 2: print('Printing results for L2-UniFrac:') if verbose: print('Metric\t\t\t\t\t\t\tAgglomerativeClustering\t\tKMedoids') if output_file is not None: if L == 1: f.write('Printing results for L1-UniFrac:\n') elif L == 2: f.write('Printing results for L2-UniFrac:\n') f.write('Metric\t\t\t\tAgglomerativeClustering\t\t\tKMedoids\n') if L == 1: output_matrix.append(['Printing results for L1-UniFrac:']) if L == 2: output_matrix.append(['Printing results for L2-UniFrac:']) output_matrix.append(['Metric', 'AgglomerativeClustering', 'KMedoids']) RI1 = rand_score(PCoA_Samples, AgglomerativeCluster) RI2 = rand_score(PCoA_Samples, KMedoidsCluster) if verbose: print(f'Rand Index Score: {RI1}\t\t\t{RI2}') ARI1 = adjusted_rand_score(PCoA_Samples, AgglomerativeCluster) ARI2 = adjusted_rand_score(PCoA_Samples, KMedoidsCluster) if verbose: print(f'Adjusted Rand Index Score: {ARI1}\t\t\t{ARI2}') NMI1 = normalized_mutual_info_score(PCoA_Samples, AgglomerativeCluster) NMI2 = normalized_mutual_info_score(PCoA_Samples, KMedoidsCluster) if verbose: print(f'Normalized Mutual Index Score: {NMI1}\t\t\t{NMI2}') AMI1 = adjusted_mutual_info_score(PCoA_Samples, AgglomerativeCluster) AMI2 = adjusted_mutual_info_score(PCoA_Samples, KMedoidsCluster) if verbose: print(f'Adjusted Mutual Info Score: {AMI1}\t\t\t{AMI2}') FM1 = fowlkes_mallows_score(PCoA_Samples, AgglomerativeCluster) FM2 = fowlkes_mallows_score(PCoA_Samples, KMedoidsCluster) if verbose: print(f'Fowlkes Mallows Score: {FM1}\t\t\t{FM2}') if output_file is not None: f.write(f'Rand Index Score: {RI1}\t\t\t{RI2}\n') f.write(f'Adjusted Rand Index Score: {ARI1}\t\t\t{ARI2}\n') f.write(f'Normalized Mutual Index Score: {NMI1}\t\t\t{NMI2}\n') f.write(f'Adjusted Mutual Info Score: {AMI1}\t\t\t{AMI2}\n') f.write(f'Fowlkes Mallows Score: {FM1}\t\t\t{FM2}\n') output_matrix.append(['Rand Index Score:', RI1, RI2]) output_matrix.append(['Adjusted Rand Index Score:', ARI1, ARI2]) output_matrix.append(['Normalized Mutual Index Score:', NMI1, NMI2]) output_matrix.append(['Adjusted Mutual Info Score:', AMI1, AMI2]) output_matrix.append(['Fowlkes Mallows Score:', FM1, FM2]) return output_matrix
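Purity is reported by several snippets in this collection but its implementation is not shown. A minimal sketch, assuming the usual contingency-matrix definition (helper name is illustrative):

import numpy as np
from sklearn.metrics.cluster import contingency_matrix

def purity_score(labels_true, labels_pred):
    """Fraction of samples assigned to the majority true class of their cluster."""
    c = contingency_matrix(labels_true, labels_pred)
    return np.sum(np.amax(c, axis=0)) / float(np.sum(c))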
texts, labels = [], []   # tweet texts and their gold cluster ids
input_file = open('C:\\Users\\Administrator\\Desktop\\Tweets.txt', 'r')
for line in input_file.readlines():
    tweets = json.loads(line)
    texts.append(tweets['text'])
    labels.append(tweets['cluster'])

vectorizer = TfidfVectorizer()
vec = vectorizer.fit_transform(texts)
vectorizer_2 = CountVectorizer()
vec_w2v = vectorizer_2.fit_transform(texts)

# KMeans
clf = KMeans(n_clusters=100)
clf.fit(vec)
labels_predict = clf.labels_
nmi = normalized_mutual_info_score(labels, labels_predict)
print('the NMI of KMeans:', nmi)

# Affinity Propagation
afp = AffinityPropagation().fit(vec)
cluster_centers_indices = afp.cluster_centers_indices_
labels_predict = afp.labels_
nmi = normalized_mutual_info_score(labels, labels_predict)
print('the NMI of Affinity Propagation:', nmi)

# MeanShift (requires a dense array)
vec_w2v_a = preprocessing.scale(vec_w2v.toarray())
clustering = MeanShift(bandwidth=5).fit(vec_w2v_a)
labels_predict = clustering.labels_
nmi = normalized_mutual_info_score(labels, labels_predict)
print('the NMI of MeanShift:', nmi)
X = X[index]
X = X.toarray()
labels = labels[index]

# train_x = X[:train_len]
# train_y = labels[:train_len]
#
# test_x = X[train_len:]
# test_y = labels[train_len:]

# KMeans
km = KMeans(n_clusters=class_num)
km.fit(X)
pred_y = km.labels_
nmi = normalized_mutual_info_score(labels, pred_y)
print('KMeans NMI:{:.4f}'.format(nmi))

# AffinityPropagation
affinity_propagation = AffinityPropagation(damping=0.9, preference=-1)
affinity_propagation.fit(X)
pred_y = affinity_propagation.labels_
nmi = normalized_mutual_info_score(labels, pred_y)
print('AffinityPropagation NMI:{:.4f}'.format(nmi))

# Mean-shift, using the bandwidth estimated from the data
bandwidth = estimate_bandwidth(X, quantile=0.2)
mean_shift = MeanShift(bandwidth=bandwidth, bin_seeding=True)
mean_shift.fit(X)
pred_y = mean_shift.labels_
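The two snippets above repeat the same fit/score pattern per algorithm. A compact way to express it (illustrative only; the helper name and the estimator dict are placeholders):

from sklearn.metrics import normalized_mutual_info_score

def report_nmi(estimators, X, labels):
    """Fit each clusterer and print its NMI against the reference labels."""
    for name, est in estimators.items():
        pred = est.fit(X).labels_
        print('{} NMI: {:.4f}'.format(name, normalized_mutual_info_score(labels, pred)))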
def test_agglomerative_clustering(): # Check that we obtain the correct number of clusters with # agglomerative clustering. rng = np.random.RandomState(0) mask = np.ones([10, 10], dtype=np.bool) n_samples = 100 X = rng.randn(n_samples, 50) connectivity = grid_to_graph(*mask.shape) for linkage in ("ward", "complete", "average"): clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage=linkage) clustering.fit(X) # test caching try: tempdir = mkdtemp() clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, memory=tempdir, linkage=linkage) clustering.fit(X) labels = clustering.labels_ assert_true(np.size(np.unique(labels)) == 10) finally: shutil.rmtree(tempdir) # Turn caching off now clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage=linkage) # Check that we obtain the same solution with early-stopping of the # tree building clustering.compute_full_tree = False clustering.fit(X) assert_almost_equal( normalized_mutual_info_score(clustering.labels_, labels), 1) clustering.connectivity = None clustering.fit(X) assert_true(np.size(np.unique(clustering.labels_)) == 10) # Check that we raise a TypeError on dense matrices clustering = AgglomerativeClustering( n_clusters=10, connectivity=sparse.lil_matrix(connectivity.toarray()[:10, :10]), linkage=linkage) assert_raises(ValueError, clustering.fit, X) # Test that using ward with another metric than euclidean raises an # exception clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity.toarray(), affinity="manhattan", linkage="ward") assert_raises(ValueError, clustering.fit, X) # Test using another metric than euclidean works with linkage complete for affinity in PAIRED_DISTANCES.keys(): # Compare our (structured) implementation to scipy clustering = AgglomerativeClustering(n_clusters=10, connectivity=np.ones( (n_samples, n_samples)), affinity=affinity, linkage="complete") clustering.fit(X) clustering2 = AgglomerativeClustering(n_clusters=10, connectivity=None, affinity=affinity, linkage="complete") clustering2.fit(X) assert_almost_equal( normalized_mutual_info_score(clustering2.labels_, clustering.labels_), 1) # Test that using a distance matrix (affinity = 'precomputed') has same # results (with connectivity constraints) clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage="complete") clustering.fit(X) X_dist = pairwise_distances(X) clustering2 = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, affinity='precomputed', linkage="complete") clustering2.fit(X_dist) assert_array_equal(clustering.labels_, clustering2.labels_)
def multirun(datasetName): # init_population,init_ari,datamat,datalabels = ini_Cluster(kNumber=6) #多种聚类算法产生初始种群 # datamat,datalabels = loadDataset("../dataset/glass.data") path = '../dataset/'+datasetName datamat,datalabels = loadDataset(path) print 'data ready' # datalabels_to_float = list(map(lambda x: float(x), datalabels)) sampledData, remainedData, sampledIndex, remainedIndex= data_sample(datamat,1,10) print 'sampledData ready' # pop_kmeans = rsnn(sampledData, remainedData, sampledIndex, remainedIndex,'kmeans') print 'kmeans end' max_nmi1 = -inf for ind1 in pop_kmeans: nmi1 = normalized_mutual_info_score(datalabels, ind1) if nmi1 > max_nmi1: max_nmi1 = nmi1 print '初始kmeans最大nmi为%s'%max_nmi1 pop_ward = rsnn(sampledData, remainedData, sampledIndex, remainedIndex,'ward') print 'ward end' max_nmi2 = -inf for ind2 in pop_ward: nmi2 = normalized_mutual_info_score(datalabels, ind2) if nmi2 > max_nmi2: max_nmi2 = nmi2 print '初始ward最大nmi为%s'%max_nmi2 pop_complete = rsnn(sampledData, remainedData, sampledIndex, remainedIndex,'complete') print 'complete end' max_nmi3 = -inf for ind3 in pop_complete: nmi3 = normalized_mutual_info_score(datalabels, ind3) if nmi3 > max_nmi3: max_nmi3 = nmi3 print '初始complete最大nmi为%s'%max_nmi3 pop_average = rsnn(sampledData, remainedData, sampledIndex, remainedIndex,'average') print 'average end' max_nmi4 = -inf for ind4 in pop_average: nmi4 = normalized_mutual_info_score(datalabels, ind4) if nmi4 > max_nmi4: max_nmi4 = nmi4 print '初始average最大nmi为%s'%max_nmi4 pop = [] pop.extend(pop_kmeans) pop.extend(pop_ward) pop.extend(pop_complete) pop.extend(pop_average) init_population = [] for indiv1 in pop: ind1 = creator.Individual(indiv1) init_population.append(ind1) filter_pop = filter(lambda x:len(x)>0,init_population) ##去除初始化聚类失败的结果 population = filter_pop #population是总的种群,后续的交叉算法的结果也要添加进来 invalid_ind = [ind for ind in population if not ind.fitness.valid] fitnesses = toolbox.map(toolbox.evaluate, tile(datamat,(len(invalid_ind),1,1)),tile(population,(len(invalid_ind),1,1)),invalid_ind) for ind, fit in zip(invalid_ind, fitnesses): ind.fitness.values = fit population = toolbox.select(population, len(population)) popeliteLen = len(population) for i in range(generation): print '第%s代'%i popElite = toolbox.select(population, popeliteLen) #top half from population # Vary the population parentSpring = tools.selTournamentDCD(population, len(population)) parentSpring = [toolbox.clone(ind) for ind in parentSpring] newoffspring = [] # applying crossover subpopArr=[] for subtimes in range(5): #这里循环10次,是因为subpop写死成4次,乘起来又产生40个个体,用于产生dsce运算产生40个结果,视作交叉 subpopOneArr = getSubPop(parentSpring) subpopArr.extend(subpopOneArr) for subpop in subpopArr: transMatrix, popClusterArr_3, popClusterArr_2, clusterNumArr = transformation(datamat, subpop) similiarMatrix, unionClusterArr_2 = measureSimilarity(transMatrix, popClusterArr_3, popClusterArr_2, clusterNumArr, datamat, a1=0.8) dictCownP = assign(similiarMatrix, 0.7) resultList = resultTransform(dictCownP, datamat) ind_ensemble = creator.Individual(resultList) newoffspring.append(ind_ensemble) # evaluating fitness of individuals with invalid fitnesses invalid_ind = [ind for ind in newoffspring if not ind.fitness.valid] fitnesses = toolbox.map(toolbox.evaluate, tile(datamat,(len(invalid_ind),1,1)),tile(newoffspring,(len(invalid_ind),1,1)),invalid_ind)#这里只用了未经处理的数据,没有用到真实类别 for ind, fit in zip(invalid_ind, fitnesses): ind.fitness.values = fit # Chossing a population for the next generation population = toolbox.select(popElite + newoffspring, 
                                  popeliteLen)

    result1 = toolbox.nondominated(population, len(population))

    ari_arr = []
    max_ari = -inf
    for ind in result1[0]:
        ari = adjusted_rand_score(datalabels, ind)
        ari_arr.append(ari)
        if ari > max_ari:
            max_ari = ari

    nmi_arr = []
    max_nmi = -inf
    print 'NMI values'
    for ind in result1[0]:
        nmi = normalized_mutual_info_score(datalabels, ind)
        nmi_arr.append(nmi)
        if nmi > max_nmi:
            max_nmi = nmi
    print 'max NMI: %s' % max_nmi

    return max_nmi, max_ari
def clusterscores(self):
    target, pred = self.conf2label()
    NMI = normalized_mutual_info_score(target, pred)
    ARI = adjusted_rand_score(target, pred)
    AMI = adjusted_mutual_info_score(target, pred)
    return {'NMI': NMI, 'ARI': ARI, 'AMI': AMI}
            X=X, K=self.n_clusters, max_iter=self.max_iter)
        return self

    def fit_predict(self, X, y=None):
        if self.fit(X).isConverge:
            return self.best_labels
        else:
            return ('Did not converge with the current parameters '
                    'or centroids, please try again')

    def get_params(self):
        return self.isConverge, self.n_clusters, 'KMEANS'

    def get_cost(self):
        return self.cost


def load_data():
    data = load_iris()
    x, y = data.data, data.target
    return x, y


if __name__ == '__main__':
    x, y = load_data()
    K = len(np.unique(y))
    model = KMeans(n_clusters=K)
    y_pred = model.fit_predict(x)
    nmi = normalized_mutual_info_score(y, y_pred)
    print("NMI: ", nmi)
def _calculate(self, input):
    # Drop rows containing NaNs, then score the two label columns against each other.
    input = input[~np.isnan(input).any(axis=1)]
    return normalized_mutual_info_score(input[:, 0], input[:, 1])
def my_kmeans():
    X, cluster = loaddata()
    kmeans = KMeans(n_clusters=len(set(cluster)), random_state=0).fit(X)
    print('kmeans result:')
    print('NMI score:%f\n' % normalized_mutual_info_score(cluster, kmeans.labels_))
def evaluation(file_name): #[V, miu] = [[], []] #V_target = [[]] V_target = [] #with gzip.open(file_name, 'rb') as f: #[V, miu] = pickle.load(f) #V_target = pickle.load(f) #with open(file_name, 'r') as fin: with open(file_name + '.tsv', 'r') as fin: for line in fin: l = line[0:-1].split('\t') l = [float(x) for x in l] V_target.append(l) #V_star = get_V_star(V, miu) #clusters = np.argmax(V_star[target], axis = 1) clusters_prob = V_target clusters = np.argmax(clusters_prob, axis=1) #Xinwei edited #clusters_prob = V_star[target] if args.debug: print("clusters_prob:") print(clusters_prob) print("clusters:") print(clusters) clusters_prob_parse = [] total = 0 correct = 0 aal = [] ppl = [] #new data only results = {} for i in range(len(target_list)): a = target_list[i] predict = clusters[i] + 1 if a in labels_dict_test: total += 1 actual = labels_dict_test[a] if actual == predict: correct += 1 aal.append(actual) ppl.append(predict) results[a] = (actual, predict) clusters_prob_parse.append(clusters_prob[i]) if args.debug: print('results is: {}'.format(results)) print("Total test size: {}".format(total)) try: print("Accuracy: {}".format(correct / total)) except ZeroDivisionError: print("0 total labels") precision = precision_score(np.array(aal), np.array(ppl), average=None) micro_precision = precision_score(np.array(aal), np.array(ppl), average='micro') macro_precision = precision_score(np.array(aal), np.array(ppl), average='macro') recall = recall_score(np.array(aal), np.array(ppl), average=None) micro_recall = recall_score(np.array(aal), np.array(ppl), average='micro') macro_recall = recall_score(np.array(aal), np.array(ppl), average='macro') f1 = f1_score(np.array(aal), np.array(ppl), average=None) micro_f1 = f1_score(np.array(aal), np.array(ppl), average='micro') macro_f1 = f1_score(np.array(aal), np.array(ppl), average='macro') nmi = normalized_mutual_info_score(np.array(aal), np.array(ppl)) ll = log_loss(np.array(aal), clusters_prob_parse) print('len(np.array(aal)) is: {}'.format(len(np.array(aal)))) print('len(clusters_prob) is: {}'.format(len(clusters_prob))) print('len(clusters_prob_parse) is: {}'.format(len(clusters_prob_parse))) print("precision: {}".format(precision)) print('micro precision: {}'.format(micro_precision)) print('macro precision: {}'.format(macro_precision)) print("recall: {}".format(recall)) print('micro recall: {}'.format(micro_recall)) print('macro recall: {}'.format(macro_recall)) print("f1: {}".format(f1)) print('micro f1: {}'.format(micro_f1)) print('macro f1: {}'.format(macro_f1)) print("nmi: {}".format(nmi)) print("ll: {}".format(ll))
print ccr(crossDatasetTrueClassification, clusteringResults)
# print labelAssignment
fileContainer.write('\n')
fileContainer.write('The Correct Clustering Rate is : ' + str(ccr(crossDatasetTrueClassification, clusteringResults)))
fileContainer.write('\n')
fileContainer.write('The clustering Jaccard Similarity is : ' + str(jaccardSim(crossDatasetTrueClassification, clusteringResults)))

# Flatten the per-class trajectory lists into one label per trajectory
wholeTrueClasses = np.zeros(1900)
for i, trueClass in enumerate(crossDatasetTrueClassification):
    for trajectory in trueClass:
        wholeTrueClasses[trajectory] = i
print list(wholeTrueClasses[:20])

wholePredClasses = np.zeros(1900)
for i, predClass in enumerate(clusteringResults):
    for trajectory in predClass:
        wholePredClasses[trajectory] = i
print list(wholePredClasses[:20])

print normalized_mutual_info_score(wholeTrueClasses, wholePredClasses)
fileContainer.write('\nThe NMI is : ' + str(normalized_mutual_info_score(wholeTrueClasses, wholePredClasses)))
fileContainer.write('\n--------------------------------------------------------------------')

print measurements.ccr(crossDatasetTrueClassification, clusteringResults)
print measurements.jaccardSim(crossDatasetTrueClassification, clusteringResults)
print measurements.NMI(crossDatasetTrueClassification, clusteringResults)
def NMI(GT, pred):
    return normalized_mutual_info_score(GT, pred)
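Assuming a scikit-learn version that exposes `average_method` (0.20 and later), a wrapper like the one above can also pin the normalization explicitly so results stay comparable across versions whose defaults differ; a sketch:

from sklearn.metrics import normalized_mutual_info_score

def nmi_geometric(gt, pred):
    # Pin the averaging method rather than relying on the library default.
    return normalized_mutual_info_score(gt, pred, average_method='geometric')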
# Separate data into training and test sets; fix the random state so results are repeatable
X_train, X_test, y_train, y_test = train_test_split(df[cols], df["quality"],
                                                    test_size=0.2, random_state=4)

# Declare and fit an SVM with RBF kernel on the training set
clf = svm.SVC(kernel="rbf", gamma=1, C=1, decision_function_shape="ovo")
clf.fit(X_train, y_train)
ypred = clf.predict(X_test)

# Scores for how well the SVM performs on the test set
print("Accuracy rbf kernel: %.2f" % clf.score(X_test, y_test))
print("nmi rbf kernel: %.2f" % normalized_mutual_info_score(y_test, ypred))
print(classification_report(y_test, ypred))

X = StandardScaler().fit_transform(df[cols])

# Finding a good eps for DBSCAN: compute the distance to the nearest n points for each
# point, sort and plot the results, then look where the change is most pronounced and
# select that as epsilon.
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(X)
distances, indices = nbrs.kneighbors(X)
distances = np.sort(distances, axis=0)
distances = distances[:, 1]
plt.plot(distances)
plt.title("finding optimal eps value for DBSCAN using elbow method")
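The snippet stops at the elbow plot. Once an eps value has been read off the curve, fitting DBSCAN and scoring it against the quality labels might look like the sketch below (the eps value is a placeholder, not taken from the original):

from sklearn.cluster import DBSCAN
from sklearn.metrics import normalized_mutual_info_score

eps = 0.5  # placeholder: use the value read off the elbow plot
db = DBSCAN(eps=eps, min_samples=5).fit(X)
print("nmi DBSCAN: %.2f" % normalized_mutual_info_score(df["quality"], db.labels_))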
ASC = alt_spectral_clust(data)
omg = objective_magnitude
db = ASC.db

ASC.set_values('q', 2)
ASC.set_values('C_num', 2)
ASC.set_values('lambda', 1)
ASC.set_values('kernel_type', 'Gaussian Kernel')
ASC.set_values('sigma', 15)
ASC.run()
a = db['allocation']
print a

import pdb
pdb.set_trace()

# print db['Y_matrix']
start_time = time.time()
ASC.run()
print("--- %s seconds ---" % (time.time() - start_time))
b = db['allocation']

print "NMI : ", normalized_mutual_info_score(a, b)

# sklearn.metrics.pairwise.pairwise_distances(X, Y=None, metric='euclidean', n_jobs=1, **kwds)
# new_d = db['data'].dot(db['W_matrix'])
# dm = sklearn.metrics.pairwise.pairwise_distances(new_d)
# np.savetxt('original_similarity.txt', db['Kernel_matrix'], fmt='%5.3f', delimiter=',', newline='\n', header='', footer='', comments='# ')

import pdb
pdb.set_trace()
            # (tail of nested loops over documents / topics / words from code not shown)
            if w in model.vocab:
                tw[w] = prob[model.vocab[w].index][tp]
                tmp += prob[model.vocab[w].index][tp]
        lista.append(tw)
        tw_topics.append(tmp)
    dist_topics.append(tw_topics)
    tw_l.append(lista)
    assign_topics.append(tw_topics.index(max(tw_topics)))

clf = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(clf, dist_topics, labels, cv=k - 1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
scores = cross_val_score(clf, dist_topics, labels, cv=k - 1, scoring='f1_macro')
print("F1_macro: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

print("Topic Coherence")
print(topic_coherence(word_topic, texts, 15))
print("Topic NMI")
print(normalized_mutual_info_score(assign_topics, labels))
print("Topic Purity")
print(purity_score(assign_topics, labels))
def main(): shutil.rmtree(save_path) os.mkdir(save_path) if data_path == '20newsgroups': newsgroups_data = fetch_20newsgroups_vectorized(subset='all') x = newsgroups_data.data.toarray() labels = newsgroups_data.target n_clusters = 20 elif data_path == 'r8': df = pd.read_csv('data/r8-all-stemmed.txt') labels_idx = [ 'acq', 'crude', 'earn', 'grain', 'interest', 'money-fx', 'ship', 'trade' ] labels = df['class'].values labels = [labels_idx.index(ele) for ele in labels] labels = np.asarray(labels, dtype=np.int64) x_df = df.drop(['class'], axis=1) corpus = np.squeeze(x_df.values) is_TfidfVectorizer = True if is_TfidfVectorizer: vectorizer = TfidfVectorizer() x = vectorizer.fit_transform(corpus).toarray() else: vectorizer = CountVectorizer() x = vectorizer.fit_transform(corpus).toarray() n_clusters = 8 elif data_path == 'olivetti_faces': data = fetch_olivetti_faces() x = data.data labels = data.target n_clusters = 40 elif data_path == 'rcv1': # data = fetch_rcv1() # x = data.data.toarray() # labels = data.target.toarray() # n_clusters = 103 x, labels = get_data_from_svmlight_file('data/rcv1_train.binary') x = x.toarray() n_clusters = 2 elif data_path == 'sector': x, labels = get_data_from_svmlight_file('data/sector.scale.all') x = x.toarray() n_clusters = 105 else: raise Exception("Invalid data path!") print("Data shape: (%d, %d)" % x.shape) data_size = labels.size # build model model = RDP_Model(in_c=x.shape[1], out_c=out_c, USE_GPU=USE_GPU, LR=LR, logfile=logfile, dropout_r=dropout_r) best_nmi = best_epoch = 0 loss = 0 for epoch in range(0, total_epoch): # random sampling with replacement for batch_i in range(epoch_batch): random_pos = random_list(0, data_size - 1, batch_size) batch_data = x[random_pos] loss = model.train_model(batch_data, epoch) if epoch % eval_interval == 0: print("epoch ", epoch, "loss:", loss) if logfile: logfile.write("epoch " + str(epoch) + " loss: " + str(loss) + '\n') model.save_model(save_path + 'model_latest.h5') # eval if is_eval: gap_dims = model.eval_model(x) kmeans_results = KMeans(n_clusters=n_clusters, random_state=0).fit(gap_dims) # Match each learned cluster label with the true labels found in them y_pred = kmeans_results.labels_ labels_pred = np.zeros_like(y_pred) for i in range(n_clusters): mask = (y_pred == i) labels_pred[mask] = mode(labels[mask])[0] # evaluations nmi_scores = normalized_mutual_info_score(labels, labels_pred) print("nmi_scores:", nmi_scores) if logfile: logfile.write("nmi_scores: %.4f\n" % nmi_scores) fscores = f1_score(labels, labels_pred, average='macro') print("fscores_macro:", fscores) if logfile: logfile.write("fscores_macro: %.4f\n" % fscores) fscores = f1_score(labels, labels_pred, average='micro') print("fscores_micro:", fscores) if logfile: logfile.write("fscores_micro: %.4f\n" % fscores) fscores = f1_score(labels, labels_pred, average='weighted') print("fscores_weighted:", fscores) if logfile: logfile.write("fscores_weighted: %.4f\n" % fscores) RI_scores = adjusted_rand_score(labels, labels_pred) print("RI_scores:", RI_scores) if logfile: logfile.write("RI_scores: %.4f\n" % RI_scores) if best_nmi < nmi_scores: best_nmi = nmi_scores best_epoch = epoch print("Best NMI: %.4f" % best_nmi) print("Best Epoch %d\n" % best_epoch) if logfile: logfile.write("Best NMI: %.4f\n" % best_nmi) logfile.write("Best Epoch %d\n\n" % best_epoch) logfile.flush()
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
from sklearn.mixture import GMM   # imported but unused below

k = 4
X = genfromtxt('dataset/min_words.csv', delimiter=',')
univ_label = genfromtxt('dataset/webkbRaw_label_univ.csv', delimiter=',')
topic_label = genfromtxt('dataset/webkbRaw_label_topic.csv', delimiter=',')

clf = KMeans(n_clusters=k)
allocation = clf.fit_predict(X)
kmeans_nmi = normalized_mutual_info_score(allocation, univ_label)
print "K means : ", kmeans_nmi

# Median-distance heuristic for the RBF kernel width
d_matrix = sklearn.metrics.pairwise.pairwise_distances(X, Y=None, metric='euclidean')
sigma = np.median(d_matrix)
Gamma = 1 / (2 * np.power(sigma, 2))

clf = SpectralClustering(n_clusters=k, gamma=Gamma)
allocation = clf.fit_predict(X)
spectral_nmi = normalized_mutual_info_score(allocation, univ_label)
print 'Spectral Clustering : ', spectral_nmi
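The median-distance heuristic used above also appears in other snippets in this collection; a small helper, written as an illustrative sketch rather than a fixed API, keeps the computation in one place:

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

def median_heuristic_gamma(X):
    """gamma = 1 / (2 * sigma^2), with sigma set to the median pairwise distance."""
    sigma = np.median(pairwise_distances(X, metric='euclidean'))
    return 1.0 / (2.0 * sigma ** 2)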
def main(args): # fix random seeds torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) np.random.seed(args.seed) device = torch.device('cuda:0' if torch.cuda.is_available() else "cpu") print(device) criterion = nn.CrossEntropyLoss() cluster_log = Logger(os.path.join(args.exp, 'clusters.pickle')) # CNN if args.verbose: print('Architecture: {}'.format(args.arch)) ''' ########################################## ########################################## # Model definition ########################################## ##########################################''' model = models.__dict__[args.arch](bn=True, num_cluster=args.nmb_cluster, num_category=args.nmb_category) fd = int(model.cluster_layer[0].weight.size() [1]) # due to transpose, fd is input dim of W (in dim, out dim) model.cluster_layer = None model.category_layer = None model.features = torch.nn.DataParallel(model.features) model = model.double() model.to(device) cudnn.benchmark = True if args.optimizer is 'Adam': print('Adam optimizer: conv') optimizer_body = torch.optim.Adam( filter(lambda x: x.requires_grad, model.parameters()), lr=args.lr_Adam, betas=(0.9, 0.999), weight_decay=10**args.wd, ) else: print('SGD optimizer: conv') optimizer_body = torch.optim.SGD( filter(lambda x: x.requires_grad, model.parameters()), lr=args.lr_SGD, momentum=args.momentum, weight_decay=10**args.wd, ) ''' ############### ############### category_layer ############### ############### ''' model.category_layer = nn.Sequential( nn.Linear(fd, args.nmb_category), nn.Softmax(dim=1), ) model.category_layer[0].weight.data.normal_(0, 0.01) model.category_layer[0].bias.data.zero_() model.category_layer = model.category_layer.double() model.category_layer.to(device) if args.optimizer is 'Adam': print('Adam optimizer: conv') optimizer_category = torch.optim.Adam( filter(lambda x: x.requires_grad, model.category_layer.parameters()), lr=args.lr_Adam, betas=(0.9, 0.999), weight_decay=10**args.wd, ) else: print('SGD optimizer: conv') optimizer_category = torch.optim.SGD( filter(lambda x: x.requires_grad, model.category_layer.parameters()), lr=args.lr_SGD, momentum=args.momentum, weight_decay=10**args.wd, ) ''' ######################################## ######################################## Create echogram sampling index ######################################## ########################################''' print('Sample echograms.') dataset_cp, dataset_semi = sampling_echograms_full(args) dataloader_cp = torch.utils.data.DataLoader(dataset_cp, shuffle=False, batch_size=args.batch, num_workers=args.workers, drop_last=False, pin_memory=True) dataloader_semi = torch.utils.data.DataLoader(dataset_semi, shuffle=False, batch_size=args.batch, num_workers=args.workers, drop_last=False, pin_memory=True) dataset_test_bal, dataset_test_unbal = sampling_echograms_test(args) dataloader_test_bal = torch.utils.data.DataLoader(dataset_test_bal, shuffle=False, batch_size=args.batch, num_workers=args.workers, drop_last=False, pin_memory=True) dataloader_test_unbal = torch.utils.data.DataLoader( dataset_test_unbal, shuffle=False, batch_size=args.batch, num_workers=args.workers, drop_last=False, pin_memory=True) # clustering algorithm to use deepcluster = clustering.__dict__[args.clustering](args.nmb_cluster, args.pca) # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] # remove top located layer parameters from 
checkpoint copy_checkpoint_state_dict = checkpoint['state_dict'].copy() for key in list(copy_checkpoint_state_dict): if 'cluster_layer' in key: del copy_checkpoint_state_dict[key] # if 'category_layer' in key: # del copy_checkpoint_state_dict[key] checkpoint['state_dict'] = copy_checkpoint_state_dict model.load_state_dict(checkpoint['state_dict']) optimizer_body.load_state_dict(checkpoint['optimizer_body']) optimizer_category.load_state_dict( checkpoint['optimizer_category']) category_save = os.path.join(args.exp, 'category_layer.pth.tar') if os.path.isfile(category_save): category_layer_param = torch.load(category_save) model.category_layer.load_state_dict(category_layer_param) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) # creating checkpoint repo exp_check = os.path.join(args.exp, 'checkpoints') if not os.path.isdir(exp_check): os.makedirs(exp_check) exp_bal = os.path.join(args.exp, 'bal') exp_unbal = os.path.join(args.exp, 'unbal') for dir_bal in [exp_bal, exp_unbal]: for dir_2 in ['features', 'pca_features', 'pred']: dir_to_make = os.path.join(dir_bal, dir_2) if not os.path.isdir(dir_to_make): os.makedirs(dir_to_make) if os.path.isfile(os.path.join(args.exp, 'loss_collect.pickle')): with open(os.path.join(args.exp, 'loss_collect.pickle'), "rb") as f: loss_collect = pickle.load(f) else: loss_collect = [[], [], [], [], [], [], [], [], []] if os.path.isfile(os.path.join(args.exp, 'nmi_collect.pickle')): with open(os.path.join(args.exp, 'nmi_collect.pickle'), "rb") as ff: nmi_save = pickle.load(ff) else: nmi_save = [] ''' ####################### ####################### MAIN TRAINING ####################### #######################''' for epoch in range(args.start_epoch, args.epochs): end = time.time() print( '##################### Start training at Epoch %d ################' % epoch) model.classifier = nn.Sequential( *list(model.classifier.children()) [:-1]) # remove ReLU at classifier [:-1] model.cluster_layer = None model.category_layer = None ''' ####################### ####################### PSEUDO-LABEL GENERATION ####################### ####################### ''' print('Cluster the features') features_train, input_tensors_train, labels_train = compute_features( dataloader_cp, model, len(dataset_cp), device=device, args=args) clustering_loss, pca_features = deepcluster.cluster( features_train, verbose=args.verbose) nan_location = np.isnan(pca_features) inf_location = np.isinf(pca_features) if (not np.allclose(nan_location, 0)) or (not np.allclose( inf_location, 0)): print('PCA: Feature NaN or Inf found. 
Nan count: ', np.sum(nan_location), ' Inf count: ', np.sum(inf_location)) print('Skip epoch ', epoch) torch.save(pca_features, 'tr_pca_NaN_%d.pth.tar' % epoch) torch.save(features_train, 'tr_feature_NaN_%d.pth.tar' % epoch) continue print('Assign pseudo labels') size_cluster = np.zeros(len(deepcluster.images_lists)) for i, _list in enumerate(deepcluster.images_lists): size_cluster[i] = len(_list) print('size in clusters: ', size_cluster) img_label_pair_train = zip_img_label(input_tensors_train, labels_train) train_dataset = clustering.cluster_assign( deepcluster.images_lists, img_label_pair_train) # Reassigned pseudolabel # uniformly sample per target sampler_train = UnifLabelSampler(int(len(train_dataset)), deepcluster.images_lists) train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch, shuffle=False, num_workers=args.workers, sampler=sampler_train, pin_memory=True, ) ''' #################################################################### #################################################################### TRSNSFORM MODEL FOR SELF-SUPERVISION // SEMI-SUPERVISION #################################################################### #################################################################### ''' # Recover classifier with ReLU (that is not used in clustering) mlp = list(model.classifier.children( )) # classifier that ends with linear(512 * 128). No ReLU at the end mlp.append(nn.ReLU(inplace=True).to(device)) model.classifier = nn.Sequential(*mlp) model.classifier.to(device) '''SELF-SUPERVISION (PSEUDO-LABELS)''' model.category_layer = None model.cluster_layer = nn.Sequential( nn.Linear(fd, args.nmb_cluster), # nn.Linear(4096, num_cluster), nn.Softmax( dim=1 ), # should be removed and replaced by ReLU for category_layer ) model.cluster_layer[0].weight.data.normal_(0, 0.01) model.cluster_layer[0].bias.data.zero_() model.cluster_layer = model.cluster_layer.double() model.cluster_layer.to(device) ''' train network with clusters as pseudo-labels ''' with torch.autograd.set_detect_anomaly(True): pseudo_loss, semi_loss, semi_accuracy = semi_train( train_dataloader, dataloader_semi, model, fd, criterion, optimizer_body, optimizer_category, epoch, device=device, args=args) # save checkpoint if (epoch + 1) % args.checkpoints == 0: path = os.path.join( args.exp, 'checkpoints', 'checkpoint_' + str(epoch) + '.pth.tar', ) if args.verbose: print('Save checkpoint at: {0}'.format(path)) torch.save( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'optimizer_body': optimizer_body.state_dict(), 'optimizer_category': optimizer_category.state_dict(), }, path) ''' ############## ############## # TEST phase ############## ############## ''' test_loss_bal, test_accuracy_bal, test_pred_bal, test_label_bal = test( dataloader_test_bal, model, criterion, device, args) test_loss_unbal, test_accuracy_unbal, test_pred_unbal, test_label_unbal = test( dataloader_test_unbal, model, criterion, device, args) '''Save prediction of the test set''' if (epoch % args.save_epoch == 0): with open( os.path.join(args.exp, 'bal', 'pred', 'sup_epoch_%d_te_bal.pickle' % epoch), "wb") as f: pickle.dump([test_pred_bal, test_label_bal], f) with open( os.path.join(args.exp, 'unbal', 'pred', 'sup_epoch_%d_te_unbal.pickle' % epoch), "wb") as f: pickle.dump([test_pred_unbal, test_label_unbal], f) if args.verbose: print('###### Epoch [{0}] ###### \n' 'Time: {1:.3f} s\n' 'Pseudo tr_loss: {2:.3f} \n' 'SEMI tr_loss: {3:.3f} \n' 'TEST_bal loss: {4:.3f} \n' 'TEST_unbal loss: {5:.3f} \n' 
'Clustering loss: {6:.3f} \n\n' 'SEMI accu: {7:.3f} \n' 'TEST_bal accu: {8:.3f} \n' 'TEST_unbal accu: {9:.3f} \n'.format( epoch, time.time() - end, pseudo_loss, semi_loss, test_loss_bal, test_loss_unbal, clustering_loss, semi_accuracy, test_accuracy_bal, test_accuracy_unbal)) try: nmi = normalized_mutual_info_score( clustering.arrange_clustering(deepcluster.images_lists), clustering.arrange_clustering(cluster_log.data[-1])) nmi_save.append(nmi) print('NMI against previous assignment: {0:.3f}'.format(nmi)) with open(os.path.join(args.exp, 'nmi_collect.pickle'), "wb") as ff: pickle.dump(nmi_save, ff) except IndexError: pass print('####################### \n') # save cluster assignments cluster_log.log(deepcluster.images_lists) # save running checkpoint torch.save( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'optimizer_body': optimizer_body.state_dict(), 'optimizer_category': optimizer_category.state_dict(), }, os.path.join(args.exp, 'checkpoint.pth.tar')) torch.save(model.category_layer.state_dict(), os.path.join(args.exp, 'category_layer.pth.tar')) loss_collect[0].append(epoch) loss_collect[1].append(pseudo_loss) loss_collect[2].append(semi_loss) loss_collect[3].append(clustering_loss) loss_collect[4].append(test_loss_bal) loss_collect[5].append(test_loss_unbal) loss_collect[6].append(semi_accuracy) loss_collect[7].append(test_accuracy_bal) loss_collect[8].append(test_accuracy_unbal) with open(os.path.join(args.exp, 'loss_collect.pickle'), "wb") as f: pickle.dump(loss_collect, f) ''' ############################ ############################ # PSEUDO-LABEL GEN: Test set (balanced UA) ############################ ############################ ''' model.classifier = nn.Sequential( *list(model.classifier.children()) [:-1]) # remove ReLU at classifier [:-1] model.cluster_layer = None model.category_layer = None print('TEST set: Cluster the features') features_te_bal, input_tensors_te_bal, labels_te_bal = compute_features( dataloader_test_bal, model, len(dataset_test_bal), device=device, args=args) clustering_loss_te_bal, pca_features_te_bal = deepcluster.cluster( features_te_bal, verbose=args.verbose) mlp = list(model.classifier.children( )) # classifier that ends with linear(512 * 128). No ReLU at the end mlp.append(nn.ReLU(inplace=True).to(device)) model.classifier = nn.Sequential(*mlp) model.classifier.to(device) nan_location_bal = np.isnan(pca_features_te_bal) inf_location_bal = np.isinf(pca_features_te_bal) if (not np.allclose(nan_location_bal, 0)) or (not np.allclose( inf_location_bal, 0)): print('PCA: Feature NaN or Inf found. 
Nan count: ', np.sum(nan_location_bal), ' Inf count: ', np.sum(inf_location_bal)) print('Skip epoch ', epoch) torch.save(pca_features_te_bal, 'te_pca_NaN_%d_bal.pth.tar' % epoch) torch.save(features_te_bal, 'te_feature_NaN_%d_bal.pth.tar' % epoch) continue # save patches per epochs cp_epoch_out_bal = [ features_te_bal, deepcluster.images_lists, deepcluster.images_dist_lists, input_tensors_te_bal, labels_te_bal ] if (epoch % args.save_epoch == 0): with open( os.path.join(args.exp, 'bal', 'features', 'cp_epoch_%d_te_bal.pickle' % epoch), "wb") as f: pickle.dump(cp_epoch_out_bal, f) with open( os.path.join(args.exp, 'bal', 'pca_features', 'pca_epoch_%d_te_bal.pickle' % epoch), "wb") as f: pickle.dump(pca_features_te_bal, f) ''' ############################ ############################ # PSEUDO-LABEL GEN: Test set (Unbalanced UA) ############################ ############################ ''' model.classifier = nn.Sequential( *list(model.classifier.children()) [:-1]) # remove ReLU at classifier [:-1] model.cluster_layer = None model.category_layer = None print('TEST set: Cluster the features') features_te_unbal, input_tensors_te_unbal, labels_te_unbal = compute_features( dataloader_test_unbal, model, len(dataset_test_unbal), device=device, args=args) clustering_loss_te_unbal, pca_features_te_unbal = deepcluster.cluster( features_te_unbal, verbose=args.verbose) mlp = list(model.classifier.children( )) # classifier that ends with linear(512 * 128). No ReLU at the end mlp.append(nn.ReLU(inplace=True).to(device)) model.classifier = nn.Sequential(*mlp) model.classifier.to(device) nan_location_unbal = np.isnan(pca_features_te_unbal) inf_location_unbal = np.isinf(pca_features_te_unbal) if (not np.allclose(nan_location_unbal, 0)) or (not np.allclose( inf_location_unbal, 0)): print('PCA: Feature NaN or Inf found. Nan count: ', np.sum(nan_location_unbal), ' Inf count: ', np.sum(inf_location_unbal)) print('Skip epoch ', epoch) torch.save(pca_features_te_unbal, 'te_pca_NaN_%d_unbal.pth.tar' % epoch) torch.save(features_te_unbal, 'te_feature_NaN_%d_unbal.pth.tar' % epoch) continue # save patches per epochs cp_epoch_out_unbal = [ features_te_unbal, deepcluster.images_lists, deepcluster.images_dist_lists, input_tensors_te_unbal, labels_te_unbal ] if (epoch % args.save_epoch == 0): with open( os.path.join(args.exp, 'unbal', 'features', 'cp_epoch_%d_te_unbal.pickle' % epoch), "wb") as f: pickle.dump(cp_epoch_out_unbal, f) with open( os.path.join(args.exp, 'unbal', 'pca_features', 'pca_epoch_%d_te_unbal.pickle' % epoch), "wb") as f: pickle.dump(pca_features_te_unbal, f)
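The NMI-against-previous-assignment check in the training loop above is a common way to monitor how stable the pseudo-labels are between epochs. As a standalone illustration (the list name below is a placeholder, not a variable from the script), the idea reduces to:

from sklearn.metrics import normalized_mutual_info_score

previous_assignment = None
for epoch_assignment in all_epoch_assignments:   # placeholder: one label array per epoch
    if previous_assignment is not None:
        stability = normalized_mutual_info_score(previous_assignment, epoch_assignment)
        print('NMI against previous assignment: {0:.3f}'.format(stability))
    previous_assignment = epoch_assignment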