def clusterise_data(data_obj):
    """
    Assign a cluster label to each past day present in the data received.

    Every entry except the last one (the current day) is converted into a
    weighted 4-feature point and clustered with the algorithm selected by the
    module-level ALGO setting: "MeanShift", "Affinity Propagation" or
    "KMeans". The label of each clustered day is stored in place under its
    "cluster" key; nothing is returned. Per-cluster sizes and the silhouette
    coefficient are reported through debug_print.

    @param data_obj: List of dictionaries, each providing the keys
                     "wake_up", "sleep", "sleep_duration" and "activity".
                     The last entry is treated as the current day and is
                     neither clustered nor labelled.
    @raise Exception: if ALGO is not one of the supported algorithm names.
    """
    # Validate the algorithm first so an unknown name is reported even when
    # there is no history to cluster.
    if ALGO not in ("Affinity Propagation", "KMeans", "MeanShift"):
        raise Exception("Algorithm not defined: "+str(ALGO))

    history = data_obj[:-1]  # don't include current day
    if not history:
        # No past day to cluster: there is nothing to label.
        return

    # wake_up and sleep_duration are the most important factors,
    # hence their larger weights.
    weighted_features = (
        ("wake_up", 5),
        ("sleep", 1),
        ("sleep_duration", 5),
        ("activity", 0.5),
    )
    points = NumpyArray([
        [weight * day[key] for key, weight in weighted_features]
        for day in history
    ])

    if ALGO == "Affinity Propagation":
        labels = AffinityPropagation().fit_predict(points)
    elif ALGO == "KMeans":
        labels = KMeans(init='k-means++', n_clusters=5, n_init=10).fit_predict(points)
    else:  # "MeanShift" (ALGO was validated above)
        bandwidth = estimate_bandwidth(points, quantile=0.2, n_samples=20)
        labels = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit_predict(points)

    for day, label in zip(history, labels):
        day["cluster"] = label

    # Convert once instead of once per cluster.
    label_list = labels.tolist()
    for unique_label in remove_duplicates(labels):
        debug_print(ALGO+": Cluster "+str(unique_label)+" contains "+str(label_list.count(unique_label))+" data points")

    # silhouette_score raises ValueError unless the number of distinct labels
    # lies between 2 and n_samples - 1; a purely informational statistic must
    # not abort the clustering, so it is only computed when it is defined.
    n_clusters = len(set(label_list))
    if 2 <= n_clusters <= len(label_list) - 1:
        score = metrics.silhouette_score(points, labels, metric='euclidean')
        debug_print(ALGO+": Silhouette coefficient: "+str(score*100)+"%")
    else:
        debug_print(ALGO+": Silhouette coefficient undefined for "+str(n_clusters)+" cluster(s)")
# Rebuild one whitespace-separated string per tokenised summary so the
# TF-IDF vectoriser can consume it.
string_summary = [" ".join(tokens) for tokens in token_summary]

print("Getting TFIDF...")
vectorizer = TfidfVectorizer(norm='l2')
tfidf_matrix = vectorizer.fit_transform(string_summary)
sim_matrix = cosine_similarity(tfidf_matrix)

print("Start clustering...")
start = time.time()
labels = AffinityPropagation().fit_predict(sim_matrix)
elapsed = time.time() - start
print("{:.2f}s".format(elapsed))
labels = labels.tolist()

# Number of documents assigned to each cluster label.
# NOTE(review): this name shadows the builtin `dict`; it is kept unchanged
# because code outside this section may refer to it.
dict = {}
for label in labels:
    dict[label] = dict.get(label, 0) + 1


def dict2list(dic:dict):
    """Convert a dictionary into a list of (key, value) tuples."""
    return list(dic.items())