def calcBackProject(image, tags, histograms):
    """Score each tag by the share of pixels its histogram back-projects onto.

    ``cl.kmeans(image)`` is called first and its return value is discarded;
    presumably it is relied on for a side effect -- TODO confirm against ``cl``.

    Args:
        image: Image array; channels 0-2 are back-projected and only
            ``shape[0]`` and ``shape[1]`` are read for the pixel count.
        tags: Iterable of keys into ``histograms``.
        histograms: Mapping from tag to the histogram handed to
            ``cv2.calcBackProject``.

    Returns:
        dict mapping every tag to
        ``countNonZero(back projection) / (shape[0] * shape[1])``.
    """
    cl.kmeans(image)

    channels = [0, 1, 2]
    ranges = [0, 180, 0, 256, 0, 256]
    scores = {}
    for tag in tags:
        print(tag)
        back_projection = cv2.calcBackProject(
            [image], channels, histograms[tag], ranges, 1)
        hits = cv2.countNonZero(back_projection)
        scores[tag] = hits / float(image.shape[0] * image.shape[1])
    return scores
def k(tags, data, K, tries=5):
    """Cluster ``data`` via ``kmeans`` and return the parsed cluster indices.

    Args:
        tags: Sequence whose length is compared with the number of indices.
        data: Data to cluster; passed to ``kmeans`` as ``str(data)``.
        K: Forwarded to ``kmeans`` as its second argument.
        tries: Forwarded to ``kmeans`` as its third argument.

    Returns:
        The object obtained by evaluating the string returned by ``kmeans``.
    """
    # kmeans exchanges data as text: it receives str(data) and hands back a
    # string that still has to be turned into a Python object.
    idxs = kmeans(str(data), K, tries)
    # NOTE(review): eval() executes arbitrary expressions. This is only safe
    # while the kmeans output is trusted; consider ast.literal_eval if the
    # result is always a plain literal.
    idxs = eval(idxs)
    # Sanity output: True when there is exactly one index per tag.
    print(len(idxs) == len(tags))
    return idxs
def onlytransfer(n_clusters, fb_kmeans=True):
    """Cluster precomputed features and copy each image into its cluster folder.

    Loads the feature matrix and the matching list of image paths from the
    hard-coded ``data/chan/8chan_pol/VGG16/fc1`` directory, clusters the
    features, and copies every image into a per-cluster results directory.

    Args:
        n_clusters: Number of clusters requested from the clustering backend.
        fb_kmeans: When true, use ``kmeans`` followed by ``arrange_clustering``;
            otherwise use ``KMeans(n_clusters=..., n_init=20).fit_predict``.

    Returns:
        None. The result is the set of files copied on disk.
    """
    X = np.load('data/chan/8chan_pol/VGG16/fc1/featuresx.npy').astype('float32')

    # ``with`` closes the handle even if reading raises part-way through.
    with open('data/chan/8chan_pol/VGG16/fc1/paths.txt', "r") as pathfile:
        # Strip only the line terminator. The earlier ``path[:-1]`` removed a
        # real character from the last path whenever the file did not end
        # with a newline.
        pathlist = [line.rstrip('\n') for line in pathfile]

    if fb_kmeans:
        images_lists, _ = kmeans(X, nmb_clusters=n_clusters, preprocess=False)
        Y_pred = arrange_clustering(images_lists)
    else:
        km = KMeans(n_clusters=n_clusters, n_init=20)
        Y_pred = km.fit_predict(X)

    for y_pred, path in zip(Y_pred, pathlist):
        savedir = '/home/elahe/NortfaceProject/codes/DEC-keras/results/clusters/8chan_pol/%s/%s/%s' % (
            'transfer', 'fc1', y_pred)
        if not os.path.exists(savedir):
            os.makedirs(savedir)
        shutil.copy(path, savedir)
def main():
    """Command-line entry point: run k-means on a gene matrix and print scores.

    Expects ``sys.argv`` to hold ``k``, the data file and, optionally, an
    annotations file. Prints an error, calls ``usage()`` and exits with
    status 1 when the argument count is wrong.
    """
    # Validate the number of command-line arguments.
    if len(sys.argv) not in [3, 4]:
        print("Error, incorrect number of arguments")
        usage()
        sys.exit(1)

    k = check_k(sys.argv[1])
    check_argv(sys.argv[2])

    # The annotations file is optional; without it instances are indexed by
    # their line number in the original file.
    annotations = None
    if len(sys.argv) == 4:
        check_argv(sys.argv[3])
        annotations = sys.argv[3]

    geneMatrix = construct_geneMatrix(sys.argv[2], annotations)
    geneMatrix = normalize_matrix(geneMatrix)

    # Cluster and report.
    sse, aic, silhouette = kmeans(geneMatrix, k)
    print("K-means with k = %d\n" % k)
    print("%-15s %.2f\n%-15s %.2f\n%-15s %.2f\n" % (
        'SSE:', sse, 'AIC:', aic, 'Silhouette:', silhouette))
def run_k_means(self):
    """Cluster ``self.points`` into two groups and redraw them on the canvas.

    The first two points are used as starting centers. The resulting center
    with the smaller second coordinate is drawn in blue, the other in red,
    and every point is drawn in the color of its cluster.
    """
    print("RUNNING KMEANS")
    print("Points")
    print(self.points)
    print("Starting Centers: {}".format([self.points[0], self.points[1]]))

    seeds = [self.points[0], self.points[1]]
    centers, clusters = clustering.kmeans(self.points, seeds)

    # The center with the smaller index-1 coordinate becomes the blue one;
    # ties keep the first center.
    blue_center = centers[1] if centers[1][1] < centers[0][1] else centers[0]

    # Wipe the canvas, then draw every center followed by its points.
    self.click_canvas.delete("all")
    for center_i, center in enumerate(centers):
        color = 'blue' if center is blue_center else 'red'
        print("Cluster {} with center {}".format(color, center))
        print("Cluster Points:")
        print(clusters[center_i])
        self.draw_center(center, color=color)
        for cluster_coord in clusters[center_i]:
            self.draw_dot(cluster_coord, color=color)
def quantize_single_sample(points, k):
    """Apply quantization to a single sample with accelerometer observations.

    Args:
        points: Array-like of observations; converted to a float array.
        k: Number of clusters passed to ``kmeans`` as ``n_clusters``.

    Returns:
        The centroids returned by ``kmeans``, flattened into one vector.
    """
    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy; the builtin gives the same float64 dtype.
    X = np.asarray(points, dtype=float)
    centroids, _ = kmeans(X, n_clusters=k)
    return centroids.flatten()
def __init__(self):
    """Build a vertical layout with one ``ClusterWidget`` per non-empty cluster.

    Clusters come from ``kmeans()``; those without palettes are skipped.
    """
    super(ClustersWidget, self).__init__()
    layout = QtGui.QVBoxLayout()
    for cluster in kmeans():
        # Only clusters that actually hold palettes get a widget.
        if len(cluster.palettes) > 0:
            layout.addWidget(ClusterWidget(cluster))
    self.setLayout(layout)
def test_ffh_10(self):
    """Mean final objective over 10 seeded 'ffh' runs must match the reference.

    Each repetition seeds NumPy with ``1234 + rep``, initialises 10 clusters
    with the 'ffh' strategy and records the last objective value reported by
    ``clustering.kmeans``.
    """
    finalObj = []
    for rep in range(10):
        np.random.seed(1234 + rep)
        mu0 = clustering.initialize_clusters(self.X, 10, 'ffh')
        (mu, z, obj) = clustering.kmeans(self.X, mu0, doPlot=False)
        finalObj.append(obj[-1])
    targetObj = 0.44031610993896342
    # The tolerance used to be 1e4, which let practically any result pass
    # and made the check vacuous; 1e-4 actually pins the reference value.
    self.assertTrue(abs(np.mean(finalObj) - targetObj) <= 1e-4)
def test_km_plus_plus_10(self):
    """Mean final objective over 20 seeded 'km++' runs must match the reference.

    Each repetition seeds NumPy with ``1234 + rep``, initialises 10 clusters
    with the 'km++' strategy and records the last objective value reported
    by ``clustering.kmeans``.
    """
    finalObj = []
    for rep in range(20):
        np.random.seed(1234 + rep)
        mu0 = clustering.initialize_clusters(self.X, 10, 'km++')
        (mu, z, obj) = clustering.kmeans(self.X, mu0, doPlot=False)
        finalObj.append(obj[-1])
    targetObj = 0.4392510535744174
    # The tolerance used to be 1e4, which let practically any result pass
    # and made the check vacuous; 1e-4 actually pins the reference value.
    self.assertTrue(abs(np.mean(finalObj) - targetObj) <= 1e-4)
def cluster(algorithm, similarity, encoding, outlier, rep, chart_id, key=None):
    """Cluster the time series of a chart and return the labels as JSON.

    Args:
        algorithm: Clustering algorithm. "k-means", "k-means-constrained",
            "k-medians" and "zone" are handled explicitly; any other value
            falls through to DBSCAN.
        similarity: Similarity measure forwarded to preprocessing (and to
            DBSCAN).
        encoding: Label-encoding method forwarded to preprocessing (and to
            DBSCAN).
        outlier: Outlier setting forwarded to the clustering routines.
        rep: Data representation; the per-cluster min/max bands are only
            computed when this equals "bands".
        chart_id: Id of the file holding the data to cluster.
        key: Key of the time-series labels to keep; forwarded to
            ``clustering.time_series_array``.

    Returns:
        The loaded data unchanged when it has no "timeSeries" entry.
        Otherwise a JSON response with "cluster_labels", "min_max", "dates"
        and "outlier_indexes"; the last three are empty lists unless
        ``rep == "bands"``.
    """
    data = load_data(chart_id)
    if "timeSeries" not in data:
        return data

    series, label_dict, ts_to_labels, dates, old_range = (
        clustering.time_series_array(data, key))
    prepared = clustering.preprocess(
        series, encoding, similarity, ts_to_labels, algorithm)

    if algorithm == "k-means":
        labels = clustering.kmeans(prepared, outlier).tolist()
    elif algorithm in ("k-means-constrained", "k-medians"):
        labels = clustering.kmeans_kmedians(
            prepared, label_dict, ts_to_labels, algorithm, outlier).tolist()
    elif algorithm == "zone":
        labels = clustering.cluster_zone(label_dict, ts_to_labels)
    else:
        labels = clustering.dbscan(
            prepared, similarity, encoding, outlier).tolist()

    if rep == "bands":
        min_max, ordered_dates, outlier_indexes = clustering.clusters_min_max(
            series, labels, dates, old_range, outlier)
    else:
        min_max, ordered_dates, outlier_indexes = [], [], []

    return jsonify({
        "cluster_labels": labels,
        "min_max": min_max,
        "dates": ordered_dates,
        "outlier_indexes": outlier_indexes
    })
def test_kmeans(self):
    """Run ``clustering.kmeans`` on the fixture data and optionally plot it.

    NOTE(review): the comparison against the reference centroids is
    commented out, so this test currently only checks that the call
    completes.
    """
    # Cluster the test set with at most 10 iterations. The reference
    # centroids below were recorded for a fixed seed.
    centres, labels, numIts = clustering.kmeans(
        self.data, self.numClusters, maxNumIts=10)
    expectedCentroids = np.float32(
        [[2.80, -2.73], [-3.38, -2.94], [2.62, 3.10], [-2.46, 2.78]])
    # np.testing.assert_almost_equal(centres, expectedCentroids, 2)

    if self.plotResults:
        self.plotData_kMeans(self.data, centres, labels, 1)
def step2_cal(): """第二步计算,判断其他类是否需要分裂,若需要,则对其他类进行文本聚类,并做聚类评价 """ # 聚类评价时选取TOPK_FREQ_WORD的高频词 TOPK_FREQ_WORD = 50 # 聚类评价时最小簇的大小 LEAST_SIZE = 8 # 判断其他类是否需要分裂 ifsplit = event.check_ifsplit(initializing) print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' split ', ifsplit, ' %s start step2' % ts2datetime(timestamp) if ifsplit: inputs, kmeans_cluster_num, reserve_num = event.getOtherSubEventInfos(initializing) print eventid, ' after classify before split: ', len(inputs), kmeans_cluster_num, reserve_num if len(inputs) > 2: items = [] for r in inputs: r["title"] = r["title"].encode("utf-8") r["content"] = r["content168"].encode("utf-8") items.append(r) # kmeans聚类 kmeans_results = kmeans(items, k=kmeans_cluster_num) # 聚类评价 if initializing or now_hour == 0: min_tfidf = event.get_min_tfidf() final_cluster_results, tfidf_dict = cluster_evaluation(kmeans_results, top_num=reserve_num, topk_freq=TOPK_FREQ_WORD, least_size=LEAST_SIZE, min_tfidf=min_tfidf) else: # 每小时聚类时,不用和已有簇的最小tfidf作比 final_cluster_results, tfidf_dict = cluster_evaluation(kmeans_results, top_num=reserve_num, topk_freq=TOPK_FREQ_WORD, least_size=LEAST_SIZE) # 更新新闻簇标签,更新子事件表 for label, items in final_cluster_results.iteritems(): if label == "other": label = event.getOtherSubEventID() event.save_subevent(label, timestamp) if label != event.getOtherSubEventID(): # 更新每类的tfidf event.update_subevent_tfidf(label, tfidf_dict[label]) for r in items: news = News(r["_id"], event.id) news.update_news_subeventid(label) else: print 'inputs less than 2, kmeans aborted' print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step2' % ts2datetime(timestamp)
def kmeans(model_file=None, nclusters=None, output_file=None):
    """Cluster word vectors from a word2vec text file and print the clusters.

    Args:
        model_file: Path of the word2vec model, loaded with ``binary=False``.
        nclusters: Number of clusters passed to ``clustering.kmeans``.
        output_file: Path to write to; when falsy the clusters go to
            ``sys.stdout``.

    Returns:
        None. One line is written per cluster, holding its members separated
        by single spaces.
    """
    word_vectors = KeyedVectors.load_word2vec_format(model_file, binary=False)
    clusters = clustering.kmeans(word_vectors, nclusters)[0]

    out = open(output_file, 'w') if output_file else sys.stdout
    try:
        for k in clusters:
            out.write("%s\n" % " ".join(clusters[k]))
    finally:
        # Close only a file we opened ourselves (never sys.stdout), and do
        # so even when a write fails so the handle is not leaked.
        if output_file:
            out.close()
def getOptimalWordCluster(self, target_words, bomb_words):
    '''
    Creates clusters of words by cosine similarity, with increasing
    centroid count until a cluster is created containing no bomb_words

    params:
        target_words (sequence, str): words to maximize in clusters
        bomb_words (sequence, str): words to be avoided in clusters

    returns:
        The best cluster as ``[group center word embedding, words in group]``,
        i.e. the bomb-free cluster with the most members (the first one found
        wins ties). When exactly one target word is given, that word itself
        is returned without any clustering.

    raises:
        ValueError: if target_words is empty, or if every target word is
            also a bomb word (no bomb-free cluster can ever exist).
    '''
    if len(target_words) < 1:
        raise ValueError("Empty target word list provided")
    if len(target_words) == 1:
        return target_words[0]

    # Set membership keeps the per-cluster bomb check cheap.
    bomb_set = set(bomb_words)

    # A bomb-free cluster can only consist of target words that are not
    # bombs. Without at least one such word the search loop below could
    # never succeed and would spin forever, so fail fast instead.
    if all(word in bomb_set for word in target_words):
        raise ValueError(
            "Every target word is also a bomb word; "
            "no bomb-free cluster can exist")

    full_word_set = np.array(list(target_words) + list(bomb_words))
    embeddings = self.getWordEmbeddings(full_word_set)

    potential_groups = []
    num_groups = max(int(len(target_words) / 2), 1)

    # keep clustering until a group of similar words without bomb_words is
    # found; increase the number of clusters after each failed iteration
    while not potential_groups:
        grouper = clustering.kmeans(k=num_groups, measure='cosine')
        grouper.fit(embeddings)
        cluster_labels = np.array(grouper.train_labels)

        # collect every cluster that contains no bomb word
        for cluster in np.unique(cluster_labels):
            current_cluster_words = full_word_set[cluster_labels == cluster]
            if len(current_cluster_words) > 0 and \
                    not any(word in bomb_set for word in current_cluster_words):
                potential_groups.append(
                    [grouper.centroids[cluster], current_cluster_words])
        num_groups += 1

    # largest group wins; max() keeps the first of equally sized groups,
    # matching a stable descending sort followed by taking element 0
    return max(potential_groups, key=lambda group: len(group[1]))
def main():
    """Run k-means for k = 2..5 on the blobs dataset and save a 2x2 figure.

    Seeds NumPy with 1, reads ``datasets/blobs.csv`` next to this file,
    clusters it once per value of k, draws points and centroids into one
    subplot each and writes the figure to ``clusters.png``.
    """
    np.random.seed(1)
    here = dirname(__file__)
    X, _ = read_csv(join(here, 'datasets', 'blobs.csv'))

    figure, axes = plt.subplots(2, 2, figsize=(12, 12))
    axes = axes.flatten()

    font = font_manager.FontProperties(
        fname=join(here, 'fonts', 'RobotoSlab-Regular.ttf'))
    font.set_size(16)
    gray = '#3c3c3c'

    for i, n_clusters in enumerate((2, 3, 4, 5)):
        print('Running K-means with k=%d' % n_clusters)
        centroids, score = kmeans(X, n_clusters)
        print('Best inertia score: %.2f' % score)

        # Subplots are tagged (a), (b), ... in order.
        title = '(%s) k=%d, inertia=%2.2f' % (
            string.ascii_letters[i], n_clusters, score)
        labels = assign_labels(X, centroids)
        get_color = palette()
        colors = [get_color(label) for label in labels]

        ax = axes[i]
        ax.scatter(X[:, 0], X[:, 1], c=colors, s=50, alpha=0.6)
        ax.set_title(title, fontproperties=font, color=gray)
        ax.set_xticks([])
        ax.set_yticks([])
        for spine in ('top', 'right', 'bottom', 'left'):
            ax.spines[spine].set_color(gray)
        for (x, y) in centroids:
            ax.plot(x, y, color='white', markeredgewidth=1,
                    markeredgecolor=gray, markersize=10, marker='d')

    figure.tight_layout()
    figure.savefig('clusters.png', transparent=False)
def frequency(similarity, algorithm, label_encoding, chart_id):
    """Runs clustering and gets the frequencies of labels per time series and
    labels per cluster.

    Args:
        similarity: The similarity measure used for scaling the data before
            clustering.
        algorithm: The clustering algorithm. Must be "k-means" or
            "k-means-constrained".
        label_encoding: The method used for encoding the labels.
        chart_id: The id of the file containing the data that clustering is
            run on.

    Returns:
        The loaded data unchanged when it has no "timeSeries" entry.
        Otherwise a json with a list of labels ("labels"), an array of labels
        per time series ("ts_labels") and an array of labels per cluster
        ("cluster_labels").

    Raises:
        ValueError: If ``algorithm`` is not one of the supported values.
    """
    data = load_data(chart_id)
    if "timeSeries" not in data:
        return data

    (time_series_data, label_dict,
     ts_to_labels, _, _) = clustering.time_series_array(data, None)
    time_series_data = clustering.preprocess(time_series_data, label_encoding,
                                             similarity, ts_to_labels,
                                             "k-means")

    if algorithm == "k-means":
        labels = clustering.kmeans(time_series_data, "off")
    elif algorithm == "k-means-constrained":
        labels = clustering.kmeans_kmedians(time_series_data, label_dict,
                                            ts_to_labels, algorithm, "off")
    else:
        # Without this branch ``labels`` stayed unbound and the next line
        # failed with an UnboundLocalError that hid the real cause.
        raise ValueError("Unsupported algorithm: %r" % (algorithm,))

    cluster_labels = clustering.cluster_to_labels(labels, ts_to_labels)
    ordered_labels, ordered_clusters, ordered_ts = clustering.sort_labels(
        label_dict, cluster_labels, ts_to_labels)
    return jsonify({
        "labels": ordered_labels,
        "ts_labels": ordered_ts.tolist(),
        "cluster_labels": ordered_clusters.tolist()
    })
def get_evaluation_scores(X, ground_truth_labels, clusters, filepath, ignored_indices=(), result_file=None, files=None):
    """Write silhouette and ARI scores for k-means and spectral clustering.

    Args:
        X: Input matrix; replaced by the matrix read from ``result_file``
            when that argument is given.
        ground_truth_labels: Reference labels used for both ARI scores and
            for the silhouette score.
        clusters: Forwarded to ``kmeans`` and ``spectral_clustering``.
        filepath: Writable file-like object that receives a header line and
            one comma-separated row of scores.
        ignored_indices: Unused here.
        result_file: Optional file to read a distance matrix from.
        files: Unused here.
    """
    if result_file is not None:
        X = read_distance_matrix_from_file(
            result_file, (), len(ground_truth_labels))

    _, kmeans_labels = kmeans(X, clusters)
    ari_k = metrics.adjusted_rand_score(ground_truth_labels, kmeans_labels)

    _, spectral_labels = spectral_clustering(X, clusters)
    ari_s = metrics.adjusted_rand_score(ground_truth_labels, spectral_labels)

    filepath.write("Silhouttee, K-mean ARI, Spectral-ARI\n")
    row = (silhouette_score(X, ground_truth_labels), ari_k, ari_s)
    filepath.write("%s,%s,%s\n" % row)
def train_kmeans_layer(X, in_shape, K, ksize, n_patches_per_image, prep_type=None, pad_h=0, pad_w=0, repeat=1, **kwargs):
    """Train a k-means layer on random patches, keeping the lowest-loss run.

    Args:
        X, in_shape, ksize, n_patches_per_image, pad_h, pad_w: Forwarded to
            ``get_random_patches`` to build the training patches.
        K: Number of centers passed to ``clust.kmeans``.
        prep_type: Optional preprocessor name; when given, the preprocessor
            is trained on the patches and applied to them.
        repeat: Number of independent k-means runs.
        **kwargs: Forwarded to ``clust.kmeans``; ``dist`` (default
            'euclidean') is also passed to the returned model.

    Returns:
        A ``KMeansModel`` built from the centers of the best run.
    """
    train_data = get_random_patches(X, in_shape, ksize, n_patches_per_image,
                                    pad_h=pad_h, pad_w=pad_w)

    prep = None
    if prep_type is not None:
        prep = pp.choose_preprocessor_by_name(prep_type)
        prep.train(train_data)
        train_data = prep.process(train_data)

    best_centers = None
    best_loss = None
    for attempt in xrange(repeat):
        print('*** repeat #%d ***' % (attempt + 1))
        gnp.free_reuse_cache()
        centers, _, loss = clust.kmeans(train_data, K, **kwargs)
        # Keep the run with the smallest loss seen so far.
        if best_loss is None or loss < best_loss:
            best_loss, best_centers = loss, centers

    print('>>> best loss: %.2f' % best_loss)
    return KMeansModel(best_centers, kwargs.get('dist', 'euclidean'),
                       in_shape.c, ksize, prep)
def recolor_left_half(rgb_lh_arr, number_of_colors, useElbowMethod=False, low=2, high=10, num_of_iters=1000):
    """Recolor an image region using the centroids of a k-means clustering.

    Args:
        rgb_lh_arr: Array indexed as (row, column, channel) with 3 channels.
        number_of_colors: Number of clusters when the elbow method is off.
        useElbowMethod: When true, plot the elbow data for k in low..high and
            ask the user for k on standard input.
        low: Smallest k evaluated by the elbow method.
        high: Largest k evaluated by the elbow method.
        num_of_iters: Iteration count passed to ``cl.elbow`` and ``cl.kmeans``.

    Returns:
        Tuple ``(centroids_arr, assigned_clusters_arr, recolored_lh_arr,
        number_of_clusters)``; the centroids and the recolored array are
        converted to ``uint8``.
    """
    height, width = rgb_lh_arr.shape[0], rgb_lh_arr.shape[1]
    # Flatten the pixel grid into a list of (r, g, b) data points.
    rgb_lh_arr_flat = rgb_lh_arr.reshape((height * width, 3))

    if not useElbowMethod:
        number_of_clusters = number_of_colors
    else:
        # Compute the cost for k = low .. high and let the user pick the elbow.
        elbow_data = cl.elbow(rgb_lh_arr_flat, low, high, num_of_iters)
        df = pd.DataFrame(elbow_data, columns=['k', 'cost'])
        cl.plot_elbow_data(df)
        number_of_clusters = int(
            input('Enter value of k at elbow point in the plot'))

    # Honour ``num_of_iters`` here as well. The call used to hard-code 1000
    # (the parameter's default), silently ignoring the caller's value.
    centroids_arr, assigned_clusters_arr_flat = cl.kmeans(
        rgb_lh_arr_flat, number_of_clusters, num_of_iters)

    # Back to the image's row/column layout.
    assigned_clusters_arr = assigned_clusters_arr_flat.reshape((height, width))

    recolored_lh_arr = np.zeros(rgb_lh_arr.shape)
    for index, centroid in enumerate(centroids_arr):
        # Boolean mask selects every pixel assigned to this centroid.
        recolored_lh_arr[assigned_clusters_arr == index] = centroid

    # Convert float values into 8-bit integers (0-255).
    recolored_lh_arr = np.asarray(recolored_lh_arr, dtype='uint8')
    centroids_arr = centroids_arr.astype(dtype='uint8')

    return centroids_arr, assigned_clusters_arr, recolored_lh_arr, number_of_clusters
def add_filter(filters_list, column_index, column):
    """Append ``(column_index, kmeans(column))`` to ``filters_list``.

    The list is modified in place and also returned.
    """
    # Imported locally, as in the original; the reason is not visible here.
    from clustering import kmeans

    clustered_column = kmeans(column)
    filters_list.append((column_index, clustered_column))
    return filters_list
from clustering import svdpca, kmeans
import os
import sys
import string
import numpy

# Smoke run of the clustering helpers. Both receive the data as its string
# representation and the raw return value is printed.
ss = [[1, 2], [4, 2], [300, 200]]
ll = svdpca(str(ss), 3, 2)
print(ll)

ss = [[300, 200], [1, 2], [4, 2]]
ll = kmeans(str(ss), 2, 2)
print(ll)

# exchange
# sss = [[1,2],[4,2],[3,1]]
# ll=svdpca(str(ss), 3,2);
# print ll;
# ll=svdpca(str(sss), 3,2);
# print ll;
def _select_centers(self, X):
    """Pick ``self.n_neurons`` centers for ``X`` with k-means.

    Returns:
        Tuple ``(centers, sigmas)``: the first and third values returned by
        ``kmeans``; the middle value is discarded.
    """
    # 'kmeanspp' and 100 are passed straight through to kmeans; presumably
    # the initialisation scheme and an iteration limit -- TODO confirm.
    centers, _unused, sigmas = kmeans(X, self.n_neurons, 'kmeanspp', 100)
    return centers, sigmas
def sensors_keywords_detection(task_detail):
    """Run one detection round of a keyword-based social-sensing task.

    Gathers the weibo counts, sentiment distribution, participating sensor
    users and sensitive-word hits for the period ending at ``ts``, derives a
    warning status and burst reason, extracts topic words when something was
    detected, and stores the results in Elasticsearch.

    Args:
        task_detail: Indexable task description. Positions 0-5 hold the task
            name, sensor uids, keywords, sensitive words, stop time and the
            previous warning status; position 7 holds the timestamp.
            NOTE(review): position 6 is never read here -- confirm against
            the caller.

    Returns:
        The string "1".
    """
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    keywords_list = task_detail[2]
    sensitive_words = task_detail[3]
    stop_time = task_detail[4]
    forward_warning_status = task_detail[5]
    ts = task_detail[7]

    forward_result = get_forward_numerical_info(task_name, ts, keywords_list)

    # 1. Aggregate all keyword-related original weibos posted by the sensor
    #    users during the preceding 12 hours
    forward_origin_weibo_list = query_mid_list(ts-time_interval, keywords_list, forward_time_range, 1, social_sensors)
    # 2. Aggregate the original weibos of the current period
    current_mid_list = query_mid_list(ts, keywords_list, time_interval, 1, social_sensors)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list = list(set(all_mid_list))
    print len(all_mid_list)
    # 3. Query how often the current and the preceding-12-hour original
    #    weibos were retweeted and commented on within the current period,
    #    aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval, keywords_list, 1, social_sensors)
    current_total_count = statistics_count['total_count'] # total number of weibos in the current period
    print "current all weibo: ", statistics_count
    current_origin_count = statistics_count['origin']
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # 4. Aggregate the distribution of positive, neutral, sad and angry
    #    sentiment within the current period
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-time_interval)
    # NOTE(review): both branches yield the same name when the two dates are
    # equal, so the index of the period's start date is always used.
    if datetime == datetime_1:
        index_name = flow_text_index_name_pre + datetime
    else:
        index_name = flow_text_index_name_pre + datetime_1
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval, keywords_list, 1)
        sentiment_count = search_results
        print "sentiment_count: ", sentiment_count
    negetive_count = sentiment_count['2'] + sentiment_count['3']

    # 5. Which social sensors take part in the event discussion
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                            {"range":{
                                "timestamp":{
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                            {"terms":{"uid": social_sensors}}
                        ],
                        "should":[
                            {"terms": {"root_mid": all_mid_list}},
                            {"terms": {"mid": all_mid_list}}
                        ]
                    }
                }
            }
        },
        "size": 10000
    }

    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    if datetime == datetime_1:
        index_name = flow_text_index_name_pre + datetime
    else:
        index_name = flow_text_index_name_pre + datetime_1

    # NOTE(review): unlike step 4, this search is not guarded by the
    # index-existence check.
    search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)['hits']['hits']

    attend_users = []
    if search_results:
        for item in search_results:
            attend_users.append(item['_source']['uid'])

    important_users = list(set(attend_users))
    print "important users", important_users

    # 6. Sensitive-word detection: if a sensitive word shows up in a sensor's
    #    weibo, a warning is raised ------ PS: sensitive words are a
    #    dangerous setting
    sensitive_origin_weibo_number = 0
    sensitive_retweeted_weibo_number = 0
    sensitive_comment_weibo_number = 0
    sensitive_total_weibo_number = 0

    if sensitive_words:
        query_sensitive_body = {
            "query":{
                "filtered":{
                    "filter":{
                        "bool":{
                            "must":[
                                {"range":{
                                    "timestamp":{
                                        "gte": ts - time_interval,
                                        "lt": ts
                                    }}
                                },
                                {"terms": {"keywords_string": sensitive_words}},
                                {"terms": {"uid": social_sensors}}
                            ]
                        }
                    }
                }
            },
            "aggs":{
                "all_list":{
                    "terms":{"field": "message_type"}
                }
            }
        }

        sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['aggregations']['all_list']["buckets"]
        if sensitive_results:
            # Bucket keys are message types: 1 is counted as original,
            # 2 as comment, 3 as retweet.
            for item in sensitive_results:
                if int(item["key"]) == 1:
                    sensitive_origin_weibo_number = item['doc_count']
                elif int(item["key"]) == 2:
                    sensitive_comment_weibo_number = item['doc_count']
                elif int(item["key"]) == 3:
                    sensitive_retweeted_weibo_number = item["doc_count"]
                else:
                    pass

            sensitive_total_weibo_number = sensitive_origin_weibo_number + sensitive_comment_weibo_number + sensitive_retweeted_weibo_number

    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal # "0"

    if sensitive_total_weibo_number: # abnormal number of sensitive weibos
        print "======================"
        if forward_warning_status == signal_brust: # an event is already in progress, switch to event tracking
            warning_status = signal_track
        else:
            warning_status = signal_brust
        burst_reason = signal_sensitive_variation

    if forward_result[0]:
        # Use the moving average to judge whether an event has occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if current_total_count > mean_count+1.96*std_count: # anomaly detected
            print "====================================================="
            if forward_warning_status == signal_brust: # an event is already in progress, switch to event tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition # count anomaly
        if negetive_count > mean_sentiment+1.96*std_sentiment:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition # negative-sentiment anomaly; "12" means both are abnormal
            if forward_warning_status == signal_brust: # an event is already in progress, switch to event tracking
                warning_status = signal_track

    if int(stop_time) <= ts: # check whether the task has already finished
        finish = finish_signal

    tmp_burst_reason = burst_reason
    topic_list = []

    # 7. The sensed events, all_mid_list
    if burst_reason: # something happened
        text_list = []
        mid_set = set()
        if signal_sensitive_variation in burst_reason:
            query_sensitive_body = {
                "query":{
                    "filtered":{
                        "filter":{
                            "bool":{
                                "must":[
                                    {"range":{
                                        "timestamp":{
                                            "gte": ts - time_interval,
                                            "lt": ts
                                        }}
                                    },
                                    {"terms": {"keywords_string": sensitive_words}}
                                ]
                            }
                        }
                    }
                },
                "size": 10000
            }
            if social_sensors:
                query_sensitive_body['query']['filtered']['filter']['bool']['must'].append({"terms":{"uid": social_sensors}})

            sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['hits']["hits"]
            if sensitive_results:
                for item in sensitive_results:
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text']
                    temp_dict = dict()
                    temp_dict["mid"] = iter_mid
                    temp_dict["text"] = iter_text
                    if iter_mid not in mid_set:
                        text_list.append(temp_dict) # cleaned-up text: mid, text
                        mid_set.add(iter_mid)
            # NOTE(review): str.replace returns a new string and the result
            # is discarded here, so burst_reason is NOT changed by this line.
            # Probably meant ``burst_reason = burst_reason.replace(...)``.
            burst_reason.replace(signal_sensitive_variation, "")

        if burst_reason and all_mid_list:
            sensing_text = es_text.mget(index=index_name, doc_type=flow_text_index_type, body={"ids": all_mid_list}, fields=["mid", "text"])["docs"]
            if sensing_text:
                for item in sensing_text:
                    if item['found']:
                        iter_mid = item["fields"]["mid"][0]
                        iter_text = item["fields"]["text"][0]
                        temp_dict = dict()
                        temp_dict["mid"] = iter_mid
                        temp_dict["text"] = iter_text
                        if iter_mid not in mid_set:
                            text_list.append(temp_dict)
                            mid_set.add(iter_mid)

        if len(text_list) == 1:
            top_word = freq_word(text_list[0])
            topic_list = top_word.keys()
        elif len(text_list) == 0:
            topic_list = []
            tmp_burst_reason = "" # no related weibos; reset
            print "***********************************"
        else:
            feature_words, input_word_dict = tfidf(text_list) # generate the feature words and the input data
            word_label, evaluation_results = kmeans(feature_words, text_list) # clustering
            inputs = text_classify(text_list, word_label, feature_words)
            clustering_topic = cluster_evaluation(inputs)
            # Keep the five highest-valued clusters.
            sorted_dict = sorted(clustering_topic.items(), key=lambda x:x[1], reverse=True)[0:5]
            topic_list = []
            if sorted_dict:
                for item in sorted_dict:
                    topic_list.append(word_label[item[0]])
        print "topic_list:", topic_list

    if not topic_list:
        tmp_burst_reason = signal_nothing_variation
        warning_status = signal_nothing

    results = dict()
    results['sensitive_origin_weibo_number'] = sensitive_origin_weibo_number
    results['sensitive_retweeted_weibo_number'] = sensitive_retweeted_weibo_number
    results['sensitive_comment_weibo_number'] = sensitive_comment_weibo_number
    results['sensitive_weibo_total_number'] = sensitive_total_weibo_number
    results['origin_weibo_number'] = current_origin_count
    results['retweeted_weibo_number'] = current_retweeted_count
    results['comment_weibo_number'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(important_users)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    if tmp_burst_reason:
        results["clustering_topic"] = json.dumps(topic_list)
    # Store the current period's information in ES
    doctype = task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # Update the manage-social-sensing record in ES
    temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=task_name)['_source']
    temporal_result['warning_status'] = warning_status
    temporal_result['burst_reason'] = tmp_burst_reason
    temporal_result['finish'] = finish
    history_status = json.loads(temporal_result['history_status'])
    history_status.append([ts, ' '.join(keywords_list), warning_status])
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=task_name, body=temporal_result)

    return "1"
def rbf_train(x, y, gamma=1, x_validation=None, y_validation=None, num_hidden_nodes=10, output="regression", num_epochs=100, alpha=1, mini_batch_size=1, compute_loss=True):
    """Train an RBF network whose hidden centers come from k-means.

    The hidden weights are the centroids found by ``clustering.kmeans`` and
    stay fixed; only the output weights are updated, by mini-batch steps on
    the output error.

    Args:
        x: Training inputs, indexed by sample along the first axis.
        y: Training targets; a 1-D array is treated as a single column.
        gamma: Passed to the RBF activation function.
        x_validation: Optional validation inputs.
        y_validation: Optional validation targets; 1-D is made a column.
        num_hidden_nodes: Number of k-means centroids / hidden units.
        output: Output type forwarded to ``select_output_type``.
        num_epochs: Number of passes over the training data.
        alpha: Step size of the output-weight update.
        mini_batch_size: Target number of samples per update step.
        compute_loss: When true, record the training loss (and, if
            validation inputs are given, the validation loss) every epoch.

    Returns:
        dict with keys 'w_hidden', 'w_output', 'gamma', 'loss_history',
        'validation_loss_history', 'output', 'activation_function' and
        'out_activation_function'.
    """
    # Work on copies so the caller's target arrays are never reshaped.
    x_matrix = x
    y = y.copy()
    if len(y.shape) == 1:
        y = y[:, None]
    if y_validation is not None:
        y_validation = y_validation.copy()
        if len(y_validation.shape) == 1:
            y_validation = y_validation[:, None]

    # Hidden and output activation functions.
    activation_function = activation_functions.rbf
    out_activation_function, out_grad_activation_function, loss = select_output_type(
        output, y)

    # Initial weights: k-means centroids for the hidden layer, random normal
    # values (including a bias column) for the output layer.
    w_hidden = clustering.kmeans(
        x_matrix, num_hidden_nodes, max_iter=100, num_rep=10)['centroids']
    w_output = np.random.normal(size=(y.shape[1], num_hidden_nodes + 1))

    def hidden_layer(samples):
        # Row of ones (bias) stacked on top of the RBF activations.
        bias_row = np.ones((1, samples.shape[0]))
        return np.vstack(
            (bias_row, activation_function(w_hidden, samples, gamma)))

    loss_history = []
    validation_loss_history = []
    n_samples = y.shape[0]

    for epoch in range(num_epochs):
        batches = np.array_split(
            np.random.permutation(n_samples),
            np.ceil(n_samples / mini_batch_size))
        for batch in batches:
            xi = x_matrix[batch]
            yi = y[batch]

            # Forward pass
            layer_z = hidden_layer(xi)
            model_output = out_activation_function(w_output @ layer_z)

            # Error and output-weight update, averaged over the batch
            error = yi.T - model_output
            w_output += alpha * error @ layer_z.T / xi.shape[0]

        if compute_loss:
            layer_z = hidden_layer(x_matrix)
            model_output = out_activation_function(w_output @ layer_z)
            loss_history.append(loss(y, model_output.T))
            if x_validation is not None:
                layer_z = hidden_layer(x_validation)
                model_output_validation = out_activation_function(
                    w_output @ layer_z)
                validation_loss_history.append(
                    loss(y_validation, model_output_validation.T))

    return {
        'w_hidden': w_hidden,
        'w_output': w_output,
        'gamma': gamma,
        'loss_history': loss_history,
        'validation_loss_history': validation_loss_history,
        'output': output,
        'activation_function': activation_function,
        'out_activation_function': out_activation_function
    }
def cluster(self, X):
    """Run k-means on ``X`` with ``self.n_neurons`` clusters.

    Returns:
        Tuple ``(centers, data_centers, sigmas)`` exactly as produced by
        ``kmeans``.
    """
    # 'kmeanspp' and 100 are passed straight through to kmeans; presumably
    # the initialisation scheme and an iteration limit -- TODO confirm.
    centers, data_centers, sigmas = kmeans(
        X, self.n_neurons, 'kmeanspp', 100)
    return centers, data_centers, sigmas
ffeatures, fids = get_sents_training( model_name ) # Topic modelling decomposer = [lsa, lda, nmf, None][1] n_topics = 15 x_topic = topic_model_run(decomposer, n_topics, ffeatures, fids) # Clustering from clustering import birch from clustering import kmeans from clustering import ward_linkage x, _ = count_vector( x_topic, ngram=(1,2), max_df=0.99, min_df=0.1 ) n_clusters = 10 plot_dimension = 2 x, _ = pca(x, plot_dimension) km, (centroids, c, k) = kmeans(x, n_clusters) plot(x, centroids, c, k, "K-Means", plot_dimension) if fids: print_clusters(c, fids) print_measure("K-Means", "silhouette", silhouette(x, c))
def silhouette(k, word_vectors):
    """Return the silhouette score of a k-means clustering of the word vectors.

    Args:
        k: Number of clusters passed to ``kmeans``.
        word_vectors: Word-vector model; it is handed to ``kmeans`` and its
            ``syn0norm`` matrix is scored.

    Returns:
        The value of ``silhouette_score`` for the fitted labels.
    """
    # The result used to be assigned to a local called ``kmeans``. That made
    # the name local to the whole function, so the call on the same line
    # raised UnboundLocalError before the module-level function was reached.
    fitted_model = kmeans(word_vectors, k)[1]
    return silhouette_score(word_vectors.syn0norm, fitted_model.labels_)
embedding_recort, embedding_dict = embedding(upper_corr, full_shape, mask, n_embedding) np.save(embed_file%(smooth, masktype, hemi, str(n_embedding)),embedding_recort) pkl_out = open(embed_dict_file%(smooth, masktype, hemi, str(n_embedding)), 'wb') pickle.dump(embedding_dict, pkl_out) pkl_out.close() '''clustering''' if calc_cluster: if not calc_embed: embedding_recort = np.load(embed_file%(smooth, masktype, hemi, str(n_embedding))) mask = np.load(mask_file%(hemi, masktype)) for nk in n_kmeans: print 'clustering %s'%str(nk) kmeans_recort = kmeans(embedding_recort, nk, mask) np.save(kmeans_file%(smooth, masktype, hemi, str(n_embedding), str(nk)), kmeans_recort) if calc_subcluster: print 'subclustering %s'%str(nk) v, f, d = read_vtk(mesh_file%hemi) subclust_arr=subcluster(kmeans_recort, f) np.save(subclust_file%(smooth, masktype, hemi, str(n_embedding), str(nk)), subclust_arr) '''subclustering''' if not calc_cluster: if calc_subcluster:
def specific_keywords_burst_dection(task_detail): task_name = task_detail[0] social_sensors = task_detail[1] keywords_list = task_detail[2] sensitive_words = task_detail[3] stop_time = task_detail[4] forward_warning_status = task_detail[5] ts = int(task_detail[7]) forward_result = get_forward_numerical_info(task_name, ts, keywords_list) # 之前时间阶段内的原创微博list forward_origin_weibo_list = query_mid_list(ts-time_interval, keywords_list, forward_time_range) # 当前阶段内原创微博list current_mid_list = query_mid_list(ts, keywords_list, time_interval) all_mid_list = [] all_mid_list.extend(current_mid_list) all_mid_list.extend(forward_origin_weibo_list) print "all mid list: ", len(all_mid_list) # 查询当前的原创微博和之前12个小时的原创微博在当前时间内的转发和评论数, 聚合按照message_type statistics_count = query_related_weibo(ts, all_mid_list, time_interval, keywords_list) current_total_count = statistics_count['total_count'] # 当前阶段内所有微博总数 print "current all weibo: ", statistics_count current_origin_count = statistics_count['origin'] current_retweeted_count = statistics_count['retweeted'] current_comment_count = statistics_count['comment'] # 针对敏感微博的监测,给定传感器和敏感词的前提下,只要传感器的微博里提及到敏感词即会认为是预警 # 聚合当前时间内积极、中性、悲伤、愤怒情绪分布 # sentiment_dict = {"0": "neutral", "1":"positive", "2":"sad", "3": "anger"} sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0} datetime = ts2datetime(ts) datetime_1 = ts2datetime(ts-time_interval) if datetime != datetime_1: index_name = flow_text_index_name_pre + datetime_1 else: index_name = flow_text_index_name_pre + datetime exist_es = es_text.indices.exists(index_name) if exist_es: search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval, keywords_list) sentiment_count = search_results print "sentiment_count: ", sentiment_count negetive_count = sentiment_count['2'] + sentiment_count['3'] # 聚合当前时间内重要的人 important_uid_list = [] if exist_es: #search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=aggregation_sensor_keywords(ts-time_interval, ts, [], 
"root_uid", size=IMPORTANT_USER_NUMBER))['aggregations']['all_keywords']['buckets'] search_results = query_hot_weibo(ts, all_mid_list, time_interval, keywords_list, aggregation_field="root_uid", size=100) important_uid_list = search_results.keys() if datetime != datetime_1: index_name_1 = flow_text_index_name_pre + datetime_1 if es_text.indices.exists(index_name_1): #search_results_1 = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=aggregation_sensor_keywords(ts-time_interval, ts, [], "root_uid", size=IMPORTANT_USER_NUMBER))['aggregations']['all_keywords']['buckets'] search_results_1 = query_hot_weibo(ts, all_mid_list, time_interval, keywords_list, aggregation_field="root_uid", size=100) if search_results_1: for item in search_results_1: important_uid_list.append(item['key']) # 根据获得uid_list,从人物库中匹配重要人物 if important_uid_list: important_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list})['docs'] else: important_results = {} filter_important_list = [] # uid_list if important_results: for item in important_results: if item['found']: if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD: filter_important_list.append(item['_id']) print filter_important_list # 6. 
敏感词识别,如果传感器的微博中出现这么一个敏感词,那么就会预警------PS.敏感词是一个 sensitive_origin_weibo_number = 0 sensitive_retweeted_weibo_number = 0 sensitive_comment_weibo_number = 0 sensitive_total_weibo_number = 0 if sensitive_words: query_sensitive_body = { "query":{ "filtered":{ "filter":{ "bool":{ "must":[ {"range":{ "timestamp":{ "gte": ts - time_interval, "lt": ts }} }, {"terms": {"keywords_string": sensitive_words}} ] } } } }, "aggs":{ "all_list":{ "terms":{"field": "message_type"} } } } if social_sensors: query_sensitive_body['query']['filtered']['filter']['bool']['must'].append({"terms":{"uid": social_sensors}}) sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['aggregations']['all_list']["buckets"] if sensitive_results: for item in sensitive_results: if int(item["key"]) == 1: sensitive_origin_weibo_number = item['doc_count'] elif int(item["key"]) == 2: sensitive_comment_weibo_number = item['doc_count'] elif int(item["key"]) == 3: sensitive_retweeted_weibo_number = item["doc_count"] else: pass sensitive_total_weibo_number = sensitive_origin_weibo_number + sensitive_comment_weibo_number + sensitive_retweeted_weibo_number burst_reason = signal_nothing_variation warning_status = signal_nothing finish = unfinish_signal # "0" process_status = "1" if sensitive_total_weibo_number > WARNING_SENSITIVE_COUNT: # 敏感微博的数量异常 print "======================" if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track else: warning_status = signal_brust burst_reason = signal_sensitive_variation if forward_result[0]: # 根据移动平均判断是否有时间发生 mean_count = forward_result[1] std_count = forward_result[2] mean_sentiment = forward_result[3] std_sentiment = forward_result[4] if current_total_count > mean_count+1.96*std_count: # 异常点发生 print "=====================================================" if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track else: warning_status = signal_brust 
burst_reason += signal_count_varition # 数量异常 if negetive_count > mean_sentiment+1.96*std_sentiment: warning_status = signal_brust burst_reason += signal_sentiment_varition # 负面情感异常, "12"表示两者均异常 if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track if int(stop_time) <= ts: # 检查任务是否已经完成 finish = finish_signal process_status = "0" # 7. 感知到的事, all_mid_list tmp_burst_reason = burst_reason topic_list = [] # 判断是否有敏感微博出现:有,则聚合敏感微博,replace;没有,聚合普通微博 if burst_reason: # 有事情发生 text_list = [] mid_set = set() if signal_sensitive_variation in burst_reason: query_sensitive_body = { "query":{ "filtered":{ "filter":{ "bool":{ "must":[ {"range":{ "timestamp":{ "gte": ts - time_interval, "lt": ts }} }, {"terms": {"keywords_string": sensitive_words}} ] } } } }, "size": 10000 } if social_sensors: query_sensitive_body['query']['filtered']['filter']['bool']['must'].append({"terms":{"uid": social_sensors}}) sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['hits']['hits'] if sensitive_results: for item in sensitive_results: iter_mid = item['_source']['mid'] iter_text = item['_source']['text'] temp_dict = dict() temp_dict["mid"] = iter_mid temp_dict["text"] = iter_text if iter_mid not in mid_set: text_list.append(temp_dict) # 整理后的文本,mid,text mid_set.add(iter_mid) burst_reason.replace(signal_sensitive_variation, "") current_origin_mid_list = query_mid_list(ts, keywords_list, time_interval, 1) print "current_origin_mid_list:", len(current_origin_mid_list) if burst_reason and current_mid_list: origin_sensing_text = es_text.mget(index=index_name, doc_type=flow_text_index_type, body={"ids": current_origin_mid_list}, fields=["mid", "text"])["docs"] if origin_sensing_text: for item in origin_sensing_text: if item["found"]: iter_mid = item["fields"]["mid"][0] iter_text = item["fields"]["text"][0] temp_dict = dict() temp_dict["mid"] = iter_mid temp_dict["text"] = iter_text if iter_mid not in mid_set: 
text_list.append(temp_dict) # 整理后的文本,mid,text mid_set.add(iter_mid) if len(text_list) == 1: top_word = freq_word(text_list[0]) topic_list = [top_word.keys()] elif len(text_list) == 0: topic_list = [] tmp_burst_reason = "" #没有相关微博,归零 print "***********************************" else: feature_words, input_word_dict = tfidf(text_list) #生成特征词和输入数据 word_label, evaluation_results = kmeans(feature_words, text_list) #聚类 inputs = text_classify(text_list, word_label, feature_words) clustering_topic = cluster_evaluation(inputs) print "========================================================================================" print "=========================================================================================" sorted_dict = sorted(clustering_topic.items(), key=lambda x:x[1], reverse=True) topic_list = [] if sorted_dict: for item in sorted_dict: topic_list.append(word_label[item[0]]) print "topic_list, ", topic_list if not topic_list: warning_status = signal_nothing tmp_burst_reason = signal_nothing_variation results = dict() results['origin_weibo_number'] = current_origin_count results['retweeted_weibo_number'] = current_retweeted_count results['comment_weibo_number'] = current_comment_count results['weibo_total_number'] = current_total_count results['sensitive_origin_weibo_number'] = sensitive_origin_weibo_number results['sensitive_retweeted_weibo_number'] = sensitive_retweeted_weibo_number results['sensitive_comment_weibo_number'] = sensitive_comment_weibo_number results['sensitive_weibo_total_number'] = sensitive_total_weibo_number results['sentiment_distribution'] = json.dumps(sentiment_count) results['important_users'] = json.dumps(filter_important_list) results['burst_reason'] = tmp_burst_reason results['timestamp'] = ts if tmp_burst_reason: results['clustering_topic'] = json.dumps(topic_list) # es存储当前时段的信息 doctype = task_name es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results) # 更新manage social sensing的es信息 temporal_result = 
es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=task_name)['_source'] temporal_result['warning_status'] = warning_status temporal_result['burst_reason'] = tmp_burst_reason temporal_result['finish'] = finish temporal_result['processing_status'] = process_status history_status = json.loads(temporal_result['history_status']) history_status.append([ts, ' '.join(keywords_list), warning_status]) temporal_result['history_status'] = json.dumps(history_status) es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=task_name, body=temporal_result) return "1"
#Module with functions to manipulate files import manipulate_files import fraud_simulation import prediction import clustering # Reads .txt files data=manipulate_files.read_files() print('Files read') # Sorts the files in order to be space relevant data=manipulate_files.sort_files(data) print('Files sorted') # Finds consumers with full data for a year # Output: index MeterID consumers vs halfhours of a year timeseries=manipulate_files.find_full_values(data) print('Dataset formated to annual data') # Applies types of attack # Outputs X and Y with MeterID index X, Y=fraud_simulation.typical_attack(timeseries) print('Fraud Simulation done') # Computes Performance prediction.liblinear(X,Y) # Clustering with PCA and K-Means clustering.kmeans(X)
def step2_cal(): """第二步计算,判断其他类是否需要分裂,若需要,则对其他类进行文本聚类,并做聚类评价 """ # 聚类评价时选取TOPK_FREQ_WORD的高频词 TOPK_FREQ_WORD = 50 # 聚类评价时最小簇的大小 LEAST_SIZE = 8 # 判断其他类是否需要分裂 ifsplit = event.check_ifsplit(initializing) print '[%s] ' % ts2datetime( int(time.time()) ), 'event ', eventid, ' split ', ifsplit, ' %s start step2' % ts2datetime( timestamp) if ifsplit: inputs, kmeans_cluster_num, reserve_num = event.getOtherSubEventInfos( initializing) print eventid, ' after classify before split: ', len( inputs), kmeans_cluster_num, reserve_num if len(inputs) > 2: items = [] for r in inputs: r["title"] = r["title"].encode("utf-8") r["content"] = r["content168"].encode("utf-8") items.append(r) # kmeans聚类 kmeans_results = kmeans(items, k=kmeans_cluster_num) # 聚类评价 if initializing or now_hour == 0: min_tfidf = event.get_min_tfidf() final_cluster_results, tfidf_dict = cluster_evaluation( kmeans_results, top_num=reserve_num, topk_freq=TOPK_FREQ_WORD, least_size=LEAST_SIZE, min_tfidf=min_tfidf) else: # 每小时聚类时,不用和已有簇的最小tfidf作比 final_cluster_results, tfidf_dict = cluster_evaluation( kmeans_results, top_num=reserve_num, topk_freq=TOPK_FREQ_WORD, least_size=LEAST_SIZE) # 更新新闻簇标签,更新子事件表 for label, items in final_cluster_results.iteritems(): if label == "other": label = event.getOtherSubEventID() event.save_subevent(label, timestamp) if label != event.getOtherSubEventID(): # 更新每类的tfidf event.update_subevent_tfidf(label, tfidf_dict[label]) for r in items: news = News(r["_id"], event.id) news.update_news_subeventid(label) else: print 'inputs less than 2, kmeans aborted' print '[%s] ' % ts2datetime(int(time.time( ))), 'event ', eventid, ' %s end step2' % ts2datetime(timestamp)
if __name__ == "__main__":
    # Index range used to collect the features.
    s_index = "20150701"
    e_index = "20150702"
    features_as_list, ids = indexed_features(s_index, e_index)

    # Pool every feature list; each one contributes one topic per started
    # block of 25 features.
    n_topics = 0
    all_features = []
    for index, feature in features_as_list:
        all_features.extend(feature)
        n_topics += int(len(feature) / 25) + 1

    # `index` is whatever the last loop iteration left behind.
    dictionary, corpus = create_corpus(index, all_features)
    x, lda_model, lda_corpus = single_lda("20150703", corpus, dictionary,
                                          n_topics)
    x = pca(x, 2)
    km, (centroids, c, k) = kmeans(x, n_topics)
    cluster_plot_2d(x, centroids, c, k)

    # Group document positions by their assigned cluster label.
    clusters = {}
    for position, cluster_label in enumerate(c):
        clusters.setdefault(cluster_label, []).append(position)

    # Show up to five members of every cluster, followed by a rule.
    for members in clusters.values():
        for member in members[:5]:
            print(member)
        print("-" * 40)
if __name__ == "__main__":
    # Index range used to collect the features.
    s_index = "20150701"
    e_index = "20150702"
    features_as_list, ids = indexed_features(s_index, e_index)

    # Pool every feature list; each one contributes one topic per started
    # block of 25 features.
    n_topics = 0
    all_features = []
    for index, feature in features_as_list:
        all_features.extend(feature)
        n_topics += int(len(feature) / 25) + 1

    # `index` is whatever the last loop iteration left behind.
    dictionary, corpus = create_corpus(index, all_features)
    x, lda_model, lda_corpus = single_lda("20150703", corpus, dictionary,
                                          n_topics)
    x = pca(x, 2)
    km, (centroids, c, k) = kmeans(x, n_topics)
    cluster_plot_2d(x, centroids, c, k)

    # Group document positions by their assigned cluster label.
    clusters = {}
    for position, cluster_label in enumerate(c):
        clusters.setdefault(cluster_label, []).append(position)

    # Show up to five members of every cluster, followed by a rule.
    for members in clusters.values():
        for member in members[:5]:
            print(member)
        print("-" * 40)
# Apply the requested matplotlib style, then load the data set.
plt.style.use(args.style)
df = data.read_data(args.data_filepath)

# Optional: plot the data exactly as loaded.
if(args.plot_raw_data):
    plotting.plot_all_data(df, "Raw Data", "Raw", os.path.join(args.output_folderpath, "raw"), data.column_names)

# Optional: plot the data after data.clean_data().
if(args.plot_standardized_data):
    clean_df = data.clean_data(df)
    plotting.plot_all_data(clean_df, "Standardized Data", "Clean", os.path.join(args.output_folderpath, "clean"), data.column_names)

# Optional: project onto `num_dimensions` components and plot the result.
if(args.perform_pca):
    num_dimensions = args.num_dimensions
    projected_df = pca.perform_pca(df, num_dimensions)
    output_path = os.path.join(args.output_folderpath, "PCA")
    # Create the output folder on first use.
    if(not(os.path.exists(output_path))):
        os.makedirs(output_path)
    print "Saving Graphs"
    plotting.plot_all_data(projected_df, "PCA {0}-D".format(num_dimensions), "PCA", output_path, projected_df.columns)

# Optional: k-means, with output written under <output_folderpath>/KMeans.
if(args.perform_kmeans):
    output_path = os.path.join(args.output_folderpath, "KMeans")
    if (not (os.path.exists(output_path))):
        os.makedirs(output_path)
    # NOTE(review): 6 and 7 presumably select the same two columns that
    # df.columns[6:8] names; the meaning of the 2 is not visible here - confirm
    # against clustering.kmeans.
    clustering.kmeans(df, 6, 7, 2, output_path, df.columns[6:8])
if __name__ == "__main__": logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) # Data model_name = "20150701" ffeatures, fids = get_sents_training(model_name) # Topic modelling decomposer = [lsa, lda, nmf, None][1] n_topics = 15 x_topic = topic_model_run(decomposer, n_topics, ffeatures, fids) # Clustering from clustering import birch from clustering import kmeans from clustering import ward_linkage x, _ = count_vector(x_topic, ngram=(1, 2), max_df=0.99, min_df=0.1) n_clusters = 10 plot_dimension = 2 x, _ = pca(x, plot_dimension) km, (centroids, c, k) = kmeans(x, n_clusters) plot(x, centroids, c, k, "K-Means", plot_dimension) if fids: print_clusters(c, fids) print_measure("K-Means", "silhouette", silhouette(x, c))
def social_sensing(task_detail):
    """Run one sensing pass for a sensor-based social sensing task.

    ``task_detail`` is read positionally: [0] task name, [1] social sensors,
    [2] stop time, [3] previous warning status, [4] creator, [5] timestamp
    (converted with ``int``).

    Collects weibo counts, sentiment counts and important users for the
    current window, derives ``warning_status`` and ``burst_reason``, clusters
    related texts into topics, indexes the per-window result into
    ``index_sensing_task`` and updates the task record in
    ``index_manage_social_task``. Returns the string "1".
    """
    # task_detail layout: task name, sensors, stop time, previous status,
    # creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    forward_warning_status = task_detail[3]
    create_by = task_detail[4]
    ts = int(task_detail[5])

    # PART 1
    forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # Original / retweeted weibo mids from the previous time window
    forward_origin_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range, 3)
    # Original weibo mids in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid / root-mid of the retweeted weibo
    print "all mid list: ", len(all_mid_list)
    print "all_origin_list", all_origin_list
    print "all_retweeted_list", all_retweeted_list

    # Retweet and comment counts for these weibo within the current window,
    # aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # original weibo details
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # retweeted weibo details
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']
    # total weibo count in the current window
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # PART 2
    # Sentiment distribution in the current window
    # sentiment_dict = {"0": "neutral", "1":"positive", "2":"sad", "3": "anger"}
    # The zero-filled default is replaced unconditionally by the query result.
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval)
    sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    # NOTE(review): keys "4".."6" are not in the default dict above, so the
    # query result must provide them - confirm.
    negetive_key = ["2", "3", "4", "5", "6"]
    negetive_count = 0
    for key in negetive_key:
        negetive_count += sentiment_count[key]

    # Important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results.keys()
    # Match the collected uids against the user-portrait index
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name,doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
    else:
        important_results = {}
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])
    print filter_important_list

    # Decide whether something was sensed
    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal  # "0"
    process_status = "1"

    if forward_result[0]:
        # Use the moving average to decide whether an event occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        # `and` binds tighter than `or`: (mean large enough AND above the
        # 1.96-sigma band) OR above the absolute per-sensor threshold.
        if mean_count >= MEAN_COUNT and current_total_count > mean_count+1.96*std_count or current_total_count >= len(social_sensors)*0.2*AVERAGE_COUNT:  # anomaly detected
            print "====================================================="
            if forward_warning_status == signal_brust:  # an event was already flagged, switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition  # count anomaly
        if negetive_count > mean_sentiment+1.96*std_sentiment and mean_sentiment >= MEAN_COUNT or negetive_count >= len(social_sensors)*0.2*AVERAGE_COUNT:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition  # negative-sentiment anomaly; "12" means both are anomalous
            if forward_warning_status == signal_brust:  # an event was already flagged, switch to tracking
                warning_status = signal_track

    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # Sensed events, all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []
    # Starts when an event has occurred
    if warning_status:
        index_list = []
        important_words = []
        # Search today's and yesterday's flow-text indices, when they exist.
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts-DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {
                "query":{
                    "filtered":{
                        "filter":{
                            "terms":{"mid": all_mid_list}
                        }
                    }
                },
                "size": 2000
            }
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
            text_list = []
            if search_results:
                for item in search_results:
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text']
                    temp_dict = dict()
                    temp_dict["mid"] = iter_mid
                    temp_dict["text"] = iter_text
                    text_list.append(temp_dict)
            for item in text_list:
                print item['text']
            if len(text_list) == 1:
                top_word = freq_word(text_list[0])
                topic_list = [top_word.keys()]
            elif len(text_list) == 0:
                topic_list = []
                tmp_burst_reason = ""  # no related weibo, reset
                print "***********************************"
            else:
                feature_words, input_word_dict = tfidf(text_list)  # build feature words and input data
                word_label, evaluation_results = kmeans(feature_words, text_list)  # clustering
                inputs = text_classify(text_list, word_label, feature_words)
                clustering_topic = cluster_evaluation(inputs)
                print "==============================================================="
                print "==============================================================="
                # Highest-scoring clusters first.
                sorted_dict = sorted(clustering_topic.items(), key=lambda x:x[1], reverse=True)
                topic_list = []
                if sorted_dict:
                    for item in sorted_dict:
                        topic_list.append(word_label[item[0]])
            print "topic_list, ", topic_list

    #if not topic_list:
    #    warning_status = signal_nothing
    #    tmp_burst_reason = signal_nothing_variation

    results = dict()
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(filter_important_list)
    results['unfilter_users'] = json.dumps(important_uid_list)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    if tmp_burst_reason:
        results['clustering_topic'] = json.dumps(topic_list)

    # Store this window's information in ES
    doctype = create_by + '-' + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # Update the manage-social-sensing record in ES
    temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source']
    temporal_result['warning_status'] = warning_status
    temporal_result['burst_reason'] = tmp_burst_reason
    temporal_result['finish'] = finish
    temporal_result['processing_status'] = process_status
    history_status = json.loads(temporal_result['history_status'])
    history_status.append([ts, task_name, warning_status])
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)

    return "1"
def cluster_wine(k):
    """Cluster the data loaded from 'redwine.csv' into ``k`` groups and plot it.

    Loads the table with ``load_wine_data``, obtains labels from
    ``clus.kmeans(k, table)`` and hands table, labels and ``k`` to
    ``plot_scatter``. Returns nothing.
    """
    wine_table = load_wine_data('redwine.csv')
    assignments = clus.kmeans(k, wine_table)
    plot_scatter(wine_table, assignments, k)
def sensors_keywords_detection(task_detail): task_name = task_detail[0] social_sensors = task_detail[1] keywords_list = task_detail[2] sensitive_words = task_detail[3] stop_time = task_detail[4] forward_warning_status = task_detail[5] ts = task_detail[7] forward_result = get_forward_numerical_info(task_name, ts, keywords_list) # 1. 聚合前12个小时内传感人物发布的所有与关键词相关的原创微博 forward_origin_weibo_list = query_mid_list(ts - time_interval, keywords_list, forward_time_range, 1, social_sensors) # 2. 聚合当前阶段内的原创微博 current_mid_list = query_mid_list(ts, keywords_list, time_interval, 1, social_sensors) all_mid_list = [] all_mid_list.extend(current_mid_list) all_mid_list.extend(forward_origin_weibo_list) all_mid_list = list(set(all_mid_list)) print len(all_mid_list) # 3. 查询当前的原创微博和之前12个小时的原创微博在当前时间内的转发和评论数, 聚合按照message_type statistics_count = query_related_weibo(ts, all_mid_list, time_interval, keywords_list, 1, social_sensors) current_total_count = statistics_count['total_count'] # 当前阶段内所有微博总数 print "current all weibo: ", statistics_count current_origin_count = statistics_count['origin'] current_retweeted_count = statistics_count['retweeted'] current_comment_count = statistics_count['comment'] # 4. 聚合当前时间内积极、中性、悲伤、愤怒情绪分布 sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0} datetime = ts2datetime(ts) datetime_1 = ts2datetime(ts - time_interval) if datetime == datetime_1: index_name = flow_text_index_name_pre + datetime else: index_name = flow_text_index_name_pre + datetime_1 exist_es = es_text.indices.exists(index_name) if exist_es: search_results = aggregation_sentiment_related_weibo( ts, all_mid_list, time_interval, keywords_list, 1) sentiment_count = search_results print "sentiment_count: ", sentiment_count negetive_count = sentiment_count['2'] + sentiment_count['3'] # 5. 
那些社会传感器参与事件讨论 query_body = { "query": { "filtered": { "filter": { "bool": { "must": [{ "range": { "timestamp": { "gte": ts - time_interval, "lt": ts } } }, { "terms": { "uid": social_sensors } }], "should": [{ "terms": { "root_mid": all_mid_list } }, { "terms": { "mid": all_mid_list } }] } } } }, "size": 10000 } datetime = ts2datetime(ts) datetime_1 = ts2datetime(ts - time_interval) if datetime == datetime_1: index_name = flow_text_index_name_pre + datetime else: index_name = flow_text_index_name_pre + datetime_1 search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)['hits']['hits'] attend_users = [] if search_results: for item in search_results: attend_users.append(item['_source']['uid']) important_users = list(set(attend_users)) print "important users", important_users # 6. 敏感词识别,如果传感器的微博中出现这么一个敏感词,那么就会预警------PS.敏感词是一个危险的设置 sensitive_origin_weibo_number = 0 sensitive_retweeted_weibo_number = 0 sensitive_comment_weibo_number = 0 sensitive_total_weibo_number = 0 if sensitive_words: query_sensitive_body = { "query": { "filtered": { "filter": { "bool": { "must": [{ "range": { "timestamp": { "gte": ts - time_interval, "lt": ts } } }, { "terms": { "keywords_string": sensitive_words } }, { "terms": { "uid": social_sensors } }] } } } }, "aggs": { "all_list": { "terms": { "field": "message_type" } } } } sensitive_results = es_text.search( index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['aggregations']['all_list']["buckets"] if sensitive_results: for item in sensitive_results: if int(item["key"]) == 1: sensitive_origin_weibo_number = item['doc_count'] elif int(item["key"]) == 2: sensitive_comment_weibo_number = item['doc_count'] elif int(item["key"]) == 3: sensitive_retweeted_weibo_number = item["doc_count"] else: pass sensitive_total_weibo_number = sensitive_origin_weibo_number + sensitive_comment_weibo_number + sensitive_retweeted_weibo_number burst_reason = signal_nothing_variation warning_status = 
signal_nothing finish = unfinish_signal # "0" process_status = "1" if sensitive_total_weibo_number: # 敏感微博的数量异常 print "======================" if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track else: warning_status = signal_brust burst_reason = signal_sensitive_variation if forward_result[0]: # 根据移动平均判断是否有时间发生 mean_count = forward_result[1] std_count = forward_result[2] mean_sentiment = forward_result[3] std_sentiment = forward_result[4] if current_total_count > mean_count + 1.96 * std_count: # 异常点发生 print "=====================================================" if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track else: warning_status = signal_brust burst_reason += signal_count_varition # 数量异常 if negetive_count > mean_sentiment + 1.96 * std_sentiment: warning_status = signal_brust burst_reason += signal_sentiment_varition # 负面情感异常, "12"表示两者均异常 if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track if int(stop_time) <= ts: # 检查任务是否已经完成 finish = finish_signal process_status = '0' tmp_burst_reason = burst_reason topic_list = [] # 7. 
感知到的事, all_mid_list if burst_reason: # 有事情发生 text_list = [] mid_set = set() if signal_sensitive_variation in burst_reason: query_sensitive_body = { "query": { "filtered": { "filter": { "bool": { "must": [{ "range": { "timestamp": { "gte": ts - time_interval, "lt": ts } } }, { "terms": { "keywords_string": sensitive_words } }] } } } }, "size": 10000 } if social_sensors: query_sensitive_body['query']['filtered']['filter']['bool'][ 'must'].append({"terms": { "uid": social_sensors }}) sensitive_results = es_text.search( index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['hits']["hits"] if sensitive_results: for item in sensitive_results: iter_mid = item['_source']['mid'] iter_text = item['_source']['text'] temp_dict = dict() temp_dict["mid"] = iter_mid temp_dict["text"] = iter_text if iter_mid not in mid_set: text_list.append(temp_dict) # 整理后的文本,mid,text mid_set.add(iter_mid) burst_reason.replace(signal_sensitive_variation, "") if burst_reason and all_mid_list: sensing_text = es_text.mget(index=index_name, doc_type=flow_text_index_type, body={"ids": all_mid_list}, fields=["mid", "text"])["docs"] if sensing_text: for item in sensing_text: if item['found']: iter_mid = item["fields"]["mid"][0] iter_text = item["fields"]["text"][0] temp_dict = dict() temp_dict["mid"] = iter_mid temp_dict["text"] = iter_text if iter_mid not in mid_set: text_list.append(temp_dict) mid_set.add(iter_mid) if len(text_list) == 1: top_word = freq_word(text_list[0]) topic_list = [top_word.keys()] elif len(text_list) == 0: topic_list = [] tmp_burst_reason = "" #没有相关微博,归零 print "***********************************" else: feature_words, input_word_dict = tfidf(text_list) #生成特征词和输入数据 word_label, evaluation_results = kmeans(feature_words, text_list) #聚类 inputs = text_classify(text_list, word_label, feature_words) clustering_topic = cluster_evaluation(inputs) sorted_dict = sorted(clustering_topic.items(), key=lambda x: x[1], reverse=True)[0:5] topic_list = [] if sorted_dict: for 
item in sorted_dict: topic_list.append(word_label[item[0]]) print "topic_list:", topic_list if not topic_list: tmp_burst_reason = signal_nothing_variation warning_status = signal_nothing results = dict() results['sensitive_origin_weibo_number'] = sensitive_origin_weibo_number results[ 'sensitive_retweeted_weibo_number'] = sensitive_retweeted_weibo_number results['sensitive_comment_weibo_number'] = sensitive_comment_weibo_number results['sensitive_weibo_total_number'] = sensitive_total_weibo_number results['origin_weibo_number'] = current_origin_count results['retweeted_weibo_number'] = current_retweeted_count results['comment_weibo_number'] = current_comment_count results['weibo_total_number'] = current_total_count results['sentiment_distribution'] = json.dumps(sentiment_count) results['important_users'] = json.dumps(important_users) results['burst_reason'] = tmp_burst_reason results['timestamp'] = ts if tmp_burst_reason: results["clustering_topic"] = json.dumps(topic_list[:5]) # es存储当前时段的信息 doctype = task_name es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results) # 更新manage social sensing的es信息 temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=task_name)['_source'] temporal_result['warning_status'] = warning_status temporal_result['burst_reason'] = tmp_burst_reason temporal_result['finish'] = finish temporal_result['processing_status'] = process_status history_status = json.loads(temporal_result['history_status']) history_status.append([ts, ' '.join(keywords_list), warning_status]) temporal_result['history_status'] = json.dumps(history_status) es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=task_name, body=temporal_result) return "1"
values = l.split() at = atom() at.PDBIndex = values[1] at.atomname = values[2] at.residue = values[3] at.chain = values[4] at.resid = values[5] at.coordinates = point(float(values[6]), float(values[7]), float(values[8]) ) at.bfactor = float(values[10])* weight_be at.atomtype = values[11] at.element = 'H' coords = [at.coordinates.x, at.coordinates.y, at.coordinates.z, at.bfactor] p = clustering.Point(coords , reference=at) points.append(p) clusters = clustering.kmeans(points, number_of_clusters, cutoff) for i in range(len(clusters)): if ( not os.path.exists(str(i+1)) ): os.mkdir(str(i+1)) pdb_cluster = PDB() tot_be = 0.0 for p in clusters[i].points: p.reference.bfactor = p.reference.bfactor / weight_be pdb_cluster.AddNewAtom(p.reference) tot_be += p.reference.bfactor (box_num, cluster_id) = (ord(p.reference.chain) - ord('A') +1, p.reference.resid) target = "../../%s/cluster-%04d.pdbqt" % (box_num, int(cluster_id) ) link_name = "./cluster-%04d-box%s.pdbqt" % (int(cluster_id), box_num) os.chdir(str(i+1)) os.system("ln -sf %s %s" % (target, link_name) ) os.chdir("..") pdb_cluster.UserNotes.append("STATISTICS")