def text_kmeans_clustering():
    # Number of top-frequency words (TOPK_FREQ_WORD) used when evaluating clusters
    TOPK_FREQ_WORD = 50
    # Minimum cluster size used when evaluating clusters
    LEAST_SIZE = 8

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        results = eventcomment.getNewsComments(news_id)
        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # Sentiment calculation
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'])
            comment.update_comment_sentiment(sentiment)

        # K-means clustering and cluster evaluation
        kmeans_results = kmeans(inputs, k=10)
        reserve_num = 5
        final_cluster_results, tfidf_dict = cluster_evaluation(kmeans_results,
                top_num=reserve_num, topk_freq=TOPK_FREQ_WORD,
                least_size=LEAST_SIZE, min_tfidf=None)

        inputs = []
        for label, items in final_cluster_results.iteritems():
            if label != 'other':
                inputs.extend(items)
            for item in items:
                news = News(item['news_id'])
                if label == 'other':
                    label = news.otherClusterId
                comment = Comment(item['_id'])
                comment.update_comment_label(label)
                eventcomment.save_cluster(label, news_id, int(time.time()))

        # Compute feature words for each cluster
        cluster_feature = extract_feature(inputs)
        for label, fwords in cluster_feature.iteritems():
            eventcomment.update_feature_words(label, fwords)

        # Compute text weights
        for input in inputs:
            weight = text_weight_cal(input, cluster_feature[input['label']])
            comment = Comment(input['_id'])
            comment.update_comment_weight(weight)
def main():
    # Sliding window size is 200*140 (h*w)
    train_list = []
    response_list = []
    print "in main..."

    # Process sample images
    for SIGN in SIGN_LIST:
        for sample_image in load_sample_image(SIGN):
            train_data = feature.extract_feature(sample_image)
            train_list.append(train_data)
            response_list.append(SIGN)

    # SVM in OpenCV 3.1.0 for Python
    SVM = cv2.ml.SVM_create()
    SVM.setKernel(cv2.ml.SVM_LINEAR)
    SVM.setP(0.2)
    SVM.setType(cv2.ml.SVM_EPS_SVR)
    SVM.setC(1.0)

    print "building tl..."
    tl = np.array(train_list, np.float32)
    print "building rl..."
    rl = np.array(response_list, np.int32)

    # Train SVM model
    # svm = cv2.SVM()
    SVM.train(tl, cv2.ml.ROW_SAMPLE, rl)
    SVM.save('svm_data2.dat')
    return
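# A minimal sketch (not part of the original script) of how the SVM trained in
# main() above might be used for prediction on a single sliding-window image.
# It assumes feature.extract_feature and np are in scope as in main(); samples
# passed to predict() must be float32 with one row per sample, matching the
# layout used for training.
def predict_sign(svm, window_image):
    sample = np.array([feature.extract_feature(window_image)], np.float32)
    ret, results = svm.predict(sample)   # results has shape (1, 1)
    return results[0][0]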
def step3_cal():
    """Compute each cluster's feature words and representative texts, deduplicate,
       and update cluster size and growth info.
    """
    print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s start step3' % ts2datetime(timestamp)

    inputs = []
    subevents = event.getSubEvents()
    for subevent in subevents:
        subeventid = subevent["_id"]
        inputs.extend(event.getSubeventInfos(subeventid))

    for r in inputs:
        r["title"] = r["title"].encode("utf-8")
        r["content"] = r["content168"].encode("utf-8")
        r["label"] = r["subeventid"]

    # Compute the accumulated feature words of each cluster
    cluster_feature = extract_feature(inputs)
    for label, fwords in cluster_feature.iteritems():
        feature = Feature(label)
        feature.upsert_newest(fwords)

    # Compute text weights
    for r in inputs:
        weight = text_weight_cal(r, cluster_feature[r['label']])
        news = News(r["_id"], event.id)
        news.update_news_weight(weight)

    # Text deduplication
    items_dict = {}
    for r in inputs:
        try:
            items_dict[r["label"]].append(r)
        except KeyError:
            items_dict[r["label"]] = [r]

    for label, items in items_dict.iteritems():
        results = duplicate(items)
        for r in results:
            news = News(r["_id"], event.id)
            news.update_news_duplicate(r["duplicate"], r["same_from"])

        # Update cluster size and growth info
        before_size = event.get_subevent_size(label)
        event.update_subevent_size(label, len(items))
        event.update_subevent_addsize(label, len(items) - before_size)

    if initializing:
        # Switch the event status from "initializing" to "active"
        event.activate()

    print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step3' % ts2datetime(timestamp)
def get_cnn_feature(input, reuse, mode):
    input_shape = input.get_shape().as_list()
    # Collapse any extra leading dimensions (e.g. [batch, time, h, w, c]) into a
    # single batch dimension before running the shared CNN.
    if len(input_shape) > 4:
        input = tf.reshape(input, [-1] + input_shape[2:])
    is_train = True if mode == ModeKeys.TRAIN else False
    with tf.variable_scope('feature_extraction', reuse=reuse):
        cnn_feature = extract_feature(is_train, input)
    # Restore the original leading dimensions on the extracted features.
    if len(input_shape) > 4:
        cnn_feature_shape = cnn_feature.get_shape().as_list()
        cnn_feature = tf.reshape(cnn_feature, input_shape[0:2] + cnn_feature_shape[1:])
    return cnn_feature
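# A hypothetical usage sketch (not from the original project) illustrating the
# 5-D reshape behaviour of get_cnn_feature: a [batch, time, h, w, c] tensor is
# flattened to [batch*time, h, w, c] before the CNN runs, then the features are
# reshaped back to [batch, time, ...]. extract_feature is stubbed here as a
# single conv layer purely for demonstration; ModeKeys is assumed to be
# tf.estimator.ModeKeys, and TF 1.x graph mode is assumed.
def extract_feature(is_train, images):   # stand-in for the project's real extractor
    conv = tf.layers.conv2d(images, filters=8, kernel_size=3, activation=tf.nn.relu)
    return tf.layers.flatten(conv)

frames = tf.placeholder(tf.float32, [8, 5, 64, 64, 3])   # [batch, time, h, w, c]
feats = get_cnn_feature(frames, reuse=False, mode=tf.estimator.ModeKeys.TRAIN)
# feats.get_shape().as_list() -> [8, 5, 30752] (per-frame flattened conv features)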
    # Return the feature matrix as a pandas.DataFrame
    f_datafram = pd.DataFrame([feature[k] for k in feature.keys()],
                              index=list(feature.keys())).T
    # Return FX, Fy
    features = ['mean', 'rms', 'std', 'skewness', 'kurtosis',
                'maxf', 'signal_entropy', 'am_median_pdf']
    FX, Fy = f_datafram[features], f_datafram['label']
    return FX, Fy


if __name__ == "__main__":
    from augment import preprocess
    from feature import extract_feature

    # -1- Load the data
    path = r"./data/0HP"
    data_mark = "FE"
    len_data = 1024
    overlap_rate = 50  # 50%
    random_seed = 1
    fs = 12000

    X, y = preprocess(path, data_mark, fs, len_data / fs, overlap_rate, random_seed)

    # -2- Extract features
    FX, Fy = extract_feature(X, y, fs)
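    # A hypothetical follow-on step (not part of the original script): the
    # extracted feature matrix FX and labels Fy could be passed to any
    # scikit-learn estimator, e.g. a random forest, to sanity-check the features.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    FX_train, FX_test, Fy_train, Fy_test = train_test_split(
        FX, Fy, test_size=0.3, random_state=random_seed)
    clf = RandomForestClassifier(n_estimators=100, random_state=random_seed)
    clf.fit(FX_train, Fy_train)
    print(clf.score(FX_test, Fy_test))   # mean accuracy on the held-out split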
# -*- encoding:utf8 -*-
import sys

from model import RandomForestModel
from feature import extract_feature
from model import LinearRegModel

if __name__ == '__main__':
    try:
        if sys.argv[1] == 'feature':
            if len(sys.argv) < 3:
                print 'Usage: python main.py feature [train raw feature] [test raw feature]'
            else:
                features = extract_feature(sys.argv[2], sys.argv[3])  # load train features
        elif sys.argv[1] == 'model':
            if len(sys.argv) < 4:
                print 'usage: python main.py model [train_feature] [model_file] [test_file]'
            model = RandomForestModel(sys.argv[2], sys.argv[3], n_estimators = 200)
            """
            model = LinearRegModel(sys.argv[2], sys.argv[3], alpha = 0.5)
            """
            """
            model = GBDTModle(sys.argv[2],