Example #1
def text_kmeans_clustering():
    # Pick the TOPK_FREQ_WORD most frequent words when evaluating clusters
    TOPK_FREQ_WORD = 50

    # Minimum cluster size accepted during cluster evaluation
    LEAST_SIZE = 8

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        results = eventcomment.getNewsComments(news_id)

        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # Sentiment scoring
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'])
            comment.update_comment_sentiment(sentiment)

        # k-means clustering and cluster evaluation
        kmeans_results = kmeans(inputs, k=10)
        reserve_num = 5
        final_cluster_results, tfidf_dict = cluster_evaluation(
            kmeans_results, top_num=reserve_num, topk_freq=TOPK_FREQ_WORD,
            least_size=LEAST_SIZE, min_tfidf=None)

        inputs = []
        for label, items in final_cluster_results.iteritems():
            # Keep only comments from retained clusters for the feature
            # extraction and weighting steps below.
            if label != 'other':
                inputs.extend(items)

            for item in items:
                news = News(item['news_id'])

                # Remap the catch-all 'other' label to this news item's
                # dedicated cluster id before tagging the comment.
                if label == 'other':
                    label = news.otherClusterId

                comment = Comment(item['_id'])
                comment.update_comment_label(label)

            eventcomment.save_cluster(label, news_id, int(time.time()))

        # Compute the feature words of each cluster
        cluster_feature = extract_feature(inputs)
        for label, fwords in cluster_feature.iteritems():
            eventcomment.update_feature_words(label, fwords)

        # Compute text weights
        for r in inputs:
            weight = text_weight_cal(r, cluster_feature[r['label']])
            comment = Comment(r['_id'])
            comment.update_comment_weight(weight)
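
For reference, the loop above implies the shape of cluster_evaluation's first return value: labels map to lists of comment dicts, and clusters pruned by top_num / LEAST_SIZE are pooled under the key 'other'. A sketch inferred from usage, not a documented contract:

final_cluster_results = {
    'cluster_0': [{'_id': 'c1', 'news_id': 'n1', 'text': u'...'}],
    'other':     [{'_id': 'c2', 'news_id': 'n1', 'text': u'...'}],
}
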
Example #2
def main():
    # Sliding window size is 200*140(h*w)
    train_list = []
    response_list = []
    print "in main..."
    # Process sample images
    for SIGN in SIGN_LIST:
        for sample_image in load_sample_image(SIGN):
            train_data = feature.extract_feature(sample_image)
            train_list.append(train_data)
            response_list.append(SIGN)

    # SVM in OpenCV 3.1.0 for Python
    SVM = cv2.ml.SVM_create()

    SVM.setKernel(cv2.ml.SVM_LINEAR)
    SVM.setP(0.2)
    SVM.setType(cv2.ml.SVM_EPS_SVR)
    SVM.setC(1.0)


    print "building tl..."
    tl = np.array(train_list, np.float32)
    print "building rl..."
    rl = np.array(response_list, np.int32)

    # Train the SVM model

    SVM.train(tl, cv2.ml.ROW_SAMPLE, rl)
    SVM.save('svm_data2.dat')

    return
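
A later run can reload the saved model and regress a label for a fresh feature vector. A minimal sketch, assuming OpenCV 3.2+ (where cv2.ml.SVM_load is exposed to Python), the same feature module as above, and a hypothetical sliding-window crop window_image:

import cv2
import numpy as np
import feature

svm = cv2.ml.SVM_load('svm_data2.dat')
feat = np.array([feature.extract_feature(window_image)], np.float32)
_, response = svm.predict(feat)              # EPS_SVR regresses a continuous value
predicted_sign = int(round(response[0][0]))  # snap to the nearest integer label
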
Example #3
    def step3_cal():
        """计算各簇的特征词、代表文本、去重, 更新簇的大小、增幅信息
        """
        print '[%s] ' % ts2datetime(int(time.time(
        ))), 'event ', eventid, ' %s start step3' % ts2datetime(timestamp)

        inputs = []
        subevents = event.getSubEvents()
        for subevent in subevents:
            subeventid = subevent["_id"]
            inputs.extend(event.getSubeventInfos(subeventid))

        for r in inputs:
            r["title"] = r["title"].encode("utf-8")
            r["content"] = r["content168"].encode("utf-8")
            r["label"] = r["subeventid"]

        # Compute each cluster's accumulated feature words
        cluster_feature = extract_feature(inputs)
        for label, fwords in cluster_feature.iteritems():
            feature = Feature(label)
            feature.upsert_newest(fwords)

        # Compute text weights
        for r in inputs:
            weight = text_weight_cal(r, cluster_feature[r['label']])
            news = News(r["_id"], event.id)
            news.update_news_weight(weight)

        # Text deduplication: group items by cluster label first
        items_dict = {}
        for r in inputs:
            try:
                items_dict[r["label"]].append(r)
            except KeyError:
                items_dict[r["label"]] = [r]

        for label, items in items_dict.iteritems():
            results = duplicate(items)
            for r in results:
                news = News(r["_id"], event.id)
                news.update_news_duplicate(r["duplicate"], r["same_from"])

            # Update the cluster's size and its growth since the last run
            before_size = event.get_subevent_size(label)
            event.update_subevent_size(label, len(items))
            event.update_subevent_addsize(label, len(items) - before_size)

        if initializing:
            # Flip the event state from initializing to active
            event.activate()

        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step3' % ts2datetime(timestamp)
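
The try/except grouping in step3_cal can also be written with collections.defaultdict; a purely stylistic alternative, not code from the project:

from collections import defaultdict

items_dict = defaultdict(list)
for r in inputs:
    items_dict[r["label"]].append(r)
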
Example #4
def get_cnn_feature(input, reuse, mode):

    # If the input carries an extra leading axis (e.g. [batch, time, h, w, c]),
    # fold the first two axes together so the CNN sees a plain 4-D batch.
    input_shape = input.get_shape().as_list()
    if len(input_shape) > 4:
        input = tf.reshape(input, [-1] + input_shape[2:])

    is_train = mode == ModeKeys.TRAIN
    with tf.variable_scope('feature_extraction', reuse=reuse):
        cnn_feature = extract_feature(is_train, input)

    # Restore the original [batch, time, ...] layout on the extracted features.
    if len(input_shape) > 4:
        cnn_feature_shape = cnn_feature.get_shape().as_list()
        cnn_feature = tf.reshape(cnn_feature,
                                 input_shape[0:2] + cnn_feature_shape[1:])

    return cnn_feature
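
A usage sketch for the fold/unfold behaviour above. Shapes and names are illustrative, and it assumes TF 1.x with ModeKeys taken from tf.estimator:

import tensorflow as tf

ModeKeys = tf.estimator.ModeKeys

# A batch of 8 clips, 16 frames each: [batch, time, h, w, c].
clips = tf.placeholder(tf.float32, [8, 16, 64, 64, 3])
feats = get_cnn_feature(clips, reuse=False, mode=ModeKeys.TRAIN)
# Internally the CNN saw [8 * 16, 64, 64, 3]; feats is reshaped back to
# [8, 16, <feature dims>].
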
Example #5
    # Return the feature matrix as a pandas.DataFrame
    f_dataframe = pd.DataFrame([feature[k] for k in feature.keys()],
                               index=list(feature.keys())).T

    # Return FX, Fy
    features = [
        'mean', 'rms', 'std', 'skewness', 'kurtosis', 'maxf', 'signal_entropy',
        'am_median_pdf'
    ]
    FX, Fy = f_dataframe[features], f_dataframe['label']

    return FX, Fy


if __name__ == "__main__":
    from augment import preprocess
    from feature import extract_feature
    # -1- Load the data
    path = r"./data/0HP"
    data_mark = "FE"
    len_data = 1024
    overlap_rate = 50  # 50%
    random_seed = 1
    fs = 12000

    X, y = preprocess(path, data_mark, fs, len_data / float(fs), overlap_rate,
                      random_seed)  # float() guards against integer division under Python 2
    # -2- Extract features
    FX, Fy = extract_feature(X, y, fs)
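
FX and Fy pair naturally with a downstream classifier; a hedged follow-on sketch using scikit-learn, which is an assumption — the source stops at feature extraction:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(FX, Fy, test_size=0.3,
                                          random_state=random_seed)
clf = RandomForestClassifier(n_estimators=100, random_state=random_seed)
clf.fit(X_tr, y_tr)
print(clf.score(X_te, y_te))  # held-out accuracy
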
Example #6
File: main.py  Project: kymo/kaggle
# -*- encoding:utf8 -*-

import sys
from model import RandomForestModel
from feature import extract_feature
from model import LinearRegModel

if __name__ == '__main__':
	
	try:
		if sys.argv[1] == 'feature':
			if len(sys.argv) < 4:
				print 'Usage: python main.py feature [train raw feature] [test raw feature]'
			
			else:
				features = extract_feature(sys.argv[2], sys.argv[3])
				# load train features
		
	elif sys.argv[1] == 'model':
		if len(sys.argv) < 4:
			print 'usage: python main.py model [train_feature] [model_file] [test_file]'
		else:
			model = RandomForestModel(sys.argv[2],
					sys.argv[3],
					n_estimators = 200)
			"""
			model = LinearRegModel(sys.argv[2],
					sys.argv[3],
					alpha = 0.5)
			"""
			"""
			model = GBDTModle(sys.argv[2],