Example #1
def cluster_text_job():
    logger.info('cluster_text_job() start...')

    # Government news items have a publish time of 00:00
    compare_time_gap = 1.5
    start_time = time_utils.n_days_ago_milli_time(compare_time_gap)
    end_time = time_utils.current_milli_time()
    logger.info("load text data start_time: {}".format(start_time))
    logger.info("load text data end_time: {}".format(end_time))

    original_data_file = 'logs/original_data.txt'
    extradata_file = 'logs/extra_data.txt'

    # Data is fetched incrementally by publish time, so we have to do the comparison/deduplication ourselves
    data_file, new_file_data = load_text_data.get_extradata_from_api(
        start_time, end_time, original_data_file, extradata_file)
    logger.info("load text data file path...: {}".format(data_file))

    # Two versions (the old cluster call is kept commented out for reference)
    #ner_content_data, raw_data = cluster.fetch_data(data_file)
    ner_content_data, word_content_data, text_data, word_title_data, raw_data = cluster2.fetch_data(
        data_file)

    length_data = len(raw_data)
    logger.info('cluster corpus size: ' + str(length_data))
    # Skip clustering when there is too little data
    if length_data < 100:
        logger.info("corpus size is too small, only updating end_time: {}".format(
            end_time))
        return

    # Overwrite original_data_file
    load_text_data.update_original_data_file(new_file_data, original_data_file)
    new_file_data = None

    # Load the previous clustering result (incremental clustering)
    origin_cluster_file_path = 'logs/origin_cluster.txt'
    n_reserve_days_for_1size_cluster = 1
    n_reserve_days = 1
    # Two versions, same as above
    #origin_cluster_result = cluster.get_origin_cluster_result(origin_cluster_file_path, end_time, n_reserve_days_for_1size_cluster, n_reserve_days)
    origin_cluster_result = cluster2.get_origin_cluster_result(
        origin_cluster_file_path, end_time, n_reserve_days_for_1size_cluster,
        n_reserve_days)

    # Start clustering
    # Two versions, same as above
    #cluster_result = cluster.cluster(origin_cluster_result, ner_content_data, raw_data)
    cluster_result = cluster2.cluster(origin_cluster_result, ner_content_data,
                                      word_content_data, text_data,
                                      word_title_data, raw_data)

    # Save the updated clustering result locally (e.g. keep only clusters from the last 1 day
    # and drop older ones) so it can serve as the base for the next incremental clustering
    # run, i.e. only events from the last day are considered for merging
    cluster_result, cluster_already_merged = save_cluster_result.cluster_result_futher_merge(
        cluster_result, origin_cluster_file_path)
    save_cluster_result.dele_already_merged_cluster(cluster_already_merged)
    save_cluster_result.save_cluster_result(cluster_result, 1)

    logger.info('cluster_text_job() end...')
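The time_utils helpers used in this example are not shown. A minimal sketch of what they might look like, assuming they return Unix epoch timestamps in milliseconds and accept a possibly fractional number of days (the project's actual helpers may differ):

import time


def current_milli_time():
    # Current Unix time in milliseconds.
    return int(time.time() * 1000)


def n_days_ago_milli_time(n_days):
    # Unix time in milliseconds n_days ago; n_days may be fractional (e.g. 1.5).
    return int((time.time() - n_days * 24 * 60 * 60) * 1000)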
Example #2
def main():
    #--------Read in old variables data-------
    # read in preprocessed values
    d  = pd.read_csv("newProcessed.csv")
    dataframe = d.loc[:, ['PriceChange', 'VolumeChange']]
    # designated weight
    weight = [0.2, 0.78, 0.015, 0.005]

    #--------Read in new variables data--------
    # # Uncomment this to run on new variable data
    # #read in preprocessed values
    # d  = pd.read_csv("SandPMarch31.csv")
    # dataframe = d.loc[:, ['PriceChange', 'frac']]
    # #designated weight
    # weight = [0.09, 0.1, 0.1, 0.71]

    #--------Pre-process data------------------
    X = dataframe.to_numpy()
    # summarize 30-day data and put into a matrix
    data = sum30Day(X)
    np.set_printoptions(precision=4, suppress=True)
    np.random.seed(2)
    
    # convert data to np.array
    raw_data =  np.asarray(data, dtype=np.float32)

    # define the number of clusters 
    k = 7

    #---------Optimization of parameters--------------
    ### Uncomment to optimize the weight either manually or with scipy
    # # get the silhouette score
    # def rosen(weight):
    #     return opt.opt_helper(k, weight, raw_data)

    # # manually test different weights
    # opt.manual_minimize(rosen)

   
    # # calculate distance depending on the weight 
    # def distance(item1, item2):
    #     return cluster.distance(weight,item1, item2)
    # # use the scipy minimization to find optimal parameters
    # opt.minimizeHelper(rosen, weight)
    
    #-------------K-means clustering once-----------
    # Perform k-means clustering once with the designated weight, in order
    # to generate the Markov chain plot
    print("\nBegin k-means clustering demo \n")
    # normalize the raw data so that they are all in the range of (0,1)
    (norm_data, mins, maxs) = cluster.mm_normalize(raw_data)

    # perform clustering
    print("\nClustering normalized data with k=" + str(k))
    clustering = cluster.cluster(weight, norm_data, k)
    print("\nDone. Clustering:")

    print("\nRaw data grouped by cluster: ")
    clusters = cluster.display(norm_data, clustering, k)
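cluster.mm_normalize is not shown in these examples. A minimal min-max normalization sketch, assuming it scales each column into [0, 1] and returns the per-column minima and maxima (the project's actual implementation may differ):

import numpy as np


def mm_normalize(raw_data):
    # Column-wise min-max scaling into [0, 1]; constant columns map to 0.
    mins = raw_data.min(axis=0)
    maxs = raw_data.max(axis=0)
    ranges = np.where(maxs > mins, maxs - mins, 1.0)
    norm_data = (raw_data - mins) / ranges
    return norm_data, mins, maxs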
Example #3
def processing(start_time, end_time):

    dir_path = './data1/'

    day = get_standard_time(end_time)
    logger.info("day processing... : {}".format(day))

    logger.info("load text data start_time: {}".format(start_time))
    logger.info("load text data end_time: {}".format(end_time))

    data_file = dir_path + day + '.txt'
    data_file_ = dir_path + day + '_.txt'
    cluster_result_file = dir_path + day + '_cluster_result.txt'
    cluster_triple_file = dir_path + day + '_cluster_triple.txt'
    triple_cluster_file = dir_path + day + '_triple_cluster.txt'

    load_text_data.load_data_from_api(start_time, end_time, data_file)
    logger.info("load text data file path...: {}".format(data_file))

    data_file_ = cluster2.data_event_process(data_file, data_file_)

    ner_content_data, word_content_data, text_data, word_title_data, raw_data = cluster2.fetch_data(
        data_file_)

    length_data = len(raw_data)
    logger.info('cluster corpus size: ' + str(length_data))

    # Skip clustering when there is too little data
    if length_data < 100:
        logger.info("corpus size is too small, only updating end_time: {}".format(
            end_time))
        return

    origin_cluster_result = []

    cluster_result = cluster2.cluster(origin_cluster_result, ner_content_data,
                                      word_content_data, text_data,
                                      word_title_data, raw_data)

    with io.open(cluster_result_file, 'w', encoding='utf-8') as f1:
        for x in cluster_result:
            f1.write(json.dumps(x, ensure_ascii=False) + "\n")

    all_cluster_event_infos = load_cluster_and_process.load_cluster_info_process(
        cluster_result_file, cluster_triple_file, hot_filter=0)

    load_cluster_and_process.all_cluster_event_infos_process(
        all_cluster_event_infos, triple_cluster_file)
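get_standard_time turns the millisecond end_time into the day string used in the per-day file names. A minimal sketch, assuming epoch milliseconds in and a YYYY-MM-DD string out (the real helper may format the day differently):

from datetime import datetime


def get_standard_time(milli_time):
    # Epoch milliseconds -> 'YYYY-MM-DD' day string used in the file names above.
    return datetime.fromtimestamp(milli_time / 1000).strftime('%Y-%m-%d')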
Example #4
def opt_helper(k, weight_, raw_data):
    """
    Calculate silhouette score based on given weight and k
    """
    # normalize the raw data so that they are all in the range of (0,1)
    (norm_data, mins, maxs) = cluster.mm_normalize(raw_data)
    # define the number of clusters
    #weight_ = [weight[0], weight[1], weight[2], 0]
    print("weight:", weight_)

    def distance(item1, item2):
        return cluster.distance(weight_, item1, item2)

    clustering = cluster.cluster(weight_, norm_data, k)
    # clusters = cluster.display(norm_data, clustering, k)

    result = -metrics.silhouette_score(norm_data,
                                       clustering,
                                       metric=distance,
                                       sample_size=1000,
                                       random_state=2)
    print("result: ", result)
    return result
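The commented-out block in Example #2 suggests this objective is minimized over the weight vector. A minimal sketch of driving it with scipy.optimize.minimize directly, assuming opt_helper above is importable and that the weights are kept in [0, 1]; this is an illustration, not the project's opt.minimizeHelper:

import numpy as np
from scipy.optimize import minimize


def optimize_weights(k, raw_data, initial_weight):
    # Objective over the weight vector only: the negative silhouette score.
    def objective(weight):
        return opt_helper(k, weight, raw_data)

    # Powell is derivative-free, which suits this non-smooth clustering objective.
    bounds = [(0.0, 1.0)] * len(initial_weight)
    result = minimize(objective, np.asarray(initial_weight),
                      method='Powell', bounds=bounds)
    return result.x, -result.fun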
Example #5
def main():
    # read in preprocessed values
    d  = pd.read_csv("newProcessed.csv")
    dataframe = d.loc[:, ['PriceChange', 'VolumeChange']]
    X = dataframe.to_numpy()

    # data holds all the 5-element PCA summaries;
    # it has dimension (n, 5), where n is the number of 30-day windows
    data = []

    # Perform PCA on every 30 data points using the shifting strategy
    i = 0
    pca = PCA() # declare PCA object with constructor

    '''
    Summarizing 30 days data in the following format:
        [mean of percent price change, mean of percent volume change,
        principal eigenvalue, secondary principal eigenvalue, theta]
    '''
    while (i<len(X)-30):
        oneMonth = X[i:i+30]
        returnList=[]

        # Append mean of % change price and % change volume
        returnList.append(mean(oneMonth[:, 0]))
        returnList.append(mean(oneMonth[:, 1]))
        
        # Append the two eigenvalues; the larger eigenvalue goes first
        # and carries more weight
        pca.fit(oneMonth)
        evals = pca.explained_variance_
        returnList.append(evals[0])
        returnList.append(evals[1])

        # calculate theta, the angle of the first principal component in degrees
        # (note: atan(y/x) raises on x == 0; math.atan2(y, x) would also cover that case)
        y = pca.components_[0][1]
        x = pca.components_[0][0]
        theta = math.atan(y/x)*180/(math.pi)

        # append theta
        returnList.append(theta)

        # increment i 
        i = i+1

        # append returnList to data
        data.append(returnList)

  
    print("\nBegin k-means clustering demo \n")
    np.set_printoptions(precision=4, suppress=True)
    np.random.seed(2)

    # convert data to np.array
    raw_data =  np.asarray(data, dtype=np.float32)
    # normalize the raw data so that they are all in the range of (0,1)
    (norm_data, mins, maxs) = cluster.mm_normalize(raw_data)
    # define the number of clusters 
    k = 7

    # perform clustering
    print("\nClustering normalized data with k=" + str(k))
    clustering = cluster.cluster(norm_data, k)
    
    # print results
    print("\nDone. Clustering:")
    print(clustering)
    print("\nRaw data grouped by cluster: ")
    clusters = cluster.display(norm_data, clustering, k)

    # Uncomment to visualize the result by plotting all ellipses on the same plot
    #
    # draw_ellipse(data, clustering)

    ### Uncomment to visualize the optimal k value; the block above must be commented out
    # Find the optimal k value by calculating the average distance associated with each k
    #
    # distance_L = []
    # for  k in range(1, 8):
    #     print("k = "+ str(k))
    #     clustering = cluster.cluster(norm_data, k)
    #     clusters = cluster.display(norm_data, clustering, k)
    #     distance = cluster_distance(clusters)
    #     distance_L.append([k, distance])
    # threshold_plot(distance_L)

    print("\nEnd k-means demo ")

    # Split the ellipses 9:1 into Xtrain and Xtest for the Markov model

    splitPt = int(len(clustering) * 0.9)
    Xtrain = clustering[0:splitPt]
    Xtest = clustering[splitPt:]
    
    # Training and Testing Markov Model
    dictionary = get_cluster_dict(Xtrain)
    correctness, not_found = test_markov(dictionary, Xtest)
    print("Accuracy is " + str(correctness*100) + "%")
    print("Cases not found: ", not_found)
Example #6
import pandas as pd
import vector as v
import preprocessing as p
import cluster2 as c
import classifier as r
a = pd.read_csv("Z:/TermPaper/twitter_cred-master/data.csv")
print("cleaning....")
doc, id1 = p.clean(a)
print("vectorizing....")
dvec, global_vector = v.vectorize(doc)
print("clustering....")
g, t = c.cluster(dvec, global_vector, id1)
cnt = 0
x = []
print(len(t))
print("credibility calculating")
r.classifier(g)
Example #7
    logger.info('cluster corpus size: ' + str(length_data))
    # Skip clustering when there is too little data
    if length_data < 100:
        logger.info("corpus size is too small, only updating end_time: {}".format(
            end_time))

    # Overwrite original_data_file
    #load_text_data.update_original_data_file(new_file_data, original_data_file)
    #new_file_data = None

    # Load the previous clustering result (incremental clustering)
    origin_cluster_file_path = 'logs/origin_cluster.txt'
    n_reserve_days_for_1size_cluster = 1
    n_reserve_days = 1
    # Two versions, same as above
    #origin_cluster_result = cluster.get_origin_cluster_result(origin_cluster_file_path, end_time, n_reserve_days_for_1size_cluster, n_reserve_days)
    origin_cluster_result = cluster2.get_origin_cluster_result(
        origin_cluster_file_path, end_time, n_reserve_days_for_1size_cluster,
        n_reserve_days)

    # Start clustering
    # Two versions, same as above
    #cluster_result = cluster.cluster(origin_cluster_result, ner_content_data, raw_data)
    cluster_result = cluster2.cluster(origin_cluster_result, ner_content_data,
                                      word_content_data, text_data,
                                      word_title_data, raw_data)

    with io.open('logs1/cluster_result.txt', 'w', encoding='utf-8') as f1:
        for x in cluster_result:
            f1.write(json.dumps(x, ensure_ascii=False) + "\n")
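The cluster result is written as one JSON object per line; a minimal sketch of reading it back for downstream processing (the path mirrors the one used above):

import io
import json


def load_cluster_result(path='logs1/cluster_result.txt'):
    # Read back one JSON object per line, mirroring how the result was written.
    result = []
    with io.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                result.append(json.loads(line))
    return result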