Example #1
    def test_biased_vs_unbiased(self):
        fname = "example_data.txt"
        unbiased_ticc = TICC(window_size=1,
                             number_of_clusters=8,
                             lambda_parameter=11e-2,
                             beta=600,
                             maxIters=100,
                             threshold=2e-5,
                             write_out_file=False,
                             prefix_string="output_folder/",
                             num_proc=1)
        (unbiased_cluster_assignment,
         unbiased_cluster_MRFs) = unbiased_ticc.fit(input_file=fname)

        biased_ticc = TICC(window_size=1,
                           number_of_clusters=8,
                           lambda_parameter=11e-2,
                           beta=600,
                           maxIters=100,
                           threshold=2e-5,
                           write_out_file=False,
                           prefix_string="output_folder/",
                           num_proc=1,
                           biased=True)
        (biased_cluster_assignment,
         biased_cluster_MRFs) = biased_ticc.fit(input_file=fname)

        np.testing.assert_array_equal(
            np.array(biased_cluster_assignment),
            np.array(unbiased_cluster_assignment),
            "Biased assignment is not equel to unbiased assignment!")
Example #2
    def test_multiExample(self):
        fname = "example_data.txt"
        ticc = TICC(window_size=5,
                    number_of_clusters=5,
                    lambda_parameter=11e-2,
                    beta=600,
                    maxIters=100,
                    threshold=2e-5,
                    write_out_file=False,
                    prefix_string="output_folder/",
                    num_proc=1)
        (cluster_assignment, cluster_MRFs) = ticc.fit(input_file=fname)
        assign = np.loadtxt("UnitTest_Data/multiResults.txt")
        val = abs(assign - cluster_assignment)
        self.assertEqual(sum(val), 0)

        for i in range(5):
            mrf = np.loadtxt("UnitTest_Data/multiCluster_" + str(i) + ".txt",
                             delimiter=',')
            np.testing.assert_array_almost_equal(mrf,
                                                 cluster_MRFs[i],
                                                 decimal=3)
Example #3
    def test_example(self):
        fname = "example_data.txt"
        ticc = TICC(window_size=1,
                    number_of_clusters=8,
                    lambda_parameter=11e-2,
                    beta=600,
                    maxIters=100,
                    threshold=2e-5,
                    write_out_file=False,
                    prefix_string="output_folder/",
                    num_proc=1)
        (cluster_assignment, cluster_MRFs) = ticc.fit(input_file=fname)
        assign = np.loadtxt("UnitTest_Data/Results.txt")
        val = abs(assign - cluster_assignment)
        self.assertEqual(sum(val), 0)

        # Test that prediction works on a batch of data outside of the `fit`
        # method. Perhaps there is a better way to test this in parallel so
        # these are more like unit tests rather than integration tests?
        test_batch = ticc.predict_clusters(
            ticc.trained_model['complete_D_train'][0:1000, ])
        batch_val = abs(test_batch - cluster_assignment[0:1000])
        self.assertEqual(sum(batch_val), 0)

        # Test streaming by passing in 5 row blocks at a time (current timestamp and previous 4)
        # I am causing data leakage by training on the whole set and then using the trained model while streaming,
        # but this is for testing the code, so it is ok
        # TODO: figure out why larger blocks don't improve predictions more. Reference:
        # https://github.com/davidhallac/TICC/issues/18#issuecomment-384514116
        def test_streaming(block_size):
            test_stream = np.zeros(1000)
            test_stream[0:block_size] = cluster_assignment[0:block_size]
            for i in range(block_size, 1000):
                point = ticc.trained_model['complete_D_train'][i -
                                                               block_size:i, ]
                test_stream[i] = ticc.predict_clusters(point)[block_size - 1]

            percent_correct_streaming = 100 * sum(
                cluster_assignment[0:1000] == test_stream) / 1000.0
            # percent_correct_streaming is on a 0-100 scale.
            self.assertGreater(percent_correct_streaming, 90)

        test_streaming(5)

        for i in range(8):
            mrf = np.loadtxt("UnitTest_Data/cluster_" + str(i) + ".txt",
                             delimiter=',')
            np.testing.assert_array_almost_equal(mrf,
                                                 cluster_MRFs[i],
                                                 decimal=3)
Example #4
    def test_failed_unbiased(self):
        with self.assertRaises(Exception) as context:
            # TICC will fail in Iteration 2, because cluster 9 has only one observation.
            fname = "example_data.txt"
            ticc = TICC(window_size=1,
                        number_of_clusters=50,
                        lambda_parameter=11e-2,
                        beta=600,
                        maxIters=100,
                        threshold=2e-5,
                        write_out_file=False,
                        prefix_string="output_folder/",
                        num_proc=1)
            (cluster_assignment, cluster_MRFs) = ticc.fit(input_file=fname)

        # A non-empty string is always truthy, so assert on the captured
        # exception object instead of on a formatted message.
        self.assertIsNotNone(context.exception)
Example #5
def run_ticc(data):
    ticc = TICC(window_size=1,
                number_of_clusters=2,
                lambda_parameter=11e-2,
                beta=600,
                maxIters=50,
                threshold=2e-15,
                write_out_file=True,
                prefix_string="ration_folder/",
                num_proc=1)
    (cluster_assignment, cluster_MRFs) = ticc.fit(input_file=data,
                                                  rf=-1,
                                                  rl=-1,
                                                  rational=True)
    # Record the indices where the cluster label changes: these are the
    # detected segment boundaries.
    cuts = []
    for i in range(1, len(cluster_assignment)):
        if cluster_assignment[i] != cluster_assignment[i - 1]:
            cuts.append(i)
    return cuts
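
A minimal driver sketch for the function above. The rf, rl, and rational keywords passed to fit() are not part of the stock TICC_solver API, so this assumes the same modified fork is importable; the input file name is a placeholder.

if __name__ == '__main__':
    # Hypothetical input file; any whitespace-delimited multivariate series.
    cuts = run_ticc("example_data.txt")
    print("Segment boundaries at indices:", cuts)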
Example #6
def run_ticc(data, save_file):
    num_cluster = 4  # chickendance
    # num_cluster = 3  # sudden_cardiac, synthetic
    n = data.shape[1]  # number of features per timestamp
    ticc = TICC(window_size=5,
                number_of_clusters=num_cluster,
                lambda_parameter=11e-2,
                beta=600,
                maxIters=100,
                threshold=2e-15,
                write_out_file=True,
                prefix_string="ration_folder/",
                num_proc=1)
    (cluster_assignment, cluster_MRFs) = ticc.fit(input_file=data,
                                                  rf=-1,
                                                  rl=-1,
                                                  rational=True)
    cluster_score = get_importance_score(cluster_MRFs, n, num_cluster,
                                         save_file)
    return np.array(cluster_score), cluster_assignment
Example #7
    # Sweep beta for a fixed window size and cluster count, recording the BIC.
    # Assumed setup: the original fragment relied on these names being
    # defined earlier in the script.
    fname = "example_data.txt"
    numclust = 8
    lambvals = 11e-2
    biclist = []
    for betavals in range(0, 6000, 50):
        try:
            ticc = TICC(window_size=1,
                        number_of_clusters=numclust,
                        lambda_parameter=lambvals,
                        beta=betavals,
                        maxIters=10,
                        threshold=2e-5,
                        write_out_file=False,
                        prefix_string="output_folder/",
                        num_proc=1)

            (cluster_assignment, cluster_MRFs,
             bic) = ticc.fit(input_file=fname)

            biclist.append((numclust, lambvals, betavals, bic))

        except Exception:
            # Record the failure and move on to the next beta value.
            print("Fail: beta={}".format(betavals))

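
Once the sweep has filled biclist, model selection reduces to taking the entry with the lowest BIC. A minimal sketch, assuming biclist was populated as above:

    # Lower BIC is better: pick the hyperparameter tuple that minimizes it.
    if biclist:
        best = min(biclist, key=lambda t: t[3])
        print("Best (num_clusters, lambda, beta, bic):", best)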
Example #8
# NOTE: excerpted mid-script; the subplot axes `a`, the x values `x`, the
# generated series `y`, and the offset `y0` are defined earlier (not shown).
a[1, 0].scatter(x, y[2] + y0)
a[1, 1].scatter(x, y[3] + y0)
a[2, 0].scatter(x, y[4] + y0)
# pyplot.bar(x, yz)

np.savetxt("Syn_TimeSeries2.csv",
           np.transpose([y[0, ], y[1, ], y[2, ], y[3, ], y[4, ]]),
           delimiter=',')
#np.savetxt('test.csv', x, delimiter=',')
# Cluster the synthetic time series with TICC.
if __name__ == '__main__':
    fname = "Syn_TimeSeries2.csv"
    ticc = TICC(window_size=1,
                number_of_clusters=2,
                lambda_parameter=11e-2,
                beta=600,
                maxIters=1000,
                threshold=2e-5,
                write_out_file=False,
                prefix_string="output_folder/",
                num_proc=1)
    (cluster_assignment, cluster_MRFs) = ticc.fit(input_file=fname)

    print(cluster_assignment)
    #np.savetxt('Results2.txt', cluster_assignment, fmt='%d', delimiter=',')
    np.savetxt('Results_SynData.csv',
               cluster_assignment,
               fmt='%d',
               delimiter=',')
    #print(np.size(cluster_assignment))
Example #9
from TICC_solver import TICC
import numpy as np

fname = "example_data.txt"
ticc = TICC(window_size=1,
            number_of_clusters=8,
            lambda_parameter=11e-2,
            beta=600,
            maxIters=100,
            threshold=2e-5,
            write_out_file=False,
            prefix_string="output_folder/",
            num_proc=1)
(cluster_assignment, cluster_MRFs) = ticc.fit(input_file=fname)

print(cluster_assignment)
np.savetxt('Results.txt', cluster_assignment, fmt='%d', delimiter=',')
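
The two values returned by fit() are a label per sample (window_size is 1 here) and a dict mapping each cluster id to its estimated MRF, a square matrix of side window_size * n_features. A quick sanity check, continuing from the script above:

# Continuing from the script above: inspect what fit() returned.
print("samples labeled:", len(cluster_assignment))
for cluster_id, mrf in cluster_MRFs.items():
    print("cluster", cluster_id, "MRF shape:", mrf.shape)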
Example #10
# NOTE: from a larger module; relies on os, numpy as np, pandas as pd, TICC
# from TICC_solver, and the project helpers get_dataset, generate_evaluation,
# and general_info.
def genera_cluster(cluster=0,
                   window=0,
                   p_lambda=0,
                   beta=0,
                   percorsoD="/",
                   percorsoS="/",
                   percorsoG="",
                   percorsoDN="",
                   seed=102):

    percorsoS = percorsoS + "/TICC"
    if not os.path.exists(percorsoS):
        os.makedirs(percorsoS)

    # Read the dataset
    df, latitude, longitude, experiment = get_dataset(percorsoD)
    df = df.astype(float)

    ticc = TICC(window_size=int(window),
                number_of_clusters=int(cluster),
                lambda_parameter=float(p_lambda),
                beta=int(beta),
                seed=seed)

    print("[XM]> ========= Generating TICC clustering model ===========")

    cluster_assignment, cluster_MRFs, bic, aic, ll = ticc.fit(
        input_file=percorsoD)
    cluster_assignment = [int(item) for item in cluster_assignment]
    # fit() returns window_size - 1 fewer labels than input rows, so pad the
    # front with the first label to get one label per row.
    surplus = []
    if int(window) > 1:
        for i in range(int(window) - 1):
            surplus.append(cluster_assignment[0])
    cluster_assignment = surplus + cluster_assignment
    if percorsoG != "":
        # Ground-truth labels are available: evaluate the clustering.
        with open(percorsoG, 'r') as f:
            y = [float(item) for item in f]
        # evaluation_temp (below) holds: Acc., CE, F1, Entropy, Purity,
        # NbClust.
        df_evaluation = pd.DataFrame()
        df_evaluation["clusters_found"] = list(cluster_assignment)
        df_evaluation["clusters_hidden"] = list(y)
        clusters_found = df_evaluation["clusters_found"]
        clusters_hidden = df_evaluation["clusters_hidden"]
        evaluation_temp = generate_evaluation(clusters_found, clusters_hidden)
        print("[XM]> Generating results files")
        with open(percorsoS + "/cl.txt", "w") as fileClusters:
            for item in clusters_found:
                fileClusters.write("%s\n" % item)
        print("[XM]> Clustering generated")
        coordinate = cluster_MRFs.values()
        # Write each cluster's MRF matrix to its own parameters file.
        for cl, mat in enumerate(coordinate):
            with open(percorsoS + "/model_parameters" + str(cl) + ".txt",
                      "w") as centri:
                centri.write("{}".format(pd.DataFrame(mat).to_string()))

        print("[XM]> Model parameters generated")
        general_info(righe=df.shape[0],
                     colonne=df.shape[1],
                     clust=cluster,
                     window=window,
                     p_lambda=p_lambda,
                     beta=beta,
                     bic=bic,
                     aic=aic,
                     ll=ll,
                     percorsoD=percorsoD,
                     percorsoS=percorsoS,
                     accuracy_local=evaluation_temp[0],
                     ce_local=evaluation_temp[1],
                     f1_local=evaluation_temp[2],
                     entropy_local=evaluation_temp[3],
                     purity_local=evaluation_temp[4],
                     nb_clusters_found=evaluation_temp[5])
        if percorsoDN != "":
            with open(percorsoDN, 'r') as fileDN:
                with open(percorsoS + "/dataStandardization.csv",
                          'w') as fileDND:
                    for line in fileDN:
                        fileDND.write(line)
    else:
        # No ground-truth labels: write out clusters and model parameters
        # only.
        df_evaluation = pd.DataFrame()
        df_evaluation["clusters_found"] = list(cluster_assignment)
        clusters_found = df_evaluation["clusters_found"]
        print("[XM]> Generating results files")
        with open(percorsoS + "/cl.txt", "w") as fileClusters:
            for item in clusters_found:
                fileClusters.write("%s\n" % item)
        print("[XM]> Clustering generated")
        coordinate = cluster_MRFs.values()
        # Write each cluster's MRF matrix to its own parameters file.
        for cl, mat in enumerate(coordinate):
            with open(percorsoS + "/model_parameters" + str(cl) + ".txt",
                      "w") as centri:
                centri.write("{}".format(pd.DataFrame(mat).to_string()))
        print("[XM]> Model parameters generated")
        general_info(righe=df.shape[0],
                     colonne=df.shape[1],
                     clust=cluster,
                     window=window,
                     p_lambda=p_lambda,
                     beta=beta,
                     bic=bic,
                     aic=aic,
                     ll=ll,
                     percorsoD=percorsoD,
                     percorsoS=percorsoS,
                     accuracy_local="NA",
                     ce_local="NA",
                     f1_local="NA",
                     entropy_local="NA",
                     purity_local="NA",
                     nb_clusters_found=np.unique(clusters_found).size)
        if percorsoDN != "":
            with open(percorsoDN, 'r') as fileDN:
                with open(percorsoS + "/dataStandardization.csv",
                          'w') as fileDND:
                    for line in fileDN:
                        fileDND.write(line)
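
A hypothetical invocation of genera_cluster, with placeholder paths and illustrative hyperparameters; the project helpers noted above must be importable for it to run.

# Illustrative only: every argument value below is a placeholder.
genera_cluster(cluster=5,
               window=3,
               p_lambda=11e-2,
               beta=600,
               percorsoD="dataset.csv",
               percorsoS="results",
               percorsoG="ground_truth.txt")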