Python dist_all示例，pyksc.dist.dist_all Python示例

示例#1

0

显示文件

文件： test_dist.py 项目： flaviovdf/pyksc

    def test_dist_all(self):
        """Check ``dist.dist_all`` against small hand-computed matrices."""
        # Case 1: every series is zero, so both returned matrices
        # (index 0 and index 1 of the result) must be all zeros.
        zeros_a = np.array([[0.0], [0.0]])
        zeros_b = np.array([[0.0], [0.0]])
        all_zero = np.array([[0.0, 0.0], [0.0, 0.0]])
        assert_array_equal(all_zero, dist.dist_all(zeros_a, zeros_b)[0])
        assert_array_equal(all_zero, dist.dist_all(zeros_a, zeros_b)[1])

        # Case 2: constant ones against constant zeros; every pairwise
        # entry of the first returned matrix is expected to be 1.
        ones = np.array([[1.0], [1.0]])
        zeros = np.array([[0.0], [0.0]])
        all_one = np.array([[1.0, 1.0], [1.0, 1.0]])
        assert_array_equal(all_one, dist.dist_all(ones, zeros)[0])

        # Case 3: a matrix compared with a copy of itself, passing True as
        # the third positional argument (presumably `rolling` -- confirm
        # against the dist_all signature).
        series_a = np.array([[2.0, 3.0, 4.0], [3.0, 4.0, 0.0]])
        series_b = np.array([[2.0, 3.0, 4.0], [3.0, 4.0, 0.0]])
        off_diagonal = 2 / sqrt(29)
        symmetric = np.array([[0.0, off_diagonal], [off_diagonal, 0.0]])
        assert_array_almost_equal(symmetric,
                                  dist.dist_all(series_a, series_b, True)[0])

示例#2

0

显示文件

    def test_dist_all(self):
        """Exercise ``dist.dist_all`` on three tiny, hand-checked inputs."""
        # Zero series on both sides: result[0] and result[1] are all zeros.
        left = np.array([[0.0], [0.0]])
        right = np.array([[0.0], [0.0]])
        want = np.array([[0.0, 0.0], [0.0, 0.0]])
        assert_array_equal(want, dist.dist_all(left, right)[0])
        assert_array_equal(want, dist.dist_all(left, right)[1])

        # Ones versus zeros: every entry of result[0] is expected to be 1.
        left = np.array([[1.0], [1.0]])
        right = np.array([[0.0], [0.0]])
        want = np.array([[1.0, 1.0], [1.0, 1.0]])
        assert_array_equal(want, dist.dist_all(left, right)[0])

        # Same two series on both sides, with the third positional argument
        # set to True (presumably `rolling` -- TODO confirm).
        left = np.array([[2.0, 3.0, 4.0], [3.0, 4.0, 0.0]])
        right = np.array([[2.0, 3.0, 4.0], [3.0, 4.0, 0.0]])
        cross = 2 / sqrt(29)
        want = np.array([[0.0, cross], [cross, 0.0]])
        assert_array_almost_equal(want, dist.dist_all(left, right, True)[0])

示例#3

0

显示文件

def main(tseries_fpath, test_fpath, cents_fpath):
    """Print the index of the closest centroid for every loaded series.

    Arguments
    ---------
    tseries_fpath: str
        Forwarded to `ioutil.load_series` as its first argument.
    test_fpath: str
        Forwarded to `ioutil.load_series` as its second argument
        (presumably selects the test rows -- confirm against ioutil).
    cents_fpath: str
        Text file with the centroids, read with `np.loadtxt`.
    """
    X = ioutil.load_series(tseries_fpath, test_fpath)

    C = np.loadtxt(cents_fpath)
    # First element of the dist_all result is the distance matrix; argmin
    # over axis 0 picks, for every column, the row with the smallest value.
    dist_cents = dist.dist_all(C, X, rolling=True)[0]
    y_true = dist_cents.argmin(axis=0)

    for t in y_true:
        # Function-call form is valid on both Python 2 and Python 3;
        # the former `print t` statement is a SyntaxError on Python 3.
        print(t)

示例#4

0

显示文件

文件： metrics.py 项目： antoine-tran/pyksc

def cost(tseries, assign, centroids, dist_centroids=None):
    """Return the mean squared distance of each series to its centroid.

    Arguments
    ---------
    tseries: matrix whose first dimension is the number of series
    assign: sequence giving, for series `i`, the row index of its centroid
    centroids: matrix of centroids; only used when `dist_centroids` is None
    dist_centroids: optional matrix indexed as [centroid, series]
        Pre-computed distances. When omitted they are computed with
        `dist_all(centroids, tseries)`.

    Returns
    -------
    The sum of squared assigned distances divided by the number of series.
    """
    num_series = tseries.shape[0]
    if dist_centroids is None:
        # dist_all returns a (distances, shifts) pair; only the distance
        # matrix can be indexed as [k, i] below.
        # NOTE(review): other callers pass rolling=True -- confirm whether
        # the non-rolling distance is intended here.
        dist_centroids = dist_all(centroids, tseries)[0]

    cost_f = 0.0
    for i in range(num_series):
        k = assign[i]
        cost_f += dist_centroids[k, i] ** 2

    return cost_f / num_series

示例#5

0

显示文件

文件： metrics.py 项目： FlorentF9/pyksc

def cost(tseries, assign, centroids, dist_centroids=None):
    """Average squared distance between every series and its centroid.

    Arguments
    ---------
    tseries: matrix whose first dimension is the number of series
    assign: sequence mapping series index -> centroid row index
    centroids: matrix of centroids; only read when `dist_centroids` is None
    dist_centroids: optional matrix indexed as [centroid, series]
        Pre-computed distances; computed via `dist_all` when not given.

    Returns
    -------
    Sum over series of the squared assigned distance, divided by the
    number of series.
    """
    num_series = tseries.shape[0]
    if dist_centroids is None:
        # dist_all yields (distances, shifts); keep only the distances so
        # that the [k, i] lookup below works.
        # NOTE(review): rolling is left at its default here, unlike the
        # other dist_all callers -- confirm this is intended.
        dist_centroids = dist_all(centroids, tseries)[0]

    cost_f = 0.0
    for i in range(num_series):
        k = assign[i]
        cost_f += dist_centroids[k, i]**2

    return cost_f / num_series

示例#6

0

显示文件

文件： metrics.py 项目： antoine-tran/pyksc

def avg_inter_dist(tseries, assign, dists_all_pairs=None):
    """Mean and standard deviation of distances across different clusters.

    Arguments
    ---------
    tseries: matrix whose first dimension is the number of series
    assign: array with the cluster id of every series
    dists_all_pairs: optional matrix indexed as [series, series]
        Pre-computed pairwise distances; computed with
        `dist_all(tseries, tseries, rolling=True)[0]` when omitted.

    Returns
    -------
    (mean, std) over all distances from each series to every series that
    carries a different cluster id.
    """
    num_series = tseries.shape[0]

    if dists_all_pairs is None:
        dists_all_pairs = dist_all(tseries, tseries, rolling=True)[0]

    dists = []
    # `range` exists on Python 2 and 3; `xrange` is undefined on Python 3.
    for i in range(num_series):
        k = assign[i]
        # Boolean mask selecting the series outside series i's cluster.
        non_members = assign != k
        dists_i = dists_all_pairs[i]
        dists.extend(dists_i[non_members])

    return np.mean(dists), np.std(dists)

示例#7

0

显示文件

文件： metrics.py 项目： FlorentF9/pyksc

def avg_inter_dist(tseries, assign, dists_all_pairs=None):
    """Summarise the distances between series of different clusters.

    When `dists_all_pairs` is not supplied it is computed as
    `dist_all(tseries, tseries, rolling=True)[0]`. For every series the
    distances to all series with a different cluster id are pooled; the
    function returns the (mean, standard deviation) of that pool.
    """
    pairwise = dists_all_pairs
    if pairwise is None:
        pairwise = dist_all(tseries, tseries, rolling=True)[0]

    # Row by row, keep only the entries whose column belongs to another
    # cluster than the row's own series.
    pooled = [
        value
        for row in range(tseries.shape[0])
        for value in pairwise[row][assign != assign[row]]
    ]

    return np.mean(pooled), np.std(pooled)

示例#8

0

显示文件

def main(tseries_fpath, in_folder):
    """Report how often matching clusters of different folds share ids.

    For every pair of folds (including a fold with itself) each centroid
    of the first fold is compared with all centroids of the second one.
    Ids shared with the closest centroid's cluster count as agreement;
    ids shared with any other cluster count as disagreement. The two
    resulting proportions are printed.

    Arguments
    ---------
    tseries_fpath: str
        Text file whose first whitespace-separated token on each line is
        used as the series id.
    in_folder: str
        Folder containing `fold-*/ksc` sub-folders with `cents.dat` and
        `assign.dat`, and a `train.dat` next to each `ksc` folder.
    """
    with open(tseries_fpath) as tseries_file:
        ids = np.array([line.split()[0] for line in tseries_file])

    folders = glob.glob(os.path.join(in_folder, 'fold-*/ksc'))
    num_folders = len(folders)

    agree = 0
    diff = 0

    # `range` works on Python 2 and 3; `xrange` is undefined on Python 3.
    for i in range(num_folders):

        base_i = os.path.dirname(folders[i])
        Ci = np.loadtxt(os.path.join(folders[i], 'cents.dat'))

        train_i = np.loadtxt(os.path.join(base_i, 'train.dat'), dtype='bool')
        assign_i = np.loadtxt(os.path.join(folders[i], 'assign.dat'))
        ids_i = ids[train_i]

        for j in range(i, num_folders):

            base_j = os.path.dirname(folders[j])
            Cj = np.loadtxt(os.path.join(folders[j], 'cents.dat'))

            dists = dist.dist_all(Ci, Cj, rolling=True)[0]
            # Each row lists the column indices ordered by increasing
            # distance, so the first entry is the closest centroid.
            argsrt = dists.argsort(axis=1)

            train_j = np.loadtxt(os.path.join(base_j, 'train.dat'),
                                 dtype='bool')
            assign_j = np.loadtxt(os.path.join(folders[j], 'assign.dat'))
            ids_j = ids[train_j]

            for k in range(argsrt.shape[0]):
                # Members of cluster k do not depend on `o`: build once.
                ids_k = set(ids_i[assign_i == k])
                for rank, o in enumerate(argsrt[k]):
                    ids_o = set(ids_j[assign_j == o])
                    n_inter = len(ids_k.intersection(ids_o))

                    if rank == 0:
                        agree += n_inter
                    else:
                        diff += n_inter

    total = agree + diff
    if total:
        # Explicit float conversion: plain `/` on two ints truncates
        # under Python 2.
        agree_prob = float(agree) / total
        diff_prob = float(diff) / total
    else:
        # Nothing was counted (e.g. no folds found); avoid dividing by 0.
        agree_prob = diff_prob = float('nan')

    print('AgreedProb = ', agree_prob)
    print('DisagreeProb = ', diff_prob)

示例#9

0

显示文件

文件： summarize_results.py 项目： FlorentF9/pyksc

def main(tseries_fpath, base_folder):
    """Print a classification report aggregated over all `fold-*` folders.

    Cluster ids of every fold are first translated to the ids of the
    closest centroid from the first globbed fold, so that labels are
    comparable across folds. Predictions equal to -1 are treated as
    "not classified" and left out of the printed report.
    `tseries_fpath` is not used by this function.
    """
    folders = glob.glob(os.path.join(base_folder, 'fold-*'))

    # One {fold cluster id -> reference cluster id} dict per fold.
    cluster_mapping = []
    C_base = np.loadtxt(os.path.join(folders[0], 'ksc/cents.dat'))
    for folder in folders:
        Ci = np.loadtxt(os.path.join(folder, 'ksc/cents.dat'))
        closest = dist.dist_all(Ci, C_base, rolling=True)[0].argmin(axis=1)
        cluster_mapping.append({k: closest[k] for k in range(Ci.shape[0])})

    y_true_all = []
    y_pred_all = []
    for folder, mapping in zip(folders, cluster_mapping):
        y_true = np.loadtxt(os.path.join(folder, 'ksc/test_assign.dat'))
        y_pred = np.loadtxt(
            os.path.join(folder, 'cls-res-fitted-50/pred.dat'))

        # Relabel in place; -1 predictions have no cluster to map.
        for j in range(y_true.shape[0]):
            y_true[j] = mapping[y_true[j]]
            if y_pred[j] != -1:
                y_pred[j] = mapping[y_pred[j]]

        y_true_all.extend(y_true)
        y_pred_all.extend(y_pred)

    y_pred_all = np.asarray(y_pred_all)
    y_true_all = np.asarray(y_true_all)

    # NOTE(review): this result is never read afterwards.
    report = classification_report(y_true_all, y_pred_all)
    valid = y_pred_all != -1
    missed_fraction = sum(~valid) / y_pred_all.shape[0]

    print()
    print('Using the centroids from folder: ', folders[0])
    print('Micro Aggregation of Folds:')
    print('%.3f fract of videos were not classified' % missed_fraction)
    print()
    print(classification_report(y_true_all[valid], y_pred_all[valid]))

示例#10

0

显示文件

文件： summarize_results.py 项目： flaviovdf/pyksc

def main(tseries_fpath, base_folder):
    """Print a classification report aggregated over all `fold-*` folders.

    The cluster ids of each fold are mapped onto the ids of the closest
    centroid of the first globbed fold, then true and predicted labels of
    all folds are pooled. Predictions equal to -1 count as "not
    classified" and are excluded from the printed report.

    Arguments
    ---------
    tseries_fpath: str
        Not used by this function.
    base_folder: str
        Folder holding the `fold-*` sub-folders, each with
        `ksc/cents.dat`, `ksc/test_assign.dat` and
        `cls-res-fitted-50/pred.dat`.
    """
    folders = glob.glob(os.path.join(base_folder, "fold-*"))
    num_folders = len(folders)

    cluster_mapping = []
    C_base = np.loadtxt(os.path.join(folders[0], "ksc/cents.dat"))

    # `range` works on Python 2 and 3; `xrange` is undefined on Python 3.
    for i in range(num_folders):
        Ci = np.loadtxt(os.path.join(folders[i], "ksc/cents.dat"))

        dists = dist.dist_all(Ci, C_base, rolling=True)[0]
        # For each centroid of this fold, index of the closest base one.
        closest = dists.argmin(axis=1)

        cluster_mapping.append({})
        for k in range(Ci.shape[0]):
            cluster_mapping[i][k] = closest[k]

    y_true_all = []
    y_pred_all = []
    for i in range(num_folders):
        y_true = np.loadtxt(os.path.join(folders[i], "ksc/test_assign.dat"))
        y_pred = np.loadtxt(os.path.join(folders[i], "cls-res-fitted-50/pred.dat"))

        for j in range(y_true.shape[0]):
            y_true[j] = cluster_mapping[i][y_true[j]]
            if y_pred[j] != -1:
                y_pred[j] = cluster_mapping[i][y_pred[j]]

        y_true_all.extend(y_true)
        y_pred_all.extend(y_pred)

    y_pred_all = np.asarray(y_pred_all)
    y_true_all = np.asarray(y_true_all)

    # NOTE(review): this result is never read afterwards.
    report = classification_report(y_true_all, y_pred_all)
    valid = y_pred_all != -1
    # Force float division: int / int truncates under Python 2, which
    # would make the reported fraction 0 in nearly every case.
    unclassified = float(sum(~valid)) / y_pred_all.shape[0]
    print()
    print("Using the centroids from folder: ", folders[0])
    print("Micro Aggregation of Folds:")
    print("%.3f fract of videos were not classified" % unclassified)
    print()
    print(classification_report(y_true_all[valid], y_pred_all[valid]))

示例#11

0

显示文件

文件： metrics.py 项目： antoine-tran/pyksc

def silhouette(tseries, assign, dists_all_pairs=None):
    """Return the mean silhouette-style score over all series.

    For series `i` with cluster `k`, `intra` is the mean distance to the
    series assigned to `k` (the series itself included) and `min_inter`
    is the smallest mean distance to the series of any other cluster.
    The score is `(min_inter - intra) / max(intra, min_inter)`.

    Arguments
    ---------
    tseries: matrix whose first dimension is the number of series
    assign: array with the cluster id of every series
    dists_all_pairs: optional matrix indexed as [series, series]
        Pre-computed pairwise distances; computed with
        `dist_all(tseries, tseries, rolling=True)[0]` when omitted.

    Returns
    -------
    The mean of the per-series scores, accumulated in single precision.
    """
    if dists_all_pairs is None:
        dists_all_pairs = dist_all(tseries, tseries, rolling=True)[0]

    num_series = tseries.shape[0]
    sils = np.zeros(num_series, dtype='f')
    labels = set(assign)
    # `range` works on Python 2 and 3; `xrange` is undefined on Python 3.
    for i in range(num_series):

        k = assign[i]
        dists_i = dists_all_pairs[i]
        intra = np.mean(dists_i[assign == k])

        # Stays at +inf when there is no other cluster to compare with.
        min_inter = float('inf')
        for o in labels:
            if o != k:
                inter = np.mean(dists_i[assign == o])
                if inter < min_inter:
                    min_inter = inter

        sils[i] = (min_inter - intra) / max(intra, min_inter)

    return np.mean(sils)

示例#12

0

显示文件

文件： metrics.py 项目： FlorentF9/pyksc

def silhouette(tseries, assign, dists_all_pairs=None):
    """Average silhouette-style score of a clustering.

    Each series is scored as `(b - a) / max(a, b)`, where `a` is its mean
    distance to the series of its own cluster (itself included) and `b`
    is the lowest mean distance to the series of any other cluster. The
    pairwise distances default to
    `dist_all(tseries, tseries, rolling=True)[0]`.
    """
    pairwise = dists_all_pairs
    if pairwise is None:
        pairwise = dist_all(tseries, tseries, rolling=True)[0]

    n_series = tseries.shape[0]
    # Single-precision buffer, one score per series.
    scores = np.zeros(n_series, dtype='f')
    cluster_ids = set(assign)

    for idx in range(n_series):
        own = assign[idx]
        row = pairwise[idx]
        within = np.mean(row[assign == own])

        nearest_other = float('inf')
        for other in cluster_ids:
            if other == own:
                continue
            candidate = np.mean(row[assign == other])
            if candidate < nearest_other:
                nearest_other = candidate

        scores[idx] = (nearest_other - within) / max(within, nearest_other)

    return np.mean(scores)

示例#13

0

显示文件

文件： ksc.py 项目： antoine-tran/pyksc

def _base_ksc(tseries, initial_centroids, n_iters=-1):
    '''
    This is the base of the KSC algorithm. It follows the same idea of a K-Means
    algorithm. Firstly, we assign time series to a new cluster based on the
    distance to the centroids. For each time series, the best shift that
    minimizes the distance to the closest centroid is computed.

    The assignment step is followed by an update step where new centroids are
    computed based on the new clustering (based on the assignment step).

    Both steps above are repeated `n_iters` times. If this parameter is negative
    then the steps are repeated until convergence, that is, until no time series
    changes cluster between consecutive steps. With `n_iters` equal to zero no
    step is performed and `assign`, `best_shift` and `cent_dists` are None.

    Arguments
    ---------
    tseries: a matrix of shape (number of time series, size of each series)
        The time series to cluster
    initial_centroids: a matrix of shape (num. of clusters, size of time series)
        The initial centroid estimates
    n_iters: int
        The number of iterations which the algorithm will run

    Returns
    -------
    centroids: a matrix of shape (num. of clusters, size of time series)
        The final centroids found by the algorithm
    assign: an array of num. series size
        The cluster id which each time series belongs to
    best_shift: an array of num. series size
        The shift amount performed for each time series
    cent_dists: a matrix of shape (num. centroids, num. series)
        The distance of each centroid to each time series

    References
    ----------
    .. [1] J. Yang and J. Leskovec,
       "Patterns of Temporal Variation in Online Media" - WSDM'11
       http://dl.acm.org/citation.cfm?id=1935863
    .. [2] Wikipedia,
        "K-means clustering"
        http://en.wikipedia.org/wiki/K-means_clustering
    '''

    num_clusters = initial_centroids.shape[0]
    num_series = tseries.shape[0]

    centroids = initial_centroids

    #KSC algorithm
    cent_dists = None
    assign = None
    prev_assign = None
    best_shift = None

    iters = n_iters
    converged = False

    # A negative `iters` never reaches zero, so only convergence stops it.
    while iters != 0 and not converged:
        #assign elements to new clusters
        cent_dists, shifts = dist_all(centroids, tseries, rolling=True)

        assign = cent_dists.argmin(axis=0)
        best_shift = np.ndarray(num_series, dtype='i')
        # `range` works on Python 2 and 3; `xrange` is undefined on Python 3.
        for i in range(shifts.shape[1]):
            best_shift[i] = shifts[assign[i], i]

        #check if converged, if not compute new centroids
        if prev_assign is not None and not (prev_assign - assign).any():
            converged = True
        else:
            centroids = _compute_centroids(tseries, assign, num_clusters,
                                           best_shift)

        prev_assign = assign
        iters -= 1

    return centroids, assign, best_shift, cent_dists

示例#14

0

显示文件

文件： plot_quality.py 项目： flaviovdf/pyksc

def main(tseries_fpath, plot_foldpath):
    """Plot clustering quality measures for 2 to 15 clusters.

    Five random samples of at most 200 rows are drawn; for each sample
    the clustering is run five times for every number of clusters. The
    mean of each measure with `hci(..., 0.95)` error bars is saved to
    `bcv.pdf` and `cost.pdf` inside `plot_foldpath`.

    Arguments
    ---------
    tseries_fpath: str
        Text file read with `np.genfromtxt`; the first column is dropped.
    plot_foldpath: str
        Existing directory where the two PDF files are written.
    """
    assert os.path.isdir(plot_foldpath)
    initialize_matplotlib()

    X = np.genfromtxt(tseries_fpath)[:, 1:].copy()

    n_samples = X.shape[0]
    sample_rows = np.arange(n_samples)

    clust_range = range(2, 16)
    n_clustering_vals = len(clust_range)

    # 5 random samples x 5 clustering repetitions = 25 result rows.
    n_samplings = 5
    n_repeats = 5
    n_runs = n_samplings * n_repeats

    intra_array = np.zeros(shape=(n_runs, n_clustering_vals))
    inter_array = np.zeros(shape=(n_runs, n_clustering_vals))
    bcvs_array = np.zeros(shape=(n_runs, n_clustering_vals))
    costs_array = np.zeros(shape=(n_runs, n_clustering_vals))

    r = 0
    # `range` works on Python 2 and 3; `xrange` is undefined on Python 3.
    for _ in range(n_samplings):
        np.random.shuffle(sample_rows)
        rand_sample = sample_rows[:200]

        X_new = X[rand_sample]
        # Pairwise distances are shared by all repetitions on this sample.
        D_new = dist.dist_all(X_new, X_new, rolling=True)[0]

        for _ in range(n_repeats):
            # Column index follows the position in clust_range rather
            # than relying on `k - 2`.
            for col, k in enumerate(clust_range):
                intra, inter, bcv, cost = run_clustering(X_new, k, D_new)

                intra_array[r, col] = intra
                inter_array[r, col] = inter
                bcvs_array[r, col] = bcv
                costs_array[r, col] = cost

            r += 1
            print(r)

    intra_err = np.zeros(n_clustering_vals)
    inter_err = np.zeros(n_clustering_vals)
    bcvs_err = np.zeros(n_clustering_vals)
    costs_err = np.zeros(n_clustering_vals)

    # hci is a project helper; presumably the half-width of a 95%
    # confidence interval -- TODO confirm.
    for j in range(n_clustering_vals):
        intra_err[j] = hci(intra_array[:, j], 0.95)
        inter_err[j] = hci(inter_array[:, j], 0.95)
        bcvs_err[j] = hci(bcvs_array[:, j], 0.95)
        costs_err[j] = hci(costs_array[:, j], 0.95)

    plt.errorbar(clust_range, np.mean(inter_array, axis=0), fmt="gD", label="Inter Cluster", yerr=inter_err)
    plt.errorbar(clust_range, np.mean(bcvs_array, axis=0), fmt="bo", label="BetaCV", yerr=bcvs_err)
    plt.errorbar(clust_range, np.mean(intra_array, axis=0), fmt="rs", label="Intra Cluster", yerr=intra_err)
    plt.ylabel("Average Distance")
    plt.xlabel("Number of clusters")
    plt.xlim((0.0, 16))
    plt.ylim((0.0, 1.0))
    plt.legend(frameon=False, loc="lower left")
    plt.savefig(os.path.join(plot_foldpath, "bcv.pdf"))
    plt.close()

    plt.errorbar(clust_range, np.mean(costs_array, axis=0), fmt="bo", label="Cost", yerr=costs_err)
    plt.ylabel("Cost (F)")
    plt.xlabel("Number of clusters")
    plt.xlim((0.0, 16))
    plt.ylim((0.0, 1.0))
    plt.legend(frameon=False, loc="lower left")
    plt.savefig(os.path.join(plot_foldpath, "cost.pdf"))
    plt.close()

示例#15

0

显示文件

def main(tseries_fpath, plot_foldpath):
    """Plot clustering quality measures for 2 to 15 clusters.

    Five random samples of at most 200 rows are drawn; for each sample
    the clustering is run five times for every number of clusters. The
    mean of each measure with `hci(..., .95)` error bars is saved to
    `bcv.pdf` and `cost.pdf` inside `plot_foldpath`.

    Arguments
    ---------
    tseries_fpath: str
        Text file read with `np.genfromtxt`; the first column is dropped.
    plot_foldpath: str
        Existing directory where the two PDF files are written.
    """
    assert os.path.isdir(plot_foldpath)
    initialize_matplotlib()

    X = np.genfromtxt(tseries_fpath)[:, 1:].copy()

    n_samples = X.shape[0]
    sample_rows = np.arange(n_samples)

    clust_range = range(2, 16)
    n_clustering_vals = len(clust_range)

    # 5 random samples x 5 clustering repetitions = 25 result rows.
    n_samplings = 5
    n_repeats = 5
    n_runs = n_samplings * n_repeats

    intra_array = np.zeros(shape=(n_runs, n_clustering_vals))
    inter_array = np.zeros(shape=(n_runs, n_clustering_vals))
    bcvs_array = np.zeros(shape=(n_runs, n_clustering_vals))
    costs_array = np.zeros(shape=(n_runs, n_clustering_vals))

    r = 0
    # `range` works on Python 2 and 3; `xrange` is undefined on Python 3.
    for _ in range(n_samplings):
        np.random.shuffle(sample_rows)
        rand_sample = sample_rows[:200]

        X_new = X[rand_sample]
        # Pairwise distances are shared by all repetitions on this sample.
        D_new = dist.dist_all(X_new, X_new, rolling=True)[0]

        for _ in range(n_repeats):
            # Column index follows the position in clust_range rather
            # than relying on `k - 2`.
            for col, k in enumerate(clust_range):
                intra, inter, bcv, cost = run_clustering(X_new, k, D_new)

                intra_array[r, col] = intra
                inter_array[r, col] = inter
                bcvs_array[r, col] = bcv
                costs_array[r, col] = cost

            r += 1
            print(r)

    intra_err = np.zeros(n_clustering_vals)
    inter_err = np.zeros(n_clustering_vals)
    bcvs_err = np.zeros(n_clustering_vals)
    costs_err = np.zeros(n_clustering_vals)

    # hci is a project helper; presumably the half-width of a 95%
    # confidence interval -- TODO confirm.
    for j in range(n_clustering_vals):
        intra_err[j] = hci(intra_array[:, j], .95)
        inter_err[j] = hci(inter_array[:, j], .95)
        bcvs_err[j] = hci(bcvs_array[:, j], .95)
        costs_err[j] = hci(costs_array[:, j], .95)

    plt.errorbar(clust_range,
                 np.mean(inter_array, axis=0),
                 fmt='gD',
                 label='Inter Cluster',
                 yerr=inter_err)
    plt.errorbar(clust_range,
                 np.mean(bcvs_array, axis=0),
                 fmt='bo',
                 label='BetaCV',
                 yerr=bcvs_err)
    plt.errorbar(clust_range,
                 np.mean(intra_array, axis=0),
                 fmt='rs',
                 label='Intra Cluster',
                 yerr=intra_err)
    plt.ylabel('Average Distance')
    plt.xlabel('Number of clusters')
    plt.xlim((0., 16))
    plt.ylim((0., 1.))
    plt.legend(frameon=False, loc='lower left')
    plt.savefig(os.path.join(plot_foldpath, 'bcv.pdf'))
    plt.close()

    plt.errorbar(clust_range,
                 np.mean(costs_array, axis=0),
                 fmt='bo',
                 label='Cost',
                 yerr=costs_err)
    plt.ylabel('Cost (F)')
    plt.xlabel('Number of clusters')
    plt.xlim((0., 16))
    plt.ylim((0., 1.))
    plt.legend(frameon=False, loc='lower left')
    plt.savefig(os.path.join(plot_foldpath, 'cost.pdf'))
    plt.close()

示例#16

0

显示文件

def _base_ksc(tseries, initial_centroids, n_iters=-1):
    '''
    Core loop of the KSC algorithm, organised like K-Means.

    Every iteration has two steps. In the assignment step each time series
    is attached to the centroid with the smallest distance, and the shift
    associated with that centroid is recorded for the series. In the update
    step the centroids are recomputed from the new assignment.

    The loop runs `n_iters` times. A negative `n_iters` means "run until
    convergence", i.e. until two consecutive assignment steps produce the
    same clustering. With `n_iters` equal to zero nothing is computed and
    `assign`, `best_shift` and `cent_dists` are returned as None.

    Arguments
    ---------
    tseries: a matrix of shape (number of time series, size of each series)
        The time series to cluster
    initial_centroids: a matrix of shape (num. of clusters, size of time series)
        The initial centroid estimates
    n_iters: int
        The number of iterations which the algorithm will run

    Returns
    -------
    centroids: a matrix of shape (num. of clusters, size of time series)
        The final centroids found by the algorithm
    assign: an array of num. series size
        The cluster id which each time series belongs to
    best_shift: an array of num. series size
        The shift amount performed for each time series
    cent_dists: a matrix of shape (num. centroids, num. series)
        The distance of each centroid to each time series

    References
    ----------
    .. [1] J. Yang and J. Leskovec,
       "Patterns of Temporal Variation in Online Media" - WSDM'11
       http://dl.acm.org/citation.cfm?id=1935863
    .. [2] Wikipedia,
        "K-means clustering"
        http://en.wikipedia.org/wiki/K-means_clustering
    '''

    num_clusters = initial_centroids.shape[0]
    num_series = tseries.shape[0]

    centroids = initial_centroids
    cent_dists = None
    assign = None
    best_shift = None
    last_assign = None

    remaining = n_iters
    # A negative counter never hits zero, so only the convergence check
    # below ends the loop in that case.
    while remaining != 0:
        # Assignment step: closest centroid and matching shift per series.
        cent_dists, shifts = dist_all(centroids, tseries, rolling=True)
        assign = cent_dists.argmin(axis=0)

        best_shift = np.ndarray(num_series, dtype='i')
        for col in range(shifts.shape[1]):
            best_shift[col] = shifts[assign[col], col]

        # Converged when no series changed cluster since the last pass;
        # the centroids are then left untouched.
        unchanged = (last_assign is not None
                     and not (last_assign - assign).any())
        if unchanged:
            break

        # Update step.
        centroids = _compute_centroids(tseries, assign, num_clusters,
                                       best_shift)
        last_assign = assign
        remaining -= 1

    return centroids, assign, best_shift, cent_dists

示例#17

0

显示文件

文件： trend_comparison.py 项目： flaviovdf/ecmlpkdd-analytics-challenge-2014

    Z = preprocessing.StandardScaler().fit_transform(T)
    km = cluster.MiniBatchKMeans(n_clusters=num_clusters)
    km = km.fit(Z)
    D = km.transform(Z)

    return D

if __name__ == '__main__':

    def _fit_and_report(extra_features):
        # Prepend the extra columns to the base features, fit an OLS model
        # and print k together with sqrt of the column means of model.G.
        X_train_new = np.hstack((extra_features,  X_train))
        model = OLS()
        model.fit(X_train_new, Y_train)
        print(k, np.sqrt(model.G.mean(axis=0)))

    X_train, T12_train, hosts_train = myio.read_features(test=False)
    Y_train = myio.read_response_train()
    k = 50

    # Variant 1: features produced by transform_km.
    print('K-means')
    _fit_and_report(transform_km(T12_train, k))

    # Variant 2: distances to centroids loaded from disk.
    print('KSC')
    C = np.genfromtxt('ksc-results/cents_visits_%d.dat' % k, dtype='d')
    # exp(x) - 1 presumably undoes a log(1 + x) transform of the series
    # -- TODO confirm against myio.read_features.
    T_nolog = np.asarray(np.exp(T12_train) - 1, order='C')
    # Transposed so that rows line up with the rows of X_train.
    _fit_and_report(dist_all(C, T_nolog, rolling=True)[0].T)