def test_silhouettes():
    """Regression-check ``noaa_kmeans.silhouettes`` against a stored result.

    Loads a small fixture dataset, computes silhouette scores for it, and
    compares them with a known-good run on the same data (which was
    laboriously checked via spreadsheet). Same data in and same methods
    should give the same result, give or take a small amount of floating
    point error.

    Returns:
        The outcome of comparing the summed absolute difference between the
        stored and the freshly computed results with ``1e-10``.
        NOTE(review): ``np.sum`` collapses an axis before the comparison, so
        this is presumably one boolean per column rather than per station --
        confirm against what ``silhouettes`` returns.
    """

    def dmu(df, home):
        """Return a copy of ``df`` with an added ``'mu_' + home`` column.

        The new column holds the Euclidean distance, over columns ``x`` and
        ``y``, from every row to the row labelled ``home``.

        NOTE(review): nothing below references this helper -- presumably it
        is the reference metric behind the spreadsheet check; confirm whether
        it should be passed to ``silhouettes`` or removed.
        """
        homedat = df.loc[[home]]
        cbd = df.copy()
        squared = 0
        for ivar in ['x', 'y']:
            # .iloc[0] takes the first matching row by position; a bare [0]
            # would be a label lookup on an integer index and relies on a
            # deprecated positional fallback on a string index.
            squared = squared + (df[ivar] - homedat[ivar].iloc[0]) ** 2
        # Accumulating in a local avoids the scratch 'mu' column that the
        # earlier version created and then failed to drop from the result.
        cbd['mu_' + home] = np.sqrt(squared)
        return cbd

    # The centroid fixture is loaded but not handed to silhouettes() below.
    # NOTE(review): confirm whether it should be, or drop the read.
    t0 = pd.read_csv(noaafile('testfiles/sil0_troids.csv'))
    d0 = pd.read_csv(noaafile('testfiles/sil0_data.csv'))
    silres = pd.read_csv(noaafile('testfiles/sil0_result.csv'))
    silres.set_index('station', inplace=True)
    newsil = noaa_kmeans.silhouettes(d0, 12, 'sil')
    # there may be a little decimal dust, but that's ok
    return np.sum(np.abs(silres - newsil)) < 1e-10
def test_centroid_restore(stationstats, metric_function, fname=noaafile('centroids/internal/t0.csv')):
    """Ensure that centroids can be saved and restored properly.

    Creates a small centroid / station-assignment pair with
    ``noaa_kmeans.evolve_kmeans``, writes the centroids to ``fname``, reloads
    them through ``noaa_kmeans.centroid_file`` and compares the ``vtx``
    values of the original and the restored station assignments.

    Arguments:
        stationstats -- DataFrame containing station statistics
        metric_function -- Function quantifying degree of climate difference
        fname -- filename to save centroids to; the file is deleted again
                 before this function returns or raises

    Returns:
        Truthy when the summed absolute difference between the two ``vtx``
        sets is exactly zero.
    """
    t0, d0 = noaa_kmeans.evolve_kmeans(stationstats, metric_function, gens=2, cct=10)
    try:
        t0.to_csv(fname)
        _restored, d2 = noaa_kmeans.centroid_file(fname, stationstats, metric_function)
    finally:
        # Clean up even when saving or restoring fails, so a broken run does
        # not leave a stale centroid file behind. The existence check keeps a
        # failed write from being masked by a FileNotFoundError here.
        if os.path.exists(fname):
            os.remove(fname)
    return 0 == np.sum(np.abs(d0.vtx - d2.vtx))