import numpy as np
import matplotlib.pyplot as plt

import datasets

plt.rc('font', family='Arial', size=10)

# Load all the DAN data
X, Y, X_error, names = datasets.read_dan_data()

# Specify which DAN measurements we want to plot
h1 = 'DNB_456787389EAC06680361170_______M1'    # 2.6 H, 0.0169 BNACS
h2 = 'DNB_455739444EAC06560341120_______M1'    # 3.3 H, 0.0169 BNACS
acs1 = 'DNB_456787389EAC06680361170_______M1'  # 2.6 H, 0.0169 BNACS
acs2 = 'DNB_459287442EAC06960391552_______M1'  # 2.6 H, 0.0143 ACS

# Get the matching data
h1_idx = np.where(names == h1)
h2_idx = np.where(names == h2)
acs1_idx = np.where(names == acs1)
acs2_idx = np.where(names == acs2)

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# The last entry of time_bins_dan marks the end of the final bin, so drop it
# to keep only the bin start times
time_bins = datasets.time_bins_dan[:-1]
# Avoid error with x-axis log scale (the first bin starts at 0)
time_bins[0] = 1e-20
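
# Plausible continuation (an assumption, not the original code): plot the
# die-away curves for the H-varying pair and the ACS-varying pair, assuming
# the first len(time_bins) features of X are the thermal counts.
n = len(time_bins)
ax1.plot(time_bins, X[h1_idx][0][:n], label='2.6 wt% H')
ax1.plot(time_bins, X[h2_idx][0][:n], label='3.3 wt% H')
ax2.plot(time_bins, X[acs1_idx][0][:n], label='0.0169 BNACS')
ax2.plot(time_bins, X[acs2_idx][0][:n], label='0.0143 BNACS')
for ax in (ax1, ax2):
    ax.set_xscale('log')
    ax.set_xlabel('Time bin start (us)')
    ax.set_ylabel('Counts')
    ax.legend()
plt.show()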

import argparse

import numpy as np
from sklearn.decomposition import PCA

import datasets

# Parse command line arguments
parser = argparse.ArgumentParser()
parser.add_argument('--n_components', type=int, default=3,
                    help='number of principal components to use for PCA')
parser.add_argument(
    '--use_restricted_bins', action='store_true',
    help='only use bins 18-34 and 13-17 for thermal and epithermal')
parser.add_argument('--score', help='likelihood or mse')
args = parser.parse_args()

X, Y, Y_err, names = datasets.read_dan_data()
n_meas = X.shape[0]
if args.use_restricted_bins:
    X = np.take(X, list(range(17, 34)) + list(range(64 + 12, 64 + 17)), axis=1)

# Score each measurement for novelty against the measurements before it
scores = []
for i in range(n_meas):
    # Need a few measurements on hand before a PCA model can be fit
    if i < 5:
        scores.append(0)
        continue
    pca = PCA(n_components=args.n_components)
    # Build a model using measurements up to this point
    pca.fit(X[:i, :])
    # Get the log likelihood that the new sample belongs to the distribution
    # of data in the previous sols
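    # (The continuation below is an assumption, not the original code:
    # sklearn's PCA.score() returns this average log-likelihood, and a
    # reconstruction MSE covers the '--score mse' option declared above.)
    if args.score == 'likelihood':
        scores.append(pca.score(X[i:i + 1, :]))
    else:
        recon = pca.inverse_transform(pca.transform(X[i:i + 1, :]))
        scores.append(np.mean((X[i:i + 1, :] - recon) ** 2))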

import argparse

import numpy as np
from sklearn.decomposition import PCA

import datasets

# Parse command line arguments
parser = argparse.ArgumentParser()
# NOTE: the flag name here is an assumption; the original excerpt is truncated
# just before it
parser.add_argument('--n_neighbors',
                    type=int, default=2, help='number of neighbors to evaluate')
parser.add_argument('--n_components', type=int, default=3,
                    help='number of principal components to use for PCA')
parser.add_argument(
    '--use_restricted_bins', action='store_true',
    help='only use bins 18-34 and 13-17 for thermal and epithermal')
args = parser.parse_args()

# Load the data sets
X, y = datasets.read_acs_grid_data()
dan_X, dan_y, _, names = datasets.read_dan_data()

# Normalize counts to approximately same range
X = datasets.normalize_counts(X)
dan_X = datasets.normalize_counts(dan_X)

if args.use_restricted_bins:
    n_bins = 64
    restricted = list(range(17, 34)) + list(range(n_bins + 12, n_bins + 17))
    X = np.take(X, restricted, axis=1)
    dan_X = np.take(dan_X, restricted, axis=1)

# Project the data into principal subspace of model data
pca = PCA(n_components=args.n_components)
pca.fit(X)
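
# Hedged sketch of a plausible next step (not in the original excerpt):
# project both data sets into the model PC space and find each DAN
# measurement's nearest grid points, using the --n_neighbors flag parsed above.
from sklearn.neighbors import NearestNeighbors

X_t = pca.transform(X)
dan_t = pca.transform(dan_X)
nn = NearestNeighbors(n_neighbors=args.n_neighbors).fit(X_t)
distances, indices = nn.kneighbors(dan_t)
# y[indices] holds the grid labels of each DAN measurement's neighbors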

parser.add_argument(
    '--plot_sol_hist', action='store_true',
    help='plot frequency of sol occurrence for DANs in each cluster')
parser.add_argument(
    '--plot_geochem_hist', action='store_true',
    help='plot histogram of H and ACS values for DANs in each cluster')
# parser.add_argument('--plot_cluster_centers', action='store_true',
#     help='plot DAN at center point of each cluster (as opposed to mean measurement)')
# parser.add_argument('--plot_cluster_means', action='store_true',
#     help='plot mean measurement of each cluster (as opposed to center point)')
parser.add_argument(
    '--use_restricted_bins', action='store_true',
    help='only run analysis for bins 18-34 (CTN) and 12-17 (CETN)')
args = parser.parse_args()

X, Y, Y_err, X_filenames = datasets.read_dan_data()
time_bins = datasets.time_bins_dan
n_bins = 64
if args.use_restricted_bins:
    X = np.take(X, list(range(17, 34)) + list(range(n_bins + 12, n_bins + 17)), axis=1)

# Fit PCA model and project data into PC space (and back out)
pca = PCA(n_components=args.n_components)
pca.fit(X)
print(sum(pca.explained_variance_ratio_))
transformed = pca.transform(X)
# gm = GaussianMixture(n_components=args.n_gaussians).fit(transformed)
# assignments = gm.predict(transformed)
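
# Hedged sketch (an assumption): the commented-out lines above point at
# Gaussian mixture clustering of the PCA-transformed measurements; this is
# one way to wire it up, assuming an --n_gaussians flag is parsed earlier.
from sklearn.mixture import GaussianMixture

gm = GaussianMixture(n_components=args.n_gaussians).fit(transformed)
assignments = gm.predict(transformed)
for k in range(args.n_gaussians):
    print('cluster %d: %d measurements' % (k, np.sum(assignments == k)))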

# import matplotlib
# matplotlib.use('TkAgg')
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

import datasets

plt.rc('font', family='Arial', size=10)

X_d, _, _, _ = datasets.read_dan_data()
# X_d = datasets.normalize_counts(X_d)
X_d = np.take(X_d, list(range(17, 34)) + list(range(64 + 12, 64 + 17)), axis=1)
X_m, _ = datasets.read_highweh_grid()
# X_m = datasets.normalize_counts(X_m)

# Compare how much variance k principal components capture in the DAN data
# versus the model grid
variances_d = []
variances_m = []
for k in range(1, 11):
    pca_d = PCA(n_components=k)
    pca_m = PCA(n_components=k)
    pca_d.fit(X_d)
    pca_m.fit(X_m)
    print(k)
    print('DAN')
    print(pca_d.explained_variance_ratio_)
    print(sum(pca_d.explained_variance_ratio_))
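    # (Plausible continuation, an assumption mirroring the DAN prints above:
    # report the model grid and record both cumulative variances for plotting.)
    print('model grid')
    print(pca_m.explained_variance_ratio_)
    print(sum(pca_m.explained_variance_ratio_))
    variances_d.append(sum(pca_d.explained_variance_ratio_))
    variances_m.append(sum(pca_m.explained_variance_ratio_))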

    # (the excerpt picks up inside an earlier branch of the train/test split)
    n_test = int(X.shape[0] * args.testing_percentage / 100.0)
    X_test = X[:n_test]
    Y_test = Y[:n_test]
    X_train = X[n_test:]
    Y_train = Y[n_test:]
elif args.test_sebina:
    X, Y = datasets.read_acs_grid_data()
    n_test = int(X.shape[0] * args.testing_percentage / 100.0)
    X_test = X[:n_test]
    Y_test = Y[:n_test]
    X_train = X[n_test:]
    Y_train = Y[n_test:]
elif args.test_dan:
    if args.model_grid == 'hardgrove2011':
        X, Y = datasets.read_sim_data(use_dan_bins=True)
        X_test, Y_test, Y_test_error, test_names = datasets.read_dan_data(
            limit_2000us=True)
        n_bins = 34
    elif args.model_grid == 'sabina':
        X, Y = datasets.read_grid_data()
        X_test, Y_test, Y_test_error, test_names = datasets.read_dan_data(
            limit_2000us=False, label_source='iki')
        n_bins = len(datasets.time_bins_dan) - 1
    elif args.model_grid == 'acs':
        X, Y = datasets.read_acs_grid_data()
        X_test, Y_test, Y_chi2, test_names = datasets.read_dan_data(
            limit_2000us=False, label_source='asu')
        n_bins = len(datasets.time_bins_dan) - 1
    elif args.model_grid == 'both':
        X_full, Y_full = datasets.read_sim_data(use_dan_bins=True)
        X_rover, Y_rover = datasets.read_grid_data(limit_2000us=True)
        X = np.concatenate([X_full, X_rover])
        Y = np.concatenate([Y_full, Y_rover])
        X_test, Y_test, Y_test_error, test_names = datasets.read_dan_data(
            limit_2000us=True)
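
# Illustrative sketch only (an assumption, not the original pipeline): with a
# model grid X/Y and a test set chosen above, any regressor can map die-away
# curves to geochemistry, e.g. a 1-nearest-neighbor lookup into the grid.
from sklearn.neighbors import KNeighborsRegressor

model = KNeighborsRegressor(n_neighbors=1).fit(X, Y)
predictions = model.predict(X_test)
print('MSE: %g' % np.mean((predictions - Y_test) ** 2))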

import argparse

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (enables 3D projection)
from sklearn.decomposition import PCA

import datasets

# Parse command line arguments
parser = argparse.ArgumentParser()
parser.add_argument('--n_components', type=int, default=3,
                    help='number of principal components to use for PCA')
parser.add_argument('--normalize', action='store_true',
                    help='normalize the data before PCA')
args = parser.parse_args()

X_sebina, Y_sebina = datasets.read_acs_grid_data()
print(Y_sebina.shape)
X_dan, Y_dan, err_dan, names_dan = datasets.read_dan_data()
print(Y_dan.shape)
time_bins = datasets.time_bins_dan
n_bins = 64

if args.normalize:
    X_sebina = datasets.normalize_counts(X_sebina)
    X_dan = datasets.normalize_counts(X_dan)

pca = PCA(n_components=args.n_components)
X_t = pca.fit_transform(X_sebina)

# Plot the Sebina grid points in PC space
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1, projection='3d')
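
# Plausible continuation (an assumption): scatter the grid in the first three
# principal components and project the DAN measurements into the same space
# (requires --n_components >= 3).
ax1.scatter(X_t[:, 0], X_t[:, 1], X_t[:, 2], marker='o', label='ACS grid')
dan_t = pca.transform(X_dan)
ax1.scatter(dan_t[:, 0], dan_t[:, 1], dan_t[:, 2], marker='^', label='DAN')
ax1.set_xlabel('PC 1')
ax1.set_ylabel('PC 2')
ax1.set_zlabel('PC 3')
ax1.legend()
plt.show()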