Python import_data示例，clean1.import_data Python示例

示例#1

0

显示文件

文件： clustering.py 项目： gmiers7642/exoplanets

def unit004():
    """Cluster the filled planet table on seven columns and plot the result.

    Reads ``../data/planets.csv`` via ``c.import_data``, keeps the columns
    listed in ``cols_phys``, fills the table with ``kmeans_centroid_fill``,
    runs ``agg_clustering`` with three clusters on a hand-picked column
    subset, and passes the first SVD factor of that subset together with
    the cluster labels to ``color_coded_pc_plot``.

    Returns nothing; output is console progress messages plus whatever
    ``color_coded_pc_plot`` produces.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Importing data...")
    df = c.import_data('../data/planets.csv')

    # Extract columns
    cols_phys = ['pl_orbper', 'pl_orbsmax', 'pl_orbeccen', 'pl_orbincl',
                 'pl_bmassj', 'pl_radj', 'pl_dens', 'st_dist',
                 'st_optmag', 'st_teff', 'st_mass', 'st_rad',
                 'st_logg', 'st_dens', 'st_lum', 'pl_rvamp',
                 'pl_eqt', 'st_plx', 'st_age', 'st_vsini',
                 'st_acts']
    df_p = c.get_physical_columns(df, cols_phys)

    # Pre-process the data (imputation). The labels returned alongside the
    # filled frame are not needed here.
    # NOTE(review): the meaning of the 3 and 10 arguments is defined by
    # kmeans_centroid_fill -- confirm against that helper.
    print("Applying pre-processing...")
    _, df_imputed = kmeans_centroid_fill(df_p, 3, 10)

    # Columns selected from previous analysis of 21 columns
    print("Clustering...")
    cols = ['st_age', 'pl_orbsmax', 'pl_bmassj', 'st_plx', 'st_dist',
            'st_lum', 'st_mass']
    n_clusters = 3
    ac, df_select = agg_clustering(df_imputed, cols, n_clusters)
    labels = ac.labels_

    # Create svd; only the first factor is used for plotting.
    u, _, _ = svd(df_select)

    # Create the color coded pc plot
    print("Creating pc plot...")
    color_coded_pc_plot(u, labels)

示例#2

0

显示文件

文件： clustering.py 项目： gmiers7642/exoplanets

def unit003():
    """Plot the leading principal-component features of all 21 columns.

    Reads ``../data/planets.csv`` via ``c.import_data``, keeps the columns
    listed in ``cols_phys``, fills the table with ``kmeans_centroid_fill``,
    takes the SVD of the filled table, and passes the third SVD factor and
    the column names to ``plot_pc_features``.

    Returns nothing; output is console progress messages plus whatever
    ``plot_pc_features`` produces.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Importing data...")
    df = c.import_data('../data/planets.csv')

    # Extract columns
    cols_phys = ['pl_orbper', 'pl_orbsmax', 'pl_orbeccen', 'pl_orbincl',
                 'pl_bmassj', 'pl_radj', 'pl_dens', 'st_dist',
                 'st_optmag', 'st_teff', 'st_mass', 'st_rad',
                 'st_logg', 'st_dens', 'st_lum', 'pl_rvamp',
                 'pl_eqt', 'st_plx', 'st_age', 'st_vsini',
                 'st_acts']
    df_p = c.get_physical_columns(df, cols_phys)

    # Pre-process the data (imputation) and create the svd for plotting.
    # Only the filled frame and the third SVD factor are used below.
    print("Applying pre-processing")
    _, df_imputed = kmeans_centroid_fill(df_p, 3, 10)
    _, _, vt = svd(df_imputed)

    # Create plot of most important components
    # NOTE(review): the 4 is presumably the number of components to plot --
    # confirm against plot_pc_features.
    print("Creating feature plots")
    plot_pc_features(vt, 4, df_imputed.columns, "21 columns")

示例#3

0

显示文件

文件： clustering.py 项目： gmiers7642/exoplanets

def unit002():
    """Agglomeratively cluster the physical columns and scatter-plot them.

    Reads ``../data/planets.csv`` via ``c.import_data``, keeps the columns
    listed in ``cols_phys``, passes them through ``log_all`` and
    ``fill_avg``, fits a three-cluster ``AgglomerativeClustering`` model on
    the result, and calls ``c.physical_scatterplot`` with the cluster
    labels as colors.

    Returns nothing; output is a console message plus whatever
    ``c.physical_scatterplot`` produces.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Importing data")
    df = c.import_data('../data/planets.csv')

    # Extract columns
    cols_phys = ['pl_orbper', 'pl_orbsmax', 'pl_orbeccen', 'pl_orbincl',
                 'pl_bmassj', 'pl_radj', 'pl_dens', 'st_dist',
                 'st_optmag', 'st_teff', 'st_mass', 'st_rad',
                 'st_logg', 'st_dens', 'st_lum', 'pl_rvamp',
                 'pl_eqt', 'st_plx', 'st_age', 'st_vsini',
                 'st_acts']
    df_p = c.get_physical_columns(df, cols_phys)

    # Pre-process the data: run log_all over every column, then fill_avg.
    df_p_log = log_all(df_p, df_p.columns)
    df_p_avg = fill_avg(df_p_log)

    # Actual agglomerative clustering
    n_clusters = 3
    ac = AgglomerativeClustering(n_clusters=n_clusters)
    ac.fit(df_p_avg)
    ac_labels = ac.labels_

    # Scatterplotting of agglomerative clusters.
    # NOTE(review): the two slices skip index 10 ('st_mass'), giving two
    # groups of ten columns -- confirm the omission is intentional.
    cph1 = cols_phys[0:10]
    cph2 = cols_phys[11:]
    c.physical_scatterplot(df_p, cph1, cph2, logcols=df_p.columns,
                           colors=ac_labels, alpha=0.2)

示例#4

0

显示文件

文件： clustering.py 项目： gmiers7642/exoplanets

def unit001():
    """Scatter-plot the physical columns colored by k-means fill labels.

    Reads ``../data/planets.csv`` via ``c.import_data``, keeps the columns
    listed in ``cols_phys``, obtains labels from ``kmeans_centroid_fill``,
    and calls ``c.physical_scatterplot`` on the un-filled physical columns
    with those labels as colors.

    Returns nothing; output is console progress messages plus whatever
    ``c.physical_scatterplot`` produces.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Importing data")
    df = c.import_data('../data/planets.csv')

    # Extract columns
    cols_phys = ['pl_orbper', 'pl_orbsmax', 'pl_orbeccen', 'pl_orbincl',
                 'pl_bmassj', 'pl_radj', 'pl_dens', 'st_dist',
                 'st_optmag', 'st_teff', 'st_mass', 'st_rad',
                 'st_logg', 'st_dens', 'st_lum', 'pl_rvamp',
                 'pl_eqt', 'st_plx', 'st_age', 'st_vsini',
                 'st_acts']
    df_p = c.get_physical_columns(df, cols_phys)

    # Only the labels are used below; the filled frame is discarded.
    # NOTE(review): the meaning of the 3 and 10 arguments is defined by
    # kmeans_centroid_fill -- confirm against that helper.
    print("Creating clusters")
    labels, _ = kmeans_centroid_fill(df_p, 3, 10)

    # Split columns into two groups.
    # NOTE(review): the two slices skip index 10 ('st_mass'), giving two
    # groups of ten columns -- confirm the omission is intentional.
    cph1 = cols_phys[0:10]
    cph2 = cols_phys[11:]

    print("Creating plots")
    c.physical_scatterplot(df_p, cph1, cph2, logcols=df_p.columns,
                           colors=labels, alpha=0.2)

示例#5

0

显示文件

文件： create_clusters.py 项目： gmiers7642/exoplanets

def create_clusters_dbscan():
    """Embed the filled planet table with ``bh_sne`` and cluster it with DBSCAN.

    Seeds NumPy's global RNG, reads ``../data/planets.csv`` via
    ``c.import_data``, fills the physical columns with
    ``cl.kmeans_centroid_fill``, computes a two-column embedding with
    ``bh_sne``, and fits ``DBSCAN(eps=2.1, min_samples=12)`` on it.

    Two figures are written:

    * ``../data/QC010_TSNE_background.png`` -- the raw embedding.
    * ``../data/QC011_TSNE_clustering_w_sizes.png`` -- the embedding colored
      and sized by DBSCAN label.

    Returns nothing.
    """
    np.random.seed(seed=12509234)

    df = c.import_data('../data/planets.csv')

    # Extract columns
    cols_phys = [
        'pl_orbper', 'pl_orbsmax', 'pl_orbeccen', 'pl_orbincl', 'pl_bmassj',
        'pl_radj', 'pl_dens', 'st_dist', 'st_optmag', 'st_teff', 'st_mass',
        'st_rad', 'st_logg', 'st_dens', 'st_lum', 'pl_rvamp', 'pl_eqt',
        'st_plx', 'st_age', 'st_vsini', 'st_acts'
    ]
    df_p = c.get_physical_columns(df, cols_phys)

    # Fill the table; the labels returned alongside it are not needed here.
    _, df_imputed = cl.kmeans_centroid_fill(df_p, 3, 10)

    # Create TSNE embedding
    vis_data_transit = bh_sne(df_imputed, perplexity=40)
    vis_x_transit = vis_data_transit[:, 0]
    vis_y_transit = vis_data_transit[:, 1]

    # Create a background plot of TSNE embedding
    # (the second embedding column is drawn on the horizontal axis).
    plt.figure(figsize=(12, 8))
    plt.scatter(vis_y_transit,
                vis_x_transit,
                c=['blue'],
                cmap=plt.cm.get_cmap("jet", 10),
                alpha=0.2)
    plt.savefig("../data/QC010_TSNE_background.png")

    # DBSCAN clustering
    X = np.array([vis_x_transit, vis_y_transit]).T
    dbs = DBSCAN(eps=2.1, min_samples=12)
    dbs.fit(X)
    labels = dbs.labels_

    # DBSCAN marks noise samples with label -1. The original size formula
    # (10 * label + 1) then produced a negative marker size, and the color
    # limits/ticks assumed labels start at 0. Clamp the size to at least 1
    # and derive ticks/limits from the labels actually present; when there
    # is no noise this gives exactly the original values.
    unique_labels = np.unique(labels)
    n_clusters = len(unique_labels)
    sizes = np.maximum(10 * labels + 1, 1)

    # Generate clustering plot from TSNE
    plt.figure(figsize=(15, 12))
    plt.scatter(vis_y_transit,
                vis_x_transit,
                c=labels,
                cmap=plt.cm.get_cmap("jet", n_clusters),
                alpha=1.0,
                s=sizes)
    plt.colorbar(ticks=unique_labels)
    plt.clim(unique_labels.min() - 0.5, unique_labels.max() + 0.5)
    plt.savefig("../data/QC011_TSNE_clustering_w_sizes.png")

示例#6

0

显示文件

文件： clustering.py 项目： gmiers7642/exoplanets

def unit005():
    """Select the seven best features and plot their principal components.

    Reads ``../data/planets.csv`` via ``c.import_data``, keeps the columns
    listed in ``cols_phys``, fills the table with ``kmeans_centroid_fill``,
    uses ``svd`` + ``get_n_best`` to pick seven feature names, prints them,
    then takes the SVD of just those columns and passes its third factor to
    ``plot_pc_features``.

    Returns nothing; output is console messages plus whatever
    ``plot_pc_features`` produces.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Importing data...")
    df = c.import_data('../data/planets.csv')

    # Extract columns
    cols_phys = ['pl_orbper', 'pl_orbsmax', 'pl_orbeccen', 'pl_orbincl',
                 'pl_bmassj', 'pl_radj', 'pl_dens', 'st_dist',
                 'st_optmag', 'st_teff', 'st_mass', 'st_rad',
                 'st_logg', 'st_dens', 'st_lum', 'pl_rvamp',
                 'pl_eqt', 'st_plx', 'st_age', 'st_vsini',
                 'st_acts']
    df_p = c.get_physical_columns(df, cols_phys)

    # Pre-process the data (imputation); the returned labels are not used.
    print("Applying pre-processing...")
    _, df_imputed = kmeans_centroid_fill(df_p, 3, 10)

    # Create svd; only the third factor feeds the feature ranking.
    _, _, vt = svd(df_imputed)

    # Select top 7 features
    top7 = get_n_best(vt, 7, df_imputed.columns)
    top7_names = top7['features'].values

    df_top_7 = df_imputed[top7_names]
    print(top7_names)

    # Create svd again, this time on the selected columns only.
    _, _, vt7 = svd(df_top_7)

    # Create plot of most important components
    print("Creating feature plots")
    plot_pc_features(vt7, 4, df_top_7.columns, "7 columns")

示例#7

0

显示文件

文件： create_clusters.py 项目： gmiers7642/exoplanets

def svd_cluster_analysis():
    """Compare transit-planet clusters against a hand-entered Earth/Sun row.

    Steps, in order:

    1. Load ``../data/planets.csv`` and fill the physical columns with
       ``cl.kmeans_centroid_fill``.
    2. Split the filled table by ``pl_discmethod`` into 'Transit' and
       'Radial Velocity' rows.
    3. Rank the features of each subset with ``svd`` + ``cl.get_n_best`` and
       cluster each subset with ``cl.agg_clustering`` (three clusters).
    4. Compute per-cluster means for the transit subset, divide them by the
       full-subset mean, add a 'sun' row, and show a bar plot.

    Returns nothing; output is console progress messages and a plot window.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Importing data...")
    df = c.import_data('../data/planets.csv')

    # Extract columns
    cols_phys = [
        'pl_orbper', 'pl_orbsmax', 'pl_orbeccen', 'pl_orbincl', 'pl_bmassj',
        'pl_radj', 'pl_dens', 'st_dist', 'st_optmag', 'st_teff', 'st_mass',
        'st_rad', 'st_logg', 'st_dens', 'st_lum', 'pl_rvamp', 'pl_eqt',
        'st_plx', 'st_age', 'st_vsini', 'st_acts'
    ]
    df_p = c.get_physical_columns(df, cols_phys)

    # Fill the table; the labels returned alongside it are not needed here.
    print("Creating imputed data...")
    _, df_imputed = cl.kmeans_centroid_fill(df_p, 3, 10)

    print("Separating transit and radial velocity data...")
    # Temporarily attach the discovery method so rows can be filtered on it,
    # then drop it again so only physical columns remain.
    # axis is passed by keyword: the positional form was removed in pandas 2.0.
    df_imputed['pl_discmethod'] = df['pl_discmethod']
    is_transit = df_imputed['pl_discmethod'] == 'Transit'
    is_radialv = df_imputed['pl_discmethod'] == 'Radial Velocity'
    df_p_transit = df_imputed[is_transit].drop('pl_discmethod', axis=1)
    df_p_radialv = df_imputed[is_radialv].drop('pl_discmethod', axis=1)

    # SVDs
    print("Determining relevances of features...")
    _, _, vtt = svd(df_p_transit)
    _, _, vtr = svd(df_p_radialv)

    transit_relevances = cl.get_n_best(vtt, 21, df_p_transit.columns)
    radial_relevances = cl.get_n_best(vtr, 21, df_p_radialv.columns)

    # So, 11 pcs for transits. The original took these from
    # radial_relevances (copy-paste slip), leaving transit_relevances unused.
    cols_transit = transit_relevances['features'].values[0:11]
    # And, 7 for the radial velocity cases
    cols_radialv = radial_relevances['features'].values[0:7]

    print("Performing cluster analysis...")
    n_clusters = 3

    # Transit
    ac_transit, df_select_transit = cl.agg_clustering(df_p_transit,
                                                      cols_transit, n_clusters)
    labels_transit = ac_transit.labels_

    # Radial velocity (computed for parity; not used further in this function)
    ac_radialv, df_select_radialv = cl.agg_clustering(df_p_radialv,
                                                      cols_radialv, n_clusters)
    labels_radialv = ac_radialv.labels_

    print("Creating plotting attributes for pca cluster analysis...")
    uts, sts, vtts = svd(df_p_transit)
    urs, srs, vtrs = svd(df_p_radialv)

    # Plotting the first two pc clusters, commented out until ready to export
    #cl.color_coded_pc_plot(uts[:,0:], labels_transit, xlim=None, ylim=None)

    # Shift labels to 1..3 so they match the g1/g2/g3 naming below.
    df_p_transit['label'] = labels_transit + 1

    print("Creating separated groups for svd clusters...")
    df1 = df_p_transit[df_p_transit['label'] == 1]
    df2 = df_p_transit[df_p_transit['label'] == 2]
    df3 = df_p_transit[df_p_transit['label'] == 3]

    print("Creating normailzed summary statistics...")
    df_sst = pd.DataFrame({
        'full_mean': df_p_transit.mean(),
        'full_std': df_p_transit.std(),
        'g1_mean': df1.mean(),
        'g1_std': df1.std(),
        'g2_mean': df2.mean(),
        'g2_std': df2.std(),
        'g3_mean': df3.mean(),
        'g3_std': df3.std()
    }).T

    group_rows = ['g1_mean', 'g2_mean', 'g3_mean']
    disp_cols = [
        'pl_orbper', 'pl_orbsmax', 'pl_orbeccen', 'pl_bmassj', 'pl_radj',
        'st_lum', 'pl_rvamp', 'st_vsini'
    ]
    df_means = df_sst.loc[['full_mean'] + group_rows, :]
    df_means_n = df_means[disp_cols].copy()

    # Express every cluster mean as a multiple of the full-subset mean.
    # The 'full_mean' row itself is left untouched so it can be reused below.
    for row in group_rows:
        df_means_n.loc[row] = (df_means_n.loc[row]
                               / df_means_n.loc['full_mean'])

    # Bar plot highlighting differences in features between the columns, commented out until ready
    #print "Creating bar plot of svd clusters..."
    #df_means_n.T[['g1_mean', 'g2_mean', 'g3_mean']].plot(figsize=(15,8), kind='bar');
    #plt.show()

    print("Entering data on the Earth / Sun system for comparison...")
    df_s = pd.DataFrame(
        {
            'pl_orbper': 365.24,
            'pl_orbsmax': 1.00001018,
            'pl_orbeccen': 0.0167086,
            'pl_orbincl': 1.578690,
            'pl_bmassj': 0.0911301,
            'pl_radj': 0.00314442,
            'pl_dens': 5.514,
            'st_dist': 0.0,
            'st_optmag': 4.75,
            'st_teff': 5777.0,
            'st_mass': 1.0,
            'st_rad': 1.0,
            'st_logg': 1.447468,
            'st_dens': 1.41,
            'st_lum': 0.0,
            'pl_rvamp': 0.1,
            'pl_eqt': 300.0,
            'st_plx': 21600.0,
            'st_age': 4.6,
            'st_vsini': 0.46511,
            'st_acts': 0.65
        },
        index=['sun'])

    df_s = df_s.loc[:, disp_cols]
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df_means_n = pd.concat([df_means_n, df_s])

    # Critical normalizing step: the sun row is entered as raw values, so it
    # gets log(x + 1) applied before being divided by the full-subset mean.
    # NOTE(review): this presumes the filled table is on that same log scale
    # -- confirm against cl.kmeans_centroid_fill.
    df_means_n.loc['sun'] = (np.log(df_means_n.loc['sun'] + 1)
                             / df_means_n.loc['full_mean'])

    # Plot the comparison attributes with the sun / earth system included
    df_means_n.T[group_rows + ['sun']].plot(figsize=(15, 7), kind='bar')
    plt.show()

示例#8

0

显示文件

文件： create_clusters.py 项目： gmiers7642/exoplanets

def create_clusters_agg():
    """Cluster transit and radial-velocity planets and export the labels.

    Steps, in order:

    1. Load ``../data/planets.csv`` and fill the physical columns with
       ``cl.kmeans_centroid_fill``.
    2. Split the filled table by ``pl_discmethod`` into 'Transit' and
       'Radial Velocity' rows.
    3. Rank the features of each subset with ``svd`` + ``cl.get_n_best`` and
       cluster each subset with ``cl.agg_clustering`` (three clusters).
    4. Save a two-panel scatter plot to
       ``../data/QC001_Clusters_rad_and_trans.png``.
    5. Add a ``label`` column to each subset, merge them, and write
       ``../data/planets_physical_w_labels.csv``.

    Returns nothing.
    """
    ### Import data, and create imputed KMeans data

    # Extract columns
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Importing data...")
    df = c.import_data('../data/planets.csv')
    cols_phys = [
        'pl_orbper', 'pl_orbsmax', 'pl_orbeccen', 'pl_orbincl', 'pl_bmassj',
        'pl_radj', 'pl_dens', 'st_dist', 'st_optmag', 'st_teff', 'st_mass',
        'st_rad', 'st_logg', 'st_dens', 'st_lum', 'pl_rvamp', 'pl_eqt',
        'st_plx', 'st_age', 'st_vsini', 'st_acts'
    ]
    df_p = c.get_physical_columns(df, cols_phys)

    # Fill the table; the labels returned alongside it are not needed here.
    print("Imputing data...")
    _, df_imputed = cl.kmeans_centroid_fill(df_p, 3, 10)

    ### Transit and radial velocity group analysis

    # Split the data into transit and radial velocity groups.
    # The discovery method is attached only for filtering and dropped again.
    # axis is passed by keyword: the positional form was removed in pandas 2.0.
    print("Splitting into transit and radial velocity sets...")
    df_imputed['pl_discmethod'] = df['pl_discmethod']
    is_transit = df_imputed['pl_discmethod'] == 'Transit'
    is_radialv = df_imputed['pl_discmethod'] == 'Radial Velocity'
    df_p_transit = df_imputed[is_transit].drop('pl_discmethod', axis=1)
    df_p_radialv = df_imputed[is_radialv].drop('pl_discmethod', axis=1)

    # SVDs for feature selection; the first factors are reused for plotting.
    print("Creating feature seleciton SVDs...")
    ut, _, vtt = svd(df_p_transit)
    ur, _, vtr = svd(df_p_radialv)

    # Feature selection, 11 for transiting, 7 for radial velocity, based on prior analysis
    print("Performing feature selection...")
    transit_relevances = cl.get_n_best(vtt, 21, df_p_transit.columns)
    radial_relevances = cl.get_n_best(vtr, 21, df_p_radialv.columns)
    # The original took the transit columns from radial_relevances
    # (copy-paste slip), leaving transit_relevances unused.
    cols_transit = transit_relevances['features'].values[0:11]
    cols_radialv = radial_relevances['features'].values[0:7]

    ### Agglomerative clustering based on optimal parameters from prior analysis
    print("Creating clusters...")
    n_clusters = 3

    # Transit
    ac_transit, _ = cl.agg_clustering(df_p_transit, cols_transit, n_clusters)
    labels_transit = ac_transit.labels_

    # Radial velocity
    ac_radialv, _ = cl.agg_clustering(df_p_radialv,
                                      cols_radialv,
                                      n_clusters,
                                      linkage='average',
                                      affinity='cosine')
    labels_radialv = ac_radialv.labels_

    ### Plot the data clusters for both the transit and radial velocity cases to make sure that everything went ok
    print("Creating plots...")
    plt.figure(figsize=(20, 8))
    ax1 = plt.subplot(1, 2, 1)
    ax1.scatter(ut[:, 0], ut[:, 1], c=labels_transit, s=45, alpha=0.05)
    ax1.set_title("Transit clusters")
    ax1.set_xlim((-0.022, -0.016))
    ax1.legend(labels_transit)
    ax2 = plt.subplot(1, 2, 2)
    ax2.scatter(ur[:, 0], ur[:, 1], c=labels_radialv, s=45, alpha=0.2)
    ax2.set_title("Radial Velocity clusters")
    ax2.legend(labels_radialv)
    plt.suptitle(
        "Cluster labeled scatterplots for transit and radial velocity discoveries"
    )
    plt.savefig("../data/QC001_Clusters_rad_and_trans.png")

    ### Data cleanup and addition of ancillary information
    print("Data cleanup...")
    df_p_transit['label'] = labels_transit
    df_p_radialv['label'] = labels_radialv

    ### Merge data frames back together and output them to disk
    # NOTE(review): merge with how='outer' and no `on` joins on all shared
    # columns; presumably intended as a row-wise union -- confirm.
    print("Exporting data...")
    merged = df_p_transit.merge(df_p_radialv, how='outer')
    merged.to_csv("../data/planets_physical_w_labels.csv", index=False)