def linearRegressionDemo(conn):
    '''
       Demonstrate Linear Regression: train and score two models through
       ``LinearRegression(conn)`` and draw actual-vs-predicted scatter matrices.
    '''
    wine_features = ['1', 'alcohol', 'proline', 'hue', 'color_intensity', 'flavanoids']
    auto_features = ['1', 'height', 'width', 'length', 'highway_mpg', 'engine_size',
                     'make', 'fuel_type', 'fuel_system']

    # A first instance is constructed and never used again; the construction is
    # kept so the same calls are made against `conn`.
    LinearRegression(conn)

    # Train model and score on the wine tables.
    regressor = LinearRegression(conn)
    _wine_model, _wine_params = regressor.train('public.wine_training_set', wine_features, 'quality')
    wine_scored = regressor.predict('public.wine_test_set', 'quality')
    wine_scored.head()
    # Scatter matrix of actual vs predicted.
    scatter_matrix(wine_scored.get(['quality', 'prediction']), diagonal='kde')

    # 1 b) Linear Regression with categorical variables
    # We'll use the auto_mpg dataset from UCI :
    # http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.names
    # make, fuel_type, fuel_system are all categorical variables, rest are real.
    _auto_model, _auto_params = regressor.train('public.auto_mpg_train', auto_features, 'price')
    auto_scored = regressor.predict('public.auto_mpg_test', 'price')
    auto_scored.head()
    # Scatter plot of actual vs predicted values.
    scatter_matrix(auto_scored.get(['price', 'prediction']), diagonal='kde')
def scatter_plot(self, factor):
    """Draw a scatter matrix of the ranked values of one factor's columns.

    Column 1 is always placed first (the original comment calls it the
    "atac seq column"), followed by whatever
    ``get_factors_and_column_indices()`` lists under ``factor``.
    """
    indices_by_factor = self.get_factors_and_column_indices()
    selected = [1] + indices_by_factor[factor]
    ranked = self[selected].rank()
    scatter_matrix(ranked, figsize=(16, 16), alpha=0.05, color='black')
def corr_plots(self, append=True):
    """
    Plot every column of ``self._df`` pair-wise against the others with
    ``scatter_matrix`` and record the saved figure in ``self.list_fig_summary``.

    When ``append`` is false the summary list is cleared before the new entry
    is added.
    """
    plt.figure()

    diagonal = "hist"
    print(sys._getframe().f_code.co_name, diagonal)

    # Draw options forwarded to the off-diagonal plots.
    ptp.scatter_matrix(self._df, alpha=0.2, diagonal=diagonal, bins="log")

    summary = fig_summary()
    summary.label = "pair-wise correlation plots"
    summary.fig_path = self._output_dir
    summary.fig_rel_path = self._rel_dir + "correlation_plots_%s.png" % diagonal
    plt.savefig(summary.fig_path + summary.fig_rel_path)

    if not append:
        self.list_fig_summary.clear()
    self.list_fig_summary.append(summary)
def scatmat(df, category=None, colors='rgyb', num_plots=4, num_topics=100, num_columns=4,
            show=False, block=False, data_path=DATA_PATH, save=False, verbose=1):
    """Plot and/or save scatter matrices of `df` in groups of `num_columns` columns.

    FIXME (carried over from the original): empty plots that don't go away.

    Args:
        df: DataFrame whose columns are plotted `num_columns` at a time.
        category: column name in `df`, or a sequence of integer-like labels, used
            to colour the points. Defaults to the last column of `df`.
        colors: sequence of matplotlib colours indexed by ``label % len(colors)``.
        num_plots: maximum number of scatter matrices to draw.
        num_topics: upper bound on the number of columns considered.
        num_columns: number of columns per scatter matrix.
        show: call ``plt.show`` after each matrix.
        block: passed through to ``plt.show`` when `show` is true.
        data_path: directory the images are written to when `save` is true.
        save: write each matrix to a ``.jpg`` file.
        verbose: accepted for interface compatibility; not used.
    """
    if category is None:
        category = list(df.columns)[-1]
    try:
        is_column = category in df.columns
    except TypeError:
        # An unhashable value (list, array, Series) cannot be a column label;
        # treat it as the labels themselves instead of crashing.
        is_column = False
    category = df[category] if is_column else pd.Series(category)

    suffix = '{}x{}'.format(len(df), num_topics)
    save = bool(save)

    # The colours do not depend on the column group, so build them once.
    point_colors = [colors[int(x) % len(colors)] for x in category.values]

    # Each matrix consumes `num_columns` columns, so the column budget is divided
    # by `num_columns` (the original divided by `num_plots`, which only gave the
    # right count when the two happened to be equal).
    num_matrices = int(min(num_plots * num_columns, num_topics) / float(num_columns))
    for i in range(num_matrices):
        first, last = i * num_columns, (i + 1) * num_columns
        scatter_matrix(df[df.columns[first:last]],
                       marker='+', c=point_colors, figsize=(18, 12))
        if save:
            # Single extension, size suffix before it (the original emitted
            # "...jpg<suffix>.jpg").
            name = 'scatmat_topics_{}-{}_{}.jpg'.format(first, last, suffix)
            plt.savefig(os.path.join(data_path, name))
        if show:
            if block:
                plt.show()
            else:
                plt.show(block=False)
def plot_data(indf, prefix='html'):
    """Create scatter-matrix plots and per-column histograms, then an HTML index.

    Columns with more than 5 distinct values whose name does not contain
    'checkin' are split into groups of three; one scatter matrix is saved for
    every pair of distinct groups. A step histogram is saved for every column
    of `indf`. The list of written file names is handed to
    ``create_html_page_of_plots`` together with `prefix`.
    """
    list_of_plots = []

    column_list = [col for col in indf.columns
                   if len(indf[col].unique()) > 5 and 'checkin' not in col]

    column_groups = []
    for idx in range(0, len(column_list), 3):
        # Single-argument print calls give the same text on Python 2 and 3.
        print('%d %d %d' % (len(column_list), idx, idx + 3))
        column_groups.append(column_list[idx:(idx + 3)])

    for idx in range(len(column_groups)):
        # idy < idx always holds here, so no equality guard is needed.
        for idy in range(0, idx):
            pair_columns = column_groups[idx] + column_groups[idy]
            print(pair_columns)
            pl.clf()
            scatter_matrix(indf[pair_columns])
            fname = 'scatter_matrix_%d_%d.png' % (idx, idy)
            pl.savefig(fname)
            list_of_plots.append(fname)
            pl.close()

    for col in indf:
        pl.clf()
        print(col)
        # NOTE(review): `normed` was removed from newer matplotlib in favour of
        # `density`; left untouched because the target version is not visible here.
        indf[col].hist(histtype='step', normed=True)
        pl.title(col)
        fname = '%s_hist.png' % col
        pl.savefig(fname)
        list_of_plots.append(fname)

    create_html_page_of_plots(list_of_plots, prefix)
    return
def test_scatter_plot_legacy(self):
    """Both legacy spellings of ``scatter_matrix`` must raise a FutureWarning."""
    frame = pd.DataFrame(randn(100, 2))

    # Checked in the same order as before: `plotting` first, then `pd`.
    for namespace in (plotting, pd):
        with tm.assert_produces_warning(FutureWarning):
            namespace.scatter_matrix(frame)
def auto_pairs(plot_cols, df):
    """Draw a KDE-diagonal scatter matrix of ``df[plot_cols]`` on a 12x12 figure.

    Returns the string 'Done'.
    """
    import matplotlib.pyplot as plt
    try:
        from pandas.plotting import scatter_matrix
    except ImportError:
        # Older pandas only provides the legacy module path.
        from pandas.tools.plotting import scatter_matrix

    fig = plt.figure(figsize=(12, 12))
    fig.clf()
    ax = fig.gca()
    scatter_matrix(df[plot_cols], alpha=0.3, diagonal='kde', ax=ax)
    return 'Done'
def visualizeData(inputDF):
    """Save four exploratory plots of `inputDF` and print correlation rankings.

    Files written to the working directory: 'plot-scatter.png',
    'plot-correlations.png', 'plot-medianIncome.png' and
    'plot-correlations-02.png'. The correlations of every column with
    "median_house_value" are printed twice: once for the raw frame and once
    after three ratio columns are added to a copy. `inputDF` itself is not
    modified. Returns None.
    """

    ### ~~~ geographic scatter ~~~ ###
    outputFILE = 'plot-scatter.png'
    inputDF.plot(
        label    = 'population',
        kind     = 'scatter',
        x        = 'longitude',
        y        = 'latitude',
        s        = inputDF["population"] / 100,
        c        = 'median_house_value',
        cmap     = plt.get_cmap("jet"),
        colorbar = True,
        alpha    = 0.1
    )
    # The path is passed positionally: savefig's first parameter is `fname`,
    # and current matplotlib rejects a `filename=` keyword.
    plt.savefig(outputFILE, bbox_inches='tight', pad_inches=0.2)

    ### ~~~ scatter matrix of raw attributes ~~~ ###
    outputFILE = 'plot-correlations.png'
    corrMatrix = inputDF.corr()
    attributes = ["median_house_value","median_income","total_rooms","housing_median_age"]
    scatter_matrix(frame=inputDF[attributes], figsize=(12,8))
    plt.savefig(outputFILE, bbox_inches='tight', pad_inches=0.2)

    print('\ncorrMatrix["median_house_value"].sort_values(ascending=False)')
    print( corrMatrix["median_house_value"].sort_values(ascending=False) )

    ### ~~~ median income vs. house value ~~~ ###
    outputFILE = 'plot-medianIncome.png'
    inputDF.plot(
        kind    = 'scatter',
        x       = "median_income",
        y       = "median_house_value",
        alpha   = 0.1,
        figsize = (12,8)
    )
    plt.savefig(outputFILE, bbox_inches='tight', pad_inches=0.2)

    ### ~~~ scatter matrix with derived ratio attributes ~~~ ###
    outputFILE = 'plot-correlations-02.png'
    tempDF = inputDF.copy()
    tempDF["roomsPerHousehold"]      = tempDF["total_rooms"]    / tempDF["households"]
    tempDF["populationPerHousehold"] = tempDF["population"]     / tempDF["households"]
    tempDF["bedroomsPerRoom"]        = tempDF["total_bedrooms"] / tempDF["total_rooms"]

    corrMatrix = tempDF.corr()
    print('\ncorrMatrix["median_house_value"].sort_values(ascending=False)')
    print( corrMatrix["median_house_value"].sort_values(ascending=False) )

    attributes = ["median_house_value","median_income","roomsPerHousehold","bedroomsPerRoom"]
    scatter_matrix(frame=tempDF[attributes], figsize=(12,8))
    plt.savefig(outputFILE, bbox_inches='tight', pad_inches=0.2)

    return( None )
def pd_scatter_matrix(self):
    """
    No parameters. Runs on the object's attributes.

    Draws a plain pandas scatter-plot matrix of ``self.ALL`` (KDE on the
    diagonal, 60x60 figure) and saves it as 'scatter_matrix.png' in the
    working directory.
    """
    # scatter_matrix opens its own figure when no `ax` is supplied, so no
    # figure is created beforehand (the earlier plt.figure() only left an
    # empty figure behind); the unused groupby result is gone as well.
    scatter_matrix(self.ALL, alpha=0.2, figsize=(60, 60), diagonal='kde')
    plt.savefig('scatter_matrix.png')
def plot_scatter_matrix(x, y, fname='scatter_matrix.png'):
    """Save a label-coloured scatter matrix of six features to `fname`.

    `x` and `y` are stacked column-wise (``y`` reshaped to one column) into a
    frame with the columns 'intensity', 'gaussian', 'gradient mag',
    'grad dir', 'laplacian', 'imglog' and 'label'. The integer label picks the
    point colour from a five-entry list.
    """
    import matplotlib.pyplot as plt
    import pandas as pd
    try:
        from pandas.plotting import scatter_matrix
    except ImportError:
        # Older pandas only provides the legacy module path.
        from pandas.tools.plotting import scatter_matrix

    df = pd.DataFrame(np.hstack((x, y.reshape(len(y), 1))),
                      columns=['intensity', 'gaussian', 'gradient mag',
                               'grad dir', 'laplacian', 'imglog', 'label'])
    df['label'] = df['label'].astype(int)
    colors = ['red', 'green', 'blue', 'cyan', 'black']

    scatter_matrix(df, figsize=[9, 7], marker='x',
                   c=df.label.apply(lambda xx: colors[xx]))
    plt.savefig(fname)
    # Single-argument print call: same output on Python 2 and 3.
    print("Saved scatter matrix to %s" % fname)
def plotPcaProjections(self, pca_components=(0, 4)):
    """
    Plots the data projected onto a range of principal components.

    **Parameters**

    pca_components : int (tuple)
        ``(start, stop)`` pair; columns ``start:stop`` of ``self.U`` are the
        components ``self.evts`` is projected on.
    """
    first, last = pca_components[0], pca_components[1]
    projected = pd.DataFrame(np.dot(self.evts, self.U[:, first:last]))
    scatter_matrix(projected, alpha=.2, s=4, c='k', figsize=(10, 10),
                   diagonal='kde', marker=".")
def plot(self, df=None, type=None, **parameter):
    """Plot `df` with the routine selected by the `type` label.

    Recognised labels: "bar", "barh", "hist", "scatter" and "scatter matrix".
    Extra keyword arguments are forwarded to the selected routine. Any other
    label does nothing.
    """
    # Each renderer is wrapped in a lambda so nothing is looked up on `df`
    # unless its label matches.
    renderers = (
        ("bar", lambda: df.plot.bar(**parameter)),
        ("barh", lambda: df.plot.barh(**parameter)),
        ("hist", lambda: df.hist(**parameter)),
        ("scatter", lambda: df.plot.scatter(**parameter)),
        ("scatter matrix", lambda: scatter_matrix(df, diagonal="kde", **parameter)),
    )
    for label, render in renderers:
        if type == label:
            render()
def data_visualization():
    """Show box plots, histograms and a scatter matrix of the module-level ``dataset``."""
    # Box-and-whisker plots, one subplot per column on a 2x2 grid.
    box_options = dict(kind='box', subplots=True, layout=(2, 2),
                       sharex=False, sharey=False)
    dataset.plot(**box_options)
    plt.show()

    # Histograms.
    dataset.hist()
    plt.show()

    # Scatter plot matrix.
    scatter_matrix(dataset)
    plt.show()
def create_xy():
    """Load 'insurance.csv', print summary tables and draw exploratory plots.

    Prints ``describe()``, ``head()`` and ``corr()`` of the data, then draws a
    scatter matrix, a line plot of ``age``, a histogram of ``charges`` and a
    box plot. Nothing is returned.
    """
    rawdata = pd.read_csv(get_fullpath('insurance.csv'), delimiter=',')

    # Every print takes a single argument so the output is the same under
    # Python 2 and Python 3.
    print('====== describe ======')
    print(rawdata.describe())
    print('====== head ======')
    print(rawdata.head())
    print('====== corr ======')
    print(rawdata.corr())
    print('====== describe ======')

    scatter_matrix(rawdata)
    rawdata.age.plot()
    rawdata.charges.hist()
    rawdata.boxplot()
def azureml_main(frame1):
    """Plot the forest-fire data before and after outlier trimming.

    With the local ``Azure`` flag set to False (as it is here) the input frame
    is replaced by the CSV read from a fixed local path. Figures 2, 1, 4 and 5
    are saved as PNG files in the same directory. Returns `frame1`.
    """
    import matplotlib
    matplotlib.use("agg")
    matplotlib.style.use('ggplot')
    import pandas as pd
    try:
        from pandas.plotting import scatter_matrix
    except ImportError:
        # Older pandas only provides the legacy module path.
        from pandas.tools.plotting import scatter_matrix
    from mpl_toolkits.mplot3d import Axes3D
    import numpy as np
    import matplotlib.pyplot as plt

    # Directory that holds the input CSV and receives every figure.
    data_dir = "C:\\Users\\Steve\\Documents\\AzureML\\Data Sets\\Forest_Fire\\"

    Azure = False

    ## If not running in MAML read the data from a csv file.
    if(Azure == False):
        frame1 = pd.read_csv(data_dir + "forestfireslog.csv")

    fig2 = plt.figure(2, figsize = (10,6))
    ax = fig2.gca()
    plt.plot(frame1["X"], frame1["Y"], 'bo', alpha = 0.2)
    fig2.savefig(data_dir + "fig2.png")

    fig1 = plt.figure(1, figsize = (10,6))
    ax = fig1.gca()
    plotCols = ["FFMC", "DMC", "DC", "ISI",
                "temp", "RH", "wind", "rain", "areaLog"]
    scatter_matrix(frame1[plotCols], ax = ax, alpha = 0.5)
    fig1.savefig(data_dir + "fig1.png")

    print(frame1.shape)
    trmCols = ["FFMC", "ISI", "rain"]
    trmLst = [[-3.0, 10.0], [-10.0, 4.0], [-10.0, 3.0]]
    frame2 = trimOutliers(frame1, trmCols, trmLst)
    # NOTE(review): this prints frame1 again; frame2.shape may have been
    # intended to show the effect of the trimming — confirm.
    print(frame1.shape)

    fig4 = plt.figure(4, figsize = (10,6))
    ax = fig4.gca()
    scatter_matrix(frame2[plotCols], ax = ax, alpha = 0.5)
    fig4.savefig(data_dir + "fig4.png")

    fig5 = plt.figure(5, figsize = (12,8))
    ax = fig5.add_subplot(221, projection='3d')
    ax.scatter(frame2["X"], frame2["Y"], frame2["areaLog"], c = 'r')
    ax = fig5.add_subplot(222, projection='3d')
    ax.scatter(frame1["X"], frame1["Y"], frame1["areaLog"], c = 'r')
    fig5.savefig(data_dir + "fig5.png")

    return frame1
def plot_full_feature_scatter_matrix(X, Y, fname='scatter_matrix_full_feature.png'):
    """Save a label-coloured, KDE-diagonal scatter matrix of all features.

    `X` and `Y` are stacked column-wise (``Y`` reshaped to one column). The
    feature columns are named 'VAR1'..'VARn' after the width of `X`, followed
    by 'label'. The integer label indexes a 20-entry colour list, so labels
    must lie in 0..19. The figure is written to `fname`.
    """
    # Single-argument print calls: same output on Python 2 and 3.
    print(X.shape)
    print(Y.shape)

    import matplotlib.pyplot as plt
    import pandas as pd
    try:
        from pandas.plotting import scatter_matrix
    except ImportError:
        # Older pandas only provides the legacy module path.
        from pandas.tools.plotting import scatter_matrix

    # Names follow the actual number of features instead of a fixed 20.
    COL = ['VAR%d' % i for i in range(1, X.shape[1] + 1)] + ['label']
    df = pd.DataFrame(np.hstack((X, Y.reshape(Y.shape[0], 1))), columns=COL)
    df['label'] = df['label'].astype(int)

    colors = ['r', 'g', 'blue', 'c', 'm', 'y', 'black', 'w', 'orange', 'darkgreen',
              'r', 'g', 'blue', 'c', 'm', 'y', 'black', 'w', 'orange', 'darkgreen']

    scatter_matrix(df, figsize=[25, 25], marker='x', diagonal='kde',
                   c=df.label.apply(lambda k: colors[k]))
    plt.savefig(fname)
def azureml_main(frame1):
    """Drop the X, Y, month and day columns and save a scatter matrix.

    `frame1` is modified in place (the four columns are removed) and then
    returned. The figure is written to 'scatter2.png'.
    """
    import matplotlib
    matplotlib.use('agg')
    import pandas as pd
    import matplotlib.pyplot as plt
    try:
        from pandas.plotting import scatter_matrix
    except ImportError:
        # Older pandas only provides the legacy module path.
        from pandas.tools.plotting import scatter_matrix

    ## Remove unwanted columns
    frame1.drop(["X", "Y", "month", "day"], axis = 1, inplace = True)

    ## Create a scatter plot matrix
    fig1 = plt.figure(1, figsize = (12,9))
    ax = fig1.gca()
    scatter_matrix(frame1, alpha=0.2, figsize=(10, 10), diagonal='kde', ax=ax)
    fig1.savefig('scatter2.png')
    return frame1
def factor_scatter_matrix(df, factor, factor_labels, legend_title=None, palette=None, title=None):
    '''Create a scatter matrix of the variables in df, with differently colored
    points depending on the value of df[factor].

    inputs:
        df: pandas.DataFrame containing the columns to be plotted, as well
            as factor.
        factor: string or pandas.Series. The column indicating which group
            each row belongs to. When a string, that column is removed from
            the plotted frame.
        factor_labels: labels shown in the legend, one per palette colour.
        legend_title: when not None, the grid is switched off and a legend
            with this title is drawn.
        palette: None (a "gist_ncar" seaborn palette sized to the number of
            groups), a seaborn palette name, or a list of colours handed to
            ``sns.color_palette``.
        title: optional figure title.

    returns:
        (axarr, color_map): the axes returned by scatter_matrix and the dict
        mapping each group to its colour.

    raises:
        ValueError: if there are more groups than colours.
    '''
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str         # Python 3 has no basestring

    if isinstance(factor, string_types):
        factor_name = factor                  # save off the name
        factor = df[factor]                   # extract column
        df = df.drop(factor_name, axis=1)     # keep it out of the plotted matrix

    classes = list(set(factor))

    if palette is None:
        palette = sns.color_palette("gist_ncar", len(classes))
    elif isinstance(palette, string_types):
        palette = sns.color_palette(palette, len(classes))
    else:
        palette = sns.color_palette(palette)

    if len(classes) > len(palette):
        raise ValueError((
            "Too many groups for the number of colors provided. "
            "We only have {} colors in the palette, but you have {} "
            "groups.").format(len(palette), len(classes)))

    color_map = dict(zip(classes, palette))
    colors = factor.apply(lambda group: color_map[group])
    axarr = scatter_matrix(df, figsize=(10, 10), marker='o',
                           c=np.array(list(colors)), diagonal=None, alpha=1.0)

    if legend_title is not None:
        # A boolean is used because the string 'off' is truthy for current
        # matplotlib and would leave the grid on.
        plt.grid(False)
        plt.legend([plt.Circle((0, 0), fc=color) for color in palette],
                   factor_labels, title=legend_title, loc='best', ncol=3)

    if title is not None:
        plt.suptitle(title)

    return axarr, color_map
def scatterMatrix(dframe):
    '''
       Show a scatter matrix of `dframe` with the columns relabelled 0..n-1.
    '''
    frame = DataFrame(dframe)
    # Short numeric labels keep the plot from getting cluttered.
    column_count = len(frame.columns)
    frame.columns = range(column_count)
    scatter_matrix(frame, alpha=0.2, figsize=(6, 6), diagonal='kde')
    plt.show()
def featurecomparison(cr, features, name, path, columns=None):
    """Save one scatter matrix per (subject, session) group of `cr`.

    Rows with ``trial > 0`` are kept and restricted to `features` (optionally
    relabelled with `columns`). Each group's figure is written to
    ``<path>/<subject>/<subject>_session_<session>_<name>.png``; missing
    directories are created.
    """
    if not os.path.exists(path):
        os.makedirs(path)

    selected = cr[cr.trial > 0][features]
    if columns is not None:
        selected.columns = columns

    grouped = selected.groupby(level=['subject', 'session'])
    for (subject, session), frame in grouped:
        scatter_matrix(frame)
        plt.suptitle('{0} (session {1})'.format(subject, session))

        filename = "{0}_session_{1}_{2}.png".format(subject, session, name)
        subject_dir = os.path.join(path, subject)
        if not os.path.exists(subject_dir):
            os.makedirs(subject_dir)

        plt.savefig(os.path.join(subject_dir, filename))
        plt.close('all')
def scatter_matrix_bin_target(df, bin_col, numeric_cols):
    """Scatter matrix of numeric columns, coloured by a binary target.

    Values of `bin_col` index the pair ('red', 'blue'): 0/False gives red,
    1/True gives blue.

    Parameters
    ----------
    df : pandas.DataFrame
        Holds the `bin_col` and `numeric_cols` columns
    bin_col : str
        Name of the column with the binary data
    numeric_cols : [str]
        Names of the numeric columns to plot

    Reference
    ---------
    http://stackoverflow.com/questions/28034424/pandas-scatter-matrix-plot-categorical-variables
    """
    palette = ('red', 'blue')
    point_colors = df[bin_col].apply(lambda flag: palette[flag])
    scatter_matrix(df[numeric_cols], c=point_colors)
def plot_error_correlations(vs, fractions):
    """Plot pair-wise scatter of estimation errors with correlation labels.

    `vs` is turned into a DataFrame and `fractions` is subtracted from it via
    a double transpose. A scatter matrix of the result is drawn with all
    ticks, grids and spines removed; every cell below the diagonal is cleared
    and shows the correlation coefficient of the corresponding column pair
    instead. The figure is saved as 'figures/error_scatter' in eps, pdf and
    png (1200 dpi) form.
    """
    import pandas as pd
    try:
        from pandas.plotting import scatter_matrix
    except ImportError:
        # Older pandas only provides the legacy module path.
        from pandas.tools.plotting import scatter_matrix
    from matplotlib import pyplot as plt

    vs2 = pd.DataFrame(vs)
    real = fractions
    vs2 = (vs2.T - real).T

    side = plotinfo.TEXTWIDTH_IN * .5
    fig, ax = plt.subplots(figsize=[side, side])
    axes = scatter_matrix(vs2, ax=ax, color='k', marker='.', s=2.)

    for cell in axes.ravel():
        cell.grid(False)
        # set_axis_bgcolor was removed from matplotlib; prefer set_facecolor
        # and fall back only where it does not exist.
        set_background = getattr(cell, 'set_facecolor', None) or cell.set_axis_bgcolor
        set_background((1, 1, 1))
        cell.set_xticks([])
        cell.set_yticks([])
        for side_name in ('bottom', 'top', 'right', 'left'):
            cell.spines[side_name].set_visible(False)

    n_cols = vs2.shape[1]
    for a in range(n_cols):
        for b in range(a + 1, n_cols):
            da, db = vs2[[a, b]].T.values
            r = np.corrcoef(da, db)[0, 1]

            # Replace the lower-triangle scatter with the coefficient, keeping
            # the axis labels that clear() would otherwise drop.
            cell = axes[b, a]
            ylabel = cell.get_ylabel()
            xlabel = cell.get_xlabel()
            cell.clear()
            cell.set_ylabel(ylabel, fontsize=5)
            cell.set_xlabel(xlabel, fontsize=5)
            cell.set_xticks([])
            cell.set_yticks([])
            cell.set_ylim(0, 1)
            cell.set_xlim(0, 1)
            cell.text(.55, .4, '{:.2}'.format(r), fontsize=9,
                      horizontalalignment='center', verticalalignment='center')

    axes[0, 0].set_ylabel(axes[0, 0].get_ylabel(), fontsize=5)
    # The last diagonal cell, whatever the number of columns (was axes[5, 5]).
    axes[-1, -1].set_xlabel(axes[-1, -1].get_xlabel(), fontsize=5)

    fig.tight_layout()
    fig.savefig('figures/error_scatter.eps')
    fig.savefig('figures/error_scatter.pdf')
    fig.savefig('figures/error_scatter.png', dpi=1200)
def explore_data(dataframe, histograms=True, scattermatrix=True, export_summary=True):
    '''
    Export descriptive statistics and overview plots of `dataframe`.

    An 'output' directory is created in the working directory when missing.

    dataframe      -- pandas DataFrame to summarise
    histograms     -- when true, save 'output/histograms.png'
    scattermatrix  -- when true, save 'output/scatter_matrix.png'
    export_summary -- when true, save 'output/summary_original.csv'
                      (``describe(include='all')`` rounded to 2 decimals and
                      transposed, plus 'type' and 'missing' columns)
    '''
    import os
    try:
        from pandas.plotting import scatter_matrix
    except ImportError:
        # Older pandas only provides the legacy module path.
        from pandas.tools.plotting import scatter_matrix

    # Create directory for output
    if not os.path.exists('output'):
        os.mkdir('output')

    summary = dataframe.describe(include='all').round(2).transpose()
    summary.insert(1, 'missing', len(dataframe) - summary['count'])
    summary.insert(0, 'type', dataframe.dtypes)

    if export_summary:
        # The message is only printed when the file is actually written.
        print('Descriptive statistics exported as "output/summary_original.csv"')
        summary.to_csv('output/summary_original.csv')

    if histograms:
        print('Histograms exported as "output/histograms.png"')
        dataframe.hist()
        plt.savefig('output/histograms.png')

    if scattermatrix:
        print('Scatter matrix exported as "output/scatter_matrix.png"')
        scatter_matrix(dataframe, diagonal='kde')
        plt.savefig('output/scatter_matrix.png')
def visualize(config):
    """Save three overview figures for every entry of ``config['datasets']``.

    For each entry a scatter matrix, a parallel-coordinates plot and a radviz
    plot (the latter two grouped by the 'quality' column) are written to
    ``<name>_scatter_matrix.png``, ``<name>_parallel_coordinates.png`` and
    ``<name>_radviz.png``. Returns the module-level ``OK`` value.
    """
    # These views would help to create a feature vector.
    for entry in config['datasets']:

        def store(tag):
            # Write the current figure under the entry's name and close it.
            plt.savefig(entry['name'] + tag + '.png')
            plt.close()

        scatter_matrix(entry['df'], alpha=0.2, figsize=(20, 20), diagonal='kde')
        store('_scatter_matrix')

        plt.figure(figsize=(20, 20))
        parallel_coordinates(entry['df'], 'quality')
        store('_parallel_coordinates')

        plt.figure(figsize=(20, 20))
        radviz(entry['df'], 'quality')
        store('_radviz')

    return OK
def plot_data(indf, prefix='html', do_scatter=False):
    """Create optional scatter-matrix plots and per-column histograms.

    With `do_scatter` the columns of `indf` are split into groups of three and
    one scatter matrix is saved for every pair of distinct groups. A step
    histogram is always saved for every column; when a 'WnvPresent' column
    exists, the other columns get two overlaid histograms (rows with
    WnvPresent == 0 and rows with WnvPresent == 1). The list of written file
    names is handed to ``create_html_page_of_plots`` together with `prefix`.
    """
    list_of_plots = []

    if do_scatter:
        column_groups = []
        for idx in range(0, len(indf.columns), 3):
            # Single-argument print calls give the same text on Python 2 and 3.
            print('%d %d %d' % (len(indf.columns), idx, idx + 3))
            # Plain lists, so that `+` below concatenates the names; adding two
            # pandas Index slices would combine them element by element.
            column_groups.append(list(indf.columns[idx:(idx + 3)]))

        for idx in range(len(column_groups)):
            # idy < idx always holds here, so no equality guard is needed.
            for idy in range(0, idx):
                pair_columns = column_groups[idx] + column_groups[idy]
                print(pair_columns)
                pl.clf()
                scatter_matrix(indf[pair_columns])
                fname = 'scatter_matrix_%d_%d.png' % (idx, idy)
                pl.savefig(fname)
                list_of_plots.append(fname)
                pl.close()

    for col in indf:
        pl.clf()
        print(col)
        # NOTE(review): `normed` was removed from newer matplotlib in favour of
        # `density`; left untouched because the target version is not visible here.
        if 'WnvPresent' in indf.columns and col != 'WnvPresent':
            haswnv = indf['WnvPresent'] == 1
            notwnv = indf['WnvPresent'] == 0
            indf[notwnv][col].hist(histtype='step', normed=True)
            indf[haswnv][col].hist(histtype='step', normed=True)
        else:
            indf[col].hist(histtype='step', normed=True)
        pl.title(col)
        fname = '%s_hist.png' % col
        pl.savefig(fname)
        list_of_plots.append(fname)

    create_html_page_of_plots(list_of_plots, prefix)
    return
def save_scatter(best_pipe, df_X, y, start, shard):
    """
    Plot the pipeline-transformed data as a scatter matrix and save it.

    Points are red where the label is 0 and green where it is 1. The file is
    written under '../output/' and named after ``start`` minus a fixed offset,
    zero-padded to seven digits.

    Parameters
    ----------
    best_pipe : object
        Pipeline handed to ``pipe_transform``.
    df_X : pandas.DataFrame
        Input data.
    y : list
        Target labels, stored in a 'match' column.
    start : float
        Time at start of run.
    shard : bool
        When true the file name gets a 'shard_' prefix.

    Returns
    -------
    None
    """
    transformed = pipe_transform(best_pipe, df_X)
    transformed['match'] = y

    palette = ['red', 'green']
    point_colors = transformed.match.apply(lambda label: palette[label])
    scatter_matrix(transformed, alpha=0.25, figsize=(20, 12), c=point_colors)

    stamp = str(int(start - 1470348265)).zfill(7) + '_'
    prefix = 'shard_' if shard else ''
    plt.savefig('../output/%sscatter-matrix' % (prefix + stamp))
    plt.close('all')
def factor_scatter_matrix(df, factor, palette=None, title=None):
    '''Create a scatter matrix of the variables in df, with differently colored
    points depending on the value of df[factor].

    inputs:
        df: pandas.DataFrame containing the columns to be plotted, as well
            as factor.
        factor: string or pandas.Series. The column indicating which group
            each row belongs to. When a string, that column is removed from
            the plotted frame.
        palette: A list of hex codes, at least as long as the number of groups.
            If omitted, a predefined 12-entry palette is used (its last two
            entries are the same colour).
        title: optional figure title.

    returns:
        (axarr, color_map): the axes returned by scatter_matrix and the dict
        mapping each group to its colour.

    raises:
        ValueError: if there are more groups than colours.
    '''
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str         # Python 3 has no basestring

    if isinstance(factor, string_types):
        factor_name = factor                  # save off the name
        factor = df[factor]                   # extract column
        df = df.drop(factor_name, axis=1)     # keep it out of the plotted matrix

    classes = list(set(factor))

    if palette is None:
        palette = ['#e41a1c', '#377eb8', '#4eae4b',
                   '#994fa1', '#ff8101', '#fdfc33',
                   '#a8572c', '#f482be', '#999999',
                   '#4B610B', '#DF013A', '#DF013A']

    if len(classes) > len(palette):
        raise ValueError((
            "Too many groups for the number of colors provided. "
            "We only have {} colors in the palette, but you have {} "
            "groups.").format(len(palette), len(classes)))

    color_map = dict(zip(classes, palette))
    colors = factor.apply(lambda group: color_map[group])
    axarr = scatter_matrix(df, figsize=(10, 10), marker='o', c=colors, diagonal=None)

    if title is not None:
        # Figure-level title; plt.title would label only the current (last
        # created) subplot of the matrix.
        plt.suptitle(title)

    return axarr, color_map
def test_quantile_vs_tmm():
    """
    Test quantile normalization versus TMM in rank
    correlation of genes.

    Normalizes two untreated "pasilla" samples both ways, prints the heads of
    the intermediate tables, and saves a scatter matrix of the log2 counts as
    "quantile_vs_tmm_corr" (png) through ``plot_utils.save_fig``.
    """
    counts_fname = utils.load_testdata("pasilla")
    # Consider only a subset of the samples
    samples = OrderedDict()
    samples["Untreated 1"] = "untreated1"
    samples["Untreated 2"] = "untreated2"
    exp_obj = experiment.Experiment(counts_fname, samples)
    quantile_counts_df = normalizers.norm_q(exp_obj)
    tmm_counts_df = normalizers.norm_tmm(exp_obj)

    # Every print takes a single argument so the output is the same under
    # Python 2 and Python 3.
    print("\nQuantile versus TMM Testing:")
    print("--------------")
    print("Normalized quantile counts: ")
    print(quantile_counts_df.head())
    print("Normalized TMM counts: ")
    print(tmm_counts_df.head())
    print("Correlating the genes.")

    # Merge the dataframes together, indexing by gene
    combined_df = pandas.merge(quantile_counts_df, tmm_counts_df,
                               left_index=True,
                               right_index=True,
                               suffixes=["_q", "_tmm"],
                               how="outer")
    # Get log of counts: get rid of infinite values
    log_counts_df = combined_df.apply(np.log2).replace([-np.inf, np.inf],
                                                       np.nan)
    print("Combined dataframe: ")
    print(combined_df.head())
    print("Combined log dataframe: ")
    print(log_counts_df.head())

    # Plot correlation
    try:
        from pandas.plotting import scatter_matrix
    except ImportError:
        # Older pandas only provides the legacy module path.
        from pandas.tools.plotting import scatter_matrix
    scatter_matrix(log_counts_df, alpha=0.2, figsize=(8, 7))
    plot_utils.save_fig("quantile_vs_tmm_corr", ext="png")
def plot_discern_distributions( aml, brca, luad ):
    '''
    Plot some useful visualizations of the DISCERN scores as a scatter matrix, where
    the diagonal is the kernel density of the scores, and the off-diagonals are
    scatter plots comparing two conditions. Pass in filenames for where the DISCERN
    scores are stored.

    Each file is read with its first column as index; the log10 of its 'T2'
    column becomes the score column named after the condition. The three tables
    are merged on the gene name, the figure is saved as 'DISCERN_Scores.pdf',
    and the ten highest-scoring genes per condition are printed.
    '''
    try:
        from pandas.plotting import scatter_matrix
    except ImportError:
        # Older pandas only provides the legacy module path.
        from pandas.tools.plotting import scatter_matrix
    import seaborn as sns

    def _load_scores( path, label ):
        # One table with the gene name and the log10 'T2' score of one condition.
        scores = pd.read_csv( path, index_col=0 )
        scores['Gene'] = scores.index
        scores[label] = np.log10( scores['T2'] )
        return scores[['Gene', label]]

    AML = _load_scores( aml, 'AML' )
    BRCA = _load_scores( brca, 'BRCA' )
    LUAD = _load_scores( luad, 'LUAD' )

    data = pd.merge( AML, BRCA, on='Gene' )
    data = pd.merge( data, LUAD, on='Gene' )

    with sns.axes_style( "whitegrid" ):
        scatter_matrix( data, alpha=0.2, figsize=(6,6), diagonal='kde', color='c',
                        density_kwds={'c': 'r', 'lw':1}, lw=0, grid=False )
        plt.savefig( 'DISCERN_Scores.pdf' )
        plt.clf()

    # Single-argument print calls: same output on Python 2 and 3.
    print( "TOP 10 GENES SORTED BY EACH METHOD" )
    for position, condition in enumerate( ('AML', 'BRCA', 'LUAD') ):
        if position:
            print( '' )   # blank line between conditions
        print( condition )
        # sort_values replaces DataFrame.sort, which newer pandas no longer has.
        print( data.sort_values( condition, ascending=False )[['Gene', condition]][:10] )
def scatterPlot(dataset, out="DISPLAY"):
    """
    Draw a scatter matrix of dataset and either show or save it.

    Args:
        DataFrame:: dataset
        str::out: "DISPLAY" to show the plot; otherwise a path prefix, and the
                  figure is saved to ``out + "scatter_matrix"``.

    Returns the value returned by ``scatter_matrix``.
    """
    graph = scatter_matrix(dataset)
    display_only = (out == 'DISPLAY')
    if not display_only:
        plt.savefig(out + "scatter_matrix")
    else:
        plt.show()
    plt.clf()
    return graph
label="population", c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True, ) plt.legend() plt.show() corr_matrix = housing.corr().round(2) corr_matrix['median_house_value'].sort_values(ascending=False) from pandas.tools.plotting import scatter_matrix attributes = [ "median_house_value", "median_income", "total_rooms", "housing_median_age" ] scatter_matrix(housing[attributes], figsize=(12, 8)) plt.show() housing.info() housing.total_bedrooms.hist() plt.show() housing["rooms_per_household"] = housing["total_rooms"] / housing["households"] housing[ "bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"] housing[ "population_per_household"] = housing["population"] / housing["households"] corr_matrix = housing.corr() corr_matrix['median_house_value'].sort_values(ascending=False) housing = strat_train_set.copy()
(symbol, DataReader(symbol, "yahoo", pause=1)) for symbol in symbols) panel = Panel(data).swapaxes('items', 'minor') closing = panel['Close'].dropna() closing.head() # Calculate log returns rets = log(closing / closing.shift(1)).dropna() rets.head() # Correlation Matrix corr_matrix = rets.corr() corr_matrix # Plot correlation and scatter from pandas.tools.plotting import scatter_matrix scatter_matrix(rets) #Cholesky decomposition from scipy.linalg import cholesky upper_cholesky = cholesky(corr_matrix, lower=False) upper_cholesky # Simulation parameters # business days import numpy as np from pandas import bdate_range # business days n_days = 21 dates = bdate_range(start=closing.ix[-1].name, periods=n_days) n_assets = len(symbols) n_sims = 50000
# histograms dataset.hist() plt.show() #multivariate plot # scatter plot matrix scatter_matrix(dataset) plt.show() # Split-out validation dataset array = dataset.values X = array[:,0:4] Y = array[:,4] validation_size = 0.15
# Import the housing information for analysis
# (read_csv with index_col=0 and parse_dates=True is the documented
# replacement for DataFrame.from_csv, which newer pandas no longer provides)
housing = pd.read_csv('../data/housing.csv', index_col=0, parse_dates=True)
housing.head()


# In[5]:

# Use covariance to calculate the association
housing.cov()


# In[6]:

# Use correlation to calculate the association is more appropriate in this case
housing.corr()


# In[7]:

# scatter matrix plot
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    # Older pandas only provides the legacy module path.
    from pandas.tools.plotting import scatter_matrix
sm = scatter_matrix(housing, figsize=(10, 10))


# ## Let's do an analysis by yourself!
#
# ## Observe the association between LSTAT and MEDV:

# In[8]:

# This time we take a closer look at MEDV vs LSTAT. What is the association between MEDV and LSTAT you observed?
housing.plot(kind='scatter', x='LSTAT', y='MEDV', figsize=(10, 10))
alpha=0.4, s=dataset["population"] / 100, label="population", figsize=(10, 7), c="median_house_value", cmap=plt.get_cmap("gist_rainbow"), colorbar=True) plt.legend() corr_matrix = dataset.corr() print(corr_matrix["median_house_value"].sort_values(ascending=False)) attributes = [ "median_house_value", "median_income", "total_rooms", "housing_median_age" ] scatter_matrix(dataset[attributes], figsize=(8, 8)) dataset["rooms_per_household"] = dataset["total_rooms"] / dataset["households"] dataset[ "bedrooms_per_room"] = dataset["total_bedrooms"] / dataset["total_rooms"] dataset[ "population_per_household"] = dataset["population"] / dataset["households"] corr_matrix = dataset.corr() print(corr_matrix["median_house_value"].sort_values(ascending=False)) dataset1 = dataset dataset_labels = dataset["median_house_value"].copy() dataset = dataset.drop("median_house_value", axis=1) dataset = dataset.dropna(subset=["total_bedrooms"]) median = dataset["total_bedrooms"].median()
def DrawGraph8(df):
    """Show a KDE-diagonal scatter matrix of the Open, High, Low and Close columns."""
    try:
        from pandas.plotting import scatter_matrix
    except ImportError:
        # Older pandas only provides the legacy module path.
        from pandas.tools.plotting import scatter_matrix
    scatter_matrix(df[['Open', 'High', 'Low', 'Close']],
                   alpha=0.2, figsize=(25, 16), diagonal='kde')
    plt.show()
# In[735]: df.columns # Finding the co-relation between the features # In[736]: df.corr() # In[737]: scatter_matrix(df,alpha=0.5, figsize=(30,32)); # ### Splitting the dataset with all features # In[738]: df_allfeatures = df # In[739]: df.columns # In[740]:
##print(auto.tail()) print(auto.describe()) ##print(list(auto)) ## range, mean, and standard deviation for columns ## only use first 7 columns rownames = list(['min', 'max', 'mean', 'sd']) vals = pd.DataFrame([ auto.ix[:, 0:6].min(), auto.ix[:, 0:6].max(), auto.ix[:, 0:6].mean(), auto.ix[:, 0:6].std() ], index=rownames) print(vals) ##compare data axes = scatter_matrix(auto) plt.show() ##Now with subplots :) fig, axes = plt.subplots(nrows=2, ncols=2) ax1 = auto.plot(x='weight', y='mpg', c='displacement', kind='scatter', ax=axes[0, 0], rot=45) auto.plot(x='horsepower', y='mpg', c='acceleration', kind='scatter',
from sklearn.cluster import AffinityPropagation as Clusterer
# NOTE(review): this instance is not used again in the visible code; a second
# Clusterer() is built for the fit below.
clusterer = Clusterer()
X, y = data.get.people_xy()
vecs = []
names = []
couples = data.get.couples_raw()
for couple in couples:
    # Difference between the partners' rows of X; each row is located by the
    # position of the partner's name in y.
    vecs.append(
        np.array(X[y.index(couple["male"])]) -
        np.array(X[y.index(couple["female"])]))
    # Label of the form "<male first token> - <female first token>".
    names.append(couple["male"].split(' ')[0] + " - " +
                 couple["female"].split(" ")[0])
# Cluster labels from fitting on X; not passed to the plot below.
labels = Clusterer().fit(X).labels_
# One column per component of the difference vectors.
df = pandas.DataFrame(vecs,
                      columns=[
                          "Extroversion", "Emotional", "Agreeableness",
                          "Conscientiousness", "Intellect"
                      ])
plot = scatter_matrix(df,
                      figsize=(15, 15),
                      marker='o',
                      hist_kwds={'bins': 10},
                      s=60,
                      alpha=1,
                      cmap="nipy_spectral")
plt.show()
# Value of the response variable 'Y' for the row labelled 415.
print(
    "¿Cuál es el valor de la variable respuesta para el paciente número 415?")
print(pd_diabetes['Y'][415])
# Summary statistics of the data set.
print("¿Cuál es el resumen de datos?")
print(pd_diabetes.describe())
# import matplotlib.pyplot as plt
# plt.show()
#plt.plot(pd_diabetes['AGE'].plot.hist(x = 'Age',alpha=0.5))
plt.hist(pd_diabetes['AGE'])
plt.show()
plt.scatter(x=pd_diabetes['AGE'], y=pd_diabetes['Y'])
plt.show()
# draw the scatter matrix
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    # Older pandas only provides the legacy module path.
    from pandas.tools.plotting import scatter_matrix
scatter_matrix(pd_diabetes, alpha=0.2, figsize=(12, 12), diagonal='kde')
pd_diabetes.corr()
#
# #Y vs SEX, scatter plot
# plt.boxplot(by='SEX',column = 'Y')
# fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(6, 6), sharey=True)
# axes[0, 0].boxplot(pd_diabetes, labels=labels)
# axes[0, 0].set_title('Default', fontsize=fs)
# plt.show()
def scat(**kwds):
    """Call ``plotting.scatter_matrix`` on the ``df`` from the surrounding scope.

    All keyword arguments are forwarded unchanged and the call's result is
    returned.
    """
    result = plotting.scatter_matrix(df, **kwds)
    return result
# Brain-size data: ';'-separated, with '.' marking missing values.
data = pandas.read_csv('brain_size.csv', sep=';', na_values=".")
t = np.linspace(-6, 6, 20)
sin_t = np.sin(t)
cos_t = np.cos(t)
pandas.DataFrame({'t': t, 'sin': sin_t, 'cos': cos_t})
data.shape
data.columns
print(data['Gender'])
data[data['Gender'] == 'Female']['VIQ'].mean()
groupby_gender = data.groupby('Gender')
for gender, value in groupby_gender['VIQ']:
    print((gender, value.mean()))
groupby_gender.mean()

try:
    from pandas import plotting
except ImportError:
    # Older pandas only provides the legacy module path.
    from pandas.tools import plotting
plotting.scatter_matrix(data[['Weight', 'Height', 'MRI_Count']])
plotting.scatter_matrix(data[['PIQ', 'VIQ', 'FSIQ']])

stats.ttest_1samp(data['VIQ'], 0)
female_viq = data[data['Gender'] == 'Female']['VIQ']
male_viq = data[data['Gender'] == 'Male']['VIQ']
stats.ttest_ind(female_viq, male_viq)
stats.ttest_ind(data['FSIQ'], data['PIQ'])
stats.ttest_rel(data['FSIQ'], data['PIQ'])
stats.ttest_1samp(data['FSIQ'] - data['PIQ'], 0)
stats.wilcoxon(data['FSIQ'], data['PIQ'])
# end of section 23
x = np.linspace(-5, 5, 20)
np.random.seed(1)
# normal distributed noise
y = -5 + 3 * x + 4 * np.random.normal(size=x.shape)
import numpy as np
# Colour lookup table; indexing it with an integer column gives one colour
# per row.
colors = np.array(['red', 'green', 'blue', 'yellow'])
# Calories vs alcohol, coloured by the "cluster" column.
plt.scatter(beer["calories"], beer["alcohol"], c=colors[beer["cluster"]])
# `centers` is drawn on top as large black '+' markers.
plt.scatter(centers.calories,
            centers.alcohol,
            linewidths=3,
            marker='+',
            s=300,
            c='black')
plt.xlabel("Calories")
plt.ylabel("Alcohol")
# Same four columns twice: first coloured by "cluster", then by "cluster2".
scatter_matrix(beer[["calories", "sodium", "alcohol", "cost"]],
               s=100,
               alpha=1,
               c=colors[beer["cluster"]],
               figsize=(10, 5))
plt.suptitle("With 3 centroids initialized")
scatter_matrix(beer[["calories", "sodium", "alcohol", "cost"]],
               s=100,
               alpha=1,
               c=colors[beer["cluster2"]],
               figsize=(10, 5))
plt.suptitle("With 2 centroids initialized")
plt.show()
from sklearn.preprocessing import StandardScaler
# Standardise X; X is defined outside this fragment.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
def function(myData):
    """Explore ``myData`` and compare five classifiers on it.

    Prints the first 20 rows, summary statistics and the size of each
    ``increase_rate`` group; shows box, histogram and scatter-matrix plots;
    cross-validates each classifier on a training split; then fits a fresh
    instance of each classifier and prints its accuracy, confusion matrix
    and classification report on the held-out validation split.

    Args:
        myData: pandas-DataFrame-like object. Columns 0-3 of
            ``myData.values`` are used as features and column 4 as the
            class label.

    Returns:
        None. Everything is printed or plotted.
    """
    print(myData.head(20))
    print()

    # Summary of data
    print(myData.describe())
    print()

    # Class distribution: number of instances of each class
    print(myData.groupby('increase_rate').size())

    # Box and whisker plots
    myData.plot(kind='box', subplots=True, layout=(2, 2), sharex=False, sharey=False)
    plt.show()

    # Histogram
    myData.hist()
    plt.show()

    # Scatter plot matrix, to look at two variables at once
    scatter_matrix(myData)
    plt.show()

    ######################################################
    # Evaluate algorithms
    ######################################################

    # Separate the features (X) from the class label (Y), then hold out 20%
    # of the labelled data as the final validation set.
    valueArray = myData.values
    X = valueArray[:, 0:4]
    Y = valueArray[:, 4]
    test_size = 0.20
    seed = 7
    X_train, X_validate, Y_train, Y_validate = cross_validation.train_test_split(
        X, Y, test_size=test_size, random_state=seed)

    # 10-fold cross validation is used to estimate each model's accuracy.
    num_folds = 10
    num_instances = len(X_train)
    scoring = 'accuracy'

    # NOTE(review): X is normalised only after the split and is never read
    # again, so the models below are trained on un-normalised data. Kept so
    # behaviour is unchanged; confirm whether normalisation was meant to
    # happen before train_test_split.
    X = preprocessing.normalize(X)

    ######################################################
    # Use different algorithms to build models
    ######################################################

    # Classifier classes rather than instances: a fresh, unfitted estimator
    # is created for cross validation and again for the validation run.
    candidates = [
        ('KNN', KNeighborsClassifier),
        ('CART', DecisionTreeClassifier),
        ('NB', GaussianNB),
        ('SVM', SVC),
        ('RF', RandomForestClassifier),
    ]

    # Cross-validate each model and print the mean and standard deviation of
    # its fold accuracies.
    # NOTE(review): KFold(n=..., n_folds=...) is the legacy
    # sklearn.cross_validation signature; left untouched because the file's
    # imports are not visible from here.
    for name, make_model in candidates:
        kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
        cv_results = cross_validation.cross_val_score(
            make_model(), X_train, Y_train, cv=kfold, scoring=scoring)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)

    ######################################################
    # See how well every model does on the validation set, in the order
    # KNN, CART, NB, SVM, RF.
    ######################################################
    for _, make_model in candidates:
        model = make_model()
        model.fit(X_train, Y_train)
        predictions = model.predict(X_validate)
        print()
        print(accuracy_score(Y_validate, predictions))
        print(confusion_matrix(Y_validate, predictions))
        print(classification_report(Y_validate, predictions))
#-*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# ``pandas.tools.plotting`` no longer exists; use ``pandas.plotting``.
from pandas.plotting import scatter_matrix

# 1000 rows of standard-normal noise in four columns.
df = DataFrame(np.random.randn(1000, 4), columns=['a', 'b', 'c', 'd'])

corr_mat = df.corr()
# print() call form: valid on Python 3 and prints the same on Python 2.
print(corr_mat)

scatter_matrix(df, alpha=0.2, figsize=(16, 16), diagonal='kde')
plt.show()
# plt.savefig('features.png')
def _fill_with_group_mode(frame, column):
    """Fill NaNs in ``column`` with the most frequent value of the row's ``Survived`` group."""
    for outcome in (1, 0):
        in_group = frame["Survived"] == outcome
        modes = frame.loc[in_group, column].mode()
        # ``mode()`` returns a Series. Passing it to ``fillna`` directly would
        # align on index labels instead of filling with the modal value, so
        # the first mode is taken as a scalar. An all-NaN group has no mode.
        if not modes.empty:
            frame.loc[in_group, column] = frame.loc[in_group, column].fillna(modes.iloc[0])


# ``.loc`` is used for every assignment below: chained indexing such as
# ``train[col][mask] = ...`` may write to a temporary copy and be lost.

# Missing fares of survivors get the survivors' median fare.
survived = train["Survived"] == 1
train.loc[survived, "Fare"] = train.loc[survived, "Fare"].fillna(
    train.loc[survived, "Fare"].median())

_fill_with_group_mode(train, "SibSp")
_fill_with_group_mode(train, "Parch")

# New column "Relatives": the number of siblings/spouses plus parents/children
# a person has on board.
train["Relatives"] = train["SibSp"] + train["Parch"]
_fill_with_group_mode(train, "Relatives")

# Scatter matrix to get an idea of the correlation between variables.
numeric_cols = train[["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked","Relatives"]]
_ = scatter_matrix(numeric_cols, c = train["Survived"] ,alpha = 0.2, figsize=(8,8), diagonal = 'hist')
plt.show()

# Convert "Age" into a discrete variable for the decision tree. 16 is the
# cut-off because, in the training data, people below 16 (children) had
# higher chances of being saved.
# All three masks are computed before any value is overwritten, and the
# 16-59 band uses one combined mask (the double-indexed form assigned into a
# temporary and had no effect).
age = train["Age"]
is_child = age < 16
is_adult = (age >= 16) & (age < 60)
is_senior = age >= 60
train.loc[is_child, "Age"] = 0
train.loc[is_adult, "Age"] = 1
train.loc[is_senior, "Age"] = 2

# Take the log of the Fare column as it has a very long range. This
# discretizes it into only 3 values - 0, 1 and 2.
train["Fare"] = np.log10(train["Fare"]+1).astype(int)

# Column selection for plotting again, to see the effect of the
# discretizations above.
numeric_cols = train[["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked","Relatives"]]
data.plot(kind='box', subplots=True, layout=(3, 3), sharex=False, sharey=False)
plt.show()


# In[7]:

# Correlation Matrix Plot
import numpy

correlations = data.corr()
# plot correlation matrix
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
# One tick per column; ``names`` (defined elsewhere) supplies the labels.
ticks = numpy.arange(0, 9, 1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(names)
ax.set_yticklabels(names)
plt.show()


# In[8]:

# Scatterplot Matrix
# ``pandas.tools.plotting`` no longer exists; use ``pandas.plotting``.
from pandas.plotting import scatter_matrix

scatter_matrix(data)
plt.show()


# In[ ]:
plt.savefig("attribute_histogram_plots")
# plt.show()

# Geographic scatter plots; the second is coloured by last sold price.
sf.plot(kind="scatter", x="longitude", y="latitude", alpha=0.2)
plt.savefig('map1.png')

sf.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4, figsize=(10,7),
        c="lastsoldprice", cmap=plt.get_cmap("jet"), colorbar=True, sharex=False)
plt.savefig('map2.png')

corr_matrix = sf.corr()
corr_matrix["lastsoldprice"].sort_values(ascending=False)

# ``pandas.tools.plotting`` no longer exists; use ``pandas.plotting``.
from pandas.plotting import scatter_matrix

attributes = ["lastsoldprice", "finishedsqft", "bathrooms", "zindexvalue"]
scatter_matrix(sf[attributes], figsize=(12, 8))
plt.savefig('matrix.png')

sf.plot(kind="scatter", x="finishedsqft", y="lastsoldprice", alpha=0.5)
plt.savefig('scatter.png')

# Derived feature: price per square foot.
sf['price_per_sqft'] = sf['lastsoldprice']/sf['finishedsqft']
corr_matrix = sf.corr()
corr_matrix["lastsoldprice"].sort_values(ascending=False)

len(sf['neighborhood'].value_counts())

# Per-neighborhood listing count and mean price per square foot. The column
# is selected before aggregating so only that column is computed; the
# resulting Series is the same.
freq = sf.groupby('neighborhood')['address'].count()
mean = sf.groupby('neighborhood')['price_per_sqft'].mean()
cluster = pd.concat([freq, mean], axis=1)
""" there seem to exist positive correlation between balance and income, limit and rating"""

# One-hot encode "Married" and keep only the "Yes" indicator.
dummies = pd.get_dummies(df['Married']).rename(columns = lambda x: 'Married_'+str(x))
df=pd.concat([df,dummies["Married_Yes"]],axis=1)

#2
# One-hot encode "Ethnicity", keep two of its indicators, and drop the
# original categorical columns. ``axis=1`` is passed by keyword because the
# positional form is not accepted by current pandas.
dummies = pd.get_dummies(df['Ethnicity']).rename(columns = lambda x: 'Ethnicity_'+str(x))
df_fin=pd.concat([df,dummies[["Ethnicity_Asian" , "Ethnicity_Caucasian"]]],axis=1).drop(["Ethnicity","Married"],axis=1)
scatter_matrix(df_fin,figsize=(10,10))

#3
# NOTE(review): the formula names `Married`, which was dropped from df_fin
# just above, and `Unnamed: 0`, which is not a plain identifier in a formula.
# The string is left unchanged; confirm the intended regressors.
est = smf.ols(formula = 'Balance ~ Unnamed: 0+ Income+ Limit+ Rating+ Cards+ Age+ Education+ Gender+ Student+ Married+ Married_Yes+ Ethnicity_Asian+ Ethnicity_Caucasian', data=df_fin).fit()
# print() call form: valid on Python 3 and prints the same on Python 2.
print(est.summary())

#http://statsmodels.sourceforge.net/devel/examples/notebooks/generated/example_regression_plots.html
# fig = plt.figure(figsize=(12,8))
# fig = sm.graphics.plot_regress_exog(est, "Income", fig=fig)

# Studentized residuals against fitted values.
student_resid=est.outlier_test()["student_resid"]
plt.plot(est.fittedvalues,student_resid)

#5
est1 = smf.ols(formula = 'Balance ~ Income+ Limit+ Rating+ Student', data=df_fin).fit()
with open('points1878.geojson') as f:
    point_data = json.load(f)

cells = 25, 50, 75
bandwidths = 25, 50, 100, 150, 250

# Attach coordinates to every year's population table. The name ``data`` is
# rebound from the helper module to the resulting dict only after the whole
# comprehension has been evaluated.
data = {
    year: data.add_coordinates(value, point_data, coordinates_to_meters=False)
    for year, value in pop_data.items()
}

plotting.plot_densities_all(
    data,
    cell_size=50,
    bw=100,
    kernel='epanechnikov',
    subplot_title_param=dict(year='vuosi'),
    labels='luterilaiset ortodoksit erotus'.split(),
    title='Tiheys'
).set_facecolor('white')

results = pd.read_csv('kaikki.csv')
print(results)

# Select the five measure columns and give them short names.
corr_values = results.loc[:, lambda results: 's km exposure isolation information'.split()]
corr_values.columns = 'S D hPg gPg H'.split()
print(corr_values.corr(method='spearman'))

plt.style.use('ggplot')
scatter_matrix(corr_values, figsize=(10, 10), diagonal='hist')
plt.suptitle('Hajontamatriisi', size=20)
plt.show()
# In[3]:

df


# In[4]:

# Re-index the rows starting from 1.
df.index = list(range(1, len(df.index) + 1))


# In[5]:

color_codes = ["#FF0000", "#0000FF", "#00FF00"]
# The distinct class values are sorted so that the class -> colour mapping
# does not depend on set iteration order.
class_names = sorted(set(df.iloc[:, -7]))
colors = [color_codes[class_names.index(x)] for x in list(df.iloc[:, -7])]
plotting.scatter_matrix(df[list(df.columns[:])], figsize=(30, 30), color=colors)
plt.show()


# Red marks healthy subjects and blue marks people with Parkinson's disease;
# some of the scatter plots suggest the two groups can be separated.

# In[6]:

# Build a boolean mask that leaves out the name column and the label column.
libool = [True] * len(df.columns)
libool[-7] = False
libool[0] = False

# Standardize each remaining column (subtract the mean, divide by the
# standard deviation); NaNs in the result are replaced with 0.
dfs = df.iloc[:, libool].apply(lambda x: (x - x.mean()) / x.std(), axis=0).fillna(0)
def scattergr(dataset):
    """Draw a scatter matrix of ``dataset`` and show the figure.

    Args:
        dataset: passed straight to ``scatter_matrix``.
    """
    scatter_matrix(dataset)
    plt.show()
predicted = clf.predict(x_test)

# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

# Predict the 'return' label for the test sales and show its distribution.
sales_test['return'] = clf.predict(z)
print(sales_test['return'].value_counts())

# 10-fold cross-validated accuracy.
Kscore = cross_val_score(clf, x, y, cv=10, scoring='accuracy')
print(Kscore)
print(Kscore.mean())

# correlation matrix
# ``pandas.tools.plotting`` no longer exists; use ``pandas.plotting``.
from pandas.plotting import scatter_matrix

plt.style.use('ggplot')
scatter = scatter_matrix(attributes, alpha=0.2, figsize=(6, 6), diagonal='kde')
# plt.show()


def plot_corr(df, size=10):
    '''Plot a graphical correlation matrix for each pair of columns in the dataframe.

    Input:
        df: pandas DataFrame
        size: vertical and horizontal size of the plot
    '''
    corr = df.corr()
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr)
    # Label both axes with the column names.
    plt.xticks(range(len(corr.columns)), corr.columns)
    plt.yticks(range(len(corr.columns)), corr.columns)
plt.scatter(tips['total_bill'], tips['tip'], marker='x')
plt.scatter(tips['total_bill'], tips['tip'], marker='x', alpha=0.5, s=100, color='green')

# Loading the data
feliz = pd.read_csv('happy2015.csv')
feliz.columns

# Importing libraries
# ``pandas.tools.plotting`` no longer exists; use ``pandas.plotting``.
from pandas.plotting import scatter_matrix

scatter_matrix(feliz)

# Subsetting
feliz.columns
scatter_matrix(feliz[['Happiness Score', 'Economy (GDP per Capita)']])

sub_feliz = feliz[[
    'Happiness Score',
    'Economy (GDP per Capita)',
    'Trust (Government Corruption)',
    'Generosity'
]]
scatter_matrix(sub_feliz)

iris.plot(kind="scatter", x="sepal_length", y="sepal_width")

# One colour per species, with a legend.
g = sns.FacetGrid(iris, hue='species', size=5)
g.map(plt.scatter, 'sepal_length', 'sepal_width').add_legend()
def ScatterPlot(self, data_frame):
    """Draw a scatter matrix of ``data_frame`` and save the current figure.

    The matrix uses KDE plots on the diagonal and fully opaque green
    markers; the figure is written to "ScaterCommon.png".

    Args:
        data_frame: passed straight to ``scatter_matrix``.
    """
    scatter_matrix(data_frame, diagonal='kde', color='green', alpha=1)
    plt.savefig("ScaterCommon.png")
line = line.replace("None", "null") fp.write("db.rssInfo.insert("+ line +")\n") pp.pprint(line) #print(row.inserted_id) except: print(i) i = i + 1 pass; fp.close() #or use sys.stdin if data is too large #can be added depending on the size of json file cursor = collection1.find() client.close() exit(0) #to get description and summary of data data = pd.DataFrame(list(collection1.find())) for x in data: #whatever computation that has to be done pass; print(data.describe()) data.plot() scatter_matrix(data, figsize = (10, 10)) plt.show()
def visual_correlations(data_frame):
    """Show a scatter matrix of median house value against median income.

    Args:
        data_frame: must support selecting the "median_house_value" and
            "median_income" columns with a list of labels.
    """
    scatter_matrix(data_frame[["median_house_value", "median_income"]], figsize=(12, 8))
    pyplot.show()
rfecv.fit(trainData, trainLabel)
print("Optimal number of features : %d" % rfecv.n_features_)

# Plotting features with cross validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()

# After an hour, the SVM model has been trained optimizing the features in the database. Using only these features
# will reduce the time of training of the model so used only 373 features instead of input of 561.
print('Accuracy of the SVM model on test data is ', rfecv.score(testData,testLabel) )

# Getting the best features: keep every test-data column whose entry in the
# support mask is truthy.
best_features = []
for ix,val in enumerate(rfecv.support_):
    if val:
        best_features.append(testData[:,ix])

# The above yields an accuracy of approximately 97%. Following helps in visualization.
# ``pandas.tools.plotting`` no longer exists; use ``pandas.plotting``.
from pandas.plotting import scatter_matrix

# The selected columns were collected as rows, so transpose them back.
visualize = pd.DataFrame(np.asarray(best_features).T)
print(visualize.shape)
scatter_matrix(visualize.iloc[:,0:5], alpha=0.2, figsize=(6, 6), diagonal='kde')
@author: abrown09 """ import pandas as pd import matplotlib.pyplot as plt from pandas.tools.plotting import scatter_matrix churn_data = pd.read_csv('/Users/amybrown/Thinkful/Capstone/Data/WA_Fn-UseC_-Telco-Customer-Churn.csv') # Data exploration churn_data.shape # check dimensions churn_data.dtypes print(churn_data.head(10)) # not sure what tenure vaar means-it must be the amount of time customer has been with company print(churn_data.describe()) # looks like numerical data is complete. senior citizen is not an age variable but a binary categorical variable # mean tenure is 32...i'm going to guess this is months and not years because that seems crazy categorical = churn_data.dtypes[churn_data.dtypes == 'object'].index print(categorical) churn_data[categorical].describe() # all data appear complete # I think total charges needs to be changed to float churn_data.hist(column='tenure', figsize=(9,6)) churn_data.hist(column='MonthlyCharges', figsize=(9,6)) scatter_matrix(churn_data, alpha=0.2, figsize=(6, 6), diagonal='kde') # should Yes and No responses be changed to 1s and 0s?
from sklearn.svm import SVC

# Importing the dataset
titanic_train = pd.read_csv('../input/train.csv')
titanic_test = pd.read_csv('../input/test.csv')
titanic_train.info()
titanic_test.info()
titanic_train.describe()


# In[ ]:

# plotting the scatter matrix first
import matplotlib.pyplot as plt
# ``pandas.tools.plotting`` no longer exists; use ``pandas.plotting``.
from pandas.plotting import scatter_matrix

scatter_matrix(titanic_train, figsize=(25, 25))
plt.show()


# In[ ]:

# dropping the columns which might not affect prediction
# (``axis=1`` is passed by keyword because the positional form is not
# accepted by current pandas)
titanic_train = titanic_train.drop(['PassengerId', 'Ticket'], axis=1)
titanic_test = titanic_test.drop(['Ticket'], axis=1)

# To convert Sex to category datatype
titanic_train['Sex'] = titanic_train['Sex'].astype('category')
titanic_test['Sex'] = titanic_test['Sex'].astype('category')

# drop cabin because of too many NaN values
titanic_train = titanic_train.drop(['Cabin'], axis=1)
titanic_test = titanic_test.drop(['Cabin'], axis=1)