def mem_svg(self, table, column, outfile):
    """Draw a bar chart of peak memory per program and save it to *outfile*.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain ``Dataset``, ``Threads`` and ``Program`` columns plus
        the column named by *column*.
    column : str
        Name of the column holding the memory measurement; it is divided by
        1,000,000 to obtain the plotted ``MemoryMB`` value.
    outfile : str
        Path handed to ``FacetGrid.savefig``.
    """
    import matplotlib
    # Select the non-interactive backend before pyplot is imported.
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import seaborn as sb

    sb.set(style="whitegrid")

    # Maximum of the measurement for every dataset/threads/program group.
    renamed = table.rename(columns={column: 'Memory'})
    grouped = renamed.groupby(['Dataset', 'Threads', 'Program'])
    svgdat = grouped.agg({'Memory': max}).reset_index()
    svgdat = svgdat.assign(MemoryMB=svgdat['Memory'] / 1000000)

    threads = svgdat.Threads.unique()
    single_thread_count = len(threads) == 1
    shared = dict(y='MemoryMB', col="Dataset", data=svgdat,
                  kind="bar", ci=None, sharey=True)
    if single_thread_count:
        # Only one thread count: put the programs on the x axis.
        plot = sb.factorplot(x='Program', **shared)
        plot = plot.set_titles('')
    else:
        plot = sb.factorplot(x='Threads', hue="Program", **shared)

    plot = plot.set_xlabels('Threads')
    plot = plot.set_ylabels('Memory (MB)')
    plot = plot.set_xticklabels(rotation=90)
    plot.fig.subplots_adjust(wspace=0.35)
    plot.savefig(outfile)
def pclass_analyze(is_plot=True):
    """Summarise survival by passenger class and one-hot encode ``Pclass``.

    Works on the module-level ``titanic_df`` and ``test_df`` frames and
    rebinds both: the ``Pclass`` column is dropped and replaced by
    ``Class_1`` / ``Class_2`` dummy columns (``Class_3`` is dropped).

    Parameters
    ----------
    is_plot : bool
        When true, also draw a factor plot of survival per class.
    """
    global titanic_df
    global test_df

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('-' * 40)
    first_class = titanic_df['Pclass'] == 1
    for survived in (1, 0):
        print(titanic_df['Pclass'][first_class & (titanic_df['Survived'] == survived)].count())
    for pclass in (1, 2, 3):
        print(titanic_df['Survived'][titanic_df['Pclass'] == pclass].describe())

    if is_plot:
        sns.factorplot('Pclass', 'Survived', order=[1, 2, 3], data=titanic_df, size=5)

    # Create the one-hot (dummy) encoded columns for Pclass.
    # NOTE(review): the fixed three column names assume all three classes
    # occur in each frame - confirm for small test sets.
    pclass_dummies_titanic = pd.get_dummies(titanic_df['Pclass'])
    pclass_dummies_titanic.columns = ['Class_1', 'Class_2', 'Class_3']
    pclass_dummies_titanic.drop(['Class_3'], axis=1, inplace=True)

    pclass_dummies_test = pd.get_dummies(test_df['Pclass'])
    pclass_dummies_test.columns = ['Class_1', 'Class_2', 'Class_3']
    pclass_dummies_test.drop(['Class_3'], axis=1, inplace=True)

    titanic_df.drop(['Pclass'], axis=1, inplace=True)
    test_df.drop(['Pclass'], axis=1, inplace=True)

    titanic_df = titanic_df.join(pclass_dummies_titanic)
    test_df = test_df.join(pclass_dummies_test)
def player_performance_plots(database, table, player_name):
    """Plot a player's streak-length distribution with an exponential fit.

    Parameters
    ----------
    database : str
        MySQL database name to connect to.
    table : str
        Table to read from. It is interpolated into the SQL text because
        identifiers cannot be bound as parameters, so it must come from
        trusted code, never from user input.
    player_name : str
        Value matched against the ``player`` column (bound as a parameter).

    Returns
    -------
    pandas.Series
        The value counts computed over the flattened streak frame.
    """
    # The player name is bound by the driver instead of being spliced into
    # the statement, which closes the SQL-injection hole.
    cmd_target = 'SELECT * FROM ' + table + ' WHERE player IN (%s);'
    conn = MySQLdb.connect(user="******", passwd="xxxx", db=database,
                           cursorclass=MySQLdb.cursors.DictCursor)
    try:
        player_frame = pd.read_sql(cmd_target, con=conn, params=(player_name,))
    finally:
        # Release the connection even when the query fails.
        conn.close()

    # Each row stores its streak list as a Python-literal string.
    player_values = player_frame['pos_streak_list'].values
    streaks = [ast.literal_eval(x) for x in player_values]
    streak_data = np.concatenate(streaks)

    df_streaks = pd.DataFrame(dict(streaks=range(len(streak_data)),
                                   streak_length=streak_data))
    streak_counts = pd.value_counts(df_streaks.values.ravel())

    # Fit only the first 15 entries of the counts.
    xData = streak_counts.index[:15]
    xData_1 = [value - 1 for value in xData]
    yData = streak_counts.values[:15]
    popt, _ = optimize.curve_fit(exp_func, xData, yData)
    yEXP = exp_func(xData, *popt)

    plt.figure()
    sns.factorplot("streak_length", data=df_streaks, kind="bar", palette="Blues",
                   size=6, aspect=2, legend_out=False)
    plt.plot(xData_1, yData, label='Data', marker='o')
    # One dashed red line; previously 'r-' and ls='--' contradicted each other.
    plt.plot(xData_1, yEXP, 'r--', label="Exp Fit")
    plt.legend()
    plt.show()
    return streak_counts
def draw_graphs(df, y, file_prefix, format="png"):
    """Save one bar-plot grid per request size and one per bandwidth.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``initcwnd``, ``bandwidth``, ``delay`` and the column named by
        the module-level ``SIZE`` constant, plus the column named by *y*.
    y : str
        Column plotted on the y axis.
    file_prefix : str
        Prefix of every generated file name.
    format : str
        File extension used for the saved figures.
    """
    # One grid per request size: bandwidth across columns, delay across rows.
    for request_size in df[SIZE].unique():
        subset = df[(df[SIZE] == request_size)]
        grid = sns.factorplot(x="initcwnd", y=y, col="bandwidth", row="delay",
                              data=subset, kind="bar", palette="muted")
        grid.set(xlim=(.5, None))
        grid.despine(left=True)
        name = "%s-request_size-%dkb.%s" % (file_prefix, request_size, format)
        print(name)
        grid.savefig(name, dpi=300)

    # One grid per bandwidth: delay across columns, request size across rows.
    for bandwidth in df.bandwidth.unique():
        subset = df[(df.bandwidth == bandwidth)]
        grid = sns.factorplot(x="initcwnd", y=y, col="delay", row=SIZE,
                              data=subset, kind="bar", palette="muted",
                              aspect=1.2)
        grid.set(ylim=(.5, None))
        grid.despine(left=True)
        name = "%s-bandwidth-%dmb.%s" % (file_prefix, bandwidth, format)
        print(name)
        grid.savefig(name, dpi=100)
def plotStats(frame):
    """Save the factor plots for the discovery trials as PNG files.

    *frame* is modified in place: its ``Capture_Count`` column is renamed to
    ``Discovery_Event_Count``. Rows containing missing values are excluded
    from every plot.
    """
    sns.set_style("ticks", {"xtick.minor.size": 12})
    sns.set_context('paper')
    frame.rename(columns={'Capture_Count': 'Discovery_Event_Count'}, inplace=True)

    factors = ['Adv Latency', 'Listener Latency', 'Adv Power', 'pos']
    variables = ['Time_to_Discovery', 'RSSI', 'Discovery_Event_Count', 'Percent_Captures']
    latency_levels = ['Low_Latency', 'Balanced', 'Low_Power']
    complete = frame.dropna()

    # Time to discovery per trial.
    sns.factorplot('Trial', y=variables[0], data=complete)
    plt.savefig('Trial' + ' vs. ' + variables[0] + '.png', dpi=600, bbox_inches='tight')

    # Every measured variable against advertiser power, faceted by both latencies.
    for variable in variables:
        sns.factorplot(factors[2], variable, col=factors[0], row=factors[1],
                       data=complete, margin_titles=True, size=3, aspect=.8,
                       x_order=['High', 'Medium', 'Low', 'Ultra_low'],
                       col_order=list(latency_levels),
                       row_order=list(latency_levels))
        plt.savefig('Trials' + ' vs. ' + variable + '.png', dpi=600, bbox_inches='tight')

    # RSSI against position, with vertical tick labels.
    grid = sns.factorplot(factors[3], variables[1], data=complete,
                          margin_titles=True, size=3, aspect=.8)
    grid.set_xticklabels(rotation=90)
    plt.savefig(factors[3] + ' vs. ' + variables[1] + '.png', dpi=600, bbox_inches='tight')
def behaviors_distribution(df, filepath):
    """Bar plot of logical networks per input-output behavior.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``known_eq`` column and may have an ``mse`` column. The
        caller's frame is left untouched.
    filepath : str
        Directory in which ``behaviors-distribution.pdf`` is written; nothing
        is saved when it is empty or None.

    Returns
    -------
    The grid object returned by ``sns.factorplot``.
    """
    cols = ["known_eq", "index"]
    rcols = ["Logical networks", "Input-Output behaviors"]
    sort_cols = ["known_eq"]

    # Work on a copy so the caller's mse column is not turned into strings.
    df = df.copy()
    if "mse" in df.columns:
        cols.append("mse")
        rcols.append("MSE")
        df["mse"] = df["mse"].map(lambda f: "%.4f" % f)
        # Sort on the numeric value of the rounded label: sorting the strings
        # themselves is lexicographic ("10.0000" < "2.0000"), while this keeps
        # rows sharing a label tied so known_eq still breaks the tie.
        df["_mse_sort"] = df["mse"].astype(float)
        sort_cols = ["_mse_sort"] + sort_cols

    df = df.sort_values(sort_cols).reset_index(drop=True).reset_index(level=0)[cols].copy()
    df["known_eq"] = df["known_eq"] + 1
    # NOTE(review): this shifts the row index, not the "index" column that is
    # plotted on x - confirm which was intended (the x ticks are hidden below,
    # so the plot looks the same either way).
    df.index = df.index + 1
    df.columns = rcols

    plot_kwargs = dict(x='Input-Output behaviors', y='Logical networks', data=df,
                       aspect=3, kind='bar', legend_out=False)
    if "MSE" in df.columns:
        plot_kwargs['hue'] = 'MSE'
    g = sns.factorplot(**plot_kwargs)

    g.ax.set_xticks([])
    if filepath:
        g.savefig(os.path.join(filepath, 'behaviors-distribution.pdf'))
    return g
def sim_analysis(self):
    """Simulate accuracies for the experimental plan and plot them.

    Builds a frame from ``self.exp_plan``, assigns every trial an accuracy
    from a logistic function of stimulus duration whose parameters depend on
    the object number and image number, prints the mean accuracy per
    condition split by ``qe``, and shows a point plot for the first two
    objects and first three images.
    """
    import matplotlib.pyplot as plt

    df = pandas.DataFrame(self.exp_plan)
    df.sort_values(by=['subjid', 'order', 'obj', 'imgno', 'stim_dur'],
                   inplace=True)

    # Logistic parameters (midpoint a, slope b, lapse lam) per (obj, imgno).
    fs = {}
    for obj, objno, imgno in df.groupby(['obj', 'objno', 'imgno']).groups.keys():
        a = .1
        b = .1 + objno / 10.
        lam = .05 + imgno / 100.
        fs[(obj, imgno)] = (a, b, lam)

    def accf(row):
        a, b, lam = fs[(row.obj, row.imgno)]
        x = row.stim_dur / 1000.
        return .5 + (.5 - lam) / (1 + np.exp(-(x - a) / b))

    # Item assignment creates the column when it does not exist yet;
    # attribute assignment (df.acc = ...) would only set an attribute.
    df['acc'] = df.apply(accf, axis=1).astype(float)

    print(df[df.qe == False].groupby(['obj', 'imgno', 'stim_dur']).acc.mean())
    print(df[df.qe == True].groupby(['obj', 'imgno', 'stim_dur']).acc.mean())

    sel = df.obj.isin(df.obj.unique()[:2]) & \
          df.imgno.isin(df.imgno.unique()[:3])
    sns.factorplot(x='stim_dur', y='acc', col='obj', row='imgno',
                   data=df[sel], kind='point')
    # pyplot is imported directly: sns.plt is not public seaborn API and is
    # missing from newer releases.
    plt.show()
def prediction_quality(datas, models, labels, points, runs=1):
    """Plot RMSE against training-set size for several data/model pairs.

    Parameters
    ----------
    datas, models : sequences of callables
        Each is called with ``None`` to build a data set / model.
    labels : sequence of str
        Label stored in the ``models`` column for the matching pair.
    points : int
        Number of evenly spaced training ratios, ending at 1.0.
    runs : int
        Number of seeds evaluated per pair and ratio.
    """
    filename = "../../data/matmat/2016-01-04/tmp2.data.pd"
    df = pd.DataFrame(columns=["~answers", "rmse", "models"])
    data_size = len(datas[0](None).get_dataframe_all())

    for step in range(1, points + 1):
        ratio = step / points
        print("Evaluation for {}% of data".format(ratio * 100))
        for data, model, label in zip(datas, models, labels):
            for seed in range(runs):
                # Materialise the filtered training split on disk, then
                # reload it as a stand-alone data set.
                source = data(None)
                source.set_seed(seed)
                source.set_train_size(ratio)
                source.filter_data(100, 0)
                source.get_dataframe_train().to_pickle(filename)

                d = Data(filename)
                m = model(None)
                Runner(d, m).run(force=True, only_train=True)
                report = Evaluator(d, m).get_report(force_evaluate=True, force_run=True)
                df.loc[len(df)] = (ratio * data_size, report["rmse"], label)

    print(df)
    sns.factorplot(x="~answers", y="rmse", hue="models", data=df)
def tm_gene_family_plot(tm_data, ordered_genomes, biotypes, gene_family_tgt):
    """transMap gene family collapse plots.

    Writes a multi-page PDF to *gene_family_tgt*: one page for all genes,
    then one page per biotype for which ``biotype_filter`` returns data.
    When there is no collapse data an empty target file is written instead.
    """
    collapse = 'Gene Family Collapse'
    try:
        df = json_biotype_nested_counter_to_df(tm_data, collapse)
    except ValueError:
        # no gene family collapse. probably the test set.
        with gene_family_tgt.open('w') as outf:
            pass
        return

    df[collapse] = pd.to_numeric(df[collapse])
    totals = df[[collapse, 'genome', 'count']].groupby(['genome', collapse])
    tot_df = totals.aggregate(sum).reset_index().sort_values(collapse)

    def add_page(pdf, data, genomes, title):
        # One bar-plot grid (a panel per genome) appended to the PDF.
        g = sns.factorplot(y='count', col='genome', x='Gene Family Collapse', data=data,
                           kind='bar', col_order=genomes, col_wrap=4)
        g.fig.suptitle(title)
        g.set_xlabels('Number of genes collapsed to one locus')
        g.set_ylabels('Number of genes')
        g.fig.subplots_adjust(top=0.9)
        multipage_close(pdf, tight_layout=False)

    with gene_family_tgt.open('w') as outf, PdfPages(outf) as pdf:
        add_page(pdf, tot_df, ordered_genomes,
                 'Number of genes collapsed during gene family collapse')
        for biotype in biotypes:
            biotype_df = biotype_filter(df, biotype)
            if biotype_df is None:
                continue
            biotype_df = biotype_df.sort_values('Gene Family Collapse')
            # Keep only the genomes that actually have rows for this biotype.
            present = set(biotype_df.genome)
            genomes = [x for x in ordered_genomes if x in present]
            add_page(pdf, biotype_df, genomes,
                     'Number of genes collapsed during gene family collapse for {}'.format(biotype))
def composition(graph):
    """Figure 4

    Trains noun-phrase -> verb edges on *graph*, then plots the edge weight
    from two noun phrases to each candidate word for two composition values
    and saves the figure to ``figs/composition.pdf``.
    """
    import matplotlib.pyplot as plt

    dets = [graph[w] for w in ['that', 'my']]
    nouns = [graph[w] for w in ['table', 'bunny']]
    verbs = [graph[w] for w in ['saw', 'ate']]
    noun_phrases = [graph.bind(d, n) for d in dets for n in nouns]

    # Train (NP -> verb) pairs
    for NP in noun_phrases:
        graph.add(NP)
        for verb in verbs:
            NP.bump_edge(verb, factor=5)

    # 'Jack' is still looked up as before, but its node is not used below.
    the, boy, saw, ate, _jack = map(graph.get, ('the', 'boy', 'saw', 'ate', 'Jack'))
    that_table = graph.get('[that table]')

    data = [{'composition': str(comp),
             'noun phrase': str(NP),
             'verb': str(verb),
             'edge weight': NP.edge_weight(verb)}
            for comp in (0, 0.5)
            for NP in [that_table, graph.bind(the, boy, composition=comp)]
            for verb in [saw, ate, the, boy]]  # include bad verbs (the, boy)

    df = pd.DataFrame(data)
    sns.factorplot('verb', 'edge weight', hue='noun phrase', col='composition',
                   data=df, kind='bar').despine(left=True)
    # pyplot is imported directly: sns.plt is not public seaborn API and is
    # missing from newer releases.
    plt.savefig('figs/composition.pdf')
    print('created figs/composition.pdf')
def fishers_exact_plot(data, condition1, condition2):
    """
    Plot two binary columns against each other and run Fisher's exact test

    Parameters
    ----------
    data: Pandas dataframe
        Dataframe to retrieve information from
    condition1: str
        First binary column to compare
    condition2: str
        Second binary column to compare

    Returns
    -------
    tuple
        ``(oddsratio, pvalue)`` as returned by ``fisher_exact`` on the
        cross-tabulation of the two columns.
    """
    sb.factorplot(x=condition1, y=condition2, kind='bar', data=data)

    # The contingency table is printed and is also the input of the test.
    contingency = pd.crosstab(data[condition1], data[condition2])
    print(contingency)

    oddsratio, pvalue = fisher_exact(contingency)
    print("Fisher's Exact Test: OR: {}, p-value={}".format(oddsratio, pvalue))
    return (oddsratio, pvalue)
def denovo_plot(consensus_data, ordered_genomes, denovo_tgt):
    """Plot how de-novo predictions were incorporated, as a one-page PDF.

    With more than one genome the genomes are placed on the x axis in the
    order given by *ordered_genomes*, with one panel per result; with a
    single genome the results themselves are on the x axis. Bars are split
    by Augustus mode when exactly two modes are present.
    """
    with denovo_tgt.open('w') as outf, PdfPages(outf) as pdf:
        try:
            df = json_biotype_nested_counter_to_df(consensus_data, 'denovo')
        except ValueError:
            # No de novo results. Probably the test set.
            return
        # fix column names because json_biotype_nested_counter_to_df makes assumptions
        df.columns = ['Result', 'Number of transcripts', 'Augustus mode', 'genome']

        plot_kwargs = dict(data=df, y='Number of transcripts', kind='bar')
        has_pb = len(set(df['Augustus mode'])) == 2
        if has_pb:
            plot_kwargs['hue'] = 'Augustus mode'

        genomes = set(df.genome)
        if len(genomes) > 1:  # if we ran in PB only, we may not have multiple genomes
            # The genomes are on the x axis, so their ordering belongs in
            # `order`; the former `row_order` had no effect because there is
            # no row facet. Genomes without data are left out.
            plot_kwargs.update(x='genome', col='Result', col_wrap=2,
                               order=[g for g in ordered_genomes if g in genomes],
                               sharex=True, sharey=False)
        else:
            plot_kwargs['x'] = 'Result'

        ax = sns.factorplot(**plot_kwargs)
        ax.set_xticklabels(rotation=90)
        ax.fig.suptitle('Incorporation of de-novo predictions')
        ax.fig.subplots_adjust(top=0.9)
        multipage_close(pdf, tight_layout=False)
def accuracy(self):
    """Plot model accuracy per dataset and return the plotted frame.

    Returns
    -------
    The frame from ``self._acc()`` restricted to the rows where ``sel`` is
    true.
    """
    import matplotlib.pyplot as plt

    df = self._acc()
    df = df[df.sel]
    sns.factorplot(x='dataset', y='model_accuracy', hue='model',
                   data=df, kind='bar', color=self.myexp.colors['shape'])
    # pyplot is imported directly: sns.plt is not public seaborn API and is
    # missing from newer releases.
    plt.ylim([0, 1])
    self._plot_behav()
    base.show(pref='acc', exp=self.myexp.exp, suffix='all_acc',
              savefig=self.myexp.savefig, html=self.myexp.html)
    return df
def factorplots(wine_set):
    """Show a strip plot of alcohol level for each wine quality level.

    The title says "red" when *wine_set* equals the module-level ``red``
    frame and "white" otherwise.
    """
    if wine_set.equals(red):
        title = "Alcohol percent in each level of red wine's quality"
    else:
        title = "Alcohol percent in each level of white wine's quality"

    seaborn.factorplot(x="quality", y="alcohol", data=wine_set, kind="strip")
    plt.xlabel("Quality level of wine, 0-10 scale")
    plt.ylabel("Alcohol level in wine, % ABV")
    plt.title(title)
    plt.show()
def plot_data(df):
    """Show one plot of soil moisture readings per station.

    The frame is melted so that every column other than the identifier
    columns becomes a ``DEPTH`` / ``READING`` pair, then each station in
    ``NAME`` gets its own figure with one line per depth.
    """
    import matplotlib.pyplot as plt

    # melt the sensor columns into a reading column and a depth column
    df = pd.melt(df, id_vars=["NAME", "FID", "SID", "DTG", "DATE", "TEMP"],
                 var_name="DEPTH", value_name="READING")

    for station in df["NAME"].unique():
        sns.factorplot("DATE", "READING", hue="DEPTH", data=df[df["NAME"] == station])
        # pyplot is imported directly: sns.plt is not public seaborn API and
        # is missing from newer releases.
        plt.title(station)
        plt.show()
def plotsbarra(exames=("BLD", "BIL", "UBG", "KET", "GLU", "PRO", "NIT", "LEU", "PH")):
    """Save a count plot of measurements, split by hour, for each exam.

    Parameters
    ----------
    exames : iterable of str
        Exam codes to plot. The default is a tuple rather than a list so the
        default object cannot be mutated between calls.

    For every exam the rows of the module-level ``dfcatemelt`` frame are
    selected and ``barplot<EXAM>.png`` is written. The "neg" category, when
    present, is always drawn first.
    """
    for i in exames:
        filename = "barplot" + i + ".png"
        # Filter once and reuse the subset for both the ordering and the plot.
        exam_rows = dfcatemelt[dfcatemelt.EXAME == i]
        ordem = sorted(exam_rows.MEDIDA.unique())
        if "neg" in ordem:
            ordem.remove("neg")
            ordem.insert(0, "neg")
        print(":: para", i)
        sns.factorplot(x="MEDIDA", kind="count", hue="HORA", data=exam_rows, order=ordem)
        plt.savefig(filename)
def game_performance_plots(player_name, sub_frame, N, save, file_name):
    """Bar-plot a player's recent-performance score for every game event.

    Parameters
    ----------
    player_name : str
        Player passed to ``frame_from_player`` and ``recent_performance``.
    sub_frame : pandas.DataFrame
        Frame from which the player's rows are extracted.
    N : int
        Passed through to ``recent_performance``.
    save : str
        The figure is written to *file_name* only when this equals 'save'.
    file_name : str
        Output path for the figure.
    """
    player_frame = frame_from_player(player_name, sub_frame)
    # The second element of each recent_performance result is plotted.
    perf = [
        recent_performance(player_name, player_frame, N, row_index, stats_weights)[1]
        for row_index in player_frame.index
    ]
    df = pd.DataFrame(dict(game_event=range(len(perf)), performance=perf))

    sns.factorplot("game_event", "performance", data=df, kind="bar",
                   palette="Blues", size=6, aspect=2, legend_out=False)
    if save == 'save':
        plt.savefig(file_name, dpi=200)
def createScanOrderBarPlot(GroupDF, goodsubj, BV=False, ax=None, savefig=True):
    """Plot model correlations by feedback condition, split by scan order.

    Parameters
    ----------
    GroupDF : pandas.DataFrame
        Needs ``Subject_ID``, ``FB``, ``modelcorr`` and ``scanorder`` columns
        (and ``Subject`` when *BV* is true).
    goodsubj : list
        Subject IDs to include.
    BV : bool
        True draws a bar plot, False a split violin plot.
    ax : matplotlib Axes, optional
        Axes for the violin plot. When omitted (or when a list is passed, the
        old sentinel) a new figure is created and its axes are used.
    savefig : bool
        When true, save the figure under the module-level
        ``saveFigureLocation``.
    """
    subset = GroupDF[GroupDF.Subject_ID.isin(goodsubj)]

    # The old default was ax=[], and that list was passed on to violinplot as
    # if it were an Axes. Resolve the sentinel to real axes here.
    if ax is None or isinstance(ax, list):
        plt.figure()
        ax = plt.gca()

    if BV:
        sns.factorplot(data=subset, x='FB', y='modelcorr', hue='scanorder',
                       kind='bar', units='Subject', ci=68)
    else:
        sns.violinplot(data=subset, x='FB', y='modelcorr', hue='scanorder',
                       split=True, bw=.4, inner='quartile', ax=ax, color='w')

    if savefig:
        plt.savefig('%s/ScanOrder_ModelCorrelations.pdf' % saveFigureLocation, dpi=600)
def errors(self):
    """Bar-plot the error counts per dimension for every model depth.

    Runs ``self.myexp.errors()`` once per ``(depth, model)`` pair in
    ``self.myexp.models``, stacks the rows with a leading ``depth`` column
    and shows the result through ``self.show``.
    """
    rows = []
    columns = None
    for depth, model in self.myexp.models:
        self.myexp.set_model(model)
        e = self.myexp.errors()
        # Column names come from the most recent result, as before.
        columns = ['depth'] + e.columns.values.tolist()
        for _, r in e.iterrows():
            rows.append([depth] + r.values.tolist())

    # With no models this builds an empty frame; previously the function
    # stopped with a NameError on `e`.
    df = pandas.DataFrame(rows, columns=columns)
    sns.factorplot(x='dimension', y='count', data=df, hue='depth', kind='bar')
    self.show(pref='errors', suffix='all')
def plot_fit(data, model, x_cols, nonzero_only=True, title='', binary=False):
    """Draw a horizontal bar plot of a fitted model's coefficients.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column names label the coefficients.
    model : object
        Fitted estimator exposing ``coef_``.
    x_cols : iterable of int
        Positions in ``data.columns`` of the features, in coefficient order.
    nonzero_only : bool
        Hide coefficients that are exactly zero.
    title : str
        Plot title.
    binary : bool
        When true the first row of ``coef_`` is used.
    """
    sns.set_style('darkgrid')

    coefs = model.coef_[0] if binary else model.coef_
    column_names = data.columns.tolist()
    results = pd.DataFrame({
        'param': [column_names[position] for position in x_cols],
        'value': coefs,
    })
    if nonzero_only:
        results = results.loc[results.value != 0]

    sns.factorplot('value', 'param', kind="bar", data=results,
                   size=10, aspect=.7)
    plt.title(title)
def corr_models(mods1_dis, mods2_dis):
    """Correlate every matrix in *mods1_dis* with every matrix in *mods2_dis*.

    Only the entries above the diagonal are compared. Both arguments map a
    label to a square array; arrays that are compared must have the same
    shape.

    Returns
    -------
    The frame of (perception, models, correlation) rows after it has been
    passed through ``stats.factorize``; the same frame is shown as a bar plot.
    """
    rows = []
    for label1, matrix1 in mods1_dis.items():
        upper = np.triu_indices(matrix1.shape[0], k=1)
        for label2, matrix2 in mods2_dis.items():
            corr = np.corrcoef(matrix1[upper], matrix2[upper])[0, 1]
            rows.append([label1, label2, corr])

    df = pandas.DataFrame(rows, columns=['perception', 'models', 'correlation'])
    df = stats.factorize(df)
    sns.factorplot('perception', 'correlation', 'models', data=df, kind='bar')
    return df
def bivariate_bar_plot(df, dep_var, indep_var, grpby, units):
    """Save a bar plot of *dep_var* against *indep_var* as a PNG.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    dep_var : str
        Column plotted on the y axis.
    indep_var : str
        Column plotted on the x axis.
    grpby : str or None
        Optional column used as hue; no hue is used when it is falsy.
    units : object
        Not used by this function; kept for interface compatibility.

    The file is written under the module-level ``wd`` prefix and the figure
    is closed afterwards.
    """
    plot_kwargs = dict(x=indep_var, y=dep_var, data=df, kind="bar", ci=None)
    if grpby:
        # grpby is used to add hue to the graph
        plot_kwargs['hue'] = grpby
    seaborn.factorplot(**plot_kwargs)

    # would be great to figure out how to remove '_cat'
    plt.xlabel(indep_var)
    plt.ylabel(dep_var)
    plt.title(dep_var + " by " + indep_var)
    plt.savefig(wd + "Bivariate Plot of " + dep_var + "_vs_" + indep_var + '.png')
    # The call parentheses were missing, so the figure was never closed.
    plt.close()
def plot_modified_data(data_dict_paths_correct, data_dict_paths_wrong, modified_correct_plotting_dict,
                       modified_wrong_plotting_dict, plot_savefile, mode):
    """Plot original and modified LSTM results per layer and activation.

    Four pickled dictionaries are loaded, each is tagged with its prediction
    group, and they are merged key by key into one frame. The ``mean_score``
    entries are kept aside and printed; everything else is plotted against
    the timesteps and saved as ``<plot_savefile>.png``.

    Parameters
    ----------
    data_dict_paths_correct, data_dict_paths_wrong : str
        Pickle files for the unmodified correct / wrong predictions.
    modified_correct_plotting_dict, modified_wrong_plotting_dict : str
        Pickle files for the modified correct / wrong predictions.
    plot_savefile : str
        Output path without the ``.png`` extension.
    mode : str
        'regression' plots squared error; any other value plots accuracy.
    """
    def load(path):
        # NOTE: pickle.load executes arbitrary code; only load trusted files.
        with open(path, 'rb') as f:
            return pickle.load(f)

    data_correct = load(data_dict_paths_correct)
    data_wrong = load(data_dict_paths_wrong)
    modified_data_correct = load(modified_correct_plotting_dict)
    modified_data_wrong = load(modified_wrong_plotting_dict)

    # Tag every score with the group it came from.
    tagged = ((data_correct, 'correct'),
              (data_wrong, 'wrong'),
              (modified_data_correct, 'modified_correct'),
              (modified_data_wrong, 'modified_wrong'))
    for source, tag in tagged:
        source['lstm_pred'] = np.array([tag] * len(source['scores']))

    data_merged = defaultdict(list)
    for k, v in chain(data_correct.items(), data_wrong.items(), modified_data_correct.items(),
                      modified_data_wrong.items()):
        data_merged[k].append(v)

    overall_scores = data_merged['mean_score']

    # Every key except the summary score, with its per-source lists flattened.
    data_merged_new = defaultdict(list)
    for key in data_merged.keys():
        if str(key) != 'mean_score':
            data_merged_new[key] = [item for sublist in data_merged[key] for item in sublist]

    print(data_merged_new.keys())
    data = pd.DataFrame(data_merged_new, columns=list(data_merged_new.keys()))

    regression = mode == 'regression'
    y_name = 'squared error' if regression else 'accuracy'

    sns.set(font_scale=1.5)
    data.columns = [y_name, 'timesteps', 'activation', 'layer', 'lstm prediction']
    plot = sns.factorplot(x="timesteps", y=y_name, hue="lstm prediction",
                          palette={'correct': 'darkgreen', 'modified_correct': 'lightgreen',
                                   'wrong': 'crimson', 'modified_wrong': 'salmon'},
                          row='layer', col="activation", legend_out=True, data=data)
    if regression:
        print('MSE from top left to bottom right:', overall_scores)
    else:
        print('Mean accuracies correct:', overall_scores[0])
        print('Mean accuracies false:', overall_scores[1])

    plot.savefig(plot_savefile + '.png')
def plot_modalities_bars(self, sample_ids=None, feature_ids=None,
                         data=None, groupby=None,
                         phenotype_to_color=None,
                         bootstrapped=False, bootstrapped_kws=None):
    """Plot how many features were assigned to each modality

    Parameters
    ----------
    sample_ids : None or list of str
        Which samples to use. If None, use all
    feature_ids : None or list of str
        Which features to use. If None, use all
    data : None or pandas.DataFrame
        If given, modalities are calculated on this data and ``sample_ids``
        and ``feature_ids`` are ignored
    groupby : object
        Passed through to ``self.modalities``
    phenotype_to_color : object
        Currently unused by this method
    bootstrapped : bool
        Whether or not to use bootstrapping, i.e. resample each splicing
        event several times to get a better estimate of its true modality.
        Default False.
    bootstrapped_kws : dict
        Passed through to ``self.modalities``. According to the original
        documentation these are arguments of ``_bootstrapped_fit_transform``,
        defaulting to dict(n_iter=100, thresh=0.6, minimum_samples=10) when
        None - TODO confirm against that method.
    """
    if data is not None:
        assignments = self.modalities(data=data, groupby=groupby,
                                      bootstrapped=bootstrapped,
                                      bootstrapped_kws=bootstrapped_kws)
    else:
        assignments = self.modalities(sample_ids, feature_ids,
                                      groupby=groupby,
                                      bootstrapped=bootstrapped,
                                      bootstrapped_kws=bootstrapped_kws)

    # make sure this is always a dataframe
    if isinstance(assignments, pd.Series):
        # The index has to be a collection: a one-row frame needs a
        # one-element list, not the bare name.
        assignments = pd.DataFrame([assignments.values],
                                   index=[assignments.name],
                                   columns=assignments.index)

    x_order = self.modalities_visualizer.modalities_order
    id_vars = list(self.data.columns.names)
    df = pd.melt(assignments.T.reset_index(),
                 value_vars=assignments.index.tolist(),
                 id_vars=id_vars)
    sns.factorplot('value', hue=assignments.index.name, data=df,
                   x_order=x_order)
def behav_amir(self):
    """Show response times from ``amir_2012.csv`` per dimension and variant.

    Only the '3d' version is used, with subjects 'KA11' and 'JJ', run 15,
    conditions 31 and 34, and every trial whose ``acc`` is not 100 excluded.
    """
    import matplotlib.pyplot as plt

    df = pandas.read_csv('amir_2012.csv')
    df = df[df.version == '3d']
    df = df[~df.subjid.isin(['KA11', 'JJ'])]
    df = df[df.run != 15]
    df = df[~df.cond.isin([31, 34])]
    df = df[df.acc == 100]

    agg = stats.aggregate(df, groupby=['dimension', 'variant', 'version', 'subjid'])
    sns.factorplot(x='version', y='rt', hue='variant', col='dimension',
                   data=agg, kind='bar', col_wrap=3)
    # pyplot is imported directly: sns.plt is not public seaborn API and is
    # missing from newer releases.
    plt.show()
def plot_multi_bars_with_sns():
    '''Demo: grouped bar plot of random values for ten labels in three groups.'''
    N = 10
    GROUP_N = 3

    label_names = ['label%d' % index for index in range(N)]
    values = np.random.rand(GROUP_N, 10).ravel()
    frame = pd.DataFrame({
        'data': values,
        'label': label_names * GROUP_N,
        'group': ['g1', 'g2', 'g3'] * N,
    })

    # hue splits every x value into one bar per group.
    sns.factorplot(data=frame, x='label', y='data', hue='group', kind='bar')
    plt.xticks(rotation='vertical')
    plt.show()
def plot_scores_RF(scores, path):
    """Plot mean score against ``n_estimators`` for a random-forest grid.

    Parameters
    ----------
    scores : pandas.DataFrame
        Needs ``n_estimators``, ``mean``, ``max_depths`` and ``max_features``
        columns.
    path : str or None
        Directory for the PNG, whose name encodes the min/max of each
        hyper-parameter. Nothing is saved when it is None.
    """
    sn.set_style("whitegrid")
    sn.factorplot(x="n_estimators", y="mean", hue="max_depths", col="max_features",
                  data=scores, palette="BuGn_r", col_wrap = 3)
    if path is None:
        return

    # min and max of each hyper-parameter, in file-name order
    bounds = []
    for column in (scores.max_features, scores.max_depths, scores.n_estimators):
        bounds.extend([column.min(), column.max()])
    file_name = 'max_features_{:.0f}-{:.0f}_max_depts_{:.0f}-{:.0f}_n_estimators_{:.0f}-{:.0f}'.format(
        *bounds)+'.png'
    plt.savefig(os.path.join(path, file_name))
def class_SHM(self):
    """Box-plot the per-sample ``sh_dict`` values by group and antibody class.

    Stores the long-format frame (one row per value) on
    ``self.SHM_DF_split`` before plotting it.
    """
    SHM_columns = ['Antibody', 'Group', 'Sample', 'Data']
    antibody_class_list = ['all classes', 'IGHM', 'IGHG', 'IGHA', 'IGHE', 'IGHD']

    # One row per value: antibody class, group name, sample, value.
    ls_split = [
        [key, group['name'], sample, value]
        for key in antibody_class_list
        for group in self.groups
        for sample in group['samples']
        for value in group['sample data'][sample]['sh_dict'][key]
    ]

    self.SHM_DF_split = pd.DataFrame(ls_split, columns = SHM_columns)
    sns.factorplot('Group', 'Data', data=self.SHM_DF_split, hue='Antibody',
                   kind='box', size = 8, aspect=2)
def showMetric(metric_name, m_table, title, output_fn = None):
    """Bar-plot one metric per specimen, save the figure and show it.

    Parameters
    ----------
    metric_name : str
        Column of *m_table* plotted on the y axis.
    m_table : pandas.DataFrame
        Needs a ``specimen_id`` column and the metric column.
    title : str
        Plot title; also part of the default file name.
    output_fn : str, optional
        Output path. Defaults to ``<data_DIR>/<metric_name><title>.pdf``
        using the module-level ``data_DIR``.
    """
    if output_fn is None:
        output_fn = data_DIR+'/'+metric_name+title+'.pdf'

    pl.figure()
    sns.factorplot(x='specimen_id', y=metric_name, data=m_table, kind="bar")

    # Tiny, vertical tick labels so many specimen ids fit on the axis.
    pl.xticks(fontsize=3)
    pl.title(title)
    pl.xticks(rotation=90)

    pl.savefig(output_fn)
    pl.show()
def plot(self):
    """Plot the mean spike phase per fish and print a one-way ANOVA table.

    Shows that the mean phase of spikes is fish dependent: fits
    ``cmean ~ C(fish_id)``, prints its ANOVA table, then draws a bar plot of
    the mean phase per fish and a pair plot of mean phase and jitter.
    """
    df = pd.DataFrame(self.fetch())
    # EOD frequency: reciprocal of the median interval between EOD times.
    df['eod'] = [1 / np.median(np.diff(e)) for e in df.eod_times]
    df['cmean'] = [circ.mean(e) for e in df.phases]
    df['jitter'] = [circ.std(ph) / 2 / np.pi / e for ph, e in zip(df.phases, df.eod)]

    model = ols('cmean ~ C(fish_id)', data=df).fit()
    table = sm.stats.anova_lm(model)
    print(table)

    sns.factorplot('fish_id', 'cmean', data=df, kind='bar')
    # .loc replaces the deprecated .ix indexer, which modern pandas no
    # longer provides.
    sns.pairplot(df.loc[:, ['cmean', 'jitter', 'fish_id']], hue='fish_id')
    plt.show()
# select observations containing more than 2 outliers outlier_indices = Counter(outlier_indices) multiple_outliers = list( k for k, v in outlier_indices.items() if v > n ) return multiple_outliers # detect outliers from Age, SibSp , Parch and Fare Outliers_to_drop = detect_outliers(train,2,["Age","SibSp","Parch","Fare"]) train = train.drop(Outliers_to_drop, axis = 0).reset_index(drop=True) train_len=len(train) dataset=pd.concat(objs=[train,test],axis=0).reset_index(drop=True) dataset=dataset.fillna(np.nan) sns.heatmap(train[['Survived','SibSp','Parch','Age','Fare']].corr(),annot=True,fmt='.2f',cmap='coolwarm') g=sns.factorplot(x='SibSp',y='Survived',data=train,kind='bar',size=6,palette='muted') g.despine(left=True) g=g.set_ylabels("Survival Probability") g=sns.FacetGrid(train,col='Survived') g=g.map(sns.distplot,'Age') dataset['Fare']=dataset['Fare'].fillna(dataset['Fare'].median()) g = sns.distplot(dataset["Fare"], color="m", label="Skewness : %.2f"%(dataset["Fare"].skew())) g = g.legend(loc="best") #applying Log dataset['Fare']=dataset['Fare'].map(lambda i : np.log(i) if i>0 else 0) #filling Embarked dataset['Embarked']=dataset['Embarked'].fillna('S')
# All categorical variables contains NAN whereas continuous ones have 0. # So that means there is no basement for those houses. # we can replace it with 'None'. # In[ ]: for col in basement_cols: if 'FinSF' not in col: houses[col] = houses[col].fillna('None') # *Fireplaces* # ------------ # In[ ]: sns.factorplot("Fireplaces", "SalePrice", data=houses, hue="FireplaceQu") # Having 2 fireplaces increases house price and fireplace of Excellent quality is a big plus. # In[ ]: #If fireplace quality is missing that means that house doesn't have a fireplace houses["FireplaceQu"] = houses["FireplaceQu"].fillna('None') pd.crosstab(houses.Fireplaces, houses.FireplaceQu) # *Garages* # --------- # In[ ]: sns.distplot(houses["GarageArea"], color='r', kde=False)
X, y, Z, y2 = preprocessing(df_train_data)
print(X.columns)
for part in (X, y, y2, Z):
    print(part.shape)
# In[4]:
# Compare the data visually with violin plots: one figure per feature,
# always against marital status.
for feature in ('age', 'job', 'education', 'default',
                'housing', 'loan', 'contact', 'month'):
    sns.factorplot(x='marital', y=feature, data=Z, kind='violin', aspect=2)
    plt.show()
print("RUTA : {}".format(ruta)) # imprimir rtts relativos de ruta ultimo_rtt = 0 ruta_final = [] for ip, rtt in ruta: ruta_final.append((ip, rtt, rtt - ultimo_rtt)) ultimo_rtt = rtt ruta_rtts_relativos = [(ip, rel_rtt) for (ip, rtt, rel_rtt) in ruta_final] print("ruta_rtts_relativos : {}".format(ruta_rtts_relativos)) df = pd.DataFrame(ruta_rtts_relativos, columns=['IP', 'RTT']) sns.set(font_scale=1.5) ax = sns.factorplot(x='IP', y='RTT', data=df, aspect=1.5) ax.set(xlabel='IPs con más apariciones por salto', ylabel='RTT relativo medio (ms)') ax.set_xticklabels(rotation=90) ax.fig.suptitle('RTT medio para cada salto') #plt.tight_layout() if target: ax.fig.set_size_inches(24, 6) ax.savefig("../img/" + target + "-rtts.pdf") else: plt.show() ax.fig.clear() # imprimir incremento de rtts de ruta
sns.countplot(x='Sex', data=titanic_train) #histogram to undertand continuous feature #x: bins of continuous data, y: frequency #issue: how do you select number of bins? sns.distplot(titanic_train['Fare'], kde=False) sns.distplot(titanic_train['Fare'], kde=True) sns.distplot(titanic_train['Fare'], bins=20, rug=False, kde=False) sns.distplot(titanic_train['Fare'], bins=20) sns.distplot(titanic_train['Fare'], bins=100, kde=False) #density plot to understand continuous feature #it doesnt require bins argument #x: fare y:density sns.distplot(titanic_train['Fare'], hist=False) sns.distplot(titanic_train['Fare']) #box-whisker plot to understand continuous feature sns.boxplot(x='Fare', data=titanic_train) titanic_test = pd.read_csv( "D:\\Data Science\\Code Exec\\Data\\titanic_test.csv") print(titanic_test.shape) titanic_test['Survived'] = 0 titanic_test['test'] = 23 titanic_test.loc[titanic_test.Sex == 'female', ['Sex', 'Survived']] titanic_test.to_csv("D:/Data Science/submission1.csv", columns=['PassengerId', 'Survived', 'test'], index=False) #bi variate plots sns.factorplot(x='Sex')
def survival_rate(feature):
    """Plot the mean of ``Survived`` for every value of *feature*.

    Reads the module-level ``train`` frame; the values of *feature* are
    plotted in ascending order.
    """
    by_feature = train[[feature, 'Survived']].groupby([feature], as_index=False)
    rate = by_feature.mean().sort_values(by=[feature], ascending=True)
    sns.factorplot(x=feature, y="Survived", data=rate)
def show_bikeshare_charts(city_data):
    """Show the user-type, weekday and (if available) gender charts for a city.

    Parameters
    ----------
    city_data : pandas.DataFrame
        Needs ``User Type`` and ``Start Day`` columns; the gender chart is
        drawn only when a ``Gender`` column exists.
    """
    def _label_bars():
        # Hide the y axis and write each bar's count above the bar instead.
        ax = plt.gca()
        ax.axes.get_yaxis().set_visible(False)
        for p in ax.patches:
            ax.text(p.get_x() + p.get_width() / 2., p.get_height(),
                    '%d' % int(p.get_height()),
                    fontsize=12, ha='center', va='bottom')

    # Bar chart for the User Type of the city
    sns.factorplot('User Type', data=city_data, kind='count')
    plt.xlabel('User Type', fontsize=14)
    plt.title("Customer Vs Subscriber Data", fontsize=16)
    _label_bars()
    plt.show()

    # Pie chart for User Type of the city.
    # The format was '%1.1%%f', which renders a stray leading '%' and an
    # unrounded float; '%1.1f%%' gives the intended "72.3%" style label.
    city_data['User Type'].value_counts().plot(kind='pie', autopct='%1.1f%%')
    plt.axis('equal')
    plt.title('User Type', fontsize=16)

    # Ridership for the Days of the Week
    sns.factorplot('Start Day', data=city_data, kind='count',
                   order=['Sunday', 'Monday', 'Tuesday', 'Wednesday',
                          'Thursday', 'Friday', 'Saturday'],
                   size=8)
    plt.xlabel('Weekday', fontsize=14)
    plt.title("Ridership for the Days of the Week", fontsize=16)
    _label_bars()
    plt.show()

    # Gender chart, only for the cities which have Gender data
    if 'Gender' in city_data:
        sns.factorplot('Gender', data=city_data, kind='count', size=8)
        plt.xlabel('Gender', fontsize=14)
        plt.title("Bike Share Gender Data", fontsize=16)
        _label_bars()
        plt.show()
# make nomprov categorical, with categories in the order they first appear in df
incomes.nomprov = incomes.nomprov.astype("category", categories=[i for i in df.nomprov.unique()],ordered=False)
incomes.head()
# In[ ]:
# median income per province, drawn as unconnected points in a custom colour theme
with sns.axes_style({ "axes.facecolor": "#ffc400", "axes.grid" : False, "figure.facecolor": "#c60b1e"}):
    h = sns.factorplot(data=incomes, x="nomprov", y=("renta","MedianIncome"), order=(i for i in incomes.nomprov), size=6, aspect=1.5, scale=1.0, color="#c60b1e", linestyles="None")
plt.xticks(rotation=90)
plt.tick_params(labelsize=16,labelcolor="#ffc400")
plt.ylabel("Median Income",size=32,color="#ffc400")
plt.xlabel("City",size=32,color="#ffc400")
plt.title("Income Distribution by City",size=40,color="#ffc400")
plt.ylim(0,180000)
plt.yticks(range(0,180000,40000))
# There's a lot of variation, so assigning missing incomes by province looks like a good idea.
# First group the data by city and reduce it to the median. This intermediate data frame is then
# joined with the original city names to expand the aggregated median incomes, ordered so that
# there is a 1-to-1 mapping between the rows, and finally the missing values are replaced.
def _call_accuracy_plot_entry(name, caption, label, image, uid):
    """Build the report entry dict that describes one saved plot image."""
    heading = "{0} - {1}".format(name, label)
    return {
        "caption": caption,
        "image": image,
        "tags": [],
        "id": heading,
        "title": heading,
        "uid": uid
    }


def CallAccuracyPlots(name, data):
    """Save box, density and histogram plots of adapter call accuracy.

    Args:
        name: prefix used for the PNG file names and for the plot
            ids/titles.
        data: mapping of adapter type -> sequence of records. For each
            record, element ``[2]`` is used as the call accuracy (records
            whose accuracy is not > 0.0 are dropped) and element ``[3]`` is
            tested for truth to label the record "TruePos" or "FalsePos".

    Returns:
        A list of three dicts (box plot, density plot, histogram), each with
        the keys "caption", "image", "tags", "id", "title" and "uid".

    Raises:
        ValueError: from ``pd.concat`` when ``data`` has no entries.
    """
    frames = []
    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for adp, vals in data.items():
        filtered = [v for v in vals if v[2] > 0.0]
        TP = [v[2] for v in filtered if v[3]]
        FP = [v[2] for v in filtered if not v[3]]
        classes = (["TruePos"] * len(TP)) + (["FalsePos"] * len(FP))
        frames.append(pd.DataFrame({
            "AdapterType": adp,
            "AdapterClass": pd.Series(classes),
            "CallAccuracy": pd.Series(TP + FP)
        }))
    # One concat instead of repeated DataFrame.append; ignore_index avoids
    # the duplicate index labels that per-adapter frames would otherwise
    # carry into the combined frame.
    df = pd.concat(frames, ignore_index=True)

    # --- box plot ---
    box = sns.factorplot(x="AdapterType", y="CallAccuracy",
                         hue="AdapterClass", kind="box", data=df)
    plt.subplots_adjust(top=0.9)
    box.fig.suptitle("Adapter Call Accuracy by Type and Classification")
    plt.ylim(0.4, 1.05)
    pltFilename = "{0}_call_accuracy_box.png".format(name)
    plt.savefig(pltFilename)
    plt.close()
    p1 = _call_accuracy_plot_entry(
        name,
        "Adapter Call Accuracy Box Plots For Adapter Types",
        "Adapter Call Accuracy Box Plots",
        pltFilename,
        "0500001")

    # Both remaining plots use the same long-form frame; build it once.
    melted = pd.melt(df, id_vars=['AdapterType', 'AdapterClass'],
                     value_vars=['CallAccuracy'])

    # --- density plot ---
    g = sns.FacetGrid(melted, hue='AdapterClass', row='AdapterType',
                      aspect=2.0)
    g.map(sns.kdeplot, 'value', shade=True)
    plt.subplots_adjust(top=0.9)
    g.fig.suptitle("Adapter Call Accuracy by Type and Classification")
    plt.legend()
    pltFilename = "{0}_call_accuracy_dist.png".format(name)
    plt.savefig(pltFilename)
    plt.close()
    p2 = _call_accuracy_plot_entry(
        name,
        "Adapter Call Accuracy Density Plot For Adapter Types",
        "Adapter Call Accuracy Density Plot",
        pltFilename,
        "0500002")

    # --- histogram ---
    g = sns.FacetGrid(melted, hue='AdapterClass', row='AdapterType',
                      aspect=2.0)
    bins = [x / 1000.0 for x in range(400, 1001, 25)]  # 0.400 .. 1.000 step 0.025
    g.map(plt.hist, 'value', alpha=0.5, bins=bins)
    plt.subplots_adjust(top=0.9)
    g.fig.suptitle("Adapter Call Accuracy by Type and Classification")
    plt.legend()
    pltFilename = "{0}_call_accuracy_hist.png".format(name)
    plt.savefig(pltFilename)
    # Close the last figure too, like the first two, so it is not left open.
    plt.close()
    p3 = _call_accuracy_plot_entry(
        name,
        "Adapter Call Accuracy Histogram For Adapter Types",
        "Adapter Call Accuracy Histogram",
        pltFilename,
        "0500003")

    return [p1, p2, p3]
# View first lines of test data df_test.head() df_train.info() df_train.describe() sns.countplot(x = 'Survived', data = df_train) df_test['Survived'] = 0 df_test[['PassengerId', 'Survived']].to_csv('/home/sarvesh/Titanic/no_survivors.csv', index = False) sns.countplot(x = 'Sex', data = df_train) sns.factorplot(x = 'Survived', col = 'Sex', kind = 'count', data = df_train) df_train.groupby(['Sex']).Survived.sum() print(df_train[df_train.Sex == 'female'].Survived.sum()/df_train[df_train.Sex == 'female'].Survived.count()) print(df_train[df_train.Sex == 'male'].Survived.sum()/df_train[df_train.Sex == 'male'].Survived.count()) df_test['Survived'] = df_test.Sex == 'female' df_test['Survived'] = df_test.Survived.apply(lambda x: int(x)) df_test.head() sns.factorplot(x = 'Survived', col = 'Embarked', kind = 'count', data = df_train) sns.distplot(df_train.Fare, kde = False) df_train.groupby('Survived').Fare.hist(alpha = 0.6)
data['Pclass'].value_counts().plot.bar(color=["#CD7F32", "#FFDF00", "#D3D3D3"], ax=ax[0]) ax[0].set_title('Number Of Passengers By Pclass') ax[0].set_ylabel('Count') sns.countplot('Pclass', hue='Survived', data=data, ax=ax[1]) ax[1].set_title('Pclass:Survived vs Dead') plt.show() # In[ ]: pd.crosstab([data.Sex, data.Survived], data.Pclass, margins=True).style.background_gradient(cmap='summer_r') # In[ ]: sns.factorplot('Pclass', 'Survived', hue='Sex', data=data) plt.show() # In[ ]: print('Oldest Passenger was of:', data['Age'].max(), 'Years') print('Youngest Passenge was of:', data['Age'].min(), 'Years') print('Average Age on the ship:', data['Age'].mean(), 'Years') # In[ ]: f, ax = plt.subplots(1, 2, figsize=(18, 8)) sns.violinplot("Pclass", "Age", hue="Survived", data=data,
# Advantage score per state:
#   (Democrat votes - Republican votes) / all votes in the state.
# States lacking rows for either party are skipped.
d = {}
# Iterate over each distinct state once; looping over the raw column
# recomputed the same score for every row of a state.
for state_name in Data.state.unique():
    sub_data = Data[Data.state == state_name]
    demo_votes = sub_data[sub_data.party == 'Democrat'].votes
    repu_votes = sub_data[sub_data.party == 'Republican'].votes
    if demo_votes.empty or repu_votes.empty:
        continue
    advantage = float(np.sum(demo_votes) - np.sum(repu_votes)) / np.sum(sub_data.votes)
    d[state_name] = advantage

# Ascending by score: the head is the most Republican-leaning, the tail the
# most Democrat-leaning. Passing `columns=` here (rather than assigning
# `.columns` afterwards) also works when no state qualified.
Advantage_score = pd.DataFrame(sorted(d.items(), key=lambda x: x[1]),
                               columns=['state', 'adv_score'])

Demo_Adv = Ave_vote_Demo[Ave_vote_Demo.state.isin(
    Advantage_score.tail(10).state)]
Repu_Adv = Ave_vote_Repu[Ave_vote_Repu.state.isin(
    Advantage_score.head(10).state)]

# Plot of ave_vote_rate for different candidates among the top 10 advantage states
sns.factorplot(x="state", y="ave_vote_rate", hue="candidate", data=Demo_Adv, kind="bar")
sns.factorplot(x="state", y="ave_vote_rate", hue="candidate", data=Repu_Adv, kind="bar")
5) How did voter sentiment change over time? 6) Can we see an effect in the polls from the debate? ''' # Read election poll dataset into a pandas dataset url = 'http://elections.huffingtonpost.com/pollster/2012-general-election-romney-vs-obama.csv' source = requests.get(url).text poll_data = StringIO(source) poll_df = pd.read_csv(poll_data) print poll_df.head() # Who was being polled? (bargraph/factorplot) sns.factorplot(x='Affiliation', kind='count', data=poll_df, order=(['Dem', 'None', 'Rep']), hue='Population', size=6, aspect=2) # What was the mean and stdev avg = poll_df[['Obama', 'Romney', 'Undecided']].mean() std = poll_df[['Obama', 'Romney', 'Undecided']].std() plt.figure() avg.plot(kind='bar', yerr=std, legend=False, color='indianred') # Concatenating dataframes using pd.concat poll_avg = pd.concat([avg, std], axis=1) poll_avg.columns = ['Average', 'STD'] print poll_avg.head()
# Rows whose Fare is missing (notebook-style display expression)
train_data[train_data["Fare"].isnull()]
# Fill missing fares with the mean fare of third-class passengers
x = train_data[train_data["Pclass"] == 3]["Fare"].mean()
train_data["Fare"] = train_data["Fare"].fillna(x)
# Same selection again, after the fill
train_data[train_data["Fare"].isnull()]

# Visualization
# Correlation heatmap of the numeric features and the target
list1 = ["SibSp", "Parch", "Age", "Fare", "Survived"]
sns.heatmap(train_data[list1].corr(), annot=True, fmt=".2f")
plt.show()

# Bar plots of Survived (and a violin plot of Fare) against single features
g = sns.factorplot(x="SibSp", y="Survived", data=train_data, kind="bar", size=6)
#g.set_ylabels("Survived Probability")
sns.factorplot(x="Pclass", y="Survived", data=train_data, hue="Sex", kind="bar")
sns.factorplot(x="Pclass", y="Fare", data=train_data, hue="Sex", kind="violin")
sns.factorplot(x="Parch", y="Survived", kind="bar", data=train_data, size=6)

# Age distribution, one facet per value of Survived
g = sns.FacetGrid(train_data, col="Survived")
g.map(sns.distplot, "Age", bins=25)
# compare conditions # ------------------ # create a data frame data_dict = {'travel time' : condition_list, 'body part' : \ ['shoulder'] * reps_per_cond + ['leg'] * reps_per_cond} df = pd.DataFrame(data_dict) # set figure style sns.set(style="ticks") # create figure g = sns.factorplot("body part", "travel time", data=df, kind="box", palette="PRGn") g.despine(offset=10, trim=True) # save figure plt.savefig(filename) # get image on screen img = Image.open(filename) imgsize = np.array(img.size) del img # set image info_img.size = imgsize info_img.setImage(filename) info_img.draw()
def grid(data,
         x,
         y,
         col=None,
         hue=None,
         col_wrap=4,
         palette='default',
         style='astetik',
         dpi=72,
         title='',
         sub_title='',
         x_label='',
         y_label='',
         legend=True,
         x_scale='linear',
         y_scale='linear',
         x_limit=None,
         y_limit=None,
         save=False):

    '''THE GRID

    The grid provides an overview of 4 features simultaneously by
    drawing a grid of scatter plots (seaborn strip plots).

    Inputs: 4

    Features: Ideally two continuous, and two categorical, but will
    also work with just one continuous and two categoricals.

    1. USE
    ======
    ast.grid(data=new_patients.head(1000),
             x='icu_stays',
             y='hospital_days',
             col='religion',
             palette='default',
             col_wrap=4);

    2. PARAMETERS
    =============
    2.1 INPUT PARAMETERS
    --------------------
    data :: pandas dataframe

    x :: x-axis data (continuous or categorical)

    y :: y-axis data (continuous)

    hue :: color highlight (categorical)

    col :: the side-by-side plot comparison feature

    --------------------
    2.2. PLOT PARAMETERS
    --------------------
    col_wrap :: the number of plots to show per row

    ----------------------
    2.3. COMMON PARAMETERS
    ----------------------
    palette :: One of the hand-crafted palettes:
                'default'
                'colorblind'
                'blue_to_red'
                'blue_to_green'
                'red_to_green'
                'green_to_red'
                'violet_to_blue'
                'brown_to_green'
                'green_to_marine'

                Or use any cmap, seaborn or matplotlib
                color or palette code, or hex value.

    style :: Use one of the three core styles:
                'astetik'     # white
                '538'         # grey
                'solarized'   # sepia

             Or alternatively use any matplotlib or seaborn
             style definition.

    dpi :: the resolution of the plot (int value)

    title :: the title of the plot (string value)

    sub_title :: a secondary title to be shown below the title

    x_label :: string value for x-axis label

    y_label :: string value for y-axis label

    legend :: boolean, accepted for a uniform interface

    x_scale :: 'linear' or 'log' or 'symlog'

    y_scale :: 'linear' or 'log' or 'symlog'

    x_limit :: int or list with two ints

    y_limit :: int or list with two ints

    save :: passed on to the footer helper

    NOTE: title, sub_title, legend, x_scale, y_scale, x_limit and
    y_limit are accepted but are not used by this function's body.

    '''

    # work on a copy so the caller's frame is never touched
    data = data.copy(deep=True)

    # one color per distinct hue value, or a single color without hue
    if hue is not None:
        n_colors = len(data[hue].unique())
    else:
        n_colors = 1

    # HEADER STARTS >>>
    # forward the caller's dpi (it used to be pinned to 72 here)
    palette = _header(palette,
                      style,
                      n_colors=n_colors,
                      dpi=dpi,
                      fig_height=None,
                      fig_width=None)
    # <<< HEADER ENDS

    # forward the caller's col_wrap (it used to be pinned to 4 here)
    p = sns.factorplot(data=data,
                       x=x,
                       y=y,
                       col=col,
                       hue=hue,
                       palette=palette,
                       col_wrap=col_wrap,
                       kind='strip',
                       size=3)

    # FOOTER STARTS >>>
    _footer(p, x_label, y_label, save=save)
    sns.despine(bottom=True, left=True)
    # x tick labels are blanked on every facet
    p.set(xticklabels=[])
train.loc[Outliers_to_drop] # Show the outliers rows train = train.drop(Outliers_to_drop, axis = 0).reset_index(drop=True) train_len = len(train) dataset = pd.concat(objs=[train, test], axis=0).reset_index(drop=True) dataset = dataset.fillna(np.nan) dataset.isnull().sum() train.info() train.isnull().sum() train.describe() # feature analysis g = sns.heatmap(train[["Survived", "SibSp", "Parch", "Age", "Fare"]].corr(), annot = True, fmt = ".2f", cmap = "coolwarm") # Explore SibSp feature vs Survived g = sns.factorplot(x = "SibSp", y = "Survived", data = train, kind = 'bar', size= 6, palette = "muted") g.despine(left = True) g = g.set_ylabels("survival probability") # Explore Parch feature vs Survived g = sns.factorplot(x="Parch",y="Survived",data=train,kind="bar", size = 6 , palette = "muted") g.despine(left=True) g = g.set_ylabels("survival probability") plt.show() plt.figure() # age vs survivability g = sns.FacetGrid(train, col = 'Survived') g = g.map(sns.distplot, 'Age') # Explore Age vs Survived g = sns.FacetGrid(train, col='Survived')
# Pclass g = sns.catplot(x="Pclass", y="Survived", data=train, kind="bar", size=6, palette="muted") g.despine(left=True) g = g.set_ylabels("survival probability") g = sns.factorplot(x="Pclass", y="Survived", hue="Sex", data=train, size=6, kind="bar", palette="muted") g.despine(left=True) g = g.set_ylabels("survival probability") # Embarked dataset["Embarked"].isnull().sum() dataset.info() dataset["Embarked"] = dataset["Embarked"].fillna("S") g = sns.catplot(x="Embarked", y="Survived", data=train,
import warnings
warnings.filterwarnings("ignore")

data = pd.read_csv('f:\\nesarc_pds.csv', low_memory=False)
sub = data[data['S1Q213'] <= 1]  # a copy of dataset where undesired data has been removed

# MY PRIMARY TOPIC OR DEPENDENT VARIABLE
# Value counts (missing values included) over the full data set
c4 = data['S1Q213'].value_counts(dropna = False)

# DEPENDENT VARIABLE GRAPH PLOT
seaborn.countplot(x = 'S1Q213', data=sub)
plt.title('DURING PAST 4 WEEKS, HOW OFTEN FELT DOWNHEARTED AND DEPRESSED')

# Mean of S4AQ4A18 for each value of S1Q213 (title typo "sucidaland" fixed)
seaborn.factorplot(x = 'S1Q213', y = 'S4AQ4A18',data = sub, kind = 'bar', ci= None)
plt.title('Relation between suicidal and depressed')

# Mean of S1Q213 for each value of S4AQ11
seaborn.factorplot(x = 'S4AQ11', y = 'S1Q213',data = sub, kind = 'bar', ci= None)
plt.title('Relation between drinking and depressed')

# NOTE(review): no new figure is opened here, so this presumably draws onto
# whichever axes are current -- confirm that is intended.
seaborn.distplot(c4);
plt.xlabel(' DISTRIBUTION')
ascending=False) # % de survivant per Cabin(first letter) # In[47]: #plots of several relevant features against 'survived' leading to feature creation g = sns.FacetGrid(train, col='Survived') g = g.map(sns.distplot, "Age") g = g.set_ylabels("survival probability") # In[48]: # Relation between Siblings and survival probability g = sns.factorplot(x="SibSp", y="Survived", data=train, kind="bar", size=6, palette="muted") g.despine(left=True) g = g.set_ylabels("survival probability") # In[49]: # Survival probability according to their class and sex g = sns.factorplot(x="Pclass", y="Survived", hue="Sex", data=train, size=6, kind="bar", palette="muted")
# - It would be interesting to see if there are relatively more women/men in a particular field - Do women or men like to identify more with specific job titles? # In[ ]: jobs_by_gender = df[["GenderSelect", "CurrentJobTitleSelect" ]].groupby([df.CurrentJobTitleSelect, df.GenderSelect ]).size().reset_index(name="number") # In[ ]: from matplotlib import pyplot chart = sns.factorplot(x='CurrentJobTitleSelect', y='number', hue='GenderSelect', data=jobs_by_gender, kind='bar', size=15, aspect=2, legend=False) for ax in plt.gcf().axes: ax.set_xlabel("Job Title", fontsize=35) ax.set_ylabel("Count", fontsize=35) for ax in chart.axes.flatten(): ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=25) ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=25) plot = plt.legend(loc='upper left', prop={'size': 20}) plot = plt.title("Number of people with Different Job Titles by Gender", fontsize=30)
def run():
    """Fetch FBI UCR state crime estimates and plot 2010 vs 2016 by state.

    For each of the states CO, WA, AK and OR the estimates endpoint is
    queried once and at most the first 22 records of its "results" list are
    kept. The rows for the years 2010 and 2016 are then reshaped to long
    form and the burglary, property crime, violent crime and motor vehicle
    theft figures are drawn as bar charts, one facet per state.

    Returns:
        None.

    Raises:
        requests.HTTPError: when the API answers with an error status.
        KeyError: when a record lacks one of the expected fields.
    """
    url = "https://api.usa.gov/crime/fbi/ucr/estimates/states/"
    page = "?page=1&per_page=200&output=json&api_key="
    # NOTE(review): the API key is hard-coded in source; consider loading it
    # from configuration or the environment instead.
    api_key = "iiHnOKfno2Mgkt5AynpvPpUQTEyxE77jo1RU8PIv"
    states = ['co', 'wa', 'ak', 'or']
    records_per_state = 22
    # Column order is fixed explicitly because the melt below (and therefore
    # the order in which crime types first appear) depends on it.
    columns = [
        "state_abbr", "year", "population", "aggravated_assault",
        "burglary", "homicide", "larceny", "motor_vehicle_theft",
        "property_crime", "rape_legacy", "robbery", "violent_crime"
    ]

    rows = []
    for state in states:
        # One request per state; previously the identical request was
        # repeated for every record read from it.
        reply = requests.get(url + state + page + api_key)
        reply.raise_for_status()
        results = reply.json()["results"]
        for record in results[:records_per_state]:
            rows.append({column: record[column] for column in columns})

    crime_data = pd.DataFrame(rows, columns=columns)

    # Keep only the two years being compared
    crime_date_f = crime_data.loc[crime_data["year"].isin([2010, 2016])].copy()

    # Long form: one row per (state, year, crime type)
    crime_data_long = crime_date_f.melt(id_vars=['state_abbr', 'year'],
                                        value_name='Sum',
                                        var_name='Crime_types')
    array = [
        'burglary', 'property_crime', 'violent_crime', 'motor_vehicle_theft'
    ]
    crime_data_long_1 = crime_data_long.loc[
        crime_data_long["Crime_types"].isin(array)]

    sns.factorplot(x="year",
                   y="Sum",
                   hue="Crime_types",
                   col="state_abbr",
                   data=crime_data_long_1,
                   kind="bar",
                   size=4,
                   aspect=.7)
# drop unnecessary columns, these columns won't be useful in analysis and prediction titanic_df = titanic_df.drop(['PassengerId','Name','Ticket'], axis=1) test_df = test_df.drop(['Name','Ticket'], axis=1) test_df.head() # In[ ]: # Embarked # only in titanic_df, fill the two missing values with the most occurred value, which is "S". titanic_df["Embarked"] = titanic_df["Embarked"].fillna("S") # plot sns.factorplot('Embarked','Survived', data=titanic_df,size=4,aspect=3) fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(15,5)) # sns.factorplot('Embarked',data=titanic_df,kind='count',order=['S','C','Q'],ax=axis1) # sns.factorplot('Survived',hue="Embarked",data=titanic_df,kind='count',order=[1,0],ax=axis2) sns.countplot(x='Embarked', data=titanic_df, ax=axis1) sns.countplot(x='Survived', hue="Embarked", data=titanic_df, order=[1,0], ax=axis2) # group by embarked, and get the mean for survived passengers for each value in Embarked embark_perc = titanic_df[["Embarked", "Survived"]].groupby(['Embarked'],as_index=False).mean() sns.barplot(x='Embarked', y='Survived', data=embark_perc,order=['S','C','Q'],ax=axis3) # Either to consider Embarked column in predictions, # and remove "S" dummy variable, # and leave "C" & "Q", since they seem to have a good rate for Survival.
# Attach the predictions to the output frame and name its two columns
predict = pd.DataFrame(y_pred_orig)
output = output.join(predict)
output.columns = ['PassengerId', 'Survived']
print(output)

import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Age versus Survival rate. graph.
sns.factorplot('Age', 'Survived', data = train_df, size=4, aspect=3)

# three side-by-side axes: Age counts, Survived counts split by Age, and
# mean Survived per Age value
fig, (axis1, axis2, axis3) = plt.subplots(1, 3, figsize = (15, 5))
sns.countplot(x='Age', data = train_df, ax=axis1)
sns.countplot(x='Survived', hue="Age", data = train_df, order=[1, 0], ax=axis2)
Age_perc = train_df[["Age", "Survived"]].groupby(['Age'], as_index=False).mean()
sns.barplot(x='Age', y='Survived', data=Age_perc, ax=axis3)

# One indicator column per distinct Age value, joined onto each frame
Age_dummies_titanic = pd.get_dummies(train_df['Age'])
Age_dummies_test = pd.get_dummies(test_df['Age'])
train_df = train_df.join(Age_dummies_titanic)
test_df = test_df.join(Age_dummies_test)
#dataset.columns.values dataset.describe() dataset.info() dataset.head() dataset.isnull().sum() dataset['Survived'].value_counts().plot.pie() sns.countplot(dataset['Survived']) ## categorical feature dataset.groupby(['Sex', 'Survived'])['Survived'].count() dataset[['Sex', 'Survived']].groupby(['Sex']).mean().plot.bar() sns.countplot('Sex', hue='Survived', data=dataset) dataset[['Embarked', 'Survived']].groupby(['Embarked']).mean().plot.bar() sns.factorplot('Embarked', 'Survived', data=dataset) sns.countplot('Embarked', data=dataset) sns.countplot('Embarked', hue='Sex', data=dataset) sns.countplot('Embarked', hue='Survived', data=dataset) sns.countplot('Embarked', hue='Pclass', data=dataset) ## ordinal feature pd.crosstab(dataset.Pclass, dataset.Survived, margins=True) dataset['Pclass'].value_counts().plot.bar() sns.countplot('Pclass', hue='Survived', data=dataset) pd.crosstab([dataset.Sex, dataset.Survived], dataset.Pclass, margins=True) sns.factorplot('Pclass', 'Survived', hue='Sex', data=dataset) ## continous feature dataset['Age'].describe()
if mean_score < 0.8: continue players = set(data[data['tick'] == max(data['tick'])]['pid']) n = len(players) vals += list(dists - syn_dists) ns += [n] * len(dists) models += [model] * len(dists) data = pd.DataFrame({'model': models, 'n_players': ns, 'values': vals}) sns.set(font='serif', context='poster', style='white') sns.despine() g = sns.factorplot('n_players', 'values', markers=['o', 's'], linestyles=['-', '--'], data=data, kind='point', dodge=0.15, x_order=sorted(set(data['n_players'])), col='model') plt.plot([0, 7], [0, 0], 'k-', lw=2) fig = plt.gcf() fig.savefig('../../plots/values.pdf')
continue log_file = '{}/test_{}_{}.log'.format(dir, d, e) with open(log_file) as f: lines = f.readlines() lines = [ line.replace('=', ' ').split() for line in lines if line.find('Test') != -1 ] acc = float(lines[-1][9]) if d == 'ppi' else float(lines[-1][6]) accs.append(acc) algos.append(etitle) data.append(d) df = pd.DataFrame(data={ 'Testing accuracy': accs, 'Algorithm': algos, 'Dataset': data }) print(df) g = sns.factorplot(x='Dataset', y='Testing accuracy', hue='Algorithm', data=df, kind='bar', aspect=2, size=2, palette=colors) g.savefig('test.pdf') os.system('pdfcrop test.pdf test.pdf')
size=5) # Okay! We're done with parameter tweaking so let us now move on to use another Seaborn attribute for plotting our **beeswarms** on separate axes. This attribute is known as **Factorplot** and we shall discuss it majorly in the later section of the course BUT for now we shall just use it to get more mileage from our **Swarmplot**. # Little tired of *Iris* flower sets! Let us use our *Tips* dataset this time around. # In[44]: # Loading Tips dataset: tips = sns.load_dataset("tips") sns.factorplot(x="sex", y="total_bill", hue="smoker", col="time", data=tips, kind="swarm", size=4.5, aspect=.7, palette="rocket") # **Factorplot** has given us the flexibility to visualize our dataset, i.e. **Tips dataset**, in two separate segments within a single plot, segregated by the **time** of day. So the *first set of axes* help us understand the trend during **Lunch** time and on **right**, we get a set of axes for **Dinner** time. `hue` parameter reflects the `palette` parameter, which in turn displays data points in *separate colors*, where **smokers** are presented by *purple* color. With such a presentation, it gets easier to see the *bulk of customers* on basis of their **Gender**, the **total bill** that their arrival in the restaurant generates. # More often you shall find that it is never a Swarmplot that alone represents those data points, as it is generally combined with **Boxplot** or **Violinplots**, that we shall discuss in-depth later on in this course. # I won't get into great detail but will show you a simple way of mixing these **Swarmplots** with other plot. Let me use a **Boxplot** to demonstrate what I mean and as assured earlier, I will cover Boxplots later in much more depth with all it's *parameters* and *general use-cases*: # In[77]: sns.swarmplot(x="day", y="total_bill", data=tips, palette="rocket") sns.boxplot(x="day",
#2. Pair plots #Age = data_titanic['Age'] #sns.pairplot(Age) #Not a good example. Don't RUN THIS! #age_sex = data_titanic.iloc[:,4:6] #sns.pairplot(age_sex) #sns.heatmap(age_sex) ############################################################################################# # We have 3 dfs now. data_titanic, mendata, womendata #Gender distribution gender = sns.factorplot('Sex', data=data_titanic, kind='count') gender.set_ylabels("count of passengers") #Distribution by age #age_data = data_titanic['Age'].hist(bins = 80) #plt.set_ylabel("Age of Passengers") age_data = data_titanic['Age'] plt.hist(age_data.dropna(), bins=80) plt.xlabel("Age of Passengers") plt.ylabel("Frequency") plt.title("Passenger's Age Distribution", fontsize=30, color='black') plt.show() #Distribution by class count_first = data_titanic.groupby('Pclass')['PassengerId'].count()
""" from sklearn.datasets import make_circles from sklearn.utils import shuffle import pandas as pd from timeit import default_timer as tic import sklearn.cluster import dask_ml.cluster import seaborn as sns Ns = [2500, 5000, 7500, 10000] X, y = make_circles(n_samples=10_000, noise=0.05, random_state=0, factor=0.5) X, y = shuffle(X, y) timings = [] for n in Ns: X, y = make_circles(n_samples=n, random_state=n, noise=0.5, factor=0.5) t1 = tic() sklearn.cluster.SpectralClustering(n_clusters=2).fit(X) timings.append(('Scikit-Learn (exact)', n, tic() - t1)) t1 = tic() dask_ml.cluster.SpectralClustering(n_clusters=2, n_components=100).fit(X) timings.append(('dask-ml (approximate)', n, tic() - t1)) df = pd.DataFrame(timings, columns=['method', 'Number of Samples', 'Fit Time']) sns.factorplot(x='Number of Samples', y='Fit Time', hue='method', data=df, aspect=1.5)