Exemplo n.º 1
0
    def mem_svg(self, table, column, outfile):
        """Plot the maximum of ``column`` per dataset/program as bars.

        ``table`` is grouped by ``Dataset``, ``Threads`` and ``Program``; the
        maximum of ``column`` in each group is divided by 1,000,000 and drawn
        with one facet per dataset.  The figure is written to ``outfile``.

        NOTE(review): the y label says "MB", which presumes ``column`` holds
        bytes -- confirm against the caller.
        """
        import matplotlib
        matplotlib.use('Agg')  # select the non-interactive backend before pyplot loads
        import matplotlib.pyplot as plt
        import seaborn as sb
        sb.set(style="whitegrid")

        renamed = table.rename(columns={column: 'Memory'})
        peak = renamed.groupby(['Dataset', 'Threads', 'Program']).agg({'Memory': max})
        svgdat = peak.reset_index()
        svgdat = svgdat.assign(MemoryMB=svgdat['Memory'] / 1000000)

        # Arguments common to both layouts.
        shared = dict(y='MemoryMB', col="Dataset", data=svgdat,
                      kind="bar", ci=None, sharey=True)

        threads = svgdat.Threads.unique()
        if len(threads) == 1:
            # Only one thread count: compare programs directly, no facet titles.
            plot = sb.factorplot(x='Program', **shared)
            plot = plot.set_titles('')
        else:
            # Several thread counts: threads on x, one bar colour per program.
            plot = sb.factorplot(x='Threads', hue="Program", **shared)

        plot = plot.set_xlabels('Threads')
        plot = plot.set_ylabels('Memory (MB)')
        plot = plot.set_xticklabels(rotation=90)
        plot.fig.subplots_adjust(wspace=0.35)
        plot.savefig(outfile)
Exemplo n.º 2
0
def pclass_analyze(is_plot=True):
    """Summarise survival by passenger class and one-hot encode ``Pclass``.

    Prints a few survival statistics for the module-level ``titanic_df``,
    optionally draws a ``Pclass`` vs ``Survived`` factor plot, then replaces
    the ``Pclass`` column of both ``titanic_df`` and ``test_df`` with the
    indicator columns ``Class_1`` and ``Class_2`` (``Class_3`` is dropped).

    Parameters
    ----------
    is_plot : bool
        When true, draw the factor plot with seaborn.

    Notes
    -----
    Both globals are modified in place and then rebound to the joined frames.
    The column renaming assumes exactly three distinct ``Pclass`` values are
    present in each frame.
    """
    global titanic_df
    global test_df

    pclass = titanic_df['Pclass']
    survived = titanic_df['Survived']

    # Single-argument print(...) calls behave identically on Python 2 and 3.
    print('-' * 40)
    print(pclass[(pclass == 1) & (survived == 1)].count())
    print(pclass[(pclass == 1) & (survived == 0)].count())
    print(survived[pclass == 1].describe())
    print(survived[pclass == 2].describe())
    print(survived[pclass == 3].describe())

    if is_plot:
        sns.factorplot('Pclass', 'Survived',
                       order=[1, 2, 3],
                       data=titanic_df,
                       size=5)

    def _class_dummies(frame):
        # Build the indicator (dummy) columns for Pclass, dropping the last one.
        dummies = pd.get_dummies(frame['Pclass'])
        dummies.columns = ['Class_1', 'Class_2', 'Class_3']
        dummies.drop(['Class_3'], axis=1, inplace=True)
        return dummies

    pclass_dummies_titanic = _class_dummies(titanic_df)
    pclass_dummies_test = _class_dummies(test_df)

    titanic_df.drop(['Pclass'], axis=1, inplace=True)
    test_df.drop(['Pclass'], axis=1, inplace=True)

    titanic_df = titanic_df.join(pclass_dummies_titanic)
    test_df = test_df.join(pclass_dummies_test)
def player_performance_plots(database,table,player_name):
    """Plot a player's streak-length distribution with an exponential fit.

    Reads every row for ``player_name`` from ``table`` in ``database``,
    concatenates the lists stored (as Python literals) in the
    ``pos_streak_list`` column, draws a bar chart of streak lengths and
    overlays the value counts together with a curve fitted by ``exp_func``.

    Parameters
    ----------
    database : str
        Name of the MySQL database to connect to.
    table : str
        Table to query.  It is interpolated into the SQL text because
        identifiers cannot be bound as parameters -- only pass trusted names.
    player_name : str
        Player to select; bound as a query parameter.

    Returns
    -------
    pandas.Series
        The value counts that the fit is computed from.
    """
    conn = MySQLdb.connect(user="******",passwd="xxxx",db=database,
                           cursorclass=MySQLdb.cursors.DictCursor)
    try:
        # Bind the player name instead of concatenating it into the statement.
        cmd_target = 'SELECT * FROM ' + table + ' WHERE player IN (%s);'
        player_frame = pd.read_sql(cmd_target, con=conn, params=[player_name])
    finally:
        # Release the connection even when the query fails.
        conn.close()

    player_values = player_frame['pos_streak_list'].values
    streaks = [ast.literal_eval(x) for x in player_values]
    streak_data = np.concatenate(streaks)
    df_streaks = pd.DataFrame(dict(streaks=range(len(streak_data)),
                                   streak_length=streak_data))
    # NOTE(review): ravel() flattens BOTH columns, so the running index in
    # ``streaks`` is counted together with the streak lengths -- confirm that
    # this is intended.
    streak_counts = pd.value_counts(df_streaks.values.ravel())

    # Fit only the first 15 entries of the value counts.
    xData = streak_counts.index[:15]
    xData_1 = [x-1 for x in xData]  # shifted by one for plotting
    yData = streak_counts.values[:15]

    popt, pcov = optimize.curve_fit(exp_func, xData, yData)
    yEXP = exp_func(xData, *popt)

    plt.figure()
    sns.factorplot("streak_length", data=df_streaks,kind="bar",palette="Blues",size=6,aspect=2,legend_out=False)
    plt.plot(xData_1, yData, label='Data', marker='o')
    # Dashed red line; the style is given once instead of as 'r-' plus ls='--'.
    plt.plot(xData_1, yEXP, 'r--', label="Exp Fit")
    plt.legend()
    plt.show()
    return streak_counts
Exemplo n.º 4
0
def draw_graphs(df, y, file_prefix, format="png"):
    """Save faceted bar plots of ``y`` against ``initcwnd``.

    Two families of figures are written:

    * one per distinct value of the ``SIZE`` column, faceted by bandwidth
      (columns) and delay (rows), saved at 300 dpi;
    * one per distinct bandwidth, faceted by delay (columns) and ``SIZE``
      (rows), saved at 100 dpi.

    Each file name is printed before the figure is saved.  ``SIZE`` is a
    module-level column name defined outside this function.
    """
    # Figures split by request size.
    for request_size in df[SIZE].unique():
        per_size = df[(df[SIZE] == request_size)]
        grid = sns.factorplot(x="initcwnd", y=y,
                              col="bandwidth", row="delay",
                              data=per_size,
                              kind="bar", palette="muted")
        grid.set(xlim=(.5, None))
        grid.despine(left=True)
        name = "%s-request_size-%dkb.%s" % (file_prefix, request_size, format)
        print(name)
        grid.savefig(name, dpi=300)

    # Figures split by bandwidth.
    for bandwidth in df.bandwidth.unique():
        per_bandwidth = df[(df.bandwidth == bandwidth)]
        grid = sns.factorplot(x="initcwnd", y=y,
                              col="delay", row=SIZE,
                              data=per_bandwidth,
                              kind="bar", palette="muted",
                              aspect=1.2)
        grid.set(ylim=(.5, None))
        grid.despine(left=True)
        name = "%s-bandwidth-%dmb.%s" % (file_prefix, bandwidth, format)
        print(name)
        grid.savefig(name, dpi=100)
Exemplo n.º 5
0
def plotStats(frame):
    """Save a fixed set of factor plots for the columns of ``frame``.

    The ``Capture_Count`` column of ``frame`` is renamed in place to
    ``Discovery_Event_Count``.  Rows containing missing values are excluded
    from every plot.  The PNG files are written to the current directory.
    """
    sns.set_style("ticks", {"xtick.minor.size":  12})
    sns.set_context('paper')

    frame.rename(columns={'Capture_Count': 'Discovery_Event_Count'}, inplace=True)

    adv_latency, listener_latency, adv_power, position = (
        'Adv Latency', 'Listener Latency', 'Adv Power', 'pos')
    variables = ['Time_to_Discovery', 'RSSI', 'Discovery_Event_Count', 'Percent_Captures']
    latency_levels = ['Low_Latency', 'Balanced', 'Low_Power']

    # The frame is not modified below, so the NaN-free view is computed once.
    complete = frame.dropna()

    # Time to discovery per trial.
    sns.factorplot('Trial', y=variables[0], data=complete)
    plt.savefig('Trial' + ' vs. ' + variables[0] + '.png', dpi=600, bbox_inches='tight')

    # Every variable against advertising power, faceted by both latencies.
    for variable in variables:
        sns.factorplot(adv_power, variable,
                       col=adv_latency, row=listener_latency,
                       data=complete, margin_titles=True, size=3, aspect=.8,
                       x_order=['High','Medium','Low','Ultra_low'],
                       col_order=latency_levels,
                       row_order=latency_levels)
        plt.savefig('Trials' + ' vs. ' + variable + '.png', dpi=600, bbox_inches='tight')

    # RSSI against position, with vertical tick labels.
    g = sns.factorplot(position, variables[1], data=complete,
                       margin_titles=True, size=3, aspect=.8)
    g.set_xticklabels(rotation=90)
    plt.savefig(position + ' vs. ' + variables[1] + '.png', dpi=600, bbox_inches='tight')
Exemplo n.º 6
0
def behaviors_distribution(df,filepath):
    """Bar plot of the number of logical networks per input-output behavior.

    Rows are sorted by ``known_eq`` (and first by ``mse`` when that column is
    present), numbered from 1 in sorted order, and drawn as one bar per row
    with height ``known_eq + 1``.  When ``mse`` is present the bars are
    coloured by the MSE formatted to four decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``known_eq``; may contain ``mse``.  The frame is not
        modified.
    filepath : str or None
        Directory in which ``behaviors-distribution.pdf`` is saved; nothing
        is saved when falsy.

    Returns
    -------
    The seaborn grid returned by ``sns.factorplot``.
    """
    cols = ["known_eq","index"]
    rcols = ["Logical networks", "Input-Output behaviors"]
    sort_cols = ["known_eq"]

    has_mse = "mse" in df.columns
    if has_mse:
        cols.append("mse")
        rcols.append("MSE")
        sort_cols = ["mse"] + sort_cols

    # Sort on the numeric values and work on a copy, so the caller's frame is
    # left untouched and the order is not lexicographic.
    df = df.sort_values(sort_cols).reset_index(drop=True).reset_index(level=0)[cols].copy()

    if has_mse:
        # Format only after sorting; the strings are used as hue labels.
        df["mse"] = df["mse"].map(lambda f: "%.4f" % f)

    df["known_eq"] = df["known_eq"] + 1
    # Item access is required here: ``df.index`` is the row index, not the
    # "index" column created by reset_index.
    df["index"] = df["index"] + 1

    df.columns = rcols

    plot_kwargs = dict(x='Input-Output behaviors', y='Logical networks',
                       data=df, aspect=3, kind='bar', legend_out=False)
    if has_mse:
        plot_kwargs['hue'] = 'MSE'
    g = sns.factorplot(**plot_kwargs)

    g.ax.set_xticks([])
    if filepath:
        g.savefig(os.path.join(filepath,'behaviors-distribution.pdf'))

    return g
    
Exemplo n.º 7
0
    def sim_analysis(self):
        """Simulate accuracies for the experiment plan and plot a subset.

        Builds a frame from ``self.exp_plan``, assigns every
        ``(obj, imgno)`` pair a parameter triple ``(a, b, lam)``, computes a
        logistic-shaped accuracy from ``stim_dur`` for each row, prints the
        mean accuracy per ``obj``/``imgno``/``stim_dur`` separately for
        ``qe == False`` and ``qe == True``, and finally shows a point plot
        for the first two objects and first three image numbers.
        """
        df = pandas.DataFrame(self.exp_plan)
        df.sort_values(by=['subjid', 'order', 'obj', 'imgno', 'stim_dur'], inplace=True)
        gr = df.groupby(['obj', 'objno', 'imgno']).groups.keys()
        fs = {}
        for obj, objno, imgno in gr:
            a = .1
            b = .1 + objno/10.
            lam = .05 + imgno/100.
            fs[(obj,imgno)] = (a,b,lam)

        def accf(row):
            # Logistic curve in stimulus duration (stim_dur divided by 1000).
            a, b, lam = fs[(row.obj, row.imgno)]
            x = row.stim_dur / 1000.
            acc = .5 + (.5 - lam) / (1 + np.exp(-(x-a)/b))
            return acc

        # Item assignment creates the column even when it does not exist yet;
        # attribute assignment would only set a plain Python attribute.
        df['acc'] = df.apply(accf, axis=1).astype(float)
        print(df[df.qe==False].groupby(['obj', 'imgno', 'stim_dur']).acc.mean())
        print(df[df.qe==True].groupby(['obj', 'imgno', 'stim_dur']).acc.mean())
        # The leftover debugger breakpoint was removed so the method runs
        # unattended.
        sel = df.obj.isin(df.obj.unique()[:2]) & \
              df.imgno.isin(df.imgno.unique()[:3])
        sns.factorplot(x='stim_dur', y='acc', col='obj', row='imgno',
                       data=df[sel], kind='point')
        sns.plt.show()
Exemplo n.º 8
0
def prediction_quality(datas, models, labels, points, runs=1):
    """Plot RMSE against training-set size for several data/model pairs.

    For each of ``points`` evenly spaced training ratios, and for each
    ``(data, model, label)`` triple, the data factory is instantiated, seeded
    with the run number, restricted to the ratio, filtered, and its training
    frame pickled to a fixed temporary path.  The model is trained on that
    pickle and the reported RMSE is appended to a results frame, which is
    printed and drawn with ``sns.factorplot``.

    ``datas`` and ``models`` are factories called with ``None``.
    """
    tmp_path = "../../data/matmat/2016-01-04/tmp2.data.pd"
    results = pd.DataFrame(columns=["~answers", "rmse", "models"])
    total_rows = len(datas[0](None).get_dataframe_all())

    for step in range(1, points + 1):
        ratio = step / points
        print("Evaluation for {}% of data".format(ratio * 100))

        for make_data, make_model, label in zip(datas, models, labels):
            for seed in range(runs):
                # Write the training subset for this ratio/seed to disk ...
                source = make_data(None)
                source.set_seed(seed)
                source.set_train_size(ratio)
                source.filter_data(100, 0)
                source.get_dataframe_train().to_pickle(tmp_path)

                # ... and train/evaluate on exactly that subset.
                subset = Data(tmp_path)
                fitted = make_model(None)
                Runner(subset, fitted).run(force=True, only_train=True)
                report = Evaluator(subset, fitted).get_report(
                    force_evaluate=True, force_run=True)
                results.loc[len(results)] = (ratio * total_rows, report["rmse"], label)

    print(results)
    sns.factorplot(x="~answers", y="rmse", hue="models", data=results)
def tm_gene_family_plot(tm_data, ordered_genomes, biotypes, gene_family_tgt):
    """Write the transMap gene family collapse plots to a multi-page PDF.

    The first page shows the counts summed per genome; one further page is
    added for every biotype for which ``biotype_filter`` returns data.  When
    ``tm_data`` has no gene family collapse information, an empty target file
    is created and the function returns.
    """
    try:
        df = json_biotype_nested_counter_to_df(tm_data, 'Gene Family Collapse')
    except ValueError:
        # No gene family collapse data (probably the test set): just touch
        # the target file.
        with gene_family_tgt.open('w') as outf:
            pass
        return

    def _emit_page(pdf, frame, genome_order, title):
        # Draw one faceted bar plot and flush it as a page of ``pdf``.
        grid = sns.factorplot(y='count', col='genome', x='Gene Family Collapse',
                              data=frame, kind='bar',
                              col_order=genome_order, col_wrap=4)
        grid.fig.suptitle(title)
        grid.set_xlabels('Number of genes collapsed to one locus')
        grid.set_ylabels('Number of genes')
        grid.fig.subplots_adjust(top=0.9)
        multipage_close(pdf, tight_layout=False)

    df['Gene Family Collapse'] = pd.to_numeric(df['Gene Family Collapse'])
    subset = df[['Gene Family Collapse', 'genome', 'count']]
    tot_df = subset.groupby(['genome', 'Gene Family Collapse']).aggregate(sum).reset_index()
    tot_df = tot_df.sort_values('Gene Family Collapse')

    with gene_family_tgt.open('w') as outf, PdfPages(outf) as pdf:
        _emit_page(pdf, tot_df, ordered_genomes,
                   'Number of genes collapsed during gene family collapse')
        for biotype in biotypes:
            biotype_df = biotype_filter(df, biotype)
            if biotype_df is None:
                continue
            biotype_df = biotype_df.sort_values('Gene Family Collapse')
            # Keep only genomes that actually occur for this biotype.
            present = set(biotype_df.genome)
            _emit_page(pdf, biotype_df,
                       [x for x in ordered_genomes if x in present],
                       'Number of genes collapsed during gene family collapse for {}'.format(biotype))
def composition(graph):
    """Figure 4: edge weights from noun phrases to verbs, saved as a PDF.

    Every determiner/noun binding is added to ``graph`` and its edges to the
    verbs are bumped.  Edge weights from two noun phrases to four words are
    then collected for composition values 0 and 0.5 and drawn as bar plots in
    ``figs/composition.pdf``.
    """
    dets = [graph[w] for w in ['that', 'my']]
    nouns = [graph[w] for w in [ 'table', 'bunny']]
    verbs = [graph[w] for w in ['saw', 'ate']]

    # Train (NP -> verb) pairs.
    for phrase in [graph.bind(d, n) for d in dets for n in nouns]:
        graph.add(phrase)
        for verb in verbs:
            phrase.bump_edge(verb, factor=5)

    the, boy, saw, ate, jack = map(graph.get, ('the', 'boy', 'saw', 'ate', 'Jack'))
    that_table = graph.get('[that table]')

    rows = []
    for level in (0, 0.5):
        for phrase in [that_table, graph.bind(the, boy, composition=level)]:
            # 'the' and 'boy' are deliberately included as bad verbs.
            for word in [saw, ate, the, boy]:
                rows.append({'composition': str(level),
                             'noun phrase': str(phrase),
                             'verb': str(word),
                             'edge weight': phrase.edge_weight(word)})

    df = pd.DataFrame(rows)
    grid = sns.factorplot('verb', 'edge weight', hue='noun phrase', col='composition',
                          data=df, kind='bar')
    grid.despine(left=True)

    sns.plt.savefig('figs/composition.pdf')
    print('created figs/composition.pdf')
Exemplo n.º 11
0
def fishers_exact_plot(data, condition1, condition2):
    """
    Plot two binary columns against each other and run Fisher's exact test.

    A bar plot of ``condition2`` against ``condition1`` is drawn, the
    contingency table of the two columns is printed, and the test result is
    printed and returned.

    Parameters
    ----------
    data: Pandas dataframe
        Dataframe to retrieve information from

    condition1: str
        First binary column to compare (x axis of the plot)

    condition2: str
        Second binary column to compare (y axis of the plot)

    Returns
    -------
    tuple
        ``(oddsratio, pvalue)`` as returned by ``fisher_exact``
    """
    sb.factorplot(x=condition1, y=condition2, kind='bar', data=data)

    contingency = pd.crosstab(data[condition1], data[condition2])
    print(contingency)

    odds, p_val = fisher_exact(contingency)
    print("Fisher's Exact Test: OR: {}, p-value={}".format(odds, p_val))
    return (odds, p_val)
def denovo_plot(consensus_data, ordered_genomes, denovo_tgt):
    """Write the de-novo prediction summary plot to a one-page PDF.

    With more than one genome the plot has one facet per result category and
    genomes on the x axis (in ``ordered_genomes`` order); with a single genome
    the result categories themselves are on the x axis.  Bars are split by
    Augustus mode when exactly two modes are present.  When
    ``consensus_data`` holds no de-novo results the function returns after
    opening the target, without drawing anything.
    """
    with denovo_tgt.open('w') as outf, PdfPages(outf) as pdf:
        try:
            df = json_biotype_nested_counter_to_df(consensus_data, 'denovo')
        except ValueError:
            # No de novo results. Probably the test set.
            return
        # fix column names because json_biotype_nested_counter_to_df makes assumptions
        df.columns = ['Result', 'Number of transcripts', 'Augustus mode', 'genome']

        plot_kwargs = dict(data=df, y='Number of transcripts', kind='bar')
        if len(set(df['Augustus mode'])) == 2:
            plot_kwargs['hue'] = 'Augustus mode'

        genomes_present = set(df.genome)
        if len(genomes_present) > 1:  # if we ran in PB only, we may not have multiple genomes
            # ``order`` controls the x-axis categories.  The previous
            # ``row_order`` had no effect because no ``row`` variable is used,
            # so the requested genome ordering was never applied.
            plot_kwargs.update(
                x='genome', col='Result', col_wrap=2,
                order=[g for g in ordered_genomes if g in genomes_present],
                sharex=True, sharey=False)
        else:
            plot_kwargs['x'] = 'Result'

        ax = sns.factorplot(**plot_kwargs)
        ax.set_xticklabels(rotation=90)
        ax.fig.suptitle('Incorporation of de-novo predictions')
        ax.fig.subplots_adjust(top=0.9)
        multipage_close(pdf, tight_layout=False)
Exemplo n.º 13
0
 def accuracy(self):
     """Plot model accuracy per dataset for the selected rows.

     Takes the frame from ``self._acc()``, keeps the rows whose ``sel``
     column is true, draws them as bars coloured per model, limits the y
     axis to [0, 1], adds the behavioural overlay via ``self._plot_behav()``
     and hands the figure to ``base.show``.  Returns the filtered frame.
     """
     acc = self._acc()
     selected = acc[acc.sel]
     sns.factorplot(x='dataset', y='model_accuracy', hue='model',
                    data=selected, kind='bar',
                    color=self.myexp.colors['shape'])
     sns.plt.ylim([0,1])
     self._plot_behav()
     exp = self.myexp
     base.show(pref='acc', exp=exp.exp, suffix='all_acc',
               savefig=exp.savefig, html=exp.html)
     return selected
def factorplots(wine_set):
    """Show a strip plot of alcohol content per quality level.

    The title says "red" when ``wine_set`` equals the module-level ``red``
    frame and "white" otherwise.
    """
    seaborn.factorplot(x="quality", y="alcohol", data=wine_set, kind="strip")
    plt.xlabel("Quality level of wine, 0-10 scale")
    plt.ylabel("Alcohol level in wine, % ABV")
    heading = ("Alcohol percent in each level of red wine's quality"
               if wine_set.equals(red)
               else "Alcohol percent in each level of white wine's quality")
    plt.title(heading)
    plt.show()
def plot_data(df):
    """Show one READING-vs-DATE plot per station, with one line per depth.

    Every column other than NAME, FID, SID, DTG, DATE and TEMP is treated as
    a sensor depth and melted into DEPTH/READING pairs before plotting.
    """
    id_columns = ["NAME", "FID", "SID", "DTG", "DATE", "TEMP"]
    # Long format: one row per (record, depth) with the value in READING.
    long_df = pd.melt(df, id_vars=id_columns, var_name="DEPTH", value_name="READING")

    for station in long_df["NAME"].unique():
        station_rows = long_df[long_df["NAME"] == station]
        sns.factorplot("DATE", "READING", hue="DEPTH", data=station_rows)
        sns.plt.title(station)
        sns.plt.show()
Exemplo n.º 16
0
def plotsbarra(exames=["BLD", "BIL", "UBG", "KET", "GLU", "PRO", "NIT", "LEU", "PH"]):
    """Save one count plot per exam code as ``barplot<EXAM>.png``.

    For each exam the rows of the module-level ``dfcatemelt`` with that
    ``EXAME`` value are counted per ``MEDIDA`` and split by ``HORA``.  The
    categories are sorted, except that "neg" is moved to the front when
    present.
    """
    for exame in exames:
        destino = "barplot" + exame + ".png"
        subset = dfcatemelt[dfcatemelt.EXAME == exame]

        ordem = sorted(subset.MEDIDA.unique())
        if "neg" in ordem:
            # Show the negative result first.
            ordem.remove("neg")
            ordem.insert(0, "neg")

        print(":: para", exame)
        sns.factorplot(x="MEDIDA", kind="count", hue="HORA", data=subset, order=ordem)
        plt.savefig(destino)
def game_performance_plots(player_name,sub_frame,N,save,file_name):
    """Bar plot of a player's performance value for every game event.

    ``recent_performance`` is evaluated for each row index of the player's
    frame (with the module-level ``stats_weights``); the second element of
    each result is plotted against its position.  The figure is saved to
    ``file_name`` only when ``save`` equals the string ``'save'``.
    """
    player_frame = frame_from_player(player_name,sub_frame)
    scores = [
        recent_performance(player_name,player_frame,N,row_id,stats_weights)[1]
        for row_id in player_frame.index
    ]
    df = pd.DataFrame(dict(game_event=range(len(scores)), performance=scores))
    sns.factorplot("game_event","performance", data=df,kind="bar",
                   palette="Blues",size=6,aspect=2,legend_out=False)
    if save == 'save':
        plt.savefig(file_name, dpi=200)
Exemplo n.º 18
0
def createScanOrderBarPlot(GroupDF,goodsubj,BV=False,ax=[],savefig=True):
    """Plot model correlations by feedback condition and scan order.

    Parameters
    ----------
    GroupDF : pandas.DataFrame
        Must contain ``Subject_ID``, ``FB``, ``modelcorr`` and ``scanorder``.
    goodsubj : list
        Subject IDs to include.
    BV : bool
        When true draw a bar plot with ``sns.factorplot``; otherwise draw a
        split violin plot.
    ax : matplotlib axes, optional
        Axes for the violin plot.  The default (a list) means "none given":
        a new figure is created and the violin plot is drawn on its axes.
    savefig : bool
        When true, save the current figure as
        ``ScanOrder_ModelCorrelations.pdf`` under the module-level
        ``saveFigureLocation``.
    """
    # The list default is only a sentinel and is never mutated.
    if isinstance(ax, list):
        plt.figure()
        # Passing the sentinel list on to seaborn would fail, since a list
        # is not an axes object; None lets seaborn use the current axes.
        ax = None

    included = GroupDF[GroupDF.Subject_ID.isin(goodsubj)]
    if BV:
        sns.factorplot(data=included,x='FB',y='modelcorr',hue='scanorder',kind='bar',units='Subject',ci=68)
    else:
        sns.violinplot(data=included,x='FB',y='modelcorr',hue='scanorder',split='True',bw=.4,inner='quartile',ax=ax, color='w')

    if savefig:
        plt.savefig('%s/ScanOrder_ModelCorrelations.pdf' % saveFigureLocation,dpi=600)
Exemplo n.º 19
0
 def errors(self):
     """Bar plot of error counts per dimension, one hue per model depth.

     Each model in ``self.myexp.models`` is activated in turn and the rows
     of its ``errors()`` frame are collected with the depth prepended.
     """
     # Not referenced below; the lookup is kept so behaviour is unchanged.
     colors = sns.color_palette('Set2')[1]
     rows = []
     for depth, model in self.myexp.models:
         self.myexp.set_model(model)
         e = self.myexp.errors()
         rows.extend([depth] + r.values.tolist() for _, r in e.iterrows())
     # Column names come from the last errors() frame.
     header = ['depth'] + e.columns.values.tolist()
     df = pandas.DataFrame(rows, columns=header)
     sns.factorplot(x='dimension', y='count', data=df, hue='depth', kind='bar')
     self.show(pref='errors', suffix='all')
Exemplo n.º 20
0
def plot_fit(data, model, x_cols, nonzero_only=True, title='',binary=False):
    """Horizontal bar plot of a fitted model's coefficients.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column names label the coefficients.
    model : object
        Fitted estimator exposing ``coef_``.
    x_cols : sequence of int
        Positions in ``data.columns`` of the features, in coefficient order.
    nonzero_only : bool
        Drop coefficients that are exactly zero.
    title : str
        Plot title.
    binary : bool
        When true the first row of ``coef_`` is used.
    """
    sns.set_style('darkgrid')
    coefs = model.coef_[0] if binary else model.coef_

    column_names = data.columns.tolist()
    results = pd.DataFrame({'param': [column_names[i] for i in x_cols],
                            'value': coefs})
    if nonzero_only:
        results = results.loc[results.value != 0]

    sns.factorplot('value', 'param', kind="bar", data=results, size=10, aspect=.7)
    plt.title(title)
Exemplo n.º 21
0
def corr_models(mods1_dis, mods2_dis):
    """Correlate every matrix in ``mods1_dis`` with every one in ``mods2_dis``.

    Only the entries strictly above the diagonal are compared.  The pairwise
    Pearson correlations are collected into a frame (columns ``perception``,
    ``models``, ``correlation``), passed through ``stats.factorize``, drawn
    as a bar plot and returned.
    """
    records = []
    for label1, matrix1 in mods1_dis.items():
        # Index pair selecting the strict upper triangle of matrix1.
        upper = np.triu_indices(matrix1.shape[0], k=1)
        for label2, matrix2 in mods2_dis.items():
            r = np.corrcoef(matrix1[upper], matrix2[upper])[0,1]
            records.append([label1, label2, r])

    df = pandas.DataFrame(records, columns=['perception', 'models', 'correlation'])
    df = stats.factorize(df)
    sns.factorplot('perception', 'correlation', 'models',
                   data=df, kind='bar')
    return df
def bivariate_bar_plot(df,dep_var, indep_var,grpby, units):
    """Save a bar plot of ``dep_var`` against ``indep_var``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    dep_var : str
        Column plotted on the y axis.
    indep_var : str
        Column plotted on the x axis.
    grpby : str or None
        Column used for the bar hue; no hue is used when falsy.
    units :
        Accepted for interface compatibility; not used by this function.

    The PNG is written under the module-level ``wd`` prefix and the figure is
    closed afterwards.
    """
    plot_kwargs = dict(x=indep_var, y=dep_var, data=df, kind="bar", ci=None)
    if grpby:
        # grpby is used to add hue to the graph
        plot_kwargs['hue'] = grpby
    seaborn.factorplot(**plot_kwargs)

    # would be great to figure out how to remove '_cat'
    plt.xlabel(indep_var)
    plt.ylabel(dep_var)
    plt.title(dep_var + " by " + indep_var)
    plt.savefig(wd + "Bivariate Plot of " + dep_var + "_vs_"+ indep_var + '.png')
    # Actually call close(); the bare attribute reference was a no-op, so
    # figures accumulated across calls.
    plt.close()
Exemplo n.º 23
0
def plot_modified_data(data_dict_paths_correct, data_dict_paths_wrong, modified_correct_plotting_dict, modified_wrong_plotting_dict, plot_savefile, mode):
    """Plot scores over timesteps for original and modified predictions.

    Four pickled dictionaries are loaded, tagged with an ``lstm_pred`` label
    ('correct', 'wrong', 'modified_correct', 'modified_wrong'), merged key by
    key and flattened into one frame, excluding ``mean_score``.  The frame is
    plotted with one facet per layer (rows) and activation (columns) and
    saved as ``plot_savefile + '.png'``.  The ``mean_score`` entries are
    printed.

    ``mode == 'regression'`` labels the score column 'squared error'; any
    other value labels it 'accuracy'.  The five column names are assigned by
    position, so the dictionaries' key order must match.

    NOTE(review): ``pickle.load`` executes arbitrary code from the file --
    only use trusted inputs.
    """
    def _unpickle(path):
        with open(path, 'rb') as f:
            return pickle.load(f)

    data_correct = _unpickle(data_dict_paths_correct)
    data_wrong = _unpickle(data_dict_paths_wrong)
    modified_data_correct = _unpickle(modified_correct_plotting_dict)
    modified_data_wrong = _unpickle(modified_wrong_plotting_dict)

    tagged = [
        (data_correct, 'correct'),
        (data_wrong, 'wrong'),
        (modified_data_correct, 'modified_correct'),
        (modified_data_wrong, 'modified_wrong'),
    ]
    # One label per score entry.
    for payload, tag in tagged:
        payload['lstm_pred'] = np.array([tag] * len(payload['scores']))

    # Collect, per key, the values from all four dictionaries in order.
    data_merged = defaultdict(list)
    for payload, _ in tagged:
        for k, v in payload.items():
            data_merged[k].append(v)

    overall_scores = data_merged['mean_score']

    # Flatten everything except the per-dictionary mean scores.
    data_merged_new = {
        key: [item for sublist in chunks for item in sublist]
        for key, chunks in data_merged.items()
        if str(key) != 'mean_score'
    }

    print(data_merged_new.keys())
    data = pd.DataFrame(data_merged_new, columns=list(data_merged_new.keys()))

    is_regression = mode == 'regression'
    score_name = 'squared error' if is_regression else 'accuracy'
    palette = {'correct': 'darkgreen', 'modified_correct': 'lightgreen',
               'wrong': 'crimson', 'modified_wrong': 'salmon'}

    sns.set(font_scale=1.5)
    data.columns = [score_name, 'timesteps', 'activation', 'layer', 'lstm prediction']
    plot = sns.factorplot(x="timesteps", y=score_name, hue="lstm prediction",
                          palette=palette, row='layer', col="activation",
                          legend_out=True, data=data)
    if is_regression:
        print('MSE from top left to bottom right:', overall_scores)
    else:
        print('Mean accuracies correct:', overall_scores[0])
        print('Mean accuracies false:', overall_scores[1])

    plot.savefig(plot_savefile + '.png')
Exemplo n.º 24
0
    def plot_modalities_bars(self, sample_ids=None, feature_ids=None,
                             data=None, groupby=None,
                             phenotype_to_color=None,
                             bootstrapped=False, bootstrapped_kws=None):
        """Plot the modality assignments as a factor plot.

        Parameters
        ----------
        sample_ids : None or list of str
            Which samples to use. If None, use all. Ignored when ``data`` is
            given.
        feature_ids : None or list of str
            Which features to use. If None, use all. Ignored when ``data`` is
            given.
        data : None or pandas.DataFrame
            If given, modalities are computed from this data instead of from
            ``sample_ids``/``feature_ids``.
        groupby : None or mapping
            Passed through to ``self.modalities``.
        phenotype_to_color : None or dict
            Accepted for interface compatibility; not used by this method.
        bootstrapped : bool
            Whether or not to use bootstrapping, i.e. resample each splicing
            event several times to get a better estimate of its true modality.
            Default False.
        bootstrapped_kws : dict
            Valid arguments to _bootstrapped_fit_transform. If None, default is
            dict(n_iter=100, thresh=0.6, minimum_samples=10)
        """
        if data is not None:
            assignments = self.modalities(data=data, groupby=groupby,
                                          bootstrapped=bootstrapped,
                                          bootstrapped_kws=bootstrapped_kws)
        else:
            assignments = self.modalities(
                sample_ids, feature_ids, groupby=groupby,
                bootstrapped=bootstrapped, bootstrapped_kws=bootstrapped_kws)

        # make sure this is always a dataframe
        if isinstance(assignments, pd.Series):
            # The index must be a collection: wrap the single name in a list
            # so the one-row frame can be built.
            assignments = pd.DataFrame([assignments.values],
                                       index=[assignments.name],
                                       columns=assignments.index)
        x_order = self.modalities_visualizer.modalities_order
        id_vars = list(self.data.columns.names)
        # Long format: one row per (feature, group) with the modality in "value".
        df = pd.melt(assignments.T.reset_index(),
                     value_vars=assignments.index.tolist(),
                     id_vars=id_vars)
        sns.factorplot('value', hue=assignments.index.name, data=df,
                       x_order=x_order)
Exemplo n.º 25
0
    def behav_amir(self):
        """Show reaction times from ``amir_2012.csv`` per dimension.

        Only rows with version '3d' and ``acc == 100`` are kept, excluding
        subjects KA11 and JJ, run 15 and conditions 31 and 34.  The remaining
        rows are aggregated per dimension/variant/version/subject with
        ``stats.aggregate`` and drawn as bars, one facet per dimension.
        """
        raw = pandas.read_csv('amir_2012.csv')
        keep = (
            (raw.version == '3d')
            & ~raw.subjid.isin(['KA11', 'JJ'])
            & (raw.run != 15)
            & ~raw.cond.isin([31, 34])
            & (raw.acc == 100)
        )
        agg = stats.aggregate(raw[keep],
                              groupby=['dimension', 'variant', 'version', 'subjid'])

        sns.factorplot(x='version', y='rt', hue='variant', col='dimension',
                       data=agg, kind='bar', col_wrap=3)
        sns.plt.show()
Exemplo n.º 26
0
def plot_multi_bars_with_sns():
    '''Demo: grouped bar chart of random data drawn with seaborn.

    Builds ``N`` labels and ``GROUP_N`` groups, fills a frame with
    ``N * GROUP_N`` uniform random values and shows one bar per label, split
    by group.
    '''
    N = 10
    GROUP_N = 3
    labels = ['label%d' % i for i in range(N)]
    groups = ['g%d' % (i + 1) for i in range(GROUP_N)]
    # Use N rather than a repeated literal so the sizes cannot drift apart.
    data = np.random.rand(GROUP_N, N).reshape(-1)

    # NOTE: cycling labels and groups independently yields every
    # (label, group) pair exactly once only because N and GROUP_N share no
    # common factor.
    df = pd.DataFrame(dict(data=data, label=labels * GROUP_N, group=groups * N))
    # hue splits each x value into one bar per group
    sns.factorplot(data=df,  x='label', y='data', hue='group', kind='bar')
    plt.xticks(rotation='vertical')
    plt.show()
def plot_scores_RF(scores, path):
    """Plot mean scores against ``n_estimators`` for a parameter grid.

    One facet is drawn per ``max_features`` value and one line per
    ``max_depths`` value.  When ``path`` is not None the figure is saved
    there, under a file name built from the minimum and maximum of the three
    parameters.
    """
    sn.set_style("whitegrid")
    sn.factorplot(x="n_estimators", y="mean", hue="max_depths", col="max_features",
                  data=scores, palette="BuGn_r", col_wrap = 3)
    if path is None:
        return

    # min/max of each parameter, in the order the file name expects.
    bounds = []
    for column in ('max_features', 'max_depths', 'n_estimators'):
        bounds.append(scores[column].min())
        bounds.append(scores[column].max())

    stem = 'max_features_{:.0f}-{:.0f}_max_depts_{:.0f}-{:.0f}_n_estimators_{:.0f}-{:.0f}'.format(*bounds)
    plt.savefig(os.path.join(path, stem + '.png'))
Exemplo n.º 28
0
	def class_SHM(self):
		"""Box plot of the per-sample ``sh_dict`` values by group and antibody class.

		Builds ``self.SHM_DF_split`` with one row per value found in
		``group['sample data'][sample]['sh_dict'][antibody_class]`` for every
		antibody class, group and sample, then plots ``Data`` per ``Group``
		with one box per ``Antibody``.
		"""
		antibody_classes = ['all classes', 'IGHM', 'IGHG', 'IGHA', 'IGHE', 'IGHD']

		# One row per value: [antibody class, group name, sample, value].
		records = [
			[ab_class, group['name'], sample, value]
			for ab_class in antibody_classes
			for group in self.groups
			for sample in group['samples']
			for value in group['sample data'][sample]['sh_dict'][ab_class]
		]
		self.SHM_DF_split = pd.DataFrame(records, columns=['Antibody', 'Group', 'Sample', 'Data'])

		sns.factorplot('Group', 'Data', data=self.SHM_DF_split, hue='Antibody' , kind='box', size = 8, aspect=2)
def showMetric(metric_name, m_table, title, output_fn = None):
    """Bar plot of ``metric_name`` per specimen, saved and then shown.

    Parameters
    ----------
    metric_name : str
        Column of ``m_table`` plotted on the y axis.
    m_table : pandas.DataFrame
        Must contain ``specimen_id`` and ``metric_name``.
    title : str
        Plot title; also part of the default file name.
    output_fn : str or None
        Output path.  Defaults to ``<data_DIR>/<metric_name><title>.pdf``,
        where ``data_DIR`` is a module-level setting.
    """
    pl.figure()
    sns.factorplot(x='specimen_id', y=metric_name, data=m_table, kind="bar")

    # Tiny, vertical tick labels.
    pl.xticks(fontsize=3)
    pl.xticks(rotation=90)
    pl.title(title)

    destination = output_fn
    if destination is None:
        destination = data_DIR+'/'+metric_name+title+'.pdf'
    pl.savefig(destination)
    pl.show()
Exemplo n.º 30
0
    def plot(self):
        """Show per-fish circular mean phase and jitter of the fetched spikes.

        Builds a frame from ``self.fetch()`` and derives three columns:
        ``eod`` (reciprocal of the median interval in ``eod_times``),
        ``cmean`` (circular mean of ``phases``) and ``jitter`` (circular
        standard deviation of ``phases`` divided by 2*pi and by ``eod``).
        Prints a one-way ANOVA table of ``cmean`` by ``fish_id``, then shows a
        bar plot and a pair plot.
        """
        # plot mean phase of spikes to show that they are fish dependent
        df = pd.DataFrame(self.fetch())
        df['eod'] = [1 / np.median(np.diff(e)) for e in df.eod_times]
        df['cmean'] = [circ.mean(e) for e in df.phases]
        df['jitter'] = [circ.std(ph) / 2 / np.pi / e for ph, e in zip(df.phases, df.eod)]

        model = ols('cmean ~ C(fish_id)', data=df).fit()
        table = sm.stats.anova_lm(model)
        print(table)

        sns.factorplot('fish_id', 'cmean', data=df, kind='bar')
        # ``.ix`` has been removed from pandas; label-based ``.loc`` selects
        # the same three columns.
        sns.pairplot(df.loc[:, ['cmean', 'jitter', 'fish_id']], hue='fish_id')
        plt.show()
Exemplo n.º 31
0
    # select observations containing more than 2 outliers
    outlier_indices = Counter(outlier_indices)        
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    
    
    return multiple_outliers   

# Detect outliers in Age, SibSp, Parch and Fare (second argument: n = 2).
Outliers_to_drop = detect_outliers(train,2,["Age","SibSp","Parch","Fare"])

# Drop the flagged rows and renumber the index.
train = train.drop(Outliers_to_drop, axis = 0).reset_index(drop=True)
# Remember the size of the training part before stacking train and test.
train_len=len(train)
dataset=pd.concat(objs=[train,test],axis=0).reset_index(drop=True)
dataset=dataset.fillna(np.nan)
# Correlation of the numeric features with Survived.
sns.heatmap(train[['Survived','SibSp','Parch','Age','Fare']].corr(),annot=True,fmt='.2f',cmap='coolwarm')
# Mean of Survived per SibSp value.
g=sns.factorplot(x='SibSp',y='Survived',data=train,kind='bar',size=6,palette='muted')
g.despine(left=True)
g=g.set_ylabels("Survival Probability")
# Age distribution, one facet per Survived value.
g=sns.FacetGrid(train,col='Survived')
g=g.map(sns.distplot,'Age')
# Fill missing fares with the median fare of the combined data.
dataset['Fare']=dataset['Fare'].fillna(dataset['Fare'].median())
# Fare distribution; the legend label shows its skewness.
g = sns.distplot(dataset["Fare"], color="m", label="Skewness : %.2f"%(dataset["Fare"].skew()))
g = g.legend(loc="best")

# Apply a log transform to Fare; values that are not positive map to 0.

dataset['Fare']=dataset['Fare'].map(lambda i : np.log(i) if i>0 else 0)

# Fill missing Embarked values with 'S'.
dataset['Embarked']=dataset['Embarked'].fillna('S')
Exemplo n.º 32
0
# All categorical basement variables contain NaN, whereas the continuous ones
# contain 0, which means those houses have no basement.
# We can replace the missing values with 'None'.

# In[ ]:

# Fill every basement column whose name does not contain 'FinSF'.
for col in basement_cols:
    if 'FinSF' not in col:
        houses[col] = houses[col].fillna('None')

# *Fireplaces*
# ------------

# In[ ]:

sns.factorplot("Fireplaces", "SalePrice", data=houses, hue="FireplaceQu")

# Having 2 fireplaces increases the house price, and a fireplace of Excellent quality is a big plus.

# In[ ]:

# If the fireplace quality is missing, the house doesn't have a fireplace.
houses["FireplaceQu"] = houses["FireplaceQu"].fillna('None')
pd.crosstab(houses.Fireplaces, houses.FireplaceQu)

# *Garages*
# ---------

# In[ ]:

sns.distplot(houses["GarageArea"], color='r', kde=False)
X, y, Z, y2 = preprocessing(df_train_data)
#print(df_train_data)
# Report the feature names and the shape of every returned piece.
print(X.columns)
print(X.shape)
print(y.shape)
print(y2.shape)
print(Z.shape)
#print(y2)


# In[4]:


# Compare the data visually with violin plots: one figure per column of Z,
# each plotted against marital status.
for _violin_y in ('age', 'job', 'education', 'default',
                  'housing', 'loan', 'contact', 'month'):
    sns.factorplot(x='marital', y=_violin_y, data=Z, kind='violin',aspect=2)
    plt.show()
Exemplo n.º 34
0
    print("RUTA : {}".format(ruta))

    # imprimir rtts relativos de ruta
    ultimo_rtt = 0
    ruta_final = []
    for ip, rtt in ruta:
        ruta_final.append((ip, rtt, rtt - ultimo_rtt))
        ultimo_rtt = rtt

    ruta_rtts_relativos = [(ip, rel_rtt) for (ip, rtt, rel_rtt) in ruta_final]

    print("ruta_rtts_relativos : {}".format(ruta_rtts_relativos))

    df = pd.DataFrame(ruta_rtts_relativos, columns=['IP', 'RTT'])
    sns.set(font_scale=1.5)
    ax = sns.factorplot(x='IP', y='RTT', data=df, aspect=1.5)
    ax.set(xlabel='IPs con más apariciones por salto',
           ylabel='RTT relativo medio (ms)')
    ax.set_xticklabels(rotation=90)
    ax.fig.suptitle('RTT medio para cada salto')

    #plt.tight_layout()
    if target:
        ax.fig.set_size_inches(24, 6)
        ax.savefig("../img/" + target + "-rtts.pdf")
    else:
        plt.show()

    ax.fig.clear()

    # imprimir incremento de rtts de ruta
sns.countplot(x='Sex', data=titanic_train)

#histogram to undertand continuous feature
#x: bins of continuous data, y: frequency
#issue: how do you select number of bins?
sns.distplot(titanic_train['Fare'], kde=False)
sns.distplot(titanic_train['Fare'], kde=True)
sns.distplot(titanic_train['Fare'], bins=20, rug=False, kde=False)
sns.distplot(titanic_train['Fare'], bins=20)
sns.distplot(titanic_train['Fare'], bins=100, kde=False)
#density plot to understand continuous feature
#it doesnt require bins argument
#x: fare y:density
sns.distplot(titanic_train['Fare'], hist=False)
sns.distplot(titanic_train['Fare'])
#box-whisker plot to understand continuous feature
sns.boxplot(x='Fare', data=titanic_train)

titanic_test = pd.read_csv(
    "D:\\Data Science\\Code Exec\\Data\\titanic_test.csv")
print(titanic_test.shape)

titanic_test['Survived'] = 0
titanic_test['test'] = 23
titanic_test.loc[titanic_test.Sex == 'female', ['Sex', 'Survived']]
titanic_test.to_csv("D:/Data Science/submission1.csv",
                    columns=['PassengerId', 'Survived', 'test'],
                    index=False)
#bi variate plots
sns.factorplot(x='Sex')
Exemplo n.º 36
0
def survival_rate(feature):
    """Plot the mean of ``Survived`` for each value of ``feature``.

    Reads the module-level ``train`` frame, averages ``Survived`` within each
    group of ``feature``, orders the groups by ascending ``feature`` value and
    passes the result to ``sns.factorplot``. Nothing is returned.
    """
    selected = train[[feature, 'Survived']]
    group_means = selected.groupby([feature], as_index=False).mean()
    ordered = group_means.sort_values(by=[feature], ascending=True)
    sns.factorplot(x=feature, y="Survived", data=ordered)
Exemplo n.º 37
0
def _finish_count_chart(xlabel, title):
    """Label the current count chart, write each bar's count above it, and show it."""
    plt.xlabel(xlabel, fontsize=14)
    plt.title(title, fontsize=16)
    ax = plt.gca()
    # The y axis is hidden because every bar carries its own count label.
    ax.axes.get_yaxis().set_visible(False)
    for p in ax.patches:
        ax.text(p.get_x() + p.get_width() / 2.,
                p.get_height(),
                '%d' % int(p.get_height()),
                fontsize=12,
                ha='center',
                va='bottom')
    plt.show()


def show_bikeshare_charts(city_data):
    """Draw the bike-share summary charts for one city.

    Charts produced, in order: a count bar chart of 'User Type', a pie chart
    of 'User Type' shares, a count bar chart of 'Start Day' (Sunday through
    Saturday) and, only when ``city_data`` has a 'Gender' entry, a count bar
    chart of 'Gender'.

    Args:
        city_data: table holding 'User Type' and 'Start Day' columns and
            optionally 'Gender' (presumably a pandas DataFrame -- confirm
            against the caller).

    Returns:
        None. The charts are displayed through ``plt.show()``.
    """
    # Bar chart for the User Type of the city
    sns.factorplot('User Type', data=city_data, kind='count')
    _finish_count_chart('User Type', "Customer Vs Subscriber Data")

    # Pie chart for User Type of the city.
    # autopct must be a complete printf-style pattern: '%1.1f%%' renders
    # e.g. "33.3%". The previous '%1.1%%f' put the literal percent sign first
    # and dropped the precision, giving labels such as "%33.333333".
    city_data['User Type'].value_counts().plot(kind='pie', autopct='%1.1f%%')
    plt.axis('equal')
    plt.title('User Type', fontsize=16)
    # No plt.show() here: the pie is displayed by the next plt.show() call.

    # Ridership for the Days of the Week
    sns.factorplot('Start Day',
                   data=city_data,
                   kind='count',
                   order=[
                       'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
                       'Friday', 'Saturday'
                   ],
                   size=8)
    _finish_count_chart('Weekday', "Ridership for the Days of the Week")

    # The gender chart is only drawn for cities whose data has a Gender column
    if 'Gender' in city_data:
        sns.factorplot('Gender', data=city_data, kind='count', size=8)
        _finish_count_chart('Gender', "Bike Share Gender Data")
Exemplo n.º 38
0
# Make nomprov categorical, with the categories listed in the order in which
# they first appear in df. The keyword form astype("category", categories=...,
# ordered=...) was removed from pandas, so the categories are set through the
# .cat accessor instead; values missing from the new category list become NaN,
# exactly as with the old call.
incomes.nomprov = incomes.nomprov.astype("category").cat.set_categories(
    list(df.nomprov.unique()), ordered=False)
incomes.head()


# In[ ]:


with sns.axes_style({
        "axes.facecolor":   "#ffc400",
        "axes.grid"     :    False,
        "figure.facecolor": "#c60b1e"}):
    # order is passed as a real list rather than a one-shot generator so it
    # can safely be read more than once.
    h = sns.factorplot(data=incomes,
                   x="nomprov",
                   y=("renta","MedianIncome"),
                   order=list(incomes.nomprov),
                   size=6,
                   aspect=1.5,
                   scale=1.0,
                   color="#c60b1e",
                   linestyles="None")
plt.xticks(rotation=90)
plt.tick_params(labelsize=16,labelcolor="#ffc400")
plt.ylabel("Median Income",size=32,color="#ffc400")
plt.xlabel("City",size=32,color="#ffc400")
plt.title("Income Distribution by City",size=40,color="#ffc400")
plt.ylim(0,180000)
plt.yticks(range(0,180000,40000))


# There's a lot of variation, so I think assigning missing incomes by province is a good idea. First group the data by city, and reduce to get the median. This intermediate data frame is joined by the original city names to expand the aggregated median incomes, ordered so that there is a 1-to-1 mapping between the rows, and finally the missing values are replaced.
Exemplo n.º 39
0
def CallAccuracyPlots(name, data):
    """Plot adapter call accuracies three ways and describe the saved images.

    For every adapter type in ``data`` the records with an accuracy (element
    2) above 0.0 are kept and labelled "TruePos" when element 3 is truthy and
    "FalsePos" otherwise. A box plot, a per-adapter-type density plot and a
    per-adapter-type histogram of those accuracies are then written to
    ``<name>_call_accuracy_box.png``, ``<name>_call_accuracy_dist.png`` and
    ``<name>_call_accuracy_hist.png``.

    Args:
        name: prefix used for the image file names and the plot ids/titles.
        data: mapping of adapter type to a sequence of indexable records;
            only elements 2 and 3 of each record are read.

    Returns:
        A list of three dicts (box, density, histogram), each with the keys
        "caption", "image", "tags", "id", "title" and "uid".
    """
    frames = []
    # .items() works on both Python 2 and 3 mappings (iteritems is 2-only).
    for adp, vals in data.items():
        filtered = [v for v in vals if v[2] > 0.0]
        TP = [v[2] for v in filtered if v[3]]
        FP = [v[2] for v in filtered if not v[3]]
        classes = (["TruePos"] * len(TP)) + (["FalsePos"] * len(FP))
        # The scalar adapter type is broadcast over the rows of the Series.
        frames.append(pd.DataFrame({
            "AdapterType": adp,
            "AdapterClass": pd.Series(classes),
            "CallAccuracy": pd.Series(TP + FP)
        }))

    # One concat instead of repeated DataFrame.append, which copied the
    # accumulated frame on every iteration and no longer exists in pandas 2.
    df = pd.concat(frames)

    ax = sns.factorplot(x="AdapterType",
                        y="CallAccuracy",
                        hue="AdapterClass",
                        kind="box",
                        data=df)
    plt.subplots_adjust(top=0.9)
    ax.fig.suptitle("Adapter Call Accuracy by Type and Classification")

    plt.ylim(0.4, 1.05)
    pltFilename = "{0}_call_accuracy_box.png".format(name)
    plt.savefig(pltFilename)
    plt.close()

    p1 = {
        "caption": "Adapter Call Accuracy Box Plots For Adapter Types",
        "image": pltFilename,
        "tags": [],
        "id": "{0} - Adapter Call Accuracy Box Plots".format(name),
        "title": "{0} - Adapter Call Accuracy Box Plots".format(name),
        "uid": "0500001"
    }

    # Both facet grids below plot the same long-form table, so build it once.
    melted = pd.melt(df,
                     id_vars=['AdapterType', 'AdapterClass'],
                     value_vars=['CallAccuracy'])

    g = sns.FacetGrid(melted,
                      hue='AdapterClass',
                      row='AdapterType',
                      aspect=2.0)
    g.map(sns.kdeplot, 'value', shade=True)
    plt.subplots_adjust(top=0.9)
    g.fig.suptitle("Adapter Call Accuracy by Type and Classification")
    plt.legend()
    pltFilename = "{0}_call_accuracy_dist.png".format(name)
    plt.savefig(pltFilename)
    plt.close()

    p2 = {
        "caption": "Adapter Call Accuracy Density Plot For Adapter Types",
        "image": pltFilename,
        "tags": [],
        "id": "{0} - Adapter Call Accuracy Density Plot".format(name),
        "title": "{0} - Adapter Call Accuracy Density Plot".format(name),
        "uid": "0500002"
    }

    g = sns.FacetGrid(melted,
                      hue='AdapterClass',
                      row='AdapterType',
                      aspect=2.0)
    # Bin edges from 0.400 to 1.000 in steps of 0.025.
    bins = [x / 1000.0 for x in range(400, 1001, 25)]
    g.map(plt.hist, 'value', alpha=0.5, bins=bins)
    plt.subplots_adjust(top=0.9)
    g.fig.suptitle("Adapter Call Accuracy by Type and Classification")
    plt.legend()
    pltFilename = "{0}_call_accuracy_hist.png".format(name)
    plt.savefig(pltFilename)
    # Close this figure too, as is done for the first two, so it is not left open.
    plt.close()

    p3 = {
        "caption": "Adapter Call Accuracy Histogram For Adapter Types",
        "image": pltFilename,
        "tags": [],
        "id": "{0} - Adapter Call Accuracy Histogram".format(name),
        "title": "{0} - Adapter Call Accuracy Histogram".format(name),
        "uid": "0500003"
    }

    return [p1, p2, p3]
# View first lines of test data
df_test.head()

df_train.info()

df_train.describe()

sns.countplot(x = 'Survived', data = df_train)

df_test['Survived'] = 0
df_test[['PassengerId', 'Survived']].to_csv('/home/sarvesh/Titanic/no_survivors.csv', index = False)

sns.countplot(x = 'Sex', data = df_train)

sns.factorplot(x = 'Survived', col = 'Sex', kind = 'count', data = df_train)

df_train.groupby(['Sex']).Survived.sum()

print(df_train[df_train.Sex == 'female'].Survived.sum()/df_train[df_train.Sex == 'female'].Survived.count())
print(df_train[df_train.Sex == 'male'].Survived.sum()/df_train[df_train.Sex == 'male'].Survived.count())

df_test['Survived'] = df_test.Sex == 'female'
df_test['Survived'] = df_test.Survived.apply(lambda x: int(x))
df_test.head()

sns.factorplot(x = 'Survived', col = 'Embarked', kind = 'count', data = df_train)

sns.distplot(df_train.Fare, kde = False)

df_train.groupby('Survived').Fare.hist(alpha = 0.6)
data['Pclass'].value_counts().plot.bar(color=["#CD7F32", "#FFDF00", "#D3D3D3"],
                                       ax=ax[0])
ax[0].set_title('Number Of Passengers By Pclass')
ax[0].set_ylabel('Count')
sns.countplot('Pclass', hue='Survived', data=data, ax=ax[1])
ax[1].set_title('Pclass:Survived vs Dead')
plt.show()

# In[ ]:

pd.crosstab([data.Sex, data.Survived], data.Pclass,
            margins=True).style.background_gradient(cmap='summer_r')

# In[ ]:

sns.factorplot('Pclass', 'Survived', hue='Sex', data=data)
plt.show()

# In[ ]:

print('Oldest Passenger was of:', data['Age'].max(), 'Years')
print('Youngest Passenge was of:', data['Age'].min(), 'Years')
print('Average Age on the ship:', data['Age'].mean(), 'Years')

# In[ ]:

f, ax = plt.subplots(1, 2, figsize=(18, 8))
sns.violinplot("Pclass",
               "Age",
               hue="Survived",
               data=data,
Exemplo n.º 42
0
# Advantage score per state: (Democrat votes - Republican votes) / all votes.
# States that lack rows for either party are skipped.
d = {}
# Iterate each state once; looping over Data.state itself repeated the same
# computation for every row belonging to a state.
for state_name in Data.state.unique():
    sub_data = Data[Data.state == state_name]
    demo_rows = sub_data[sub_data.party == 'Democrat']
    repu_rows = sub_data[sub_data.party == 'Republican']
    if demo_rows.empty or repu_rows.empty:
        continue
    # float() keeps the division a true division under Python 2 as well.
    advantage = float(np.sum(demo_rows.votes) -
                      np.sum(repu_rows.votes)) / np.sum(sub_data.votes)
    d[state_name] = advantage

# Ascending by score: most Republican-leaning first, most Democrat-leaning last.
Advantage_score = pd.DataFrame(sorted(d.items(), key=lambda x: x[1]))
Advantage_score.columns = ['state', 'adv_score']

Demo_Adv = Ave_vote_Demo[Ave_vote_Demo.state.isin(
    Advantage_score.tail(10).state)]
Repu_Adv = Ave_vote_Repu[Ave_vote_Repu.state.isin(
    Advantage_score.head(10).state)]

#Plot of ave_vote_rate for different candidate among the top 10 advantage state
sns.factorplot(x="state",
               y="ave_vote_rate",
               hue="candidate",
               data=Demo_Adv,
               kind="bar")
sns.factorplot(x="state",
               y="ave_vote_rate",
               hue="candidate",
               data=Repu_Adv,
               kind="bar")
Exemplo n.º 43
0
5) How did voter sentiment change over time?
6) Can we see an effect in the polls from the debate?
'''

# Read election poll dataset into a pandas dataset
url = 'http://elections.huffingtonpost.com/pollster/2012-general-election-romney-vs-obama.csv'
source = requests.get(url).text
poll_data = StringIO(source)
poll_df = pd.read_csv(poll_data)
print poll_df.head()

# Who was being polled? (bargraph/factorplot)
sns.factorplot(x='Affiliation',
               kind='count',
               data=poll_df,
               order=(['Dem', 'None', 'Rep']),
               hue='Population',
               size=6,
               aspect=2)

# What was the mean and stdev
avg = poll_df[['Obama', 'Romney', 'Undecided']].mean()
std = poll_df[['Obama', 'Romney', 'Undecided']].std()

plt.figure()
avg.plot(kind='bar', yerr=std, legend=False, color='indianred')

# Concatenating dataframes using pd.concat
poll_avg = pd.concat([avg, std], axis=1)
poll_avg.columns = ['Average', 'STD']
print poll_avg.head()
train_data[train_data["Fare"].isnull()]

x = train_data[train_data["Pclass"] == 3]["Fare"].mean()
train_data["Fare"] = train_data["Fare"].fillna(x)
train_data[train_data["Fare"].isnull()]

# Visualization

list1 = ["SibSp", "Parch", "Age", "Fare", "Survived"]
sns.heatmap(train_data[list1].corr(), annot=True, fmt=".2f")
plt.show()

g = sns.factorplot(x="SibSp",
                   y="Survived",
                   data=train_data,
                   kind="bar",
                   size=6)
#g.set_ylabels("Survived Probability")

sns.factorplot(x="Pclass",
               y="Survived",
               data=train_data,
               hue="Sex",
               kind="bar")
sns.factorplot(x="Pclass", y="Fare", data=train_data, hue="Sex", kind="violin")

sns.factorplot(x="Parch", y="Survived", kind="bar", data=train_data, size=6)

g = sns.FacetGrid(train_data, col="Survived")
g.map(sns.distplot, "Age", bins=25)
# compare conditions
# ------------------

# create a data frame
data_dict = {'travel time' : condition_list, 'body part' : \
 ['shoulder'] * reps_per_cond + ['leg'] * reps_per_cond}
df = pd.DataFrame(data_dict)

# set figure style
sns.set(style="ticks")

# create figure
g = sns.factorplot("body part",
                   "travel time",
                   data=df,
                   kind="box",
                   palette="PRGn")
g.despine(offset=10, trim=True)
# save figure
plt.savefig(filename)

# get image on screen
img = Image.open(filename)
imgsize = np.array(img.size)
del img

# set image
info_img.size = imgsize
info_img.setImage(filename)
info_img.draw()
Exemplo n.º 46
0
def grid(data,
         x,
         y,
         col=None,
         hue=None,
         col_wrap=4,
         palette='default',
         style='astetik',
         dpi=72,
         title='',
         sub_title='',
         x_label='',
         y_label='',
         legend=True,
         x_scale='linear',
         y_scale='linear',
         x_limit=None,
         y_limit=None,
         save=False):
    '''THE GRID

    The grid provides an overview of 4 features simultaneously by
    drawing a grid of strip plots (categorical scatter plots).

    Inputs: 4
    Features: Ideally two continuous, and two categorical, but will
              also work with just one continuous and two categoricals.

    1. USE
    ======
    ast.grid(data=new_patients.head(1000),
              x='icu_stays',
              y='hospital_days',
              col='religion',
              palette='default',
              col_wrap=4);

    2. PARAMETERS
    =============
    2.1 INPUT PARAMETERS
    --------------------
    data :: pandas dataframe (a deep copy is plotted, the input is
            left untouched)

    x :: x-axis data (continuous or categorical)

    y :: y-axis data (continuous)

    hue :: color highlight (categorical)

    col :: the side-by-side plot comparison feature

    --------------------
    2.2. PLOT PARAMETERS
    --------------------
    col_wrap :: the number of plots to show per row (only applied
                when `col` is given)

    ----------------------
    2.3. COMMON PARAMETERS
    ----------------------
    palette :: One of the hand-crafted palettes:
                'default'
                'colorblind'
                'blue_to_red'
                'blue_to_green'
                'red_to_green'
                'green_to_red'
                'violet_to_blue'
                'brown_to_green'
                'green_to_marine'

                Or use any cmap, seaborn or matplotlib
                color or palette code, or hex value.

    style :: Use one of the three core styles:
                'astetik'     # white
                '538'         # grey
                'solarized'   # sepia

              Or alternatively use any matplotlib or seaborn
              style definition.

    dpi :: the resolution of the plot (int value)

    title :: the title of the plot (string value)

    sub_title :: a secondary title to be shown below the title

    x_label :: string value for x-axis label

    y_label :: string value for y-axis label

    legend :: bool

    x_scale :: 'linear' or 'log' or 'symlog'

    y_scale :: 'linear' or 'log' or 'symlog'

    x_limit :: int or list with two ints

    y_limit :: int or list with two ints

    save :: forwarded to the footer helper

    NOTE: title, sub_title, legend, x_scale, y_scale, x_limit and
    y_limit are accepted for signature compatibility but are not
    used by this function's body.

    '''

    data = data.copy(deep=True)

    # One color per distinct hue value; a single color when no hue is set.
    if hue is not None:
        n_colors = len(data[hue].unique())
    else:
        n_colors = 1

    # HEADER STARTS >>>
    # Forward the caller's dpi; it used to be pinned to 72 regardless of
    # the argument (72 is still the default).
    palette = _header(palette,
                      style,
                      n_colors=n_colors,
                      dpi=dpi,
                      fig_height=None,
                      fig_width=None)
    # <<< HEADER ENDS

    # Honor the caller's col_wrap (previously hard-coded to 4, which is
    # still the default). Wrapping is only meaningful when there is a
    # column facet, so nothing is passed otherwise.
    p = sns.factorplot(data=data,
                       x=x,
                       y=y,
                       col=col,
                       hue=hue,
                       palette=palette,
                       col_wrap=col_wrap if col is not None else None,
                       kind='strip',
                       size=3)

    # FOOTER STARTS >>>
    _footer(p, x_label, y_label, save=save)

    sns.despine(bottom=True, left=True)
    # Drop the x tick labels on every facet.
    p.set(xticklabels=[])
Exemplo n.º 47
0
train.loc[Outliers_to_drop] # Show the outliers rows
train = train.drop(Outliers_to_drop, axis = 0).reset_index(drop=True)
train_len = len(train)
dataset =  pd.concat(objs=[train, test], axis=0).reset_index(drop=True)
dataset = dataset.fillna(np.nan)
dataset.isnull().sum()

train.info()
train.isnull().sum()

train.describe()
# feature analysis
g = sns.heatmap(train[["Survived", "SibSp", "Parch", "Age", "Fare"]].corr(), annot = True, fmt = ".2f", cmap = "coolwarm")
# Explore SibSp feature vs Survived

g = sns.factorplot(x = "SibSp", y = "Survived", data = train, kind = 'bar', size= 6, palette = "muted")
g.despine(left = True)
g = g.set_ylabels("survival probability")

# Explore Parch feature vs Survived
g  = sns.factorplot(x="Parch",y="Survived",data=train,kind="bar", size = 6 , palette = "muted")
g.despine(left=True)
g = g.set_ylabels("survival probability")
plt.show()
plt.figure()

# age vs survivability
g = sns.FacetGrid(train, col = 'Survived')
g = g.map(sns.distplot, 'Age')
# Explore Age vs Survived
g = sns.FacetGrid(train, col='Survived')
Exemplo n.º 48
0
# Pclass

g = sns.catplot(x="Pclass",
                y="Survived",
                data=train,
                kind="bar",
                size=6,
                palette="muted")
g.despine(left=True)
g = g.set_ylabels("survival probability")

g = sns.factorplot(x="Pclass",
                   y="Survived",
                   hue="Sex",
                   data=train,
                   size=6,
                   kind="bar",
                   palette="muted")
g.despine(left=True)
g = g.set_ylabels("survival probability")

# Embarked
dataset["Embarked"].isnull().sum()
dataset.info()

dataset["Embarked"] = dataset["Embarked"].fillna("S")

g = sns.catplot(x="Embarked",
                y="Survived",
                data=train,
Exemplo n.º 49
0
import warnings
warnings.filterwarnings("ignore")


data = pd.read_csv('f:\\nesarc_pds.csv', low_memory=False)

sub = data[data['S1Q213'] <= 1]
# a copy of dataset where undesired data has been removed

#MY PRIMARY TOPIC OR DEPENDENT VARIABLE
c4 = data['S1Q213'].value_counts(dropna = False)

#DEPENDENT VARIABLE GRAPH PLOT
seaborn.countplot(x = 'S1Q213', data=sub)

plt.title('DURING PAST 4 WEEKS, HOW OFTEN FELT DOWNHEARTED AND DEPRESSED')

seaborn.factorplot(x = 'S1Q213', y = 'S4AQ4A18',data = sub, kind = 'bar', ci= None)

plt.title('Relation between sucidaland depressed')


seaborn.factorplot(x = 'S4AQ11', y = 'S1Q213',data = sub, kind = 'bar', ci= None)
plt.title('Relation between drinking and depressed')


seaborn.distplot(c4); plt.xlabel(' DISTRIBUTION')



Exemplo n.º 50
0
                                                      ascending=False)
# % de survivant per Cabin(first letter)

# In[47]:

#plots of several relevant features against 'survived' leading to feature creation
g = sns.FacetGrid(train, col='Survived')
g = g.map(sns.distplot, "Age")
g = g.set_ylabels("survival probability")

# In[48]:

# Relation between Siblings and survival probability
g = sns.factorplot(x="SibSp",
                   y="Survived",
                   data=train,
                   kind="bar",
                   size=6,
                   palette="muted")
g.despine(left=True)
g = g.set_ylabels("survival probability")

# In[49]:

# Survival probability according to their class and sex
g = sns.factorplot(x="Pclass",
                   y="Survived",
                   hue="Sex",
                   data=train,
                   size=6,
                   kind="bar",
                   palette="muted")
Exemplo n.º 51
0
# - It would be interesting to see if there are relatively more women/men in a particular field - Do women or men like to identify more with specific job titles?

# In[ ]:

jobs_by_gender = df[["GenderSelect", "CurrentJobTitleSelect"
                     ]].groupby([df.CurrentJobTitleSelect, df.GenderSelect
                                 ]).size().reset_index(name="number")

# In[ ]:

from matplotlib import pyplot

chart = sns.factorplot(x='CurrentJobTitleSelect',
                       y='number',
                       hue='GenderSelect',
                       data=jobs_by_gender,
                       kind='bar',
                       size=15,
                       aspect=2,
                       legend=False)
for ax in plt.gcf().axes:
    ax.set_xlabel("Job Title", fontsize=35)
    ax.set_ylabel("Count", fontsize=35)

for ax in chart.axes.flatten():
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=25)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=25)

plot = plt.legend(loc='upper left', prop={'size': 20})
plot = plt.title("Number of people with Different Job Titles by Gender",
                 fontsize=30)
Exemplo n.º 52
0
def run():
    """Fetch FBI UCR state crime estimates and plot four crime types per state.

    Downloads the estimates for Colorado, Washington, Alaska and Oregon,
    keeps the first 22 records of each response, restricts them to the years
    2010 and 2016 and draws one bar-chart facet per state showing burglary,
    property crime, violent crime and motor vehicle theft for those years.

    Returns:
        None.

    Raises:
        KeyError: if a response or record lacks one of the expected fields.
    """
    url = "https://api.usa.gov/crime/fbi/ucr/estimates/states/"
    page = "?page=1&per_page=200&output=json&api_key="
    # NOTE(review): the API key is hard-coded in source; consider loading it
    # from configuration or the environment instead.
    api_key = "iiHnOKfno2Mgkt5AynpvPpUQTEyxE77jo1RU8PIv"
    states = ['co', 'wa', 'ak', 'or']
    records_per_state = 22

    fields = [
        "state_abbr", "year", "population", "aggravated_assault", "burglary",
        "homicide", "larceny", "motor_vehicle_theft", "property_crime",
        "rape_legacy", "robbery", "violent_crime"
    ]
    values_by_field = {field: [] for field in fields}
    for state in states:
        # One request per state: the same page used to be downloaded again
        # for every single record read from it.
        results = requests.get(url + state + page + api_key).json()["results"]
        # Slicing tolerates a response with fewer records than expected.
        for record in results[:records_per_state]:
            for field in fields:
                values_by_field[field].append(record[field])
    crime_data = pd.DataFrame(values_by_field)

    crime_date_f = crime_data.loc[crime_data["year"].isin([2010, 2016])].copy()
    total_Crime = (crime_date_f.burglary + crime_date_f.aggravated_assault +
                   crime_date_f.motor_vehicle_theft +
                   crime_date_f.property_crime + crime_date_f.violent_crime +
                   crime_date_f.robbery + crime_date_f.homicide +
                   crime_date_f.rape_legacy + crime_date_f.larceny)
    # NOTE(review): the total is stored on the unfiltered frame and is not
    # read by the plot below -- confirm whether crime_date_f was intended.
    crime_data['total_Crime'] = total_Crime

    # Long form: one row per (state, year, crime type) with its value in 'Sum'.
    crime_data_long = crime_date_f.melt(id_vars=['state_abbr', 'year'],
                                        value_name='Sum',
                                        var_name='Crime_types')
    array = [
        'burglary', 'property_crime', 'violent_crime', 'motor_vehicle_theft'
    ]
    crime_data_long_1 = crime_data_long.loc[
        crime_data_long["Crime_types"].isin(array)]
    sns.factorplot(x="year",
                   y="Sum",
                   hue="Crime_types",
                   col="state_abbr",
                   data=crime_data_long_1,
                   kind="bar",
                   size=4,
                   aspect=.7)
# drop unnecessary columns, these columns won't be useful in analysis and prediction
titanic_df = titanic_df.drop(['PassengerId','Name','Ticket'], axis=1)
test_df    = test_df.drop(['Name','Ticket'], axis=1)
test_df.head()


# In[ ]:


# Embarked

# only in titanic_df, fill the two missing values with the most occurred value, which is "S".
titanic_df["Embarked"] = titanic_df["Embarked"].fillna("S")

# plot
sns.factorplot('Embarked','Survived', data=titanic_df,size=4,aspect=3)

fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(15,5))

# sns.factorplot('Embarked',data=titanic_df,kind='count',order=['S','C','Q'],ax=axis1)
# sns.factorplot('Survived',hue="Embarked",data=titanic_df,kind='count',order=[1,0],ax=axis2)
sns.countplot(x='Embarked', data=titanic_df, ax=axis1)
sns.countplot(x='Survived', hue="Embarked", data=titanic_df, order=[1,0], ax=axis2)

# group by embarked, and get the mean for survived passengers for each value in Embarked
embark_perc = titanic_df[["Embarked", "Survived"]].groupby(['Embarked'],as_index=False).mean()
sns.barplot(x='Embarked', y='Survived', data=embark_perc,order=['S','C','Q'],ax=axis3)

# Either to consider Embarked column in predictions,
# and remove "S" dummy variable, 
# and leave "C" & "Q", since they seem to have a good rate for Survival.
Exemplo n.º 54
0
predict = pd.DataFrame(y_pred_orig)
output = output.join(predict)
output.columns = ['PassengerId', 'Survived']
print(output)


import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


# Age versus Survival rate. graph. 
sns.factorplot('Age', 'Survived', data = train_df, size=4, aspect=3)

fig, (axis1, axis2, axis3) = plt.subplots(1, 3, figsize = (15, 5))
sns.countplot(x='Age', data = train_df, ax=axis1)
sns.countplot(x='Survived', hue="Age", data = train_df, order=[1, 0], ax=axis2)

Age_perc = train_df[["Age", "Survived"]].groupby(['Age'], as_index=False).mean()
sns.barplot(x='Age', y='Survived', data=Age_perc, ax=axis3)

Age_dummies_titanic = pd.get_dummies(train_df['Age'])
Age_dummies_test = pd.get_dummies(test_df['Age'])


train_df = train_df.join(Age_dummies_titanic)
test_df = test_df.join(Age_dummies_test)
#dataset.columns.values
dataset.describe()
dataset.info()
dataset.head()
dataset.isnull().sum()

dataset['Survived'].value_counts().plot.pie()
sns.countplot(dataset['Survived'])

## categorical feature
dataset.groupby(['Sex', 'Survived'])['Survived'].count()
dataset[['Sex', 'Survived']].groupby(['Sex']).mean().plot.bar()
sns.countplot('Sex', hue='Survived', data=dataset)

dataset[['Embarked', 'Survived']].groupby(['Embarked']).mean().plot.bar()
sns.factorplot('Embarked', 'Survived', data=dataset)
sns.countplot('Embarked', data=dataset)
sns.countplot('Embarked', hue='Sex', data=dataset)
sns.countplot('Embarked', hue='Survived', data=dataset)
sns.countplot('Embarked', hue='Pclass', data=dataset)
## ordinal feature
pd.crosstab(dataset.Pclass, dataset.Survived, margins=True)
dataset['Pclass'].value_counts().plot.bar()
sns.countplot('Pclass', hue='Survived', data=dataset)
pd.crosstab([dataset.Sex, dataset.Survived], dataset.Pclass, margins=True)

sns.factorplot('Pclass', 'Survived', hue='Sex', data=dataset)

## continous feature

dataset['Age'].describe()
Exemplo n.º 56
0
        if mean_score < 0.8:
            continue

        players = set(data[data['tick'] == max(data['tick'])]['pid'])
        n = len(players)
        vals += list(dists - syn_dists)
        ns += [n] * len(dists)
        models += [model] * len(dists)

data = pd.DataFrame({'model': models, 'n_players': ns, 'values': vals})

sns.set(font='serif', context='poster', style='white')
sns.despine()

g = sns.factorplot('n_players',
                   'values',
                   markers=['o', 's'],
                   linestyles=['-', '--'],
                   data=data,
                   kind='point',
                   dodge=0.15,
                   x_order=sorted(set(data['n_players'])),
                   col='model')

plt.plot([0, 7], [0, 0], 'k-', lw=2)

fig = plt.gcf()

fig.savefig('../../plots/values.pdf')
Exemplo n.º 57
0
            continue
        log_file = '{}/test_{}_{}.log'.format(dir, d, e)
        with open(log_file) as f:
            lines = f.readlines()
            lines = [
                line.replace('=', ' ').split() for line in lines
                if line.find('Test') != -1
            ]
            acc = float(lines[-1][9]) if d == 'ppi' else float(lines[-1][6])

            accs.append(acc)
            algos.append(etitle)
            data.append(d)

df = pd.DataFrame(data={
    'Testing accuracy': accs,
    'Algorithm': algos,
    'Dataset': data
})
print(df)
g = sns.factorplot(x='Dataset',
                   y='Testing accuracy',
                   hue='Algorithm',
                   data=df,
                   kind='bar',
                   aspect=2,
                   size=2,
                   palette=colors)
g.savefig('test.pdf')
os.system('pdfcrop test.pdf test.pdf')
    size=5)

# Okay! We're done with parameter tweaking so let us now move on to use another Seaborn attribute for plotting our **beeswarms** on separate axes. This attribute is known as **Factorplot** and we shall discuss it majorly in the later section of the course BUT for now we shall just use it to get more mileage from our **Swarmplot**.

# Little tired of *Iris* flower sets! Let us use our *Tips* dataset this time around.

# In[44]:

# Loading Tips dataset:
tips = sns.load_dataset("tips")

sns.factorplot(x="sex",
               y="total_bill",
               hue="smoker",
               col="time",
               data=tips,
               kind="swarm",
               size=4.5,
               aspect=.7,
               palette="rocket")

# **Factorplot** has given us the flexibility to visualize our dataset, i.e. **Tips dataset**, in two separate segments within a single plot, segregated by the **time** of day. So the *first set of axes* help us understand the trend during **Lunch** time and on **right**, we get a set of axes for **Dinner** time. `hue` parameter reflects the `palette` parameter, which in turn displays data points in *separate colors*, where **smokers** are presented by *purple* color. With such a presentation, it gets easier to see the *bulk of customers* on basis of their **Gender**, the **total bill** that their arrival in the restaurant generates.

# More often you shall find that it is never a Swarmplot that alone represents those data points, as it is generally combined with **Boxplot** or **Violinplots**, that we shall discuss in-depth later on in this course.

# I won't get into great detail but will show you a simple way of mixing these **Swarmplots** with other plot. Let me use a **Boxplot** to demonstrate what I mean and as assured earlier, I will cover Boxplots later in much more depth with all it's *parameters* and *general use-cases*:

# In[77]:

sns.swarmplot(x="day", y="total_bill", data=tips, palette="rocket")
sns.boxplot(x="day",
Exemplo n.º 59
0
#2. Pair plots
#Age = data_titanic['Age']
#sns.pairplot(Age) #Not a good example. Don't RUN THIS!

#age_sex = data_titanic.iloc[:,4:6]
#sns.pairplot(age_sex)

#sns.heatmap(age_sex)

#############################################################################################

# We have 3 dfs now. data_titanic, mendata, womendata

#Gender distribution

gender = sns.factorplot('Sex', data=data_titanic, kind='count')
gender.set_ylabels("count of passengers")

#Distribution by age
#age_data = data_titanic['Age'].hist(bins = 80)
#plt.set_ylabel("Age of Passengers")

age_data = data_titanic['Age']
plt.hist(age_data.dropna(), bins=80)
plt.xlabel("Age of Passengers")
plt.ylabel("Frequency")
plt.title("Passenger's Age Distribution", fontsize=30, color='black')
plt.show()

#Distribution by class
count_first = data_titanic.groupby('Pclass')['PassengerId'].count()
Exemplo n.º 60
0
"""
from sklearn.datasets import make_circles
from sklearn.utils import shuffle
import pandas as pd

from timeit import default_timer as tic
import sklearn.cluster
import dask_ml.cluster
import seaborn as sns

Ns = [2500, 5000, 7500, 10000]
X, y = make_circles(n_samples=10_000, noise=0.05, random_state=0, factor=0.5)
X, y = shuffle(X, y)

timings = []
for n in Ns:
    X, y = make_circles(n_samples=n, random_state=n, noise=0.5, factor=0.5)
    t1 = tic()
    sklearn.cluster.SpectralClustering(n_clusters=2).fit(X)
    timings.append(('Scikit-Learn (exact)', n, tic() - t1))
    t1 = tic()
    dask_ml.cluster.SpectralClustering(n_clusters=2, n_components=100).fit(X)
    timings.append(('dask-ml (approximate)', n, tic() - t1))

df = pd.DataFrame(timings, columns=['method', 'Number of Samples', 'Fit Time'])
sns.factorplot(x='Number of Samples',
               y='Fit Time',
               hue='method',
               data=df,
               aspect=1.5)