Example #1
def main():
    img_dir = 'images/'
    images = [img_dir + f for f in os.listdir(img_dir)]
    labels = [f.split('/')[-1].split('_')[0] for f in images]
    label2ids = {v: i for i, v in enumerate(sorted(set(labels),
                                                   key=labels.index))}
    y = np.array([label2ids[l] for l in labels])

    data = []
    for image_file in images:
        img = img_to_matrix(image_file)
        img = flatten_image(img)
        data.append(img)
    data = np.array(data)

    # training samples
    is_train = np.random.uniform(0, 1, len(data)) <= 0.7
    train_X, train_y = data[is_train], y[is_train]

    # training a classifier
    pca = RandomizedPCA(n_components=5)
    train_X = pca.fit_transform(train_X)
    multi_svm = OneVsRestClassifier(LinearSVC())
    multi_svm.fit(train_X, train_y)

    # evaluating the model
    test_X, test_y = data[is_train == False], y[is_train == False]
    test_X = pca.transform(test_X)
    print pd.crosstab(test_y, multi_svm.predict(test_X),
                      rownames=['Actual'], colnames=['Predicted'])
Example #2
def generation_analysis(G, attribute, plot=True):
    """ 
    Analyzes an attribute, e.g. health status, by generation.
    
    PARAMETERS
    -------------
    G = networkx object
    attribute = case attribute for analysis, e.g. health status or sex
    plot = if True (default), also produce a bar chart of the attribute by
           generation. The cross table is always printed.
    
    RETURNS
    --------------
    (fig, ax, table) if plot is True, otherwise just the crosstab table
    
    """
    
    gen_df = pd.DataFrame(G.node).T
    
    print '{} by generation'.format(attribute)
    table = pd.crosstab(gen_df.generation, gen_df[attribute], margins=True)
    print table, '\n'
    
    if plot == True:
        fig, ax = plt.subplots()
        ax.set_aspect('auto')
        pd.crosstab(gen_df.generation, gen_df[attribute]).plot(kind='bar', ax=ax, alpha=.5)
        ax.set_xlabel('Generation')
        ax.set_ylabel('Case count')
        ax.grid(False)
        ax.legend(loc='best');
        return fig, ax, table
    else:
        return table
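A minimal usage sketch (hypothetical: G is a NetworkX graph whose node attribute dicts carry 'generation' and 'health_status' keys, which is what the function above expects):

# hypothetical call with placeholder graph and attribute name
fig, ax, table = generation_analysis(G, 'health_status', plot=True)
fig.savefig('health_status_by_generation.png')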
Example #3
    def fit_ode(self, X, y, father):
        self.father = father
        self.Z = X.shape[1] # No of features
        self.Zval = map( len, map( np.unique, X.T ) )
        self.C = map(int, list( set(y)) )
        self.py = np.array([ list(y).count(i) for i in set( y )], float ) / X.shape[0]
        self.names[self.father] = map(int,list( set( X[:,self.father] )))
        self.validity =  map( int, [list( X[:,self.father]).count(i) > self.father for i in np.unique(X[:,self.father])] ) 

        for z in range(self.Z):
            self.names[z] = map(int,list( set( X[:,z] )))
            if z == father:
                self.names[z] = map(int,list( set( X[:,z] )))
                ct = crosstab( X[:,z], y )
                ct = ct.reindex_axis( self.names[z], axis=0).fillna(0)
                ct = ct.reindex_axis( self.C, axis=1).fillna(0)
                tmp = np.asarray ( (ct + self.Lap).apply(lambda r: r/r.sum(), axis=0) )
                tmp = tmp.T
                self.pxy.append( tmp )
                self.pxyx.append( None )
            else:
                tmp_array = list() 
                for curr_y in set( y ):
                    ct = crosstab( X[y == curr_y,z], X[y == curr_y,self.father] )
                    ct = ct.reindex_axis( self.names[z], axis=0).fillna(0)
                    ct = ct.reindex_axis( self.names[self.father], axis=1).fillna(0)
                    pxx = np.asarray ( (ct + self.Lap).apply(lambda r: r/r.sum(), axis=0) )
                    tmp_array.append( pxx.T ) # Transposition for easier indexing
                self.pxyx.append( tmp_array )
                self.pxy.append( None )

        self.kind = 'One-Dependency Estimator'
Example #4
    def test_margin_ignore_dropna_bug(self):
        # GH 12577
        # pivot_table counts null into margin ('All')
        # when margins=True and dropna=True

        df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan],
                           'b': [3, 3, 4, 4, 4, 4]})
        actual = pd.crosstab(df.a, df.b, margins=True, dropna=True)
        expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]])
        expected.index = Index([1.0, 2.0, 'All'], name='a')
        expected.columns = Index([3, 4, 'All'], name='b')
        tm.assert_frame_equal(actual, expected)

        df = DataFrame({'a': [1, np.nan, np.nan, np.nan, 2, np.nan],
                        'b': [3, np.nan, 4, 4, 4, 4]})
        actual = pd.crosstab(df.a, df.b, margins=True, dropna=True)
        expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
        expected.index = Index([1.0, 2.0, 'All'], name='a')
        expected.columns = Index([3.0, 4.0, 'All'], name='b')
        tm.assert_frame_equal(actual, expected)

        df = DataFrame({'a': [1, np.nan, np.nan, np.nan, np.nan, 2],
                        'b': [3, 3, 4, 4, 4, 4]})
        actual = pd.crosstab(df.a, df.b, margins=True, dropna=True)
        expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
        expected.index = Index([1.0, 2.0, 'All'], name='a')
        expected.columns = Index([3, 4, 'All'], name='b')
        tm.assert_frame_equal(actual, expected)
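For reference, a standalone sketch of the behaviour the first case above pins down: pd.crosstab drops pairs where either key is NaN, so with dropna=True the 'All' margins should count only the non-null pairs (grand total 5 here, not 6):

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan],
                   'b': [3, 3, 4, 4, 4, 4]})
# the (NaN, 4) pair is excluded, so the 'All'/'All' cell is 5
print(pd.crosstab(df.a, df.b, margins=True, dropna=True))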
Example #5
def exploreData(df):
    """
    Data Exploration
    """
    import seaborn as sns
    print("Describe:", df.describe())
    print(df.columns)
    print("Dtypes:", df.dtypes)
    
    # Scatter plot to show all attribute relations
    pd.plotting.scatter_matrix(df, alpha=0.2, figsize=(10, 10), diagonal="kde")
    pd.plotting.scatter_matrix(df, alpha=0.2, figsize=(10, 10), diagonal="hist")
    plt.tight_layout()
    plt.show()
    
    # Using seaborn to plot pairwise relationships between attributes
    sns.set(style="ticks", color_codes=True)
    ##sns.pairplot(data=df.dropna(), hue="Loan_Status", size=2.5 )
    print()
    sns.lmplot("Credit_History", "CoapplicantIncome", data=df.dropna(), hue="Loan_Status", fit_reg=False)
    sns.lmplot("Credit_History", "LoanAmount", data=df.dropna(), hue="Loan_Status", fit_reg=False)
    sns.lmplot("Loan_Amount_Term", "LoanAmount", data=df.dropna(), hue="Loan_Status", fit_reg=False)
    
    print(pd.crosstab(df.Education, df.Self_Employed))
    edu_empl = pd.crosstab(index=df.Education, columns=df.Self_Employed, margins=True)
    print("edu_empl", edu_empl)
    
    df[['Credit_History', 'Loan_Amount_Term', 'Loan_Status']].plot.bar(stacked=True)
    print("Training data size:",len(df)) #614
    print("Training data size without NaNs:",len(df.dropna())) #480
    
    # Plotting numeric columns to understand their ranges
    df[df.dtypes[(df.dtypes=="float64")|(df.dtypes=="int64")].index.values].hist(figsize=[11,11])
    return df
Example #6
File: eve.py Project: khughitt/eve
    def predict_variants(self, classifier, test_set, features, target_classes):
        """Uses a trained Random Forest classifier to predict variants"""
        cls_predict = classifier.predict(test_set[features])

        # map integer predictions back to the original classes
        predictions = target_classes[cls_predict]
        actual = target_classes[np.array(test_set['actual'])]

        pandas.crosstab(actual, predictions,
                        rownames=['actual'], colnames=['preds'])

        # variable importance
        # http://nbviewer.ipython.org/github/rauanmaemirov/kaggle-titanic101/blob/master/Titanic101.ipynb
        feature_importance = classifier.feature_importances_
        feature_importance = 100.0 * (feature_importance / feature_importance.max())

        sorted_idx = np.argsort(feature_importance)
        pos = np.arange(sorted_idx.shape[0]) + .5

        plt.barh(pos, feature_importance[sorted_idx], align='center')
        plt.yticks(pos, test_set[features].columns[sorted_idx])
        plt.xlabel('Relative feature importance')
        plt.ylabel('Feature name')
        plt.title('EVE Random Forest Variable Importance');
        plt.savefig(os.path.join(self.output_dir,
                    'EVE_Variable_Importance.png'), bbox_inches='tight')
Example #7
 def printConfusionMatrix(self):
     if ( self.runOn == 'train_set' ):
         print('----'*self.nDashes)
         cm = pd.crosstab(self.y_train, self.y_pred, rownames=['Actual'], colnames=['Predicted'], margins=False)
         ncm = cm.div(cm.sum(axis=1), axis=0)
         print('----'*self.nDashes)
         print('Confusion matrix on train set')
         print('----'*self.nDashes)
         display(cm)
         print('----'*self.nDashes)
         print('Normalized Confusion matrix on train set')
         print('----'*self.nDashes)
         display(ncm)
         print('----'*self.nDashes)
     else:
         print('----'*self.nDashes)
         cm = pd.crosstab(self.y_test, self.y_pred_test, rownames=['Actual'], colnames=['Predicted'], margins=True)
         #cm = confusion_matrix(self.y_test, self.y_pred_test)
         ncm = cm.div(cm.sum(axis=1), axis=0)
         print('----'*self.nDashes)
         print('Confusion matrix on test set')
         print('----'*self.nDashes)
         display(cm)
         print('----'*self.nDashes)
         print('Normalized Confusion matrix on test set')
         print('----'*self.nDashes)
         display(ncm)
Example #8
File: heatmap.py Project: AravindRam/ML
def plot_heatmap(df,graphNo):

    #Cross-tabulate Category and PdDistrict
    if(graphNo == 1):
        df_crosstab = pd.crosstab(df.PdDistrict,df.Category,margins=True)
    elif(graphNo == 2):
        df_crosstab = pd.crosstab(df.Category,df.Month,margins=True)
    elif(graphNo == 3):
        df_crosstab = pd.crosstab(df.PdDistrict,df.Year,margins=True)
    elif(graphNo == 4):
        df_crosstab = pd.crosstab(df.PdDistrict,df.Month,margins=True)
    del df_crosstab['All']
    df_crosstab = df_crosstab.ix[:-1]

    column_labels = list(df_crosstab.columns.values)
    row_labels = df_crosstab.index.values.tolist()

    if(graphNo == 2 or graphNo == 4):
        month_names=[]
        for month_number in column_labels:
            month_names.append(calendar.month_abbr[month_number])
        column_labels = month_names

    fig,ax = plt.subplots()
    #Specify color map for each visualization
    if(graphNo == 1):
        heatmap = ax.pcolor(df_crosstab,cmap=plt.cm.Blues)
    elif(graphNo == 2):
        heatmap = ax.pcolor(df_crosstab,cmap=plt.cm.RdPu)
    elif(graphNo == 3):
        heatmap = ax.pcolor(df_crosstab,cmap=plt.cm.PuBuGn)
    elif(graphNo == 4):
        heatmap = ax.pcolor(df_crosstab,cmap=plt.cm.YlOrRd)

    fig = plt.gcf()
    fig.set_size_inches(15,5)

    ax.set_frame_on(False)

    ax.set_yticks(np.arange(df_crosstab.shape[0])+0.5, minor=False)
    ax.set_xticks(np.arange(df_crosstab.shape[1])+0.5, minor=False)

    ax.invert_yaxis()
    ax.xaxis.tick_top()
    ax.set_xticklabels(column_labels, minor=False)
    ax.set_yticklabels(row_labels, minor=False)

    if(graphNo == 1):
        plt.xticks(rotation=90)

    ax.grid(False)

    ax = plt.gca()
    for t in ax.xaxis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False
    for t in ax.yaxis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False
    plt.show()
Example #9
def chi_comparision (data_chi, field1, field2):

    ct = pandas.crosstab(data_chi[field2], data_chi[field1])
    chi2 = pandas.DataFrame(numpy.random.randn(ct.axes[1].max()+1, 
                                               ct.axes[1].max()+1))
    pvalue = pandas.DataFrame(numpy.random.randn(ct.axes[1].max()+1, 
                                                 ct.axes[1].max()+1))
    pvalue.ix[:] = numpy.nan
    chi2.ix[:]   = numpy.nan    
    num_comp     = 0
    
    # Loop over the pairs of levels; chi2 and the p-value are only calculated
    # if the pair has not already been compared
    
    for ax1 in ct.axes[1]:
        for ax2 in ct.axes[1]:
            if ax1 == ax2:
                continue
            ax1 = ax1.astype(numpy.int64)
            ax2 = ax2.astype(numpy.int64)        
            if not(numpy.isnan(chi2[ax2][ax1])):
                continue
            recode_chi = {ax1:ax1, ax2:ax2}
            versus = ax1.astype('str') + 'v' + ax2.astype('str')
            data_chi[versus] = data_chi[field1].map(recode_chi)
            ct2 = pandas.crosstab(data_chi[field2], data_chi[versus])
            cs2 = scipy.stats.chi2_contingency(ct2)
            chi2[ax1][ax2] = cs2[0]
            pvalue[ax1][ax2] = cs2[1]
            num_comp += 1
            
    threshold_bonferroni = 0.05 / num_comp
    rejected_h0 = pvalue < threshold_bonferroni
    
    return (chi2, pvalue, rejected_h0)
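A hedged usage sketch (the DataFrame and column names are invented; it assumes integer-coded categories and a pandas version old enough to still provide .ix, which the function relies on):

import numpy
import pandas

demo = pandas.DataFrame({
    'group':   [0, 0, 1, 1, 2, 2] * 50,
    'outcome': [0, 1, 0, 1, 1, 1] * 50,
})
chi2_mat, pvalue_mat, rejected = chi_comparision(demo, 'group', 'outcome')
print(pvalue_mat)   # pairwise p-values; rejected applies the Bonferroni threshold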
Example #10
def report(test,predictions):
   print pd.crosstab(test['Sentiment'], predictions, rownames=['Actual'], colnames=['Predicted'], margins=True)
   a=accuracy_score(test['Sentiment'],predictions)
   p=precision_score(test['Sentiment'],predictions, pos_label = "pos")
   r=recall_score(test['Sentiment'].values,predictions, pos_label = "pos")
   f=f1_score(test['Sentiment'].values,predictions, pos_label = "pos")
   print "Accuracy = ",a,"\nPrecision =",p,"\nRecall = ",r,"\nF-Score = ",f 
Example #11
def knnSimulate(param):
    trainSet = SimData.simulate2Group(
        n = int(param['n']),
        p = int(param['p']),
        effect = [param['effect']] * int(param['p'])
    )
    knnFit = KNeighborsClassifier(n_neighbors=int(param['k']))
    knnFit.fit(np.array(trainSet['x']), np.array(trainSet['y']))
    testSet = SimData.simulate2Group(
        n = int(param['n']),
        p = int(param['p']),
        effect = [param['effect']] * int(param['p'])
    )
    out = OrderedDict()
    out['p'] = int(param['p'])
    out['k'] = int(param['k'])
    out['train'] = trainSet
    out['test'] = testSet
    out['resubPreds'] = knnFit.predict(trainSet['x'])
    out['resubProbs'] = knnFit.predict_proba(trainSet['x'])
    out['testPreds'] = knnFit.predict(testSet['x'])
    out['testProbs'] = knnFit.predict_proba(testSet['x'])
    out['resubTable'] = pd.crosstab(
        Series(out['resubPreds'], index=trainSet['y'].index),
        trainSet['y']
    )
    out['resubAccuracy'] = (np.sum(np.diag(out['resubTable'])) /
                            (1.0 * np.sum(np.sum(out['resubTable']))))
    out['testTable'] = pd.crosstab(
        Series(out['testPreds'], index=testSet['y'].index),
        testSet['y']
    )
    out['testAccuracy'] = (np.sum(np.diag(out['testTable'])) /
                           (1.0 * np.sum(np.sum(out['testTable']))))
    return out
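The accuracy figures above are simply the trace of the crosstab confusion table divided by its grand total; a tiny standalone sketch of the same idea with invented labels:

import numpy as np
import pandas as pd

y_true = pd.Series(['a', 'a', 'b', 'b', 'b'])
y_pred = pd.Series(['a', 'b', 'b', 'b', 'a'])
table = pd.crosstab(y_pred, y_true)
accuracy = np.diag(table).sum() / table.values.sum()  # 3 correct out of 5 = 0.6
print(table)
print(accuracy)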
Example #12
def cleanCrosstab(rows, cols, values, aggfunc=sum, weight=None): 
    """ 
    Performs a crosstab on the rows, cols and values specified.
    
    In the end, cells with no observations are set to zero, while cells whose
    observations aggregate to np.nan remain as missing values.
    
    Also, adds in proper row and column totals
    """
    
    if weight is None:     
        t = pd.crosstab(rows, cols, values, aggfunc=aggfunc, dropna=False)
    else:
        t = pd.crosstab(rows, cols, values*weight, aggfunc=aggfunc, dropna=False)

    count = pd.crosstab(rows, cols, dropna=False)
    
    t = t.mask(count==0, other=0)
        
    t['Total'] = t.sum(axis=1)
    t = t.append(pd.Series(t.sum(axis=0), name='Total'))
    
    return t
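A usage sketch with made-up series (it assumes a pandas version where DataFrame.append still exists, which the helper uses for the 'Total' row); the empty S/bus cell comes back as 0 rather than NaN, as the docstring describes:

import pandas as pd

district = pd.Series(['N', 'N', 'S', 'S'])
mode     = pd.Series(['car', 'bus', 'car', 'car'])
trips    = pd.Series([2.0, 1.0, 3.0, 4.0])
print(cleanCrosstab(district, mode, trips))   # S/bus has no observations -> 0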
        
Example #13
 def displaySidAndQueryName(self):
     data = self.dataManager.loadData(["Aid","Sid","QueryName"],transformFields=False)        
     
     # Specific Aid
     dataAid = data[data.Aid == "012abc55-5801-494f-a77f-a799f1d855de"]
     colors = cm.gist_ncar(np.linspace(0,1,dataAid.QueryName.nunique()))
     pd.crosstab(dataAid.Sid, dataAid.QueryName).plot.barh(stacked=True, color=colors,figsize=(20, 20))
     plt.show()
Example #14
def print_null_freq(df):
    """
    for a given DataFrame, counts how many values of each variable are null
    and prints the resulting table to stdout
    """
    df_lng = pd.melt(df)
    null_variables = df_lng.value.isnull()
    print pd.crosstab(df_lng.variable, null_variables)
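A quick usage sketch (toy frame invented here); each variable becomes a row, with counts of non-null (False) and null (True) values:

import numpy as np
import pandas as pd

df = pd.DataFrame({'age': [25, np.nan, 40],
                   'city': ['NY', 'LA', None]})
print_null_freq(df)   # both columns: 2 non-null, 1 null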
Example #15
def find_coalition(train,clf):

	X_train = train.drop(['Vote','Financial_agenda_matters','Will_vote_only_large_party','Most_Important_Issue', 'Avg_Residancy_Altitude'], axis=1).values
	y_train = train.Vote.values

	clf.fit(X_train)
	clusters = clf.predict(X_train)

	print pd.crosstab(np.array(PARTIES)[y_train.astype(int)], clusters, rownames=["Party"], colnames=["Cluster"])
Example #16
File: utilities.py Project: awm182/494
def printContingencyTable(y,ypred,labels):
    confusion_matrix = metrics.confusion_matrix(y, ypred)
    plt.matshow(confusion_matrix)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
    print pd.crosstab(y,ypred,rownames=[labels[0]],colnames=[labels[1]])
Example #17
File: analytics.py Project: rkokkelk/CCS
def create_secure_usage_graph(df):
    data = pd.crosstab(df['Age'],df['Connection Secure'])
    plot = data.plot(kind='barh',stacked=True)
    save_figure(plot, 'verifying_connection')

    data = pd.crosstab(df['Age'],df['Connection Secure (Banking)'])
    plot = data.plot(kind='barh',stacked=True)
    save_figure(plot, 'verifying_connection_banking')
    log.info('Generated Secure connection graphs')
Example #18
def get_kappa(A=None,B=None):
    import pandas as pd
    import numpy as np
    
    if A is None or B is None:
        k=5
        n=30
        A=np.array([np.random.randint(k)+1 for _ in range(n)])
        B=np.array([np.random.randint(k)+1 for _ in range(n)])
        ## Wikipedia Example 1
        A= np.append(np.zeros(25, dtype=int),np.ones(25, dtype=int))
        B= np.roll(np.append(np.zeros(30, dtype=int),np.ones(20, dtype=int)), 5)
        
#         ## Wikipedia Example 2
#         A= np.append(np.zeros(60, dtype=int),np.ones(40, dtype=int))
#         B= np.roll(np.append(np.zeros(70, dtype=int),np.ones(30, dtype=int)), 15)
#         
#         ## Wikipedia Example 3
#         A= np.append(np.zeros(60, dtype=int),np.ones(40, dtype=int))
#         B= np.roll(np.append(np.zeros(30, dtype=int),np.ones(70, dtype=int)), -5)
        
        
#         print 'A',A
#         print 'B', B
#         colnames=['0', '1', '2', '3', '4', '5']
        T=pd.crosstab(A,B,rownames='A',colnames='B').as_matrix()
    else:
        A=np.array(A)
        B=np.array(B)
        T=pd.crosstab(A,B,rownames='A',colnames='B')#.as_matrix()\
        
        vals=['0', '1', '2', '3', '4', '5']
        
        for v in vals:
            try:
                T[v]
            except :
                T[v] = pd.Series(np.zeros(len(list(T.index))), index=T.index)
        for v in vals:
            try:
                list(T.index).index(v)
            except :
                T.loc[v] = np.zeros(6) 
        T= T.sort()
        T= T.reindex_axis(sorted(T.columns), axis=1).as_matrix()
    b= T.sum(0)
    a= T.sum(1)
    p=T.diagonal().sum()/float(T.sum())
    b=b/float(b.sum())
    a=a/float(a.sum())
    e= sum(a*b)
#     e=sum((T.diagonal()/float(T.sum()))**2) ## xiaoqian's xls file
    kappa= max(1e-200,(p-e)/(1-e))
#     return np.log(kappa)
    return kappa
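For orientation, a minimal hand computation of Cohen's kappa on invented ratings, using the same observed-versus-chance agreement formula as above:

import numpy as np
import pandas as pd

A = np.array([0, 0, 0, 1, 1, 1, 1, 1])   # rater A
B = np.array([0, 0, 1, 1, 1, 1, 0, 1])   # rater B
T = pd.crosstab(A, B).to_numpy()
p_o = T.diagonal().sum() / T.sum()                          # observed agreement
p_e = ((T.sum(1) / T.sum()) * (T.sum(0) / T.sum())).sum()   # chance agreement
kappa = (p_o - p_e) / (1 - p_e)
print(kappa)   # ~0.467 for these ratings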
Example #19
 def get_probabilities(self):
     pairs = pd.DataFrame(self.pairs)
     pairs.columns = ['parent', 'child']
     
     frequency = pd.crosstab(pairs.parent, pairs.child)
     
     list_data = pd.DataFrame(self.list_data)
     list_data.columns = ['parent', 'child']
     frequency_list_data = pd.crosstab(list_data.parent, list_data.child)
     frequency = frequency.astype(np.float)
     frequency.values[:] = frequency.values / frequency.values.sum(axis=1, keepdims=True)
     return frequency
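The core of the method above is a crosstab of (parent, child) pairs row-normalised into transition probabilities; a self-contained sketch with invented pairs:

import pandas as pd

pairs = pd.DataFrame([('a', 'x'), ('a', 'y'), ('a', 'y'), ('b', 'x')],
                     columns=['parent', 'child'])
freq = pd.crosstab(pairs.parent, pairs.child).astype(float)
freq.values[:] = freq.values / freq.values.sum(axis=1, keepdims=True)
print(freq)   # each row sums to 1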
Example #20
 def printReport(self, ROCcurve=False):
     print "\nModel Report"
     print "Confusion Matrix:"
     print pd.crosstab(self.data_train[self.target], self.train_predictions)
     print 'Note: rows - actual; col - predicted'
     # print "\nClassification Report:"
     # print metrics.classification_report(y_true=self.data_train[self.target], y_pred=self.train_predictions)
     print "Accuracy : %s" % "{0:.3%}".format(self.classification_output['Accuracy'])
     print "AUC : %s" % "{0:.3%}".format(self.classification_output['AUC'])
     print "CV Score : Mean - %s | Std - %s" % ("{0:.3%}".format(self.classification_output['CVScore_mean']),"{0:.3%}".format(self.classification_output['CVScore_std']))
     
     if ROCcurve:
         fpr, tpr, thresholds = metrics.roc_curve(self.data_train[self.target],self.test_pred_prob)
Example #21
def one_row_feature( lat, lon, column_tuple, feature_names ):
    # Return a 1-dimensional DataFrame or Series to be merged with other rows. 
    temp_cos = map( math.cos, column_tuple['Lat (radian)'] )
    x = 6373.0 * 1000 * abs( lon * 3.1415926 / 180 - column_tuple['Lon (radian)']) * temp_cos  # In meters
    y = 6373.0 * 1000 * abs( lat * 3.1415926 / 180 - column_tuple['Lat (radian)'])
    
    temp = pd.DataFrame( {'x': x, 'y': y} )
    distance = temp.max( axis = 1 )
    
    temp_list_pool = []
    for feature_name in feature_names:
        # 20 meters
        #if sum( distance < 20.0 ) > 1:

        temp_data_1 = pd.crosstab( '', column_tuple.ix[ distance < 20.0, feature_name ] )
        try: 
            temp_column_1 = list( temp_data_1.columns )
            temp_column_1 = [ '20m ' + x for x in temp_column_1 ]
            temp_data_1.columns = temp_column_1
            temp_list_pool.append( temp_data_1 )
        except:
            pass
            
        
        # 200 meters
        #if sum( distance < 200.0 ) > 1:
    
        temp_data_2 = pd.crosstab( '', column_tuple.ix[ distance < 200.0, feature_name ] )
        try:
            temp_column_2 = list( temp_data_2.columns )
            temp_column_2 = [ '200m ' + x for x in temp_column_2 ]
            temp_data_2.columns = temp_column_2
            temp_list_pool.append( temp_data_2 ) 
        except:
            pass            
 
        # 2000 meters
        #if sum( distance < 2000.0 ) > 1:
          
        temp_data_3 = pd.crosstab( '', column_tuple.ix[ distance < 2000.0, feature_name ] )
        try:
            temp_column_3 = list( temp_data_3.columns )
            temp_column_3 = [ '2000m ' + x for x in temp_column_3 ]
            temp_data_3.columns = temp_column_3
            temp_list_pool.append( temp_data_3 )
        except:
            pass
    
    temp_data = pd.concat( temp_list_pool, axis = 1 )   
        
    return temp_data
Example #22
    def draw(self):
        """ Draw a heat map. """

        def get_crosstab(data, row_fact,col_fact, row_names, col_names):
            ct = pd.crosstab(data[row_fact], data[col_fact])
            ct = ct.reindex_axis(row_names, axis=0).fillna(0)
            ct = ct.reindex_axis(col_names, axis=1).fillna(0)
            return ct

        def plot(data, color):
            ct = get_crosstab(
                    data,
                    self._groupby[0],
                    self._groupby[1],
                    self._levels[0],
                    self._levels[1])

            sns.heatmap(ct,
                robust=True,
                annot=True,
                cbar=False,
                cmap=cmap,
                fmt="g",
                vmax=vmax,
                #ax=plt.gca(),
                linewidths=1)

        if len(self._groupby) < 2:
            # create a dummy cross tab with one dimension containing empty
            # values:
            data_column = self._table[self._groupby[0]].reset_index(drop=True)
            tab = pd.crosstab(
                pd.Series([""] * len(data_column), name=""),
                data_column)
            plot_facet = lambda data, color: sns.heatmap(
                tab,
                robust=True,
                annot=True,
                cbar=False,
                cmap=cmap,
                fmt="g",
                linewidths=1)
        else:
            plot_facet = plot
            vmax = pd.crosstab(
                [self._table[x] for x in [self._row_factor, self._groupby[0]] if x != None],
                [self._table[x] for x in [self._col_factor, self._groupby[1]] if x != None]).values.max()

        cmap = ListedColormap(self.options["color_palette_values"])
        self.map_data(plot_facet)
Example #23
def plots(train,categories):
    matplotlib.style.use('ggplot')
    temp = pd.crosstab([train.Category],train.PdDistrict)
    temp.plot(kind='barh')
    temp = pd.crosstab([train.Category],train.DayOfWeek)
    temp.plot(kind='barh')
    temp = pd.crosstab([train.Category],train.time)
    temp.plot(kind='barh')
    temp = pd.crosstab([train.loc[train['Category'].isin(categories),'Category']],train.time)
    temp.plot(kind='barh')
    train.time.value_counts().plot(kind='barh')
    train.DayOfWeek.value_counts().plot(kind='barh')
    train.PdDistrict.value_counts().plot(kind='barh')
    train.Category.value_counts().plot(kind='barh')
    matplotlib.pyplot.show()
Example #24
def Cramer(var1, var2):
	"""
	Compute Cramer's V statistic for two Pandas series

	Parameters:
	----------
	var1, var2: Pandas series

	Returns:
	--------
	v : float
		The Cramer's V statistic of two categorical-variable series

	Status:
	-------	
	Cramer's V Implementation
	Author: Jesse Lund, [email protected]
	Date: 9/12/2015

	##Round 1##
	Comments: Thomas Roderick, [email protected]
	Date: 9/13/2015

	"""

	table = crosstab(var1,var2) #For Pandas: must have an index, can't just feed in two lists. This could be a sticking point. Might be better to do a check or roll our own crosstab implementation
	l,w = table.shape #save on a (small) function call here--reads in both outputs 
	df = min(l-1, w-1)
	colsum, rowsum = table.sum(0), table.sum(1) 
	n = float(table.values.sum())
	expectmat = outer(rowsum,colsum)/n
	outmat = outer(table.sum(0),table.sum(1))/n #this works if same size
	return  sqrt((((table - expectmat)**2)/(expectmat*n*df)).sum().sum())
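A quick usage sketch (toy series invented here; it assumes crosstab, outer and sqrt are already imported from pandas/numpy, as the function requires); two perfectly associated categorical series should give a Cramer's V of 1.0 and two independent ones a value near 0:

import pandas as pd

colour = pd.Series(['red', 'red', 'blue', 'blue', 'green', 'green'])
group  = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'])
print(Cramer(colour, group))                                      # 1.0
print(Cramer(colour, pd.Series(['a', 'b', 'a', 'b', 'a', 'b'])))  # 0.0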
Example #25
def fischer_bar_chart(bin_vec, response_vec, ax=None, filename=None):
    fig, ax = init_ax(ax)
    t = pd.crosstab(bin_vec, response_vec)
    t.plot(kind='bar', ax=ax)
    if filename is not None:
        fig.savefig(filename)
    return fig     
Example #26
def show_tabulation(request):
    """Send Response to Ajax Request for selected tabulated data"""
    required_fields =  ['gender','handedness','uploaded_file_base_encoded']
    for field_name in required_fields:
        try:
            request.POST[field_name]
        except:
            raise Exception(field_name+' is required')
    
    uploaded_file_name = request.POST.get('uploaded_file_base_encoded',None)
    uploaded_file_name_base_decoded = base64.b64decode(uploaded_file_name)
    gender = [request.POST['gender']]
    handedness = [request.POST['handedness']]
    
    if uploaded_file_name_base_decoded and  os.path.isfile(uploaded_file_name_base_decoded):
        df =  pd.read_csv(uploaded_file_name_base_decoded)
        
        gender_groups       =  df.groupby('Gender').groups.keys()
        handedness_groups   =  df.groupby('Handedness').groups.keys()
    
        if gender and gender[0]!='all':
            gender_groups = gender
        if handedness and handedness[0]!='all':
            handedness_groups = handedness
        
        df_filtered = df[df.Gender.isin(gender_groups) & df.Handedness.isin(handedness_groups)]
        df_calc     = pd.crosstab(df_filtered.Gender , df_filtered.Handedness, rownames=['Gender'],  colnames=['Handedness'], margins=True)
        
        response_data = {}
        response_data['tab_data'] = str(df_calc.to_html())
        response_data['json_data'] = str(df_calc.to_json())
        return HttpResponse(json.dumps(response_data),  content_type="application/json")
    else:
        raise Exception('File not found or missing.')
Example #27
	def __init__(self,feature_data,features,N,K):

		"""
		feature_data: contains observations indexed by a 'key' column, rest of columns are features
		features: the list of features one wants to use in estimation
		N: number of observations (rows indexed by 'key')
		K: number of latent types to estimate
		"""

		self.N = N
		self.K = K
		self.features = [f for feature in features for f in feature]
		self.F = len(self.features)
		data_index = [i for j,f in enumerate(features) for i in [j]*len(f)]

		# counts stored as lists because different features have different number of categories
		
		self.feature_counts = []
		self.M_f = np.empty(self.F,dtype=np.int)
		self.observations = np.empty((self.N,self.F),dtype=np.int)

		for i,f in enumerate(self.features):

			self.feature_counts.append(pd.crosstab(feature_data[data_index[i]]['key'],feature_data[data_index[i]][f]).values)
			self.M_f[i] = self.feature_counts[i].shape[1]
			self.observations[:,i] = self.feature_counts[i].sum(axis=1)
			
		# seed parameters

		self.rho = np.full(self.K,1/self.K) # equal probability of all types

		self.mu = []
		for M_f in self.M_f:
			self.mu.append(np.random.dirichlet(M_f*[1],self.K)) # uniform probability
Example #28
def convert_to_matrix(player_df):
	print "Writing out player matrix"
	table_df = pd.crosstab(player_df['Club.Country'], player_df['Country'], dropna=False)
	cols = table_df.columns
	rows = list(set(table_df.index))

	row_df = pd.DataFrame(np.zeros((len([col for col in cols if col not in rows]),table_df.shape[1]), dtype=int),columns=table_df.columns, index=[col for col in cols if col not in rows])

	table_df = table_df.append(row_df)

	col_df = pd.DataFrame(np.zeros((table_df.shape[0],len([row for row in rows if row not in cols])), dtype=int),columns=[row for row in rows if row not in cols], index=table_df.index)

	table_df = pd.concat([table_df, col_df], axis=1)

	file_name = 'euro_matrix.json'

	all_countries = list(table_df.index)

	all_countries.sort()

	table_df = table_df.ix[all_countries][all_countries]
	lines = [",".join([str(ele) for ele in list(row[1])]) for row in table_df.iterrows()]

	with open (file_name, 'w') as file_handle:
		file_handle.write('[\n[')
		file_handle.write("],\n[".join(lines))
		file_handle.write("]\n]")
	return table_df
Example #29
    def get_mobem(self):
        """Return a dataframe compatible with ANOVA analysis

        The returned object can be read
        by :class:`~gdsctools.readers.GenomicFeatures`.

        """
        # Select gene that appear at least a minimum number of times
        #agg = self.unified.groupby("GENE")["GENE"].count()
        #self.selection = agg[agg>=minimum_gene]

        # keep only gene in the selection
        #df = self.unified.query("GENE in @self.selection.index")
        df = self.unified
        this = pd.crosstab(df['GENE'], columns=[
            df["COSMIC_ID"], df['TISSUE_TYPE'], df["SAMPLE"]])
        this = this.T
        this = this.reset_index()

        if "TISSUE_TYPE" in this.columns:
            this.rename(columns={"TISSUE_TYPE":"TISSUE_FACTOR"}, inplace=True)
        else:
            print("Expected TISSUE_TYPE column. Not found.")

        return this
Example #30
def validate(x, gt_y, clf, subset):
    predicted_y = clf.predict(x)
    prob_predicted_y = clf.predict_proba(x)
    print('%s confusion matrix:' % subset)
    print(pd.crosstab(gt_y, predicted_y, rownames=['Actual'], colnames=['Predicted']))

    return predicted_y, prob_predicted_y
Example #31
#converting gender to numerical variable
gender = {'male':1, 'female':2}
trainingData['gender'] = trainingData['gender'].map(gender)

trainingData = trainingData.fillna(0)

trainingData['gender']=trainingData['gender'].astype(np.int8);

#generating correlation matrix
corrmatrix=trainingData[trainingData.columns[1:]].corr()
f,ax=plt.subplots(figsize=(12,9))
sns.heatmap(corrmatrix, vmax=1, cbar=True, annot=True, square=True);
plt.show()

#checking gender data relationship with churn
gender_crosstab=pd.crosstab(trainingData['gender'],trainingData['is_churn'])
gender_crosstab.plot(kind='bar', stacked=True, grid=True)

#checking age data relationship with churn
age_crosstab=pd.crosstab(trainingData['bd'],trainingData['is_churn'])
age_crosstab.plot(kind='bar', stacked=True, grid=True)

#checking city data relationship with churn
city_crosstab=pd.crosstab(trainingData['city'],trainingData['is_churn'])
city_crosstab.plot(kind='bar', stacked=True, grid=True)

#checking registered_via data relationship with churn
registered_via_crosstab=pd.crosstab(trainingData['registered_via'],trainingData['is_churn'])
registered_via_crosstab.plot(kind='bar', stacked=True, grid=True)

trainingData.to_csv('MembersProc_file.csv', sep=',')
Example #32
def print_demographics(df, idx=None):
    # create a dictionary which maps each variable to a data type
    all_vars = OrderedDict(
        (('N', 'N'), ('age', 'median'), ('gender', 'gender'),
         ('bmi', 'continuous'), ('ethnicity', 'race'), ('elixhauser_hospital',
                                                        'median'),
         ('qsofa', 'median'), ('sirs', 'median'), ('sofa', 'median'),
         ('mlods', 'median'), ('lactate_max', 'continuous'),
         ('vent', 'binary'), ('icu_los', 'median'), ('hosp_los', 'median'),
         ('thirtyday_expire_flag', 'binary'), ('hospital_expire_flag',
                                               'binary')))

    if idx is None:
        # print demographics for entire dataset
        for i, curr_var in enumerate(all_vars):
            if all_vars[curr_var] == 'N':  # print number of patients
                print('{:20s}\t{:4g}'.format(curr_var, df.shape[0]))
            elif curr_var in df.columns:
                if all_vars[curr_var] == 'continuous':  # report mean +- STD
                    print('{:20s}\t{:2.1f} +- {:2.1f}'.format(
                        curr_var, df[curr_var].mean(), df[curr_var].std()))
                elif all_vars[curr_var] == 'gender':  # convert from M/F
                    print('{:20s}\t{:4g} ({:2.1f}%)'.format(
                        curr_var, np.sum(df[curr_var].values == 'M'), 100.0 *
                        np.sum(df[curr_var].values == 'M').astype(float) /
                        df.shape[0]))
                # binary, report percentage
                elif all_vars[curr_var] == 'binary':
                    print('{:20s}\t{:4g} ({:2.1f}%)'.format(
                        curr_var, df[curr_var].sum(),
                        100.0 * (df[curr_var].mean()).astype(float)))
                # report median [25th percentile, 75th percentile]
                elif all_vars[curr_var] == 'median':
                    print('{:20s}\t{:2.1f} [{:2.1f}, {:2.1f}]'.format(
                        curr_var, df[curr_var].median(),
                        np.percentile(df[curr_var].values,
                                      25,
                                      interpolation='midpoint'),
                        np.percentile(df[curr_var].values,
                                      75,
                                      interpolation='midpoint')))
                elif all_vars[curr_var] == 'measured':
                    print('{:20s}\t{:2.1f}%'.format(
                        curr_var, 100.0 * np.mean(df[curr_var].isnull())))
                elif all_vars[curr_var] == 'race':
                    # special case: print each race individually
                    # race_black, race_other
                    print('{:20s}\t'.format('Race'))

                    # each component
                    curr_var_tmp = 'White'
                    print('{:20s}\t{:4g} ({:2.1f}%)'.format(
                        curr_var_tmp, df['race_white'].sum(),
                        100.0 * (df['race_white'].mean()).astype(float)))
                    curr_var_tmp = 'Black'
                    print('{:20s}\t{:4g} ({:2.1f}%)'.format(
                        curr_var_tmp, df['race_black'].sum(),
                        100.0 * (df['race_black'].mean()).astype(float)))
                    curr_var_tmp = 'Hispanic'
                    print('{:20s}\t{:4g} ({:2.1f}%)'.format(
                        curr_var_tmp, df['race_hispanic'].sum(),
                        100.0 * (df['race_hispanic'].mean()).astype(float)))
                    # curr_var_tmp = 'Other'
                    # print('{:20s}\t{:4g} ({:2.1f}%)'.format(curr_var_tmp, df['race_other'].sum(),
                    # 100.0*(df['race_other'].mean()).astype(float)))

                # additional lactate measurements output with lactate_max
                if curr_var == 'lactate_max':
                    # also print measured
                    print('{:20s}\t{:4g} ({:2.1f}%)'.format(
                        curr_var.replace('_max', ' ') + 'measured',
                        np.sum(~df[curr_var].isnull()),
                        100.0 * np.mean(~df[curr_var].isnull())))
                    print('{:20s}\t{:4g} ({:2.1f}%)'.format(
                        curr_var.replace('_max', ' ') + '> 2',
                        np.sum(df[curr_var] >= 2),
                        100.0 * np.mean(df[curr_var] >= 2)))

            else:
                print('{:20s}'.format(curr_var))
    else:
        # print demographics split into two groups
        # also print p-values testing between the two groups
        for i, curr_var in enumerate(all_vars):
            if all_vars[curr_var] == 'N':  # print number of patients
                print('{:20s}\t{:4g}{:5s}\t{:4g}{:5s}\t{:5s}'.format(
                    curr_var, np.sum(~idx), '', np.sum(idx), '', ''))
            elif curr_var in df.columns:
                if all_vars[curr_var] == 'continuous':  # report mean +- STD
                    tbl = np.array(
                        [[df[~idx][curr_var].mean(), df[idx][curr_var].mean()],
                         [
                             df.loc[~idx, curr_var].std(),
                             df.loc[idx, curr_var].std()
                         ]])

                    stat, pvalue = scipy.stats.ttest_ind(df[~idx][curr_var],
                                                         df[idx][curr_var],
                                                         equal_var=False,
                                                         nan_policy='omit')

                    # print out < 0.001 if it's a very low p-value
                    if pvalue < 0.001:
                        pvalue = '< 0.001'
                    else:
                        pvalue = '{:0.3f}'.format(pvalue)

                    print(
                        '{:20s}\t{:2.1f} +- {:2.1f}\t{:2.1f} +- {:2.1f}\t{:5s}'
                        .format(curr_var, tbl[0, 0], tbl[1, 0], tbl[0, 1],
                                tbl[1, 1], pvalue))

                elif all_vars[curr_var] in ('gender',
                                            'binary'):  # convert from M/F
                    # build the contingency table
                    if all_vars[curr_var] == 'gender':
                        tbl = np.array(
                            [[
                                np.sum(df[~idx][curr_var].values == 'M'),
                                np.sum(df[idx][curr_var].values == 'M')
                            ],
                             [
                                 np.sum(df[~idx][curr_var].values != 'M'),
                                 np.sum(df[idx][curr_var].values != 'M')
                             ]])
                    else:
                        tbl = np.array(
                            [[
                                np.sum(df[~idx][curr_var].values),
                                np.sum(df[idx][curr_var].values)
                            ],
                             [
                                 np.sum(1 - df[~idx][curr_var].values),
                                 np.sum(1 - df[idx][curr_var].values)
                             ]])

                    # get the p-value
                    chi2, pvalue, dof, ex = scipy.stats.chi2_contingency(tbl)

                    # print out < 0.001 if it's a very low p-value
                    if pvalue < 0.001:
                        pvalue = '< 0.001'
                    else:
                        pvalue = '{:0.3f}'.format(pvalue)

                    # binary, report percentage
                    print('{:20s}\t{:4g} ({:2.1f}%)\t{:4g} ({:2.1f}%)\t{:5s}'.
                          format(
                              curr_var, tbl[0, 0],
                              100.0 * tbl[0, 0].astype(float) /
                              (tbl[0, 0] + tbl[1, 0]), tbl[0, 1],
                              100.0 * tbl[0, 1].astype(float) /
                              (tbl[0, 1] + tbl[1, 1]), pvalue))

                elif all_vars[curr_var] == 'median':
                    stat, pvalue = scipy.stats.mannwhitneyu(
                        df[~idx][curr_var],
                        df[idx][curr_var],
                        use_continuity=True,
                        alternative='two-sided')

                    # print out < 0.001 if it's a very low p-value
                    if pvalue < 0.001:
                        pvalue = '< 0.001'
                    else:
                        pvalue = '{:0.3f}'.format(pvalue)

                    print(
                        '{:20s}\t{:2.1f} [{:2.1f}, {:2.1f}]\t{:2.1f} [{:2.1f}, {:2.1f}]\t{:5s}'
                        .format(
                            curr_var, df[~idx][curr_var].median(),
                            np.percentile(df[~idx][curr_var].values,
                                          25,
                                          interpolation='midpoint'),
                            np.percentile(df[~idx][curr_var].values,
                                          75,
                                          interpolation='midpoint'),
                            df[idx][curr_var].median(),
                            np.percentile(df[idx][curr_var].values,
                                          25,
                                          interpolation='midpoint'),
                            np.percentile(df[idx][curr_var].values,
                                          75,
                                          interpolation='midpoint'), pvalue))

                elif all_vars[curr_var] == 'measured':
                    # build the contingency table
                    tbl = np.array([[
                        np.sum(df[~idx][curr_var].isnull()),
                        np.sum(df[idx][curr_var].isnull())
                    ],
                                    [
                                        np.sum(~df[~idx][curr_var].isnull()),
                                        np.sum(~df[idx][curr_var].isnull())
                                    ]])

                    # get the p-value
                    chi2, pvalue, dof, ex = scipy.stats.chi2_contingency(tbl)

                    # print out < 0.001 if it's a very low p-value
                    if pvalue < 0.001:
                        pvalue = '< 0.001'
                    else:
                        pvalue = '{:0.3f}'.format(pvalue)

                    print('{:20s}\t{:4g} ({:2.1f}%)\t{:4g} ({:2.1f}%)\t{:5s}'.
                          format(
                              curr_var, np.sum(~df[~idx][curr_var].isnull()),
                              100.0 * np.mean(~df[~idx][curr_var].isnull()),
                              np.sum(~df[idx][curr_var].isnull()),
                              100.0 * np.mean(~df[idx][curr_var].isnull()),
                              pvalue))

                elif all_vars[curr_var] == 'race':
                    # special case: evaluate each race in chi2
                    # race_black, race_other

                    # create a contingency table with three rows

                    # use crosstab
                    df['race'] = 'other'
                    df.loc[df['race_black'] == 1, 'race'] = 'black'
                    df.loc[df['race_white'] == 1, 'race'] = 'white'
                    df.loc[df['race_hispanic'] == 1, 'race'] = 'hispanic'
                    tbl = pd.crosstab(df.race, idx, margins=True)

                    curr_var_vec = tbl.index.values[0:-1]
                    # Extract table without totals
                    tbl = tbl.ix[0:-1, 0:-1]

                    # get the p-value
                    chi2, pvalue, dof, ex = scipy.stats.chi2_contingency(
                        tbl, correction=False)

                    # print out < 0.001 if it's a very low p-value
                    if pvalue < 0.001:
                        pvalue = '< 0.001'
                    else:
                        pvalue = '{:0.3f}'.format(pvalue)

                    # first print out we are comparing races (with p-value)
                    print('{:20s}\t{:10s}\t{:10s}\t{:5s}'.format(
                        curr_var, '', '', pvalue))

                    # next print out individual race #s (no p-value)
                    for r in curr_var_vec:
                        print(
                            '{:20s}\t{:4g} ({:2.1f}%)\t{:4g} ({:2.1f}%)\t{:5s}'
                            .format(
                                '  ' + r, tbl.loc[r, False],
                                100.0 * tbl.loc[r, False].astype(float) /
                                np.sum(tbl.loc[:, False]), tbl.loc[r, True],
                                100.0 * tbl.loc[r, True].astype(float) /
                                np.sum(tbl.loc[:, True]),
                                ''))  # no individual p-value

                # additional lactate measurements output with lactate_max
                if curr_var == 'lactate_max':
                    # for lactate, we print two additional rows:
                    # 1) was lactate ever measured?
                    # 2) was lactate ever > 2 ?

                    # measured...
                    # build the contingency table
                    tbl = np.array([[
                        np.sum(df[~idx][curr_var].isnull()),
                        np.sum(df[idx][curr_var].isnull())
                    ],
                                    [
                                        np.sum(~df[~idx][curr_var].isnull()),
                                        np.sum(~df[idx][curr_var].isnull())
                                    ]])

                    # get the p-value
                    chi2, pvalue, dof, ex = scipy.stats.chi2_contingency(tbl)

                    # print out < 0.001 if it's a very low p-value
                    if pvalue < 0.001:
                        pvalue = '< 0.001'
                    else:
                        pvalue = '{:0.3f}'.format(pvalue)

                    print('{:20s}\t{:4g} ({:2.1f}%)\t{:4g} ({:2.1f}%)\t{:5s}'.
                          format(
                              curr_var.replace('_max', ' ') + 'measured',
                              np.sum(~df[~idx][curr_var].isnull()),
                              100.0 * np.mean(~df[~idx][curr_var].isnull()),
                              np.sum(~df[idx][curr_var].isnull()),
                              100.0 * np.mean(~df[idx][curr_var].isnull()),
                              pvalue))

                    # value > 2...
                    # build the contingency table
                    tbl = np.array([[
                        np.sum(df[~idx][curr_var] >= 2),
                        np.sum(df[idx][curr_var] >= 2)
                    ],
                                    [
                                        np.sum(~(df[~idx][curr_var] >= 2)),
                                        np.sum(~(df[idx][curr_var] >= 2))
                                    ]])

                    # get the p-value
                    chi2, pvalue, dof, ex = scipy.stats.chi2_contingency(tbl)

                    # print out < 0.001 if it's a very low p-value
                    if pvalue < 0.001:
                        pvalue = '< 0.001'
                    else:
                        pvalue = '{:0.3f}'.format(pvalue)

                    print('{:20s}\t{:4g} ({:2.1f}%)\t{:4g} ({:2.1f}%)\t{:5s}'.
                          format(
                              curr_var.replace('_max', ' ') + '> 2',
                              np.sum(df[~idx][curr_var] >= 2),
                              100.0 * np.mean(df[~idx][curr_var] >= 2),
                              np.sum(df[idx][curr_var] >= 2),
                              100.0 * np.mean(df[idx][curr_var] >= 2), pvalue))

            else:
                print('{:20s}'.format(curr_var))
Example #33
grid.map(plt.hist, 'Age', alpha=.5, bins=20)
grid.add_legend()

# In[ ]:

grid = sns.FacetGrid(all_data, row='Embarked', size=2.2, aspect=1.6)
grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', palette='deep')
grid.add_legend()

# **Creating new features**

# In[ ]:

all_data['Title'] = all_data.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

pd.crosstab(all_data['Title'], all_data['Sex'])

# In[ ]:

# Merging all columns with similar values and grouping rare values as "Other"
all_data['Title'] = all_data['Title'].replace([
    'Capt', 'Col', 'Countess', 'Don', 'Dona', 'Dr', 'Jonkheer', 'Lady',
    'Major', 'Rev', 'Sir'
], 'Other')

all_data['Title'] = all_data['Title'].replace(['Ms'], 'Miss')
all_data['Title'] = all_data['Title'].replace(['Mlle'], 'Miss')
all_data['Title'] = all_data['Title'].replace(['Mme'], 'Mrs')

all_data[['Title', 'Survived']].groupby('Title', as_index=False).mean()

Example #34
df=pd.read_csv('HR_sep.csv')
df.head()


# In[4]:


df.groupby('left').mean()


# In[5]:


pd.crosstab(df.salary,df.left).plot(kind='bar')


# In[6]:


pd.crosstab(df.Department,df.left).plot(kind='bar')


# In[7]:


from sklearn.linear_model import LogisticRegression


# In[8]:
Example #35
bank = pd.read_csv(
    "C:/Users/HP/Desktop/ABubakar Files/abu_Data_Science/Assignments/Logisitc Regression/bank-full.csv",
    delimiter=";",
    header=0)
bank.tail(300)
# Droping first column
#claimants.drop(["CASENUM"],inplace=True,axis = 1)

#cat_cols = ["ATTORNEY","CLMSEX","SEATBELT","CLMINSUR"]
#cont_cols = ["CLMAGE","LOSS"]

# Getting the barplot for the categorical columns

sb.countplot(x="y", data=bank, palette="hls")
pd.crosstab(bank.y, bank.job).plot(kind="bar")
bank.columns
sb.countplot(x="education", data=bank, palette="hls")
pd.crosstab(bank.y, bank.education).plot(kind="bar")
sb.countplot(x="marital", data=bank, palette="hls")
pd.crosstab(bank.y, bank.marital).plot(kind="bar")

sb.countplot(x="housing", data=bank, palette="hls")
pd.crosstab(bank.y, bank.housing).plot(kind="bar")
pd.crosstab(bank.month, bank.y).plot(kind='bar', stacked=True)
plt.title('Purchase frequency by month')
plt.xlabel('month')
plt.ylabel('frequency of purchase')
pd.crosstab(bank.loan, bank.y).plot(kind='bar', stacked=True)
plt.title('Purchase frequency by loan status')
plt.xlabel('loan')
Example #36
def do_response_scoring(df, xlab, ylab, Top_n, prt):
    df_ins = df  # data frame
    #columns = ['producer_cd','veh_make_name','veh_mdl_name'] # grouping of
    columns = xlab  #=['veh_make_name'] # grouping of
    columns_conc = ("|".join(columns))
    #columns_conc

    df_ins[f'{columns_conc}'] = df_ins[columns].astype(str).astype(str).apply(
        '|'.join, axis=1)
    #grid_view(df_ins.head())

    i = cat_col = columns_conc
    y2 = ylab  #="target"
    Top_n = Top_n  #=1500000000000000000000000000000000000000000000000
    ytag = ytag = 1
    col_count = df_ins[f'{i}'].value_counts()
    #print(col_count)
    col_count = col_count[:Top_n, ]

    col_count1 = df_ins[f'{i}'].value_counts(normalize=True) * 100
    col_count1 = col_count1[:Top_n, ]
    vol_inperc = col_count1.sum()
    vol_inperc = round(vol_inperc, 2)

    tmp = pd.crosstab(df_ins[f'{i}'], df_ins[f'{y2}'], normalize='index') * 100
    #tmp.head(5)

    tmp = pd.merge(col_count, tmp, left_index=True, right_index=True)
    #tmp.head(5)

    tmp = pd.DataFrame(tmp)
    #tmp.columns

    tmp.rename(columns={'0': 'NotRenwed%', '1': 'Renewed%'}, inplace=True)

    if 'NotRenwed%' not in tmp.columns:
        print("NotRenwed% is not present in ", i)
        tmp['NotRenwed%'] = 0
    if 'Renewed%' not in tmp.columns:
        print("Renewed% is not present in ", i)
        tmp['Renewed%'] = 0

    tmp1 = pd.crosstab(df[f'{i}'], df[f'{y2}'])
    tmp1.rename(columns={'0': 'NR_count', '1': 'R_count'}, inplace=True)
    if 'NR_count' not in tmp1.columns:
        print("NR_count is not present in ", i)
        tmp1['NR_count'] = 0
    if 'R_count' not in tmp1.columns:
        print("R_count is not present in ", i)
        tmp1['R_count'] = 0

    tmpz = pd.merge(tmp, tmp1, left_index=True, right_index=True)
    tmpz['Tot'] = tmpz['NR_count'] + tmpz['R_count']
    tmpz['Renewed%'] = round(tmpz['Renewed%'], 2)
    tmpz['Mean'] = tmpz['Renewed%'].mean()
    tmpz['Nperformer'] = np.where(tmpz['Renewed%'] < tmpz['Mean'], 1, 0)
    tmpz['score'] = round(tmpz['R_count'] / tmpz['Tot'], 2)
    #tmpz.sort_index(inplace=True)

    # Statistic calculation ------
    score_mean = tmpz['score'].mean()
    score_std = tmpz['score'].std()
    th_min = tmpz['score'].min()
    th_nsd = round(score_mean - score_std, 2)
    th_mean = round(score_mean, 2)
    th_psd = round(score_mean + score_std, 2)
    th_max = tmpz['score'].max()

    tmpz = pd.merge(tmp, tmp1, left_index=True, right_index=True)
    tmpz['Tot'] = tmpz['NR_count'] + tmpz['R_count']
    tmpz['Renewed%'] = round(tmpz['Renewed%'], 2)
    tmpz['Mean'] = tmpz['Renewed%'].mean()
    tmpz['Nperformer'] = np.where(tmpz['Renewed%'] < tmpz['Mean'], 1, 0)
    tmpz['score'] = round(tmpz['R_count'] / tmpz['Tot'], 2)
    tmpz = tmpz.reset_index()
    tmpz.columns.values[tmpz.columns.get_loc(f'{columns_conc}')] = 'count'
    tmpz.rename(columns={
        'index': f'{columns_conc}',
        'score': f'{columns_conc}_score'
    },
                inplace=True)

    if len(xlab) > 1:
        tmpz = tmpz.join(
            pd.DataFrame(
                tmpz[f'{tmpz.columns.values[0]}'].str.split('|').tolist()))
        namu = tmpz.columns.values[0]
        namu = namu.split('|')
        cl = dict(enumerate(namu))
        tmpz.rename(columns=cl, inplace=True)

    select_reqcolumn = []
    select_reqcolumn.append(xlab)
    columns = xlab  #=['veh_make_name'] # grouping of
    columns_conc = ("|".join(columns))
    score_col = [f'{columns_conc}_score']
    select_reqcolumn.append(score_col)
    select_reqcolumn = do_flatlist(select_reqcolumn)
    get_scoredf = tmpz[select_reqcolumn]
    tmpz = get_scoredf

    if prt == 'Y':
        tmpz.to_csv(f'{columns_conc}.csv', index=False)
    return tmpz
Example #37
File: ldsc.py Project: zhanyq/mtag
def ldscore(args, log):
    '''
    Wrapper function for estimating l1, l1^2, l2 and l4 (+ optionally standard errors) from
    reference panel genotypes.

    Annot format is
    chr snp bp cm <annotations>

    '''

    if args.bfile:
        snp_file, snp_obj = args.bfile + '.bim', ps.PlinkBIMFile
        ind_file, ind_obj = args.bfile + '.fam', ps.PlinkFAMFile
        array_file, array_obj = args.bfile + '.bed', ld.PlinkBEDFile

    # read bim/snp
    array_snps = snp_obj(snp_file)
    m = len(array_snps.IDList)
    log.log('Read list of {m} SNPs from {f}'.format(m=m, f=snp_file))
    if args.annot is not None:  # read --annot
        try:
            if args.thin_annot:  # annot file has only annotations
                annot = ps.ThinAnnotFile(args.annot)
                n_annot, ma = len(annot.df.columns), len(annot.df)
                log.log("Read {A} annotations for {M} SNPs from {f}".format(
                    f=args.annot, A=n_annot, M=ma))
                annot_matrix = annot.df.values
                annot_colnames = annot.df.columns
                keep_snps = None
            else:
                annot = ps.AnnotFile(args.annot)
                n_annot, ma = len(annot.df.columns) - 4, len(annot.df)
                log.log("Read {A} annotations for {M} SNPs from {f}".format(
                    f=args.annot, A=n_annot, M=ma))
                annot_matrix = np.array(annot.df.iloc[:, 4:])
                annot_colnames = annot.df.columns[4:]
                keep_snps = None
                if np.any(annot.df.SNP.values != array_snps.df.SNP.values):
                    raise ValueError('The .annot file must contain the same SNPs in the same'+\
                        ' order as the .bim file.')
        except Exception:
            log.log('Error parsing .annot file')
            raise

    elif args.extract is not None:  # --extract
        keep_snps = __filter__(args.extract, 'SNPs', 'include', array_snps)
        annot_matrix, annot_colnames, n_annot = None, None, 1

    elif args.cts_bin is not None and args.cts_breaks is not None:  # --cts-bin
        cts_fnames = sumstats._splitp(args.cts_bin)  # read filenames
        args.cts_breaks = args.cts_breaks.replace(
            'N', '-')  # replace N with negative sign
        try:  # split on x
            breaks = [[float(x) for x in y.split(',')]
                      for y in args.cts_breaks.split('x')]
        except ValueError as e:
            raise ValueError(
                '--cts-breaks must be a comma-separated list of numbers: ' +
                str(e.args))

        if len(breaks) != len(cts_fnames):
            raise ValueError(
                'Need to specify one set of breaks for each file in --cts-bin.'
            )

        if args.cts_names:
            cts_colnames = [str(x) for x in args.cts_names.split(',')]
            if len(cts_colnames) != len(cts_fnames):
                msg = 'Must specify either no --cts-names or one value for each file in --cts-bin.'
                raise ValueError(msg)

        else:
            cts_colnames = ['ANNOT' + str(i) for i in xrange(len(cts_fnames))]

        log.log('Reading numbers with which to bin SNPs from {F}'.format(
            F=args.cts_bin))

        cts_levs = []
        full_labs = []
        for i, fh in enumerate(cts_fnames):
            vec = ps.read_cts(cts_fnames[i], array_snps.df.SNP.values)

            max_cts = np.max(vec)
            min_cts = np.min(vec)
            cut_breaks = list(breaks[i])
            name_breaks = list(cut_breaks)
            if np.all(cut_breaks >= max_cts) or np.all(cut_breaks <= min_cts):
                raise ValueError(
                    'All breaks lie outside the range of the cts variable.')

            if np.all(cut_breaks <= max_cts):
                name_breaks.append(max_cts)
                cut_breaks.append(max_cts + 1)

            if np.all(cut_breaks >= min_cts):
                name_breaks.append(min_cts)
                cut_breaks.append(min_cts - 1)

            name_breaks.sort()
            cut_breaks.sort()
            n_breaks = len(cut_breaks)
            # so that col names are consistent across chromosomes with different max vals
            name_breaks[0] = 'min'
            name_breaks[-1] = 'max'
            name_breaks = [str(x) for x in name_breaks]
            labs = [
                name_breaks[i] + '_' + name_breaks[i + 1]
                for i in xrange(n_breaks - 1)
            ]
            cut_vec = pd.Series(pd.cut(vec, bins=cut_breaks, labels=labs))
            cts_levs.append(cut_vec)
            full_labs.append(labs)

        annot_matrix = pd.concat(cts_levs, axis=1)
        annot_matrix.columns = cts_colnames
        # crosstab -- for now we keep empty columns
        annot_matrix = pd.crosstab(
            annot_matrix.index,
            [annot_matrix[i] for i in annot_matrix.columns],
            dropna=False,
            colnames=annot_matrix.columns)

        # add missing columns
        if len(cts_colnames) > 1:
            for x in product(*full_labs):
                if x not in annot_matrix.columns:
                    annot_matrix[x] = 0
        else:
            for x in full_labs[0]:
                if x not in annot_matrix.columns:
                    annot_matrix[x] = 0

        annot_matrix = annot_matrix[sorted(annot_matrix.columns,
                                           key=annot_sort_key)]
        if len(cts_colnames) > 1:
            # flatten multi-index
            annot_colnames = [
                '_'.join([cts_colnames[i] + '_' + b for i, b in enumerate(c)])
                for c in annot_matrix.columns
            ]
        else:
            annot_colnames = [
                cts_colnames[0] + '_' + b for b in annot_matrix.columns
            ]

        annot_matrix = np.matrix(annot_matrix)
        keep_snps = None
        n_annot = len(annot_colnames)
        if np.any(np.sum(annot_matrix, axis=1) == 0):
            # This exception should never be raised. For debugging only.
            raise ValueError(
                'Some SNPs have no annotation in --cts-bin. This is a bug!')

    else:
        annot_matrix, annot_colnames, keep_snps = None, None, None,
        n_annot = 1

    # read fam
    array_indivs = ind_obj(ind_file)
    n = len(array_indivs.IDList)
    log.log('Read list of {n} individuals from {f}'.format(n=n, f=ind_file))
    # read keep_indivs
    if args.keep:
        keep_indivs = __filter__(args.keep, 'individuals', 'include',
                                 array_indivs)
    else:
        keep_indivs = None

    # read genotype array
    log.log('Reading genotypes from {fname}'.format(fname=array_file))
    geno_array = array_obj(array_file,
                           n,
                           array_snps,
                           keep_snps=keep_snps,
                           keep_indivs=keep_indivs,
                           mafMin=args.maf)

    # filter annot_matrix down to only SNPs passing MAF cutoffs
    if annot_matrix is not None:
        annot_keep = geno_array.kept_snps
        annot_matrix = annot_matrix[annot_keep, :]

    # determine block widths
    x = np.array((args.ld_wind_snps, args.ld_wind_kb, args.ld_wind_cm),
                 dtype=bool)
    if np.sum(x) != 1:
        raise ValueError('Must specify exactly one --ld-wind option')

    if args.ld_wind_snps:
        max_dist = args.ld_wind_snps
        coords = np.array(xrange(geno_array.m))
    elif args.ld_wind_kb:
        max_dist = args.ld_wind_kb * 1000
        coords = np.array(array_snps.df['BP'])[geno_array.kept_snps]
    elif args.ld_wind_cm:
        max_dist = args.ld_wind_cm
        coords = np.array(array_snps.df['CM'])[geno_array.kept_snps]

    block_left = ld.getBlockLefts(coords, max_dist)
    if block_left[len(block_left) - 1] == 0 and not args.yes_really:
        error_msg = 'Do you really want to compute whole-chromosome LD Score? If so, set the '
        error_msg += '--yes-really flag (warning: it will use a lot of time / memory)'
        raise ValueError(error_msg)

    scale_suffix = ''
    if args.pq_exp is not None:
        log.log('Computing LD with pq ^ {S}.'.format(S=args.pq_exp))
        msg = 'Note that LD Scores with pq raised to a nonzero power are '
        msg += 'not directly comparable to normal LD Scores.'
        log.log(msg)
        scale_suffix = '_S{S}'.format(S=args.pq_exp)
        pq = np.matrix(geno_array.maf * (1 - geno_array.maf)).reshape(
            (geno_array.m, 1))
        pq = np.power(pq, args.pq_exp)

        if annot_matrix is not None:
            annot_matrix = np.multiply(annot_matrix, pq)
        else:
            annot_matrix = pq

    log.log("Estimating LD Score.")
    lN = geno_array.ldScoreVarBlocks(block_left,
                                     args.chunk_size,
                                     annot=annot_matrix)
    col_prefix = "L2"
    file_suffix = "l2"

    if n_annot == 1:
        ldscore_colnames = [col_prefix + scale_suffix]
    else:
        ldscore_colnames = [
            y + col_prefix + scale_suffix for y in annot_colnames
        ]

    # print .ldscore. Output columns: CHR, BP, RS, [LD Scores]
    out_fname = args.out + '.' + file_suffix + '.ldscore'
    new_colnames = geno_array.colnames + ldscore_colnames
    df = pd.DataFrame.from_records(np.c_[geno_array.df, lN])
    df.columns = new_colnames
    if args.print_snps:
        if args.print_snps.endswith('gz'):
            print_snps = pd.read_csv(args.print_snps,
                                     header=None,
                                     compression='gzip')
        elif args.print_snps.endswith('bz2'):
            print_snps = pd.read_csv(args.print_snps,
                                     header=None,
                                     compression='bz2')
        else:
            print_snps = pd.read_csv(args.print_snps, header=None)
        if len(print_snps.columns) > 1:
            raise ValueError(
                '--print-snps must refer to a file with one column of SNP IDs.'
            )
        log.log('Reading list of {N} SNPs for which to print LD Scores from {F}'.format(\
                        F=args.print_snps, N=len(print_snps)))

        print_snps.columns = ['SNP']
        df = df.ix[df.SNP.isin(print_snps.SNP), :]
        if len(df) == 0:
            raise ValueError(
                'After merging with --print-snps, no SNPs remain.')
        else:
            msg = 'After merging with --print-snps, LD Scores for {N} SNPs will be printed.'
            log.log(msg.format(N=len(df)))

    l2_suffix = '.gz'
    log.log("Writing LD Scores for {N} SNPs to {f}.gz".format(f=out_fname,
                                                              N=len(df)))
    df.drop(['CM', 'MAF'], axis=1).to_csv(out_fname,
                                          sep="\t",
                                          header=True,
                                          index=False,
                                          float_format='%.3f')
    call(['gzip', '-f', out_fname])
    if annot_matrix is not None:
        M = np.atleast_1d(np.squeeze(np.asarray(np.sum(annot_matrix, axis=0))))
        ii = geno_array.maf > 0.05
        M_5_50 = np.atleast_1d(
            np.squeeze(np.asarray(np.sum(annot_matrix[ii, :], axis=0))))
    else:
        M = [geno_array.m]
        M_5_50 = [np.sum(geno_array.maf > 0.05)]

    # print .M
    fout_M = open(args.out + '.' + file_suffix + '.M', 'wb')
    print >> fout_M, '\t'.join(map(str, M))
    fout_M.close()

    # print .M_5_50
    fout_M_5_50 = open(args.out + '.' + file_suffix + '.M_5_50', 'wb')
    print >> fout_M_5_50, '\t'.join(map(str, M_5_50))
    fout_M_5_50.close()

    # print annot matrix
    if (args.cts_bin is not None) and not args.no_print_annot:
        out_fname_annot = args.out + '.annot'
        new_colnames = geno_array.colnames + ldscore_colnames
        annot_df = pd.DataFrame(np.c_[geno_array.df, annot_matrix])
        annot_df.columns = new_colnames
        del annot_df['MAF']
        log.log("Writing annot matrix produced by --cts-bin to {F}".format(
            F=out_fname + '.gz'))
        annot_df.to_csv(out_fname_annot, sep="\t", header=True, index=False)
        call(['gzip', '-f', out_fname_annot])

    # print LD Score summary
    pd.set_option('display.max_rows', 200)
    log.log('\nSummary of LD Scores in {F}'.format(F=out_fname + l2_suffix))
    t = df.ix[:, 4:].describe()
    log.log(t.ix[1:, :])

    np.seterr(divide='ignore',
              invalid='ignore')  # print NaN instead of weird errors
    # print correlation matrix including all LD Scores and sample MAF
    log.log('')
    log.log('MAF/LD Score Correlation Matrix')
    log.log(df.ix[:, 4:].corr())

    # print condition number
    if n_annot > 1:  # condition number of a column vector w/ nonzero var is trivially one
        log.log('\nLD Score Matrix Condition Number')
        cond_num = np.linalg.cond(df.ix[:, 5:])
        log.log(reg.remove_brackets(str(np.matrix(cond_num))))
        if cond_num > 10000:
            log.log('WARNING: ill-conditioned LD Score Matrix!')

    # summarize annot matrix if there is one
    if annot_matrix is not None:
        # covariance matrix
        x = pd.DataFrame(annot_matrix, columns=annot_colnames)
        log.log('\nAnnotation Correlation Matrix')
        log.log(x.corr())

        # column sums
        log.log('\nAnnotation Matrix Column Sums')
        log.log(_remove_dtype(x.sum(axis=0)))

        # row sums
        log.log('\nSummary of Annotation Matrix Row Sums')
        row_sums = x.sum(axis=1).describe()
        log.log(_remove_dtype(row_sums))

    np.seterr(divide='raise', invalid='raise')
print(np.unique(data['occupation']))

data=pd.read_csv('income.csv',na_values=[" ?"]) #to read ' ?' as null

#data pre-processing
data.isnull().sum() #to get how many ' ?' are there in JobType and occupation

missing=data[data.isnull().any(axis=1)] #any missing value in a col (axis=1 !!!)
print(missing)

data2=data.dropna(axis=0) #dropping all rows w/ missing values as we don't know relationship b/w features

correlation=data2.corr() #relationship b/w independent variables
data2.columns #gives col names

gender=pd.crosstab(index=data2['gender'],columns='count', normalize=True)
print(gender)

gender_salstat=pd.crosstab(index=data2['gender'],columns=data2['SalStat'], normalize='index', margins=True) #SalStat distribution within each gender (row-normalized)
print(gender_salstat)

SalStat=sns.countplot(data2['SalStat']) #bar chart

sns.distplot(data2['age'], bins=10, kde=False, color='red') #histogram

sns.boxplot('SalStat','age',data=data2) #to get relationship b/w salstat and age
data2.groupby('SalStat')['age'].median() #to get median age for salstat categories


sns.countplot(y='JobType',hue='SalStat',data=data2) #group bar plot
def coocurance_matrix(col1, col2):
	co_mat = pd.crosstab(col1, col2)
	return co_mat
Example #40
0
    import pandas as pd
    import itertools


    emotions = ['angry', 'disgusted', 'fearful', 'happy', 'sad', 'surprise']
    # lookup = {0: 'fearful', 1: 'angry', 2: 'disgusted', 3: 'neutral', 4: 'surprised', 5: 'happy'}  # , 6:'happy'}
    lookup = {0: "angry", 1: "disgusted", 2: 'fearful',3: "happy", 4: 'sad', 5: "surprise"}  # , 6:"reassured"}
    # lookup = {0: 'Angry', 1:'Disgust', 2:'Fear', 3:'Happy', 4:'Neutral', 5:'Sad', 6:'Surprise'}
    # lookup = {0: 'Angry', 1: 'Disgust', 2: 'happy', 3: 'neutral', 4: 'surprised', 5: 'Sad', 6: 'fearful'}
    y_true = pd.Series([lookup[_] for _ in labelsValues])  # np.random.random_integers(0, 5, size=100)])
    y_pred = pd.Series([lookup[_] for _ in PredictedValues])  # np.random.random_integers(0, 5, size=100)])

    '''print('positive: ' + str(overallCounter))
    print('total: ' + str(numIm))
    print('accuracy: ' + str(acc))'''
    print(pd.crosstab(y_true, y_pred, rownames=['True'],
                      colnames=['Predicted']).apply(lambda r: 100.0 * r / r.sum()))

    import matplotlib.pyplot as plt

    conf = confusion_matrix(y_true, y_pred)

    #######################################################################################
    lookup = {0: "angry", 1: "disgusted", 2: "fearful", 3: "happy", 4:  "sad",
              5: "surprise"}  # , 6:"reassured"}


    # lookup = {0: 'Angry', 1:'Disgust', 2:'Fear', 3:'Happy', 4:'Neutral', 5:'Sad', 6:'Surprise'}
    # lookup = {0: 'Angry', 1: 'Disgust', 2: 'happy', 3: 'neutral', 4: 'surprised', 5: 'Sad', 6: 'fearful'}
    y_true = pd.Series([lookup[_] for _ in labelsValues])  # np.random.random_integers(0, 5, size=100)])
    y_pred = pd.Series([lookup[_] for _ in PredictedValues])  # np.random.random_integers(0, 5, size=100)])
Example #41
0
final_vars = df2.index[0:10]
final_pred = pred_train[final_vars]

classifier_FvNF = classifier.fit(final_pred, resp_train.status_group)

print('Final predictor set estimated OOB accuracy (FUNC vs NONFUNC):',
      classifier_FvNF.oob_score_)

# Use the final fitted model to predict waterpoint operation status in the
# validation set using only the chosen predictors.
predictions_FvNF = classifier_FvNF.predict(pred_val[final_vars])

# Show a confusion matrix and accuracy score.
ct = pd.crosstab(resp_val.status_group,
                 predictions_FvNF,
                 rownames=['True'],
                 colnames=['Predicted'],
                 margins=True)
print(ct)
print('Estimated OOB accuracy from training (FUNC vs NONFUNC): ',
      classifier_FvNF.oob_score_)
print('Validation set accuracy (FUNC vs NONFUNC): ',
      skm.accuracy_score(resp_val.status_group, predictions_FvNF))

# Make a nice looking chart of the confusion matrix
colLabels = list(ct.columns.values)
rowLabels = list(ct.index)
#nrows, ncols = len(ct) + 1, len(colLabels) + 1
#hcell, wcell = 0.5, 1.
#hpad, wpad = 0, 0
combine = [train, test]
print( combine[0]  )


#extract a title for each Name in the train and test datasets
for dataset in combine:
    dataset['Title'] = dataset['Name'].str.extract(r', ([A-Za-z]+)\.', expand=False)



print( "\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n"  )
print( train  )
print()


print( pd.crosstab(train['Title'], train['Sex'] )    )





# replace various titles with more common names
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(
       ['Lady', 'Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Jonkheer', 'Dona'],
        'Rare')

    dataset['Title'] = dataset['Title'].replace(['Countess', 'Sir'], 'Royal')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
import pandas as pd

massa_dados = 'https://github.com/cleber-impacta/atividade03_python/blob/main/massa%20de%20dados/04-2020.csv?raw=true'

lista = pd.read_csv(massa_dados, sep=';')
lista
"""

Shows the first 5 rows
"""

lista.head()
"""Número de linhas e coluna"""

lista.shape

lista['despacho'].value_counts().plot.bar()

lista['despacho'].unique()

lista['despacho'].value_counts()

lista['despacho'].value_counts(normalize=True)

lista.describe()

pd.crosstab(lista['especie'], lista['sexo'])
####### The Binomial Distribution ########
"""
The binomial distribution is a discrete probability distribution that models the outcomes of a given number of
random trials of some experiment or event. The binomial is defined by two parameters: the probability of success
in any given trial and the number of trials. The binomial distribution tells you how likely it is to achieve a given
number of successes in n trials of the experiment. For example, we could model flipping a fair coin 10 times with a
binomial distribution where the number of trials is set to 10 and the probability of success is set to 0.5.
In this case the distribution would tell us how likely it is to get zero heads, 1 head, 2 heads and so on.
"""
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt

fair_coin_flips = stats.binom.rvs(
    n=10,  # Number of flips per trial
    p=0.5,  # Success probability
    size=10000)  # Number of trials

print(pd.crosstab(index="counts", columns=fair_coin_flips))
pd.DataFrame(fair_coin_flips).hist(range=(-0.5, 10.5), bins=11)
plt.show()
"""
col_0   0   1    2     3     4     5     6     7    8   9   10
row_0
counts   7  91  450  1212  1998  2435  2099  1182  422  93  11
Note that since the binomial distribution is discrete, it only takes on integer values
so we can summarize binomial data with a frequency table and its distribution with a histogram.
The histogram shows us that a binomial distribution with a 50% probability of success is roughly symmetric,
with the most likely outcomes lying at the center. This is reminiscent of the normal distribution,
but if we alter the success probability, the distribution won't be symmetric:
"""

biased_coin_flips = stats.binom.rvs(
    n=10,  # Number of flips per trial
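# A minimal sketch of the biased-coin case described above; the success
# probability of 0.8 used here is an assumption, not a value taken from the
# original example.
biased_flips_sketch = stats.binom.rvs(
    n=10,        # Number of flips per trial
    p=0.8,       # Success probability (assumed)
    size=10000)  # Number of trials

# Summarize with a frequency table and a histogram, as for the fair coin above
print(pd.Series(biased_flips_sketch).value_counts().sort_index())
pd.DataFrame(biased_flips_sketch).hist(range=(-0.5, 10.5), bins=11)
plt.show()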
# Model 2
# Gaussian Naive Bayes
sgnb = GaussianNB()
sgnb.fit(X_train_count_array,y_train)

## Gaussian Model Accuracy
sgnb.score(X_train_count_array,y_train) # 0.90
sgnb.score(X_test_count_array,y_test)  # 0.85

# From the above we can conclude that the Multinomial Naive Bayes model gives the best result, so we use it for future prediction.

# Prediction on Train & Test Data
pred_train = smnb.predict(X_train_count)
pred_test = smnb.predict(X_test_count)

# Confusion matrix of Train and Test
## Train
confusion_matrix_train = pd.crosstab(y_train,pred_train,rownames=['Actual'],colnames= ['Train Predictions']) 
sns.heatmap(confusion_matrix_train, annot = True, cmap = 'Blues',fmt='g')

## Test
confusion_matrix_test = pd.crosstab(y_test,pred_test,rownames=['Actual'],colnames= ['Test Predictions']) 
sns.heatmap(confusion_matrix_test, annot = True, cmap = 'Reds',fmt='g')

# Classification Report of test
print(classification_report(y_test,pred_test))


                        # ---------------------------------------------------- #
          
Example #46
0
# .crosstab(), short for cross_tabulation
import pandas as pd
import matplotlib.pyplot as plt

ri2 = pd.read_csv('/Users/apple/desktop/policeActivities/dataset/ri2.csv')
table = pd.crosstab(
    ri2['driver_race'],
    ri2['driver_gender'])  # NOTE: frequency table in form of dataframe
print(table)
# check the result of frequency table
asian_female = ri2[(ri2['driver_gender'] == 'F')
                   & (ri2['driver_race'] == 'Asian')]
print(asian_female.shape)

table = table.loc['Asian':'Hispanic']
print(table)

# create stacked bar plot
# table.plot(kind='bar', stacked=True)
# plt.show()

# district violation
# create frequency table with district and violation
all_zones = pd.crosstab(ri2['district'], ri2['violation'])
print(all_zones)
# slice the dataframe to get k1-k3:
k_zones = all_zones.loc['Zone K1':'Zone K3']
print(k_zones)
Example #47
0
affairs = affairs.drop(['affairs'], axis=1)
affairs = pd.concat([affairs, affairs_cat_cat], axis=1)

###### Creating dummy variables for the categorical data

job_dum = pd.get_dummies(affairs.gender, drop_first=True)

df_dummies = pd.get_dummies(affairs,
                            columns=['affairs', 'gender', 'children'],
                            drop_first=True)
affairs = df_dummies

# Getting the barplot for the categorical columns (df[df.columns[0:30]])

sb.countplot(x="affairs_Yes", data=affairs, palette="hls")
pd.crosstab(affairs.affairs_Yes, affairs.gender_male).plot(kind="bar")

# Checking if we have na values or not
affairs.isnull().sum()  # No null values

#Model building

import statsmodels.formula.api as sm
logit_model = sm.logit(
    'affairs_Yes~age+yearsmarried+religiousness+education+occupation+rating+gender_male+children_yes',
    data=affairs).fit()

#summary
logit_model.summary()
y_pred = logit_model.predict(affairs)
Example #48
0
import pandas as pd
import matplotlib.pyplot as plt

data_csv = '/Users/pankaj/dev/git/smu/smu_ml1/class3/moon/moon_landing.csv'
lnd_data = pd.read_csv(data_csv)
lnd_data = lnd_data[::2]
oper_outcom = pd.crosstab(lnd_data['Operator'],
                          lnd_data['Outcome'],
                          margins=True,
                          margins_name="Total")
oper_outcom.sort_values('Total', inplace=True)
oper_outcom = oper_outcom.drop('Total', axis=1)
ax = oper_outcom.plot(kind='barh', stacked=True, title='Moon Missions')
ax.set_xlabel('Number of Missions')
ax.set_ylabel('Operator')

plt.show()
Example #49
0
def call_catscore(df, cat_col, y_lab, Top_n, thres, ytag, prt):
    i = cat_col  # e.g. "Age_Of_Vehicle"
    y2 = y_lab  # e.g. "Renewed2"
    Top_n = Top_n  # e.g. 15
    ytag = ytag
    col_count = df[f'{i}'].value_counts()
    #print(col_count)
    col_count = col_count[:Top_n]

    col_count1 = df[f'{i}'].value_counts(normalize=True) * 100
    col_count1 = col_count1[:Top_n]
    vol_inperc = col_count1.sum()
    vol_inperc = round(vol_inperc, 2)

    tmp = pd.crosstab(df[f'{i}'], df[f'{y2}'], normalize='index') * 100
    tmp = pd.merge(col_count, tmp, left_index=True, right_index=True)
    tmp.rename(columns={0: 'NotRenewed%', 1: 'Renewed%'}, inplace=True)
    if 'NotRenewed%' not in tmp.columns:
        print("NotRenewed% is not present in ", i)
        tmp['NotRenewed%'] = 0
    if 'Renewed%' not in tmp.columns:
        print("Renewed% is not present in ", i)
        tmp['Renewed%'] = 0

    tmp1 = pd.crosstab(df[f'{i}'], df[f'{y2}'])
    tmp1.rename(columns={0: 'NR_count', 1: 'R_count'}, inplace=True)
    if 'NR_count' not in tmp1.columns:
        print("NR_count is not present in ", i)
        tmp1['NR_count'] = 0
    if 'R_count' not in tmp1.columns:
        print("R_count is not present in ", i)
        tmp1['R_count'] = 0

    tmpz = pd.merge(tmp, tmp1, left_index=True, right_index=True)
    tmpz['Tot'] = tmpz['NR_count'] + tmpz['R_count']
    tmpz['Renewed%'] = round(tmpz['Renewed%'], 2)
    tmpz['Mean'] = tmpz['Renewed%'].mean()
    tmpz['Nperformer'] = np.where(tmpz['Renewed%'] < tmpz['Mean'], 1, 0)
    tmpz['score'] = round(tmpz['R_count'] / tmpz['Tot'], 2)
    #tmpz.sort_index(inplace=True)

    # Statistic calculation ------
    score_mean = tmpz['score'].mean()
    score_std = tmpz['score'].std()
    th_min = tmpz['score'].min()
    th_nsd = round(score_mean - score_std, 2)
    th_mean = round(score_mean, 2)
    th_psd = round(score_mean + score_std, 2)
    th_max = tmpz['score'].max()

    def producer_clfy(tmpz):
        if (tmpz['score'] >= th_min and tmpz['score'] < th_nsd):
            return 'c4'
        if (tmpz['score'] >= th_nsd and tmpz['score'] < th_mean):
            return 'c3'
        elif (tmpz['score'] >= th_mean and tmpz['score'] < th_psd):
            return 'c2'
        elif (tmpz['score'] >= th_psd and tmpz['score'] <= 1):
            return 'c1'
        elif (tmpz['score'] > 1):
            return np.nan

    tmpz[f'{i}_class'] = tmpz.apply(producer_clfy, axis=1)
    tmpz.reset_index(inplace=True)
    tmpz.rename(columns={f'{i}': 'count'}, inplace=True)
    tmpz.rename(columns={'index': f'{i}'}, inplace=True)

    #tmpz = tmpz[[f'{i}',f'{i}_class','score']]

    #tmpz.drop('index',axis=1,inplace=True)

    #-----------------------------
    if prt == 'Y':
        print(tmpz)
        tmpzi = tmpz.reset_index()
        #tmpzii = tmpzi .join(pd.DataFrame(tmpzi.index.str.split('-').tolist()))
        #tmpzi = pd.concat([tmpzi,DataFrame(tmpzi.index.tolist())], axis=1, join='outer')
        tmpzi.to_excel("tmpz.xlsx")
    return (tmpz)
import pandas as pd
import matplotlib.pyplot as plt

bolsas_no_exterior = pd.read_csv('bolsa-exterior.csv', sep=';')
table = pd.crosstab(index=bolsas_no_exterior['Ano Fase'], columns='Count')
anos = table.index

total_de_bolsas_por_ano = []
for x in range(2006, 2016):
    total_de_bolsas_por_ano.append(bolsas_no_exterior[
        bolsas_no_exterior['Ano Fase'] == x]['Qtd Bolsas'].sum())

df_data = {'anos': anos, 'bolsas': total_de_bolsas_por_ano}
df = pd.DataFrame(df_data)
print(df)

plt.plot(anos,
         total_de_bolsas_por_ano,
         linestyle='--',
         color='r',
         marker='s',
         linewidth=3.0)
plt.title('Bolsas por Ano ofertadas no exterior')
plt.xlabel('Anos')
plt.ylabel('Quantidade de Bolsas')
plt.show()
fitting(X, y)
fpr, tpr, thresholds = metrics.roc_curve(y_test, probs, pos_label=1)

a = np.linspace(0, 1, 100)
plt.plot(fpr, tpr)
plt.plot(a, a)
plt.xlabel("False Positive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity, Recall)")
plt.title("ROC plot of Interest Rate")
plt.savefig('metrics_roc_interest')
plt.show()
'''Part 2'''
grad_school = pd.read_csv('data/grad.csv')
print(pd.crosstab(grad_school['admit'], grad_school['rank']))
grad_school['rank'].plot(kind='bar')
df = grad_school.set_index('rank')
x = []
for i in range(1, 5):
    x.append(sum(df.loc[i]['admit']) / float(len(df.loc[i]['admit'])))

plt.bar(range(1, 5), x)
plt.xlabel('Rank')
plt.ylabel('Percent of Admitted')
plt.savefig('percent_rank_bar')
plt.show()

plt.hist(grad_school['gpa'])
plt.savefig('hist_gpa')
plt.show()
Example #52
0
            n_summary = pd.merge(summary,
                                 d1,
                                 left_index=True,
                                 right_index=True)
            n_summary = pd.merge(n_summary,
                                 d2,
                                 left_index=True,
                                 right_index=True)

            n_summary.rename(columns={"mean": "AverageProbability"},
                             inplace=True)
            n_summary.to_csv(save_path + data_version + '_' + match_status +
                             '_bypatient_summary_' + weighted_status + '.csv')

        prescription_summary = pd.crosstab(index=summary.Prescribe,
                                           columns=summary.Match,
                                           margins=True,
                                           margins_name='Total')
        prescription_summary.columns = ['No Match', 'Match', 'Total']
        prescription_summary = prescription_summary.drop('Total', axis=0)
        prescription_summary.sort_values('Total',
                                         ascending=False,
                                         inplace=True)
        prescription_summary.to_csv(save_path + data_version + '_' +
                                    match_status + '_bytreatment_summary_' +
                                    weighted_status + '.csv')

        # ===================================================================================
        # Prescription Effectiveness
        # We will show the difference in the percent of the population that survives.
        # Prescription Effectiveness compares the outcome with the algorithm's suggestion versus what happened in reality
        # ===================================================================================
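        # A small, self-contained sketch of the idea; the column names and
        # numbers below are hypothetical and not taken from the original code.
        # Prescription Effectiveness here is the survival rate implied by the
        # algorithm's prescriptions minus the survival rate observed in reality.
        demo = pd.DataFrame({
            'died_observed': [0, 1, 0, 0, 1],                           # what actually happened
            'died_pred_under_prescription': [0.1, 0.4, 0.2, 0.1, 0.3],  # model estimate under its prescription
        })
        observed_survival = 1.0 - demo['died_observed'].mean()
        prescribed_survival = 1.0 - demo['died_pred_under_prescription'].mean()
        prescription_effectiveness = prescribed_survival - observed_survival
        print('Prescription Effectiveness: {:.1%}'.format(prescription_effectiveness))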
ax = sns.barplot(x=df1["weathercondition"], y=df1["temp"])
ax.set_xticklabels(["clear", "mist", "light snow"])

# It is clearly visible that season and temperature have a linear trend: temperatures increase with each season before dropping down in the winter season.

# feature selection

# In[436]:

from scipy.stats import chi2_contingency
from scipy.stats import f_oneway

# In[179]:

chi2, p, dof, ex = chi2_contingency(
    pd.crosstab(df1["holiday"], df1["workingday"]))
print(p)

# In[437]:

f_oneway(df1.totalcount, df1.weekday)

# The p-value is less than 0.05, which means the two variables are dependent on each other; hence we will remove one of the two predictors from the model.

# In[92]:

df1 = df1.drop(["date"], axis=1)

# In[ ]:

# The date column is not required for building the model, so we will remove it.
Example #54
0
#
# * There are just over 40K customer records and 20 features for each customer.
# * The features are mixed--some numeric, some categorical.
# * The data appears to be sorted, at least by `time` and `contact`.
#

# ### Exploration
# Let's explore the data. First, let's understand how the features are distributed.

# In[6]:

# Frequency tables for each categorical feature
for column in data.select_dtypes(include=['object']).columns:
    display(
        pd.crosstab(index=data[column],
                    columns='% observations',
                    normalize='columns') * 100)

# In[7]:

# Histograms for each numeric feature
display(data.describe())
display(data.describe(include=['object']))
#%matplotlib inline
hist = data.hist(bins=30, sharey=True, figsize=(10, 10))

# Notice that:
#
# - Almost 90% of the values for our target variable y are "no", so most customers did not subscribe to a term deposit.
# - Many of the predictive features take on values of "unknown". Some are more common than others. We should think carefully about what causes a value of "unknown" (are these customers non-representative in some way?) and how to handle it; a small sketch quantifying these values follows this list.
# - Even if "unknown" is included as its own distinct category, what does it mean, given that those observations likely fall within one of the other categories of that feature?
Example #55
0
    def significance_test(self, field1:str, field2:str, method:str="spearman",
                          verbose=True) -> pd.Series:
        """
        Execute a statistical test as follows
        - Both fields are categorical => chi-square test
        - Both fields are continuous => correlation
        - Otherwise => one-way ANOVA on ranks

        :param field1: field to compare
        :param field2: field to compare
        :param method: "spearman" (default) or "pearson"
        :param verbose: if warnings are shown
        :return: Series with index: field1, field2, test, statistic, pval
        """

        cats = self.get_cats()
        cons = self.get_cons()

        if field1 in cats and field2 in cats:
            #### chi2-test
            test = "chi-square test"
            contingency_table = pd.crosstab(self.data[field1], self.data[field2])

            if verbose and (contingency_table < 5).sum().sum() > 0:
                print("The contingency table (%s vs %s) contains cells with fewer than 5 observations." % (field1, field2))
                print("Consult the documentation of stats.chi2_contingency")

            statistic, pval, dof, exp = stats.chi2_contingency(contingency_table)

        elif field1 in cons and field2 in cons:
            #### correlation
            if method == "spearman":
                test = "Spearman correlation"
                cor = stats.spearmanr
            else:
                test = "Peason correlation"
                cor = stats.pearsonr

            statistic, pval = cor(self.data[field1], self.data[field2])

        else:
            #### one-way ANOVA on ranks
            test = "one-way ANOVA on ranks"
            if field1 in cats and field2 in cons:
                cat, con = field1, field2
            elif field1 in cons and field2 in cats:
                cat, con = field2, field1
            else:
                raise ValueError("You gave a wrong field.")

            vals = self.data[cat].unique()

            samples = [self.data.loc[self.data[cat] == v, con] for v in vals]

            if verbose and any([len(s) < 5 for s in samples]):
                print("The groups withe less than 5 samples will be ignored.")
                samples = [x for x in samples if len(x) >= 5]

            statistic, pval = stats.kruskal(*samples)

        s = pd.Series([field1, field2, test, statistic, pval],
                      index=["field1", "field2", "test", "statistic", "pval"])
        return s
Example #56
0
def main():
    """Verify age and gender balance along the groups from the MIRIAD dataset."""
    # ----------------------------------------------------------------------------------------
    dataset_name = 'MIRIAD'

    participants_path = PROJECT_ROOT / 'data' / dataset_name / 'participants.tsv'
    freesurfer_path = PROJECT_ROOT / 'data' / dataset_name / 'freesurferData.csv'

    outputs_dir = PROJECT_ROOT / 'outputs'
    ids_path = outputs_dir / (dataset_name + '_cleaned_ids.csv')

    dataset_df = load_dataset(participants_path, ids_path, freesurfer_path)
    dataset_df = dataset_df[dataset_df['Diagn'].isin([1, 17])]
    dataset_df = dataset_df.reset_index(drop=True)
    dataset_df = dataset_df.set_index('participant_id')

    # ----------------------------------------------------------------------------------------
    print('Analysing {:}'.format(dataset_name))
    print('Total of participants = {:}'.format(len(dataset_df)))
    print('')
    print('Number of participants per diagnosis')
    print(dataset_df.groupby('Diagn')['Image_ID'].count())
    print('')

    contingency_table = pd.crosstab(dataset_df.Gender, dataset_df.Diagn)
    print('Contingency table of gender x diagnosis')
    print(contingency_table)
    print('')

    def print_age_stats(dataset_df):
        hc_age = dataset_df[dataset_df['Diagn'] == 1].Age.values
        ad_age = dataset_df[dataset_df['Diagn'] == 17].Age.values

        print('Age per diagnosis')
        print('HC = {:.1f}±{:.1f} [{:d}, {:d}]'.format(hc_age.mean(),
                                                       hc_age.std(),
                                                       math.ceil(hc_age.min()),
                                                       math.ceil(
                                                           hc_age.max())))
        print('AD = {:.1f}±{:.1f} [{:d}, {:d}]'.format(ad_age.mean(),
                                                       ad_age.std(),
                                                       math.ceil(ad_age.min()),
                                                       math.ceil(
                                                           ad_age.max())))
        print('')

    print_age_stats(dataset_df)

    # ----------------------------------------------------------------------------------------
    # Gender analysis
    print('------------- GENDER ANALYSIS ----------------')

    def print_gender_analysis(contingency_table):
        _, p_value, _, _ = chi2_contingency(contingency_table[[1, 17]],
                                            correction=False)
        print('Gender - HC vs AD p value {:.4f}'.format(p_value))
        _, p_value, _, _ = chi2_contingency(contingency_table,
                                            correction=False)
        print('Gender - TOTAL p value {:.4f}'.format(p_value))
        print('')

    print_gender_analysis(contingency_table)

    # ----------------------------------------------------------------------------------------
    # Age analysis
    print('------------- AGE ANALYSIS ----------------')
    print_age_stats(dataset_df)

    def print_age_analysis(dataset_df):
        hc_age = dataset_df[dataset_df['Diagn'] == 1].Age.values
        ad_age = dataset_df[dataset_df['Diagn'] == 17].Age.values

        _, p_value = ttest_ind(hc_age, ad_age)
        print('Age - HC vs AD p value {:.4f}'.format(p_value))
        print('Age - TOTAL p value {:.4f}'.format(
            f_oneway(hc_age, ad_age).pvalue))
        print()
        print('')

    print_age_analysis(dataset_df)

    # ----------------------------------------------------------------------------------------
    # Final dataset
    print('------------- FINAL DATASET ----------------')
    print_gender_analysis(contingency_table)
    print_age_stats(dataset_df)
    print_age_analysis(dataset_df)

    dataset_df[['Image_ID']].to_csv(outputs_dir /
                                    (dataset_name + '_homogeneous_ids.csv'),
                                    index=False)
Example #57
0
lab = newdf['label']
lab1 = newdf_test['label']

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=0)
t0 = time()
clf.fit(features, lab)
tt = time() - t0
print("Classifier trained in {} seconds".format(round(tt, 3)))
t0 = time()
pred = clf.predict(features1)
tt = time() - t0
print("Predicted in {} seconds".format(round(tt, 3)))
from sklearn.metrics import accuracy_score
acc = accuracy_score(pred, lab1)
print("Accuracy is {}.".format(round(acc, 4)))
print(
    pd.crosstab(lab1,
                pred,
                rownames=['Actual attacks'],
                colnames=['Predicted attacks']))

#Features selected : ['duration', 'src_bytes', 'dst_bytes', 'logged_in', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'Protocol_type_icmp', 'Protocol_type_tcp', 'service_eco_i', 'service_http', 'service_other', 'flag_OTH', 'flag_REJ', 'flag_RSTOS0', 'flag_RSTR', 'flag_S0', 'flag_SH']
#Classifier trained in 7.215 seconds
#Predicted in 0.493 seconds
#Accuracy is 0.9861.
#Predicted attacks  Probe  non-Probe
#Actual attacks
#Probe               1835        847
#non-Probe            224      74385
Example #58
0
del user_set1
del user_set2
del user_set3
del user_set4
del user_set5
del user_set6
del user_set7
del user_set8

# del users
# del up_df
gc.collect()

print('creating subsets..')
up_df1 = pd.crosstab(up_df1.user_id, up_df1.product_pair).astype('uint32')
up_df2 = pd.crosstab(up_df2.user_id, up_df2.product_pair).astype('uint32')
up_df3 = pd.crosstab(up_df3.user_id, up_df3.product_pair).astype('uint32')
up_df4 = pd.crosstab(up_df4.user_id, up_df4.product_pair).astype('uint32')
up_df5 = pd.crosstab(up_df5.user_id, up_df5.product_pair).astype('uint32')
up_df6 = pd.crosstab(up_df6.user_id, up_df6.product_pair).astype('uint32')
up_df7 = pd.crosstab(up_df7.user_id, up_df7.product_pair).astype('uint32')
up_df8 = pd.crosstab(up_df8.user_id, up_df8.product_pair).astype('uint32')

print('merging into one crosstab')
user_prodpair = pd.concat(
    [up_df1, up_df2, up_df3, up_df4, up_df5, up_df6, up_df7, up_df8])

print('user-product pair')
#print(user_prodpair.head())
print(user_prodpair.shape)
# In[ ]:

for dataset in combine:
    dataset['Age_null'] = data_raw['Age'].isnull() * 1
age_survive = sns.barplot(x='Age_null', y='Survived', data=data_raw)
age_survive = plt.xlabel('Age is Missing')
age_survive = plt.ylabel('Survived')
plt.show()

# In[ ]:

# Complete missing age Values. we noticed that in the name, there is also title information, this can be helpful to predict the age
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
pd.crosstab(data_raw['Title'], data_raw['Sex'])

# In[ ]:

# Group different title (especially rare titles) into common groups which are more closely related to the age
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace([
        'Capt', 'Col', 'Sir', 'Don', 'Dr', 'Major', 'Rev', 'Jonkheer',
        'Countess', 'Lady', 'Dona'
    ], 'Rare')
    #     dataset['Title'] = dataset['Title'].replace(['Countess', 'Lady', 'Dona'], 'Rare_F')
    dataset['Title'] = dataset['Title'].replace(['Mlle', 'Ms'], 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['IsAlone'] = 1  #initialize to yes/1 is alone
    dataset['IsAlone'].loc[
# In[17]:

# Normalized Survival Rates for over 18
train.Survived[train.Child == 0].value_counts(normalize=True)

# ## Pivot table (pivotTab)
# A pivot table uses specified columns of the original DataFrame as the row index and column index, then applies an aggregation function (mean by default) to a specified column.
#
# ## Contingency table (crossTab)
# A cross tabulation is a special pivot table used to count group frequencies.
#
# Compute a simple cross tabulation of two (or more) factors. By default computes a frequency table of the factors unless an array of values and an aggregation function are passed.
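# Passing `values` and an `aggfunc`, as mentioned above, turns the frequency
# table into an aggregation table. A small sketch, assuming the Titanic `train`
# frame also carries its usual 'Fare' column (an assumption here): mean fare in
# each Sex x Survived cell.
pd.crosstab(train['Sex'], train['Survived'],
            values=train['Fare'], aggfunc='mean')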

# In[4]:

pd.crosstab(train['Sex'], train['Survived'], margins=True)

# In[20]:

pd.crosstab(train['Sex'], train['Survived'], margins=True, normalize='index')

# In[19]:

pd.crosstab(train['Sex'], [train['Survived'], train['Pclass']], margins=True)

# In[22]:

pd.crosstab(train['Sex'], [train['Survived'], train['Pclass']],
            normalize='index')

# In[26]: