def main():
    """Train a one-vs-rest linear SVM on PCA-reduced image vectors and print
    a confusion matrix for a random 30% hold-out.

    Assumes file names are '<label>_*' inside 'images/' and that the helpers
    img_to_matrix / flatten_image are defined elsewhere in this module.
    """
    # RandomizedPCA was removed in scikit-learn 0.20; the randomized solver
    # lives on PCA now.
    from sklearn.decomposition import PCA
    img_dir = 'images/'
    images = [img_dir + f for f in os.listdir(img_dir)]
    # label = file-name prefix before the first underscore
    labels = [f.split('/')[-1].split('_')[0] for f in images]
    # stable label ids in first-seen order
    label2ids = {v: i for i, v in enumerate(sorted(set(labels), key=labels.index))}
    y = np.array([label2ids[l] for l in labels])
    data = np.array([flatten_image(img_to_matrix(p)) for p in images])
    # random ~70/30 train/test split
    is_train = np.random.uniform(0, 1, len(data)) <= 0.7
    train_X, train_y = data[is_train], y[is_train]
    # train the classifier on 5 principal components
    pca = PCA(n_components=5, svd_solver='randomized')
    train_X = pca.fit_transform(train_X)
    multi_svm = OneVsRestClassifier(LinearSVC())
    multi_svm.fit(train_X, train_y)
    # evaluate on the hold-out
    test_X, test_y = data[~is_train], y[~is_train]
    test_X = pca.transform(test_X)
    print(pd.crosstab(test_y, multi_svm.predict(test_X),
                      rownames=['Actual'], colnames=['Predicted']))
def generation_analysis(G, attribute, plot=True):
    """Analyze a case attribute, e.g. health status, by generation.

    PARAMETERS
    -------------
    G = networkx object (nodes carry a 'generation' attribute)
    attribute = case attribute for analysis, e.g. health status or sex
    plot = produce a bar chart of attribute by generation. Default is True.

    RETURNS
    --------------
    (fig, ax, table) when plot is True, otherwise just the cross table.
    """
    # G.node was removed in networkx 2.x; G.nodes is the node->attrs mapping
    gen_df = pd.DataFrame(dict(G.nodes)).T
    print('{} by generation'.format(attribute))
    table = pd.crosstab(gen_df.generation, gen_df[attribute], margins=True)
    print(table, '\n')
    if plot:
        fig, ax = plt.subplots()
        ax.set_aspect('auto')
        pd.crosstab(gen_df.generation, gen_df[attribute]).plot(
            kind='bar', ax=ax, alpha=.5)
        ax.set_xlabel('Generation')
        ax.set_ylabel('Case count')
        ax.grid(False)
        ax.legend(loc='best')
        return fig, ax, table
    return table
def fit_ode(self, X, y, father):
    """Fit a One-Dependency Estimator: every feature depends on the class and
    on one common 'father' feature.

    Fills self.pxy (P(x|y) for the father feature, Laplace-smoothed) and
    self.pxyx (per-class P(x|father) tables for every other feature).
    """
    self.father = father
    self.Z = X.shape[1]                       # number of features
    # per-feature cardinality (py2 map() returned a list; make that explicit)
    self.Zval = [len(np.unique(col)) for col in X.T]
    self.C = [int(c) for c in set(y)]         # class labels
    # class priors
    self.py = np.array([list(y).count(i) for i in set(y)], float) / X.shape[0]
    self.names[self.father] = [int(v) for v in set(X[:, self.father])]
    # NOTE(review): compares per-value counts against the father *index* —
    # looks suspicious but preserved as-is; confirm the intended threshold.
    self.validity = [int(list(X[:, self.father]).count(i) > self.father)
                     for i in np.unique(X[:, self.father])]
    for z in range(self.Z):
        self.names[z] = [int(v) for v in set(X[:, z])]
        if z == father:  # was `is` — identity comparison on ints is fragile
            ct = crosstab(X[:, z], y)
            # reindex_axis was removed from pandas — use reindex
            ct = ct.reindex(self.names[z], axis=0).fillna(0)
            ct = ct.reindex(self.C, axis=1).fillna(0)
            # Laplace-smoothed, column-normalised P(x|y); transposed so the
            # first axis indexes the class
            tmp = np.asarray((ct + self.Lap).apply(lambda r: r / r.sum(), axis=0))
            self.pxy.append(tmp.T)
            self.pxyx.append(None)
        else:
            tmp_array = []
            for curr_y in set(y):
                ct = crosstab(X[y == curr_y, z], X[y == curr_y, self.father])
                ct = ct.reindex(self.names[z], axis=0).fillna(0)
                ct = ct.reindex(self.names[self.father], axis=1).fillna(0)
                pxx = np.asarray((ct + self.Lap).apply(lambda r: r / r.sum(), axis=0))
                tmp_array.append(pxx.T)  # transposed for easier indexing
            self.pxyx.append(tmp_array)
            self.pxy.append(None)
    self.kind = 'One-Dependency Estimator'
def test_margin_ignore_dropna_bug(self):
    # GH 12577: pivot_table used to count nulls into the margin ('All')
    # when margins=True and dropna=True.
    # (data, expected cell counts, expected index labels, expected columns)
    cases = [
        ({'a': [1, 2, 2, 2, 2, np.nan], 'b': [3, 3, 4, 4, 4, 4]},
         [[1, 0, 1], [1, 3, 4], [2, 3, 5]],
         [1.0, 2.0, 'All'], [3, 4, 'All']),
        ({'a': [1, np.nan, np.nan, np.nan, 2, np.nan],
          'b': [3, np.nan, 4, 4, 4, 4]},
         [[1, 0, 1], [0, 1, 1], [1, 1, 2]],
         [1.0, 2.0, 'All'], [3.0, 4.0, 'All']),
        ({'a': [1, np.nan, np.nan, np.nan, np.nan, 2],
          'b': [3, 3, 4, 4, 4, 4]},
         [[1, 0, 1], [0, 1, 1], [1, 1, 2]],
         [1.0, 2.0, 'All'], [3, 4, 'All']),
    ]
    for data, counts, row_labels, col_labels in cases:
        frame = pd.DataFrame(data)
        actual = pd.crosstab(frame.a, frame.b, margins=True, dropna=True)
        expected = pd.DataFrame(counts)
        expected.index = Index(row_labels, name='a')
        expected.columns = Index(col_labels, name='b')
        tm.assert_frame_equal(actual, expected)
def exploreData(df):
    """Data exploration for the loan data set: summary stats, scatter
    matrices, seaborn scatter plots, and cross tables. Returns df unchanged."""
    import seaborn as sns
    print("Describe:", df.describe())
    print(df.columns)
    print("Dtypes:", df.dtypes)
    # Scatter matrices to show all pairwise attribute relations
    pd.plotting.scatter_matrix(df, alpha=0.2, figsize=(10, 10), diagonal="kde")
    pd.plotting.scatter_matrix(df, alpha=0.2, figsize=(10, 10), diagonal="hist")
    plt.tight_layout()
    plt.show()
    # Seaborn plots to understand relations with Loan_Status
    sns.set(style="ticks", color_codes=True)
    # BUG FIX: the first call was `print sns.lmplot(...)`, which only printed
    # the FacetGrid's repr — the print was accidental and is dropped.
    sns.lmplot("Credit_History", "CoapplicantIncome", data=df.dropna(),
               hue="Loan_Status", fit_reg=False)
    sns.lmplot("Credit_History", "LoanAmount", data=df.dropna(),
               hue="Loan_Status", fit_reg=False)
    sns.lmplot("Loan_Amount_Term", "LoanAmount", data=df.dropna(),
               hue="Loan_Status", fit_reg=False)
    print(pd.crosstab(df.Education, df.Self_Employed))
    edu_empl = pd.crosstab(index=df.Education, columns=df.Self_Employed, margins=True)
    print("edu_empl", edu_empl)
    df[['Credit_History', 'Loan_Amount_Term', 'Loan_Status']].plot.bar(stacked=True)
    print("Training data size:", len(df))                       # 614
    print("Training data size without NaNs:", len(df.dropna())) # 480
    # Histogram the numeric columns to understand their ranges
    df[df.dtypes[(df.dtypes == "float64") | (df.dtypes == "int64")].index.values].hist(figsize=[11, 11])
    return df
def predict_variants(self, classifier, test_set, features, target_classes):
    """Uses a trained Random Forest classifier to predict variants, prints
    the confusion matrix, and saves a feature-importance bar chart to
    self.output_dir."""
    cls_predict = classifier.predict(test_set[features])
    # map integer predictions back to the original classes
    predictions = target_classes[cls_predict]
    actual = target_classes[np.array(test_set['actual'])]
    # BUG FIX: the confusion matrix was computed but its result discarded —
    # print it so the comparison is actually visible.
    print(pandas.crosstab(actual, predictions, rownames=['actual'], colnames=['preds']))
    # variable importance, normalised to the largest importance
    # http://nbviewer.ipython.org/github/rauanmaemirov/kaggle-titanic101/blob/master/Titanic101.ipynb
    feature_importance = classifier.feature_importances_
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, test_set[features].columns[sorted_idx])
    plt.xlabel('Relative feature importance')
    plt.ylabel('Feature name')
    plt.title('EVE Random Forest Variable Importance')
    plt.savefig(os.path.join(self.output_dir, 'EVE_Variable_Importance.png'),
                bbox_inches='tight')
def printConfusionMatrix(self):
    """Display the raw and row-normalized confusion matrices for the data set
    selected by self.runOn ('train_set', otherwise the test set)."""
    if self.runOn == 'train_set':
        print('----' * self.nDashes)
        cm = pd.crosstab(self.y_train, self.y_pred,
                         rownames=['Actual'], colnames=['Predicted'], margins=False)
        # BUG FIX: `cm / cm.sum(axis=1)` aligned the row-sum Series against
        # the COLUMN labels; divide along the rows explicitly.
        ncm = cm.div(cm.sum(axis=1), axis=0)
        print('----' * self.nDashes)
        print('Confusion matrix on train set')
        print('----' * self.nDashes)
        display(cm)
        print('----' * self.nDashes)
        print('Normalized Confusion matrix on train set')
        print('----' * self.nDashes)
        display(ncm)
        print('----' * self.nDashes)
    else:
        print('----' * self.nDashes)
        cm = pd.crosstab(self.y_test, self.y_pred_test,
                         rownames=['Actual'], colnames=['Predicted'], margins=True)
        # NOTE(review): margins=True folds the 'All' row/column into the
        # normalization below — confirm that is intended.
        ncm = cm.div(cm.sum(axis=1), axis=0)
        print('----' * self.nDashes)
        print('Confusion matrix on test set')
        print('----' * self.nDashes)
        display(cm)
        print('----' * self.nDashes)
        print('Normalized Confusion matrix on test set')
        print('----' * self.nDashes)
        display(ncm)
def plot_heatmap(df, graphNo):
    """Heat map of cross-tabulated counts; graphNo selects the pairing:
    1: PdDistrict x Category, 2: Category x Month,
    3: PdDistrict x Year,     4: PdDistrict x Month.
    """
    if graphNo == 1:
        df_crosstab = pd.crosstab(df.PdDistrict, df.Category, margins=True)
    elif graphNo == 2:
        df_crosstab = pd.crosstab(df.Category, df.Month, margins=True)
    elif graphNo == 3:
        df_crosstab = pd.crosstab(df.PdDistrict, df.Year, margins=True)
    elif graphNo == 4:
        df_crosstab = pd.crosstab(df.PdDistrict, df.Month, margins=True)
    # drop the 'All' margin column and row before plotting
    del df_crosstab['All']
    df_crosstab = df_crosstab.iloc[:-1]  # .ix was removed from pandas
    column_labels = list(df_crosstab.columns.values)
    row_labels = df_crosstab.index.values.tolist()
    # month-number columns get readable abbreviations
    if graphNo in (2, 4):
        column_labels = [calendar.month_abbr[m] for m in column_labels]
    fig, ax = plt.subplots()
    # colour map per visualization
    cmaps = {1: plt.cm.Blues, 2: plt.cm.RdPu, 3: plt.cm.PuBuGn, 4: plt.cm.YlOrRd}
    ax.pcolor(df_crosstab, cmap=cmaps[graphNo])
    fig = plt.gcf()
    fig.set_size_inches(15, 5)
    ax.set_frame_on(False)
    # centre tick labels on the cells
    ax.set_yticks(np.arange(df_crosstab.shape[0]) + 0.5, minor=False)
    ax.set_xticks(np.arange(df_crosstab.shape[1]) + 0.5, minor=False)
    ax.invert_yaxis()
    ax.xaxis.tick_top()
    ax.set_xticklabels(column_labels, minor=False)
    ax.set_yticklabels(row_labels, minor=False)
    if graphNo == 1:
        plt.xticks(rotation=90)
    ax.grid(False)
    ax = plt.gca()
    # hide the tick marks (the tick1On/tick2On attributes were removed
    # from matplotlib; use the tick-line visibility instead)
    for t in ax.xaxis.get_major_ticks():
        t.tick1line.set_visible(False)
        t.tick2line.set_visible(False)
    for t in ax.yaxis.get_major_ticks():
        t.tick1line.set_visible(False)
        t.tick2line.set_visible(False)
    plt.show()
def chi_comparision (data_chi, field1, field2): ct = pandas.crosstab(data_chi[field2], data_chi[field1]) chi2 = pandas.DataFrame(numpy.random.randn(ct.axes[1].max()+1, ct.axes[1].max()+1)) pvalue = pandas.DataFrame(numpy.random.randn(ct.axes[1].max()+1, ct.axes[1].max()+1)) pvalue.ix[:] = numpy.nan chi2.ix[:] = numpy.nan num_comp = 0 # Now the bucle, but it only calculates chi2 and pvalue if the pair has not # been already calculated for ax1 in ct.axes[1]: for ax2 in ct.axes[1]: if ax1 == ax2: continue ax1 = ax1.astype(numpy.int64) ax2 = ax2.astype(numpy.int64) if not(numpy.isnan(chi2[ax2][ax1])): continue recode_chi = {ax1:ax1, ax2:ax2} versus = ax1.astype('str') + 'v' + ax2.astype('str') data_chi[versus] = data_chi[field1].map(recode_chi) ct2 = pandas.crosstab(data_chi[field2], data_chi[versus]) cs2 = scipy.stats.chi2_contingency(ct2) chi2[ax1][ax2] = cs2[0] pvalue[ax1][ax2] = cs2[1] num_comp += 1 thresold_bonferroni = 0.05 / num_comp rejected_h0 = pvalue < thresold_bonferroni return (chi2, pvalue, rejected_h0)
def report(test, predictions):
    """Print the confusion matrix and accuracy/precision/recall/F1 for
    sentiment predictions against test['Sentiment'] (positive label 'pos')."""
    print(pd.crosstab(test['Sentiment'], predictions,
                      rownames=['Actual'], colnames=['Predicted'], margins=True))
    accuracy = accuracy_score(test['Sentiment'], predictions)
    precision = precision_score(test['Sentiment'], predictions, pos_label="pos")
    recall = recall_score(test['Sentiment'].values, predictions, pos_label="pos")
    fscore = f1_score(test['Sentiment'].values, predictions, pos_label="pos")
    print("Accuracy = ", accuracy, "\nPrecision =", precision,
          "\nRecall = ", recall, "\nF-Score = ", fscore)
def knnSimulate(param):
    """Simulate a two-group data set, fit a k-NN classifier, and report
    resubstitution and test-set confusion tables and accuracies.

    param: dict with 'n', 'p', 'k' and 'effect'. Returns an OrderedDict.
    """
    n, p, k = int(param['n']), int(param['p']), int(param['k'])
    effect = [param['effect']] * p
    trainSet = SimData.simulate2Group(n=n, p=p, effect=effect)
    knnFit = KNeighborsClassifier(n_neighbors=k)
    knnFit.fit(np.array(trainSet['x']), np.array(trainSet['y']))
    testSet = SimData.simulate2Group(n=n, p=p, effect=effect)

    out = OrderedDict()
    out['p'] = p
    out['k'] = k
    out['train'] = trainSet
    out['test'] = testSet
    out['resubPreds'] = knnFit.predict(trainSet['x'])
    out['resubProbs'] = knnFit.predict_proba(trainSet['x'])
    out['testPreds'] = knnFit.predict(testSet['x'])
    out['testProbs'] = knnFit.predict_proba(testSet['x'])

    def _table_and_accuracy(preds, truth):
        # confusion table plus fraction on its diagonal
        table = pd.crosstab(Series(preds, index=truth.index), truth)
        return table, np.sum(np.diag(table)) / (1.0 * np.sum(np.sum(table)))

    out['resubTable'], out['resubAccuracy'] = _table_and_accuracy(
        out['resubPreds'], trainSet['y'])
    out['testTable'], out['testAccuracy'] = _table_and_accuracy(
        out['testPreds'], testSet['y'])
    return out
def cleanCrosstab(rows, cols, values, aggfunc=sum, weight=None):
    """Performs a crosstab on the rows, cols and values specified.

    Cells with no observations become 0, while cells whose observations are
    all NaN stay missing. Adds proper 'Total' row and column. If weight is
    given, values are multiplied by it before aggregation.
    """
    if weight is None:
        t = pd.crosstab(rows, cols, values, aggfunc=aggfunc, dropna=False)
    else:
        t = pd.crosstab(rows, cols, values * weight, aggfunc=aggfunc, dropna=False)
    # zero out cells that have no observations at all
    count = pd.crosstab(rows, cols, dropna=False)
    t = t.mask(count == 0, other=0)
    t['Total'] = t.sum(axis=1)
    # DataFrame.append was removed in pandas 2.0 — build the total row
    # with concat instead
    total_row = t.sum(axis=0).to_frame('Total').T
    t = pd.concat([t, total_row])
    return t
def displaySidAndQueryName(self):
    """Stacked horizontal bar chart of Sid vs QueryName counts for one
    hard-coded Aid."""
    frame = self.dataManager.loadData(["Aid", "Sid", "QueryName"],
                                      transformFields=False)
    # restrict to the specific Aid under inspection
    subset = frame[frame.Aid == "012abc55-5801-494f-a77f-a799f1d855de"]
    # one distinct colour per query name
    palette = cm.gist_ncar(np.linspace(0, 1, subset.QueryName.nunique()))
    pd.crosstab(subset.Sid, subset.QueryName).plot.barh(
        stacked=True, color=palette, figsize=(20, 20))
    plt.show()
def print_null_freq(df):
    """For a given DataFrame, print how many values of each variable
    (column) are null vs. non-null."""
    df_lng = pd.melt(df)
    null_variables = df_lng.value.isnull()
    print(pd.crosstab(df_lng.variable, null_variables))
def find_coalition(train, clf):
    """Cluster the voters with the (unsupervised) estimator clf and print a
    Party x Cluster cross table. PARTIES is a module-level label list."""
    X_train = train.drop(['Vote', 'Financial_agenda_matters',
                          'Will_vote_only_large_party', 'Most_Important_Issue',
                          'Avg_Residancy_Altitude'], axis=1).values
    y_train = train.Vote.values
    clf.fit(X_train)
    clusters = clf.predict(X_train)
    print(pd.crosstab(np.array(PARTIES)[y_train.astype(int)], clusters,
                      rownames=["Party"], colnames=["Cluster"]))
def printContingencyTable(y, ypred, labels):
    """Show a confusion-matrix heat map and print the contingency table.
    labels[0]/labels[1] name the true/predicted axes."""
    confusion_matrix = metrics.confusion_matrix(y, ypred)
    plt.matshow(confusion_matrix)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
    print(pd.crosstab(y, ypred, rownames=[labels[0]], colnames=[labels[1]]))
def create_secure_usage_graph(df):
    """Save stacked bar charts of connection-verification behaviour by age
    group (general browsing and banking)."""
    charts = (('Connection Secure', 'verifying_connection'),
              ('Connection Secure (Banking)', 'verifying_connection_banking'))
    for column, figure_name in charts:
        counts = pd.crosstab(df['Age'], df[column])
        save_figure(counts.plot(kind='barh', stacked=True), figure_name)
    log.info('Generated Secure connection graphs')
def get_kappa(A=None, B=None):
    """Cohen's kappa between two annotators' label sequences A and B.

    With no arguments, runs Wikipedia worked example 1 (kappa = 0.4).
    With data, labels are coerced to strings and the contingency table is
    padded to the fixed '0'..'5' label set so the diagonal stays aligned
    even when some labels are absent. Returns kappa floored at 1e-200 so
    log(kappa) is finite downstream.
    """
    import pandas as pd
    import numpy as np
    if A is None or B is None:
        # Demo mode: Wikipedia example 1 (50 items, expected kappa = 0.4).
        A = np.append(np.zeros(25, dtype=int), np.ones(25, dtype=int))
        B = np.roll(np.append(np.zeros(30, dtype=int), np.ones(20, dtype=int)), 5)
        # as_matrix() was removed from pandas — use to_numpy()
        T = pd.crosstab(A, B, rownames='A', colnames='B').to_numpy()
    else:
        # BUG FIX: the original mixed int column labels with the string pad
        # labels '0'..'5', which crashes Python 3's sort; coerce to str.
        A = np.asarray(A).astype(str)
        B = np.asarray(B).astype(str)
        T = pd.crosstab(A, B, rownames='A', colnames='B')
        vals = ['0', '1', '2', '3', '4', '5']
        for v in vals:                      # pad missing columns with zeros
            if v not in T.columns:
                T[v] = 0
        for v in vals:                      # pad missing rows with zeros
            if v not in T.index:
                T.loc[v] = np.zeros(T.shape[1])
        T = T.sort_index()                  # DataFrame.sort was removed
        # reindex_axis was removed — reindex sorted columns, then to ndarray
        T = T.reindex(sorted(T.columns), axis=1).to_numpy()
    b = T.sum(0)
    a = T.sum(1)
    p = T.diagonal().sum() / float(T.sum())  # observed agreement
    b = b / float(b.sum())
    a = a / float(a.sum())
    e = sum(a * b)                           # chance agreement
    kappa = max(1e-200, (p - e) / (1 - e))   # floor avoids log(0) downstream
    return kappa
def get_probabilities(self):
    """Return the row-normalized parent->child transition frequencies built
    from self.pairs (each row sums to 1)."""
    pairs = pd.DataFrame(self.pairs)
    pairs.columns = ['parent', 'child']
    frequency = pd.crosstab(pairs.parent, pairs.child)
    list_data = pd.DataFrame(self.list_data)
    list_data.columns = ['parent', 'child']
    # NOTE(review): computed but never used — confirm whether it should be
    # folded into `frequency`.
    frequency_list_data = pd.crosstab(list_data.parent, list_data.child)
    # np.float was removed from numpy >= 1.24; the builtin float is identical
    frequency = frequency.astype(float)
    frequency.values[:] = frequency.values / frequency.values.sum(axis=1,
                                                                  keepdims=True)
    return frequency
def printReport(self, ROCcurve=False):
    """Print the training confusion matrix plus accuracy / AUC / CV-score
    figures from self.classification_output."""
    print("\nModel Report")
    print("Confusion Matrix:")
    print(pd.crosstab(self.data_train[self.target], self.train_predictions))
    print('Note: rows - actual; col - predicted')
    print("Accuracy : %s" % "{0:.3%}".format(self.classification_output['Accuracy']))
    print("AUC : %s" % "{0:.3%}".format(self.classification_output['AUC']))
    print("CV Score : Mean - %s | Std - %s" % (
        "{0:.3%}".format(self.classification_output['CVScore_mean']),
        "{0:.3%}".format(self.classification_output['CVScore_std'])))
    if ROCcurve:
        # NOTE(review): fpr/tpr are computed but never plotted — confirm
        # whether the ROC plot was meant to follow here.
        fpr, tpr, thresholds = metrics.roc_curve(self.data_train[self.target],
                                                 self.test_pred_prob)
def one_row_feature(lat, lon, column_tuple, feature_names):
    """Build a one-row feature frame counting nearby categorical values at
    three radii (20m / 200m / 2000m) around (lat, lon).

    column_tuple must carry 'Lat (radian)' and 'Lon (radian)' columns plus
    the columns named in feature_names. Returns the concatenated counts.
    """
    # BUG FIX: was `map(math.cos, ...)` multiplied into a Series — a py2-ism
    # that breaks under Python 3's lazy map; use the vectorised cosine.
    cos_lat = np.cos(column_tuple['Lat (radian)'])
    # approximate planar distances in meters (R = 6373 km)
    x = 6373.0 * 1000 * abs(lon * 3.1415926 / 180 - column_tuple['Lon (radian)']) * cos_lat
    y = 6373.0 * 1000 * abs(lat * 3.1415926 / 180 - column_tuple['Lat (radian)'])
    distance = pd.DataFrame({'x': x, 'y': y}).max(axis=1)
    parts = []
    for feature_name in feature_names:
        for radius, prefix in ((20.0, '20m '), (200.0, '200m '), (2000.0, '2000m ')):
            # NOTE(review): a scalar '' as the crosstab index relies on
            # pandas broadcasting — confirm against the pandas version in use.
            counts = pd.crosstab('', column_tuple.loc[distance < radius, feature_name])
            try:
                counts.columns = [prefix + c for c in list(counts.columns)]
                parts.append(counts)
            except Exception:
                # no usable columns at this radius — skip silently, as before
                pass
    return pd.concat(parts, axis=1)
def draw(self):
    """Draw a heat map of the cross-tabulated grouping-factor counts."""
    def get_crosstab(data, row_fact, col_fact, row_names, col_names):
        # force the full factor-level grid, padding missing combinations
        # with 0 (reindex_axis was removed from pandas — use reindex)
        ct = pd.crosstab(data[row_fact], data[col_fact])
        ct = ct.reindex(row_names, axis=0).fillna(0)
        ct = ct.reindex(col_names, axis=1).fillna(0)
        return ct

    def plot(data, color):
        ct = get_crosstab(
            data,
            self._groupby[0],
            self._groupby[1],
            self._levels[0],
            self._levels[1])
        sns.heatmap(ct,
                    robust=True,
                    annot=True,
                    cbar=False,
                    cmap=cmap,
                    fmt="g",
                    vmax=vmax,
                    linewidths=1)

    if len(self._groupby) < 2:
        # only one grouping factor: create a dummy cross tab with one
        # dimension containing empty values so the heatmap still renders
        data_column = self._table[self._groupby[0]].reset_index(drop=True)
        tab = pd.crosstab(
            pd.Series([""] * len(data_column), name=""),
            data_column)
        plot_facet = lambda data, color: sns.heatmap(
            tab,
            robust=True,
            annot=True,
            cbar=False,
            cmap=cmap,
            fmt="g",
            linewidths=1)
    else:
        plot_facet = plot

    # shared colour scale: the maximum cell count across all facets
    vmax = pd.crosstab(
        [self._table[x] for x in [self._row_factor, self._groupby[0]]
         if x is not None],
        [self._table[x] for x in [self._col_factor, self._groupby[1]]
         if x is not None]).values.max()
    cmap = ListedColormap(self.options["color_palette_values"])
    self.map_data(plot_facet)
def plots(train, categories):
    """Render the exploratory bar charts for the crime data set: Category
    against district / weekday / time, the same restricted to `categories`,
    and plain value-count charts."""
    matplotlib.style.use('ggplot')
    for column in (train.PdDistrict, train.DayOfWeek, train.time):
        pd.crosstab([train.Category], column).plot(kind='barh')
    # the same time breakdown, restricted to the requested categories
    selected = train.loc[train['Category'].isin(categories), 'Category']
    pd.crosstab([selected], train.time).plot(kind='barh')
    for series in (train.time, train.DayOfWeek, train.PdDistrict, train.Category):
        series.value_counts().plot(kind='barh')
    matplotlib.pyplot.show()
def Cramer(var1, var2):
    """Compute Cramer's V statistic for two Pandas series.

    V = sqrt(chi2 / (N * min(r-1, c-1))) where chi2 is the Pearson
    chi-square of the contingency table and N the number of observations.

    BUG FIX: the original normalised by l*w (the table DIMENSIONS) instead
    of the observation count, and left an unused `outmat`.
    """
    table = crosstab(var1, var2)
    n_rows, n_cols = table.shape
    df = min(n_rows - 1, n_cols - 1)
    if df == 0:
        # a constant variable carries no measurable association
        return 0.0
    n = float(table.values.sum())        # total observations, not l*w
    expected = outer(table.sum(1), table.sum(0)) / n
    chi2 = (((table - expected) ** 2) / expected).sum().sum()
    return sqrt(chi2 / (n * df))
def fischer_bar_chart(bin_vec, response_vec, ax=None, filename=None):
    """Bar chart of the bin_vec x response_vec cross table; optionally saves
    the figure to filename. Returns the figure."""
    fig, ax = init_ax(ax)
    counts = pd.crosstab(bin_vec, response_vec)
    counts.plot(kind='bar', ax=ax)
    if filename is not None:
        fig.savefig(filename)
    return fig
def show_tabulation(request):
    """Send Response to Ajax Request for selected tabulated data.

    Expects POST fields: gender, handedness, and uploaded_file_base_encoded
    (a base64-encoded path to a previously uploaded CSV). Responds with a
    JSON payload holding the Gender x Handedness cross table as HTML and
    as JSON.
    """
    required_fields = ['gender','handedness','uploaded_file_base_encoded']
    for field_name in required_fields:
        try:
            request.POST[field_name]
        except:
            # missing POST key — surface a readable error to the caller
            raise Exception(field_name+' is required')
    uploaded_file_name = request.POST.get('uploaded_file_base_encoded',None)
    # NOTE(review): on Python 3, b64decode returns bytes while read_csv
    # expects a str path — confirm this view runs under Python 2.
    uploaded_file_name_base_decoded = base64.b64decode(uploaded_file_name)
    gender = [request.POST['gender']]
    handedness = [request.POST['handedness']]
    if uploaded_file_name_base_decoded and os.path.isfile(uploaded_file_name_base_decoded):
        # NOTE(review): the path comes straight from the client — confirm it
        # is validated upstream before being opened here.
        df = pd.read_csv(uploaded_file_name_base_decoded)
        gender_groups = df.groupby('Gender').groups.keys()
        handedness_groups = df.groupby('Handedness').groups.keys()
        # 'all' keeps every group; any other value restricts to the selection
        if gender and gender[0]!='all':
            gender_groups = gender
        if handedness and handedness[0]!='all':
            handedness_groups = handedness
        df_filtered = df[df.Gender.isin(gender_groups) & df.Handedness.isin(handedness_groups)]
        df_calc = pd.crosstab(df_filtered.Gender , df_filtered.Handedness, rownames=['Gender'], colnames=['Handedness'], margins=True)
        response_data = {}
        response_data['tab_data'] = str(df_calc.to_html())
        response_data['json_data'] = str(df_calc.to_json())
        return HttpResponse(json.dumps(response_data), content_type="application/json")
    else:
        raise Exception('File not found or missing.')
def __init__(self, feature_data, features, N, K):
    """
    feature_data: observations indexed by a 'key' column; the remaining
        columns are features (one frame per entry of `features`)
    features: the list of feature-name lists one wants to use in estimation
    N: number of observations (keys)
    K: number of latent types to estimate
    """
    self.N = N
    self.K = K
    self.features = [f for feature in features for f in feature]  # flatten
    self.F = len(self.features)
    # map each flattened feature back to the data frame it came from
    data_index = [i for j, f in enumerate(features) for i in [j] * len(f)]
    # counts stored as lists because different features have different
    # numbers of categories
    self.feature_counts = []
    self.M_f = np.empty(self.F, dtype=int)    # np.int was removed from numpy
    self.observations = np.empty((self.N, self.F), dtype=int)
    for i, f in enumerate(self.features):
        counts = pd.crosstab(feature_data[data_index[i]]['key'],
                             feature_data[data_index[i]][f]).values
        self.feature_counts.append(counts)
        self.M_f[i] = counts.shape[1]
        self.observations[:, i] = counts.sum(axis=1)
    # seed parameters
    # BUG FIX: 1/self.K is 0 under Python-2 integer division — use a float
    self.rho = np.full(self.K, 1.0 / self.K)  # equal probability of all types
    self.mu = []
    for M_f in self.M_f:
        self.mu.append(np.random.dirichlet(M_f * [1], self.K))  # uniform prior
def convert_to_matrix(player_df):
    """Write a square country x country player matrix to euro_matrix.json and
    return it as a DataFrame (rows: club country, columns: nationality)."""
    print("Writing out player matrix")
    table_df = pd.crosstab(player_df['Club.Country'], player_df['Country'],
                           dropna=False)
    cols = table_df.columns
    rows = list(set(table_df.index))
    # square the table: add all-zero rows/columns for countries that appear
    # on only one axis (DataFrame.append was removed in pandas 2.0 — concat)
    missing_rows = [col for col in cols if col not in rows]
    row_df = pd.DataFrame(
        np.zeros((len(missing_rows), table_df.shape[1]), dtype=int),
        columns=table_df.columns, index=missing_rows)
    table_df = pd.concat([table_df, row_df])
    missing_cols = [row for row in rows if row not in cols]
    col_df = pd.DataFrame(
        np.zeros((table_df.shape[0], len(missing_cols)), dtype=int),
        columns=missing_cols, index=table_df.index)
    table_df = pd.concat([table_df, col_df], axis=1)
    file_name = 'euro_matrix.json'
    all_countries = sorted(table_df.index)
    table_df = table_df.loc[all_countries, all_countries]  # .ix was removed
    lines = [",".join(str(ele) for ele in row[1]) for row in table_df.iterrows()]
    with open(file_name, 'w') as file_handle:
        file_handle.write('[\n[')
        file_handle.write("],\n[".join(lines))
        file_handle.write("]\n]")
    return table_df
def get_mobem(self):
    """Return a dataframe compatible with ANOVA analysis

    The returned object can be read by
    :class:`~gdsctools.readers.GenomicFeatures`.
    """
    df = self.unified
    mobem = pd.crosstab(
        df['GENE'],
        columns=[df["COSMIC_ID"], df['TISSUE_TYPE'], df["SAMPLE"]])
    # one row per (COSMIC_ID, TISSUE_TYPE, SAMPLE) combination
    mobem = mobem.T.reset_index()
    if "TISSUE_TYPE" in mobem.columns:
        mobem.rename(columns={"TISSUE_TYPE": "TISSUE_FACTOR"}, inplace=True)
    else:
        print("Expected TISSUE_TYPE column. Not found.")
    return mobem
def validate(x, gt_y, clf, subset):
    """Predict on x with clf, print the confusion matrix against gt_y
    (labelled with the subset name), and return (predictions, probabilities)."""
    preds = clf.predict(x)
    probs = clf.predict_proba(x)
    print('%s confusion matrix:' % subset)
    confusion = pd.crosstab(gt_y, preds, rownames=['Actual'],
                            colnames=['Predicted'])
    print(confusion)
    return preds, probs
#converting gender to numerical variable gender = {'male':1, 'female':2} trainingData['gender'] = trainingData['gender'].map(gender) trainingData = trainingData.fillna(0) trainingData['gender']=trainingData['gender'].astype(np.int8); #generating correlation matrix corrmatrix=trainingData[trainingData.columns[1:]].corr() f,ax=plt.subplots(figsize=(12,9)) sns.heatmap(corrmatrix, vmax=1, cbar=True, annot=True, square=True); plt.show() #checking gender data relationship with churn gender_crosstab=pd.crosstab(trainingData['gender'],trainingData['is_churn']) gender_crosstab.plot(kind='bar', stacked=True, grid=True) #checking age data relationship with churn age_crosstab=pd.crosstab(trainingData['bd'],trainingData['is_churn']) age_crosstab.plot(kind='bar', stacked=True, grid=True) #checking city data relationship with churn city_crosstab=pd.crosstab(trainingData['city'],trainingData['is_churn']) city_crosstab.plot(kind='bar', stacked=True, grid=True) #checking city data relationship with churn registered_via_crosstab=pd.crosstab(trainingData['registered_via'],trainingData['is_churn']) registered_via_crosstab.plot(kind='bar', stacked=True, grid=True) trainingData.to_csv('MembersProc_file.csv', sep=',')
def print_demographics(df, idx=None):
    """Print a demographics summary table for *df* to stdout.

    If *idx* is None a single column of statistics is printed for the whole
    cohort.  If *idx* is a boolean mask, two columns are printed (the
    ``~idx`` group first, then ``idx``) plus a p-value comparing the groups
    (Welch t-test, chi-squared or Mann-Whitney U depending on the variable
    type declared in ``all_vars``).

    Parameters
    ----------
    df : pandas.DataFrame containing the demographic columns named below
    idx : optional boolean mask splitting the cohort into two groups

    Returns
    -------
    None -- all output is printed.
    """
    # map each variable to how it should be summarized / tested
    all_vars = OrderedDict(
        (('N', 'N'), ('age', 'median'), ('gender', 'gender'),
         ('bmi', 'continuous'), ('ethnicity', 'race'),
         ('elixhauser_hospital', 'median'), ('qsofa', 'median'),
         ('sirs', 'median'), ('sofa', 'median'), ('mlods', 'median'),
         ('lactate_max', 'continuous'), ('vent', 'binary'),
         ('icu_los', 'median'), ('hosp_los', 'median'),
         ('thirtyday_expire_flag', 'binary'),
         ('hospital_expire_flag', 'binary')))

    if idx is None:
        # print demographics for entire dataset
        for i, curr_var in enumerate(all_vars):
            if all_vars[curr_var] == 'N':
                # print number of patients
                print('{:20s}\t{:4g}'.format(curr_var, df.shape[0]))
            elif curr_var in df.columns:
                if all_vars[curr_var] == 'continuous':
                    # report mean +- STD
                    print('{:20s}\t{:2.1f} +- {:2.1f}'.format(
                        curr_var, df[curr_var].mean(), df[curr_var].std()))
                elif all_vars[curr_var] == 'gender':
                    # convert from M/F
                    print('{:20s}\t{:4g} ({:2.1f}%)'.format(
                        curr_var, np.sum(df[curr_var].values == 'M'),
                        100.0 * np.sum(df[curr_var].values == 'M').astype(
                            float) / df.shape[0]))
                # binary, report count and percentage
                elif all_vars[curr_var] == 'binary':
                    print('{:20s}\t{:4g} ({:2.1f}%)'.format(
                        curr_var, df[curr_var].sum(),
                        100.0 * (df[curr_var].mean()).astype(float)))
                # report median [25th percentile, 75th percentile]
                elif all_vars[curr_var] == 'median':
                    # NOTE(review): np.percentile's `interpolation=` keyword
                    # was renamed to `method=` in newer NumPy -- confirm the
                    # supported NumPy versions before upgrading.
                    print('{:20s}\t{:2.1f} [{:2.1f}, {:2.1f}]'.format(
                        curr_var, df[curr_var].median(),
                        np.percentile(df[curr_var].values, 25,
                                      interpolation='midpoint'),
                        np.percentile(df[curr_var].values, 75,
                                      interpolation='midpoint')))
                elif all_vars[curr_var] == 'measured':
                    # fraction of missing values
                    print('{:20s}\t{:2.1f}%'.format(
                        curr_var, 100.0 * np.mean(df[curr_var].isnull())))
                elif all_vars[curr_var] == 'race':
                    # special case: print each race individually
                    # race_black, race_other
                    print('{:20s}\t'.format('Race'))
                    curr_var_tmp = 'White'
                    print('{:20s}\t{:4g} ({:2.1f}%)'.format(
                        curr_var_tmp, df['race_white'].sum(),
                        100.0 * (df['race_white'].mean()).astype(float)))
                    curr_var_tmp = 'Black'
                    print('{:20s}\t{:4g} ({:2.1f}%)'.format(
                        curr_var_tmp, df['race_black'].sum(),
                        100.0 * (df['race_black'].mean()).astype(float)))
                    curr_var_tmp = 'Hispanic'
                    # BUGFIX: percentage previously used race_black's mean
                    print('{:20s}\t{:4g} ({:2.1f}%)'.format(
                        curr_var_tmp, df['race_hispanic'].sum(),
                        100.0 * (df['race_hispanic'].mean()).astype(float)))
                    # curr_var_tmp = 'Other'
                    # print('{:20s}\t{:4g} ({:2.1f}%)'.format(curr_var_tmp, df['race_other'].sum(),
                    #                                         100.0*(df['race_other'].mean()).astype(float)))

                # additional lactate measurements output with lactate_max
                if curr_var == 'lactate_max':
                    # also print measured
                    print('{:20s}\t{:4g} ({:2.1f}%)'.format(
                        curr_var.replace('_max', ' ') + 'measured',
                        np.sum(~df[curr_var].isnull()),
                        100.0 * np.mean(~df[curr_var].isnull())))
                    print('{:20s}\t{:4g} ({:2.1f}%)'.format(
                        curr_var.replace('_max', ' ') + '> 2',
                        np.sum(df[curr_var] >= 2),
                        100.0 * np.mean(df[curr_var] >= 2)))
            else:
                # variable absent from df: print the name only
                print('{:20s}'.format(curr_var))
    else:
        # print demographics split into two groups
        # also print p-values testing between the two groups
        for i, curr_var in enumerate(all_vars):
            if all_vars[curr_var] == 'N':
                # print number of patients in each group
                print('{:20s}\t{:4g}{:5s}\t{:4g}{:5s}\t{:5s}'.format(
                    curr_var, np.sum(~idx), '', np.sum(idx), '', ''))
            elif curr_var in df.columns:
                if all_vars[curr_var] == 'continuous':
                    # report mean +- STD per group with Welch t-test p-value
                    tbl = np.array(
                        [[df[~idx][curr_var].mean(), df[idx][curr_var].mean()],
                         [df.loc[~idx, curr_var].std(),
                          df.loc[idx, curr_var].std()]])
                    stat, pvalue = scipy.stats.ttest_ind(df[~idx][curr_var],
                                                         df[idx][curr_var],
                                                         equal_var=False,
                                                         nan_policy='omit')
                    # print out < 0.001 if it's a very low p-value
                    if pvalue < 0.001:
                        pvalue = '< 0.001'
                    else:
                        pvalue = '{:0.3f}'.format(pvalue)
                    print(
                        '{:20s}\t{:2.1f} +- {:2.1f}\t{:2.1f} +- {:2.1f}\t{:5s}'
                        .format(curr_var, tbl[0, 0], tbl[1, 0], tbl[0, 1],
                                tbl[1, 1], pvalue))
                elif all_vars[curr_var] in ('gender', 'binary'):
                    # build the 2x2 contingency table
                    if all_vars[curr_var] == 'gender':
                        tbl = np.array(
                            [[np.sum(df[~idx][curr_var].values == 'M'),
                              np.sum(df[idx][curr_var].values == 'M')],
                             [np.sum(df[~idx][curr_var].values != 'M'),
                              np.sum(df[idx][curr_var].values != 'M')]])
                    else:
                        tbl = np.array(
                            [[np.sum(df[~idx][curr_var].values),
                              np.sum(df[idx][curr_var].values)],
                             [np.sum(1 - df[~idx][curr_var].values),
                              np.sum(1 - df[idx][curr_var].values)]])
                    # get the p-value
                    chi2, pvalue, dof, ex = scipy.stats.chi2_contingency(tbl)
                    if pvalue < 0.001:
                        pvalue = '< 0.001'
                    else:
                        pvalue = '{:0.3f}'.format(pvalue)
                    # binary, report percentage
                    print('{:20s}\t{:4g} ({:2.1f}%)\t{:4g} ({:2.1f}%)\t{:5s}'.
                          format(curr_var, tbl[0, 0],
                                 100.0 * tbl[0, 0].astype(float) /
                                 (tbl[0, 0] + tbl[1, 0]), tbl[0, 1],
                                 100.0 * tbl[0, 1].astype(float) /
                                 (tbl[0, 1] + tbl[1, 1]), pvalue))
                elif all_vars[curr_var] == 'median':
                    stat, pvalue = scipy.stats.mannwhitneyu(
                        df[~idx][curr_var],
                        df[idx][curr_var],
                        use_continuity=True,
                        alternative='two-sided')
                    if pvalue < 0.001:
                        pvalue = '< 0.001'
                    else:
                        pvalue = '{:0.3f}'.format(pvalue)
                    print(
                        '{:20s}\t{:2.1f} [{:2.1f}, {:2.1f}]\t{:2.1f} [{:2.1f}, {:2.1f}]\t{:5s}'
                        .format(
                            curr_var, df[~idx][curr_var].median(),
                            np.percentile(df[~idx][curr_var].values, 25,
                                          interpolation='midpoint'),
                            np.percentile(df[~idx][curr_var].values, 75,
                                          interpolation='midpoint'),
                            df[idx][curr_var].median(),
                            np.percentile(df[idx][curr_var].values, 25,
                                          interpolation='midpoint'),
                            np.percentile(df[idx][curr_var].values, 75,
                                          interpolation='midpoint'), pvalue))
                elif all_vars[curr_var] == 'measured':
                    # build the contingency table
                    tbl = np.array(
                        [[np.sum(df[~idx][curr_var].isnull()),
                          np.sum(df[idx][curr_var].isnull())],
                         [np.sum(~df[~idx][curr_var].isnull()),
                          np.sum(~df[idx][curr_var].isnull())]])
                    # get the p-value
                    chi2, pvalue, dof, ex = scipy.stats.chi2_contingency(tbl)
                    if pvalue < 0.001:
                        pvalue = '< 0.001'
                    else:
                        pvalue = '{:0.3f}'.format(pvalue)
                    # BUGFIX: the format string previously had only three
                    # placeholders for six arguments, so counts were printed
                    # as percentages and the p-value was silently dropped.
                    print('{:20s}\t{:4g} ({:2.1f}%)\t{:4g} ({:2.1f}%)\t{:5s}'.
                          format(curr_var,
                                 np.sum(~df[~idx][curr_var].isnull()),
                                 100.0 * np.mean(~df[~idx][curr_var].isnull()),
                                 np.sum(~df[idx][curr_var].isnull()),
                                 100.0 * np.mean(~df[idx][curr_var].isnull()),
                                 pvalue))
                elif all_vars[curr_var] == 'race':
                    # special case: evaluate each race in chi2
                    # create a contingency table with all races via crosstab
                    df['race'] = 'other'
                    df.loc[df['race_black'] == 1, 'race'] = 'black'
                    df.loc[df['race_white'] == 1, 'race'] = 'white'
                    df.loc[df['race_hispanic'] == 1, 'race'] = 'hispanic'
                    tbl = pd.crosstab(df.race, idx, margins=True)
                    curr_var_vec = tbl.index.values[0:-1]
                    # Extract table without totals
                    # (.iloc replaces the removed DataFrame.ix accessor)
                    tbl = tbl.iloc[0:-1, 0:-1]
                    # get the p-value
                    chi2, pvalue, dof, ex = scipy.stats.chi2_contingency(
                        tbl, correction=False)
                    if pvalue < 0.001:
                        pvalue = '< 0.001'
                    else:
                        pvalue = '{:0.3f}'.format(pvalue)
                    # first print out we are comparing races (with p-value)
                    print('{:20s}\t{:10s}\t{:10s}\t{:5s}'.format(
                        curr_var, '', '', pvalue))
                    # next print out individual race #s (no p-value)
                    for r in curr_var_vec:
                        print(
                            '{:20s}\t{:4g} ({:2.1f}%)\t{:4g} ({:2.1f}%)\t{:5s}'
                            .format(
                                ' ' + r, tbl.loc[r, False],
                                100.0 * tbl.loc[r, False].astype(float) /
                                np.sum(tbl.loc[:, False]), tbl.loc[r, True],
                                100.0 * tbl.loc[r, True].astype(float) /
                                np.sum(tbl.loc[:, True]), ''))

                # additional lactate measurements output with lactate_max
                if curr_var == 'lactate_max':
                    # for lactate, we print two additional rows:
                    # 1) was lactate ever measured?
                    tbl = np.array(
                        [[np.sum(df[~idx][curr_var].isnull()),
                          np.sum(df[idx][curr_var].isnull())],
                         [np.sum(~df[~idx][curr_var].isnull()),
                          np.sum(~df[idx][curr_var].isnull())]])
                    chi2, pvalue, dof, ex = scipy.stats.chi2_contingency(tbl)
                    if pvalue < 0.001:
                        pvalue = '< 0.001'
                    else:
                        pvalue = '{:0.3f}'.format(pvalue)
                    print('{:20s}\t{:4g} ({:2.1f}%)\t{:4g} ({:2.1f}%)\t{:5s}'.
                          format(curr_var.replace('_max', ' ') + 'measured',
                                 np.sum(~df[~idx][curr_var].isnull()),
                                 100.0 * np.mean(~df[~idx][curr_var].isnull()),
                                 np.sum(~df[idx][curr_var].isnull()),
                                 100.0 * np.mean(~df[idx][curr_var].isnull()),
                                 pvalue))
                    # 2) was lactate ever > 2?
                    tbl = np.array(
                        [[np.sum(df[~idx][curr_var] >= 2),
                          np.sum(df[idx][curr_var] >= 2)],
                         [np.sum(~(df[~idx][curr_var] >= 2)),
                          np.sum(~(df[idx][curr_var] >= 2))]])
                    chi2, pvalue, dof, ex = scipy.stats.chi2_contingency(tbl)
                    if pvalue < 0.001:
                        pvalue = '< 0.001'
                    else:
                        pvalue = '{:0.3f}'.format(pvalue)
                    print('{:20s}\t{:4g} ({:2.1f}%)\t{:4g} ({:2.1f}%)\t{:5s}'.
                          format(curr_var.replace('_max', ' ') + '> 2',
                                 np.sum(df[~idx][curr_var] >= 2),
                                 100.0 * np.mean(df[~idx][curr_var] >= 2),
                                 np.sum(df[idx][curr_var] >= 2),
                                 100.0 * np.mean(df[idx][curr_var] >= 2),
                                 pvalue))
            else:
                print('{:20s}'.format(curr_var))
# Histogram of Age per facet (`grid` is a FacetGrid created earlier, outside
# this cell).
grid.map(plt.hist, 'Age', alpha=.5, bins=20)
grid.add_legend()

# In[ ]:

# Survival rate by Pclass and Sex, one facet row per port of embarkation.
grid = sns.FacetGrid(all_data, row='Embarked', size=2.2, aspect=1.6)
grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', palette='deep')
grid.add_legend()

# **Creating new features**

# In[ ]:

# Extract the honorific title (the word just before a '.') from Name.
all_data['Title'] = all_data.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
# Title distribution split by sex (notebook-style display).
pd.crosstab(all_data['Title'], all_data['Sex'])

# In[ ]:

# Merging all columns with similar values and grouping rare values as "Other"
all_data['Title'] = all_data['Title'].replace([
    'Capt', 'Col', 'Countess', 'Don', 'Dona', 'Dr', 'Jonkheer', 'Lady',
    'Major', 'Rev', 'Sir'
], 'Other')
all_data['Title'] = all_data['Title'].replace(['Ms'], 'Miss')
all_data['Title'] = all_data['Title'].replace(['Mlle'], 'Miss')
all_data['Title'] = all_data['Title'].replace(['Mme'], 'Mrs')
# Mean survival rate per merged title.
all_data[['Title', 'Survived']].groupby('Title', as_index=False).mean()
# Load the HR attrition dataset.
df=pd.read_csv('HR_sep.csv')
df.head()

# In[4]:

# Mean of each numeric column split by whether the employee left.
df.groupby('left').mean()

# In[5]:

# Attrition counts per salary band.
pd.crosstab(df.salary,df.left).plot(kind='bar')

# In[6]:

# Attrition counts per department.
pd.crosstab(df.Department,df.left).plot(kind='bar')

# In[7]:

from sklearn.linear_model import LogisticRegression

# In[8]:
# Load the bank term-deposit marketing dataset (semicolon-separated).
bank = pd.read_csv(
    "C:/Users/HP/Desktop/ABubakar Files/abu_Data_Science/Assignments/Logisitc Regression/bank-full.csv",
    delimiter=";",
    header=0)
bank.tail(300)

# Droping first column
#claimants.drop(["CASENUM"],inplace=True,axis = 1)
#cat_cols = ["ATTORNEY","CLMSEX","SEATBELT","CLMINSUR"]
#cont_cols = ["CLMAGE","LOSS"]

# Getting the barplot for the categorical columns
sb.countplot(x="y", data=bank, palette="hls")
# Response counts broken down by job, education, marital and housing status.
pd.crosstab(bank.y, bank.job).plot(kind="bar")
bank.columns
sb.countplot(x="education", data=bank, palette="hls")
pd.crosstab(bank.y, bank.education).plot(kind="bar")
sb.countplot(x="marital", data=bank, palette="hls")
pd.crosstab(bank.y, bank.marital).plot(kind="bar")
sb.countplot(x="housing", data=bank, palette="hls")
pd.crosstab(bank.y, bank.housing).plot(kind="bar")
# Stacked views of the response against contact month and loan status.
pd.crosstab(bank.month, bank.y).plot(kind='bar', stacked=True)
plt.title('purchase frequency for month title')
plt.xlabel('month')
plt.ylabel('frequency of purchase')
pd.crosstab(bank.loan, bank.y).plot(kind='bar', stacked=True)
plt.title('purchase frequency for loan title')
plt.xlabel('loan')
def do_response_scoring(df, xlab, ylab, Top_n, prt):
    """Score each category combination of the columns *xlab* against the
    binary response column *ylab*.

    The *xlab* columns are concatenated with '|' into one key column, and
    for each key a renewal rate ('score' = R_count / Tot) is computed.
    Returns a dataframe with the original *xlab* columns plus the score
    column; if prt == 'Y' it is also written to '<key>.csv'.
    """
    df_ins = df  # data frame
    #columns = ['producer_cd','veh_make_name','veh_mdl_name'] # grouping of
    columns = xlab  #=['veh_make_name'] # grouping of
    columns_conc = ("|".join(columns))
    #columns_conc
    # Build the concatenated key column, e.g. "make|model".
    # NOTE(review): .astype(str).astype(str) is applied twice -- the second
    # cast is a no-op; confirm and simplify.
    df_ins[f'{columns_conc}'] = df_ins[columns].astype(str).astype(str).apply(
        '|'.join, axis=1)
    #grid_view(df_ins.head())
    i = cat_col = columns_conc
    y2 = ylab  #="target"
    Top_n = Top_n  #=1500000000000000000000000000000000000000000000000
    ytag = ytag = 1
    # Frequency (and percentage frequency) of the Top_n key values.
    col_count = df_ins[f'{i}'].value_counts()
    #print(col_count)
    col_count = col_count[:Top_n, ]
    col_count1 = df_ins[f'{i}'].value_counts(normalize=True) * 100
    col_count1 = col_count1[:Top_n, ]
    vol_inperc = col_count1.sum()
    vol_inperc = round(vol_inperc, 2)
    # Row-normalized response rates per key (percentages).
    tmp = pd.crosstab(df_ins[f'{i}'], df_ins[f'{y2}'],
                      normalize='index') * 100
    #tmp.head(5)
    tmp = pd.merge(col_count, tmp, left_index=True, right_index=True)
    #tmp.head(5)
    tmp = pd.DataFrame(tmp)
    #tmp.columns
    # NOTE(review): call_catscore (elsewhere in this file) renames with int
    # keys {0, 1}; string keys '0'/'1' only match if the target column holds
    # strings -- confirm the target dtype.
    tmp.rename(columns={'0': 'NotRenwed%', '1': 'Renewed%'}, inplace=True)
    if 'NotRenwed%' not in tmp.columns:
        print("NotRenwed% is not present in ", i)
        tmp['NotRenwed%'] = 0
    if 'Renewed%' not in tmp.columns:
        print("Renewed% is not present in ", i)
        tmp['Renewed%'] = 0
    # Raw counts per key for each response value.
    tmp1 = pd.crosstab(df[f'{i}'], df[f'{y2}'])
    tmp1.rename(columns={'0': 'NR_count', '1': 'R_count'}, inplace=True)
    if 'NR_count' not in tmp1.columns:
        print("NR_count is not present in ", i)
        tmp1['NR_count'] = 0
    if 'R_count' not in tmp1.columns:
        print("R_count is not present in ", i)
        tmp1['R_count'] = 0
    # Combine rates and counts, derive the per-key renewal score.
    tmpz = pd.merge(tmp, tmp1, left_index=True, right_index=True)
    tmpz['Tot'] = tmpz['NR_count'] + tmpz['R_count']
    tmpz['Renewed%'] = round(tmpz['Renewed%'], 2)
    tmpz['Mean'] = tmpz['Renewed%'].mean()
    tmpz['Nperformer'] = np.where(tmpz['Renewed%'] < tmpz['Mean'], 1, 0)
    tmpz['score'] = round(tmpz['R_count'] / tmpz['Tot'], 2)
    #tmpz.sort_index(inplace=True)
    # Statistic calculation ------
    score_mean = tmpz['score'].mean()
    score_std = tmpz['score'].std()
    th_min = tmpz['score'].min()
    th_nsd = round(score_mean - score_std, 2)
    th_mean = round(score_mean, 2)
    th_psd = round(score_mean + score_std, 2)
    th_max = tmpz['score'].max()
    # NOTE(review): the block below recomputes tmpz identically to the block
    # above the statistics -- it appears redundant; confirm before removing.
    tmpz = pd.merge(tmp, tmp1, left_index=True, right_index=True)
    tmpz['Tot'] = tmpz['NR_count'] + tmpz['R_count']
    tmpz['Renewed%'] = round(tmpz['Renewed%'], 2)
    tmpz['Mean'] = tmpz['Renewed%'].mean()
    tmpz['Nperformer'] = np.where(tmpz['Renewed%'] < tmpz['Mean'], 1, 0)
    tmpz['score'] = round(tmpz['R_count'] / tmpz['Tot'], 2)
    # Move the key out of the index and label the columns.
    tmpz = tmpz.reset_index()
    tmpz.columns.values[tmpz.columns.get_loc(f'{columns_conc}')] = 'count'
    tmpz.rename(columns={
        'index': f'{columns_conc}',
        'score': f'{columns_conc}_score'
    }, inplace=True)
    if len(xlab) > 1:
        # Split the concatenated key back into its component columns and
        # rename them to the original xlab names.
        tmpz = tmpz.join(
            pd.DataFrame(
                tmpz[f'{tmpz.columns.values[0]}'].str.split('|').tolist()))
        namu = tmpz.columns.values[0]
        namu = namu.split('|')
        cl = dict(enumerate(namu))
        tmpz.rename(columns=cl, inplace=True)
    # Keep only the xlab columns and the score column.
    # NOTE(review): indentation reconstructed -- this tail is assumed to run
    # for both the single- and multi-column cases; confirm against the
    # original source.
    select_reqcolumn = []
    select_reqcolumn.append(xlab)
    columns = xlab  #=['veh_make_name'] # grouping of
    columns_conc = ("|".join(columns))
    score_col = [f'{columns_conc}_score']
    select_reqcolumn.append(score_col)
    select_reqcolumn = do_flatlist(select_reqcolumn)
    get_scoredf = tmpz[select_reqcolumn]
    tmpz = get_scoredf
    if prt == 'Y':
        tmpz.to_csv(f'{columns_conc}.csv', index=False)
    return tmpz
def ldscore(args, log):
    '''
    Wrapper function for estimating l1, l1^2, l2 and l4 (+ optionally standard
    errors) from reference panel genotypes.

    Annot format is
    chr snp bp cm <annotations>

    NOTE: this is Python 2 code (xrange, `print >>`, DataFrame.ix).
    '''
    if args.bfile:
        snp_file, snp_obj = args.bfile + '.bim', ps.PlinkBIMFile
        ind_file, ind_obj = args.bfile + '.fam', ps.PlinkFAMFile
        array_file, array_obj = args.bfile + '.bed', ld.PlinkBEDFile

    # read bim/snp
    array_snps = snp_obj(snp_file)
    m = len(array_snps.IDList)
    log.log('Read list of {m} SNPs from {f}'.format(m=m, f=snp_file))
    if args.annot is not None:  # read --annot
        try:
            if args.thin_annot:  # annot file has only annotations
                annot = ps.ThinAnnotFile(args.annot)
                n_annot, ma = len(annot.df.columns), len(annot.df)
                log.log("Read {A} annotations for {M} SNPs from {f}".format(
                    f=args.annot, A=n_annot, M=ma))
                annot_matrix = annot.df.values
                annot_colnames = annot.df.columns
                keep_snps = None
            else:
                # regular annot file: first 4 columns are chr/snp/bp/cm
                annot = ps.AnnotFile(args.annot)
                n_annot, ma = len(annot.df.columns) - 4, len(annot.df)
                log.log("Read {A} annotations for {M} SNPs from {f}".format(
                    f=args.annot, A=n_annot, M=ma))
                annot_matrix = np.array(annot.df.iloc[:, 4:])
                annot_colnames = annot.df.columns[4:]
                keep_snps = None
                if np.any(annot.df.SNP.values != array_snps.df.SNP.values):
                    raise ValueError('The .annot file must contain the same SNPs in the same'+\
                        ' order as the .bim file.')
        except Exception:
            log.log('Error parsing .annot file')
            raise
    elif args.extract is not None:  # --extract
        keep_snps = __filter__(args.extract, 'SNPs', 'include', array_snps)
        annot_matrix, annot_colnames, n_annot = None, None, 1
    elif args.cts_bin is not None and args.cts_breaks is not None:  # --cts-bin
        cts_fnames = sumstats._splitp(args.cts_bin)  # read filenames
        args.cts_breaks = args.cts_breaks.replace(
            'N', '-')  # replace N with negative sign
        try:  # split on x
            breaks = [[float(x) for x in y.split(',')]
                      for y in args.cts_breaks.split('x')]
        except ValueError as e:
            raise ValueError(
                '--cts-breaks must be a comma-separated list of numbers: ' +
                str(e.args))

        if len(breaks) != len(cts_fnames):
            raise ValueError(
                'Need to specify one set of breaks for each file in --cts-bin.'
            )

        if args.cts_names:
            cts_colnames = [str(x) for x in args.cts_names.split(',')]
            if len(cts_colnames) != len(cts_fnames):
                msg = 'Must specify either no --cts-names or one value for each file in --cts-bin.'
                raise ValueError(msg)
        else:
            cts_colnames = ['ANNOT' + str(i) for i in xrange(len(cts_fnames))]

        log.log('Reading numbers with which to bin SNPs from {F}'.format(
            F=args.cts_bin))

        cts_levs = []
        full_labs = []
        for i, fh in enumerate(cts_fnames):
            vec = ps.read_cts(cts_fnames[i], array_snps.df.SNP.values)

            max_cts = np.max(vec)
            min_cts = np.min(vec)
            cut_breaks = list(breaks[i])
            name_breaks = list(cut_breaks)
            if np.all(cut_breaks >= max_cts) or np.all(cut_breaks <= min_cts):
                raise ValueError(
                    'All breaks lie outside the range of the cts variable.')

            # extend the breaks so that the min/max values fall into a bin
            if np.all(cut_breaks <= max_cts):
                name_breaks.append(max_cts)
                cut_breaks.append(max_cts + 1)

            if np.all(cut_breaks >= min_cts):
                name_breaks.append(min_cts)
                cut_breaks.append(min_cts - 1)

            name_breaks.sort()
            cut_breaks.sort()
            n_breaks = len(cut_breaks)
            # so that col names are consistent across chromosomes with different max vals
            name_breaks[0] = 'min'
            name_breaks[-1] = 'max'
            name_breaks = [str(x) for x in name_breaks]
            labs = [
                name_breaks[i] + '_' + name_breaks[i + 1]
                for i in xrange(n_breaks - 1)
            ]
            cut_vec = pd.Series(pd.cut(vec, bins=cut_breaks, labels=labs))
            cts_levs.append(cut_vec)
            full_labs.append(labs)

        annot_matrix = pd.concat(cts_levs, axis=1)
        annot_matrix.columns = cts_colnames
        # crosstab -- for now we keep empty columns
        annot_matrix = pd.crosstab(
            annot_matrix.index,
            [annot_matrix[i] for i in annot_matrix.columns],
            dropna=False,
            colnames=annot_matrix.columns)

        # add missing columns
        if len(cts_colnames) > 1:
            for x in product(*full_labs):
                if x not in annot_matrix.columns:
                    annot_matrix[x] = 0
        else:
            for x in full_labs[0]:
                if x not in annot_matrix.columns:
                    annot_matrix[x] = 0

        annot_matrix = annot_matrix[sorted(annot_matrix.columns,
                                           key=annot_sort_key)]
        if len(cts_colnames) > 1:
            # flatten multi-index
            annot_colnames = [
                '_'.join([cts_colnames[i] + '_' + b for i, b in enumerate(c)])
                for c in annot_matrix.columns
            ]
        else:
            annot_colnames = [
                cts_colnames[0] + '_' + b for b in annot_matrix.columns
            ]

        annot_matrix = np.matrix(annot_matrix)
        keep_snps = None
        n_annot = len(annot_colnames)
        if np.any(np.sum(annot_matrix, axis=1) == 0):
            # This exception should never be raised. For debugging only.
            raise ValueError(
                'Some SNPs have no annotation in --cts-bin. This is a bug!')
    else:
        # no annotation options: a single unnamed annotation
        annot_matrix, annot_colnames, keep_snps = None, None, None,
        n_annot = 1

    # read fam
    array_indivs = ind_obj(ind_file)
    n = len(array_indivs.IDList)
    log.log('Read list of {n} individuals from {f}'.format(n=n, f=ind_file))

    # read keep_indivs
    if args.keep:
        keep_indivs = __filter__(args.keep, 'individuals', 'include',
                                 array_indivs)
    else:
        keep_indivs = None

    # read genotype array
    log.log('Reading genotypes from {fname}'.format(fname=array_file))
    geno_array = array_obj(array_file,
                           n,
                           array_snps,
                           keep_snps=keep_snps,
                           keep_indivs=keep_indivs,
                           mafMin=args.maf)

    # filter annot_matrix down to only SNPs passing MAF cutoffs
    if annot_matrix is not None:
        annot_keep = geno_array.kept_snps
        annot_matrix = annot_matrix[annot_keep, :]

    # determine block widths: exactly one window option must be set
    x = np.array((args.ld_wind_snps, args.ld_wind_kb, args.ld_wind_cm),
                 dtype=bool)
    if np.sum(x) != 1:
        raise ValueError('Must specify exactly one --ld-wind option')

    if args.ld_wind_snps:
        max_dist = args.ld_wind_snps
        coords = np.array(xrange(geno_array.m))
    elif args.ld_wind_kb:
        max_dist = args.ld_wind_kb * 1000
        coords = np.array(array_snps.df['BP'])[geno_array.kept_snps]
    elif args.ld_wind_cm:
        max_dist = args.ld_wind_cm
        coords = np.array(array_snps.df['CM'])[geno_array.kept_snps]

    block_left = ld.getBlockLefts(coords, max_dist)
    if block_left[len(block_left) - 1] == 0 and not args.yes_really:
        error_msg = 'Do you really want to compute whole-chomosome LD Score? If so, set the '
        error_msg += '--yes-really flag (warning: it will use a lot of time / memory)'
        raise ValueError(error_msg)

    # optional pq^S scaling of the annotation matrix
    scale_suffix = ''
    if args.pq_exp is not None:
        log.log('Computing LD with pq ^ {S}.'.format(S=args.pq_exp))
        msg = 'Note that LD Scores with pq raised to a nonzero power are'
        msg += 'not directly comparable to normal LD Scores.'
        log.log(msg)
        scale_suffix = '_S{S}'.format(S=args.pq_exp)
        pq = np.matrix(geno_array.maf * (1 - geno_array.maf)).reshape(
            (geno_array.m, 1))
        pq = np.power(pq, args.pq_exp)

        if annot_matrix is not None:
            annot_matrix = np.multiply(annot_matrix, pq)
        else:
            annot_matrix = pq

    log.log("Estimating LD Score.")
    lN = geno_array.ldScoreVarBlocks(block_left,
                                     args.chunk_size,
                                     annot=annot_matrix)
    col_prefix = "L2"
    file_suffix = "l2"

    if n_annot == 1:
        ldscore_colnames = [col_prefix + scale_suffix]
    else:
        ldscore_colnames = [
            y + col_prefix + scale_suffix for y in annot_colnames
        ]

    # print .ldscore. Output columns: CHR, BP, RS, [LD Scores]
    out_fname = args.out + '.' + file_suffix + '.ldscore'
    new_colnames = geno_array.colnames + ldscore_colnames
    df = pd.DataFrame.from_records(np.c_[geno_array.df, lN])
    df.columns = new_colnames
    if args.print_snps:
        # restrict printed output to a user-supplied SNP list
        if args.print_snps.endswith('gz'):
            print_snps = pd.read_csv(args.print_snps,
                                     header=None,
                                     compression='gzip')
        elif args.print_snps.endswith('bz2'):
            print_snps = pd.read_csv(args.print_snps,
                                     header=None,
                                     compression='bz2')
        else:
            print_snps = pd.read_csv(args.print_snps, header=None)
        if len(print_snps.columns) > 1:
            raise ValueError(
                '--print-snps must refer to a file with a one column of SNP IDs.'
            )
        log.log('Reading list of {N} SNPs for which to print LD Scores from {F}'.format(\
            F=args.print_snps, N=len(print_snps)))

        print_snps.columns = ['SNP']
        df = df.ix[df.SNP.isin(print_snps.SNP), :]
        if len(df) == 0:
            raise ValueError(
                'After merging with --print-snps, no SNPs remain.')
        else:
            msg = 'After merging with --print-snps, LD Scores for {N} SNPs will be printed.'
            log.log(msg.format(N=len(df)))

    l2_suffix = '.gz'
    log.log("Writing LD Scores for {N} SNPs to {f}.gz".format(f=out_fname,
                                                              N=len(df)))
    df.drop(['CM', 'MAF'], axis=1).to_csv(out_fname,
                                          sep="\t",
                                          header=True,
                                          index=False,
                                          float_format='%.3f')
    call(['gzip', '-f', out_fname])
    if annot_matrix is not None:
        # per-annotation SNP counts (all SNPs, and MAF > 5% SNPs)
        M = np.atleast_1d(np.squeeze(np.asarray(np.sum(annot_matrix,
                                                       axis=0))))
        ii = geno_array.maf > 0.05
        M_5_50 = np.atleast_1d(
            np.squeeze(np.asarray(np.sum(annot_matrix[ii, :], axis=0))))
    else:
        M = [geno_array.m]
        M_5_50 = [np.sum(geno_array.maf > 0.05)]

    # print .M
    fout_M = open(args.out + '.' + file_suffix + '.M', 'wb')
    print >> fout_M, '\t'.join(map(str, M))
    fout_M.close()

    # print .M_5_50
    fout_M_5_50 = open(args.out + '.' + file_suffix + '.M_5_50', 'wb')
    print >> fout_M_5_50, '\t'.join(map(str, M_5_50))
    fout_M_5_50.close()

    # print annot matrix
    if (args.cts_bin is not None) and not args.no_print_annot:
        out_fname_annot = args.out + '.annot'
        new_colnames = geno_array.colnames + ldscore_colnames
        annot_df = pd.DataFrame(np.c_[geno_array.df, annot_matrix])
        annot_df.columns = new_colnames
        del annot_df['MAF']
        log.log("Writing annot matrix produced by --cts-bin to {F}".format(
            F=out_fname + '.gz'))
        annot_df.to_csv(out_fname_annot, sep="\t", header=True, index=False)
        call(['gzip', '-f', out_fname_annot])

    # print LD Score summary
    pd.set_option('display.max_rows', 200)
    log.log('\nSummary of LD Scores in {F}'.format(F=out_fname + l2_suffix))
    t = df.ix[:, 4:].describe()
    log.log(t.ix[1:, :])

    np.seterr(divide='ignore',
              invalid='ignore')  # print NaN instead of weird errors
    # print correlation matrix including all LD Scores and sample MAF
    log.log('')
    log.log('MAF/LD Score Correlation Matrix')
    log.log(df.ix[:, 4:].corr())

    # print condition number
    if n_annot > 1:  # condition number of a column vector w/ nonzero var is trivially one
        log.log('\nLD Score Matrix Condition Number')
        cond_num = np.linalg.cond(df.ix[:, 5:])
        log.log(reg.remove_brackets(str(np.matrix(cond_num))))
        if cond_num > 10000:
            log.log('WARNING: ill-conditioned LD Score Matrix!')

    # summarize annot matrix if there is one
    if annot_matrix is not None:
        # covariance matrix
        x = pd.DataFrame(annot_matrix, columns=annot_colnames)
        log.log('\nAnnotation Correlation Matrix')
        log.log(x.corr())

        # column sums
        log.log('\nAnnotation Matrix Column Sums')
        log.log(_remove_dtype(x.sum(axis=0)))

        # row sums
        log.log('\nSummary of Annotation Matrix Row Sums')
        row_sums = x.sum(axis=1).describe()
        log.log(_remove_dtype(row_sums))

    np.seterr(divide='raise', invalid='raise')
# NOTE(review): this print references `data` before the read_csv below --
# it relies on `data` already existing from an earlier cell; verify the
# intended execution order.
print(np.unique(data['occupation']))
data=pd.read_csv('income.csv',na_values=[" ?"])  #to read ' ?' as null

#data pre-processing
data.isnull().sum()  #to get how many ' ?' are there in JobType and occupation
missing=data[data.isnull().any(axis=1)]  #any missing value in a col (axis=1 !!!)
print(missing)
data2=data.dropna(axis=0)  #dropping all rows w/ missing values as we don't know relationship b/w features
correlation=data2.corr()  #relationship b/w independant var
data2.columns  #gives col names
# Overall gender distribution (proportions).
gender=pd.crosstab(index=data2['gender'],columns='count', normalize=True)
print(gender)
# Salary status conditional on gender (row-normalized, with margins).
gender_salstat=pd.crosstab(index=data2['gender'],columns=data2['SalStat'], normalize='index', margins=True)  #given gender salstat is....
print(gender_salstat)
SalStat=sns.countplot(data2['SalStat'])  #bar chat
sns.distplot(data2['age'], bins=10, kde=False, color='red')  #histogram
sns.boxplot('SalStat','age',data=data2)  #to get relationship b/w salstat and age
data2.groupby('SalStat')['age'].median()  #to get median age for salstat categories
sns.countplot(y='JobType',hue='SalStat',data=data2)  #group bar plot
def coocurance_matrix(col1, col2):
    """Return the co-occurrence (cross-tabulation) matrix of two columns.

    Rows are the distinct values of *col1*, columns the distinct values of
    *col2*, and each cell holds the joint frequency.
    """
    return pd.crosstab(col1, col2)
import pandas as pd
import itertools

# Class names used by the emotion classifier.
emotions = ['angry', 'disgusted', 'fearful', 'happy', 'sad', 'surprise']
# lookup = {0: 'fearful', 1: 'angry', 2: 'disgusted', 3: 'neutral', 4: 'surprised', 5: 'happy'}  # , 6:'happy'}
# Map integer class ids to their human-readable names.
lookup = {0: "angry", 1: "disgusted", 2: 'fearful',3: "happy", 4: 'sad', 5: "surprise"}  # , 6:"reassured"}
# lookup = {0: 'Angry', 1:'Disgust', 2:'Fear', 3:'Happy', 4:'Neutral', 5:'Sad', 6:'Surprise'}
# lookup = {0: 'Angry', 1: 'Disgust', 2: 'happy', 3: 'neutral', 4: 'surprised', 5: 'Sad', 6: 'fearful'}
# `labelsValues` / `PredictedValues` come from the evaluation code above.
y_true = pd.Series([lookup[_] for _ in labelsValues])  # np.random.random_integers(0, 5, size=100)])
y_pred = pd.Series([lookup[_] for _ in PredictedValues])  # np.random.random_integers(0, 5, size=100)])
'''print('positive: ' + str(overallCounter))
print('total: ' + str(numIm))
print('accuracy: ' + str(acc))'''
# NOTE(review): this row-percentage crosstab is neither assigned nor printed;
# it is a no-op outside a notebook -- confirm intent.
pd.crosstab(y_true, y_pred,
            rownames=['True'],
            colnames=['Predicted']).apply(lambda r: 100.0 * r / r.sum())

import matplotlib.pyplot as plt
conf = confusion_matrix(y_true, y_pred)
#######################################################################################
lookup = {0: "angry", 1: "disgusted", 2: "fearful", 3: "happy", 4: "sad", 5: "surprise"}  # , 6:"reassured"}
####lookup = {0: "angry", 1: "disgusted", 2: "fearful", 3: "happy", 4: "sad", 5: "surprise"}
# lookup = {0: 'Angry', 1:'Disgust', 2:'Fear', 3:'Happy', 4:'Neutral', 5:'Sad', 6:'Surprise'}
# lookup = {0: 'Angry', 1: 'Disgust', 2: 'happy', 3: 'neutral', 4: 'surprised', 5: 'Sad', 6: 'fearful'}
# Rebuild the label series with the (identical) lookup table.
y_true = pd.Series([lookup[_] for _ in labelsValues])  # np.random.random_integers(0, 5, size=100)])
y_pred = pd.Series([lookup[_] for _ in PredictedValues])  # np.random.random_integers(0, 5, size=100)])
# Keep the ten top-ranked predictors (df2 is assumed to be sorted by
# importance -- TODO confirm upstream).
final_vars = df2.index[0:10]
final_pred = pred_train[final_vars]
# Refit the random forest on the reduced predictor set.
classifier_FvNF = classifier.fit(final_pred, resp_train.status_group)
print('Final predictor set estimated OOB accuracy (FUNC vs NONFUNC):',
      classifier_FvNF.oob_score_)

# Use the final fitted model to predict waterpoint operation status in the
# validation set using only the chosen predictors.
predictions_FvNF = classifier_FvNF.predict(pred_val[final_vars])

# Show a confusion matrix and accuracy score.
ct = pd.crosstab(resp_val.status_group,
                 predictions_FvNF,
                 rownames=['True'],
                 colnames=['Predicted'],
                 margins=True)
print(ct)
print('Estimated OOB accuracy from training (FUNC vs NONFUNC): ',
      classifier_FvNF.oob_score_)
print('Validation set accuracy (FUNC vs NONFUNC): ',
      skm.accuracy_score(resp_val, predictions_FvNF))

# Make a nice looking chart of the confusion matrix
colLabels = list(ct.columns.values)
rowLabels = list(ct.index)
#nrows, ncols = len(ct) + 1, len(colLabels) + 1
#hcell, wcell = 0.5, 1.
#hpad, wpad = 0, 0
# Process train and test with the same transformations.
combine = [train, test]
print( combine[0] )

#extract a title for each Name in the train and test datasets
for dataset in combine:
    # Title = the word between ', ' and '.' in the Name column.
    dataset['Title'] = dataset['Name'].str.extract(', ([A-Za-z]+)\.',
                                                   expand=False)

print( "\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n" )
print( train )
print()
# Title distribution by sex.
print( pd.crosstab(train['Title'], train['Sex'] ) )

# replace various titles with more common names
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace( ['Lady', 'Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace(['Countess', 'Sir'], 'Royal')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
import pandas as pd

# URL of the raw CSV dataset (April 2020 snapshot).
massa_dados = 'https://github.com/cleber-impacta/atividade03_python/blob/main/massa%20de%20dados/04-2020.csv?raw=true'

lista = pd.read_csv(massa_dados, sep=';')

lista

""" Show the first 5 rows """
lista.head()

"""Number of rows and columns"""
lista.shape

# Distribution of the `despacho` (dispatch) column.
lista['despacho'].value_counts().plot.bar()

lista['despacho'].unique()

lista['despacho'].value_counts()

lista['despacho'].value_counts(normalize=True)

lista.describe()

# Cross-tabulation of species vs sex.
pd.crosstab(lista['especie'], lista['sexo'])
####### The Binomial Distribution ######## """ The binomial distribution is a discrete probability distribution that models the outcomes of a given number of random trails of some experiment or event. The binomial is defined by two parameters: the probability of success in any given trial and the number of trials. The binomial distribution tells you how likely it is to achieve a given number of successes in n trials of the experiment. For example, we could model flipping a fair coin 10 times with a binomial distribution where the number of trials is set to 10 and the probability of success is set to 0.5. In this case the distribution would tell us how likely it is to get zero heads, 1 head, 2 heads and so on. """ fair_coin_flips = stats.binom.rvs( n=10, # Number of flips per trial p=0.5, # Success probability size=10000) # Number of trials print(pd.crosstab(index="counts", columns=fair_coin_flips)) pd.DataFrame(fair_coin_flips).hist(range=(-0.5, 10.5), bins=11) plt.show() """ col_0 0 1 2 3 4 5 6 7 8 9 10 row_0 counts 7 91 450 1212 1998 2435 2099 1182 422 93 11 Note that since the binomial distribution is discrete, it only takes on integer values so we can summarize binomial data with a frequency table and its distribution with a histogram. The histogram shows us that a binomial distribution with a 50% probability of success is roughly symmetric, with the most likely outcomes lying at the center. This is reminiscent of the normal distribution, but if we alter the success probability, the distribution won't be symmetric: """ biased_coin_flips = stats.binom.rvs( n=10, # Number of flips per trial
# Model 2 # Gaussian Naive Bayes sgnb = GaussianNB() sgnb.fit(X_train_count_array,y_train) ## Gaussian Model Accuracy sgnb.score(X_train_count_array,y_train) # 0.90 sgnb.score(X_test_count_array,y_test) # 0.85 # From Above we can Conclude that Multinomial Naive Bayes Model gives us best result. So we are using it for future Predication. # Prediction on Train & Test Data pred_train = smnb.predict(X_train_count) pred_test = smnb.predict(X_test_count) # Confusion matrix of Train and Test ## Train confusion_matrix_train = pd.crosstab(y_train,pred_train,rownames=['Actual'],colnames= ['Train Predictions']) sns.heatmap(confusion_matrix_train, annot = True, cmap = 'Blues',fmt='g') ## Test confusion_matrix_test = pd.crosstab(y_test,pred_test,rownames=['Actual'],colnames= ['Test Predictions']) sns.heatmap(confusion_matrix_test, annot = True, cmap = 'Reds',fmt='g') # Classification Report of test print(classification_report(y_test,pred_test)) # ---------------------------------------------------- #
# .crosstab(), short for cross_tabulation import pandas as pd import matplotlib.pyplot as plt ri2 = pd.read_csv('/Users/apple/desktop/policeActivities/dataset/ri2.csv') table = pd.crosstab( ri2['driver_race'], ri2['driver_gender']) # NOTE: frequency table in form of dataframe print(table) # check the result of frequency table asian_female = ri2[(ri2['driver_gender'] == 'F') & (ri2['driver_race'] == 'Asian')] print(asian_female.shape) table = table.loc['Asian':'Hispanic'] print(table) # create stacked bar plot # table.plot(kind='bar', stacked=True) # plt.show() # district violation # create frequency table with distric and violation all_zones = pd.crosstab(ri2['district'], ri2['violation']) print(all_zones) # slice the dataframe to get k1-k3: k_zones = all_zones.loc['Zone K1':'Zone K3'] print(k_zones)
# Replace the raw 'affairs' column with its categorical version
# (affairs_cat_cat, built earlier outside this view -- TODO confirm).
affairs = affairs.drop(['affairs'], axis=1)
affairs = pd.concat([affairs, affairs_cat_cat], axis=1)

###### Creating dummy variables for the categorical data
job_dum = pd.get_dummies(affairs.gender, drop_first=True)
df_dummies = pd.get_dummies(affairs,
                            columns=['affairs', 'gender', 'children'],
                            drop_first=True)
affairs = df_dummies

# Getting the barplot for the categorical columns (df[df.columns[0:30]])
sb.countplot(x="affairs_Yes", data=affairs, palette="hls")
pd.crosstab(affairs.affairs_Yes, affairs.gender_male).plot(kind="bar")

# Checking if we have na values or not
affairs.isnull().sum()  # No null values

#Model building: logistic regression via a statsmodels formula
import statsmodels.formula.api as sm
logit_model = sm.logit(
    'affairs_Yes~age+yearsmarried+religiousness+education+occupation+rating+gender_male+children_yes',
    data=affairs).fit()

#summary
logit_model.summary()
# In-sample predicted probabilities.
y_pred = logit_model.predict(affairs)
import pandas as pd
import matplotlib.pyplot as plt

# Location of the moon-landing mission data.
data_csv = '/Users/pankaj/dev/git/smu/smu_ml1/class3/moon/moon_landing.csv'

# Load the missions and keep every second row.
lnd_data = pd.read_csv(data_csv)
lnd_data = lnd_data.iloc[::2]

# Operator-by-outcome counts; the 'Total' margin is used only to order the bars
# and is dropped before plotting.
oper_outcom = pd.crosstab(lnd_data['Operator'], lnd_data['Outcome'],
                          margins=True, margins_name="Total")
oper_outcom = oper_outcom.sort_values('Total').drop(columns='Total')

# Horizontal stacked bars: one bar per operator, segmented by outcome.
ax = oper_outcom.plot(kind='barh', stacked=True, title='Moon Missions')
ax.set_xlabel('Number of Missions')
ax.set_ylabel('Operator')
plt.show()
def call_catscore(df, cat_col, y_lab, Top_n, thres, ytag, prt):
    """Score the categories of one categorical column against a binary target.

    For the Top_n most frequent levels of ``cat_col``, builds a table with
    renewal percentages, raw counts, a renewal ``score`` (= R_count / Tot) and
    a c1-c4 class derived from the score's mean +/- one standard deviation,
    then exports the table to ``tmpz.xlsx`` and returns it.

    Parameters
    ----------
    df : pandas.DataFrame containing ``cat_col`` and a 0/1 column ``y_lab``.
    cat_col : name of the categorical column to analyse.
    y_lab : name of the binary target column (0 = not renewed, 1 = renewed).
    Top_n : number of most-frequent categories to keep.
    thres : unused — kept for interface compatibility with existing callers.
    ytag : unused — kept for interface compatibility with existing callers.
    prt : 'Y' to print the resulting table.

    Returns
    -------
    pandas.DataFrame with counts, percentages, score and class per category.
    """
    # Absolute and percentage frequencies of the Top_n categories.
    # FIX: the original indexed the Series with a tuple slice (col_count[:Top_n, ]),
    # which is deprecated/removed in modern pandas; .head(Top_n) is equivalent.
    col_count = df[cat_col].value_counts().head(Top_n)
    col_count1 = (df[cat_col].value_counts(normalize=True) * 100).head(Top_n)
    # Share of the data volume covered by the Top_n categories (diagnostic only).
    vol_inperc = round(col_count1.sum(), 2)

    # Row-normalised crosstab: % not-renewed / renewed within each category.
    tmp = pd.crosstab(df[cat_col], df[y_lab], normalize='index') * 100
    tmp = pd.merge(col_count, tmp, left_index=True, right_index=True)
    tmp.rename(columns={0: 'NotRenwed%', 1: 'Renewed%'}, inplace=True)
    # Guard against one of the target classes being entirely absent.
    if 'NotRenwed%' not in tmp.columns:
        print("NotRenwed% is not present in ", cat_col)
        tmp['NotRenwed%'] = 0
    if 'Renewed%' not in tmp.columns:
        print("Renewed% is not present in ", cat_col)
        tmp['Renewed%'] = 0

    # Raw counts per category and target class, with the same guards.
    tmp1 = pd.crosstab(df[cat_col], df[y_lab])
    tmp1.rename(columns={0: 'NR_count', 1: 'R_count'}, inplace=True)
    if 'NR_count' not in tmp1.columns:
        print("NR_count is not present in ", cat_col)
        tmp1['NR_count'] = 0
    if 'R_count' not in tmp1.columns:
        print("R_count is not present in ", cat_col)
        tmp1['R_count'] = 0

    tmpz = pd.merge(tmp, tmp1, left_index=True, right_index=True)
    tmpz['Tot'] = tmpz['NR_count'] + tmpz['R_count']
    tmpz['Renewed%'] = round(tmpz['Renewed%'], 2)
    tmpz['Mean'] = tmpz['Renewed%'].mean()
    # Flag categories renewing below the average rate.
    tmpz['Nperformer'] = np.where(tmpz['Renewed%'] < tmpz['Mean'], 1, 0)
    tmpz['score'] = round(tmpz['R_count'] / tmpz['Tot'], 2)

    # Class boundaries: mean +/- one standard deviation of the score.
    score_mean = tmpz['score'].mean()
    score_std = tmpz['score'].std()
    th_min = tmpz['score'].min()
    th_nsd = round(score_mean - score_std, 2)
    th_mean = round(score_mean, 2)
    th_psd = round(score_mean + score_std, 2)
    th_max = tmpz['score'].max()  # NOTE(review): unused; kept from original

    def producer_clfy(row):
        # c4 (worst) .. c1 (best) by score band; scores above 1 are invalid.
        if th_min <= row['score'] < th_nsd:
            return 'c4'
        if th_nsd <= row['score'] < th_mean:
            return 'c3'
        elif th_mean <= row['score'] < th_psd:
            return 'c2'
        elif th_psd <= row['score'] <= 1:
            return 'c1'
        elif row['score'] > 1:
            return np.nan

    tmpz[f'{cat_col}_class'] = tmpz.apply(producer_clfy, axis=1)
    tmpz.reset_index(inplace=True)
    tmpz.rename(columns={cat_col: 'count'}, inplace=True)
    tmpz.rename(columns={'index': cat_col}, inplace=True)

    if prt == 'Y':
        print(tmpz)

    # Export a copy for offline inspection (requires an Excel writer engine,
    # e.g. openpyxl, to be installed).
    tmpzi = tmpz.reset_index()
    tmpzi.to_excel("tmpz.xlsx")
    return tmpz
import pandas as pd
import matplotlib.pyplot as plt

bolsas_no_exterior = pd.read_csv('bolsa-exterior.csv', sep=';')

# Frequency table of rows per 'Ano Fase'; its index supplies the year axis.
table = pd.crosstab(index=bolsas_no_exterior['Ano Fase'], columns='Count')
anos = table.index

# Total scholarships ('Qtd Bolsas') granted per year, 2006..2015.
total_de_bolsas_por_ano = [
    bolsas_no_exterior.loc[bolsas_no_exterior['Ano Fase'] == ano, 'Qtd Bolsas'].sum()
    for ano in range(2006, 2016)
]

df_data = {'anos': anos, 'bolsas': total_de_bolsas_por_ano}
df = pd.DataFrame(df_data)
print(df)

# NOTE: assumes the crosstab index covers exactly the years 2006-2015 — TODO confirm.
plt.plot(anos, total_de_bolsas_por_ano, linestyle='--', color='r',
         marker='s', linewidth=3.0)
plt.title('Bolsas por Ano ofertadas no exterior')
plt.xlabel('Anos')
plt.ylabel('Quantidade de Bolsas')
plt.show()
fitting(X, y)

# Compute ROC points. metrics.roc_curve expects (y_true, y_score, pos_label=...).
# FIX: removed a preceding call with the arguments swapped — roc_curve(probs,
# y_test, 1) — whose result was immediately overwritten by this correct call.
fpr, tpr, thresholds = metrics.roc_curve(y_test, probs, pos_label=1)

a = np.linspace(0, 1, 100)
plt.plot(fpr, tpr)
plt.plot(a, a)  # chance diagonal for reference
plt.xlabel("False Positive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity, Recall)")
plt.title("ROC plot of Interest Rate")
plt.savefig('metrics_roc_interest')
plt.show()

'''Part 2'''
grad_school = pd.read_csv('data/grad.csv')

# Admission counts by undergrad school rank (notebook-style display).
pd.crosstab(grad_school['admit'], grad_school['rank'])
grad_school['rank'].plot(kind='bar')

# Admission rate for each rank 1-4.
df = grad_school.set_index('rank')
x = []
for i in range(1, 5):
    x.append(sum(df.loc[i]['admit']) / float(len(df.loc[i]['admit'])))

plt.bar(range(1, 5), x)
plt.xlabel('Rank')
plt.ylabel('Percent of Admitted')
plt.savefig('percent_rank_bar')
plt.show()

plt.hist(grad_school['gpa'])
plt.savefig('hist_gpa')
plt.show()
# Merge the per-patient summary with the two auxiliary frames (shared index),
# rename the probability column, and write the per-patient CSV.
n_summary = pd.merge(summary, d1, left_index=True, right_index=True)
n_summary = pd.merge(n_summary, d2, left_index=True, right_index=True)
n_summary.rename(columns={"mean": "AverageProbability"}, inplace=True)
n_summary.to_csv(save_path + data_version + '_' + match_status + '_bypatient_summary_' +
                 weighted_status + '.csv')

# Prescription-by-match counts, with a 'Total' margin used for sorting.
prescription_summary = pd.crosstab(index=summary.Prescribe, columns=summary.Match,
                                   margins=True, margins_name='Total')
prescription_summary.columns = ['No Match', 'Match', 'Total']
# FIX: DataFrame.drop is not in-place; the original discarded the result, so
# the 'Total' margin row was kept and written out with the per-treatment rows.
prescription_summary = prescription_summary.drop('Total', axis=0)
prescription_summary.sort_values('Total', ascending=False, inplace=True)
prescription_summary.to_csv(save_path + data_version + '_' + match_status +
                            '_bytreatment_summary_' + weighted_status + '.csv')

# ===================================================================================
# Prescription Effectiveness
# We will show the difference in the percent of the population that survives.
# Prescription Effectiveness compares the outcome with the algorithm's suggestion versus what happened in reality
# ===================================================================================
# Bar plot of temperature per weather condition.
ax = sns.barplot(x=df1["weathercondition"], y=df1["temp"])
ax.set_xticklabels(["clear", "mist", "light snow"])
# its clearly visible that season and temperature have a linear trend. temperatures increase at every season before dropping down in the winter season.

# feature selection

# In[436]:
from scipy.stats import chi2_contingency
from scipy.stats import f_oneway

# In[179]:
# Chi-square test of independence between the two calendar flags,
# using their contingency table.
chi2, p, dof, ex = chi2_contingency(
    pd.crosstab(df1["holiday"], df1["workingday"]))
print(p)

# In[437]:
# NOTE(review): f_oneway is given two *columns* (totalcount, weekday) as if they
# were two groups of one variable — confirm this is the intended test.
f_oneway(df1.totalcount, df1.weekday)
# the p value is less than 0.05. so it means that both the variables are dependent on each other. hence we will remove one of the two predictors for the model.

# In[92]:
df1 = df1.drop(["date"], axis=1)

# In[ ]:
# date column is not required for building the model, so we remove it.
# #
# * There are just over 40K customer records and 20 features for each customer.
# * The features are mixed--some numeric, some categorical.
# * The data appears to be sorted, at least by `time` and `contact`.
#
# ### Exploration
# Let's explore the data. First, let's understand how the features are distributed.

# In[6]:

# Frequency tables for each categorical feature: % of observations per level.
for column in data.select_dtypes(include=['object']).columns:
    display(
        pd.crosstab(index=data[column], columns='% observations', normalize='columns') * 100)

# In[7]:

# Histograms for each numeric feature
display(data.describe())
# FIX: np.object was deprecated in NumPy 1.20 and removed in 1.24; the builtin
# `object` selects the same (object-dtype) columns.
display(data.describe(include=object))

#%matplotlib inline
hist = data.hist(bins=30, sharey=True, figsize=(10, 10))

# Notice that:
#
# - Almost 90% of the values for our target variable y are "no", so most customers did not subscribe to a term deposit.
# - Many of the predictive features take on values of "unknown". Some are more common than others. We should think carefully as to what causes a value of "unknown" (are these customers non-representative in some way?) and how to handle that.
# - Even if "unknown" is included as its own distinct category, what does it mean, given that those observations likely fall within one of the other categories of that feature?
def significance_test(self, field1:str, field2:str, method:str="spearman", verbose=True) -> pd.Series:
    """
    Execute a statistical test as follows

    - Both fields are categorical => chi-square test
    - Both fields are continuous  => correlation (Spearman or Pearson)
    - Otherwise                   => one-way ANOVA on ranks (Kruskal-Wallis)

    :param field1: field to compare
    :param field2: field to compare
    :param method: "spearman" (default) or "pearson"
    :param verbose: if warnings are shown
    :return: Series with index: field1, field2, test, statistic, pval
    :raises ValueError: if a field is in neither the categorical nor the continuous set
    """
    cats = self.get_cats()  # names of categorical fields
    cons = self.get_cons()  # names of continuous fields

    if field1 in cats and field2 in cats:
        #### chi2-test
        test = "chi-square test"
        contingency_table = pd.crosstab(self.data[field1], self.data[field2])

        # The chi-square approximation degrades with sparsely populated cells.
        # NOTE(review): the classical rule of thumb concerns *expected* counts
        # < 5; this checks the observed counts — confirm which is intended.
        if verbose and (contingency_table < 5).sum().sum() > 0:
            # FIX: message typo "contigency" -> "contingency"
            print("The contingency table (%s vs %s) contains too small cell(s)." % (field1, field2))
            print("Consult the documentation of stats.chi2_contingency")

        statistic, pval, dof, exp = stats.chi2_contingency(contingency_table)

    elif field1 in cons and field2 in cons:
        #### correlation
        if method == "spearman":
            test = "Spearman correlation"
            cor = stats.spearmanr
        else:
            # FIX: label typo "Peason" -> "Pearson"
            test = "Pearson correlation"
            cor = stats.pearsonr

        statistic, pval = cor(self.data[field1], self.data[field2])

    else:
        #### one-way ANOVA on ranks
        test = "one-way ANOVA on ranks"

        # Orient the pair as (categorical, continuous).
        if field1 in cats and field2 in cons:
            cat, con = field1, field2
        elif field1 in cons and field2 in cats:
            cat, con = field2, field1
        else:
            raise ValueError("You gave a wrong field.")

        vals = self.data[cat].unique()
        samples = [self.data.loc[self.data[cat] == v, con] for v in vals]

        # Tiny groups make the test unreliable; drop them (with a warning).
        if verbose and any([len(s) < 5 for s in samples]):
            # FIX: message typo "withe" -> "with"
            print("The groups with less than 5 samples will be ignored.")

        samples = [x for x in samples if len(x) >= 5]
        statistic, pval = stats.kruskal(*samples)

    s = pd.Series([field1, field2, test, statistic, pval],
                  index=["field1", "field2", "test", "statistic", "pval"])
    return s
def main():
    """Verify age and gender balance along the groups from the MIRIAD dataset."""
    # ----------------------------------------------------------------------------------------
    # Input/output paths; PROJECT_ROOT and load_dataset come from the surrounding module.
    dataset_name = 'MIRIAD'
    participants_path = PROJECT_ROOT / 'data' / dataset_name / 'participants.tsv'
    freesurfer_path = PROJECT_ROOT / 'data' / dataset_name / 'freesurferData.csv'
    outputs_dir = PROJECT_ROOT / 'outputs'
    ids_path = outputs_dir / (dataset_name + '_cleaned_ids.csv')

    dataset_df = load_dataset(participants_path, ids_path, freesurfer_path)
    # Keep only diagnoses 1 and 17 (printed below as 'HC' and 'AD' respectively).
    dataset_df = dataset_df[dataset_df['Diagn'].isin([1, 17])]
    dataset_df = dataset_df.reset_index(drop=True)
    dataset_df = dataset_df.set_index('participant_id')

    # ----------------------------------------------------------------------------------------
    print('Analysing {:}'.format(dataset_name))
    print('Total of participants = {:}'.format(len(dataset_df)))
    print('')
    print('Number of participants per diagnosis')
    print(dataset_df.groupby('Diagn')['Image_ID'].count())
    print('')

    # Gender x diagnosis contingency table, reused by the chi2 tests below.
    contingency_table = pd.crosstab(dataset_df.Gender, dataset_df.Diagn)
    print('Contigency table of gender x diagnosis')
    print(contingency_table)
    print('')

    def print_age_stats(dataset_df):
        # Print mean±std [ceil(min), ceil(max)] of age for each diagnosis group.
        hc_age = dataset_df[dataset_df['Diagn'] == 1].Age.values
        ad_age = dataset_df[dataset_df['Diagn'] == 17].Age.values

        print('Age per diagnosis')
        print('HC = {:.1f}±{:.1f} [{:d}, {:d}]'.format(hc_age.mean(), hc_age.std(), math.ceil(hc_age.min()),
                                                       math.ceil(hc_age.max())))
        print('AD = {:.1f}±{:.1f} [{:d}, {:d}]'.format(ad_age.mean(), ad_age.std(), math.ceil(ad_age.min()),
                                                       math.ceil(ad_age.max())))
        print('')

    print_age_stats(dataset_df)

    # ----------------------------------------------------------------------------------------
    # Gender analysis
    print('------------- GENDER ANALYSIS ----------------')

    def print_gender_analysis(contingency_table):
        # Chi-square tests on the gender x diagnosis table (no Yates correction).
        _, p_value, _, _ = chi2_contingency(contingency_table[[1, 17]], correction=False)
        print('Gender - HC vs AD p value {:.4f}'.format(p_value))

        _, p_value, _, _ = chi2_contingency(contingency_table, correction=False)
        print('Gender - TOTAL p value {:.4f}'.format(p_value))
        print('')

    print_gender_analysis(contingency_table)

    # ----------------------------------------------------------------------------------------
    # Age analysis
    print('------------- AGE ANALYSIS ----------------')
    print_age_stats(dataset_df)

    def print_age_analysis(dataset_df):
        # t-test and one-way ANOVA of age between the two diagnosis groups.
        hc_age = dataset_df[dataset_df['Diagn'] == 1].Age.values
        ad_age = dataset_df[dataset_df['Diagn'] == 17].Age.values

        _, p_value = ttest_ind(hc_age, ad_age)
        print('Age - HC vs AD p value {:.4f}'.format(p_value))

        print('Age - TOTAL p value {:.4f}'.format(
            f_oneway(hc_age, ad_age).pvalue))
        print()
        print('')

    print_age_analysis(dataset_df)

    # ----------------------------------------------------------------------------------------
    # Final dataset: re-print the analyses and persist the selected image IDs.
    print('------------- FINAL DATASET ----------------')
    print_gender_analysis(contingency_table)
    print_age_stats(dataset_df)
    print_age_analysis(dataset_df)

    dataset_df[['Image_ID']].to_csv(outputs_dir / (dataset_name + '_homogeneous_ids.csv'), index=False)
# Targets for the train (newdf) and test (newdf_test) splits built earlier.
lab = newdf['label']
lab1 = newdf_test['label']

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=0)

# Train and time the fit.
t0 = time()
clf.fit(features, lab)
tt = time() - t0
print("Classifier trained in {} seconds".format(round(tt, 3)))

# Predict on the held-out features and time it.
t0 = time()
pred = clf.predict(features1)
tt = time() - t0
print("Predicted in {} seconds".format(round(tt, 3)))

from sklearn.metrics import accuracy_score

# FIX: accuracy_score's signature is (y_true, y_pred); the original passed
# (pred, lab1). Accuracy is symmetric so the value is unchanged, but the
# canonical order avoids confusion (and matters for asymmetric metrics).
acc = accuracy_score(lab1, pred)
print("Accuracy is {}.".format(round(acc, 4)))

# Confusion matrix: actual vs predicted attack classes.
print(
    pd.crosstab(lab1, pred, rownames=['Actual attacks'],
                colnames=['Predicted attacks']))

#Features selected : ['duration', 'src_bytes', 'dst_bytes', 'logged_in', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'Protocol_type_icmp', 'Protocol_type_tcp', 'service_eco_i', 'service_http', 'service_other', 'flag_OTH', 'flag_REJ', 'flag_RSTOS0', 'flag_RSTR', 'flag_S0', 'flag_SH']
#Classifier trained in 7.215 seconds
#Predicted in 0.493 seconds
#Accuracy is 0.9861.
#Predicted attacks    Probe  non-Probe
#Actual attacks
#Probe                 1835        847
#non-Probe              224      74385
# Free the per-chunk user sets; they are no longer needed once the
# per-chunk crosstabs below are built.
del user_set1
del user_set2
del user_set3
del user_set4
del user_set5
del user_set6
del user_set7
del user_set8
# del users
# del up_df
gc.collect()

print('creating subsets..')
# One crosstab per chunk: rows = user_id, columns = product_pair, values = counts.
# uint32 keeps the memory footprint down; each result rebinds the chunk name so
# the raw chunk frame becomes collectable.
up_df1 = pd.crosstab(up_df1.user_id, up_df1.product_pair).astype('uint32')
up_df2 = pd.crosstab(up_df2.user_id, up_df2.product_pair).astype('uint32')
up_df3 = pd.crosstab(up_df3.user_id, up_df3.product_pair).astype('uint32')
up_df4 = pd.crosstab(up_df4.user_id, up_df4.product_pair).astype('uint32')
up_df5 = pd.crosstab(up_df5.user_id, up_df5.product_pair).astype('uint32')
up_df6 = pd.crosstab(up_df6.user_id, up_df6.product_pair).astype('uint32')
up_df7 = pd.crosstab(up_df7.user_id, up_df7.product_pair).astype('uint32')
up_df8 = pd.crosstab(up_df8.user_id, up_df8.product_pair).astype('uint32')

print('merging into one crosstab')
# Stack the chunk crosstabs vertically.
# NOTE(review): assumes user_ids are disjoint across chunks — TODO confirm;
# overlapping users would produce duplicate index rows after concat.
user_prodpair = pd.concat(
    [up_df1, up_df2, up_df3, up_df4, up_df5, up_df6, up_df7, up_df8])

print('user-product pair')
#print(user_prodpair.head())
print(user_prodpair.shape)
# In[ ]: for dataset in combine: dataset['Age_null'] = data_raw['Age'].isnull() * 1 age_survive = sns.barplot(x='Age_null', y='Survived', data=data_raw) age_survive = plt.xlabel('Age is Missing') age_survive = plt.ylabel('Survived') plt.show() # In[ ]: # Complete missing age Values. we noticed that in the name, there is also title information, this can be helpful to predict the age for dataset in combine: dataset['Title'] = dataset.Name.str.extract('([A-Za-z]+)\.', expand=False) pd.crosstab(data_raw['Title'], data_raw['Sex']) # In[ ]: # Group different title (especially rare titles) into common groups which are more closely related to the age for dataset in combine: dataset['Title'] = dataset['Title'].replace([ 'Capt', 'Col', 'Sir', 'Don', 'Dr', 'Major', 'Rev', 'Jonkheer', 'Rev', 'Countess', 'Lady', 'Dona' ], 'Rare') # dataset['Title'] = dataset['Title'].replace(['Countess', 'Lady', 'Dona'], 'Rare_F') dataset['Title'] = dataset['Title'].replace(['Mlle', 'Ms'], 'Miss') dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs') dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1 dataset['IsAlone'] = 1 #initialize to yes/1 is alone dataset['IsAlone'].loc[
# In[17]: # Normalized Survival Rates for over 18 train.Survived[train.Child == 0].value_counts(normalize=True) # ## 透视表(pivotTab) # 透视表就是将指定原有DataFrame的列分别作为行索引和列索引,然后对指定的列应用聚集函数(默认情况下式mean函数)。 # # ## 列联表(crossTab) # 交叉表是用于统计分组频率的特殊透视表 # # Compute a simple cross tabulation of two (or more) factors. By default computes a frequency table of the factors unless an array of values and an aggregation function are passed. # In[4]: pd.crosstab(train['Sex'], train['Survived'], margins=True) # In[20]: pd.crosstab(train['Sex'], train['Survived'], margins=True, normalize='index') # In[19]: pd.crosstab(train['Sex'], [train['Survived'], train['Pclass']], margins=True) # In[22]: pd.crosstab(train['Sex'], [train['Survived'], train['Pclass']], normalize='index') # In[26]: