def _distance_matrix_to_header( distance_matrix: pd.DataFrame) -> pd.DataFrame: """Converts a distance matrix to a header. Args: distance_matrix: as index the columns of the vector numeric data, as columns the blm names and contains the "distance" between each. Returns: The header, a list of blm names. """ return distance_matrix.idxmin(axis=1).tolist()
def align_dfq(dfq: pd.DataFrame) -> pd.DataFrame:
    """Perform time (x axis) and frequency (y axis) shifting to align QMB pulses.

    Args:
        dfq: one column per pulse. A column named 'pulse0' must exist; it is
            used as the reference for both shifts.
            NOTE(review): the code reads as if the index is the time axis and
            the values are frequencies - confirm against the caller.

    Returns:
        A new DataFrame with one column per column of ``dfq``, re-indexed on
        the shifted axis.
    """
    # Index label at which the reference pulse reaches its minimum.
    tmin = dfq['pulse0'].idxmin()
    # Per column: offset between the reference minimum and that column's minimum.
    tshift = tmin - dfq.idxmin()
    # Column whose offset is the smallest.
    refPulse = tshift.idxmin()
    # Original index moved by the offset of that column.
    taxis = dfq.index + tshift[refPulse]
    # Debug output of the chosen column and the shifted axis.
    print(refPulse, taxis)
    # Start the result from the reference pulse, then swap in the shifted axis.
    dfqShift = pd.DataFrame(dfq['pulse0'])
    dfqShift.index = list(taxis)
    # NOTE(review): `[0]` is a label lookup on an integer index and a
    # positional lookup otherwise - presumably "first sample"; confirm.
    fref = dfq['pulse0'][0]
    for pul in dfq.columns:
        # Vertical offset that makes this pulse start at the reference value.
        fshift = fref - dfq[pul][0]
        # The new Series carries a default 0..n-1 index, so this assignment is
        # aligned by label against dfqShift's shifted index.
        # NOTE(review): labels that do not match become NaN - confirm intent.
        dfqShift[pul] = pd.Series(dfq[pul].values + fshift)
        # Shift the column by minus the label of its minimum.
        # assumes idxmin() returns an integer number of periods - TODO confirm
        dfqShift[pul] = dfqShift[pul].shift(-dfq[pul].idxmin())
    return dfqShift
import pandas as pd
# Two rows, three columns, with one NaN in each row.
arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])
dframe1 = DataFrame(arr, index=['A', 'B'], columns=['One', 'Two', 'Three'])
dframe1
# Sum method
dframe1.sum()  # column sums; NaN values are skipped
dframe1.sum(axis=1)  # sum across rows
# Min method
dframe1.min()  # finds the minimum value in each column
dframe1.min(axis=1)  # minimum value of each row
dframe1.idxmin()  # index label of the minimum value in each column
# Max method
dframe1.max()
dframe1.idxmax()
# Cumulative sum
dframe1.cumsum()  # accumulates down each column
# Describe method
dframe1.describe()  # summary statistics of the DataFrame (by columns)
# correlation and covariance
# Alternative import kept from the original: import pandas_datareader.data as pdweb
import pandas.io.data as pdweb
import datetime
ser2.rank() # sort put a series in order of it's item ranks ##### Summary arr = np.array([[1,2,np.nan],[np.nan,3,4]]) arr df1 = DataFrame(arr,index = ['a','b'],columns = ['one','two','three']) df1 df1.sum() #default axis is 0, and Pandas will ignore the nan values df1.sum(axis=1) #by row df1 df1.min() df1.min(axis=1) df1.idxmin() df1.idxmin(axis=1) df1.cumsum() # accumulation sum ### unique() and value_count() methods for factor variables ser1 = Series(['w','w','x','y','z','w','x','y','x','a']) ser1 ser1.unique() ser1.value_counts() ###describe method df1.describe() # similar to the summary() method in R will provide summary stat. ###covariance matrices and some visulaization import pandas.io.data as pdweb
# Cast the 'Chinese' column to 64-bit integers.
df2['Chinese'] = df2['Chinese'].astype(np.int64)
# Apply `plus` to every row, passing 1 and 2 as extra positional arguments.
df = df2.apply(plus, axis=1, args=[1, 2])
print(df)
# Common statistical functions
print('-' * 35 + '常用的统计函数' + '-' * 35)
# count() counts entries; null values are not counted
print(df2.count())
print(df2.count(axis=1))
# describe(): outputs several summary statistics at once
print(df2.describe())
print(df2.min())
print(score.idxmin())
print(score.std())
print(df2.describe(percentiles=[0.9]))
# Merging data tables
print('-' * 35 + '数据表合并' + '-' * 35)
df1 = DataFrame({ 'name': ['ZhangFei', 'GuanYu', 'a', 'b', 'c'], 'data': range(5), 'data1': range(0, 9, 2) })
df2 = DataFrame({ 'name': ['ZhangFei', 'GuanYu', 'A', 'B', 'C'], 'data': range(5), 'data2': range(5) })
######################################################################
# SUMMARY STATISTICS
######################################################################
# Returns the sum of each column, ignoring NaN
dframe.sum()
# Returns the sum of each row
dframe.sum(axis=1)
# Minimum value of each column
dframe.min()
# Index label of the minimum value of each column
dframe.idxmin()
# Cumulative sum down each column
dframe.cumsum()
# describe() creates summary statistics for each column
# (fixed: the method name was misspelled `descirbe`, which raises AttributeError)
dframe.describe()  # count, mean, std, min, ....
# Covariance and Correlation
import pandas.io.data as pdweb
import datetime
# Getting the stock data from the internet; keeps only the adjusted close
prices = pdweb.get_data_yahoo(['CVX', 'XOM', 'BP'],
                              start=datetime.datetime(2010, 1, 1),
                              end=datetime.datetime(2013, 1, 1))['Adj Close']
from pandas import DataFrame

# Three observations (rows) of three attributes (columns).
data2 = {
    'Speed': [101, 109, 106],
    'Temp': [34, 23, 42],
    'Humidity': [45, 23, 58],
}
frame2 = DataFrame(data2)

# One print call, one report per line, in the same order as before:
# the frame, column sums, row sums, row label of each column's maximum,
# row label of each column's minimum, and each column's maximum value.
print(
    frame2,
    frame2.sum(),
    frame2.sum(axis=1),
    frame2.idxmax(),
    frame2.idxmin(),
    frame2.max(),
    sep='\n',
)
print(df.drop(1), '\n') # 1행지우기 print(df.dropna(), '\n') #nan값을 지운다. print(df.dropna(how='any'), '\n') # nan값이 하나라도 있으면 지운다 print(df.dropna(how='all'), '\n') # 모든행의 값이 nan 이면 지운다 print(df.dropna(subset=['one']), '\n') # 특정열에 nan 이 있으면 그행을 제거한다. print(df.fillna(0), '\n') # 평균으로 채우기 sklearn 모듈의 SimpleInputer # 기술적 통계와 관련된 함수 print('**' * 10) print(df.sum(), '\n') #열단위의 합 nan은 제외 print(df.sum(axis=0), '\n') print(df.sum(axis=1), '\n') # 행단위의 합 print(df.mean(axis=1), '\n') # 행의 평균 print(df.mean(axis=1, skipna=True), '\n') # na포함 계산 print(df.mean(axis=1, skipna=False), '\n') # na 있을시 계산 x print(df.mean(axis=0, skipna=True), '\n') # nan이 있어도 계산 o (열단위) print(df.mean(axis=0, skipna=False), '\n') #nan이 있기 때문에 계산 x print(df.max(), '\n') print(df.max(axis=0), '\n') #열값중 가장 큰값 print(df.idxmax(), '\n') print(df.idxmin(), '\n') print(df.describe(), '\n') # 요약 통계망 print(df.info(), '\n') # 데이터프레임 구조 words = Series(['봄', '여름']) print(words.describe(), '\n')
# Ranking variants of a Series: descending, default, and ties broken by
# order of appearance.
obj.rank(ascending=False)
obj.rank()
obj.rank(method='first')
frame3 = DataFrame({
    'b': [4, 7, 3, 2],
    'a': [4, 9, 2, 5],
    'c': [5, 3, 7, np.nan]
})
frame3.rank(axis=1)  # rank within each row
frame3.sum()
frame3.mean()
frame3.sum(skipna=False)  # a column containing NaN sums to NaN
frame3.idxmax()  # index label holding the maximum of each column
frame3.idxmin()
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
ubiques = obj.unique()
# Fixed: the call parentheses were missing, so the statement only referenced
# the bound method and never counted anything.
obj.value_counts()
obj.value_counts(sort=False)
mask = obj.isin(['b', 'c'])
obj[mask]  # keeps only the entries where the mask is True
obj[obj.isin(['b', 'c'])]
frame4 = DataFrame({
    'X': ['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'],
    'Y': ['f', 'g', 'd', 'g', 'h', 'e', 'd', 'h', 'f'],
    'Z': ['a', 'e', 'd', 'g', 'd', 'e', 'q', 'b', 'c']
})
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# 2 x 3 frame with one missing value in each row.
arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])
df1 = DataFrame(arr, index=['A', 'B'], columns=['One', 'Two', 'Three'])

# Sums: the frame itself, then per-column and per-row totals.
print(df1, df1.sum(), df1.sum(axis=1), sep='\n')

# Minima: per-column value, per-row value, then the row label at which each
# column reaches its minimum.
print(df1.min(), df1.min(axis=1), df1.idxmin(), sep='\n')

# Summary statistics for every column.
print(df1.describe())
Summarizing and Computing Descriptive Statistics """ from pandas import Series, DataFrame import pandas as pd import numpy as np df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=['a', 'b', 'c', 'd'], columns=['one', 'two']) df.sum() df.sum(axis=1) #sum along axis=1, columns df.mean(axis=1, skipna=False) df.idxmax() df.idxmin(axis=1) df.cumsum() df.describe() obj = Series(['a', 'a', 'b', 'c'] * 4) obj.describe() df['three'] = ['a','b','c','a'] df.describe() df['three'].describe() """ Correlation and Covariance """ import pandas.io.data as web all_data = {} for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']: all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
class KMeansPlusPlus:
    """Iterative clustering of DataFrame rows with distance-weighted seeding.

    The distance between a row and a center is the number of coordinates in
    which they differ (a count of non-zero differences), not a Euclidean
    distance. After every update the centers are truncated to integers.

    Attributes:
        data_frame: the input DataFrame (m rows).
        numRows: number of rows m.
        columns: the columns used for clustering.
        centers: DataFrame with one row per center, or None before seeding.
        index: row positions picked as initial centers. The first entry is an
            integer, later entries are one-element lists (kept as-is for
            backward compatibility).
        distance_matrix: m x k DataFrame; entry (i, j) is the distance from
            row i to center j.
        clusters: Series of length m with values 0..k-1, indexed like
            ``data_frame``.
        previous_clusters: copy of ``clusters`` from the previous iteration.
    """

    def __init__(self, data_frame, k, columns=None, max_iterations=None,
                 appended_column_name=None):
        """Validate the arguments and initialise an unfitted instance.

        Args:
            data_frame: non-empty pandas DataFrame to cluster.
            k: positive integer, the number of clusters.
            columns: optional iterable of column names to cluster on; every
                name must exist, be numeric and contain no NaN. Defaults to
                all columns (which are then not validated).
            max_iterations: optional positive cap on the number of update
                iterations.
            appended_column_name: if given, the cluster labels are written to
                this column of ``data_frame`` after clustering.

        Raises:
            Exception: when any argument fails validation.
        """
        if not isinstance(data_frame, DataFrame):
            raise Exception("data_frame argument is not a pandas DataFrame")
        elif data_frame.empty:
            raise Exception("The given data frame is empty")

        if max_iterations is not None and max_iterations <= 0:
            raise Exception("max_iterations must be positive!")

        if not isinstance(k, Integral) or k <= 0:
            raise Exception("The value of k must be a positive integer")

        self.data_frame = data_frame  # m x n
        self.numRows = data_frame.shape[0]  # m

        # k x n, the i,j entry being the jth coordinate of center i
        self.centers = None
        self.index = []

        # m x k, the i,j entry is the distance from point i to center j
        self.distance_matrix = None

        # Series of length m, consisting of integers 0,1,...,k-1
        self.clusters = None

        # To keep track of clusters in the previous iteration
        self.previous_clusters = None

        self.max_iterations = max_iterations
        self.appended_column_name = appended_column_name
        self.k = k

        if columns is None:
            self.columns = data_frame.columns
        else:
            for col in columns:
                if col not in data_frame.columns:
                    raise Exception(
                        "Column '%s' not found in the given DataFrame" % col)
                if not self._is_numeric(col):
                    raise Exception(
                        "The column '%s' is either not numeric or contains NaN values" % col)
            self.columns = columns

    def _is_numeric(self, col):
        """Return True when column ``col`` has a numeric dtype and no NaN.

        This helper was called by ``__init__`` but missing from the class, so
        passing ``columns`` always failed with AttributeError.
        """
        values = self.data_frame[col]
        try:
            numeric = np.issubdtype(values.dtype, np.number)
        except TypeError:
            # dtypes NumPy cannot interpret are treated as non-numeric
            numeric = False
        return bool(numeric) and not values.isnull().any()

    def _points(self):
        """Return the part of the data frame restricted to the clustering columns."""
        return self.data_frame[self.columns]

    def populate_initial_centers(self):
        """Pick k initial centers and store them in ``self.centers``.

        The first center is a uniformly random row. Every further center is
        drawn with probability proportional to the row's distance to its
        nearest already-chosen center.
        """
        points = self._points()
        rows = [self._grab_random_point()]

        while len(rows) < self.k:
            # Flattened to 1-D so random.choices always receives scalar weights.
            distances = np.asarray(
                self.get_dist_list(points, rows), dtype=float).ravel()
            total = distances.sum()
            # If every row coincides with a chosen center all weights are zero,
            # which random.choices rejects; fall back to a uniform draw.
            weights = (distances / total).tolist() if total > 0 else None
            index = random.choices(range(self.numRows), weights=weights, k=1)
            self.index.append(index)
            centroid = points.iloc[index, :].values  # shape (1, n)
            rows.append(centroid.reshape(centroid.shape[1]))

        self.centers = DataFrame(rows, columns=self.columns)

    def _grab_random_point(self):
        """Return the values of one uniformly chosen row and record its position."""
        # randint's upper bound is exclusive; this replaces the deprecated
        # np.random.random_integers(0, numRows - 1) with the same range.
        index = np.random.randint(0, self.numRows)
        self.index.append(index)
        return self._points().iloc[index, :].values

    def get_dist_single(self, x, y):
        """Return the number of coordinates in which sample x and center y differ."""
        return np.sum(np.array([x - y]) != 0)

    def get_dist_point(self, X, Y):
        """Return an (m, 1) array of distances from every row of X to point Y."""
        differences = np.asarray(X) - np.asarray(Y)
        return np.sum(differences != 0, axis=1).reshape(-1, 1)

    def get_dist_list(self, X, Y):
        """Return, for every row of X, its distance to the nearest point in Y."""
        per_point = [self.get_dist_point(X, point) for point in Y]
        return np.concatenate(per_point, axis=1).min(axis=1)

    def compute_distances(self):
        """Fill ``self.distance_matrix`` with row-to-center distances.

        Raises:
            Exception: if the centers have not been initialised.
        """
        if self.centers is None:
            raise Exception("Centers not initialized!")

        # Only the clustering columns take part; comparing against the whole
        # frame would count every unrelated column as a difference.
        points = self._points().values
        centers = self.centers[self.columns].values
        per_center = [
            np.sum((points - centers[i]) != 0, axis=1) for i in range(self.k)
        ]
        # float dtype matches what the previous implementation produced
        dis_mat = np.column_stack(per_center).astype(float)
        self.distance_matrix = DataFrame(
            dis_mat, columns=list(range(self.k)))

    def get_clusters(self):
        """Assign every row to its closest center and store it in ``self.clusters``.

        Raises:
            Exception: if distances have not been computed yet.
        """
        if self.distance_matrix is None:
            raise Exception(
                "Must compute distances before closest centers can be calculated")

        # .values drops the 0..m-1 index so the labels line up with the
        # original frame's index.
        self.clusters = Series(self.distance_matrix.idxmin(axis=1).values,
                               index=self.data_frame.index)

    def _mean_centers(self):
        """Return a k x n DataFrame of per-cluster column means (NaN for empty clusters)."""
        points = self._points()
        means = [
            points.loc[(self.clusters == i).values].mean().values
            for i in range(self.k)
        ]
        return DataFrame(np.vstack(means), columns=self.columns)

    def compute_new_centers(self):
        """Replace the centers by the integer-truncated mean of each cluster.

        Raises:
            Exception: if centers or clusters have not been computed.
            ValueError: if a cluster is still empty after the fallback below.
        """
        if self.centers is None:
            raise Exception("Centers not initialized!")

        if self.clusters is None:
            raise Exception("Clusters not computed!")

        try:
            self.centers = self._mean_centers().astype(int)
        except ValueError:
            # An empty cluster has NaN means, which cannot be cast to int.
            # NOTE(review): the fallback below is unchanged from the original
            # and is data-specific: it takes rows whose distances to all
            # centers sum to exactly 9, moves a random tenth of them into
            # cluster 2, and retries. Both constants look tied to one
            # particular data set / k - confirm before reusing elsewhere.
            index = (np.sum(self.distance_matrix, axis=1)
                     == 9).values.nonzero()[0]
            ind = np.random.choice(index, int(len(index) / 10))
            # `ind` holds row positions, hence iloc
            self.clusters.iloc[ind] = 2
            self.centers = self._mean_centers().astype(int)

    def cluster(self):
        """Run seeding and iterate until the labels stop changing.

        Stops early when ``max_iterations`` is reached. If
        ``appended_column_name`` is set, the labels are written to that
        column of the data frame; a failure to do so only emits warnings.
        """
        self.populate_initial_centers()
        self.compute_distances()
        self.get_clusters()

        counter = 0
        while True:
            counter += 1
            self.previous_clusters = self.clusters.copy()

            self.compute_new_centers()
            self.compute_distances()
            self.get_clusters()

            if self.max_iterations is not None and counter >= self.max_iterations:
                break
            elif (self.clusters == self.previous_clusters).all():
                break

        if self.appended_column_name is not None:
            try:
                self.data_frame[self.appended_column_name] = self.clusters
            except Exception:
                warnings.warn(
                    "Unable to append a column named %s to your data." %
                    self.appended_column_name)
                warnings.warn(
                    "However, the clusters are available via the cluster attribute")
__author__ = 'Executor'
import numpy as np
import pandas as pa
from pandas import Series, DataFrame
# 2 x 3 frame with one NaN in each row.
arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])
dframe1 = DataFrame(arr, index=['A', 'B'], columns=['One', 'Two', 'Three'])
print(dframe1.sum())  # per-column sums
print(dframe1.sum(axis=1))  # per-row sums
print(dframe1.min())  # per-column minimum
print(dframe1)
print(dframe1.idxmin())  # row label of each column's minimum
print(dframe1)
print(dframe1.cumsum())  # cumulative sum down each column
print(dframe1.describe())  # summary statistics per column
from IPython.display import YouTubeVideo
# The two calls below only build video objects from the given ids.
YouTubeVideo('xGbpuFNR1ME')
YouTubeVideo('4EXNedimDMs')
''' stupid thing doesn't work!'''
#This will show the minimum values of the column print "Minimun" print df1.min() #This will show the maximum values of the column print "Maximum" print df1.max() #This will show the minimum and maximum values of the index # It does not show the value but shows the index/ row of the values with the min and max values print "Max of index" print df1.idxmax() print "Min of index" print df1.idxmin() # Cumilitive Sums leave the first index as it is # and adds the second index with the first indexx to get the outcome print "Cumulitive Sum" print df1.cumsum() # Describe function helps with oral sets such as: # count, mean, standard deviation, minimum, mpercentages (25%, 50%, 75%) and max print "Describe Function" print df1.describe() #this dataframe looks at random numbers with a 3*3 grid # it has an index of 123 and column ABC df2 = DataFrame(randn(9).reshape(3, 3), index=[1, 2, 3], columns=list('ABC')) print "New DataFrame"
print(df.dropna(how='all')) #모두 결측치인 행 삭제 print(df.dropna(subset=['1st']))#칼럼명이 1st열에서 NaN이 있으면 그 행을 삭제 print(df.fillna(0)) #NaN을 0으로 채우기, 평균으로 채우기는 sklearn 모듈의 SimpleInputer를 이용 ''' 기술적 통계와 관련된 함수 axis=1은 행, axis=0은 열 ''' print(df.sum()) #NaN은 연산에서 제외, 열의 합 (같은의미 =>df.sum(axis=0)) print(df.sum(axis=1)) #행의 합, NaN끼리의 연산은 0으로 처리 print(df.mean(axis=1)) #행의 평균 (같은 의미 =>df.mean(axis=1, skipna=True), Na빼고 구하려면 skipna를 False로 처리 #최대값 print(df.max()) # => axis=0 print(df.idxmax()) #최대값을 가진 인덱스를 반환 print(df.idxmin()) #최소값도 동일 #요약통계량 print(df.describe()) ''' 1st 2nd count 3.000000 2.000000 mean 2.966667 -2.750000 std 3.521837 2.474874 min 0.500000 -4.500000 25% 0.950000 -3.625000 50% 1.400000 -2.750000 75% 4.200000 -1.875000 max 7.000000 -1.000000 '''
# In[209]: df.sum() # In[220]: df.sum(axis=1) # In[221]: df.mean(axis=1, skipna=False) # In[223]: df.idxmin() # In[224]: df.idxmax() # In[226]: df.cumsum() # In[227]: df.describe() # In[229]:
def find_min(df: pd.DataFrame) -> tuple:
    """Return the first three values of the row whose third column is minimal.

    Columns are addressed by position, so the function works for integer
    and for string column labels alike. (The previous ``series[2]`` /
    ``row[0]`` lookups relied on the deprecated positional fallback of ``[]``
    and broke on string-labelled columns.)

    Args:
        df: frame with at least three columns.
            NOTE(review): the variable names suggest the columns are
            (r, theta, E) - confirm against the caller.

    Returns:
        ``(r_min, theta_min, E_min)``: the values of the first, second and
        third column in the row where the third column is smallest.
    """
    # Index label of the smallest entry in the third column; the int() cast
    # is kept from the original, so an integer-like row index is expected.
    idx_min = int(df.iloc[:, 2].idxmin())
    min_row = df.loc[idx_min, :]
    r_min, theta_min, E_min = min_row.iloc[0], min_row.iloc[1], min_row.iloc[2]
    return r_min, theta_min, E_min
print(df1) #sum operations print(df1.sum()) #sum along the rows print(df1.sum(axis=1)) print(df1.min()) print(df1.max()) print('----------') print(df1.idxmax()) print(df1.idxmin()) print('----------') print(df1.cumsum()) print('----------') print(df1.describe()) print('----------') df2 = DataFrame(randn(9).reshape(3, 3), index=[1, 2, 3], columns=list('ABC')) print(df2) print('----------') plt.plot(df2) plt.legend(df2.columns, loc="lower right") plt.savefig("first graph in python")
import pandas as pd
# Two rows, three columns, with one NaN in each row.
arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])
dframe1 = DataFrame(arr, index=["A", "B"], columns=["One", "Two", "Three"])
dframe1
# Sum method
dframe1.sum()  # column sums; NaN values are skipped
dframe1.sum(axis=1)  # sum across rows
# Min method
dframe1.min()  # finds the minimum value in each column
dframe1.min(axis=1)  # minimum value of each row
dframe1.idxmin()  # index label of the minimum value in each column
# Max method
dframe1.max()
dframe1.idxmax()
# Cumulative sum
dframe1.cumsum()  # accumulates down each column
# Describe method
dframe1.describe()  # summary statistics of the DataFrame (by columns)
# correlation and covariance
# Alternative import kept from the original: import pandas_datareader.data as pdweb
import pandas.io.data as pdweb
# Sorting a Series by index label and by value.
ser1 = Series(range(3), index=['C', 'A', 'B'])
ser1
ser1.sort_index()
ser1.sort_values()
ser2 = Series(randn(10))
ser2
# 2 x 3 frame with one NaN in each row.
arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])
arr
dframe1 = DataFrame(arr, index=['A', 'B'], columns=['one', 'two', 'three'])
dframe1
dframe1.sum(axis=1)  # per-row sums
dframe1.min(axis=1)  # per-row minimum
dframe1.idxmin(axis=0)  # row label of each column's minimum
dframe1
dframe1.cumsum()
# NOTE(review): `dframe` is not defined in this fragment - possibly a typo
# for `dframe1`; confirm against the rest of the file.
dframe.cumsum()
# Download the price history of three tickers and keep the adjusted close.
prices = pdweb.get_data_yahoo(['CVX', 'XOM', 'BP'], start=datetime.datetime(2010, 1, 1), end=datetime.datetime(2013, 1, 1))['Adj Close']
prices.head()
# Same request, keeping the traded volume instead.
valume = pdweb.get_data_yahoo(['CVX', 'XOM', 'BP'], start=datetime.datetime(2010, 1, 1), end=datetime.datetime(2013, 1, 1))['Volume']
valume.head()
# Percentage change between consecutive rows of the prices.
rets = prices.pct_change()
# ### Summary Statistics # In[449]: df = DataFrame(np.arange(16).reshape(4, 4), columns=list('ABCD'), index=list('PQRS')) df.loc['P', 'D'] = np.nan df.loc['R', 'A'] = np.nan df df.sum() # col df.sum(axis=1) # row df.min() # col df.idxmin() # col df.min(axis=1) # row df.idxmin(axis=1) #row df.cumsum() # In[452]: # describe df = DataFrame(np.arange(16).reshape(4, 4), columns=list('ABCD'), index=list('PQRS')) df.loc['P', 'D'] = np.nan df.loc['R', 'A'] = np.nan
import pandas as pd
import numpy as np
# Disabled experiment (kept for reference): normalising data read from Excel.
# datafile = 'D:/新建 Microsoft Office Excel 工作表.xlsx'
# data = pd.read_excel(datafile,header=None)
# min = (data-data.min())/(data.max()-data.min())
# zero = (data - data.mean())/data.std()
# float = data/10**np.ceil(np.log10(data.abs().max())) # decimal-scaling normalisation
# print("original data:\n",data)
# print('--------------------')
# print('data after min-max normalisation:\n',min)
from pandas import Series, DataFrame
# 4 x 3 frame of standard-normal random numbers.
df = DataFrame(np.random.randn(4, 3), index=list('abcd'), columns=['frist', 'second', 'third'])
print(df)
print(df.describe())  # summary statistics
print(df.sum())  # per-column sums
print(df.sum(axis=1))  # per-row sums
print('-----------')
# Row label of each column's max and min, then column label of each row's min.
print(df.idxmax(), df.idxmin(), df.idxmin(axis=1))
print(df.cumsum())  # cumulative sum down each column
print(df.var())  # variance per column
print(df.std())  # standard deviation per column
print(df.pct_change())  # percentage change between consecutive rows
print(df.cov())  # covariance matrix of the columns
print(df.corr())  # correlation matrix of the columns
index=['a', 'b', 'c', 'd'], columns=['one', 'two']) print(df) # sum() : 각 컬럼의 합을 더해서 Series 객체를 반환 print(df.sum()) print(df.sum(axis=1)) # 각 행의 합을 반환 # 전체 행이나 컬럼의 값이 NA가 아니라면 NA 값은 제외시키고 계산을 하는데 # skipna 옵션은 전체 행이나 컬럼의 값이 NA가 아니라도 제외시키지 않을 수 있다. # skipna의 기본값은 True print(df.sum(axis=1, skipna=False)) # idxmin, idxmax와 같은 메서드는 최소, 최대값을 가지고 있는 색인 값 같은 간접 통계를 반환한다. print(df.idxmax()) print(df.idxmin()) # 누산 메서드 : cumsum() print(df.cumsum()) # unique() : 중복된 값을 하나로 묶음 s1 = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c']) print(s1.unique()) # value_counts() : 값의 수를 계산(도수, 카운팅), 반환값은 Series 객체 print(s1.value_counts()) # 결과값이 내림차순으로 출력됨. # isin() : 어떤 값이 Series에 있는지 나타내는 메서드 ## boolean type(True, False)을 반환한다. mask = s1.isin(['b', 'c']) print(mask)
frame[0:3] frame.loc['a':'d', 'STL':] frame.iloc[0:3, 1:2] frame['UMST'] = 4 frame.reindex(index=['c', 'e', 'a'], columns=['UM', 'Washu']) frame[frame < 0] = np.nan frame.isnull() frame.dropna() frame.dropna(axis=1) um = frame['UM'] um[um.notnull()] frame.fillna(method='ffill', axis=0, limit=1, inplace=False) frame.fillna(method='ffill', axis=1, limit=1) frame.mean() frame.mean(axis=1, skipna=False) frame.idxmin() frame.idxmax(axis=1) frame2 = DataFrame( { 'Washu': np.random.randn(5), 'UM': np.random.randn(5), 'UMST': np.random.randn(5) }, index=list('abcde')) frame3 = DataFrame({ 'a': { 'Washu': 1, 'UM': 3 }, 'b': { 'Washu': 2,
from pandas import DataFrame

# Three observations (rows) of three attributes (columns).
data = {
    'Speed': [101, 109, 106],
    'Temp': [34, 32, 45],
    'Humidity': [4500, 2300, 5800],
}
frame = DataFrame(data)

# One print call, one report per line, in the same order as before:
# the frame, the sum of every column, the sum of every row, the row label of
# each column's maximum and the row label of each column's minimum.
print(
    frame,
    frame.sum(),
    frame.sum(axis=1),
    frame.idxmax(),
    frame.idxmin(),
    sep='\n',
)
print(stu_result)
# We first reviewed the basic DataFrame operations;
# now we have a mini student transcript.
# Print the sum of each column.
# Fixed: `.ix` has been removed from pandas; `.loc` performs the same
# label-based selection of all rows and the two named columns.
stu_pure_result = stu_result.loc[:, ['math', 'physics']]
stu_pure_result = DataFrame(stu_pure_result)
print(stu_pure_result.sum())
print(stu_pure_result.sum(axis=1))
# Index labels of the maximum and minimum of each column
print(stu_pure_result.idxmax())
print(stu_pure_result.idxmin())
# Cumulative values
print(stu_pure_result.cumsum())
print(stu_pure_result.cumprod())
print(stu_pure_result.cummax())
print(stu_pure_result.cummin())
# Produce a whole set of descriptive statistics in one call
print(stu_pure_result.describe())
# Series has a describe() method as well
history = Series([97, 99, 89, 79], index=range(4))
print(history.describe())
# Note: describe() is most useful for numeric data; for other data types the
# result carries little meaning.
df1.sum() #calculating the sum of individual rows df1.sum(axis=1) #here axis =1 represents the horizontal axis #calculating the maximum values for each individual columns #results will in the form of displayed index df1.idxmax() #similarly for minimum values for each individual columns df1.idxmin() #fundamental operations on DataFrames like addition,subtraction etc dic2 = { "cse": [10, 13, 11], "maths": [11, 14, 17], "english": [5, 7, 9], "ece": [11, 13, 15] } df2 = DataFrame(dic2) ##adding df1+df2 df1 + df2
# Error table: one row per cross-validation fold (1..10) and one column
# 'H<j>' per column j-1 of `errors`.
Fails = {'Cv-Folds': list(range(1, 11))}
Fails.update({'H{}'.format(j + 1): errors[:, j] for j in range(10)})
Errordf = DataFrame(
    Fails,
    columns=['Cv-Folds'] + ['H{}'.format(j + 1) for j in range(10)],
)

# Show the whole table without row/column truncation.
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None):
    print(Errordf)

# Smallest value of every column and the row label at which it occurs.
# NOTE(review): idxmin(axis=0) yields the row (fold) position, which is then
# reported under 'hidden units' - confirm this is the intended quantity.
minValues = Errordf.min()
minIndex = Errordf.idxmin(axis=0)
WubWub = dict([('hidden units', minIndex + 1), ('E-test', minValues)])
endData = DataFrame(WubWub, columns=['hidden units', 'E-test'])
print(endData)
def main(): """ Calculation and aggregation of summary statistics """ # Summary of statistics # return is not ndarray df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=list('abcd'), columns=['one', 'two']) print df print df.sum() print df.sum(axis=1) print df.mean(axis=1) # exclude nan print df.mean(axis=1, skipna=False) print df.idxmin() print df.idxmax() print df.cumsum() print df.describe() # values are not number obj = Series(list('aabc') * 4) print obj.describe() methods = ['count', 'min', 'max', # 'argmin', 'argmax', 'quantile', 'median', 'mad', 'var', 'std', 'skew', 'kurt', 'cummin', 'cummax', 'cumprod', 'diff', 'pct_change'] for method in methods: print u'「{0}」'.format(method) print getattr(df, method)() print '' # Correspond and Covariance all_data = {} lst = [] # ['AAPL', 'IBM', 'MSFT'] #, 'GOOG']: for ticket in lst: #, 'GOOG']: # IOError: after 3 tries, Yahoo! did not return a 200 # for url 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv' all_data[ticket] = pd.io.data.get_data_yahoo(ticket, '1/1/2000', '1/1/2010') price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.iteritems()}) volume = DataFrame({tic: data['Volume'] for tic, data in all_data.iteritems()}) if all_data: returns = price.pct_change() print returns.tail() print '' print returns.MSFT.corr(returns.IBM) print returns.MSFT.cov(returns.IBM) print '' print returns.corr() print returns.cov() print '' print returns.corrwith(returns.IBM) print returns.corrwith(volume) # unique, frequency, belong print '','' obj = Series(list('cadaabbcc')) uniques = obj.unique() print uniques print obj.value_counts() print pd.value_counts(obj.values, sort=False) mask = obj.isin(['b', 'c']) print mask print obj[mask] data = DataFrame({ 'Qu1' : [1,3,4,3,4], 'Qu2' : [2,3,1,2,3], 'Qu3' : [1,5,2,4,4], }) print data print data.apply(pd.value_counts).fillna(0)
# 2 x 3 frame with one NaN in each row.
arr = np.array([[1,2,np.nan],[np.nan,3,4]])
dframe1 = DataFrame(arr,columns=["One","Two","Three"],index=["A","B"] )
dframe1
# Let's see the sum() method in action
dframe1.sum()
# Notice how it ignores NaN values
dframe1.sum(axis=1)
# Can also grab min and max values of the DataFrame
dframe1.min()
# As well as their index labels
dframe1.idxmin()
dframe1.idxmax()
dframe1.max()
dframe1
# Can also do a cumulative sum
dframe1.cumsum()
# A very useful feature is describe, which provides summary statistics
describe=dframe1.describe()
# We can also get information on correlation and covariance
# For more info on correlation and covariance, check out the videos below!
#Lecture 22 Summary Statistics import numpy as np import pandas as pd import IPython from pandas import Series, DataFrame arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]]) dframe1 = DataFrame(arr, index=['A', 'B'], columns=['one', 'two', 'three']) print(dframe1) print(dframe1.sum()) # sums values across each column print( dframe1.sum(axis=0) ) # sums the value across each row. for Row use axis =1 , for column use axis =0 print(dframe1.min()) # returns min value in column print(dframe1.max()) # returns max value in column print(dframe1.idxmin()) # returns index of the min value in column print( dframe1.cumsum()) #accumulation row wise cumulative summin across column. print(dframe1.describe() ) #summary statiscs for data frame . Min , Max , ount , percentile # from IPython.display import YouTubeVideo # YouTubeVideo('xGbpuFNR1ME') # YouTubeVideo('4EXNedimDMs') from pandas_datareader import data #allow us to get some information from the web import datetime # Library for date input import matplotlib.pyplot as plt import seaborn as sns #%matplotlib inline prices = data.get_data_yahoo( ['CVX', 'XOM', 'BP'], start=datetime.datetime(2010, 1, 1), end=datetime.datetime(2013, 1,
A 3.0 B 7.0 dtype: float64 ''' # min print(dframe1.min()) ''' One 1.0 Two 2.0 Three 4.0 dtype: float64 ''' # index of min value print(dframe1.idxmin()) ''' One A Two A Three B dtype: object ''' # acumulation print(dframe1.cumsum()) ''' One Two Three A 1.0 2.0 NaN B NaN 5.0 4.0 '''
df
# Two equivalent ways to get a boolean mask of missing values.
pd.isnull(df)
df.isnull()
df.sum()
df.sum(axis = 1)
df.mean()
df.mean(skipna = False)
df.mean(axis = 1)
df.mean(axis = 1, skipna = False)
np.mean(df, axis = 1)
df.idxmax()  # per column: index label of the maximum (top scorer per subject)
df.idxmin()  # per column: index label of the minimum (lowest scorer per subject)
df.cumsum()  # cumulative sum down the rows
df.cumsum(axis = 1)  # cumulative sum across the columns
# Statistics of a single column.
df['영어'].sum()
df['영어'].mean()
df['영어'].var()
df['영어'].std()
df['영어'].max()
df['영어'].min()
# Statistics of a single row, selected by index label.
df.loc['홍길동'].sum()
df.loc['박찬호'].mean()
df.describe()
frame2
# Sort the columns by label, descending.
frame2.sort_index(axis=1, ascending=False)
# Fixed: the index was the 2-element nested list ['S', ['p', 'o', 'u']],
# which cannot label four values; a flat list of four labels is used instead.
series2 = Series([100, 200, 500, 50], index=['S', 'p', 'o', 'u'])
series2
series2.sort_values()
# Fixed: `sort_value` is not a DataFrame method; the method is `sort_values`.
frame2.sort_values(by='Humidity')
# check for duplicate index labels
# NOTE(review): `series` is not defined in this fragment - possibly meant
# `series2`; confirm against the rest of the file.
series.index.is_unique
# sum
frame2.sum()
frame2.sum(axis=1)
frame2.idxmax()
frame2.idxmin()
# removing NaN
from pandas import Series
import numpy as np
ser = Series([1, 2, 3, 4, np.nan], index=['a', 'b', 'c', 'd', 'e'])
ser
ser = ser.dropna()
ser
frame2 = frame2.dropna()
# fill NaN values with a constant (0, then 100); the results are not stored
frame2.fillna(0)
frame2.fillna(100)
# loading data from file
import pandas
data_frame = pandas.read_csv("PMTCT.csv")
data_frame
def descriptiveStatsDataFrame():
    """Print a small NaN-containing DataFrame followed by labelled summaries.

    The summaries are, in order: column sums, row sums, row sums without
    skipping NaN, the index label of each column's minimum, and the output of
    ``describe()``. Nothing is returned.
    """
    frame = DataFrame(
        [[1.4, np.nan], [7, 5], [np.nan, np.nan], [7, 10]],
        index=['a', 'b', 'c', 'd'],
        columns=['one', 'two'],
    )
    print(frame)

    # (message template, value) pairs, printed in this order
    reports = (
        ('Column Sum: \n{}', frame.sum(axis=0)),
        ('Row Sum: \n{}', frame.sum(axis=1)),
        ('Do not skip NA: \n{}', frame.sum(axis=1, skipna=False)),
        ('Index with min Value: \n{}', frame.idxmin()),
        ('Summary Statistic: \n{}', frame.describe()),
    )
    for template, value in reports:
        print(template.format(value))