def skew_report(dataframe: pd.DataFrame, threshold: int = 5) -> None:
    """Print the numeric columns of *dataframe* whose absolute skew exceeds *threshold*.

    Args:
        dataframe: Data to inspect.
        threshold: Absolute-skew cutoff above which a column is flagged.
    """
    # Use the skew Series' own index so column names stay aligned with their
    # skew values.  The previous implementation zipped dataframe.columns
    # against skew(numeric_only=True); with any non-numeric column present,
    # the skews (numeric columns only) mispair with the full column list and
    # the wrong names get reported.
    skew = dataframe.skew(numeric_only=True).abs()
    highly_skewed = list(skew.index[skew > threshold])
    print(f"There are {len(highly_skewed)} highly skewed data columns.")
    if highly_skewed:
        print("Please check them for miscoded na's")
        print(highly_skewed)
def test_skew(self):
    """Check Series/DataFrame skew against scipy and its small-N corner cases."""
    from scipy.stats import skew

    string_series = tm.makeStringSeries().rename("series")
    self._check_stat_op("skew", lambda x: skew(x, bias=False), string_series)

    # Corner cases: skew() yields NaN unless there are at least 3 values.
    min_N = 3
    for n in range(1, min_N + 1):
        ones_series = Series(np.ones(n))
        ones_frame = DataFrame(np.ones((n, n)))
        if n < min_N:
            assert np.isnan(ones_series.skew())
            assert np.isnan(ones_frame.skew()).all()
        else:
            assert ones_series.skew() == 0
            assert (ones_frame.skew() == 0).all()
def test_skew(self):
    """Verify skew() agrees with scipy and handles fewer-than-3 observations."""
    from scipy.stats import skew

    string_series = tm.makeStringSeries().rename('series')

    def alternative(values):
        return skew(values, bias=False)

    self._check_stat_op('skew', alternative, string_series)

    # skew() returns NaN until at least three observations are available.
    minimum = 3
    for count in range(1, minimum + 1):
        series_of_ones = Series(np.ones(count))
        frame_of_ones = DataFrame(np.ones((count, count)))
        if count >= minimum:
            assert series_of_ones.skew() == 0
            assert (frame_of_ones.skew() == 0).all()
        else:
            assert np.isnan(series_of_ones.skew())
            assert np.isnan(frame_of_ones.skew()).all()
def moments_features(path):
    """Compute the first three color moments of the image at *path*.

    For each of the B, G, R channels the mean, standard deviation and
    skewness are appended, giving a 9-element list.  Returns None (after
    logging an error) when the file does not exist.
    """
    if not os.path.exists(path):
        # Lazy %-formatting avoids building the message when the level is
        # disabled; message grammar fixed ("is not exist" -> "does not exist").
        logger.error("%s does not exist!", path)
        return None
    im = cv2.imread(path)
    [b, g, r] = cv2.split(im)
    moments = []
    for channel in (b, g, r):
        df = DataFrame(np.array(channel.flatten()))
        moments.extend(float(x) for x in (df.mean()[0], df.std()[0], df.skew()[0]))
    return moments
def myconnect(self):
    """Connect to MySQL with the form credentials, then prompt for a table
    name and display its summary statistics in the text browsers.

    On any connection or query failure an error message is shown instead of
    raising (best-effort GUI behavior preserved).
    """
    host = str(self.host_le.text())
    user = str(self.user_le.text())
    password = str(self.password_le.text())
    database = str(self.db_le.text())
    try:
        mcon = MySQLdb.connect(host=host, user=user,
                               passwd=password, db=database)
        self.browser.setText("[*] Welcome, connection successful.")
        text, ok = QInputDialog.getText(self, "Table Name", "Enter table name:")
        if ok and text:
            tb_name = str(text)
            # SECURITY: a table name cannot be a bound query parameter, so
            # restrict the user-supplied name to a plain identifier before
            # interpolating it into the SQL string (prevents SQL injection).
            if not tb_name.replace("_", "").isalnum():
                mcon.close()
                self.browser.setText(
                    "[*] Ensure that the table name is correct and try again.")
                return
            try:
                sq_tb = pis.read_sql('select * from ' + ' %s ' % tb_name, mcon)
                df = DataFrame(sq_tb)
                mcon.close()
                stat_description = df.describe()
                # Assemble the report line-by-line instead of one giant
                # string concatenation ("Covarriance" typo fixed).
                sections = [
                    "Size: %s " % len(df),
                    "Statistics:",
                    " %s " % stat_description,
                    "Kurt:",
                    "%s" % df.kurt(),
                    "Skew:",
                    "%s" % df.skew(),
                    "Covariance:",
                    "%s" % df.cov(),
                    "Correlation:",
                    "%s" % df.corr(),
                    "Summation:",
                    "%s" % stat_description.sum(),
                    "Head:",
                    "%s" % df.head(),
                    "Tail:",
                    "%s" % df.tail(),
                ]
                self.browser1.setText("\n".join(sections))
                self.browser.setText(str(stat_description))
                self.host_le.clear()
                self.user_le.clear()
                self.password_le.clear()
                self.db_le.clear()
            # "except Exception, e" was Python-2-only syntax (SyntaxError on
            # Python 3); the bound exception was unused, so drop it.
            except Exception:
                self.browser.setText(
                    "[*] Ensure that the table name is correct and try again.")
        else:
            # Close the connection when the dialog is cancelled (was leaked).
            mcon.close()
    except Exception:
        self.browser.setText("Please specify correct connection details and try again")
def getstastv(a, apiset):
    """Build the per-API statistics table and dispersion measures.

    Rows of *a* become replicate columns RP1..RPn; each entry of *apiset*
    becomes a row of the returned ``sta`` frame.  ``stv`` holds, per API,
    the range, variance, skewness, kurtosis and coefficient of variation
    computed across the replicates.
    """
    rp_labels = ['RP{}'.format(k + 1) for k in range(a.shape[0])]
    sta = DataFrame(a, index=rp_labels, columns=apiset).T
    stv = DataFrame(index=sta.index)
    stv['range'] = sta.max(axis=1) - sta.min(axis=1)
    stv['var'] = sta.var(axis=1)
    stv['skew'] = sta.skew(axis=1)
    stv['kurt'] = sta.kurt(axis=1)
    # Coefficient of variation: std normalized by the mean.
    stv['cv'] = sta.std(axis=1) / sta.mean(axis=1)
    return sta, stv
def extractFeatures(self, data: DataFrame, columns):
    """Assemble the per-column feature vector for *data*.

    Concatenates mean (A), covariance (C), kurtosis (K), skewness (S) and
    FFT-derived (F) features — 36 columns * 5 = 180 values.
    """
    # Mean A, covariance C, kurtosis K, skewness S.
    mean_part = np.array(data.apply(np.average, axis=0))
    cov_part = np.array(list(data.apply(np.cov, axis=0).values))
    # DataFrame.kurt() / DataFrame.skew() give per-column kurtosis/skewness.
    kurt_part = np.array(data.kurt(axis=0))
    skew_part = np.array(data.skew(axis=0))
    fft_part = np.array(self.fft_T_function(data, columns))
    return np.concatenate((mean_part, cov_part, kurt_part, skew_part, fft_part))
def moments_features(path):
    """Return [mean, std, skew] for each B, G, R channel of the image at
    *path* (9 floats), or None when the file is missing.
    """
    if not os.path.exists(path):
        # Lazy logging args and corrected grammar ("is not exist").
        logger.error("%s does not exist!", path)
        return None
    image = cv2.imread(path)
    channels = cv2.split(image)  # B, G, R
    moments = []
    for channel in channels:
        frame = DataFrame(np.array(channel.flatten()))
        moments += [float(frame.mean()[0]),
                    float(frame.std()[0]),
                    float(frame.skew()[0])]
    return moments
def _feature_extraction(data: pd.DataFrame) -> pd.Series: def nlargest_index(df, n): return df.nlargest(n).index.unique()[0:n] # first 225 statistical features statistical = data.min() statistical = statistical.append(data.max(), ignore_index=True) statistical = statistical.append(data.mean(), ignore_index=True) statistical = statistical.append(data.skew(), ignore_index=True) statistical = statistical.append(data.kurtosis(), ignore_index=True) # FFT features fft = pd.DataFrame(np.fft.fft(data)) fft_angle = fft.applymap(np.angle) fft = fft.applymap(np.abs) largest_values = pd.Series() largest_angles = pd.Series() largest_indices = pd.Series() for i in range(0, 45): five_largest_idx = nlargest_index(fft.ix[:, i].map(abs), 5) # is map(abs) redundant? largest_indices = largest_indices.append(pd.Series(five_largest_idx), ignore_index=True) five_largest = fft_angle.ix[five_largest_idx, i].T largest_angles = largest_angles.append(five_largest) five_largest = fft.ix[five_largest_idx, i].T largest_values = largest_values.append(five_largest) # Autocorrelation autocorrelation = pd.Series() autocorrelation = autocorrelation.append(data.apply(lambda col: col.autocorr(1), axis=0)) for i in range(5, 51, 5): autocorrelation = autocorrelation.append(data.apply(lambda col: col.autocorr(i), axis=0)) # Make result feature_vector = pd.Series() feature_vector = feature_vector.append(statistical) feature_vector = feature_vector.append(largest_values) feature_vector = feature_vector.append(largest_angles) feature_vector = feature_vector.append(largest_indices) feature_vector = feature_vector.append(autocorrelation) return feature_vector
def filedata(filename):
    """Parse an accelerometer log file into a feature matrix.

    Each non-comment line carries '*'-separated samples of comma-separated
    x,y,z accelerations in its last whitespace-separated field.  Per line
    the features are: skewness, kurtosis, mean, mean-crossing rate,
    standard deviation and normalized spectral energy (each per axis);
    lines whose first field contains 'ACT' also append an activity label.
    The collected rows are reduced via avg_data() — defined elsewhere in
    this module; presumably it averages/aggregates rows (TODO confirm).
    """
    f=open(filename,'r')
    eigen_list = []
    Act_fact = []  # NOTE(review): never used within this function
    for line in f.readlines():
        line=line.strip().split(" ")
        if '#' in line[0]:
            # Comment line in the log: skip (prints a blank marker).
            print(' ')
        else:
            AccXYZ =[[],[],[]]
            # Samples are '*'-separated; each sample is "x,y,z".
            Acc = line[-1].split("*")
            for Accxyz in Acc:
                Accxyz=Accxyz.split(",")
                if len(Accxyz) == 3:
                    AccXYZ[0].append(float(Accxyz[0]))
                    AccXYZ[1].append(float(Accxyz[1]))
                    AccXYZ[2].append(float(Accxyz[2]))
            AccXYZ = np.array(AccXYZ)
            num = float(AccXYZ.shape[1])  # sample count on this line
            df = DataFrame(AccXYZ)
            skew = df.skew(1)  # skewness per axis
            kurt = df.kurt(1)  # kurtosis per axis
            mean = AccXYZ.mean(1)  # mean per axis
            std = AccXYZ.std(1)  # standard deviation per axis
            fft = np.fft.fft(AccXYZ)  # Fourier transform per axis
            pass_mean = passmean(AccXYZ,mean)/num  # normalized mean-crossing count
            eigen_choose1 = np.append(np.array(skew),np.array(kurt))
            eigen_choose2= np.append(mean,pass_mean)
            # Spectral energy: mean squared FFT magnitude per axis.
            eigen_choose3= np.append(std,pow(abs(fft),2).sum(1)/num)
            eigen_choose12 = np.append(eigen_choose1,eigen_choose2)
            eigen_choose = np.append(eigen_choose12,eigen_choose3)
            if 'ACT' in line[0] :
                # 'ACT' lines carry the activity label as the last
                # comma-separated value of the data field.
                Act = line[-1].split(",")
                eigen_choose = np.append(eigen_choose,int(Act[-1]))
            eigen_list.append(list(eigen_choose))
    f.close()
    eigen_fact_matrix = avg_data(eigen_list)
    return eigen_fact_matrix
def transform_features(X_train: pd.DataFrame, X_valid: pd.DataFrame,
                       X_test: pd.DataFrame, parameters: dict) -> list:
    """Apply log transformations to skewed features.

    Args:
        X_train: training data.
        X_valid: validation data.
        X_test: test data.
        parameters: config dict; uses ``features.transformation`` keys
            ``skew_threshold``, ``inclusions``, ``exclusions`` and
            ``drop_vars``.

    Returns:
        A list containing the transformed training, validation and test
        data.  The input frames are left unmodified.
    """
    log = logging.getLogger(__name__)

    paras = parameters["features"]["transformation"]
    threshold = paras["skew_threshold"]  # skew above this gets log-transformed
    inclusions = paras["inclusions"]  # substrings a variable must contain
    exclusions = paras["exclusions"]  # substrings that exclude a variable

    # Select highly skewed variables.
    feature_skew = X_train.skew().sort_values(ascending=False)
    skewed = list(feature_skew[feature_skew > threshold].index)
    log.info(
        blue("{} feature(s) exceed skew threshold of {}.".format(
            len(skewed), threshold)))

    if inclusions:
        n_skewed = len(skewed)
        # Loop variable renamed from ``str`` — it shadowed the builtin.
        skewed = [
            var for var in skewed
            if any(token in var for token in inclusions)
        ]
        n_excluded = n_skewed - len(skewed)
        log.warning(
            red("Including {} variable(s) containing:\n{}.".format(
                n_excluded, inclusions)))
        pause()

    if exclusions:
        n_skewed = len(skewed)
        skewed = [
            var for var in skewed
            if all(token not in var for token in exclusions)
        ]
        n_excluded = n_skewed - len(skewed)
        log.warning(
            red("Excluding {} variable(s) containing:\n{}.".format(
                n_excluded, exclusions)))
        pause()

    # Check variables are correct.
    log.warning(
        red("Transforming {} variable(s): {}.".format(len(skewed), skewed)))
    pause()

    # Apply log transformation to skewed variables.  Work on copies: the
    # previous plain assignment aliased the inputs, so adding/dropping
    # columns silently mutated the caller's DataFrames.
    log.info(
        blue("Applying log transformation to {} feature(s).".format(
            len(skewed))))
    log_X_train = X_train.copy()
    log_X_valid = X_valid.copy()
    log_X_test = X_test.copy()
    for var in skewed:
        var_name = "log_" + var
        log_X_train[var_name] = log_offset(log_X_train[var])
        log_X_valid[var_name] = log_offset(log_X_valid[var])
        log_X_test[var_name] = log_offset(log_X_test[var])
        if paras["drop_vars"]:
            log_X_train = log_X_train.drop(var, axis=1)
            log_X_valid = log_X_valid.drop(var, axis=1)
            log_X_test = log_X_test.drop(var, axis=1)

    # Reorder columns; valid/test are aligned to the train column order.
    log_X_train = log_X_train.reindex(sorted(log_X_train.columns), axis=1)
    log_X_valid = log_X_valid.reindex(sorted(log_X_train.columns), axis=1)
    log_X_test = log_X_test.reindex(sorted(log_X_train.columns), axis=1)

    return [log_X_train, log_X_valid, log_X_test]
del yhat #del path1 #del path2 # Making Data for RMSE statistics data_frame.to_csv('%s/after_RMSE.csv' % (directory_name3)) data = read_csv('%s/after_RMSE.csv' % (directory_name3), header=0, index_col=0) data = DataFrame(data) s = data.sum() ave = data.mean() median = data.median() var = data.var() std = data.std() skew = data.skew() kurt = data.kurt() frame_name = ['sum', 'ave', 'median', 'var', 'std', 'skew', 'kurt'] frame_name = DataFrame(frame_name) s = s.values ave = ave.values median = median.values var = var.values std = std.values skew = skew.values kurt = kurt.values data_numpy = [s, ave, median, var, std, skew, kurt] data = DataFrame(data_numpy)
# In[227]: df.describe() # In[229]: obj = Series(['a', 'a', 'b', 'c'] * 4) obj # In[230]: obj.describe() # In[231]: df.skew() # In[233]: from pandas_datareader import data as web all_data = {} for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']: all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010') # In[237]: price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.items()}) volume = DataFrame({tic: data['Volume'] for tic, data in all_data.items()}) # In[239]: