def boxplot_all(self, prop, show=None, fig=None):
    """Draw a box plot of ``prop`` with one box per condition.

    Collects ``get_class_value(log, prop)`` for every log returned by
    ``self.all_logs(filter_5=True)``, skips logs whose value is ``None``,
    and groups the remaining values by ``log.condition``.  The boxes are
    ordered as returned by ``EcoSonic_VP.condition_order_from_vp_id(1)``.

    Args:
        prop: Property name handed to ``get_class_value`` and to ``boxplot``.
        show: Forwarded unchanged to ``boxplot``.
        fig: Forwarded unchanged to ``boxplot``.
    """
    values = {}
    for log in self.all_logs(filter_5=True):
        val = get_class_value(log, prop)
        if val is not None:
            values.setdefault(log.condition, []).append(val)

    conditions = EcoSonic_VP.condition_order_from_vp_id(1)
    # A condition for which no log produced a value gets an empty group
    # instead of aborting the whole plot with a KeyError.
    data = [values.get(cond, []) for cond in conditions]
    boxplot(data, conditions, prop, 'condition', show, fig)
def main():
    """Parse the ping log, optionally restrict it to a time frame, and plot it.

    When ``DOWNLOAD`` is truthy the log file is downloaded first.  The parsed
    pings are then filtered through ``statistics.createFrame`` depending on
    the command-line arguments in ``sys.argv``:

      * ``BEGIN``      -> ``createFrame(real_pings, BEGIN)``
                          (plot everything starting from that date)
      * ``-1 END``     -> ``createFrame(real_pings, None, END)``
      * ``BEGIN END``  -> ``createFrame(real_pings, BEGIN, END)``

    Any other argument count leaves the pings unfiltered.  Finally all plots
    are generated and shown.
    """
    if DOWNLOAD:
        print("-- Downloading fresh log file from the server...")
        progression_run(downloader.download, (LOG_NAME,), 0.1)

    # Every message below is a single pre-formatted string passed to
    # print(...), which emits the same text under Python 2 and Python 3.
    # This one must not end the line, so it is written and flushed directly.
    sys.stdout.write("-- Parsing Log File...")
    sys.stdout.flush()
    pings = dict()
    progression_run(parsing, (LOG_NAME, pings), 0.2)
    real_pings = pings['real']
    empty_pings = pings['empty']
    print("-- There are %s real pings :)" % len(real_pings))
    print("-- There are %s empty pings %s"
          % (len(empty_pings), ":(" if len(empty_pings) > 0 else ""))

    argc = len(sys.argv)
    if argc == 2:
        # 1 argv: plot all starting from date in argv
        real_pings = statistics.createFrame(real_pings, sys.argv[1])
    elif argc == 3 and sys.argv[1] == '-1':
        # '-1' as first argument: no begin bound, only the end is given
        real_pings = statistics.createFrame(real_pings, None, sys.argv[2])
    elif argc == 3:
        begin, end = sys.argv[1], sys.argv[2]
        real_pings = statistics.createFrame(real_pings, begin, end)

    print("-- Generating Plots...")
    plots.pointoverTime(real_pings)
    plots.pointoverPingNum(real_pings)
    plots.statisticalPlot(real_pings)
    plots.boxplot(real_pings, statistics.segDataHours, 'hours')
    plots.boxplot(real_pings, statistics.segDataWeekdays, 'weekdays')
    plots.boxplot(real_pings, statistics.segDataMonths, 'months')
    plt.show()
# Save every diagnostic plot of ``df`` under .\png\plots\<plot kind>\,
# using the dataset name as the file name.
plots.stemleaf(
    df,
    title='Stem and Leaf',
    save=True,
    savepath='.\\png\\plots\\stemleaf\\' + datasetname + '.txt',
)
plots.histogram(
    df,
    save=True,
    savepath='.\\png\\plots\\histogram\\' + datasetname + '.png',
    close=True,
)
plots.boxplot(
    df,
    save=True,
    savepath='.\\png\\plots\\boxplot\\' + datasetname + '.png',
    close=True,
)
plots.scattermatrix(
    df,
    save=True,
    savepath='.\\png\\plots\\scattermatrix\\' + datasetname + '.png',
    close=True,
)
plots.heatmap(
    df,
    save=True,
    savepath='.\\png\\plots\\heatmap\\' + datasetname + '.png',
    close=True,
)
import numpy as np

# Load the training set and take a first look at it.  The bare expressions
# below only display something when run interactively.
df = pd.read_csv("train.csv")
df.describe()
df.head()
df.columns  # 7 col are num, 5 are cat
df.isnull().sum()  # age and cabin missing values
df.dtypes
df.Name

# pyplot functions take the data vectors themselves, not
# (DataFrame, column name, ...), so the columns are selected explicitly.
plt.scatter(df['Survived'], df['Age'])
plt.scatter(df.index, df['Age'])
# Missing values are dropped first: Cabin and Age both contain NaN.
plt.hist(df['Cabin'].dropna(), bins=30)
plt.boxplot(df['Age'].dropna())

import plotly.plotly as py
from plotly.graph_objs import *

# NOTE(review): a 'surface' trace normally expects z as a 2-D grid; here
# x, y and z are three 1-D columns -- confirm this renders as intended.
data = {'x': df.Age.values,
        'y': df.Fare.values,
        'z': df.Survived.values,
        'type': 'surface'}
fig = Figure(data=data)
py.plot(fig)

# NOTE(review): shape is given the raw Survived column rather than an
# aes() mapping -- confirm the ggplot port in use accepts that.
ggplot(df, aes('Age', 'Fare', color='factor(Pclass)')) +\
    geom_point(shape=df.Survived)
def _print_banner(title):
    """Print ``title`` framed by two horizontal rules."""
    rule = '----------------------------------------------------------------------'
    print(rule)
    print('{0}{1}'.format(' ', title))
    print(rule)


def _as_name_list(names):
    """Return ``names`` as a list: ``None`` -> ``[]``, one string -> ``[names]``."""
    if names is None:
        return []
    if isinstance(names, str):
        return [names]
    return list(names)


def eda(filepath: str,
        features=None,
        targets=None,
        removeOutliers: bool = False,
        datasetname: str = ''):
    """Run an exploratory data analysis on a pickled DataFrame.

    Prints shape, column names, first/last 10 rows and a statistical
    summary, standardizes the numeric columns, reports (and optionally
    removes) rows with a z-score beyond +/-3, and saves box plot, histogram,
    scatter matrix and heat map through the ``plots`` module.

    Args:
        filepath: Path of the pickle file holding the DataFrame.
        features: Feature column names.  When ``None`` they are derived as
            all columns that are not targets.  The value is not used any
            further inside this function.
        targets: A single column name or a sequence of column names;
            ``None`` means "no targets".
        removeOutliers: When true, outlier rows are dropped and detection is
            repeated until a pass finds none.  When false, a single pass
            only reports the count.
        datasetname: When non-empty, plots go to ``.\\png\\<datasetname>\\eda\\``,
            otherwise to ``.\\png\\``.

    Returns:
        The DataFrame with standardized numeric columns (and without the
        outlier rows if ``removeOutliers`` is true).
    """
    # load the data
    # NOTE(review): pk is presumably the pickle module; unpickling can run
    # arbitrary code, so only files from a trusted source should be loaded.
    with open(filepath, 'rb') as fh:
        df = pk.load(fh)

    # process inputs
    if features is None:
        features = list(set(df.columns) - set(_as_name_list(targets)))

    # examine the data
    _print_banner('Shape of dataset:')
    print('{0}Number of Rows: {1}'.format(' ', df.shape[0]))
    print('{0}Number of Columns: {1}'.format(' ', df.shape[1]))
    print('', end='\n\n\n')

    _print_banner('Column names:')
    for col in df.columns:
        print('{0}{1}'.format(' ', col))
    print('', end='\n\n\n')

    _print_banner('First 10 rows:')
    print(df.head(10))
    print('', end='\n\n\n')

    _print_banner('Last 10 rows:')
    print(df.tail(10))
    print('', end='\n\n\n')

    _print_banner('Statistical Summary:')
    print(df.describe())
    print('', end='\n\n\n')

    # ----------------------------------------------------------------------
    # infer data types of the input DataFrame
    # ----------------------------------------------------------------------
    colNumeric = dfutl.numericColumns(df)

    # ----------------------------------------------------------------------
    # mean centering and scaling: standardize or normalize
    # ----------------------------------------------------------------------
    dfNumeric = df.loc[:, colNumeric]
    df.loc[:, colNumeric] = (dfNumeric - dfNumeric.mean()) / dfNumeric.std()
    dfNumeric = df.loc[:, colNumeric]

    # ----------------------------------------------------------------------
    # outlier detection
    # ----------------------------------------------------------------------
    # use z-score filtering
    # samples that are more than 3 standard deviations away from mean are
    # to be discarded
    _print_banner('Outlier Detection:')
    numouttotal = 0
    passNum = 0
    while True:
        # determine the number of outliers using zscore
        zscores = stats.zscore(dfNumeric)
        idx = np.logical_not(np.logical_or(zscores < -3, zscores > 3))
        # a row is kept only if every numeric column is within range
        idxrows = np.all(idx, axis=1)
        numout = len(idxrows) - len(idxrows[idxrows])
        print('{0}Pass {1} detected {2} outliers'.format(
            ' ', passNum, numout))

        # report-only mode stops after one pass; removal mode stops as soon
        # as a pass comes back clean
        if not removeOutliers or numout == 0:
            break

        # remove outliers and continue; z-scores are recomputed on the
        # remaining rows in the next pass
        df = df.loc[idxrows, :]
        dfNumeric = df.loc[:, colNumeric]
        numouttotal = numouttotal + numout
        passNum = passNum + 1

    if removeOutliers:
        print('{0}Total number of outliers: {1}'.format(' ', numouttotal))
    print('', end='\n\n\n')

    # ----------------------------------------------------------------------
    # visualization
    # ----------------------------------------------------------------------
    plt.close('all')

    save = True
    if len(datasetname) > 0:
        savepath = '.\\png\\{0}\\eda\\'.format(datasetname)
    else:
        savepath = '.\\png\\'
    # make sure the target directory exists for both cases
    os.makedirs(savepath, exist_ok=True)

    plots.boxplot(dfNumeric, save=save, savepath=savepath)
    plots.histogram(df, tightLayout=True, save=save, savepath=savepath)
    plots.scattermatrix(dfNumeric, save=save, savepath=savepath)
    plots.heatmap(dfNumeric, correlation=0.5, save=save, savepath=savepath)

    plt.close('all')

    return df
# Numerical summaries of data print(df.describe()) plots.stemleaf(df ,title = 'Stem and Leaf' ,save = True ,savepath = '.\\visual\\iris_stemleaf.txt') plots.histogram(df ,save = True ,savepath = '.\\visual\\iris_histogram.png' ,close = True) plots.boxplot(df ,save = True ,savepath = '.\\visual\\iris_boxplot.png' ,close = True) plots.scattermatrix(df ,save = True ,savepath = '.\\visual\\iris_scattermatrix.png' ,close = True) plots.heatmap(df ,save = True ,savepath = '.\\visual\\iris_heatmap.png' ,close = True) plots.probplot(df ,save = True ,savepath = '.\\visual\\iris_probplot.png'