# print('{} ({})'.format(col, df[col].dtype if isinstance(df[col], pd.Series) else type(df[col])))
# print(stats)
# and if you thought that was tough to read, try printing out all the report['freq'] dicts of histograms

# In[43]:

# Widen the notebook container and pandas' display limits so wide tables
# are shown in full.
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)
# Bare expression: only produces output as the last expression of a notebook cell.
desc['variables']

# In[38]:

desc['table']

# In[29]:

# desc.keys()

# Build the HTML report from the first rows of `df` plus the stats in `desc`.
# The text form is kept for display; `html` stays bound to the UTF-8 bytes.
html_text = pandas_profiling.to_html(df.head(), desc)
html = html_text.encode('utf8')
# Binary mode: `html` is bytes, which a text-mode handle rejects on Python 3.
with open('report.html', 'wb') as fout:
    fout.write(html)
# IPython's HTML display object expects text, not bytes.
display(HTML(html_text))
# report = pandas_profiling.ProfileReport(df)
index_col='id', compression='gzip', quotechar='"', quoting=pd.io.common.csv.QUOTE_NONNUMERIC, low_memory=False) print('df.describe() stats:') desc = df.describe() for col, stats in desc.T.iterrows(): print('') print('{} ({})'.format( col, df[col].dtype if isinstance(df[col], pd.Series) else type(df[col]))) print(stats) html = pandas_profiling.to_html(df.head(3), desc) open('report.html', 'w').write(html) # this is redundant with stats above and takes way longer than it should (30 minutes?) # print('Column, Count, Min, Mean, Max:') # for k, c, colmin, colmean, colmax in izip(df.columns, df.count().T, df.min().T, df.mean().T, df.max().T): # print('{:40s}\t{}\t{}\t{}\t{}'.format(k, c, colmin, colmean, colmax)) # this takes a few minutes print( 'Trying to compute a ProfileReport, including correlation between columns, skew etc' ) # pandas_profiling.ProfileReport raises Tkinter exceptions before it can produce any output, # at least describe produces a dataframe of stats report = dict2obj(pandas_profiling.describe(df)) print(report['table'])
import csv

from pug.nlp.util import dict2obj

# the round-trip to disk cleans up encoding issues so encoding option no longer needs to be specified and gzip
# The quoting constant comes from the stdlib csv module directly instead of
# through a module that pandas happens to import internally.
df = pd.read_csv(os.path.join(DATA_PATH, 'cleaned_tweets.csv.gz'),
                 index_col='id',
                 compression='gzip',
                 quotechar='"',
                 quoting=csv.QUOTE_NONNUMERIC,
                 low_memory=False)

# Per-column summary statistics computed by pandas itself.
print('df.describe() stats:')
desc = df.describe()
for col, stats in desc.T.iterrows():
    print('')
    print('{} ({})'.format(col, df[col].dtype if isinstance(df[col], pd.Series) else type(df[col])))
    print(stats)

# this is redundant with stats above and takes way longer than it should (30 minutes?)
# print('Column, Count, Min, Mean, Max:')
# for k, c, colmin, colmean, colmax in izip(df.columns, df.count().T, df.min().T, df.mean().T, df.max().T):
#     print('{:40s}\t{}\t{}\t{}\t{}'.format(k, c, colmin, colmean, colmax))

# this takes a few minutes
print('Trying to compute a ProfileReport, including correlation between columns, skew etc')
# pandas_profiling.ProfileReport raises Tkinter exceptions before it can produce any output,
# at least describe produces a dataframe of stats
profile = pandas_profiling.describe(df)

# to_html is given the pandas_profiling.describe() result rather than the
# DataFrame from df.describe(); the report is therefore built only after the
# profile has been computed.
html = pandas_profiling.to_html(df.head(3), profile)
# Context manager so the report file is flushed and closed deterministically.
with open('report.html', 'w') as fout:
    fout.write(html)

# NOTE(review): assumes the object returned by dict2obj supports item access -- confirm.
report = dict2obj(profile)
print(report['table'])
print('')
'Using pandas_profiling to generate more detailed stats, including correlation between columns, skew etc' ) # pandas_profiling.ProfileReport raises Tkinter exceptions before it can produce any output, # at least describe produces a dataframe of stats desc = pandas_profiling.describe(df) desc['table'] # for col, stats in desc['variables'].iterrows(): # print('') # print(col) # print('{} ({})'.format(col, df[col].dtype if isinstance(df[col], pd.Series) else type(df[col]))) # print(stats) # and if you thought that was tough to read, try printing out all the report['freq'] dicts of histograms # In[43]: desc['variables'] # In[38]: desc['table'] # In[29]: # desc.keys() html = pandas_profiling.to_html(df.head(), desc).encode('utf8') with open('report.html', 'w') as fout: fout.write(html) display(HTML(html)) # report = pandas_profiling.ProfileReport(df)