def view_data(pdf, field='moves_learnt_by_tr' + '_'):
    """Print duplication statistics and a small sample for move-related columns.

    Filters *pdf* down to the columns whose names match *field* (plus ``name``),
    then prints the column list, total/unique/duplicate row counts, the percent
    of duplicated rows, and a sample of fully-populated rows.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Frame expected to contain a ``name`` column and one or more columns
        whose names contain *field*.
    field : str, optional
        Regex fragment used to select the move columns. Defaults to the
        original hard-coded ``'moves_learnt_by_tr_'``.

    Returns
    -------
    None — this function only prints.
    """
    pdf = pdf.filter(regex=f'{field}|name', axis=1)

    total_rows = len(pdf.index.value_counts())
    unique_rows = len(pdf.drop_duplicates().index.value_counts())
    dup_rows = len(pdf[pdf.duplicated()].index.value_counts())
    # Guard the empty-frame case: 0 / 0 would raise ZeroDivisionError.
    percent_duped = (dup_rows / total_rows) * 100 if total_rows else 0.0

    complete = pdf.dropna()
    reports = [
        lambda: pdf.columns,
        lambda: total_rows,
        lambda: unique_rows,
        lambda: dup_rows,
        lambda: percent_duped,
        # sample() raises ValueError if asked for more rows than exist; cap it.
        lambda: complete.sample(min(7, len(complete))),
    ]
    for report in reports:
        print(report())
"""Load the raw Taco Bell scrape, normalise its columns, preview, and re-save."""
import re
import toml
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import functools as ft
from py_linq import Enumerable

# NOTE(review): `u` is used below but never imported in this file — presumably a
# project utility module brought in elsewhere; confirm it is in scope at runtime.
this_dir = os.path.dirname(os.path.realpath(__file__))
config = toml.load(os.path.join(this_dir, 'config.toml'))
u.set_full_paths(config, this_dir)

csv_loc = config['file_locations']['raw_taco_bell']
df: pd.DataFrame = pd.read_csv(csv_loc)  # type: ignore

# Normalise headers: lowercase and strip every run of whitespace.
df.columns = df.columns.str.lower().str.replace('\\s+', '', regex=True)
# Drop scraper bookkeeping columns that carry no nutritional data.
df.drop(['web-scraper-order', 'web-scraper-start-url', 'category-href'],
        inplace=True, axis=1)

# Widen pandas' display limits so the preview shows the whole frame.
pd.set_option('display.max_rows', df.shape[0] + 1)
pd.set_option('display.max_columns', df.shape[1] + 1)

# Preview the cleaned data (direct prints; the Enumerable/foreach indirection
# added nothing here and depended on the undefined `u`).
print(df.columns)
print(df)

df.to_csv(config['file_locations']['clean_taco_bell'], index=False)