def main():
    """
    Draw four line plots of y versus X and add each one to the html report.

    The abscissa is, in turn, a datetime series, a sorted 'randint' series,
    a sorted 'uniform' series, and a sorted default random series. The same
    ordinate series is used for all four plots. Each figure is saved as an
    svg file and then referenced from the html output.
    """
    def save_and_embed(figure, svg_name):
        # Write the figure to disk, then reference that file in the html.
        figure.savefig(fname=svg_name, format='svg')
        ds.html_figure(file_name=svg_name)

    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    # Example 1: datetime abscissa, default styling
    x_values = ds.datetime_data()
    y_values = ds.random_data()
    figure, _ = ds.plot_line_x_y(X=x_values, y=y_values)
    save_and_embed(figure, 'plot_line_x_y_datex_test.svg')
    # Example 2: integer abscissa, custom styling
    x_values = ds.random_data(distribution='randint').sort_values()
    figure, _ = ds.plot_line_x_y(
        X=x_values,
        y=y_values,
        figsize=(8, 4.5),
        marker='o',
        markersize=8,
        linestyle=':',
        colour='#337733'
    )
    save_and_embed(figure, 'plot_line_x_y_intx_test.svg')
    # Example 3: uniform abscissa, default styling
    x_values = ds.random_data(distribution='uniform').sort_values()
    figure, _ = ds.plot_line_x_y(X=x_values, y=y_values)
    save_and_embed(figure, 'plot_line_x_y_uniformx_test.svg')
    # Example 4: default random abscissa, default styling
    x_values = ds.random_data().sort_values()
    figure, _ = ds.plot_line_x_y(X=x_values, y=y_values)
    save_and_embed(figure, 'plot_line_x_y_normx_test.svg')
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Draw two scatter plots of y1 and y2 versus a shared X for the html report.

    The first plot uses a datetime abscissa with default styling. The second
    uses a 'uniform' abscissa with custom markers, colours, and a frameless
    legend. Each figure is saved as svg and referenced from the html output.
    """
    def save_and_embed(figure, svg_name):
        # Write the figure to disk, then reference that file in the html.
        figure.savefig(fname=svg_name, format='svg')
        ds.html_figure(file_name=svg_name)

    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    x_values = ds.datetime_data()
    first_y = ds.random_data()
    second_y = ds.random_data()
    figure, axes = ds.plot_scatter_scatter_x_y1_y2(
        X=x_values,
        y1=first_y,
        y2=second_y
    )
    save_and_embed(figure, 'plot_scatter_scatter_x_y1_y2_datex_test.svg')
    x_values = ds.random_data(distribution='uniform')
    figure, axes = ds.plot_scatter_scatter_x_y1_y2(
        X=x_values,
        y1=first_y,
        y2=second_y,
        figsize=(8, 5),
        marker1='o',
        marker2='+',
        markersize1=8,
        markersize2=12,
        colour1='#cc3311',
        colour2='#ee3377',
        labellegendy1='y1',
        labellegendy2='y2'
    )
    axes.legend(frameon=False)
    save_and_embed(figure, 'plot_scatter_scatter_x_y1_y2_test.svg')
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def exit_script(
    *,
    original_stdout: IO[str],
    output_url: str
) -> NoReturn:
    """
    Complete the html file and then leave the script.

    Both arguments are forwarded unchanged to html_end(); afterwards
    sys.exit() is called, so this function never returns to the caller.

    Parameters
    ----------
    original_stdout : IO[str]
        The original stdout.
    output_url : str
        The output url.

    Example
    -------
    import datasense as ds
    ds.exit_script(
        original_stdout=original_stdout,
        output_url=output_url
    )
    """
    closing_arguments = {
        'original_stdout': original_stdout,
        'output_url': output_url,
    }
    html_end(**closing_arguments)
    sys.exit()
def main():
    """
    Ask for an object name and write its help() text to the html report.

    The prompt is shown before html output begins; the help text is printed
    between <pre> tags.
    """
    # NOTE(review): eval() executes whatever is typed at the prompt. This is
    # kept because the script relies on it to resolve names such as
    # module.function, but it must only be run with trusted, interactive input.
    target = eval(input(r'module.file.function name? > '))
    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    print('<pre style="white-space: pre-wrap;">')
    help(target)
    print('</pre>')
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Create a probability plot of random data and add it to the html report.
    """
    svg_name = 'probability_plot_test.svg'
    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    sample = ds.random_data()
    figure, _ = ds.probability_plot(data=sample)
    # Save the figure, then reference the same file in the html.
    figure.savefig(fname=svg_name, format='svg')
    ds.html_figure(file_name=svg_name)
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Print a series produced by ds.datetime_data() into the html report.

    The output is wrapped in <pre> tags.
    """
    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    print('<pre style="white-space: pre-wrap;">')
    datetime_series = ds.datetime_data()
    print('datetime series')
    print(datetime_series)
    print('</pre>')
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Build the commits report: plot recent activity and save it as csv.

    The working directory is first changed to the directory of this file so
    that relative output paths resolve there.
    """
    # Required for cron: relative paths must resolve next to this script.
    script_directory = Path(__file__).parent.resolve()
    chdir(script_directory)
    output_url = 'commits.html'
    header_title = 'Commits'
    header_id = 'commits'
    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    recent = recent_activity()
    plot_recent_activity(recent)
    recent.to_csv('activity.csv')
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Show the help text of ds.plot_pareto and add one Pareto chart to the
    html report.

    The chart is built from a five-row DataFrame, saved as 'pareto.svg', and
    referenced from the html output.
    """
    svg_name = 'pareto.svg'
    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    # help() writes the text itself and returns None; wrapping it in print()
    # only added a stray "None" line to the report.
    help(ds.plot_pareto)
    # Example 1
    data = pd.DataFrame({
        'ordinate': ['Mo', 'Larry', 'Curly', 'Shemp', 'Joe'],
        'abscissa': [21, 2, 10, 4, 16]
    })
    fig, ax1, ax2 = ds.plot_pareto(X=data['ordinate'], y=data['abscissa'])
    fig.savefig(fname=svg_name, format='svg')
    ds.html_figure(file_name=svg_name)
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Create the data, draw the Xbar and R charts, and write the html report.

    A page break precedes each chart and the closing report summary, which
    shows the elapsed time measured around the chart creation.
    """
    start_time = time.time()
    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    samples = create_data()
    ds.page_break()
    xbar_chart(df=samples)
    ds.page_break()
    r_chart(df=samples)
    stop_time = time.time()
    ds.page_break()
    ds.report_summary(start_time=start_time, stop_time=stop_time)
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Plot each data file with its exponentially weighted moving average.

    For every file the ordinate column is smoothed with
    .ewm(alpha=alpha_value).mean(), the raw and smoothed values are plotted
    against the abscissa, and the figure is saved as svg and referenced from
    the html report. A report summary closes the output.
    """
    start_time = time.time()
    global figsize, date_time_parser
    (
        file_names, graph_file_names, abscissa_names, ordinate_names,
        ordinate_predicted_names, x_axis_label, y_axis_label, axis_title,
        figsize, column_names_sort, date_time_parser,
        date_formatter, alpha_value, function, output_url,
        header_title, header_id, parser
    ) = parameters()
    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    print('<pre style="white-space: pre-wrap;">')
    per_file_settings = zip(
        file_names, abscissa_names, ordinate_names,
        ordinate_predicted_names, date_time_parser, column_names_sort,
        date_formatter, graph_file_names
    )
    # NOTE(review): the loop rebinds the global date_time_parser (and the
    # locals column_names_sort and date_formatter) to one element per
    # iteration. The names are kept so that the global ends up with the same
    # value as before this rewrite.
    for (
        file_name, abscissa_name, ordinate_name, ordinate_predicted_name,
        date_time_parser, column_names_sort, date_formatter,
        graph_file_name
    ) in per_file_settings:
        read_arguments = {
            'file_name': file_name,
            'sort_columns': column_names_sort,
            'sort_columns_bool': True,
        }
        # Dates are parsed only when a parser is given for this file.
        if date_time_parser != 'None':
            read_arguments['parse_dates'] = [abscissa_name]
        data = ds.read_file(**read_arguments)
        smoothed = data[ordinate_name].ewm(alpha=alpha_value).mean()
        data[ordinate_predicted_name] = smoothed
        fig, ax = ds.plot_scatter_line_x_y1_y2(
            X=data[abscissa_name],
            y1=data[ordinate_name],
            y2=data[ordinate_predicted_name],
            figsize=figsize
        )
        ax.set_title(label=axis_title, fontweight='bold')
        ax.set_xlabel(xlabel=x_axis_label, fontweight='bold')
        ax.set_ylabel(ylabel=y_axis_label, fontweight='bold')
        ds.despine(ax=ax)
        svg_name = f'{graph_file_name}.svg'
        fig.savefig(fname=svg_name, format='svg')
        ds.html_figure(file_name=svg_name)
    ds.page_break()
    stop_time = time.time()
    ds.report_summary(
        start_time=start_time,
        stop_time=stop_time,
        read_file_names=file_names,
        targets=ordinate_names,
        features=abscissa_names
    )
    print('</pre>')
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Draw two scatter plots of a single series and add them to the html report.

    The first plot uses default styling; the second sets the figure size,
    marker, marker size, and colour. Each figure is saved as svg and
    referenced from the html output.
    """
    def save_and_embed(figure, svg_name):
        # Write the figure to disk, then reference that file in the html.
        figure.savefig(fname=svg_name, format="svg")
        ds.html_figure(file_name=svg_name)

    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    # Example 1
    y_values = ds.random_data()
    figure, _ = ds.plot_scatter_y(y=y_values)
    save_and_embed(figure, "plot_scatter_y_test_1.svg")
    # Example 2
    figure, _ = ds.plot_scatter_y(
        y=y_values,
        figsize=(8, 4.5),
        marker="o",
        markersize=4,
        colour="#ee7733"
    )
    save_and_embed(figure, "plot_scatter_y_test_2.svg")
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Convert every pdf file in a chosen directory to a tidied text file.

    The user picks an input and an output directory. Each pdf found is passed
    through pdf_to_text() and tidy_string(), then written with the same stem
    and a '.txt' extension. The html report lists the files read and saved.
    """
    # Define parameters
    title_directory_out = 'Name of the directory to save as?'
    title_directory_in = 'Name of the directory to read in?'
    output_url = 'extract_text_from_pdf_file.html'
    header_title = 'Extract text from pdf file'
    header_id = 'extract-text-from-pdf-file'
    chdir(Path(__file__).parent.resolve())
    extension_in = ['.pdf', '.PDF']
    extension_out = '.txt'
    # Request directory to read
    path_to_files_in = ds.ask_directory_path(
        title=title_directory_in,
        initialdir=Path.cwd()
    )
    # Request directory to save; the dialog starts one level above the input
    start_directory = Path(*path_to_files_in.parts[:-1])
    path_to_files_out = ds.ask_directory_path(
        title=title_directory_out,
        initialdir=start_directory
    )
    # Begin html output
    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    start_time = time.perf_counter()
    pdf_paths = ds.directory_file_list(
        directory=path_to_files_in,
        patterns=extension_in
    )
    # Process pdf, save txt
    for pdf_path in pdf_paths:
        raw_text = pdf_to_text(path=pdf_path)
        tidy_text = tidy_string(string=raw_text)
        txt_path = Path(
            path_to_files_out,
            f'{Path(pdf_path).stem}{extension_out}'
        )
        save_to_file(path=txt_path, string=tidy_text)
    pdf_names = [Path(pdf_path).name for pdf_path in pdf_paths]
    txt_names = [
        f'{Path(pdf_path).stem}{extension_out}' for pdf_path in pdf_paths
    ]
    stop_time = time.perf_counter()
    ds.report_summary(
        start_time=start_time,
        stop_time=stop_time,
        print_heading=False
    )
    ds.print_list_by_item(list=pdf_names, title='Files read:')
    ds.print_list_by_item(list=txt_names, title='Files saved:')
    # End html output
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Draw two line plots of a single series and add them to the html report.

    The first plot uses default styling; the second sets the figure size,
    marker, marker size, line style, and colour. Each figure is saved as svg
    and referenced from the html output.
    """
    def save_and_embed(figure, svg_name):
        # Write the figure to disk, then reference that file in the html.
        figure.savefig(fname=svg_name, format='svg')
        ds.html_figure(file_name=svg_name)

    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    # Example 1
    y_values = ds.random_data()
    figure, _ = ds.plot_line_y(y=y_values)
    save_and_embed(figure, 'plot_line_y_test_1.svg')
    # Example 2
    figure, _ = ds.plot_line_y(
        y=y_values,
        figsize=(8, 4.5),
        marker='o',
        markersize=4,
        linestyle=':',
        colour='#ee7733'
    )
    save_and_embed(figure, 'plot_line_y_test_2.svg')
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Create an X chart and an mR chart from random data for the html report.

    Forty-two values are drawn with ds.random_data(distribution='norm',
    loc=69, scale=13) and placed in a one-column DataFrame. For each chart
    the figure is saved and referenced from the html output, followed by a
    short text report of its limits, centre line, and sigma.
    """
    start_time = time.time()
    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    values = ds.random_data(distribution='norm', size=42, loc=69, scale=13)
    data = pd.DataFrame(data=values, columns=['X'])
    # Create X control chart
    ds.page_break()
    fig = plt.figure(figsize=figsize)
    x_chart = cc.X(data=data)
    x_chart.ax(fig)
    fig.savefig(fname=graph_x_file_name)
    ds.html_figure(file_name=graph_x_file_name)
    x_report = (
        f'X Report\n'
        f'============\n'
        f'UCL : {x_chart.ucl.round(3)}\n'
        f'Xbar : {x_chart.mean.round(3)}\n'
        f'LCL : {x_chart.lcl.round(3)}\n'
        f'Sigma(X) : {x_chart.sigma.round(3)}\n'
    )
    print(x_report)
    # Create mR chart
    fig = plt.figure(figsize=figsize)
    mr_chart = cc.mR(data=data)
    mr_chart.ax(fig)
    fig.savefig(fname=graph_mr_file_name)
    ds.html_figure(file_name=graph_mr_file_name)
    # The mR lower limit is rounded with the builtin round(), as before.
    mr_report = (
        f'mR Report\n'
        f'============\n'
        f'UCL : {mr_chart.ucl.round(3)}\n'
        f'mRbar : {mr_chart.mean.round(3)}\n'
        f'LCL : {round(mr_chart.lcl, 3)}\n'
        f'Sigma(mR) : {mr_chart.sigma.round(3)}\n'
    )
    print(mr_report)
    stop_time = time.time()
    ds.page_break()
    ds.report_summary(start_time=start_time, stop_time=stop_time)
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Fit and plot splines for every file and every knot count.

    For each (file, target, feature) triple the data are read, missing target
    values are replaced by the target mean, and plot_scatter_line() is run in
    a process pool once per entry of number_knots. The resulting svg files
    are then referenced from the html report, and a report summary closes the
    output.
    """
    start_time = time.time()
    global figsize, axis_title, x_axis_label, y_axis_label, \
        graphics_directory
    (
        file_names, targets, features, number_knots, graphics_directory,
        figsize, x_axis_label, y_axis_label, axis_title,
        date_parser, output_url, header_title, header_id
    ) = parameters()
    ds.create_directory(directories=graphics_directory)
    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    ds.page_break()
    for file_name, target, feature in zip(file_names, targets, features):
        data = ds.read_file(file_name=file_name, parse_dates=features)
        target_mean = data[target].mean()
        data[target] = data[target].fillna(target_mean)
        dates = True
        X = pd.to_numeric(data[feature])
        y = data[target]
        # One task per knot count; the generator is consumed by the pool.
        tasks = (
            (X, y, file_name, target, feature, knot, dates)
            for knot in number_knots
        )
        with Pool() as pool:
            # Results are not used; iterating only drives the work to finish.
            for _ in pool.imap_unordered(plot_scatter_line, tasks):
                pass
        # NOTE(review): str.strip(".csv") removes any of the characters
        # '.', 'c', 's', 'v' from both ends, not the ".csv" suffix. It is
        # left as is because the name presumably has to match the file
        # written by plot_scatter_line - confirm before changing either.
        for knot in number_knots:
            svg_name = (
                f'{graphics_directory}/'
                f'spline_{file_name.strip(".csv")}_'
                f'{target}_{feature}_{knot}.svg'
            )
            ds.html_figure(file_name=svg_name)
    stop_time = time.time()
    ds.page_break()
    ds.report_summary(
        start_time=start_time,
        stop_time=stop_time,
        read_file_names=file_names,
        targets=targets,
        features=features,
        number_knots=number_knots
    )
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Draw four scatter plots of (X1, y1) and (X2, y2) for the html report.

    Examples 1 and 2 use datetime abscissas, without and with
    'natural_cubic_spline' smoothing. Examples 3 and 4 use sorted 'uniform'
    abscissas with custom styling and a frameless legend, again without and
    with smoothing. Each figure is saved as svg and referenced from the html
    output.
    """
    def save_and_embed(figure, svg_name):
        # Write the figure to disk, then reference that file in the html.
        figure.savefig(fname=svg_name, format='svg')
        ds.html_figure(file_name=svg_name)

    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    # Example 1
    first_x = ds.datetime_data()
    second_x = ds.datetime_data()
    first_y = ds.random_data()
    second_y = ds.random_data()
    figure, axes = ds.plot_scatter_scatter_x1_x2_y1_y2(
        X1=first_x,
        X2=second_x,
        y1=first_y,
        y2=second_y
    )
    save_and_embed(
        figure,
        'plot_scatter_scatter_x1_x2_y1_y2_datex_test.svg'
    )
    # Example 2
    figure, axes = ds.plot_scatter_scatter_x1_x2_y1_y2(
        X1=first_x,
        X2=second_x,
        y1=first_y,
        y2=second_y,
        smoothing='natural_cubic_spline',
        number_knots=7
    )
    save_and_embed(
        figure,
        'plot_scatter_scatter_x1_x2_y1_y2_datex_smoothing_y1_y2_test.svg'
    )
    # Example 3
    first_x = ds.random_data(distribution='uniform').sort_values()
    second_x = ds.random_data(distribution='uniform').sort_values()
    figure, axes = ds.plot_scatter_scatter_x1_x2_y1_y2(
        X1=first_x,
        X2=second_x,
        y1=first_y,
        y2=second_y,
        figsize=(8, 5),
        marker1='o',
        marker2='+',
        markersize1=8,
        markersize2=12,
        colour1='#cc3311',
        colour2='#ee3377',
        labellegendy1='y1',
        labellegendy2='y2'
    )
    axes.legend(frameon=False)
    save_and_embed(figure, 'plot_scatter_scatter_x1_x2_y1_y2_test.svg')
    # Example 4
    figure, axes = ds.plot_scatter_scatter_x1_x2_y1_y2(
        X1=first_x,
        X2=second_x,
        y1=first_y,
        y2=second_y,
        figsize=(8, 5),
        marker1='o',
        marker2='+',
        markersize1=8,
        markersize2=12,
        colour1='#cc3311',
        colour2='#ee3377',
        labellegendy1='y1',
        labellegendy2='y2',
        smoothing='natural_cubic_spline',
        number_knots=7
    )
    axes.legend(frameon=False)
    save_and_embed(
        figure,
        'plot_scatter_scatter_x1_x2_y1_y2_smoothing_y1_y2_test.svg'
    )
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Demonstrate several ways of creating the same pandas DataFrame.

    Three lists and their one-key dictionaries are printed first. A DataFrame
    from ds.create_dataframe() is then shown, followed by six constructions
    of a three-column DataFrame ('A', 'B', 'C') that are all cast with the
    same dtype mapping. Everything is printed to the html report.
    """
    def show(label, value):
        # Print a caption followed by the object it describes.
        print(label)
        print(value)

    def report(title, frame):
        # Print a caption, the DataFrame, its dtypes, and a blank line.
        print(title)
        print(frame)
        print(frame.dtypes)
        print()

    header_title = 'Create dataframe'
    header_id = 'create-dataframe'
    output_url = 'create_dataframe.html'
    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    print('Create a pandas dataframe in different ways')
    print()
    print('Create lists and dictionaries to use in the dataframe')
    print()
    list_1 = [1, 2, np.nan, 4, 5]
    show('list_1 of Int64', list_1)
    dict_1 = {'A': list_1}
    show('dict_1', dict_1)
    list_2 = [6.0, np.nan, 8.0, 9.0, 10.0]
    print()
    show('list_2 of float64', list_2)
    dict_2 = {'B': list_2}
    show('dict_2:', dict_2)
    list_3 = ['a', 'b', 'c', '', 'e']
    print()
    show('list_3 of str', list_3)
    dict_3 = {'C': list_3}
    show('dict_3:', dict_3)
    list_of_lists = [list_1, list_2, list_3]
    print()
    show('list of lists:', list_of_lists)
    dict_of_lists = {**dict_1, **dict_2, **dict_3}
    print()
    show('dict_of_lists:', dict_of_lists)
    print()
    dict_types = {'A': 'Int64', 'B': 'float64', 'C': 'str'}
    # Method zero
    print('Method zero. Use ds.create_dataframe()')
    df0 = ds.create_dataframe()
    print(df0.head(10))
    print(df0.dtypes)
    print()
    print('dtype of column cs:', df0['cs'].dtype)
    print()
    # Method one: merge three one-key dictionaries inline
    merged_inline = {
        **{'A': list_1},
        **{'B': list_2},
        **{'C': list_3},
    }
    df1 = pd.DataFrame(data=merged_inline).astype(dtype=dict_types)
    report('Method one', df1)
    # Method two: use the dictionary merged earlier; shown twice, as before
    df2 = pd.DataFrame(data=dict_of_lists).astype(dtype=dict_types)
    print('Method two')
    print(df2)
    print(df2.dtypes)
    report('df2:', df2)
    # Method three: literal dictionary passed directly
    df3 = pd.DataFrame(
        data={
            'A': [1, 2, np.nan, 4, 5],
            'B': [6.0, np.nan, 8.0, 9.0, 10.0],
            'C': ['a', 'b', 'c', '', 'e']
        }
    ).astype(dtype=dict_types)
    report('Method three', df3)
    # Method four: literal dictionary bound to a name first
    dict_of_lists = {
        'A': [1, 2, np.nan, 4, 5],
        'B': [6.0, np.nan, 8.0, 9.0, 10.0],
        'C': ['a', 'b', 'c', '', 'e']
    }
    df4 = pd.DataFrame(data=dict_of_lists).astype(dtype=dict_types)
    report('Method four', df4)
    # Method five: same dictionary as method four
    df5 = pd.DataFrame(data=dict_of_lists).astype(dtype=dict_types)
    report('Method five', df5)
    # Method six: dictionary built from the three lists
    dict_of_lists = {'A': list_1, 'B': list_2, 'C': list_3}
    df6 = pd.DataFrame(data=dict_of_lists).astype(dtype=dict_types)
    report('Method six', df6)
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Demonstrate ds.read_file() and ds.save_file() with csv, ods, and xlsx.

    A DataFrame from ds.create_dataframe() is saved as csv and read back in
    four ways (Examples 1 to 4), then saved and read as ods (Example 5) and
    as xlsx (Example 6). The dtypes and frame information are printed to the
    html report, which ends with a report summary.

    Fixes relative to the earlier version:
    - Example 5 now reports on the frame read from the ods file instead of
      the frame left over from Example 4.
    - DataFrame.info() prints by itself and returns None, so it is no longer
      wrapped in print(), which added a stray "None" line.
    - The duplicate 'r' key in the column rename dictionary is removed.
    """
    start_time = time.time()
    pd.options.display.max_columns = 100
    pd.options.display.max_rows = 100
    pd.options.display.width = 120
    file_name = 'myfile.csv'
    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    help(ds.read_file)
    print()
    print('Create dataframe')
    df = ds.create_dataframe()
    print(df.head())
    print(df.columns)
    print(df.dtypes)
    ds.dataframe_info(
        df=df,
        file_in=file_name
    )
    help(ds.save_file)
    print()
    ds.save_file(
        df=df,
        file_name=file_name
    )
    # Example 1
    # Read a csv file. There is no guarantee the column dtypes will be correct.
    print('Example 1. Only [a, i, s, x, z] have the correct dtypes.')
    df = ds.read_file(file_name=file_name)
    print(df.dtypes)
    print()
    # Example 2
    # Read a csv file. Ensure the dtypes of datetime columns.
    print('Example 2. Ensure the dtypes of datetime columns.')
    parse_dates = ['t', 'u']
    df = ds.read_file(
        file_name=file_name,
        parse_dates=parse_dates
    )
    print(df.dtypes)
    print()
    # Example 3
    # Read a csv file. Ensure the dtypes of columns; not timedelta, datetime.
    print('Example 3. Ensure the dtypes of cols; not timedelta, datetime.')
    convert_dict = {
        'a': 'float64',
        'b': 'boolean',
        'c': 'category',
        'i': 'float64',
        'r': 'str',
        's': 'str',
        'x': 'float64',
        'y': 'Int64',
        'z': 'float64'
    }
    df = ds.read_file(
        file_name=file_name,
        dtype=convert_dict
    )
    print(df.dtypes)
    print()
    # Example 4
    # Read a csv file. Ensure the dtypes of columns. Rename the columns.
    print(
        'Example 4. Ensure the column dtypes are correct. Rename the columns.'
    )
    column_names_dict = {
        'a': 'A',
        'b': 'B',
        'c': 'C',
        'd': 'D',
        'i': 'I',
        'r': 'R',
        's': 'S',
        't': 'T',
        'u': 'U',
        'x': 'X',
        'y': 'Y',
        'z': 'Z'
    }
    index_columns = ['Y']
    parse_dates = ['t', 'u']
    time_delta_columns = ['D']
    category_columns = ['C']
    integer_columns = ['A', 'I']
    float_columns = ['X']
    boolean_columns = ['R']
    object_columns = ['Z']
    sort_columns = ['I', 'A']
    sort_columns_bool = [True, False]
    data = ds.read_file(
        file_name=file_name,
        column_names_dict=column_names_dict,
        index_columns=index_columns,
        parse_dates=parse_dates,
        date_parser=date_parser(),
        time_delta_columns=time_delta_columns,
        category_columns=category_columns,
        integer_columns=integer_columns,
        float_columns=float_columns,
        boolean_columns=boolean_columns,
        object_columns=object_columns,
        sort_columns=sort_columns,
        sort_columns_bool=sort_columns_bool
    )
    print(data.head(10))
    print()
    print('column dtypes')
    print(data.dtypes)
    data.info(verbose=True)
    print()
    print('index', data.index.name, 'dtype:', data.index.dtype)
    print()
    ds.dataframe_info(
        df=data,
        file_in=file_name
    )
    # Example 5
    # Read an ods file.
    file_name = 'myfile.ods'
    df = ds.create_dataframe()
    ds.save_file(
        df=df,
        file_name=file_name
    )
    parse_dates = ['t', 'u']
    df = ds.read_file(
        file_name=file_name,
        parse_dates=parse_dates
    )
    print(
        'Example 5. Read an ods file.'
    )
    # Report on the frame just read from the ods file.
    print(df.head(10))
    print()
    print('column dtypes')
    print(df.dtypes)
    df.info(verbose=True)
    print()
    ds.dataframe_info(
        df=df,
        file_in=file_name
    )
    # Example 6
    # Read an xlsx file.
    # The frame saved as xlsx is first re-read from the ods file.
    df = ds.read_file(file_name=file_name)
    file_name = 'myfile.xlsx'
    sheet_name = 'raw_data'
    ds.save_file(
        df=df,
        file_name=file_name,
        sheet_name=sheet_name
    )
    df = ds.read_file(
        file_name=file_name,
        sheet_name=sheet_name
    )
    ds.dataframe_info(
        df=df,
        file_in=file_name
    )
    stop_time = time.time()
    ds.page_break()
    ds.report_summary(
        start_time=start_time,
        stop_time=stop_time,
        read_file_names=file_name,
        save_file_names=file_name
    )
    ds.html_end(
        original_stdout=original_stdout,
        output_url=output_url
    )
def main():
    """
    Print a sequence of pivot tables built from 'sales_funnel.xlsx'.

    The examples start with a single index and the default aggregation and
    progressively add index levels, columns, aggregation functions, fill
    values, margins, sorting, and queries. Every table is rounded to two
    decimals and printed to the html report under its own heading.
    """
    def heading(title):
        # Blank line, then the caption of the next table.
        print()
        print(title)

    def pivot(**options):
        # All examples pivot the same DataFrame.
        return pd.pivot_table(data=df, **options)

    def status_by_product():
        # Pivot shared by the later examples, rebuilt on every call.
        return pivot(
            values=['Price', 'Quantity'],
            index=['Manager', 'Status'],
            columns=['Product'],
            aggfunc={'Quantity': len, 'Price': [np.sum, np.mean]},
            fill_value=0
        )

    pd.options.display.width = 220
    pd.options.display.max_columns = 220
    pd.options.display.max_rows = 220
    output_url = 'pivot_tables.html'
    header_title = 'Pivot tables'
    header_id = 'pivot-tables'
    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    df = ds.read_file(file_name='sales_funnel.xlsx')
    ds.dataframe_info(df=df, file_in='sales_funnel.xlsx')
    print(df.head())
    heading('Pivot table, implicit parameters')
    print(pivot(values=['Price'], index=['Manager']).round(2))
    heading('Pivot table, explicit parameters')
    print(
        pivot(
            values=['Price'],
            index=['Manager'],
            aggfunc='mean'
        ).round(2)
    )
    heading('Pivot table, multiple-level index')
    print(
        pivot(
            values=['Price'],
            index=['Manager', 'Rep'],
            aggfunc='mean'
        ).round(2)
    )
    heading('Pivot table, multi-parameter aggfunc')
    print(
        pivot(
            values=['Price'],
            index=['Manager', 'Rep'],
            aggfunc={'Price': [np.mean, np.sum, len]}
        ).round(2)
    )
    heading('Pivot table, columns parameter is optional')
    print(
        pivot(
            values=['Price'],
            index=['Manager', 'Rep'],
            columns=['Product'],
            aggfunc=[np.sum]
        ).round(2)
    )
    heading('Pivot table, replace NaN with 0')
    print(
        pivot(
            values=['Price'],
            index=['Manager', 'Rep'],
            columns=['Product'],
            aggfunc=[np.sum],
            fill_value=0
        ).round(2)
    )
    heading('Pivot table, add second colume to values parameter')
    print(
        pivot(
            values=['Price', 'Quantity'],
            index=['Manager', 'Rep'],
            columns=['Product'],
            aggfunc=[np.sum],
            fill_value=0
        ).round(2)
    )
    heading('Pivot table, product column moved to the index')
    print(
        pivot(
            values=['Price', 'Quantity'],
            index=['Manager', 'Rep', 'Product'],
            aggfunc=[np.sum],
            fill_value=0
        ).round(2)
    )
    heading('Pivot table, show totals')
    print(
        pivot(
            values=['Price', 'Quantity'],
            index=['Manager', 'Rep', 'Product'],
            aggfunc=[np.sum],
            fill_value=0,
            margins=True
        ).round(2)
    )
    heading('Pivot table, change categories')
    print(
        pivot(
            values=['Price', 'Quantity'],
            index=['Manager', 'Status'],
            aggfunc=[np.sum],
            fill_value=0,
            margins=True
        ).round(2)
    )
    heading('Pivot table, pass dictionary to aggfunc')
    print(
        pivot(
            values=['Price', 'Quantity'],
            index=['Manager', 'Status'],
            columns=['Product'],
            aggfunc={'Quantity': len, 'Price': np.sum},
            fill_value=0,
            margins=True
        ).round(2)
    )
    heading('Pivot table, pass dictionary to aggfunc')
    print(status_by_product().round(2))
    heading('Pivot table, save to variable')
    table = status_by_product().round(2)
    print(table)
    heading('Pivot table, sort on price, mean CPU')
    table = table.sort_values(by=('Price', 'mean', 'CPU'), ascending=False)
    print(table)
    heading('Pivot table, filter for one manager')
    table = table.query('Manager == ["Debra Henley"]')
    print(table)
    heading('Pivot table, sort and filter with multiple "dots"')
    table = (
        status_by_product()
        .sort_values(by=('Price', 'mean', 'CPU'), ascending=False)
        .query('Manager == ["Debra Henley"]')
        .round(2)
    )
    print(table)
    heading('Pivot table, another query')
    table = (
        status_by_product()
        .query('Status == ["pending", "won"]')
        .round(2)
    )
    print(table)
    heading('Pivot table, another query')
    table = (
        status_by_product()
        .query('Status == ["pending", "won"]')
        .query('Manager == ["Debra Henley"]')
        .round(2)
    )
    print(table)
    heading('Pivot table, another query')
    table = (
        status_by_product()
        .query('Status == ["pending", "won"] & Manager == ["Debra Henley"]')
        .round(2)
    )
    print(table)
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Plot the deaths and pumps of the 1854 Broad Street cholera outbreak.

    The x and y coordinates of both groups are stored inline. They are drawn
    as two scatter series on one axes, labelled, given a frameless legend,
    despined, saved as svg, and referenced from the html report.
    """
    # Coordinates of the pumps
    pump_xy = {
        'x': [8.7, 11, 13.4, 14.9, 8.7, 8.9, 12.6, 10.7, 13.5, 16.4, 18.9, 16, 9],
        'y': [17.9, 18.5, 17.4, 17.8, 14.9, 12.8, 11.7, 7.4, 8, 9.3, 9.7, 5, 5.1]
    }
    # Coordinates of the deaths
    death_xy = {
        'x': [
            13.6, 9.9, 14.7, 15.2, 13.2, 13.8, 13.1, 11, 15.2, 11.1, 11.7, 12.3, 10.6, 14.6, 16.6, 9.5, 13.3, 15, 15.1, 10.9, 12.5, 11.8, 12.2, 13.9, 12.3, 11, 11, 13.5, 10.8, 12.2, 13.9, 12.5, 15.7, 12.9, 13, 13.7, 13.1, 13.4, 14.8, 13.2, 9.8, 12.5, 13.4, 14.4, 16, 10.9, 12.5, 15.8, 16.5, 11.2, 15.8, 11, 11.7, 11.5, 11.8, 13, 14.1, 14.8, 12.6, 14.6, 12.5, 14.5, 9.2, 17.9, 11.2, 9.5, 10.8, 16.1, 10.4, 13.7, 15.8, 12.2, 11.5, 15.4, 15.9, 10.2, 14, 16.5, 17.5, 13.8, 14.1, 14.7, 12.6, 11.7, 14.4, 15.2, 15.8, 13.9, 15.2, 13.2, 12.7, 15.1, 12.8, 13.6, 14, 15.4, 14.8, 10.3, 10.5, 9.9, 14.2, 13.6, 15, 15.7, 9.2, 16.8, 13.3, 10.6, 8.3, 13.4, 14.5, 16.8, 14.7, 10.3, 13.2, 9.1, 15.2, 8.4, 15.3, 13.3, 13.2, 12.6, 13.9, 13.5, 15.2, 11.6, 9.4, 11.4, 11.2, 11.9, 15, 13.6, 14.1, 10.5, 10.3, 13.8, 13.4, 12.6, 15.7, 13.8, 13.2, 15.1, 13.2, 14.6, 14.4, 15, 13.1, 13.4, 11.8, 16, 10.6, 14.6, 9.3, 14.5, 10.5, 14, 10.6, 10.6, 15.7, 11.8, 11.8, 11.6, 15.8, 10.7, 13.4, 11.2, 15.5, 14, 13.4, 14.6, 15.3, 14.5, 11.4, 12.5, 13.8, 10.6, 10.5, 14.8, 14.2, 13.3, 10.4, 12.5, 15, 10.6, 13.1, 10.1, 11.2, 12.9, 12.2, 12.4, 11.9, 11.5, 11.7, 12.5, 14.8, 14, 12.5, 9.2, 10.4, 12.7, 9.1, 8.3, 15.3, 11.2, 11.8, 12.7, 11.8, 12.2, 12.7, 13.6, 8.8, 12.2, 16.2, 12.8, 13.6, 12.7, 15, 13.9, 13.6, 12.5, 9.9, 17.6, 11.1, 16.7, 9.7, 13.4, 13.1, 13.7, 11.7, 13.1, 13.1, 12.3, 14.9, 11.5, 12.9, 11, 15.8, 12.9, 12.3, 9.8, 12.7, 12.7, 16, 11, 14.1, 11, 14.5, 15.6, 9.6, 15.5, 14.4, 11.4, 13.7, 11.4, 16, 9.9, 10.8, 12.6, 9.3, 13.8, 13.8, 14.7, 15.3, 14.1, 13.8, 11, 12.4, 14.8, 15, 15.4, 12.8, 11.3, 10.5, 11, 13.4, 10.3, 9.6, 14.5, 11.5, 12.7, 15, 12.3, 12.9, 15.3, 15.8, 13.1, 12.9, 15.3, 13, 12.5, 15.8, 14, 13, 13.9, 10.9, 16, 15.1, 13.5, 11.6, 10.7, 16.8, 13.9, 13.7, 13.3, 15.7, 15.5, 11, 15.5, 11.7, 12.4, 11.1, 9.6, 8.3, 14, 13.2,
            15.3, 15.5, 15.4, 16.1, 12.4, 12.2, 15.1, 14, 9.6, 16.3, 12.6, 10.5, 14, 15.5, 11, 13.3, 13.9, 13.3, 12.7, 15.4, 15.7, 13.8, 11.7, 16.1, 12.6, 15.4, 16.3, 12.5, 15.7, 11.5, 14.2, 16.3, 13.6, 12.6, 11.7, 8.8, 10.9, 11.6, 13, 13.7, 11.1, 14.6, 10.4, 13.1, 11.9, 13.3, 9.9, 12.7, 9.3, 11.6, 13.1, 11.4, 13.6, 11.8, 16.3, 8.7, 15.7, 12.2, 14.8, 13.5, 15.7, 11.9, 12.5, 14.4, 12.2, 15.3, 11.5, 9, 10.8, 14.9, 13.2, 13.4, 14.4, 12.2, 12, 13.6, 8.8, 13.4, 12.6, 15.6, 12.5, 12.4, 12.4, 13.4, 12, 9.9, 12.1, 14.4, 15.7, 13.8, 11.4, 13.8, 15.6, 15, 14.1, 13.2, 13.3, 10.9, 12.1, 14, 11.3, 13.4, 13.5, 14.8, 13, 12.4, 12.3, 13.1, 14.8, 9.8, 14.1, 12, 16.1, 12.7, 14.7, 8.8, 10.5, 12.2, 11.7, 8.3, 12.4, 16.7, 15.7, 16.1, 14.3, 9.7, 13.3, 16.1, 14.4, 13.4, 14.5, 14.6, 13.7, 16.2, 16.3, 12.8, 13.4, 11.2, 11.6, 14.7, 9.4, 9.5, 15.7, 14.3, 14.2, 11.5, 15, 13.1, 11, 11.9, 13.5, 14.1, 10.9, 14.2, 10.4, 15.6, 11.3, 13, 13.6, 11.1, 14.6, 15.2, 13.4, 16.1, 14.3, 16.4, 12.6, 13.3, 11, 10.1, 15.6, 14.2, 13.4, 10.8, 13.2, 13.8, 11.4, 12.9, 13.6, 13.3, 12.2, 14.4, 8.9, 15.5, 14.3, 10, 13.2, 10.5, 9.5, 10.4, 10.9, 13.3, 12.6, 13.1, 11.6, 13.6, 13.4, 13.7, 10.7, 12.2, 14, 14.8, 15.1, 16.2, 15.3, 12.5, 14.7, 12.5, 10.9, 15.7, 11.7, 9.5, 13.3, 13.6, 13.4, 13.5, 11.1, 12.9, 12.8, 15.3, 12.8, 9.2, 14.4, 15.6, 10.4, 13, 13.2, 11, 13.5, 9.6, 14.5, 12.5, 10.7, 13.3, 9.4, 13.4, 13.5, 13.5, 13.6, 14.3, 9.6, 14.1, 12.9, 15.4, 13.8, 11.4, 12.4, 12.7, 13.6, 14.1, 15.1, 15.3, 14.7, 13.4, 12.3, 14.3, 12.4, 12.1, 12.4, 15.1, 17.3, 12.4, 15
        ],
        'y': [
            11.1, 12.6, 10.2, 10, 13, 8.9, 10.6, 11.9, 11.7, 9.6, 13.6, 11.5, 11.9, 10.6, 14.3, 10.7, 10.7, 10.2, 10, 9.8, 12, 11.8, 10.4, 12.8, 11.9, 11.9, 9.8, 13.3, 11.7, 13.6, 14, 11.6, 12.7, 9.9, 10.2, 11.4, 11.1, 11.1, 9.4, 13.2, 12.5, 12, 10.2, 11.6, 14.2, 12, 13.4, 12.4, 14.3, 8.6, 12.2, 9.8, 13.6, 12.3, 15.1, 13.9, 13.1, 10, 11, 12.9, 11.2, 8.7, 10.8, 7.2, 14.7, 10.7, 9.9, 14.1, 10.6, 9, 13.9, 11.8, 10.7, 11.2, 12.2, 11.9, 12.8, 11.4, 11.2, 8.9, 10.6, 11.5, 13.4, 10.4,
            12.6, 17, 13.9, 12.7, 11.6, 12.9, 11.3, 13.2, 11.6, 13.2, 13.1, 13.3, 14.9, 11.4, 11.6, 12.3, 9.2, 12.5, 9.8, 12.4, 12, 11.4, 10.3, 11.6, 7.2, 13, 8.7, 11.4, 11.9, 11.4, 11.2, 13.2, 10, 7.4, 9.8, 12.4, 9.7, 11.5, 11, 12.5, 17, 11.1, 10.8, 9.6, 14.8, 10.2, 10.2, 11.5, 13.6, 11.2, 9.5, 12.7, 12.4, 11, 13.9, 8.9, 12.3, 10.1, 10.5, 11.4, 10.3, 14.1, 10.6, 8.8, 9.5, 14.4, 10.9, 10.6, 12.2, 11.5, 12.3, 12.8, 11.6, 11.7, 6.1, 11.2, 10.3, 11.1, 14, 11.7, 13.3, 11.5, 12.8, 10.7, 12.5, 16.2, 13.4, 8.8, 9.9, 11.2, 11.2, 11, 11, 10.8, 12, 12, 11.8, 10.6, 12, 9.9, 11.1, 11.5, 11.4, 11.7, 10, 11.3, 10.2, 11.4, 13.6, 11.5, 12.3, 12.7, 13.4, 12.3, 11.3, 9.2, 6.3, 7.1, 13.6, 11.4, 11.2, 11.3, 11.2, 13.6, 11.3, 13.4, 15.1, 11.3, 14.3, 11.6, 13.4, 10.2, 11.9, 12.7, 9.1, 13, 12.4, 7.3, 11.1, 14.4, 11, 13, 10.6, 10.1, 11.1, 11.8, 11.8, 11.9, 12.8, 12.6, 11.6, 11.2, 14, 10.3, 11.5, 11.8, 11.3, 12.1, 9.2, 11.9, 9.3, 11.7, 8.7, 11.3, 11.8, 11.1, 12.2, 9.9, 11.3, 11, 9.1, 11.9, 9.7, 11.6, 10.7, 13.8, 13.8, 10, 10, 12.4, 10.1, 11.3, 10.5, 10, 13.6, 13.5, 14.3, 9.8, 11.8, 11.2, 9.5, 11.4, 11.8, 12.3, 11.4, 10.7, 12, 14.8, 10.8, 9.1, 9.6, 13.9, 11.7, 12.1, 11.8, 11.2, 13, 13.4, 14.4, 14.6, 9.7, 14, 10.1, 12.6, 12.3, 12, 11.6, 14.6, 11.1, 9.6, 12.4, 11.2, 11.2, 11.1, 9.6, 11.4, 11.5, 10.9, 7.2, 13.3, 13, 9.3, 11.2, 7.3, 14.3, 13.7, 11.8, 11.7, 12.9, 10.9, 14.3, 10.6, 12.9, 12.8, 11.3, 11.2, 12.9, 13.6, 9.6, 10.7, 11.2, 13.9, 11.1, 10.5, 14.3, 11, 11.3, 14.4, 12.1, 12.9, 12.6, 12.1, 10.1, 11.2, 10.4, 11.2, 10.9, 14.7, 11.1, 11.2, 11.3, 11.8, 12.3, 11.6, 12.3, 14.2, 10.3, 12.2, 12.8, 10.6, 9.5, 12.3, 11, 11.5, 13.6, 14.4, 12.1, 12.2, 11.4, 14.1, 8.9, 12.9, 14.2, 12.8, 13.1, 14.1, 11.4, 10.8, 11.2, 8.8, 10.1, 11.9, 8.8, 12, 11.4, 14.6, 11.6, 12, 11, 12.8, 14.2, 12, 11.3, 11.4, 12.6, 11.4, 11.8, 13.6, 13.1, 12.9, 11.1, 10.9, 14.2, 11.3, 10.2, 13.1, 9.8, 13, 12, 10, 13.9, 10.8, 13, 12.5, 12, 11, 11.3, 14.8, 12.4, 13.2, 12.5, 10.6, 14.6, 14.2, 11.3, 10.1, 12.1, 12.3, 14.2, 13.6, 11.6, 13.2, 8.4,
            7.2, 14.3, 10.5, 12.4, 12.9, 10, 10.4, 10.9, 9, 11.4, 13.9, 14.2, 14.4, 14.3, 11.9, 8.6, 12.2, 11.9, 10.8, 10.7, 11.6, 12.5, 13.4, 10.4, 10.4, 12.4, 12, 14.2, 8.9, 13.1, 16.5, 9.4, 11.2, 11.3, 11.5, 9.6, 13.1, 11.6, 11.4, 8.6, 10.4, 10.9, 10.4, 14.4, 15, 10.8, 12.1, 10.6, 12.6, 13.4, 12.5, 11.6, 10.6, 12.6, 11, 9.8, 11.1, 14.3, 11.4, 10.3, 12.1, 14.2, 10.4, 11.3, 14.3, 12.5, 10.6, 11.2, 14.6, 11, 11, 14.2, 10.5, 10.1, 13.1, 11.1, 12.5, 14.1, 13.3, 9.2, 10.1, 9.9, 11.4, 11.1, 11.4, 11.1, 14.7, 12.9, 12.6, 11.5, 14.8, 15.3, 10.3, 12.5, 11.1, 10.9, 16, 9.8, 11.3, 11.1, 12, 10.9, 11.2, 10.7, 11.9, 9.7, 11, 10.6, 10.6, 11.1, 12.5, 12.9, 11, 11.9, 11.1, 11, 9.1, 15.6, 10.6, 15.5, 13.8, 13.8, 12.7, 9.9, 11.4, 11.3, 11.5, 13.1, 10.1, 13.6, 10.2, 12.5, 11.9, 10.4, 11.6, 10.3, 11.5, 10.2, 11.6, 11.9, 12.5
        ]
    }
    # Labels, titles, and output names
    x_axis_label = 'X distance from lower left datum of map (m)'
    y_axis_label = 'Y distance from lower left datum of map (m)'
    axis_title = 'Broad Street Cholera Outbreak of 1854'
    file_graph = 'broad_street_cholera_outbreak.svg'
    output_url = 'broad_street_cholera.html'
    header_title = 'broad_street_cholera'
    header_id = 'broad-street-cholera'
    axis_subtitle = 'Soho, London, UK'
    legend1 = 'Deaths'
    legend2 = 'Pumps'
    figsize = (8, 6)
    ds.style_graph()
    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    deaths = pd.DataFrame(data=death_xy)
    pumps = pd.DataFrame(data=pump_xy)
    # Series 1 is the deaths, series 2 is the pumps.
    fig, ax = ds.plot_scatter_scatter_x1_x2_y1_y2(
        X1=deaths['x'],
        X2=pumps['x'],
        y1=deaths['y'],
        y2=pumps['y'],
        figsize=figsize,
        markersize1=3,
        markersize2=3,
        colour1=colour1,
        colour2=colour2,
        labellegendy1=legend1,
        labellegendy2=legend2
    )
    ax.set_title(label=f'{axis_title}\n{axis_subtitle}')
    ax.set_ylabel(ylabel=y_axis_label)
    ax.set_xlabel(xlabel=x_axis_label)
    ax.legend(frameon=False)
    ds.despine(ax=ax)
    fig.savefig(fname=file_graph, format='svg')
    ds.html_figure(file_name=file_graph)
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Demonstrate the datasense column-type finder functions.

    Creates a sample dataframe with ds.create_dataframe(), prints its shape,
    head, dtypes, and columns, then, for each ds.find_*_columns function,
    prints the function's help text followed by the columns it returns.
    Finishes with ds.dataframe_info() and the dataframe memory usage.

    The output is wrapped in a <pre> element and bracketed by
    ds.html_begin() / ds.html_end(). output_url, header_title, and header_id
    are read from the enclosing module scope.
    """
    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    print('<pre style="white-space: pre-wrap;">')
    df = ds.create_dataframe()
    overview = (
        ('df.shape', df.shape),
        ('df.head()', df.head()),
        ('df.dtypes', df.dtypes),
        ('df.columns', df.columns),
    )
    for label, value in overview:
        print(label)
        print(value)
        print()
    finders = (
        ('bool columns', ds.find_bool_columns),
        ('category columns', ds.find_category_columns),
        ('datetime columns', ds.find_datetime_columns),
        ('float columns', ds.find_float_columns),
        ('integer columns', ds.find_int_columns),
        ('integer, float columns', ds.find_int_float_columns),
        ('object columns', ds.find_object_columns),
        ('timedelta columns', ds.find_timedelta_columns),
    )
    for label, finder in finders:
        # help() writes the documentation itself and returns None; wrapping
        # it in print() would emit a stray "None" line after each help text.
        help(finder)
        print()
        columns_found = finder(df=df)
        print(label)
        print(columns_found)
        print()
    df = ds.dataframe_info(df=df, file_in='test')
    print()
    print('df memory usage: ')
    print(ds.byte_size(num=df.memory_usage(index=True).sum()))
    print('</pre>')
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Build and solve a plant-to-warehouse transportation problem with PuLP.

    Two plants with fixed capacities ship to three warehouses with fixed
    demands. The model minimizes the total lane cost subject to:

    - shipments out of each plant <= that plant's capacity
    - shipments into each warehouse >= that warehouse's demand

    The intermediate data structures are shown with pretty_print(), the
    model is written to 'plants_warehouses.lp', solved with PuLP's default
    solver, and the LP file, solver status, lane shipments, and total cost
    are printed inside a <pre> element bracketed by ds.html_begin() /
    ds.html_end(). output_url, header_title, and header_id are read from the
    enclosing module scope.
    """
    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    print('<pre style="white-space: pre-wrap;">')
    # Problem data: plants with capacities, warehouses with demands
    plants = ['Rockford', 'Grand Rapids']
    pretty_print(plants, 'Plants')
    capacities = [500, 600]
    pretty_print(capacities, 'Capacities')
    plant_capacity = dict(zip(plants, capacities))
    pretty_print(plant_capacity, 'Plant capacity')
    warehouses = ['Chicago', 'Detroit', 'Indianapolis']
    pretty_print(warehouses, 'Warehouses')
    demand = [400, 300, 350]
    pretty_print(demand, 'Demand')
    warehouse_demand = dict(zip(warehouses, demand))
    pretty_print(warehouse_demand, 'Warehouse demand')
    # rows = plants, columns = warehouses, entries = costs plant -> warehouse
    lane_costs = [[10, 16, 12], [14, 8, 11]]
    pretty_print(lane_costs, 'Lane costs')
    # Create dictionary of transportation costs by plants, warehouses
    warehouse_lane_costs = [
        dict(zip(warehouses, values)) for values in lane_costs
    ]
    pretty_print(warehouse_lane_costs, 'Warehouse lane costs')
    plant_warehouse_lane_costs = dict(zip(plants, warehouse_lane_costs))
    pretty_print(plant_warehouse_lane_costs, 'Plant warehouse lane costs')
    # Create the linear programming model object
    model = LpProblem(name='plant_warehouse_model', sense=LpMinimize)
    lanes = [
        (plant, warehouse) for plant in plants for warehouse in warehouses
    ]
    pretty_print(lanes, 'Lanes')
    # The indices are passed positionally: the keyword was renamed from
    # 'indexs' to 'indices' in PuLP 2.x, and the positional form works with
    # both spellings. The name avoids shadowing the builtin vars().
    lane_vars = LpVariable.dicts(
        'Lane',
        (plants, warehouses),
        lowBound=0,
        upBound=None,
        cat=LpInteger
    )
    # Add the objective function
    model += lpSum([
        lane_vars[plant][warehouse]
        * plant_warehouse_lane_costs[plant][warehouse]
        for (plant, warehouse) in lanes
    ])
    # Add plant capacity maximum constraints to model for each plant
    for plant in plants:
        model += (
            lpSum([lane_vars[plant][warehouse] for warehouse in warehouses])
            <= plant_capacity[plant],
            'sum_of_products_out_of_plants_%s' % plant
        )
    # Add warehouse demand minimum constraints to model for each warehouse
    for warehouse in warehouses:
        model += (
            lpSum([lane_vars[plant][warehouse] for plant in plants])
            >= warehouse_demand[warehouse],
            'sum_of_products_into_warehouses%s' % warehouse
        )
    model.writeLP(filename='plants_warehouses.lp')
    # Solve the model using PuLP's choice of solver
    model.solve(solver=None)
    # Context manager guarantees the file handle is closed even if the
    # read or the print raises.
    with open('plants_warehouses.lp') as lp_file:
        print(f'\n{lp_file.read()}\n')
    print(f'Status = {LpStatus[model.status]}\n')
    # Print resolved optimum value for each lane
    print('Lane shipments')
    for variable in model.variables():
        print(variable.name, '=', variable.varValue)
    print(
        f'\nTotal cost of transportation = {utilities.value(model.objective)}'
    )
    print('</pre>')
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Demonstrate one-to-many and one-to-one left merges with pandas.

    Builds three small dataframes, then:

    1. left-merges df_one with df_two on 'id' (one-to-many) and saves the
       result to 'df_one_two.csv';
    2. left-merges that result with df_three on ['id', 'ticket']
       (one-to-one);
    3. creates 'amount' from 'amount_left', using 'amount_right' where
       'amount_left' is null, drops the two source columns, and saves the
       result to 'df_one_two_three.csv'.

    Every dataframe is printed as a TSV table between ds.html_begin() and
    ds.html_end().
    """
    header_title = 'pandas merge'
    header_id = 'pandas-merge'
    output_url = 'pandas_merge.html'
    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )

    def print_table(df):
        """Print df as a right-aligned TSV table followed by a blank line."""
        print(
            tabulate(
                tabular_data=df,
                headers='keys',
                tablefmt='tsv',
                numalign='right',
                stralign='right',
                floatfmt='.2f'
            )
        )
        print()

    # np.nan is used throughout: the np.NaN alias was removed in NumPy 2.0.
    # df_one has unique values in 'id'
    df_one = pd.DataFrame({
        'id': [1, 2, 3, 4, 5],
        'age': [134, 28, np.nan, 29, 17],
        'ctg': ['A', 'A', 'B', 'C', None]
    }).astype({'age': 'Int64'})
    df_one.index.name = 'rows'
    print_table(df_one)
    # df_two has multiple values in 'id'
    df_two = pd.DataFrame({
        'id': [3, 4, 5, 6, 7, 4, 4],
        'ticket': [1001, 1002, 1003, 1004, 1005, 1006, 1007],
        'amount': [24.1, np.nan, 34.5, 19.5, 26.2, 27.3, np.nan]
    }).astype({'ticket': 'Int64'})
    df_two.index.name = 'rows'
    print_table(df_two)
    # df_three has multiple values in 'id'
    df_three = pd.DataFrame({
        'id': [4, 4, 4],
        'ticket': [1002, 1006, 1007],
        'amount': [13.69, 11.11, 69.13]
    }).astype({'ticket': 'Int64'})
    df_three.index.name = 'rows'
    print_table(df_three)
    ds.page_break()
    # df_one_two = df_one <- df_two is a one-to-many left merge
    df_one_two = df_one.merge(
        right=df_two,
        how='left',
        left_on=['id'],
        right_on=['id'],
        indicator=True,
        validate='one_to_many'
    ).astype(dtype={'age': 'Int64'})
    df_one_two.index.name = 'rows'
    ds.save_file(df=df_one_two, file_name=Path.cwd() / 'df_one_two.csv')
    print_table(df_one_two)
    ds.page_break()
    print()
    # df_one_two_three = df_one_two <- df_three is a one-to-one left merge
    # The previous '_merge' indicator column is dropped first so that this
    # merge can add its own.
    df_one_two_three = df_one_two.drop(columns=['_merge']).merge(
        right=df_three,
        how='left',
        left_on=['id', 'ticket'],
        right_on=['id', 'ticket'],
        suffixes=('_left', '_right'),
        indicator=True,
        validate='one_to_one'
    )
    df_one_two_three.index.name = 'rows'
    print_table(df_one_two_three)
    df_one_two_three = df_one_two_three.drop(columns=['_merge'])
    # create 'amount' from 'amount_left' and replace np.nan from
    # 'amount_right'
    amount_left = df_one_two_three['amount_left']
    df_one_two_three['amount'] = amount_left.where(
        amount_left.notnull(),
        df_one_two_three['amount_right']
    )
    print_table(df_one_two_three)
    df_one_two_three = df_one_two_three.drop(
        columns=['amount_left', 'amount_right']
    )
    ds.save_file(
        df=df_one_two_three,
        file_name=Path.cwd() / 'df_one_two_three.csv'
    )
    print_table(df_one_two_three)
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
# Scatter plot of predicted versus measured fig, ax = ds.plot_scatter_x_y(X=y_all, y=predicted, figsize=figsize) ax.plot([y_all.min(), y_all.max()], [y_all.min(), y_all.max()], marker=None, linestyle='-', color=colour2) ax.set_ylabel(ylabel=label_predicted) ax.set_xlabel(xlabel=label_measured) ax.set_title(label=title) ds.despine(ax) fig.savefig(fname=f'{graph_name}_scatter.svg', format='svg') ds.html_figure(file_name=f'{graph_name}_scatter.svg', caption=f'{graph_name}_scatter.svg') # Line plot of predicted versus measured fig, ax = ds.plot_line_line_y1_y2(y1=y_all, y2=predicted, figsize=figsize, labellegendy1=label_measured, labellegendy2=label_predicted) ax.legend(frameon=False) ax.set_title(label=title) ds.despine(ax) fig.savefig(fname=f'{graph_name}_lines.svg', format='svg') ds.html_figure(file_name=f'{graph_name}_lines.svg', caption=f'{graph_name}_lines.svg') stop_time = time.time() ds.page_break() ds.report_summary(start_time=start_time, stop_time=stop_time) print('</pre>') ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Create and print pandas Series of several dtypes from literal lists.

    Each section prints a heading, the source list, and the Series built
    from it: float64, nullable boolean, category, ordered category,
    timedelta64[ns], str, datetime64[ns], and nullable Int64. Most lists
    contain one missing value (np.nan, pd.NaT, or an empty string).

    Output is written between ds.html_begin() and ds.html_end(). The
    "Generated alternative" comments record the ds.random_data() calls the
    author left commented out next to each literal list.
    """
    header_title = 'Create series'
    header_id = 'create-series'
    output_url = 'create_series.html'
    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )

    def show_series(heading, values, name, dtype, series_label=None):
        """
        Print heading and values, build the Series, print it, return it.

        series_label, when given, is printed on its own line just before
        the Series.
        """
        print(heading)
        print(values)
        series = pd.Series(data=values, name=name).astype(dtype=dtype)
        if series_label is not None:
            print(series_label)
        print(series)
        print()
        return series

    print('Create Pandas series')
    print()
    # Generated alternative:
    # ds.random_data(distribution='uniform', size=7, loc=13, scale=70)
    show_series(
        heading='uniform distribution, dtype: float, list_a, series_a',
        values=[14.758, 78.956, np.nan, 57.361, 39.018, 75.764, 65.869],
        name='A',
        dtype='float64'
    )
    # Generated alternative: ds.random_data(distribution='bool', size=7)
    show_series(
        heading=(
            'boolean distribution, dtype: boolean (nullable), '
            'list_b, series_b'
        ),
        values=[False, True, np.nan, False, True, True, False],
        name='B',
        dtype='boolean'
    )
    # Generated alternative: ds.random_data(distribution='category', size=7)
    series_c = show_series(
        heading='category distribution, dtype: category, list_c, series_c',
        values=['small', 'medium', '', 'medium', 'large', 'large', 'small'],
        name='C',
        dtype='category'
    )
    print('C dtype:', series_c.dtype)
    print()
    print('C dtype:', type(series_c.dtype).__name__)
    print()
    # Ordered categorical sampled from list_cs. The heading previously
    # repeated the "list_c, series_c" text of the section above.
    # Generated alternative (per the author's note):
    # ds.random_data(distribution='categories', size=7)
    print(
        'ordered category distribution, dtype: category, list_cs, series_cs'
    )
    list_cs = ['small', 'medium', '', 'medium', 'large', 'large', 'small']
    categories = ['small', 'medium', 'large']
    print(list_cs)
    print()
    size = 13
    random_state = 42
    # Seed the generator so the sampled values are reproducible.
    random.seed(a=random_state)
    # '' is not one of the declared categories, so sampled empty strings
    # become missing values in the categorical Series.
    series_cs = pd.Series(
        data=random.choices(population=list_cs, k=size),
        name='CS'
    ).astype(dtype=CategoricalDtype(categories=categories, ordered=True))
    print(series_cs)
    print()
    print('CS dtype:', series_cs.dtype)
    print()
    print('CS dtype:', type(series_cs.dtype).__name__)
    print()
    # Generated alternative: ds.random_data(distribution='timedelta', size=7)
    show_series(
        heading=(
            'timedelta distribution, dtype: timedelta64[ns], '
            'list_d, series_d'
        ),
        values=[0, 0, pd.NaT, 0, 0, 0, 0],
        name='D',
        dtype='timedelta64[ns]'
    )
    # Generated alternative:
    # ds.random_data(distribution='uniform', size=7, loc=13, scale=70)
    show_series(
        heading='uniform distribution, dtype: float64, list_i, series_i',
        values=[
            6.554271, 23.958127, np.nan, 58.231292, 67.349036, 75.083105,
            30.503073
        ],
        name='I',
        dtype='float64'
    )
    # Generated alternative:
    # ds.random_data(distribution='strings', strings=['0', '1'], size=7)
    # NOTE(review): this list has six elements while every other list has
    # seven; confirm whether a value is missing.
    show_series(
        heading='strings distribution, dtype:str, list_r, series_r',
        values=['1', '1', '', '0', '0', '1'],
        name='R',
        dtype='str',
        series_label='series_r:'
    )
    # Generated alternative: ds.random_data(distribution='strings', size=7)
    show_series(
        heading='strings distribution, dtype:str, list_s, series_s',
        values=['male', 'female', '', 'male', 'female', 'female', 'male'],
        name='S',
        dtype='str',
        series_label='series_s:'
    )
    # Generated alternative: ds.random_data(distribution='datetime', size=7)
    show_series(
        heading=(
            'datetime distribution, dtype: datetime64[ns], list_t, series_t'
        ),
        values=[
            '2020-12-12 16:33:48', '2020-12-13 16:33:48', pd.NaT,
            '2020-12-15 16:33:48', '2020-12-16 16:33:48',
            '2020-12-17 16:33:48', '2020-12-18 16:33:48'
        ],
        name='T',
        dtype='datetime64[ns]'
    )
    # Generated alternative:
    # ds.random_data(distribution='norm', size=7, loc=69, scale=13)
    show_series(
        heading='normal distribution, dtype: float64, list_x, series_x',
        values=[42.195, 82.630, np.nan, 86.738, 85.656, 79.281, 50.015],
        name='X',
        dtype='float64'
    )
    # Generated alternative:
    # ds.random_data(distribution='randint', size=7, low=0, high=2)
    show_series(
        heading=(
            'integer distribution, dtype: Int64 (nullable), list_y, series_y'
        ),
        values=[1, 0, 1, np.nan, 1, 0, 0],
        name='Y',
        dtype='Int64'
    )
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Exercise the datasense dataframe helper functions and report results.

    Runs, in order, examples of dataframe_info, the find_*_columns
    functions, number_empty_cells_in_columns, process_columns,
    process_rows, save_file, byte_size, and read_file, printing a separator
    and a title before each example. Ends with ds.page_break() and
    ds.report_summary().

    Output is written between ds.html_begin() and ds.html_end().
    output_url, header_title, header_id, and date_parser are read from the
    enclosing module scope.
    """
    start_time = time.time()
    pd.options.display.width = 120
    pd.options.display.max_columns = 100
    pd.options.display.max_rows = 100
    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    separator = '--------------------------'

    def announce(title, example='test example'):
        """Print the separator, the test title, and the example label."""
        print(separator)
        print(title)
        print(example)

    announce('test dataframe_info', 'test example 1')
    my_file = 'myfile.csv'
    # Create the file before reading it so this example does not depend on
    # a file left behind by an earlier run.
    ds.save_file(df=ds.create_dataframe(), file_name=my_file)
    df = ds.read_file(file_name=my_file)
    df = ds.dataframe_info(df=df, file_in=my_file)
    announce('test dataframe_info', 'test example 2')
    df = ds.create_dataframe()
    df = ds.dataframe_info(df=df, file_in='df')
    # Each finder is called the same way on a fresh sample dataframe.
    finder_names = (
        'find_bool_columns',
        'find_category_columns',
        'find_datetime_columns',
        'find_float_columns',
        'find_int_columns',
        'find_int_float_columns',
        'find_object_columns',
        'find_timedelta_columns',
    )
    for finder_name in finder_names:
        announce(f'test {finder_name}')
        df = ds.create_dataframe()
        print(getattr(ds, finder_name)(df=df))
    announce('test number_empty_cells_in_columns')
    df = pd.DataFrame({
        'X': [25.0, 24.0, 35.5, np.nan, 23.1],
        'Y': [27, 24, np.nan, 23, np.nan],
        'Z': ['a', 'b', np.nan, 'd', 'e']
    })
    empty_cells = ds.number_empty_cells_in_columns(df=df)
    print(empty_cells)
    announce('test process_columns')
    df = ds.create_dataframe()
    # Explicit unpacking keeps the arity check: a change in the number of
    # returned values raises ValueError here.
    (
        df, columns_in_count, columns_non_empty_count, columns_empty_count,
        columns_empty_list, columns_non_empty_list, columns_bool_list,
        columns_bool_count, columns_float_list, columns_float_count,
        columns_integer_list, columns_integer_count,
        columns_datetime_list, columns_datetime_count,
        columns_object_list, columns_object_count, columns_category_list,
        columns_category_count, columns_timedelta_list,
        columns_timedelta_count
    ) = ds.process_columns(df=df)
    column_report = (
        ('columns_in_count :', columns_in_count),
        ('columns_non_empty_count:', columns_non_empty_count),
        ('columns_empty_count :', columns_empty_count),
        ('columns_empty_list :', columns_empty_list),
        ('columns_non_empty_list :', columns_non_empty_list),
        ('columns_bool_list :', columns_bool_list),
        ('columns_bool_count :', columns_bool_count),
        ('columns_float_list :', columns_float_list),
        ('columns_float_count :', columns_float_count),
        ('columns_integer_list :', columns_integer_list),
        ('columns_integer_count :', columns_integer_count),
        ('columns_datetime_list :', columns_datetime_list),
        ('columns_datetime_count :', columns_datetime_count),
        ('columns_object_list :', columns_object_list),
        ('columns_object_count :', columns_object_count),
        ('columns_category_list :', columns_category_list),
        ('columns_category_count :', columns_category_count),
        ('columns_timedelta_list :', columns_timedelta_list),
        ('columns_timedelta_count:', columns_timedelta_count),
    )
    for label, value in column_report:
        print(label, value)
    announce('test process_rows')
    df = ds.create_dataframe()
    df, rows_in_count, rows_out_count, rows_empty_count = ds.process_rows(df)
    print('rows_in_count :', rows_in_count)
    print('rows_out_count :', rows_out_count)
    print('rows_empty_count:', rows_empty_count)
    # save_file with and without an index, to csv and to xlsx
    save_cases = (
        {'file_name': 'x_y.csv'},
        {'file_name': 'x_y.csv', 'index': True, 'index_label': 'myindex'},
        {'file_name': 'x_y.xlsx'},
        {'file_name': 'x_y.xlsx', 'index': True, 'index_label': 'myindex'},
    )
    for number, options in enumerate(save_cases, start=1):
        announce(f'test save_file example {number}')
        df = ds.create_dataframe()
        ds.save_file(df=df, **options)
    announce('test byte_size')
    df = ds.create_dataframe()
    print(ds.byte_size(num=df.memory_usage(index=True).sum()))
    announce('test read_file', 'test example 1')
    my_file = 'myfile.csv'
    df = ds.create_dataframe()
    ds.save_file(df=df, file_name=my_file)
    df = ds.read_file(file_name=my_file)
    ds.dataframe_info(df=df, file_in=my_file)
    announce('test read_file', 'test example 2')
    file_name = 'myfile.csv'
    df = ds.create_dataframe()
    ds.save_file(df=df, file_name=file_name)
    parse_dates = ['t', 'u']
    df = ds.read_file(file_name=file_name, parse_dates=parse_dates)
    ds.dataframe_info(df=df, file_in=file_name)
    announce('test read_file', 'test example 3')
    file_name = 'myfile.csv'
    df = ds.create_dataframe()
    ds.save_file(df=df, file_name=file_name)
    column_names_dict = {
        'a': 'A',
        'b': 'B',
        'c': 'C',
        'd': 'D',
        'i': 'I',
        'r': 'R',
        's': 'S',
        't': 'T',
        'u': 'U',
        'y': 'Y',
        'x': 'X',
        'z': 'Z'
    }
    index_columns = ['Y']
    parse_dates = ['t', 'u']
    time_delta_columns = ['D']
    category_columns = ['C']
    integer_columns = ['A', 'I']
    # NOTE(review): float_columns=['X'], boolean_columns=['R'], and
    # object_columns=['Z'] used to be defined here but were never passed to
    # read_file; confirm whether they should be.
    df = ds.read_file(
        file_name=file_name,
        column_names_dict=column_names_dict,
        index_columns=index_columns,
        date_parser=date_parser(),
        parse_dates=parse_dates,
        time_delta_columns=time_delta_columns,
        category_columns=category_columns,
        integer_columns=integer_columns
    )
    ds.dataframe_info(df=df, file_in=file_name)
    announce('test read_file', 'test example 4')
    file_name = 'myfile.ods'
    df = ds.create_dataframe()
    ds.save_file(df=df, file_name=file_name)
    parse_dates = ['t', 'u']
    df = ds.read_file(file_name=file_name, parse_dates=parse_dates)
    ds.dataframe_info(df=df, file_in=file_name)
    stop_time = time.time()
    ds.page_break()
    ds.report_summary(
        start_time=start_time,
        stop_time=stop_time,
        read_file_names=file_name,
        save_file_names=file_name
    )
    ds.html_end(original_stdout=original_stdout, output_url=output_url)