示例#1
0
def main():
    """
    Demonstrate ds.plot_line_x_y with four different X series.

    Every figure is saved as an svg file and passed to ds.html_figure,
    between the ds.html_begin and ds.html_end calls.
    """
    stdout_saved = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    # Example 1: X from ds.datetime_data(), default options
    graph_name = 'plot_line_x_y_datex_test.svg'
    x_values = ds.datetime_data()
    y_values = ds.random_data()
    fig, ax = ds.plot_line_x_y(X=x_values, y=y_values)
    fig.savefig(fname=graph_name, format='svg')
    ds.html_figure(file_name=graph_name)
    # Example 2: sorted 'randint' X, explicit size, marker, line and colour
    graph_name = 'plot_line_x_y_intx_test.svg'
    x_values = ds.random_data(distribution='randint').sort_values()
    line_options = {
        'figsize': (8, 4.5),
        'marker': 'o',
        'markersize': 8,
        'linestyle': ':',
        'colour': '#337733',
    }
    fig, ax = ds.plot_line_x_y(X=x_values, y=y_values, **line_options)
    fig.savefig(fname=graph_name, format='svg')
    ds.html_figure(file_name=graph_name)
    # Example 3: sorted 'uniform' X, default options
    graph_name = 'plot_line_x_y_uniformx_test.svg'
    x_values = ds.random_data(distribution='uniform').sort_values()
    fig, ax = ds.plot_line_x_y(X=x_values, y=y_values)
    fig.savefig(fname=graph_name, format='svg')
    ds.html_figure(file_name=graph_name)
    # Example 4: sorted X from ds.random_data() defaults, default options
    graph_name = 'plot_line_x_y_normx_test.svg'
    x_values = ds.random_data().sort_values()
    fig, ax = ds.plot_line_x_y(X=x_values, y=y_values)
    fig.savefig(fname=graph_name, format='svg')
    ds.html_figure(file_name=graph_name)
    ds.html_end(original_stdout=stdout_saved, output_url=output_url)
def main():
    """
    Demonstrate ds.plot_scatter_scatter_x_y1_y2 with two X series.

    The same y1 and y2 series are plotted first against datetime data
    with default options, then against 'uniform' random data with explicit
    markers, colours and legend labels.
    """
    stdout_saved = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    # X from ds.datetime_data(), default options
    graph_name = 'plot_scatter_scatter_x_y1_y2_datex_test.svg'
    x_values = ds.datetime_data()
    y1_values = ds.random_data()
    y2_values = ds.random_data()
    fig, ax = ds.plot_scatter_scatter_x_y1_y2(
        X=x_values, y1=y1_values, y2=y2_values
    )
    fig.savefig(fname=graph_name, format='svg')
    ds.html_figure(file_name=graph_name)
    # 'uniform' X, explicit options and a frameless legend
    graph_name = 'plot_scatter_scatter_x_y1_y2_test.svg'
    x_values = ds.random_data(distribution='uniform')
    scatter_options = {
        'figsize': (8, 5),
        'marker1': 'o',
        'marker2': '+',
        'markersize1': 8,
        'markersize2': 12,
        'colour1': '#cc3311',
        'colour2': '#ee3377',
        'labellegendy1': 'y1',
        'labellegendy2': 'y2',
    }
    fig, ax = ds.plot_scatter_scatter_x_y1_y2(
        X=x_values, y1=y1_values, y2=y2_values, **scatter_options
    )
    ax.legend(frameon=False)
    fig.savefig(fname=graph_name, format='svg')
    ds.html_figure(file_name=graph_name)
    ds.html_end(original_stdout=stdout_saved, output_url=output_url)
示例#3
0
def exit_script(
    *,
    original_stdout: IO[str],
    output_url: str
) -> NoReturn:
    """
    Complete the html file, then exit from the script.

    Parameters
    ----------
    original_stdout : IO[str]
        The original stdout; passed unchanged to html_end.
    output_url : str
        The output url; passed unchanged to html_end.

    Raises
    ------
    SystemExit
        Always, through sys.exit(), after html_end returns.

    Example
    -------
    import datasense as ds
    ds.exit_script(
        original_stdout=original_stdout,
        output_url=output_url
    )
    """
    html_end(original_stdout=original_stdout, output_url=output_url)
    sys.exit()
示例#4
0
def main():
    """
    Prompt for an object name and write its help() text to the html file.

    The typed expression is evaluated to an object, and help() for that
    object is printed inside a <pre> element between the ds.html_begin and
    ds.html_end calls.
    """
    # NOTE(review): eval() executes whatever expression is typed at the
    # prompt. Acceptable only for trusted, interactive use; never feed this
    # from untrusted input.
    # The prompt is read before ds.html_begin is called; presumably because
    # html_begin redirects stdout -- confirm.
    input_value = eval(input(r'module.file.function name? > '))
    original_stdout = ds.html_begin(output_url=output_url,
                                    header_title=header_title,
                                    header_id=header_id)
    print('<pre style="white-space: pre-wrap;">')
    help(input_value)
    print('</pre>')
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Demonstrate ds.probability_plot with data from ds.random_data().

    The figure is saved as an svg file and passed to ds.html_figure.
    """
    graph_name = 'probability_plot_test.svg'
    stdout_saved = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    sample = ds.random_data()
    fig, ax = ds.probability_plot(data=sample)
    fig.savefig(fname=graph_name, format='svg')
    ds.html_figure(file_name=graph_name)
    ds.html_end(original_stdout=stdout_saved, output_url=output_url)
示例#6
0
def main():
    """
    Print the series returned by ds.datetime_data() inside a <pre> element.
    """
    stdout_saved = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    print('<pre style="white-space: pre-wrap;">')
    datetime_series = ds.datetime_data()
    # One print call; each item still ends up on its own line
    print('datetime series', datetime_series, '</pre>', sep='\n')
    ds.html_end(original_stdout=stdout_saved, output_url=output_url)
示例#7
0
def main():
    """
    Build the commits report and save the activity data as a csv file.

    The working directory is first changed to the directory of this file.
    The result of recent_activity() is passed to plot_recent_activity()
    and then written to 'activity.csv'.
    """
    # required for cron
    script_directory = Path(__file__).parent.resolve()
    chdir(script_directory)
    output_url = 'commits.html'
    header_title = 'Commits'
    header_id = 'commits'
    stdout_saved = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    recent = recent_activity()
    plot_recent_activity(recent)
    recent.to_csv('activity.csv')
    ds.html_end(original_stdout=stdout_saved, output_url=output_url)
示例#8
0
def main():
    """
    Write the help text of ds.plot_pareto and an example Pareto chart.

    The example plots five labelled values, saves the figure as
    'pareto.svg' and passes that file to ds.html_figure.
    """
    original_stdout = ds.html_begin(output_url=output_url,
                                    header_title=header_title,
                                    header_id=header_id)
    # help() prints the text itself and returns None; wrapping it in
    # print() would add a stray "None" line to the output.
    help(ds.plot_pareto)
    # Example 1
    data = pd.DataFrame({
        'ordinate': ['Mo', 'Larry', 'Curly', 'Shemp', 'Joe'],
        'abscissa': [21, 2, 10, 4, 16]
    })
    fig, ax1, ax2 = ds.plot_pareto(X=data['ordinate'], y=data['abscissa'])
    fig.savefig(fname='pareto.svg', format='svg')
    ds.html_figure(file_name='pareto.svg')
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
示例#9
0
def main():
    """
    Call xbar_chart and r_chart on the data from create_data().

    Each chart call is preceded by a page break; a timing summary is
    written after a final page break.
    """
    time_begin = time.time()
    stdout_saved = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    df = create_data()
    for chart in (xbar_chart, r_chart):
        ds.page_break()
        chart(df=df)
    time_end = time.time()
    ds.page_break()
    ds.report_summary(start_time=time_begin, stop_time=time_end)
    ds.html_end(original_stdout=stdout_saved, output_url=output_url)
示例#10
0
def main():
    """
    Plot each data file with its exponentially weighted mean.

    For every file, the ordinate column is smoothed with
    ewm(alpha=alpha_value).mean(), the raw and smoothed columns are plotted
    against the abscissa column, and the figure is saved as an svg file and
    passed to ds.html_figure. A summary is written at the end.

    All settings come from parameters(). figsize and date_time_parser are
    rebound at module level because of the global statement.
    """
    start_time = time.time()
    global figsize, date_time_parser
    # NOTE(review): function and parser are unpacked but not used in this
    # function.
    file_names, graph_file_names, abscissa_names, ordinate_names,\
        ordinate_predicted_names, x_axis_label, y_axis_label, axis_title,\
        figsize, column_names_sort, date_time_parser,\
        date_formatter, alpha_value, function, output_url,\
        header_title, header_id, parser = parameters()
    original_stdout = ds.html_begin(output_url=output_url,
                                    header_title=header_title,
                                    header_id=header_id)
    print('<pre style="white-space: pre-wrap;">')
    # NOTE(review): the loop targets date_time_parser, column_names_sort and
    # date_formatter reuse the names of the sequences given to zip(). zip()
    # receives the sequences before the first rebinding, so iteration works,
    # but afterwards these names (including the global date_time_parser)
    # refer to single items, not to the sequences.
    for (file_name, abscissa_name, ordinate_name, ordinate_predicted_name,
         date_time_parser, column_names_sort, date_formatter,
         graph_file_name) in zip(file_names, abscissa_names, ordinate_names,
                                 ordinate_predicted_names, date_time_parser,
                                 column_names_sort, date_formatter,
                                 graph_file_names):
        # The comparison is against the string 'None', not the None object;
        # only in the other case is the abscissa column parsed as dates.
        if date_time_parser == 'None':
            data = ds.read_file(file_name=file_name,
                                sort_columns=column_names_sort,
                                sort_columns_bool=True)
        else:
            data = ds.read_file(file_name=file_name,
                                parse_dates=[abscissa_name],
                                sort_columns=column_names_sort,
                                sort_columns_bool=True)
        # Exponentially weighted mean of the ordinate column
        data[ordinate_predicted_name] = data[ordinate_name]\
            .ewm(alpha=alpha_value).mean()
        fig, ax = ds.plot_scatter_line_x_y1_y2(
            X=data[abscissa_name],
            y1=data[ordinate_name],
            y2=data[ordinate_predicted_name],
            figsize=figsize)
        ax.set_title(label=axis_title, fontweight='bold')
        ax.set_xlabel(xlabel=x_axis_label, fontweight='bold')
        ax.set_ylabel(ylabel=y_axis_label, fontweight='bold')
        ds.despine(ax=ax)
        fig.savefig(fname=f'{graph_file_name}.svg', format='svg')
        ds.html_figure(file_name=f'{graph_file_name}.svg')
    ds.page_break()
    stop_time = time.time()
    ds.report_summary(start_time=start_time,
                      stop_time=stop_time,
                      read_file_names=file_names,
                      targets=ordinate_names,
                      features=abscissa_names)
    print('</pre>')
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Demonstrate ds.plot_scatter_y with default and explicit options.

    The same y series is plotted twice; each figure is saved as an svg
    file and passed to ds.html_figure.
    """
    stdout_saved = ds.html_begin(
        output_url=output_url, header_title=header_title, header_id=header_id
    )
    y_values = ds.random_data()
    # (graph file name, extra keyword arguments) for Example 1 and Example 2
    examples = (
        ("plot_scatter_y_test_1.svg", {}),
        (
            "plot_scatter_y_test_2.svg",
            {
                "figsize": (8, 4.5),
                "marker": "o",
                "markersize": 4,
                "colour": "#ee7733",
            },
        ),
    )
    for graph_name, options in examples:
        fig, ax = ds.plot_scatter_y(y=y_values, **options)
        fig.savefig(fname=graph_name, format="svg")
        ds.html_figure(file_name=graph_name)
    ds.html_end(original_stdout=stdout_saved, output_url=output_url)
示例#12
0
def main():
    """
    Convert the pdf files of a chosen directory to text files.

    Two directories are requested with ds.ask_directory_path: one to read
    from and one to save to. Every file returned by ds.directory_file_list
    for the '.pdf' / '.PDF' patterns goes through pdf_to_text and
    tidy_string, and the result is saved with save_to_file under the same
    stem with a '.txt' extension. The elapsed time and the names of the
    files read and saved are printed.
    """
    # Define parameters
    output_url = 'extract_text_from_pdf_file.html'
    header_title = 'Extract text from pdf file'
    header_id = 'extract-text-from-pdf-file'
    title_directory_in = 'Name of the directory to read in?'
    title_directory_out = 'Name of the directory to save as?'
    extension_in = ['.pdf', '.PDF']
    extension_out = '.txt'
    chdir(Path(__file__).parent.resolve())
    # Request directory to read
    source_directory = ds.ask_directory_path(
        title=title_directory_in,
        initialdir=Path.cwd()
    )
    # Request directory to save; the dialog starts one level above the
    # source directory
    target_directory = ds.ask_directory_path(
        title=title_directory_out,
        initialdir=Path(*source_directory.parts[:-1])
    )
    # Begin html output
    stdout_saved = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    time_begin = time.perf_counter()
    pdf_files = ds.directory_file_list(
        directory=source_directory,
        patterns=extension_in
    )
    # Process pdf, save txt, and collect the names for the summary
    names_read = []
    names_saved = []
    for pdf_file in pdf_files:
        pdf_path = Path(pdf_file)
        txt_name = f'{pdf_path.stem}{extension_out}'
        text = tidy_string(string=pdf_to_text(path=pdf_file))
        save_to_file(path=Path(target_directory, txt_name), string=text)
        names_read.append(pdf_path.name)
        names_saved.append(txt_name)
    time_end = time.perf_counter()
    ds.report_summary(
        start_time=time_begin,
        stop_time=time_end,
        print_heading=False
    )
    ds.print_list_by_item(list=names_read, title='Files read:')
    ds.print_list_by_item(list=names_saved, title='Files saved:')
    # End html output
    ds.html_end(original_stdout=stdout_saved, output_url=output_url)
示例#13
0
def main():
    """
    Demonstrate ds.plot_line_y with default and explicit options.

    The same y series is plotted twice; each figure is saved as an svg
    file and passed to ds.html_figure.
    """
    stdout_saved = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    y_values = ds.random_data()
    # (graph file name, extra keyword arguments) for Example 1 and Example 2
    examples = (
        ('plot_line_y_test_1.svg', {}),
        (
            'plot_line_y_test_2.svg',
            {
                'figsize': (8, 4.5),
                'marker': 'o',
                'markersize': 4,
                'linestyle': ':',
                'colour': '#ee7733',
            },
        ),
    )
    for graph_name, options in examples:
        fig, ax = ds.plot_line_y(y=y_values, **options)
        fig.savefig(fname=graph_name, format='svg')
        ds.html_figure(file_name=graph_name)
    ds.html_end(original_stdout=stdout_saved, output_url=output_url)
示例#14
0
def main():
    """
    Create an X chart and an mR chart from random data and report limits.

    A one-column dataframe 'X' is built from ds.random_data(). For each of
    cc.X and cc.mR a figure is drawn, saved, passed to ds.html_figure, and
    a text report with UCL, mean, LCL and sigma is printed. A timing
    summary follows after a page break.
    """
    time_begin = time.time()
    stdout_saved = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    sample = ds.random_data(distribution='norm', size=42, loc=69, scale=13)
    df = pd.DataFrame(data=sample, columns=['X'])
    # Create X control chart
    ds.page_break()
    fig = plt.figure(figsize=figsize)
    x_chart = cc.X(data=df)
    x_chart.ax(fig)
    fig.savefig(fname=graph_x_file_name)
    ds.html_figure(file_name=graph_x_file_name)
    x_report = (
        'X Report\n'
        '============\n'
        f'UCL        : {x_chart.ucl.round(3)}\n'
        f'Xbar       : {x_chart.mean.round(3)}\n'
        f'LCL        : {x_chart.lcl.round(3)}\n'
        f'Sigma(X)   : {x_chart.sigma.round(3)}\n'
    )
    print(x_report)
    # Create mR control chart
    fig = plt.figure(figsize=figsize)
    mr_chart = cc.mR(data=df)
    mr_chart.ax(fig)
    fig.savefig(fname=graph_mr_file_name)
    ds.html_figure(file_name=graph_mr_file_name)
    # The LCL is rounded with the built-in round(), unlike the other values
    mr_report = (
        'mR Report\n'
        '============\n'
        f'UCL        : {mr_chart.ucl.round(3)}\n'
        f'mRbar      : {mr_chart.mean.round(3)}\n'
        f'LCL        : {round(mr_chart.lcl, 3)}\n'
        f'Sigma(mR)  : {mr_chart.sigma.round(3)}\n'
    )
    print(mr_report)
    time_end = time.time()
    ds.page_break()
    ds.report_summary(start_time=time_begin, stop_time=time_end)
    ds.html_end(original_stdout=stdout_saved, output_url=output_url)
def main():
    """
    Run plot_scatter_line for every file and every number of knots.

    For each (file_name, target, feature) triple the file is read, missing
    target values are replaced by the target mean, and plot_scatter_line is
    called through a Pool once per entry of number_knots. One svg file name
    per knot is then passed to ds.html_figure. A summary is written at the
    end.

    All settings come from parameters(). figsize, axis_title, x_axis_label,
    y_axis_label and graphics_directory are rebound at module level because
    of the global statement.
    """
    start_time = time.time()
    global figsize, axis_title, x_axis_label, y_axis_label,\
        graphics_directory
    # NOTE(review): date_parser is unpacked but not used in this function.
    file_names, targets, features, number_knots, graphics_directory,\
        figsize, x_axis_label, y_axis_label, axis_title,\
        date_parser, output_url, header_title, header_id = parameters()
    ds.create_directory(directories=graphics_directory)
    original_stdout = ds.html_begin(output_url=output_url,
                                    header_title=header_title,
                                    header_id=header_id)
    ds.page_break()
    for file_name, target, feature in zip(file_names, targets, features):
        # NOTE(review): parse_dates receives the whole features sequence,
        # not only the current feature -- confirm this is intended.
        data = ds.read_file(file_name=file_name, parse_dates=features)
        # Replace missing target values with the mean of the target column
        data[target] = data[target].fillna(data[target].mean())
        dates = True
        X = pd.to_numeric(data[feature])
        y = data[target]
        # One argument tuple per number of knots
        t = ((X, y, file_name, target, feature, knot, dates)
             for knot in number_knots)
        # Presumably multiprocessing.Pool -- confirm. The results of
        # plot_scatter_line are discarded; the loop only drains the iterator.
        with Pool() as pool:
            for _ in pool.imap_unordered(plot_scatter_line, t):
                pass
        # NOTE(review): str.strip(".csv") removes any leading and trailing
        # characters from the set {'.', 'c', 's', 'v'}, not the ".csv"
        # suffix; e.g. 'sales.csv' becomes 'ale'. plot_scatter_line
        # presumably builds the saved file name the same way, so change both
        # places together (e.g. to Path(file_name).stem).
        for knot in number_knots:
            ds.html_figure(file_name=f'{graphics_directory}/'
                           f'spline_{file_name.strip(".csv")}_'
                           f'{target}_{feature}_{knot}.svg')
    stop_time = time.time()
    ds.page_break()
    ds.report_summary(start_time=start_time,
                      stop_time=stop_time,
                      read_file_names=file_names,
                      targets=targets,
                      features=features,
                      number_knots=number_knots)
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Demonstrate ds.plot_scatter_scatter_x1_x2_y1_y2 in four variations.

    Examples 1 and 2 use X series from ds.datetime_data(); examples 3 and
    4 use sorted 'uniform' random X series with explicit markers, colours
    and legend labels. Examples 2 and 4 add natural cubic spline smoothing
    with seven knots. Each figure is saved as an svg file and passed to
    ds.html_figure.
    """
    plot = ds.plot_scatter_scatter_x1_x2_y1_y2
    smoothing_options = {
        'smoothing': 'natural_cubic_spline',
        'number_knots': 7,
    }
    marker_options = {
        'figsize': (8, 5),
        'marker1': 'o',
        'marker2': '+',
        'markersize1': 8,
        'markersize2': 12,
        'colour1': '#cc3311',
        'colour2': '#ee3377',
        'labellegendy1': 'y1',
        'labellegendy2': 'y2',
    }
    stdout_saved = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    # Example 1
    graph_name = 'plot_scatter_scatter_x1_x2_y1_y2_datex_test.svg'
    x1_values = ds.datetime_data()
    x2_values = ds.datetime_data()
    y1_values = ds.random_data()
    y2_values = ds.random_data()
    fig, ax = plot(X1=x1_values, X2=x2_values, y1=y1_values, y2=y2_values)
    fig.savefig(fname=graph_name, format='svg')
    ds.html_figure(file_name=graph_name)
    # Example 2
    graph_name = (
        'plot_scatter_scatter_x1_x2_y1_y2_'
        'datex_smoothing_y1_y2_test.svg'
    )
    fig, ax = plot(
        X1=x1_values,
        X2=x2_values,
        y1=y1_values,
        y2=y2_values,
        **smoothing_options
    )
    fig.savefig(fname=graph_name, format='svg')
    ds.html_figure(file_name=graph_name)
    # Example 3
    graph_name = 'plot_scatter_scatter_x1_x2_y1_y2_test.svg'
    x1_values = ds.random_data(distribution='uniform').sort_values()
    x2_values = ds.random_data(distribution='uniform').sort_values()
    fig, ax = plot(
        X1=x1_values,
        X2=x2_values,
        y1=y1_values,
        y2=y2_values,
        **marker_options
    )
    ax.legend(frameon=False)
    fig.savefig(fname=graph_name, format='svg')
    ds.html_figure(file_name=graph_name)
    # Example 4
    graph_name = 'plot_scatter_scatter_x1_x2_y1_y2_smoothing_y1_y2_test.svg'
    fig, ax = plot(
        X1=x1_values,
        X2=x2_values,
        y1=y1_values,
        y2=y2_values,
        **marker_options,
        **smoothing_options
    )
    ax.legend(frameon=False)
    fig.savefig(fname=graph_name, format='svg')
    ds.html_figure(file_name=graph_name)
    ds.html_end(original_stdout=stdout_saved, output_url=output_url)
示例#17
0
def main():
    """
    Print several equivalent ways of creating a pandas dataframe.

    Three lists (columns 'A', 'B', 'C') and the dictionaries built from
    them are printed first. Then a dataframe from ds.create_dataframe() is
    shown, followed by six dataframes built from the lists or from literal
    data, each cast with astype() to Int64, float64 and str.
    """
    def show(title, frame):
        # Print a title, the dataframe, its dtypes, and a blank line.
        print(title)
        print(frame)
        print(frame.dtypes)
        print()

    header_title = 'Create dataframe'
    header_id = 'create-dataframe'
    output_url = 'create_dataframe.html'
    stdout_saved = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    print('Create a pandas dataframe in different ways')
    print()
    print('Create lists and dictionaries to use in the dataframe')
    print()
    integers = [1, 2, np.nan, 4, 5]
    print('list_1 of Int64')
    print(integers)
    integers_dict = {'A': integers}
    print('dict_1')
    print(integers_dict)
    floats = [6.0, np.nan, 8.0, 9.0, 10.0]
    print()
    print('list_2 of float64')
    print(floats)
    floats_dict = {'B': floats}
    print('dict_2:')
    print(floats_dict)
    strings = ['a', 'b', 'c', '', 'e']
    print()
    print('list_3 of str')
    print(strings)
    strings_dict = {'C': strings}
    print('dict_3:')
    print(strings_dict)
    nested_lists = [integers, floats, strings]
    print()
    print('list of lists:')
    print(nested_lists)
    columns_dict = {**integers_dict, **floats_dict, **strings_dict}
    print()
    print('dict_of_lists:')
    print(columns_dict)
    print()
    column_types = {'A': 'Int64', 'B': 'float64', 'C': 'str'}
    # Method zero
    print('Method zero. Use ds.create_dataframe()')
    df0 = ds.create_dataframe()
    print(df0.head(10))
    print(df0.dtypes)
    print()
    print('dtype of column cs:', df0['cs'].dtype)
    print()
    # Method one: merge three one-key dictionaries
    df1 = pd.DataFrame(
        data={**{'A': integers}, **{'B': floats}, **{'C': strings}}
    ).astype(dtype=column_types)
    show('Method one', df1)
    # Method two: the merged dictionary; the dataframe is printed twice
    df2 = pd.DataFrame(data=columns_dict).astype(dtype=column_types)
    print('Method two')
    print(df2)
    print(df2.dtypes)
    show('df2:', df2)
    # Method three: literal dictionary inside the constructor call
    df3 = pd.DataFrame(
        data={
            'A': [1, 2, np.nan, 4, 5],
            'B': [6.0, np.nan, 8.0, 9.0, 10.0],
            'C': ['a', 'b', 'c', '', 'e']
        }
    ).astype(dtype=column_types)
    show('Method three', df3)
    # Method four: literal dictionary bound to a name first
    columns_dict = {
        'A': [1, 2, np.nan, 4, 5],
        'B': [6.0, np.nan, 8.0, 9.0, 10.0],
        'C': ['a', 'b', 'c', '', 'e']
    }
    df4 = pd.DataFrame(data=columns_dict).astype(dtype=column_types)
    show('Method four', df4)
    # Method five: same dictionary as method four
    df5 = pd.DataFrame(data=columns_dict).astype(dtype=column_types)
    show('Method five', df5)
    # Method six: dictionary of the three lists
    columns_dict = {'A': integers, 'B': floats, 'C': strings}
    df6 = pd.DataFrame(data=columns_dict).astype(dtype=column_types)
    show('Method six', df6)
    ds.html_end(original_stdout=stdout_saved, output_url=output_url)
def main():
    """
    Demonstrate ds.read_file and ds.save_file with csv, ods and xlsx files.

    A dataframe from ds.create_dataframe() is saved as 'myfile.csv' and
    read back in four ways (no options, parse_dates, dtype dictionary, and
    the full set of column options). It is then saved and read as
    'myfile.ods' and as 'myfile.xlsx'. The help text of ds.read_file and
    ds.save_file, the dtypes, and ds.dataframe_info reports are printed
    along the way, followed by a timing summary.
    """
    start_time = time.time()
    pd.options.display.max_columns = 100
    pd.options.display.max_rows = 100
    pd.options.display.width = 120
    file_name = 'myfile.csv'
    original_stdout = ds.html_begin(
        output_url=output_url,
        header_title=header_title,
        header_id=header_id
    )
    help(ds.read_file)
    print()
    print('Create dataframe')
    df = ds.create_dataframe()
    print(df.head())
    print(df.columns)
    print(df.dtypes)
    ds.dataframe_info(
        df=df,
        file_in=file_name
    )
    help(ds.save_file)
    print()
    ds.save_file(
        df=df,
        file_name=file_name
    )
    # Example 1
    # Read a csv file. There is no guarantee the column dtypes will be correct.
    print('Example 1. Only [a, i, s, x, z] have the correct dtypes.')
    df = ds.read_file(file_name=file_name)
    print(df.dtypes)
    print()
    # Example 2
    # Read a csv file. Ensure the dtypes of datetime columns.
    print('Example 2. Ensure the dtypes of datetime columns.')
    parse_dates = ['t', 'u']
    df = ds.read_file(
        file_name=file_name,
        parse_dates=parse_dates
    )
    print(df.dtypes)
    print()
    # Example 3
    # Read a csv file. Ensure the dtypes of columns; not timedelta, datetime.
    print('Example 3. Ensure the dtypes of cols; not timedelta, datetime.')
    convert_dict = {
        'a': 'float64',
        'b': 'boolean',
        'c': 'category',
        'i': 'float64',
        'r': 'str',
        's': 'str',
        'x': 'float64',
        'y': 'Int64',
        'z': 'float64'
    }
    df = ds.read_file(
        file_name=file_name,
        dtype=convert_dict
    )
    print(df.dtypes)
    print()
    # Example 4
    # Read a csv file. Ensure the dtypes of columns. Rename the columns.
    print(
        'Example 4. Ensure the column dtypes are correct. Rename the columns.'
    )
    column_names_dict = {
        'a': 'A',
        'b': 'B',
        'c': 'C',
        'd': 'D',
        'i': 'I',
        'r': 'R',
        's': 'S',
        't': 'T',
        'u': 'U',
        'x': 'X',
        'y': 'Y',
        'z': 'Z'
    }
    index_columns = ['Y']
    parse_dates = ['t', 'u']
    time_delta_columns = ['D']
    category_columns = ['C']
    integer_columns = ['A', 'I']
    float_columns = ['X']
    boolean_columns = ['R']
    object_columns = ['Z']
    sort_columns = ['I', 'A']
    sort_columns_bool = [True, False]
    data = ds.read_file(
        file_name=file_name,
        column_names_dict=column_names_dict,
        index_columns=index_columns,
        parse_dates=parse_dates,
        date_parser=date_parser(),
        time_delta_columns=time_delta_columns,
        category_columns=category_columns,
        integer_columns=integer_columns,
        float_columns=float_columns,
        boolean_columns=boolean_columns,
        object_columns=object_columns,
        sort_columns=sort_columns,
        sort_columns_bool=sort_columns_bool
    )
    print(data.head(10))
    print()
    print('column dtypes')
    print(data.dtypes)
    # DataFrame.info() prints its report itself and returns None, so it is
    # not wrapped in print().
    data.info(verbose=True)
    print()
    print('index', data.index.name, 'dtype:', data.index.dtype)
    print()
    ds.dataframe_info(
        df=data,
        file_in=file_name
    )
    # Example 5
    # Read an ods file.
    file_name = 'myfile.ods'
    df = ds.create_dataframe()
    ds.save_file(
        df=df,
        file_name=file_name
    )
    parse_dates = ['t', 'u']
    df = ds.read_file(
        file_name=file_name,
        parse_dates=parse_dates
    )
    print(
        'Example 5. Read an ods file.'
    )
    # Report on df, the dataframe just read from the ods file, not on the
    # csv-based dataframe of Example 4.
    print(df.head(10))
    print()
    print('column dtypes')
    print(df.dtypes)
    df.info(verbose=True)
    print()
    ds.dataframe_info(
        df=df,
        file_in=file_name
    )
    # Example 6
    # Read an xlsx file.
    # The ods file is read once more, without options, before file_name is
    # switched to the xlsx file.
    df = ds.read_file(file_name=file_name)
    file_name = 'myfile.xlsx'
    sheet_name = 'raw_data'
    ds.save_file(
        df=df,
        file_name=file_name,
        sheet_name=sheet_name
    )
    df = ds.read_file(
        file_name=file_name,
        sheet_name=sheet_name
    )
    ds.dataframe_info(
        df=df,
        file_in=file_name
    )
    stop_time = time.time()
    ds.page_break()
    # At this point file_name is 'myfile.xlsx'
    ds.report_summary(
        start_time=start_time,
        stop_time=stop_time,
        read_file_names=file_name,
        save_file_names=file_name
    )
    ds.html_end(
        original_stdout=original_stdout,
        output_url=output_url
    )
示例#19
0
def main():
    """
    Print a series of pivot tables built from 'sales_funnel.xlsx'.

    The file is read with ds.read_file and summarized with
    ds.dataframe_info. Pivot tables of increasing complexity are then
    printed: different index levels, aggregation functions, columns,
    fill values, margins, sorting, and query() filters.

    Aggregations are given as the strings 'sum' and 'mean'. They produce
    the same column labels as np.sum and np.mean; recent pandas versions
    warn when the NumPy functions are passed instead.
    """
    pd.options.display.width = 220
    pd.options.display.max_columns = 220
    pd.options.display.max_rows = 220
    output_url = 'pivot_tables.html'
    header_title = 'Pivot tables'
    header_id = 'pivot-tables'
    original_stdout = ds.html_begin(output_url=output_url,
                                    header_title=header_title,
                                    header_id=header_id)
    df = ds.read_file(file_name='sales_funnel.xlsx')
    ds.dataframe_info(df=df, file_in='sales_funnel.xlsx')

    def manager_status_by_product():
        # Unrounded pivot shared by the last six examples: Manager/Status
        # rows, Product columns, len of Quantity, sum and mean of Price.
        return pd.pivot_table(
            data=df,
            values=['Price', 'Quantity'],
            index=['Manager', 'Status'],
            columns=['Product'],
            aggfunc={'Quantity': len, 'Price': ['sum', 'mean']},
            fill_value=0
        )

    print(df.head())
    print()
    print('Pivot table, implicit parameters')
    print(
        pd.pivot_table(data=df, values=['Price'], index=['Manager']).round(2))
    print()
    print('Pivot table, explicit parameters')
    print(
        pd.pivot_table(data=df,
                       values=['Price'],
                       index=['Manager'],
                       aggfunc='mean').round(2))
    print()
    print('Pivot table, multiple-level index')
    print(
        pd.pivot_table(data=df,
                       values=['Price'],
                       index=['Manager', 'Rep'],
                       aggfunc='mean').round(2))
    print()
    print('Pivot table, multi-parameter aggfunc')
    print(
        pd.pivot_table(data=df,
                       values=['Price'],
                       index=['Manager', 'Rep'],
                       aggfunc={
                           'Price': ['mean', 'sum', len]
                       }).round(2))
    print()
    print('Pivot table, columns parameter is optional')
    print(
        pd.pivot_table(data=df,
                       values=['Price'],
                       index=['Manager', 'Rep'],
                       columns=['Product'],
                       aggfunc=['sum']).round(2))
    print()
    print('Pivot table, replace NaN with 0')
    print(
        pd.pivot_table(data=df,
                       values=['Price'],
                       index=['Manager', 'Rep'],
                       columns=['Product'],
                       aggfunc=['sum'],
                       fill_value=0).round(2))
    print()
    print('Pivot table, add second column to values parameter')
    print(
        pd.pivot_table(data=df,
                       values=['Price', 'Quantity'],
                       index=['Manager', 'Rep'],
                       columns=['Product'],
                       aggfunc=['sum'],
                       fill_value=0).round(2))
    print()
    print('Pivot table, product column moved to the index')
    print(
        pd.pivot_table(data=df,
                       values=['Price', 'Quantity'],
                       index=['Manager', 'Rep', 'Product'],
                       aggfunc=['sum'],
                       fill_value=0).round(2))
    print()
    print('Pivot table, show totals')
    print(
        pd.pivot_table(data=df,
                       values=['Price', 'Quantity'],
                       index=['Manager', 'Rep', 'Product'],
                       aggfunc=['sum'],
                       fill_value=0,
                       margins=True).round(2))
    print()
    print('Pivot table, change categories')
    print(
        pd.pivot_table(data=df,
                       values=['Price', 'Quantity'],
                       index=['Manager', 'Status'],
                       aggfunc=['sum'],
                       fill_value=0,
                       margins=True).round(2))
    print()
    print('Pivot table, pass dictionary to aggfunc')
    print(
        pd.pivot_table(data=df,
                       values=['Price', 'Quantity'],
                       index=['Manager', 'Status'],
                       columns=['Product'],
                       aggfunc={
                           'Quantity': len,
                           'Price': 'sum'
                       },
                       fill_value=0,
                       margins=True).round(2))
    print()
    print('Pivot table, pass dictionary to aggfunc')
    print(manager_status_by_product().round(2))
    print()
    print('Pivot table, save to variable')
    table = manager_status_by_product().round(2)
    print(table)
    print()
    print('Pivot table, sort on price, mean CPU')
    table = table.sort_values(by=('Price', 'mean', 'CPU'), ascending=False)
    print(table)
    print()
    print('Pivot table, filter for one manager')
    table = table.query('Manager == ["Debra Henley"]')
    print(table)
    print()
    print('Pivot table, sort and filter with multiple "dots"')
    table = manager_status_by_product()\
        .sort_values(by=('Price', 'mean', 'CPU'), ascending=False)\
        .query('Manager == ["Debra Henley"]')\
        .round(2)
    print(table)
    print()
    print('Pivot table, another query')
    table = manager_status_by_product()\
        .query('Status == ["pending", "won"]')\
        .round(2)
    print(table)
    print()
    print('Pivot table, another query')
    table = manager_status_by_product()\
        .query('Status == ["pending", "won"]')\
        .query('Manager == ["Debra Henley"]')\
        .round(2)
    print(table)
    print()
    print('Pivot table, another query')
    table = manager_status_by_product()\
        .query('Status == ["pending", "won"] & Manager == ["Debra Henley"]')\
        .round(2)
    print(table)
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Plot the Broad Street cholera data as two scatter series in an HTML page.

    The pump and death coordinates are hard-coded below. They are loaded
    into two DataFrames, drawn with ds.plot_scatter_scatter_x1_x2_y1_y2,
    saved as an svg file, and that file is embedded in the HTML output
    started by ds.html_begin and finished by ds.html_end.

    NOTE(review): colour1 and colour2 are not assigned in this function;
    presumably they are module-level names. Confirm they exist.
    """
    # x, y coordinates plotted with the legend label 'Pumps'
    data_pumps = {
        'x':
        [8.7, 11, 13.4, 14.9, 8.7, 8.9, 12.6, 10.7, 13.5, 16.4, 18.9, 16, 9],
        'y':
        [17.9, 18.5, 17.4, 17.8, 14.9, 12.8, 11.7, 7.4, 8, 9.3, 9.7, 5, 5.1]
    }
    # x, y coordinates plotted with the legend label 'Deaths'
    data_deaths = {
        'x': [
            13.6, 9.9, 14.7, 15.2, 13.2, 13.8, 13.1, 11, 15.2, 11.1, 11.7,
            12.3, 10.6, 14.6, 16.6, 9.5, 13.3, 15, 15.1, 10.9, 12.5, 11.8,
            12.2, 13.9, 12.3, 11, 11, 13.5, 10.8, 12.2, 13.9, 12.5, 15.7, 12.9,
            13, 13.7, 13.1, 13.4, 14.8, 13.2, 9.8, 12.5, 13.4, 14.4, 16, 10.9,
            12.5, 15.8, 16.5, 11.2, 15.8, 11, 11.7, 11.5, 11.8, 13, 14.1, 14.8,
            12.6, 14.6, 12.5, 14.5, 9.2, 17.9, 11.2, 9.5, 10.8, 16.1, 10.4,
            13.7, 15.8, 12.2, 11.5, 15.4, 15.9, 10.2, 14, 16.5, 17.5, 13.8,
            14.1, 14.7, 12.6, 11.7, 14.4, 15.2, 15.8, 13.9, 15.2, 13.2, 12.7,
            15.1, 12.8, 13.6, 14, 15.4, 14.8, 10.3, 10.5, 9.9, 14.2, 13.6, 15,
            15.7, 9.2, 16.8, 13.3, 10.6, 8.3, 13.4, 14.5, 16.8, 14.7, 10.3,
            13.2, 9.1, 15.2, 8.4, 15.3, 13.3, 13.2, 12.6, 13.9, 13.5, 15.2,
            11.6, 9.4, 11.4, 11.2, 11.9, 15, 13.6, 14.1, 10.5, 10.3, 13.8,
            13.4, 12.6, 15.7, 13.8, 13.2, 15.1, 13.2, 14.6, 14.4, 15, 13.1,
            13.4, 11.8, 16, 10.6, 14.6, 9.3, 14.5, 10.5, 14, 10.6, 10.6, 15.7,
            11.8, 11.8, 11.6, 15.8, 10.7, 13.4, 11.2, 15.5, 14, 13.4, 14.6,
            15.3, 14.5, 11.4, 12.5, 13.8, 10.6, 10.5, 14.8, 14.2, 13.3, 10.4,
            12.5, 15, 10.6, 13.1, 10.1, 11.2, 12.9, 12.2, 12.4, 11.9, 11.5,
            11.7, 12.5, 14.8, 14, 12.5, 9.2, 10.4, 12.7, 9.1, 8.3, 15.3, 11.2,
            11.8, 12.7, 11.8, 12.2, 12.7, 13.6, 8.8, 12.2, 16.2, 12.8, 13.6,
            12.7, 15, 13.9, 13.6, 12.5, 9.9, 17.6, 11.1, 16.7, 9.7, 13.4, 13.1,
            13.7, 11.7, 13.1, 13.1, 12.3, 14.9, 11.5, 12.9, 11, 15.8, 12.9,
            12.3, 9.8, 12.7, 12.7, 16, 11, 14.1, 11, 14.5, 15.6, 9.6, 15.5,
            14.4, 11.4, 13.7, 11.4, 16, 9.9, 10.8, 12.6, 9.3, 13.8, 13.8, 14.7,
            15.3, 14.1, 13.8, 11, 12.4, 14.8, 15, 15.4, 12.8, 11.3, 10.5, 11,
            13.4, 10.3, 9.6, 14.5, 11.5, 12.7, 15, 12.3, 12.9, 15.3, 15.8,
            13.1, 12.9, 15.3, 13, 12.5, 15.8, 14, 13, 13.9, 10.9, 16, 15.1,
            13.5, 11.6, 10.7, 16.8, 13.9, 13.7, 13.3, 15.7, 15.5, 11, 15.5,
            11.7, 12.4, 11.1, 9.6, 8.3, 14, 13.2, 15.3, 15.5, 15.4, 16.1, 12.4,
            12.2, 15.1, 14, 9.6, 16.3, 12.6, 10.5, 14, 15.5, 11, 13.3, 13.9,
            13.3, 12.7, 15.4, 15.7, 13.8, 11.7, 16.1, 12.6, 15.4, 16.3, 12.5,
            15.7, 11.5, 14.2, 16.3, 13.6, 12.6, 11.7, 8.8, 10.9, 11.6, 13,
            13.7, 11.1, 14.6, 10.4, 13.1, 11.9, 13.3, 9.9, 12.7, 9.3, 11.6,
            13.1, 11.4, 13.6, 11.8, 16.3, 8.7, 15.7, 12.2, 14.8, 13.5, 15.7,
            11.9, 12.5, 14.4, 12.2, 15.3, 11.5, 9, 10.8, 14.9, 13.2, 13.4,
            14.4, 12.2, 12, 13.6, 8.8, 13.4, 12.6, 15.6, 12.5, 12.4, 12.4,
            13.4, 12, 9.9, 12.1, 14.4, 15.7, 13.8, 11.4, 13.8, 15.6, 15, 14.1,
            13.2, 13.3, 10.9, 12.1, 14, 11.3, 13.4, 13.5, 14.8, 13, 12.4, 12.3,
            13.1, 14.8, 9.8, 14.1, 12, 16.1, 12.7, 14.7, 8.8, 10.5, 12.2, 11.7,
            8.3, 12.4, 16.7, 15.7, 16.1, 14.3, 9.7, 13.3, 16.1, 14.4, 13.4,
            14.5, 14.6, 13.7, 16.2, 16.3, 12.8, 13.4, 11.2, 11.6, 14.7, 9.4,
            9.5, 15.7, 14.3, 14.2, 11.5, 15, 13.1, 11, 11.9, 13.5, 14.1, 10.9,
            14.2, 10.4, 15.6, 11.3, 13, 13.6, 11.1, 14.6, 15.2, 13.4, 16.1,
            14.3, 16.4, 12.6, 13.3, 11, 10.1, 15.6, 14.2, 13.4, 10.8, 13.2,
            13.8, 11.4, 12.9, 13.6, 13.3, 12.2, 14.4, 8.9, 15.5, 14.3, 10,
            13.2, 10.5, 9.5, 10.4, 10.9, 13.3, 12.6, 13.1, 11.6, 13.6, 13.4,
            13.7, 10.7, 12.2, 14, 14.8, 15.1, 16.2, 15.3, 12.5, 14.7, 12.5,
            10.9, 15.7, 11.7, 9.5, 13.3, 13.6, 13.4, 13.5, 11.1, 12.9, 12.8,
            15.3, 12.8, 9.2, 14.4, 15.6, 10.4, 13, 13.2, 11, 13.5, 9.6, 14.5,
            12.5, 10.7, 13.3, 9.4, 13.4, 13.5, 13.5, 13.6, 14.3, 9.6, 14.1,
            12.9, 15.4, 13.8, 11.4, 12.4, 12.7, 13.6, 14.1, 15.1, 15.3, 14.7,
            13.4, 12.3, 14.3, 12.4, 12.1, 12.4, 15.1, 17.3, 12.4, 15
        ],
        'y': [
            11.1, 12.6, 10.2, 10, 13, 8.9, 10.6, 11.9, 11.7, 9.6, 13.6, 11.5,
            11.9, 10.6, 14.3, 10.7, 10.7, 10.2, 10, 9.8, 12, 11.8, 10.4, 12.8,
            11.9, 11.9, 9.8, 13.3, 11.7, 13.6, 14, 11.6, 12.7, 9.9, 10.2, 11.4,
            11.1, 11.1, 9.4, 13.2, 12.5, 12, 10.2, 11.6, 14.2, 12, 13.4, 12.4,
            14.3, 8.6, 12.2, 9.8, 13.6, 12.3, 15.1, 13.9, 13.1, 10, 11, 12.9,
            11.2, 8.7, 10.8, 7.2, 14.7, 10.7, 9.9, 14.1, 10.6, 9, 13.9, 11.8,
            10.7, 11.2, 12.2, 11.9, 12.8, 11.4, 11.2, 8.9, 10.6, 11.5, 13.4,
            10.4, 12.6, 17, 13.9, 12.7, 11.6, 12.9, 11.3, 13.2, 11.6, 13.2,
            13.1, 13.3, 14.9, 11.4, 11.6, 12.3, 9.2, 12.5, 9.8, 12.4, 12, 11.4,
            10.3, 11.6, 7.2, 13, 8.7, 11.4, 11.9, 11.4, 11.2, 13.2, 10, 7.4,
            9.8, 12.4, 9.7, 11.5, 11, 12.5, 17, 11.1, 10.8, 9.6, 14.8, 10.2,
            10.2, 11.5, 13.6, 11.2, 9.5, 12.7, 12.4, 11, 13.9, 8.9, 12.3, 10.1,
            10.5, 11.4, 10.3, 14.1, 10.6, 8.8, 9.5, 14.4, 10.9, 10.6, 12.2,
            11.5, 12.3, 12.8, 11.6, 11.7, 6.1, 11.2, 10.3, 11.1, 14, 11.7,
            13.3, 11.5, 12.8, 10.7, 12.5, 16.2, 13.4, 8.8, 9.9, 11.2, 11.2, 11,
            11, 10.8, 12, 12, 11.8, 10.6, 12, 9.9, 11.1, 11.5, 11.4, 11.7, 10,
            11.3, 10.2, 11.4, 13.6, 11.5, 12.3, 12.7, 13.4, 12.3, 11.3, 9.2,
            6.3, 7.1, 13.6, 11.4, 11.2, 11.3, 11.2, 13.6, 11.3, 13.4, 15.1,
            11.3, 14.3, 11.6, 13.4, 10.2, 11.9, 12.7, 9.1, 13, 12.4, 7.3, 11.1,
            14.4, 11, 13, 10.6, 10.1, 11.1, 11.8, 11.8, 11.9, 12.8, 12.6, 11.6,
            11.2, 14, 10.3, 11.5, 11.8, 11.3, 12.1, 9.2, 11.9, 9.3, 11.7, 8.7,
            11.3, 11.8, 11.1, 12.2, 9.9, 11.3, 11, 9.1, 11.9, 9.7, 11.6, 10.7,
            13.8, 13.8, 10, 10, 12.4, 10.1, 11.3, 10.5, 10, 13.6, 13.5, 14.3,
            9.8, 11.8, 11.2, 9.5, 11.4, 11.8, 12.3, 11.4, 10.7, 12, 14.8, 10.8,
            9.1, 9.6, 13.9, 11.7, 12.1, 11.8, 11.2, 13, 13.4, 14.4, 14.6, 9.7,
            14, 10.1, 12.6, 12.3, 12, 11.6, 14.6, 11.1, 9.6, 12.4, 11.2, 11.2,
            11.1, 9.6, 11.4, 11.5, 10.9, 7.2, 13.3, 13, 9.3, 11.2, 7.3, 14.3,
            13.7, 11.8, 11.7, 12.9, 10.9, 14.3, 10.6, 12.9, 12.8, 11.3, 11.2,
            12.9, 13.6, 9.6, 10.7, 11.2, 13.9, 11.1, 10.5, 14.3, 11, 11.3,
            14.4, 12.1, 12.9, 12.6, 12.1, 10.1, 11.2, 10.4, 11.2, 10.9, 14.7,
            11.1, 11.2, 11.3, 11.8, 12.3, 11.6, 12.3, 14.2, 10.3, 12.2, 12.8,
            10.6, 9.5, 12.3, 11, 11.5, 13.6, 14.4, 12.1, 12.2, 11.4, 14.1, 8.9,
            12.9, 14.2, 12.8, 13.1, 14.1, 11.4, 10.8, 11.2, 8.8, 10.1, 11.9,
            8.8, 12, 11.4, 14.6, 11.6, 12, 11, 12.8, 14.2, 12, 11.3, 11.4,
            12.6, 11.4, 11.8, 13.6, 13.1, 12.9, 11.1, 10.9, 14.2, 11.3, 10.2,
            13.1, 9.8, 13, 12, 10, 13.9, 10.8, 13, 12.5, 12, 11, 11.3, 14.8,
            12.4, 13.2, 12.5, 10.6, 14.6, 14.2, 11.3, 10.1, 12.1, 12.3, 14.2,
            13.6, 11.6, 13.2, 8.4, 7.2, 14.3, 10.5, 12.4, 12.9, 10, 10.4, 10.9,
            9, 11.4, 13.9, 14.2, 14.4, 14.3, 11.9, 8.6, 12.2, 11.9, 10.8, 10.7,
            11.6, 12.5, 13.4, 10.4, 10.4, 12.4, 12, 14.2, 8.9, 13.1, 16.5, 9.4,
            11.2, 11.3, 11.5, 9.6, 13.1, 11.6, 11.4, 8.6, 10.4, 10.9, 10.4,
            14.4, 15, 10.8, 12.1, 10.6, 12.6, 13.4, 12.5, 11.6, 10.6, 12.6, 11,
            9.8, 11.1, 14.3, 11.4, 10.3, 12.1, 14.2, 10.4, 11.3, 14.3, 12.5,
            10.6, 11.2, 14.6, 11, 11, 14.2, 10.5, 10.1, 13.1, 11.1, 12.5, 14.1,
            13.3, 9.2, 10.1, 9.9, 11.4, 11.1, 11.4, 11.1, 14.7, 12.9, 12.6,
            11.5, 14.8, 15.3, 10.3, 12.5, 11.1, 10.9, 16, 9.8, 11.3, 11.1, 12,
            10.9, 11.2, 10.7, 11.9, 9.7, 11, 10.6, 10.6, 11.1, 12.5, 12.9, 11,
            11.9, 11.1, 11, 9.1, 15.6, 10.6, 15.5, 13.8, 13.8, 12.7, 9.9, 11.4,
            11.3, 11.5, 13.1, 10.1, 13.6, 10.2, 12.5, 11.9, 10.4, 11.6, 10.3,
            11.5, 10.2, 11.6, 11.9, 12.5
        ]
    }
    # Text used for the axes, the title, and the output file names
    x_axis_label = 'X distance from lower left datum of map (m)'
    y_axis_label = 'Y distance from lower left datum of map (m)'
    axis_title = 'Broad Street Cholera Outbreak of 1854'
    file_graph = 'broad_street_cholera_outbreak.svg'
    output_url = 'broad_street_cholera.html'
    header_title = 'broad_street_cholera'
    header_id = 'broad-street-cholera'
    axis_subtitle = 'Soho, London, UK'
    legend1 = 'Deaths'
    legend2 = 'Pumps'
    figsize = (8, 6)
    ds.style_graph()
    # The value returned here is handed back to ds.html_end at the end
    original_stdout = ds.html_begin(output_url=output_url,
                                    header_title=header_title,
                                    header_id=header_id)
    deaths = pd.DataFrame(data=data_deaths)
    pumps = pd.DataFrame(data=data_pumps)
    # Series 1 is the deaths, series 2 is the pumps
    fig, ax = ds.plot_scatter_scatter_x1_x2_y1_y2(X1=deaths['x'],
                                                  X2=pumps['x'],
                                                  y1=deaths['y'],
                                                  y2=pumps['y'],
                                                  figsize=figsize,
                                                  markersize1=3,
                                                  labellegendy2=legend2,
                                                  colour1=colour1,
                                                  colour2=colour2,
                                                  markersize2=3,
                                                  labellegendy1=legend1)
    # Title and subtitle are shown on two lines
    ax.set_title(label=axis_title + '\n' + axis_subtitle)
    ax.set_ylabel(ylabel=y_axis_label)
    ax.set_xlabel(xlabel=x_axis_label)
    ax.legend(frameon=False)
    ds.despine(ax=ax)
    # Save the figure, then reference the saved file from the HTML output
    fig.savefig(fname=file_graph, format='svg')
    ds.html_figure(file_name=file_graph)
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
示例#21
0
def main():
    """
    Demonstrate the ds.find_*_columns helpers on a generated dataframe.

    Prints an overview of the dataframe, then for each helper prints its
    help text followed by the columns it finds, and finally prints the
    ds.dataframe_info report and the memory usage. Everything is printed
    between ds.html_begin and ds.html_end inside a <pre> element.

    Relies on output_url, header_title and header_id being defined
    outside this function.
    """
    original_stdout = ds.html_begin(output_url=output_url,
                                    header_title=header_title,
                                    header_id=header_id)
    print('<pre style="white-space: pre-wrap;">')
    df = ds.create_dataframe()
    # Overview of the dataframe: heading, value, blank line
    overview = (
        ('df.shape', df.shape),
        ('df.head()', df.head()),
        ('df.dtypes', df.dtypes),
        ('df.columns', df.columns),
    )
    for heading, value in overview:
        print(heading)
        print(value)
        print()
    # Heading to print and the helper that finds the matching columns
    finders = (
        ('bool columns', ds.find_bool_columns),
        ('category columns', ds.find_category_columns),
        ('datetime columns', ds.find_datetime_columns),
        ('float columns', ds.find_float_columns),
        ('integer columns', ds.find_int_columns),
        ('integer, float columns', ds.find_int_float_columns),
        ('object columns', ds.find_object_columns),
        ('timedelta columns', ds.find_timedelta_columns),
    )
    for heading, finder in finders:
        # help() writes the documentation itself and returns None, so it
        # must not be wrapped in print() or a stray "None" line appears
        help(finder)
        print()
        print(heading)
        print(finder(df=df))
        print()
    df = ds.dataframe_info(df=df, file_in='test')
    print()
    print('df memory usage: ')
    print(ds.byte_size(num=df.memory_usage(index=True).sum()))
    print('</pre>')
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
def main():
    """
    Solve a plant-to-warehouse transportation problem with PuLP.

    Two plants with fixed capacities ship to three warehouses with fixed
    demands. The model minimizes the total lane cost subject to the
    plant capacity and warehouse demand constraints. The inputs, the LP
    file, the solver status, the shipment for each lane and the total
    cost are printed between ds.html_begin and ds.html_end inside a
    <pre> element.

    Relies on output_url, header_title and header_id being defined
    outside this function.
    """
    lp_file = 'plants_warehouses.lp'
    original_stdout = ds.html_begin(output_url=output_url,
                                    header_title=header_title,
                                    header_id=header_id)
    print('<pre style="white-space: pre-wrap;">')
    # Define decision variables: plants, warehouses
    plants = ['Rockford', 'Grand Rapids']
    pretty_print(plants, 'Plants')
    capacities = [500, 600]
    pretty_print(capacities, 'Capacities')
    plant_capacity = dict(zip(plants, capacities))
    pretty_print(plant_capacity, 'Plant capacity')
    warehouses = ['Chicago', 'Detroit', 'Indianapolis']
    pretty_print(warehouses, 'Warehouses')
    demand = [400, 300, 350]
    pretty_print(demand, 'Demand')
    warehouse_demand = dict(zip(warehouses, demand))
    pretty_print(warehouse_demand, 'Warehouse demand')
    # rows = plants, columns = warehouse, entries = costs plant -> warehouse
    lane_costs = [[10, 16, 12], [14, 8, 11]]
    pretty_print(lane_costs, 'Lane costs')
    # Create dictionary of transportation costs by plants, warehouses
    warehouse_lane_costs = [
        dict(zip(warehouses, values)) for values in lane_costs
    ]
    pretty_print(warehouse_lane_costs, 'Warehouse lane costs')
    plant_warehouse_lane_costs = dict(zip(plants, warehouse_lane_costs))
    pretty_print(plant_warehouse_lane_costs, 'Plant warehouse lane costs')
    # Create the linear programming model object
    model = LpProblem(name='plant_warehouse_model', sense=LpMinimize)
    lanes = [
        (plant, warehouse) for plant in plants for warehouse in warehouses
    ]
    pretty_print(lanes, 'Lanes')
    # One non-negative integer variable per (plant, warehouse) lane
    lane_vars = LpVariable.dicts(name='Lane',
                                 indexs=(plants, warehouses),
                                 lowBound=0,
                                 upBound=None,
                                 cat=LpInteger)
    # Add the objective function
    model += lpSum([
        lane_vars[plant][warehouse]
        * plant_warehouse_lane_costs[plant][warehouse]
        for (plant, warehouse) in lanes
    ])
    # Add plant capacity maximum constraints to model for each plant
    for plant in plants:
        shipped_out = lpSum(
            [lane_vars[plant][warehouse] for warehouse in warehouses]
        )
        model += (
            shipped_out <= plant_capacity[plant],
            f'sum_of_products_out_of_plants_{plant}'
        )
    # Add warehouse demand minimum constraints to model for each warehouse
    for warehouse in warehouses:
        shipped_in = lpSum(
            [lane_vars[plant][warehouse] for plant in plants]
        )
        model += (
            shipped_in >= warehouse_demand[warehouse],
            f'sum_of_products_into_warehouses{warehouse}'
        )
    model.writeLP(filename=lp_file)
    # Solve the model using PuLP's choice of solver
    model.solve(solver=None)
    # The context manager closes the file even if reading or printing fails
    with open(lp_file) as lp_text:
        print(f'\n{lp_text.read()}\n')
    print(f'Status = {LpStatus[model.status]}\n')
    # Print resolved optimum value for each lane
    print('Lane shipments')
    for variable in model.variables():
        print(variable.name, '=', variable.varValue)
    print(
        f'\nTotal cost of transportation = {utilities.value(model.objective)}')
    print('</pre>')
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
示例#23
0
def main():
    """
    Demonstrate one-to-many and one-to-one left merges with pandas.

    Three small dataframes are created and printed. df_one is merged
    with df_two on 'id' (one-to-many), and the result is merged with
    df_three on 'id' and 'ticket' (one-to-one). The missing values in
    the left 'amount' column are then filled from the right one. Both
    merged dataframes are saved as csv files in the current working
    directory, and every table is printed between ds.html_begin and
    ds.html_end.
    """
    header_title = 'pandas merge'
    header_id = 'pandas-merge'
    output_url = 'pandas_merge.html'

    def show_table(df):
        # Print a dataframe as a right-aligned, tab-separated table
        # followed by a blank line
        print(
            tabulate(tabular_data=df,
                     headers='keys',
                     tablefmt='tsv',
                     numalign='right',
                     stralign='right',
                     floatfmt='.2f'))
        print()

    original_stdout = ds.html_begin(output_url=output_url,
                                    header_title=header_title,
                                    header_id=header_id)
    # df_one has unique values in 'id'
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0
    df_one = pd.DataFrame({
        'id': [1, 2, 3, 4, 5],
        'age': [134, 28, np.nan, 29, 17],
        'ctg': ['A', 'A', 'B', 'C', None]
    }).astype({'age': 'Int64'})
    df_one.index.name = 'rows'
    show_table(df_one)
    # df_two has multiple values in 'id'
    df_two = pd.DataFrame({
        'id': [3, 4, 5, 6, 7, 4, 4],
        'ticket': [1001, 1002, 1003, 1004, 1005, 1006, 1007],
        'amount': [24.1, np.nan, 34.5, 19.5, 26.2, 27.3, np.nan]
    }).astype({'ticket': 'Int64'})
    df_two.index.name = 'rows'
    show_table(df_two)
    # df_three has multiple values in 'id'
    df_three = pd.DataFrame({
        'id': [4, 4, 4],
        'ticket': [1002, 1006, 1007],
        'amount': [13.69, 11.11, 69.13]
    }).astype({'ticket': 'Int64'})
    df_three.index.name = 'rows'
    show_table(df_three)
    ds.page_break()
    # df_one_two = df_one <- df_two is a one-to-many left merge
    df_one_two = df_one.merge(
        right=df_two,
        how='left',
        left_on=['id'],
        right_on=['id'],
        indicator=True,
        validate='one_to_many').astype(dtype={'age': 'Int64'})
    df_one_two.index.name = 'rows'
    ds.save_file(df=df_one_two, file_name=Path.cwd() / 'df_one_two.csv')
    show_table(df_one_two)
    ds.page_break()
    # df_one_two_three = df_one_two <- df_three is a one-to-one left merge
    print()
    # '_merge' is dropped first so that indicator=True can add it again
    df_one_two_three = df_one_two\
        .drop(columns=['_merge'])\
        .merge(
            right=df_three,
            how='left',
            left_on=['id', 'ticket'],
            right_on=['id', 'ticket'],
            suffixes=('_left', '_right'),
            indicator=True,
            validate='one_to_one'
        )
    df_one_two_three.index.name = 'rows'
    show_table(df_one_two_three)
    df_one_two_three = df_one_two_three.drop(columns=['_merge'])
    # create 'amount' from 'amount_left' and take the value from
    # 'amount_right' wherever 'amount_left' is missing
    df_one_two_three['amount'] = df_one_two_three['amount_left']\
        .where(
            df_one_two_three['amount_left'].notnull(),
            df_one_two_three['amount_right']
        )
    show_table(df_one_two_three)
    df_one_two_three = df_one_two_three\
        .drop(columns=['amount_left', 'amount_right'])
    ds.save_file(df=df_one_two_three,
                 file_name=Path.cwd() / 'df_one_two_three.csv')
    show_table(df_one_two_three)
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
示例#24
0
# Predicted versus measured as a scatter plot, with a straight line drawn
# between the minimum and the maximum of the measured values
scatter_file = f'{graph_name}_scatter.svg'
fig, ax = ds.plot_scatter_x_y(X=y_all, y=predicted, figsize=figsize)
measured_range = [y_all.min(), y_all.max()]
ax.plot(measured_range,
        measured_range,
        marker=None,
        linestyle='-',
        color=colour2)
ax.set_ylabel(ylabel=label_predicted)
ax.set_xlabel(xlabel=label_measured)
ax.set_title(label=title)
ds.despine(ax)
fig.savefig(fname=scatter_file, format='svg')
ds.html_figure(file_name=scatter_file, caption=scatter_file)
# Predicted and measured as two lines on one graph
lines_file = f'{graph_name}_lines.svg'
fig, ax = ds.plot_line_line_y1_y2(
    y1=y_all,
    y2=predicted,
    figsize=figsize,
    labellegendy1=label_measured,
    labellegendy2=label_predicted
)
ax.legend(frameon=False)
ax.set_title(label=title)
ds.despine(ax)
fig.savefig(fname=lines_file, format='svg')
ds.html_figure(file_name=lines_file, caption=lines_file)
# Stop the clock before writing the closing part of the report
stop_time = time.time()
ds.page_break()
ds.report_summary(start_time=start_time, stop_time=stop_time)
print('</pre>')
ds.html_end(original_stdout=original_stdout, output_url=output_url)
示例#25
0
def main():
    """
    Create pandas Series of several dtypes from literal lists and print them.

    Each section prints a heading, the source list, and the Series
    created from it. Most lists contain one missing value so that the
    handling of missing data by each dtype is visible. The commented-out
    ds.random_data calls show how a comparable random series could be
    generated instead. Everything is printed between ds.html_begin and
    ds.html_end.
    """
    header_title = 'Create series'
    header_id = 'create-series'
    output_url = 'create_series.html'
    original_stdout = ds.html_begin(output_url=output_url,
                                    header_title=header_title,
                                    header_id=header_id)
    print('Create Pandas series')
    print()
    print('uniform distribution, dtype: float, list_a, series_a')
    # series_a = ds.random_data(
    #     distribution='uniform',
    #     size=7,
    #     loc=13,
    #     scale=70
    # ).rename('A')
    list_a = [14.758, 78.956, np.nan, 57.361, 39.018, 75.764, 65.869]
    print(list_a)
    series_a = pd.Series(data=list_a, name='A').astype(dtype='float64')
    print(series_a)
    print()
    print('boolean distribution, dtype: boolean (nullable), list_b, series_b')
    # series_b = ds.random_data(
    #     distribution='bool',
    #     size=7
    # ).rename('B')
    list_b = [False, True, np.nan, False, True, True, False]
    print(list_b)
    series_b = pd.Series(data=list_b, name='B').astype(dtype='boolean')
    print(series_b)
    print()
    print('category distribution, dtype: category, list_c, series_c')
    # series_c = ds.random_data(
    #     distribution='category',
    #     size=7
    # ).rename('C')
    # print(series_c.head())
    list_c = ['small', 'medium', '', 'medium', 'large', 'large', 'small']
    print(list_c)
    series_c = pd.Series(data=list_c, name='C').astype(dtype='category')
    print(series_c)
    print()
    print('C dtype:', series_c.dtype)
    print()
    print('C dtype:', type(series_c.dtype).__name__)
    print()
    # Heading names list_cs and series_cs, the objects printed below
    print('category distribution, dtype: category, list_cs, series_cs')
    # series_cs = ds.random_data(
    #     distribution='categories',
    #     size=7
    # ).rename('CS')
    # print(series_cs.head())
    list_cs = ['small', 'medium', '', 'medium', 'large', 'large', 'small']
    categories = ['small', 'medium', 'large']
    print(list_cs)
    print()
    size = 13
    random_state = 42
    # Seed the generator so the same sample is drawn on every run
    random.seed(a=random_state)
    # Values not listed in categories (here '') become missing values
    series_cs = pd.Series(
        data=random.choices(population=list_cs, k=size), name='CS').astype(
            dtype=CategoricalDtype(categories=categories, ordered=True))
    print(series_cs)
    print()
    print('CS dtype:', series_cs.dtype)
    print()
    print('CS dtype:', type(series_cs.dtype).__name__)
    print()
    print('timedelta distribution, dtype: timedelta64[ns], list_d, series_d')
    # series_d = ds.random_data(
    #     distribution='timedelta',
    #     size=7
    # ).rename('D')
    list_d = [0, 0, pd.NaT, 0, 0, 0, 0]
    print(list_d)
    series_d = pd.Series(data=list_d, name='D').astype(dtype='timedelta64[ns]')
    print(series_d)
    print()
    print('uniform distribution, dtype: float64, list_i, series_i')
    # series_i = ds.random_data(
    #     distribution='uniform',
    #     size=7,
    #     loc=13,
    #     scale=70
    # ).rename('I')
    list_i = [
        6.554271, 23.958127, np.nan, 58.231292, 67.349036, 75.083105, 30.503073
    ]
    print(list_i)
    series_i = pd.Series(data=list_i, name='I').astype(dtype='float64')
    print(series_i)
    print()
    print('strings distribution, dtype:str, list_r, series_r')
    # series_r = ds.random_data(
    #     distribution='strings',
    #     strings=['0', '1'],
    #     size=7
    # ).rename('R')
    # NOTE(review): list_r has 6 values while the other lists have 7;
    # confirm whether a value is missing
    list_r = ['1', '1', '', '0', '0', '1']
    print(list_r)
    series_r = pd.Series(data=list_r, name='R').astype(dtype='str')
    print('series_r:')
    print(series_r)
    print()
    print('strings distribution, dtype:str, list_s, series_s')
    # series_s = ds.random_data(
    #     distribution='strings',
    #     size=7
    # ).rename('S')
    list_s = ['male', 'female', '', 'male', 'female', 'female', 'male']
    print(list_s)
    series_s = pd.Series(data=list_s, name='S').astype(dtype='str')
    print('series_s:')
    print(series_s)
    print()
    print('datetime distribution, dtype: datetime64[ns], list_t, series_t')
    # series_t = ds.random_data(
    #     distribution='datetime',
    #     size=7
    # ).rename('T')
    list_t = [
        '2020-12-12 16:33:48', '2020-12-13 16:33:48', pd.NaT,
        '2020-12-15 16:33:48', '2020-12-16 16:33:48', '2020-12-17 16:33:48',
        '2020-12-18 16:33:48'
    ]
    print(list_t)
    series_t = pd.Series(data=list_t, name='T').astype(dtype='datetime64[ns]')
    print(series_t)
    print()
    print('normal distribution, dtype: float64, list_x, series_x')
    # series_x = ds.random_data(
    #     distribution='norm',
    #     size=7,
    #     loc=69,
    #     scale=13
    # ).rename('X')
    list_x = [42.195, 82.630, np.nan, 86.738, 85.656, 79.281, 50.015]
    print(list_x)
    series_x = pd.Series(data=list_x, name='X').astype(dtype='float64')
    print(series_x)
    print()
    print('integer distribution, dtype: Int64 (nullable), list_y, series_y')
    # series_y = ds.random_data(
    #     distribution='randint',
    #     size=7,
    #     low=0,
    #     high=2
    # ).rename('Y')
    list_y = [1, 0, 1, np.nan, 1, 0, 0]
    print(list_y)
    series_y = pd.Series(data=list_y, name='Y').astype(dtype='Int64')
    print(series_y)
    print()
    ds.html_end(original_stdout=original_stdout, output_url=output_url)
示例#26
0
def main():
    """
    Exercise the datasense DataFrame helpers and report the results.

    The function calls, in order: dataframe_info, the find_*_columns
    family, number_empty_cells_in_columns, process_columns, process_rows,
    save_file (csv and xlsx, with and without an index), byte_size, and
    read_file (csv and ods). Each test is introduced by a printed banner.
    All printing happens between ds.html_begin() and ds.html_end(); the
    value returned by html_begin is handed back to html_end as
    original_stdout.

    Relies on the module-level names ds, output_url, header_title,
    header_id and date_parser.
    """
    def section(title, example='test example'):
        """Print the separator and the two heading lines of one test."""
        print('--------------------------')
        print(title)
        print(example)

    start_time = time.time()
    pd.options.display.width = 120
    pd.options.display.max_columns = 100
    pd.options.display.max_rows = 100
    original_stdout = ds.html_begin(output_url=output_url,
                                    header_title=header_title,
                                    header_id=header_id)

    # dataframe_info
    # NOTE(review): 'myfile.csv' is read here before this script saves it
    # (that happens in the read_file examples below), so this step appears
    # to depend on a file left by an earlier run -- confirm.
    section('test dataframe_info', 'test example 1')
    my_file = 'myfile.csv'
    df = ds.read_file(my_file)
    df = ds.dataframe_info(df=df, file_in=my_file)
    section('test dataframe_info', 'test example 2')
    df = ds.create_dataframe()
    df = ds.dataframe_info(df=df, file_in='df')

    # find_*_columns: every finder is run on a fresh dataframe and its
    # result is printed.
    column_finders = (
        ('test find_bool_columns', ds.find_bool_columns),
        ('test find_category_columns', ds.find_category_columns),
        ('test find_datetime_columns', ds.find_datetime_columns),
        ('test find_float_columns', ds.find_float_columns),
        ('test find_int_columns', ds.find_int_columns),
        ('test find_int_float_columns', ds.find_int_float_columns),
        ('test find_object_columns', ds.find_object_columns),
        ('test find_timedelta_columns', ds.find_timedelta_columns),
    )
    for title, finder in column_finders:
        section(title)
        df = ds.create_dataframe()
        print(finder(df=df))

    # number_empty_cells_in_columns
    section('test number_empty_cells_in_columns')
    df = pd.DataFrame({
        'X': [25.0, 24.0, 35.5, np.nan, 23.1],
        'Y': [27, 24, np.nan, 23, np.nan],
        'Z': ['a', 'b', np.nan, 'd', 'e']
    })
    empty_cells = ds.number_empty_cells_in_columns(df=df)
    print(empty_cells)

    # process_columns
    section('test process_columns')
    df = ds.create_dataframe()
    (df, columns_in_count, columns_non_empty_count, columns_empty_count,
     columns_empty_list, columns_non_empty_list, columns_bool_list,
     columns_bool_count, columns_float_list, columns_float_count,
     columns_integer_list, columns_integer_count,
     columns_datetime_list, columns_datetime_count,
     columns_object_list, columns_object_count, columns_category_list,
     columns_category_count, columns_timedelta_list,
     columns_timedelta_count) = ds.process_columns(df=df)
    column_report = (
        ('columns_in_count       :', columns_in_count),
        ('columns_non_empty_count:', columns_non_empty_count),
        ('columns_empty_count    :', columns_empty_count),
        ('columns_empty_list     :', columns_empty_list),
        ('columns_non_empty_list :', columns_non_empty_list),
        ('columns_bool_list      :', columns_bool_list),
        ('columns_bool_count     :', columns_bool_count),
        ('columns_float_list     :', columns_float_list),
        ('columns_float_count    :', columns_float_count),
        ('columns_integer_list   :', columns_integer_list),
        ('columns_integer_count  :', columns_integer_count),
        ('columns_datetime_list  :', columns_datetime_list),
        ('columns_datetime_count :', columns_datetime_count),
        ('columns_object_list    :', columns_object_list),
        ('columns_object_count   :', columns_object_count),
        ('columns_category_list  :', columns_category_list),
        ('columns_category_count :', columns_category_count),
        ('columns_timedelta_list :', columns_timedelta_list),
        ('columns_timedelta_count:', columns_timedelta_count),
    )
    for label, value in column_report:
        print(label, value)

    # process_rows
    section('test process_rows')
    df = ds.create_dataframe()
    df, rows_in_count, rows_out_count, rows_empty_count = ds.process_rows(df)
    print('rows_in_count   :', rows_in_count)
    print('rows_out_count  :', rows_out_count)
    print('rows_empty_count:', rows_empty_count)

    # save_file: csv and xlsx, each without and with an index column
    section('test save_file example 1')
    df = ds.create_dataframe()
    ds.save_file(df=df, file_name='x_y.csv')
    section('test save_file example 2')
    df = ds.create_dataframe()
    ds.save_file(df=df, file_name='x_y.csv', index=True, index_label='myindex')
    section('test save_file example 3')
    df = ds.create_dataframe()
    ds.save_file(df=df, file_name='x_y.xlsx')
    section('test save_file example 4')
    df = ds.create_dataframe()
    ds.save_file(df=df,
                 file_name='x_y.xlsx',
                 index=True,
                 index_label='myindex')

    # byte_size
    section('test byte_size')
    df = ds.create_dataframe()
    print(ds.byte_size(num=df.memory_usage(index=True).sum()))

    # read_file example 1: save then read back with default arguments
    section('test read_file', 'test example 1')
    file_name = 'myfile.csv'
    df = ds.create_dataframe()
    ds.save_file(df=df, file_name=file_name)
    df = ds.read_file(file_name=file_name)
    ds.dataframe_info(df=df, file_in=file_name)

    # read_file example 2: parse columns 't' and 'u' as dates
    section('test read_file', 'test example 2')
    file_name = 'myfile.csv'
    df = ds.create_dataframe()
    ds.save_file(df=df, file_name=file_name)
    parse_dates = ['t', 'u']
    df = ds.read_file(file_name=file_name, parse_dates=parse_dates)
    ds.dataframe_info(df=df, file_in=file_name)

    # read_file example 3: rename columns, set an index and request dtypes
    section('test read_file', 'test example 3')
    file_name = 'myfile.csv'
    df = ds.create_dataframe()
    ds.save_file(df=df, file_name=file_name)
    column_names_dict = {
        'a': 'A',
        'b': 'B',
        'c': 'C',
        'd': 'D',
        'i': 'I',
        'r': 'R',
        's': 'S',
        't': 'T',
        'u': 'U',
        'y': 'Y',
        'x': 'X',
        'z': 'Z'
    }
    index_columns = ['Y']
    parse_dates = ['t', 'u']
    time_delta_columns = ['D']
    category_columns = ['C']
    integer_columns = ['A', 'I']
    # NOTE(review): earlier revisions also built float_columns=['X'],
    # boolean_columns=['R'] and object_columns=['Z'] here but never passed
    # them on; confirm whether ds.read_file should receive them.
    df = ds.read_file(file_name=file_name,
                      column_names_dict=column_names_dict,
                      index_columns=index_columns,
                      date_parser=date_parser(),
                      parse_dates=parse_dates,
                      time_delta_columns=time_delta_columns,
                      category_columns=category_columns,
                      integer_columns=integer_columns)
    ds.dataframe_info(df=df, file_in=file_name)

    # read_file example 4: same round trip through an .ods file
    section('test read_file', 'test example 4')
    file_name = 'myfile.ods'
    df = ds.create_dataframe()
    ds.save_file(df=df, file_name=file_name)
    parse_dates = ['t', 'u']
    df = ds.read_file(file_name=file_name, parse_dates=parse_dates)
    ds.dataframe_info(df=df, file_in=file_name)

    stop_time = time.time()
    ds.page_break()
    # NOTE(review): file_name is 'myfile.ods' at this point, so only the
    # last file is reported, and it is passed as a bare string; confirm
    # whether report_summary expects a list of names.
    ds.report_summary(start_time=start_time,
                      stop_time=stop_time,
                      read_file_names=file_name,
                      save_file_names=file_name)
    ds.html_end(original_stdout=original_stdout, output_url=output_url)