Пример #1
0
def tune_weight_regularization(train_X, train_y, validation_X, validation_y):
    """Compare four L1/L2 weight regularizers by repeated fitting.

    Every candidate is fitted ``n_repeats`` times through
    ``fit_weight_regularization_model``; the returned losses are gathered in
    one column per candidate, summarized with ``describe()`` and drawn as a
    box-and-whisker plot.
    """
    # search space, keyed by the integer id that becomes the column label
    candidates = {
        1: L1L2(l1=0.0, l2=0.01),
        2: L1L2(l1=0.01, l2=0.0),
        3: L1L2(l1=0.0, l2=0.0),
        4: L1L2(l1=0.01, l2=0.01)
    }
    repeat_count = 5
    scores = DataFrame()
    for key, regularizer in candidates.items():
        # collect one loss per repetition for this candidate
        losses = []
        for attempt in range(1, repeat_count + 1):
            loss = fit_weight_regularization_model(
                regularizer, train_X, train_y, validation_X, validation_y)
            losses.append(loss)
            print('>%d/%d param=%f, loss=%f' %
                  (attempt, repeat_count, key, loss))
        scores[str(key)] = losses
    # numeric summary followed by the plot
    print(scores.describe())
    scores.boxplot()
    pyplot.show()
Пример #2
0
def print_error_info(errors):
    """Print summary statistics for ``errors``, show a box plot, then echo them."""
    frame = DataFrame()
    frame["error"] = errors
    summary = frame.describe()
    print(summary)
    frame.boxplot()
    plt.show()
    # finally dump the raw values as they were passed in
    print(errors)
Пример #3
0
def run():
    """Run ``experiment`` once per regularizer and save a box plot of the results.

    Reads ``shampoo-sales.csv``, evaluates four L1/L2 regularizer settings
    with a fixed configuration, prints ``describe()`` of the collected results
    and writes the plot to ``experiment_reg_bias.png``.
    """
    # load dataset
    series = read_csv(
        'shampoo-sales.csv',
        header=0,
        parse_dates=[0],
        index_col=0,
        squeeze=True,
        date_parser=parser,
    )
    # experiment configuration
    n_lag, n_repeats, n_epochs, n_batch, n_neurons = 1, 30, 1000, 4, 3
    regularizers = [
        L1L2(l1=0.0, l2=0.0),
        L1L2(l1=0.01, l2=0.0),
        L1L2(l1=0.0, l2=0.01),
        L1L2(l1=0.01, l2=0.01)
    ]
    # one result column per regularizer, labelled by its l1/l2 values
    results = DataFrame()
    for regularizer in regularizers:
        label = 'l1 %.2f,l2 %.2f' % (regularizer.l1, regularizer.l2)
        results[label] = experiment(series, n_lag, n_repeats, n_epochs,
                                    n_batch, n_neurons, regularizer)
    # report and persist
    print(results.describe())
    results.boxplot()
    pyplot.savefig('experiment_reg_bias.png')
    def test_boxplot_legacy2(self):
        """Check which axes/figure ``boxplot`` uses when ``ax`` and ``by`` are given."""
        frame = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2'])
        frame['X'] = Series(['A'] * 5 + ['B'] * 5)
        frame['Y'] = Series(['A'] * 10)
        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(frame.boxplot, by='X')

        # A single required axes: the supplied ax must be the one returned.
        fig, ax = self.plt.subplots()
        returned = frame.boxplot('Col1', by='X', ax=ax)
        assert ax.axes is returned

        fig, ax = self.plt.subplots()
        per_group = frame.groupby('Y').boxplot(ax=ax, return_type='axes')
        assert ax.axes is per_group['A']

        # Several columns plus an ax argument: the plots stay on that figure.
        fig, ax = self.plt.subplots()
        with tm.assert_produces_warning(UserWarning):
            per_column = frame.boxplot(column=['Col1', 'Col2'],
                                       by='X',
                                       ax=ax,
                                       return_type='axes')
        assert per_column['Col1'].get_figure() is fig

        # Without `by`, every line drawn on the axes must appear in the dict.
        fig, ax = self.plt.subplots()
        artists = frame.boxplot(ax=ax, return_type='dict')
        flattened = [line for group in artists.values() for line in group]
        assert len(ax.get_lines()) == len(flattened)
Пример #5
0
def print_stats(events, figure_path=None):
    """Draw box plots of event durations (in hours) by cause and by time.

    Args:
        events: object providing ``get_durations_by_cause()`` and
            ``get_durations_by_time()``. Each result is iterated with
            ``.items()`` as (label, iterable of durations); every duration
            must expose a ``seconds`` attribute.
        figure_path: optional output path; when truthy the figure is saved
            there.

    Returns:
        ``figure_path`` exactly as it was passed in.
    """

    def _hours_frame(label, durations_by_key):
        # One row per individual duration, converted from seconds to hours.
        # `.items()` replaces the Python-2-only / removed-in-pandas-2
        # `.iteritems()` and works for both dicts and pandas Series.
        # NOTE(review): if the durations are datetime.timedelta objects,
        # `.seconds` ignores whole days; `total_seconds()` may be intended —
        # confirm against the producer of these values.
        return DataFrame([{
            label: key,
            "duration": span.seconds / 3600.
        } for key, spans in durations_by_key.items() for span in spans])

    # explore durations by causes
    df_causes = _hours_frame("cause", events.get_durations_by_cause())

    # explore durations by time
    df_times = _hours_frame("time", events.get_durations_by_time())

    # two stacked panels: causes on top (rotated labels), times below
    fig = plt.figure(figsize=(8, 12))
    ax = plt.subplot(211)
    df_causes.boxplot(column='duration', by='cause', rot=75, ax=ax)
    ax = plt.subplot(212)
    df_times.boxplot(column='duration', by='time', ax=ax)
    fig.tight_layout()
    if figure_path:
        fig.savefig(figure_path)
    return figure_path
Пример #6
0
    def test_boxplot_legacy2(self):
        """Check which axes/figure ``boxplot`` uses when ``ax`` and ``by`` are given."""
        frame = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2'])
        frame['X'] = Series(['A'] * 5 + ['B'] * 5)
        frame['Y'] = Series(['A'] * 10)
        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(frame.boxplot, by='X')

        # A single required axes: the supplied ax must be the one returned.
        fig, ax = self.plt.subplots()
        returned = frame.boxplot('Col1', by='X', ax=ax)
        assert ax.axes is returned

        fig, ax = self.plt.subplots()
        per_group = frame.groupby('Y').boxplot(ax=ax, return_type='axes')
        assert ax.axes is per_group['A']

        # Several columns plus an ax argument: the plots stay on that figure.
        fig, ax = self.plt.subplots()
        with tm.assert_produces_warning(UserWarning):
            per_column = frame.boxplot(column=['Col1', 'Col2'],
                                       by='X',
                                       ax=ax,
                                       return_type='axes')
        assert per_column['Col1'].get_figure() is fig

        # Without `by`, every line drawn on the axes must appear in the dict.
        fig, ax = self.plt.subplots()
        artists = frame.boxplot(ax=ax, return_type='dict')
        flattened = [line for group in artists.values() for line in group]
        assert len(ax.get_lines()) == len(flattened)
    def test_boxplot_legacy2(self):
        """Check which axes/figure ``boxplot`` uses when ``ax`` and ``by`` are given."""
        frame = DataFrame(np.random.rand(10, 2), columns=["Col1", "Col2"])
        frame["X"] = Series(["A"] * 5 + ["B"] * 5)
        frame["Y"] = Series(["A"] * 10)
        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(frame.boxplot, by="X")

        # A single required axes: the supplied ax must be the one returned.
        fig, ax = self.plt.subplots()
        returned = frame.boxplot("Col1", by="X", ax=ax)
        assert ax.axes is returned

        fig, ax = self.plt.subplots()
        per_group = frame.groupby("Y").boxplot(ax=ax, return_type="axes")
        assert ax.axes is per_group["A"]

        # Several columns plus an ax argument: the plots stay on that figure.
        fig, ax = self.plt.subplots()
        with tm.assert_produces_warning(UserWarning):
            per_column = frame.boxplot(
                column=["Col1", "Col2"], by="X", ax=ax, return_type="axes")
        assert per_column["Col1"].get_figure() is fig

        # Without `by`, every line drawn on the axes must appear in the dict.
        fig, ax = self.plt.subplots()
        artists = frame.boxplot(ax=ax, return_type="dict")
        flattened = [line for group in artists.values() for line in group]
        assert len(ax.get_lines()) == len(flattened)
def init():
    """Save a per-day box plot of April 2019 ``speed_diff`` values.

    Reads the ``speed_diff`` column of the module-level ``series``, selects
    ``"2019-04"``, groups it by calendar day and draws one box per day
    (columns labelled 1..30). The figure is written as a 300-dpi PNG into
    ``OUTPUT_FOLDER``, which is created first if it does not exist.
    """
    output_name = "boxplot_speed_diff_2019_04_dataset"

    one_series = series["speed_diff"]

    one_month = one_series["2019-04"]
    groups = one_month.groupby(Grouper(freq="D"))
    # one column per daily group, laid side by side
    days = concat([DataFrame(x[1].values) for x in groups], axis=1)
    days = DataFrame(days)
    days.columns = range(1, 31)
    pyplot.gcf().set_size_inches(12, 7)
    days.boxplot()

    # Create the target directory up front instead of saving, catching
    # FileNotFoundError and saving a second time. An empty folder name means
    # "current directory", which os.makedirs would reject.
    if OUTPUT_FOLDER:
        os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    pyplot.savefig(os.path.join(OUTPUT_FOLDER, f"{output_name}.png"),
                   format="png",
                   dpi=300)
Пример #9
0
def main():
    """Run ``experiment`` for 1, 2 and 3 training epochs and save a box plot.

    The parameter dict is built from ``load_data()``; results are printed via
    ``describe()`` and the plot is written to ``boxplot_epochs.png``.
    """
    # experiment
    # 3D (sample , 26 , 3)
    data = load_data()

    param = {
        'features': data.n_features,
        'timesteps': 26,
        'batch_size': 26,
        'n_neurons': 1,
        'n_inputs': len(data['raw']),  # original author expected 36 here
        'data': data
    }

    repeats = 1
    results = DataFrame()

    # vary training epochs, one result column per epoch count
    for epoch_count in [1, 2, 3]:
        results[str(epoch_count)] = experiment(repeats, epoch_count, param)

    # summarize results
    print(results.describe())
    # save boxplot
    results.boxplot()
    pyplot.savefig('boxplot_epochs.png')
def mcts_variance_across_multiple_game_states(cheater: bool,
                                              options: MctsPlayerOptions,
                                              num_samples: int,
                                              num_game_states: int):
    """Collect per-action standard deviations over several seeded game states.

    For every seed in ``range(num_game_states)`` the frame returned by
    ``_get_dataframe`` is described, its columns are ordered by descending
    mean, and their standard deviations are stored as one row (padded with
    NaN up to seven entries). The rows are written to
    ``mcts_variance_across_game_states.csv`` and drawn as a box plot saved to
    ``mcts_variance_across_game_states.png``.
    """
    rows = []
    for seed in range(num_game_states):
        print(f"Evaluating on GameState.new(random_seed={seed})")
        samples = _get_dataframe(GameState.new(random_seed=seed), cheater,
                                 options, num_samples)
        summary = samples.describe().T
        ranked = summary.sort_values(by="mean", ascending=False)
        std_devs = list(ranked["std"].values)
        # pad short rows so every row has at least seven entries
        std_devs.extend([np.nan] * (7 - len(std_devs)))
        rows.append(tuple(std_devs))
    column_names = ["BestAction"] + [f"Action #{i}" for i in range(2, 8)]
    dataframe = DataFrame(rows, columns=column_names)
    csv_path = "mcts_variance_across_game_states.csv"
    # noinspection PyTypeChecker
    dataframe.to_csv(csv_path, index=False)
    dataframe.boxplot()
    plt.xticks(rotation=45, ha='right')
    plt.gcf().set_size_inches((5, 5))
    plt.tight_layout()
    plt.savefig("mcts_variance_across_game_states.png")
Пример #11
0
def show_price_outliers(df: pd.DataFrame):
    """Show a box plot of the natural log of ``price``; the input frame is not modified."""
    # work on a copy so the caller's frame keeps its raw prices
    logged = df.copy()
    log_price = np.log(logged['price'])
    logged['price'] = log_price
    logged.boxplot(column='price')
    plt.title('Price')
    plt.ylabel('price')
    plt.show()
Пример #12
0
    def test_boxplot_legacy(self):
        """Exercise ``DataFrame.boxplot`` with many argument combinations.

        The first half runs the plot through ``_check_plot_works`` for
        different ``column``/``by``/``notch``/``return_type`` settings; the
        second half checks which axes and figure are used when an ``ax`` is
        passed in.
        """
        # 6x4 random frame plus two string indicator columns used for `by`
        df = DataFrame(randn(6, 4),
                       index=list(string.ascii_letters[:6]),
                       columns=['one', 'two', 'three', 'four'])
        df['indic'] = ['foo', 'bar'] * 3
        df['indic2'] = ['foo', 'bar', 'foo'] * 2

        _check_plot_works(df.boxplot, return_type='dict')
        _check_plot_works(df.boxplot,
                          column=['one', 'two'],
                          return_type='dict')
        # _check_plot_works adds an ax so catch warning. see GH #13188
        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(df.boxplot, column=['one', 'two'], by='indic')
        _check_plot_works(df.boxplot, column='one', by=['indic', 'indic2'])
        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(df.boxplot, by='indic')
        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(df.boxplot, by=['indic', 'indic2'])
        # module-level boxplot function applied to a single column
        _check_plot_works(plotting._core.boxplot,
                          data=df['one'],
                          return_type='dict')
        _check_plot_works(df.boxplot, notch=1, return_type='dict')
        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(df.boxplot, by='indic', notch=1)

        # second frame: two numeric columns and two grouping columns
        df = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2'])
        df['X'] = Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'])
        df['Y'] = Series(['A'] * 10)
        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(df.boxplot, by='X')

        # When ax is supplied and required number of axes is 1,
        # passed ax should be used:
        fig, ax = self.plt.subplots()
        axes = df.boxplot('Col1', by='X', ax=ax)
        # pick the accessor according to the matplotlib version flag
        ax_axes = ax.axes if self.mpl_ge_1_5_0 else ax.get_axes()
        assert ax_axes is axes

        fig, ax = self.plt.subplots()
        axes = df.groupby('Y').boxplot(ax=ax, return_type='axes')
        ax_axes = ax.axes if self.mpl_ge_1_5_0 else ax.get_axes()
        assert ax_axes is axes['A']

        # Multiple columns with an ax argument should use same figure
        fig, ax = self.plt.subplots()
        with tm.assert_produces_warning(UserWarning):
            axes = df.boxplot(column=['Col1', 'Col2'],
                              by='X',
                              ax=ax,
                              return_type='axes')
        assert axes['Col1'].get_figure() is fig

        # When by is None, check that all relevant lines are present in the
        # dict
        fig, ax = self.plt.subplots()
        d = df.boxplot(ax=ax, return_type='dict')
        # flatten the dict's value lists into a single list of lines
        lines = list(itertools.chain.from_iterable(d.values()))
        self.assertEqual(len(ax.get_lines()), len(lines))
Пример #13
0
def boxplot(dataFrame:pandas.DataFrame,columnName:str,byColumnName:str=None):
    """
    Show a box plot of ``columnName``, optionally grouped by ``byColumnName``.

    Does nothing when ``dataFrame`` is None. Always returns None.
    """
    if dataFrame is not None:
        dataFrame.boxplot(column=columnName, by=byColumnName)
        matplotlib.pyplot.show()
    return None
def getBoxWhiskerPlot():
    """Show a box-and-whisker plot with one box per year for 1964-1970.

    The data come from ``validate.getdatafile()``, are converted to float32,
    sliced to ``'1964':'1970'`` and grouped with ``TimeGrouper('A')``; each
    group's values become one column named after the group's year.
    """
    X = validate.getdatafile()
    # astype returns a new object; the original discarded it, so the
    # conversion never took effect.
    X = X.astype('float32')
    # NOTE(review): TimeGrouper is no longer available in current pandas
    # releases (Grouper(freq=...) replaces it) — confirm the targeted version.
    groepen = X['1964':'1970'].groupby(TimeGrouper('A'))
    jaren = DataFrame()
    for name, groep in groepen:
        jaren[name.year] = groep.values
    jaren.boxplot()
    pyplot.show()
Пример #15
0
def showBoxPlot(df):
    """Show a box plot with one box per group of ``df`` under ``Grouper(freq='A')``."""
    yearly = DataFrame()
    # each group's values become a column labelled with the group key's year
    for stamp, chunk in df.groupby(Grouper(freq='A')):
        yearly[stamp.year] = chunk.values

    yearly.boxplot()
    pyplot.show()
Пример #16
0
    def test_boxplot_legacy(self):
        """Exercise ``DataFrame.boxplot`` with many argument combinations.

        The first half runs the plot through ``_check_plot_works`` for
        different ``column``/``by``/``notch``/``return_type`` settings; the
        second half checks which axes and figure are used when an ``ax`` is
        passed in.
        """
        # 6x4 random frame plus two string indicator columns used for `by`
        df = DataFrame(randn(6, 4),
                       index=list(string.ascii_letters[:6]),
                       columns=['one', 'two', 'three', 'four'])
        df['indic'] = ['foo', 'bar'] * 3
        df['indic2'] = ['foo', 'bar', 'foo'] * 2

        _check_plot_works(df.boxplot, return_type='dict')
        _check_plot_works(df.boxplot, column=[
                          'one', 'two'], return_type='dict')
        # _check_plot_works adds an ax so catch warning. see GH #13188
        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(df.boxplot, column=['one', 'two'],
                              by='indic')
        _check_plot_works(df.boxplot, column='one', by=['indic', 'indic2'])
        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(df.boxplot, by='indic')
        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(df.boxplot, by=['indic', 'indic2'])
        # module-level boxplot function applied to a single column
        _check_plot_works(plotting._core.boxplot, data=df['one'],
                          return_type='dict')
        _check_plot_works(df.boxplot, notch=1, return_type='dict')
        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(df.boxplot, by='indic', notch=1)

        # second frame: two numeric columns and two grouping columns
        df = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2'])
        df['X'] = Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'])
        df['Y'] = Series(['A'] * 10)
        with tm.assert_produces_warning(UserWarning):
            _check_plot_works(df.boxplot, by='X')

        # When ax is supplied and required number of axes is 1,
        # passed ax should be used:
        fig, ax = self.plt.subplots()
        axes = df.boxplot('Col1', by='X', ax=ax)
        # pick the accessor according to the matplotlib version flag
        ax_axes = ax.axes if self.mpl_ge_1_5_0 else ax.get_axes()
        assert ax_axes is axes

        fig, ax = self.plt.subplots()
        axes = df.groupby('Y').boxplot(ax=ax, return_type='axes')
        ax_axes = ax.axes if self.mpl_ge_1_5_0 else ax.get_axes()
        assert ax_axes is axes['A']

        # Multiple columns with an ax argument should use same figure
        fig, ax = self.plt.subplots()
        with tm.assert_produces_warning(UserWarning):
            axes = df.boxplot(column=['Col1', 'Col2'],
                              by='X', ax=ax, return_type='axes')
        assert axes['Col1'].get_figure() is fig

        # When by is None, check that all relevant lines are present in the
        # dict
        fig, ax = self.plt.subplots()
        d = df.boxplot(ax=ax, return_type='dict')
        # flatten the dict's value lists into a single list of lines
        lines = list(itertools.chain.from_iterable(d.values()))
        assert len(ax.get_lines()) == len(lines)
def evaluate_hidden_layer(neurons, train_docs, test_docs, ytrain, ytest):
    """Evaluate the model once per entry of ``neurons`` and plot the scores.

    For each entry the documents are encoded with ``train_model`` in
    ``'binary'`` mode and passed to ``evaluate_model``; the outcome is stored
    in a column named after the entry.
    """
    scores = DataFrame()
    for width in neurons:
        encoded = train_model(train_docs, test_docs, 'binary')
        Xtrain, Xtest = encoded
        scores[width] = evaluate_model(Xtrain, Xtest, ytrain, ytest, width)
    # table describing the results
    print(scores.describe())
    # box plot of the results
    scores.boxplot()
    pyplot.show()
Пример #18
0
def individualBoxPlots(df: pd.DataFrame, library: str, dataset: str):
    """Save a notched box plot of ``df`` as PNG and PDF under ``./graphics/``.

    Args:
        df: frame whose columns are plotted, one box per column.
        library: name used in the plot title and the output file name.
        dataset: name used in the plot title and the output file name.

    The files are named ``<library>_<dataset>.png`` and ``.pdf``.
    """
    plt.figure(figsize=(10, 8))
    df.boxplot(notch=True)
    plt.xlabel("Function")
    plt.ylabel("Runtime [s]")
    plt.title(library + " on " + dataset)
    base_path = "./graphics/" + library + "_" + dataset
    # The original passed the misspelled keyword `bbox_inces`, so the tight
    # bounding box was never requested; the correct name is `bbox_inches`.
    for extension in (".png", ".pdf"):
        plt.savefig(base_path + extension, bbox_inches="tight")
    plt.close()
def evaluate_modes(modes, train_docs, test_docs, ytrain, ytest):
    """Evaluate the model once per encoding mode and plot the scores.

    For each entry of ``modes`` the documents are encoded with
    ``train_model`` and passed to ``evaluate_model``; the outcome is stored
    in a column named after the mode.
    """
    scores = DataFrame()
    for mode in modes:
        encoded = train_model(train_docs, test_docs, mode)
        Xtrain, Xtest = encoded
        scores[mode] = evaluate_model(Xtrain, Xtest, ytrain, ytest)

    # table describing the results
    print(scores.describe())
    # box plot of the results
    scores.boxplot()
    pyplot.show()
Пример #20
0
def Main():
    """Run ``experiment`` on 47 random values for several epoch counts.

    One result column is produced per epoch count; the summary statistics are
    printed and the box plot is saved to ``boxplot_epochs.png``.
    """
    # 47 uniform random samples in [0, 1], wrapped by the file's `df` helper
    data = [random.uniform(0, 1) for _ in range(0, 47)]
    data = df(data)
    repeats = 30
    results = DataFrame()
    lag = 1
    neurons = 1
    epochs = [50, 100, 500, 1000, 2000]
    for e in epochs:
        results[str(e)] = experiment(repeats, data, e, lag, neurons)
    # The original printed the bound method (`results.describe`) instead of
    # calling it, so no statistics were shown.
    print(results.describe())
    results.boxplot()
    pyplot.savefig('boxplot_epochs.png')
def test_multiple_resampling_rates_and_epochs(experiment_func=experiment):
    """Time ``experiment_func`` over a grid of resampling rates and epoch counts.

    Returns:
        A pair of dicts keyed by the resampling rate (as a string): the first
        maps to the per-epoch results frame, the second to a dict of
        wall-clock runtimes keyed by epoch count (as a string).
    """
    # Experiment conditions; highest rate first so the fastest runs give
    # early feedback while developing.
    resampling_rates = [5, 10, 20, 30, 40][::-1]
    epochs = [125, 250, 500, 1000, 2000, 4000]
    batch_size = 4
    neurons = 1
    repeats = 3

    experiment_results = {}
    overall_time_results = {}

    # load dataset
    series = read_csv(
        'david240520160001-singleLegVertForceSeries.csv',
        header=0,
        parse_dates=[0],
        index_col=0,
        squeeze=True,
    )
    for rate in resampling_rates:
        print('Training and testing at resampling rate: {}'.format(rate))
        # keep every `rate`-th sample to speed up training
        series_resampled = series.iloc[::rate]
        # line plot
        series_resampled.plot()
        pyplot.show()
        # experiment
        epoch_results = DataFrame()
        time_results = {}
        for epoch_count in epochs:
            start_message = (
                'Running function "{}" with parameters '
                '(epochs: {}; batch_size: {}; neurons: {})')
            print(start_message.format(experiment_func.__name__, epoch_count,
                                       batch_size, neurons))
            started = time()
            epoch_results[str(epoch_count)] = experiment_func(
                repeats, series_resampled, epoch_count, batch_size, neurons)
            finished = time()
            runtime = finished - started
            time_results[str(epoch_count)] = runtime
            print('Completed function "{}" in {} seconds'.format(
                experiment_func.__name__, runtime))
        # summarize results
        print(epoch_results.describe())
        experiment_results[str(rate)] = epoch_results
        overall_time_results[str(rate)] = time_results
        # save boxplot
        epoch_results.boxplot()
        pyplot.savefig('boxplot_epochs_resample{}.png'.format(rate))
        pyplot.show()
    return experiment_results, overall_time_results
Пример #22
0
def time_grouper_plot(series,plot_file):
    """Save a per-year box plot of ``series`` for 2007-2017 to ``plot_file``."""
    plt.clf()
    plt.figure(figsize=(10,7))
    yearly_groups = series['2007':'2017'].groupby(TimeGrouper('A'))
    print(type(yearly_groups))
    # one column per group, labelled with the group key's year
    per_year = DataFrame()
    for stamp, chunk in yearly_groups:
        per_year[stamp.year] = chunk.values
    per_year.boxplot()
    plt.xticks(rotation=45)
    plt.xlabel('Year')
    plt.ylabel('Temperature [°C]')
    plt.title('Temperature changes box and whisker plots')
    plt.savefig(plot_file)
def mcts_ci_widths_across_multiple_game_states(use_player: bool,
                                               options: MctsPlayerOptions,
                                               num_samples: int,
                                               num_game_states: int):
    """Collect confidence-interval widths per action over seeded game states.

    For every seed, ``num_samples`` runs are made (through the step-by-step
    player when ``use_player`` is true, otherwise through
    ``run_mcts_and_collect_data``). From each run the rows of the last
    iteration are ordered by descending score, their CI widths are stored as
    one row (padded with NaN up to seven entries), and the run is counted
    when ``_is_ci_overlap`` reports an overlap for the two top rows. The rows
    are written to a CSV file and drawn as a box plot saved as PNG; both file
    names get a ``_player`` suffix when ``use_player`` is true.
    """
    rows = []
    overlap_count = 0
    for seed in range(num_game_states):
        print(f"Evaluating on GameState.new(random_seed={seed})")
        for _ in range(num_samples):
            if use_player:
                trace = run_mcts_player_step_by_step(
                    GameState.new(random_seed=seed).next_player_view(),
                    options, options.max_iterations)
            else:
                trace = run_mcts_and_collect_data(
                    GameState.new(random_seed=seed), options,
                    options.max_iterations)
            trace["ci_width"] = trace["score_upp"] - trace["score_low"]
            # keep only the final iteration, best score first
            is_last = trace.iteration.eq(trace.iteration.max())
            ranked = trace[is_last].sort_values("score", ascending=False)
            best = ranked.iloc[0]
            runner_up = ranked.iloc[1]
            if _is_ci_overlap(best.score_low, best.score_upp,
                              runner_up.score_low, runner_up.score_upp):
                overlap_count += 1
            widths = list(ranked["ci_width"].values)
            # pad short rows so every row has at least seven entries
            widths.extend([np.nan] * (7 - len(widths)))
            rows.append(tuple(widths))
    overlap_pct = np.round(overlap_count / num_samples / num_game_states * 100,
                           2)
    print(f"Overlap in the CIs for the best two actions in {overlap_count} "
          f"cases out of {num_samples * num_game_states} ({overlap_pct}%)")
    column_names = ["BestAction"] + [f"Action #{i}" for i in range(2, 8)]
    dataframe = DataFrame(rows, columns=column_names)
    suffix = "_player" if use_player else ""
    csv_path = f"mcts_ci_widths_across_game_states{suffix}.csv"
    # noinspection PyTypeChecker
    dataframe.to_csv(csv_path, index=False)
    dataframe.boxplot()
    plt.xticks(rotation=45, ha='right')
    plt.gcf().set_size_inches((5, 5))
    plt.tight_layout()
    plt.savefig(f"mcts_ci_widths_across_game_states{suffix}.png")
Пример #24
0
    def test_grouped_box_return_type(self):
        """Check the objects returned by grouped box plots for each return_type."""
        hist = self.hist_df

        # Legacy call without return_type: an ndarray comes back.
        legacy = hist.boxplot(by='gender')
        self.assertIsInstance(legacy, np.ndarray)
        self._check_box_return_type(
            legacy, None, expected_keys=['height', 'weight', 'category'])

        # The groupby flavour, asking for a dict.
        by_gender = hist.groupby('gender').boxplot(return_type='dict')
        self._check_box_return_type(
            by_gender, 'dict', expected_keys=['Male', 'Female'])

        # Second frame: ten random columns and a repeating category column.
        columns2 = 'X B C D A G Y N Q O'.split()
        wide = DataFrame(random.randn(50, 10), columns=columns2)
        categories2 = 'A B C D E F G H I J'.split()
        wide['category'] = categories2 * 5

        for kind in ['dict', 'axes', 'both']:
            outcome = hist.groupby('classroom').boxplot(return_type=kind)
            self._check_box_return_type(
                outcome, kind, expected_keys=['A', 'B', 'C'])

            outcome = hist.boxplot(by='classroom', return_type=kind)
            self._check_box_return_type(
                outcome, kind, expected_keys=['height', 'weight', 'category'])

            outcome = wide.groupby('category').boxplot(return_type=kind)
            self._check_box_return_type(
                outcome, kind, expected_keys=categories2)

            outcome = wide.boxplot(by='category', return_type=kind)
            self._check_box_return_type(outcome, kind, expected_keys=columns2)
    def test_specified_props_kwd(self, props, expected):
        """Check that a ``*props`` keyword sets the color of the matching artists."""
        # GH 30346
        frame = DataFrame({col: np.random.random(100) for col in "ABC"})
        artists = frame.boxplot(return_type="dict",
                                **{props: {"color": "C1"}})

        first_artist = artists[expected][0]
        assert first_artist.get_color() == "C1"
    def test_grouped_box_return_type(self):
        """Check the objects returned by grouped box plots for each return_type."""
        hist = self.hist_df

        # Legacy call without return_type: an ndarray comes back.
        legacy = hist.boxplot(by="gender")
        assert isinstance(legacy, np.ndarray)
        self._check_box_return_type(
            legacy, None, expected_keys=["height", "weight", "category"])

        # The groupby flavour, asking for a dict.
        by_gender = hist.groupby("gender").boxplot(return_type="dict")
        self._check_box_return_type(
            by_gender, "dict", expected_keys=["Male", "Female"])

        # Second frame: ten random columns and a repeating category column.
        columns2 = "X B C D A G Y N Q O".split()
        wide = DataFrame(np.random.randn(50, 10), columns=columns2)
        categories2 = "A B C D E F G H I J".split()
        wide["category"] = categories2 * 5

        for kind in ["dict", "axes", "both"]:
            outcome = hist.groupby("classroom").boxplot(return_type=kind)
            self._check_box_return_type(
                outcome, kind, expected_keys=["A", "B", "C"])

            outcome = hist.boxplot(by="classroom", return_type=kind)
            self._check_box_return_type(
                outcome, kind, expected_keys=["height", "weight", "category"])

            outcome = wide.groupby("category").boxplot(return_type=kind)
            self._check_box_return_type(
                outcome, kind, expected_keys=categories2)

            outcome = wide.boxplot(by="category", return_type=kind)
            self._check_box_return_type(outcome, kind, expected_keys=columns2)
Пример #27
0
def boxplot(a, y_label, save=''):
    """Draw a box plot of ``a`` with "Generations" on the x axis.

    Args:
        a: data passed straight to ``DataFrame(a)``; one box is drawn per
            resulting column.
        y_label: y-axis label; its title-cased form is also used in the plot
            title.
        save: optional file-name prefix; when non-empty the plot is saved to
            ``<save>_boxplot.png``.
    """
    fig = plt.figure(figsize=(19, 5))
    # NOTE(review): `nticks` may not be a parameter the default locator
    # accepts (`nbins` is the usual name) — confirm against the matplotlib
    # version in use.
    plt.locator_params(axis='x', nticks=10)
    df = DataFrame(a)
    # NOTE(review): this second plt.figure() call presumably makes a new
    # figure current, so the 19x5 `fig` created above is likely left empty
    # and only its dpi is used below — confirm intent.
    plt.figure()
    # label every fifth position 0..100, leave the others blank
    # (assumes the frame has 101 columns — TODO confirm)
    df.boxplot().set_xticklabels(
        [str(i) if i % 5 == 0 else '' for i in range(101)])

    plt.xlabel('Generations')
    plt.ylabel(y_label.title())
    plt.title(y_label.title() + ' x Generations')
    plt.grid(True)

    if save:
        print('saving to ' + save)
        plt.savefig(save + '_boxplot.png', dpi=fig.dpi)
def boxplot_path_by_outcome(trades, day):
    """Box-plot the return accumulated up to ``day``, grouped by final outcome.

    Args:
        trades: object whose ``trade_frame(compacted=False, cumulative=False)``
            returns a frame of per-trade returns with day labels as columns.
        day: column label splitting each trade into a "backward" part
            (up to ``day``) and a "forward" part (from ``day + 1`` on).

    The forward log-return sums are binned into five quantile buckets
    ("Final Return"); the backward log-return sums ("Current Return") are
    plotted per bucket, and the y-limits are set to 1.1 times the extreme
    whisker values.
    """
    tf = trades.trade_frame(compacted=False, cumulative=False)
    # Get the daily returns from the day after the requested day onwards.
    # Remove any trades which are empty moving forward, as we know these would have been closed.
    forward = tf.loc[:, (day + 1):].dropna(how='all')
    forward = log(forward + 1)
    # only the trades that survived the forward filter
    backward = tf.loc[forward.index, :day]
    backward = log(backward + 1)

    df = DataFrame(dtype=float)
    df['Final Return'] = qcut(forward.sum(axis=1).round(2), 5)
    df['Current Return'] = backward.sum(axis=1)

    bp = df.boxplot('Current Return', by='Final Return', return_type='dict')

    # Flatten the y-coordinates of every whisker. The original abused a list
    # comprehension purely for its `extend` side effect.
    whisker_points = [
        y for whisker in bp[0]['whiskers'] for y in whisker.get_ydata()
    ]
    y_min = min(whisker_points) * 1.1
    y_max = max(whisker_points) * 1.1
    plt.ylim((y_min, y_max))
    plt.xticks(fontsize='small', rotation=30)
    plt.ylabel('Current Return')
    plt.title('Day {}'.format(day))
Пример #29
0
    def test_grouped_box_return_type(self):
        """Check the objects returned by grouped box plots for each return_type."""
        hist = self.hist_df

        # Legacy call without return_type: an ndarray comes back.
        legacy = hist.boxplot(by='gender')
        self.assertIsInstance(legacy, np.ndarray)
        self._check_box_return_type(
            legacy, None, expected_keys=['height', 'weight', 'category'])

        # The groupby flavour without return_type must emit a FutureWarning.
        with tm.assert_produces_warning(FutureWarning):
            by_gender = hist.groupby('gender').boxplot()
        self._check_box_return_type(
            by_gender, 'dict', expected_keys=['Male', 'Female'])

        # Second frame: ten random columns and a repeating category column.
        columns2 = 'X B C D A G Y N Q O'.split()
        wide = DataFrame(random.randn(50, 10), columns=columns2)
        categories2 = 'A B C D E F G H I J'.split()
        wide['category'] = categories2 * 5

        for kind in ['dict', 'axes', 'both']:
            outcome = hist.groupby('classroom').boxplot(return_type=kind)
            self._check_box_return_type(
                outcome, kind, expected_keys=['A', 'B', 'C'])

            outcome = hist.boxplot(by='classroom', return_type=kind)
            self._check_box_return_type(
                outcome, kind, expected_keys=['height', 'weight', 'category'])

            outcome = wide.groupby('category').boxplot(return_type=kind)
            self._check_box_return_type(
                outcome, kind, expected_keys=categories2)

            outcome = wide.boxplot(by='category', return_type=kind)
            self._check_box_return_type(outcome, kind, expected_keys=columns2)
Пример #30
0
    def test_boxplot_legacy(self):
        """Exercise ``DataFrame.boxplot`` with many argument combinations.

        The first half runs the plot through ``_check_plot_works`` for
        different ``column``/``by``/``notch``/``return_type`` settings; the
        second half checks which axes and figure are used when an ``ax`` is
        passed in.
        """
        # 6x4 random frame plus two string indicator columns used for `by`
        df = DataFrame(randn(6, 4),
                       index=list(string.ascii_letters[:6]),
                       columns=['one', 'two', 'three', 'four'])
        df['indic'] = ['foo', 'bar'] * 3
        df['indic2'] = ['foo', 'bar', 'foo'] * 2

        _check_plot_works(df.boxplot, return_type='dict')
        _check_plot_works(df.boxplot,
                          column=['one', 'two'],
                          return_type='dict')
        _check_plot_works(df.boxplot, column=['one', 'two'], by='indic')
        _check_plot_works(df.boxplot, column='one', by=['indic', 'indic2'])
        _check_plot_works(df.boxplot, by='indic')
        _check_plot_works(df.boxplot, by=['indic', 'indic2'])
        # module-level boxplot function applied to a single column
        _check_plot_works(plotting.boxplot, df['one'], return_type='dict')
        _check_plot_works(df.boxplot, notch=1, return_type='dict')
        _check_plot_works(df.boxplot, by='indic', notch=1)

        # second frame: two numeric columns and two grouping columns
        df = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2'])
        df['X'] = Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'])
        df['Y'] = Series(['A'] * 10)
        _check_plot_works(df.boxplot, by='X')

        # When ax is supplied and required number of axes is 1,
        # passed ax should be used:
        fig, ax = self.plt.subplots()
        axes = df.boxplot('Col1', by='X', ax=ax)
        # NOTE(review): other variants of this test in the file use `ax.axes`
        # instead of `ax.get_axes()` — confirm the matplotlib version targeted.
        self.assertIs(ax.get_axes(), axes)

        fig, ax = self.plt.subplots()
        axes = df.groupby('Y').boxplot(ax=ax, return_type='axes')
        self.assertIs(ax.get_axes(), axes['A'])

        # Multiple columns with an ax argument should use same figure
        fig, ax = self.plt.subplots()
        axes = df.boxplot(column=['Col1', 'Col2'],
                          by='X',
                          ax=ax,
                          return_type='axes')
        self.assertIs(axes['Col1'].get_figure(), fig)

        # When by is None, check that all relevant lines are present in the dict
        fig, ax = self.plt.subplots()
        d = df.boxplot(ax=ax, return_type='dict')
        # flatten the dict's value lists into a single list of lines
        lines = list(itertools.chain.from_iterable(d.values()))
        self.assertEqual(len(ax.get_lines()), len(lines))
Пример #31
0
    def test_boxplot_legacy(self):
        """Exercise ``DataFrame.boxplot`` with many argument combinations.

        The first half runs the plot through ``_check_plot_works`` for
        different ``column``/``by``/``notch``/``return_type`` settings; the
        second half checks which axes and figure are used when an ``ax`` is
        passed in.
        """
        # 6x4 random frame plus two string indicator columns used for `by`
        df = DataFrame(randn(6, 4),
                       index=list(string.ascii_letters[:6]),
                       columns=['one', 'two', 'three', 'four'])
        df['indic'] = ['foo', 'bar'] * 3
        df['indic2'] = ['foo', 'bar', 'foo'] * 2

        _check_plot_works(df.boxplot, return_type='dict')
        _check_plot_works(df.boxplot, column=[
                          'one', 'two'], return_type='dict')
        _check_plot_works(df.boxplot, column=['one', 'two'], by='indic')
        _check_plot_works(df.boxplot, column='one', by=['indic', 'indic2'])
        _check_plot_works(df.boxplot, by='indic')
        _check_plot_works(df.boxplot, by=['indic', 'indic2'])
        # module-level boxplot function applied to a single column
        _check_plot_works(plotting.boxplot, data=df['one'], return_type='dict')
        _check_plot_works(df.boxplot, notch=1, return_type='dict')
        _check_plot_works(df.boxplot, by='indic', notch=1)

        # second frame: two numeric columns and two grouping columns
        df = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2'])
        df['X'] = Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'])
        df['Y'] = Series(['A'] * 10)
        _check_plot_works(df.boxplot, by='X')

        # When ax is supplied and required number of axes is 1,
        # passed ax should be used:
        fig, ax = self.plt.subplots()
        axes = df.boxplot('Col1', by='X', ax=ax)
        # NOTE(review): other variants of this test in the file use `ax.axes`
        # instead of `ax.get_axes()` — confirm the matplotlib version targeted.
        self.assertIs(ax.get_axes(), axes)

        fig, ax = self.plt.subplots()
        axes = df.groupby('Y').boxplot(ax=ax, return_type='axes')
        self.assertIs(ax.get_axes(), axes['A'])

        # Multiple columns with an ax argument should use same figure
        fig, ax = self.plt.subplots()
        axes = df.boxplot(column=['Col1', 'Col2'],
                          by='X', ax=ax, return_type='axes')
        self.assertIs(axes['Col1'].get_figure(), fig)

        # When by is None, check that all relevant lines are present in the
        # dict
        fig, ax = self.plt.subplots()
        d = df.boxplot(ax=ax, return_type='dict')
        # flatten the dict's value lists into a single list of lines
        lines = list(itertools.chain.from_iterable(d.values()))
        self.assertEqual(len(ax.get_lines()), len(lines))
def test_multiple_batch_sizes_and_neurons(experiment_func=experiment):
    """Run ``experiment_func`` for every batch-size / neuron-count pair.

    The force series is read from a CSV file, thinned to every fifth sample
    and handed to ``experiment_func(repeats, series, epochs, batch_size,
    neurons)``.  For each neuron count the per-batch-size results are
    summarised on stdout and drawn as a box plot, which is saved to
    ``boxplot_batch_size_neurons<n>.png`` and then shown.

    Returns a pair of dicts keyed by the neuron count (as a string): the
    first holds the DataFrame of results with one column per batch size, the
    second a dict of runtimes (difference of two ``time()`` readings) keyed
    by batch size.
    """
    # Experiment conditions
    downsample_step = 5
    n_epochs = 250
    batch_sizes = [1, 2, 4, 8]
    neuron_counts = [1, 2, 4, 8]
    n_repeats = 3

    results_by_neurons = {}
    timings_by_neurons = {}

    # Load the dataset, then keep every ``downsample_step``-th sample to
    # speed up training of the model.
    raw_series = read_csv('david240520160001-singleLegVertForceSeries.csv',
                          header=0,
                          parse_dates=[0],
                          index_col=0,
                          squeeze=True)
    thinned_series = raw_series.iloc[::downsample_step]

    for n_units in neuron_counts:
        per_batch = DataFrame()
        per_batch_time = {}
        for size in batch_sizes:
            column = str(size)
            print(
                'Running function \"{}\" with parameters (epochs: {}; batch_size: {}; neurons: {})'
                .format(experiment_func.__name__, n_epochs, size, n_units))
            started = time()
            per_batch[column] = experiment_func(n_repeats, thinned_series,
                                                n_epochs, size, n_units)
            elapsed = time() - started
            per_batch_time[column] = elapsed
            print('Completed function \"{}\" in {} seconds'.format(
                experiment_func.__name__, elapsed))

        # Summarise and store everything gathered for this neuron count.
        print(per_batch.describe())
        unit_key = str(n_units)
        results_by_neurons[unit_key] = per_batch
        timings_by_neurons[unit_key] = per_batch_time

        # Save the box plot before showing it.
        per_batch.boxplot()
        pyplot.savefig('boxplot_batch_size_neurons{}.png'.format(n_units))
        pyplot.show()

    return results_by_neurons, timings_by_neurons
Пример #33
0
    def test_boxplot_group_xlabel_ylabel(self, vert):
        """Check axis labels of a grouped box plot.

        Explicit ``xlabel``/``ylabel`` must show up on every subplot; without
        them the group axis (x when ``vert`` is true, y otherwise) must be
        labelled with the pretty-printed ``by`` columns.
        """
        n_rows = 100
        frame = DataFrame({
            "a": np.random.randn(n_rows),
            "b": np.random.randn(n_rows),
            "group": np.random.choice(["group1", "group2"], n_rows),
        })

        # Labels given explicitly are applied to each subplot.
        want_x, want_y = "x", "y"
        panels = frame.boxplot(by="group", vert=vert,
                               xlabel=want_x, ylabel=want_y)
        for panel in panels:
            assert panel.get_xlabel() == want_x
            assert panel.get_ylabel() == want_y
        self.plt.close()

        # Without explicit labels the grouping axis carries the ``by`` name.
        panels = frame.boxplot(by="group", vert=vert)
        for panel in panels:
            if vert:
                target_label = panel.get_xlabel()
            else:
                target_label = panel.get_ylabel()
            assert target_label == pprint_thing(["group"])
        self.plt.close()
Пример #34
0
 def test_boxplot_xlabel_ylabel(self, vert):
     """Labels passed to an ungrouped ``boxplot`` appear on the returned axes."""
     n_rows = 100
     data = {
         "a": np.random.randn(n_rows),
         "b": np.random.randn(n_rows),
         "group": np.random.choice(["group1", "group2"], n_rows),
     }
     frame = DataFrame(data)
     want_x, want_y = "x", "y"
     axes = frame.boxplot(vert=vert, xlabel=want_x, ylabel=want_y)
     assert axes.get_xlabel() == want_x
     assert axes.get_ylabel() == want_y
Пример #35
0
    def test_boxplot_return_type_legacy(self):
        """Exercise the ``return_type`` argument of ``DataFrame.boxplot``.

        An unknown value must raise ``ValueError``, omitting the argument
        must emit a ``FutureWarning`` while still returning the dict form,
        and each explicit value must be silent and return the matching type.
        """
        # API change in https://github.com/pydata/pandas/pull/7096
        import matplotlib as mpl  # noqa

        frame = DataFrame(randn(6, 4),
                          index=list(string.ascii_letters[:6]),
                          columns=['one', 'two', 'three', 'four'])

        # Unknown return types are rejected outright.
        with tm.assertRaises(ValueError):
            frame.boxplot(return_type='NOTATYPE')

        # The legacy default warns; change to Axes in future.
        with tm.assert_produces_warning(FutureWarning):
            legacy = frame.boxplot()
        self._check_box_return_type(legacy, 'dict')

        # Explicit return types must not warn.
        for kind in ('dict', 'axes', 'both'):
            with tm.assert_produces_warning(False):
                explicit = frame.boxplot(return_type=kind)
            self._check_box_return_type(explicit, kind)
def boxplot_path_by_outcome(trades, day):
    """Box-plot the return accumulated up to ``day``, grouped by final outcome.

    The frame returned by ``trades.trade_frame(compacted=False,
    cumulative=False)`` is split at ``day``: the columns after it give each
    trade's remaining (forward) return, the columns up to and including it
    the return so far.  Both are converted with ``log(x + 1)`` and summed per
    trade.  Trades are binned into five quantile groups of their rounded
    forward return, and the return so far is box-plotted per group on the
    current matplotlib figure.

    Args:
        trades: object exposing ``trade_frame(compacted, cumulative)``.
        day: column label at which the path is split.

    Raises:
        ValueError: if the plot contains no whisker points (``min`` of an
            empty sequence).
    """
    tf = trades.trade_frame(compacted=False, cumulative=False)
    # Get the daily returns from the day after the requested day onwards.
    # Remove any trades which are empty moving forward, as we know these
    # would have been closed.
    forward = tf.loc[:, (day + 1):].dropna(how='all')
    forward = log(forward + 1)
    backward = log(tf.loc[forward.index, :day] + 1)

    df = DataFrame(dtype=float)
    df['Final Return'] = qcut(forward.sum(axis=1).round(2), 5)
    df['Current Return'] = backward.sum(axis=1)

    bp = df.boxplot('Current Return', by='Final Return', return_type='dict')

    # With ``by`` set, the result is keyed by the plotted column name; look it
    # up by label rather than by position.
    whisker_points = []
    for whisker in bp['Current Return']['whiskers']:
        whisker_points.extend(whisker.get_ydata())

    # Pad each limit outwards by 10% of its magnitude.  Plain ``value * 1.1``
    # would move a positive lower bound (or a negative upper bound) inwards
    # and clip the whiskers.
    lowest = min(whisker_points)
    highest = max(whisker_points)
    y_min = lowest - 0.1 * abs(lowest)
    y_max = highest + 0.1 * abs(highest)
    plt.ylim((y_min, y_max))
    plt.xticks(fontsize='small', rotation=30)
    plt.ylabel('Current Return')
    plt.title('Day {}'.format(day))
Пример #37
0
# Script fragment: response-time plots per police district, followed by the
# setup for an event-type analysis.  `frame`, `t_grouped`, `delta2`,
# `data_sum`, `np` and `plt` are defined earlier in the script (not visible
# here).

# Spread between the largest and the smallest value in t_grouped.
np.max(t_grouped)-np.min(t_grouped)
## recorded result: 186.398124


import matplotlib
matplotlib.style.use('ggplot')

# Bar chart of the mean ResponseTime per District, with a dashed horizontal
# line at the mean of delta2.
t_grouped = frame.groupby(['District'])[['ResponseTime']].mean()
ax=t_grouped.plot(kind='bar',legend=False,title='The Response Times in Each Police District')
ax.set_ylabel('The Means of Response Times(secs)')
ax.set_xlabel('The Police District')
ax.axhline(np.mean(delta2), color='k',linestyle='dashed')
plt.savefig('C:/Users/tianyu/Desktop/0415/means.png', dpi=400, bbox_inches='tight')

# Keep only rows with a positive response time, then box-plot by district.
# NOTE(review): the column is spelled 'ResponseTime' in the groupby above but
# 'Response Time' here -- confirm which key the frame actually has.
frame=frame[frame['Response Time']>0]
# sym='' hides the outlier markers.
# NOTE(review): matplotlib only draws a mean line when showmeans=True is also
# passed, so meanline=True alone may have no effect -- confirm the intent.
bp = frame.boxplot(by='District',sym='', meanline=True,figsize=(6,6))
bp.set_ylim([-10, 2000])
bp.set_xlabel('Police District')
plt.savefig('C:/Users/tianyu/Desktop/0415/box.png', dpi=400, bbox_inches='tight')


## We can define surprising event types as those that occur more often in a
## district than they do over the whole city. What is the largest ratio of the
## conditional probability of an event type given a district to the
## unconditional probability of that event type?
## Consider only event types which have more than 100 events. Note that some
## events have their locations anonymized and are reported as being in
## district "0". These should be ignored.
# Drop the anonymized district 0, then keep the event types that occur more
# than 100 times in the remaining rows.
clean_data=data_sum[['Type_','PoliceDistrict']][data_sum['PoliceDistrict']!=0]
type_counts = clean_data['Type_'].value_counts()
types=type_counts[type_counts>100].index
# Running maximum of the ratio; presumably updated by the loop that follows.
maxfr=0.0
for t in types:
Пример #38
0
 def test_figsize(self):
     """A ``figsize`` passed to ``boxplot`` sets the figure's size in inches."""
     want_width, want_height = 12, 8
     frame = DataFrame(np.random.rand(10, 5), columns=list('ABCDE'))
     axes = frame.boxplot(return_type='axes',
                          figsize=(want_width, want_height))
     assert axes.figure.bbox_inches.width == want_width
     assert axes.figure.bbox_inches.height == want_height
Пример #39
0
 def test_fontsize(self):
     """``fontsize`` passed to a grouped ``boxplot`` sizes both tick-label sets."""
     size = 16
     values = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
     frame = DataFrame(values)
     axes = frame.boxplot("a", by="b", fontsize=size)
     self._check_ticks_props(axes, xlabelsize=size, ylabelsize=size)
class Dataset(object):
    """
    This class holds an object that stores all the tables and the results
    of the analysis.

    To access them once the analysis is over do:
    data = Dataset()
    data.df: for the raw word frequency data
    data.tf_idf for the tf-idf weighted word table
    data.cdb for the table holding clusters and dep. vars.
    data.top_words for the table of most used words.
    data.desc_stat for a table of descriptive statistics for each cluster
    data.reg_results for the regression results stored as statsmodels
                     regressionResults objects

    You can show plots and print reg. results by doing
    data.show_plots()
    data.regression_results()
    """

    # Columns of ``df`` that describe a document instead of counting a word.
    _META_COLUMNS = ('country_id', 'tot_terms')

    def __init__(self):
        self.tf_idf = DataFrame()
        self.df = DataFrame()
        self.cdb = DataFrame()
        self.top_words = DataFrame()
        self.desc_stat = DataFrame()
        self.reg_results = []
        self.multi_results = DataFrame()

    def _word_columns(self):
        """Return the labels of the columns of ``df`` that hold word counts."""
        return [c for c in self.df.columns if c not in self._META_COLUMNS]

    def create(self, paths, country_names, save_file="", clean=True,
               stopwords_path="../data/stopwords.csv", display_progress=False):
        """Build the raw word-frequency table ``df`` from constitution files.

        Args:
            paths: paths of the constitution files, one per country.
            country_names: names stored in the ``country_id`` column; row
                ``i`` is filled from ``paths[i]``.
            save_file: when not empty, the table is written to this CSV path
                before any cleaning.
            clean: when true, ``clean`` is run on the finished table.
            stopwords_path: stop-word file handed to ``clean``.
            display_progress: forwarded to the progress bar and to ``clean``.
        """
        # Create progress bar, Pbar class will handle import and
        # whether or not to display.
        bar = Pbar(display_progress)
        bar.create("Generating csv dataset...", len(paths))

        # Init database with as many rows as there are countries
        self.df = DataFrame(country_names, columns=["country_id"])
        self.df['tot_terms'] = 0

        for row, path in enumerate(paths):
            bar.update(row)

            frequencies = get_frequency(load_constitution(path))

            # Number of distinct terms found in this constitution
            self.df.loc[row, 'tot_terms'] = len(frequencies)

            for word, count in frequencies.items():
                # Initialize all words that have not appeared in other
                # constitutions to frequency 0
                if word not in self.df.columns:
                    self.df[word] = 0

                # Single .loc write: chained ``df[word][row] = ...`` may
                # assign to a temporary copy.
                self.df.loc[row, word] = count

        bar.finish()

        if save_file != "":
            print("Saving dataset to csv file...")
            self.df.to_csv(save_file, index=False)

        if clean:
            self.clean(stopwords_path, display_progress)

    def load(self, path, stopwords="../data/stopwords.csv", clean=True,
             display_progress=False):
        """Read ``df`` from the CSV file at ``path`` and optionally clean it."""
        self.df = read_csv(path)
        if clean:
            self.clean(stopwords, display_progress)

    def clean(self, stopwords_path, display_progress=False):
        """Drop word columns that carry no information.

        A word column is removed when its label starts with a digit, when it
        is listed in the comma-separated stop-word file, or when it has no
        positive count.  The ``country_id`` and ``tot_terms`` columns are
        always kept.
        """
        # NOTE(review): mirrors the Pbar usage in create(); Pbar is defined
        # elsewhere, so confirm this against its actual signature.
        bar = Pbar(display_progress)
        bar.create("Cleaning dataset...", len(self.df.columns))

        # Read-only access is enough; strip stray whitespace/newlines so the
        # last entry of the file still matches.
        with open(stopwords_path, 'r') as sw_file:
            stopwords = set(w.strip() for w in sw_file.read().split(','))

        digits = "0123456789"
        to_drop = []
        for i, c in enumerate(self.df.columns):
            if c not in self._META_COLUMNS:
                # Remove all words which meet the following conditions
                if (c[0] in digits or c in stopwords
                        or not (self.df[c] > 0).any()):
                    to_drop.append(c)

            bar.update(i)

        # Drop everything at once instead of rebuilding the frame per column.
        self.df = self.df.drop(to_drop, axis=1)

        bar.finish()

    def build_tfidf_table(self):
        """Compute ``tf_idf`` from the word counts in ``df``.

        tf is each count divided by the row's ``tot_terms``; idf is
        ``log(n_documents / n_documents_containing_word + 1)``.  ``df``
        itself is left untouched.  A word that occurs in no document yields
        NaN (``0 * inf``); ``clean`` removes such columns.
        """
        # Exclude country name and total words from data
        counts = self.df[self._word_columns()]

        # To create the tf term, divide each row by the number of words
        # that appear in that country's constitution.
        tf = counts.div(self.df['tot_terms'], axis=0)

        # To create idf, divide the number of documents by the number
        # of documents containing each word.  The latter is obtained by
        # summing a table of bools that is true where the word has
        # freq. > 0.  ``float`` keeps the division exact on Python 2.
        n_docs = float(len(self.df.index))
        docs_with_word = (counts > 0).sum(axis=0)
        idf = np.log(n_docs / docs_with_word + 1)

        self.tf_idf = tf * idf

    def get_cluster(self, c_id, clusterCol='kmeans'):
        """Return the rows of ``cdb`` whose ``clusterCol`` equals ``c_id``.

        Raises:
            KeyError: if no row carries that cluster label.
        """
        labels = self.cdb[clusterCol]
        # ``in`` on a Series tests the index, so look at the values instead.
        if c_id not in labels.values:
            raise KeyError("Selected cluster not in dataset")

        return self.cdb[labels == c_id]

    def get_topwords(self, countries, thresh=10, tf_idf=False):
        """Return the ``thresh`` words with the highest mean for ``countries``.

        Rows are selected by matching ``df['country_id']`` against
        ``countries``; the mean is taken over the word columns of ``tf_idf``
        when ``tf_idf`` is true and of ``df`` otherwise.  The result is a
        Series indexed by word, sorted in descending order.
        """
        table = self.tf_idf if tf_idf else self.df
        selected = table[self.df['country_id'].isin(countries).values]
        # Only word columns can be "top words".
        selected = selected.drop(list(self._META_COLUMNS), axis=1,
                                 errors='ignore')

        return selected.mean().sort_values(ascending=False).iloc[:thresh]

    def get_word_avg(self, countries, word, tf_idf=False):
        """Return the value of ``word`` summed over ``countries``, divided by
        ``len(countries)``.

        Raises:
            ZeroDivisionError: if ``countries`` is empty.
        """
        table = self.tf_idf if tf_idf else self.df
        selected = self.df['country_id'].isin(countries).values
        # ``float`` keeps the division exact on Python 2.
        return table.loc[selected, word].sum() / float(len(countries))

    def build_topwords_table(self, cluster_col="kmeans", thresh=10, raw=True):
        """Fill ``top_words`` with one row per cluster in ``cdb[cluster_col]``.

        Each cluster contributes its ``thresh`` top words (raw counts when
        ``raw`` is true, tf-idf otherwise).  Cells for words that are a top
        word of another cluster only are filled with this cluster's average
        for that word.  Countries are read from the ``country`` column of
        ``cdb``.

        Raises:
            Exception: if ``cdb`` is empty.
        """
        if len(self.cdb) == 0:
            raise Exception("Cluster database not initialized")

        # get the names of all the clusters created, in order of appearance
        labels = self.cdb[cluster_col].unique().tolist()

        self.top_words = DataFrame({'cluster': labels})
        for row, label in enumerate(labels):
            countries = list(self.get_cluster(label, cluster_col)['country'])
            tw = self.get_topwords(countries, thresh, tf_idf=(not raw))

            for w in tw.index:
                if w not in self.top_words.columns:
                    # Float column, since means are written into it.
                    self.top_words[w] = 0.0
                self.top_words.loc[row, w] = tw[w]

        for row in range(len(self.top_words)):
            label = self.top_words.loc[row, 'cluster']
            countries = list(self.get_cluster(label, cluster_col)['country'])
            for w in self.top_words.columns:
                if w != 'cluster' and self.top_words.loc[row, w] == 0:
                    self.top_words.loc[row, w] = self.get_word_avg(
                        countries, w, tf_idf=(not raw))

    def build_descstat_table(self, cluster_col="kmeans",
                             cols=('fh_score', 'LJI', 'fragility'),
                             na_cols=('fragility',)):
        """Fill ``desc_stat`` with mean, median and std of ``cols`` per cluster.

        The table is indexed by cluster label and has a ``cluster`` column
        followed by ``<col>_mean``, ``<col>_median`` and ``<col>_std`` for
        every entry of ``cols``.  For columns listed in ``na_cols``, rows
        holding the string ``'NA'`` are left out before the statistics are
        computed.

        Raises:
            Exception: if ``cdb`` is empty.
        """
        if len(self.cdb) == 0:
            raise Exception("Cluster database not initialized")

        labels = self.cdb[cluster_col].unique().tolist()
        col_labels = [c + suffix for c in cols
                      for suffix in ('_mean', '_median', '_std')]

        rows = []
        for label in labels:
            cluster = self.get_cluster(label, cluster_col)
            row = [label]

            for c in cols:
                values = cluster[c]
                if c in na_cols:
                    # Drop the 'NA' markers, then make the rest numeric.
                    values = values[values != 'NA'].astype(float)
                row.extend([values.mean(), values.median(), values.std()])

            rows.append(row)

        self.desc_stat = DataFrame(rows, columns=['cluster'] + col_labels,
                                   index=labels)

    def regression_results(self):
        """Print the summary of every stored regression result.

        Raises:
            RuntimeError: if ``reg_results`` is empty.
        """
        if not self.reg_results:
            raise RuntimeError("Tried to access regression results before "
                               "running regressions")

        for r in self.reg_results:
            print(r.summary())

    def make_plots(self, save=False):
        """Box-plot ``fh_score``, ``LJI`` and ``fragility`` by ``kmeans`` cluster.

        When ``save`` is true each plot is written under ``../output/img/``;
        otherwise the plots are shown.

        Raises:
            RuntimeError: if ``cdb`` is empty.
        """
        # ``not DataFrame`` is ambiguous and raises, so test the length.
        if len(self.cdb) == 0:
            raise RuntimeError("Tried to build plots with empty cluster table")

        plt.figure(1)
        self.cdb.boxplot(column="fh_score", by="kmeans")
        if save:
            plt.savefig("../output/img/FH.png")

        # NOTE(review): the ')' in the two file names below looks like a
        # typo; left unchanged in case something depends on these paths.
        plt.figure(2)
        self.cdb.boxplot(column="LJI", by="kmeans")
        if save:
            plt.savefig("../output/img/LJ).png")

        plt.figure(3)
        self.cdb[self.cdb['fragility'] != 'NA'].boxplot(column="fragility",
                                                        by="kmeans")
        if save:
            plt.savefig("../output/img/SFI).png")

        if not save:
            plt.show()

    def show_plots(self):
        """Build the plots and show them."""
        # make_plots() already calls plt.show() when it is not saving.
        self.make_plots()
Пример #41
0
        for host in hosts:
            data = {
                "os": osystem,
                "host": host,
                "kind": kind
            }
            data["name"] = "{os}_{kind}_{host}.csv".format(**data)
            if os.path.isfile(data["name"]):
                d = pd.read_csv("{name}".format(**data))
                for v in d.real:
                    data["value"] = v
                    values.append([data['value'], "{os}\n{host}".format(**data)])

    pprint(values)

    df = DataFrame(values)
    df.columns = ['Time in s', 'Host']
    print(df)

    df.boxplot(column='Time in s',
               by='Host',
               rot=0)

    plt.suptitle('Performance Comparison OpenFace: {}'.format(kind))

    pdf = "boxplot-{}.png".format(kind)
    plt.savefig(pdf)
    plt.close()

    os.system("open {}".format(pdf))
Пример #42
0
# Notebook export: X and Y are defined in earlier cells (not visible here).
# In[15]:

# Histogram of the column labelled 0.
X[0].hist()


##### We can also get the summary statistics of that variable

# In[16]:

X[0].describe()


# In[22]:

X.boxplot(column=[0,1,2,3])


##### This is a boxplot (also called a box-and-whisker diagram), which summarizes the distribution of each variable under study.

##### The line inside each box marks the median of that variable (not the mean). The length of the box, the interquartile range, shows its spread. The box edges are the lower and upper quartiles, and the whisker ends mark the smallest and largest values that are not drawn as outliers.

# In[38]:

# Add a column labelled 4 filled with zeros and look at the first row.
X[4]=0
X.head(1)


# In[40]:

# Overwrite column 4 with Y -- presumably the target values; confirm against
# the cell that defines Y.
X[4]=Y