Пример #1
0
 def test_df_from_input_json_bad(self, open_file, json):
     """A ValueError raised by json.loads must surface as SystemExit."""
     # json.loads is rigged to fail on every call
     json.loads = MagicMock(side_effect=ValueError())
     open_file.return_value = MagicMock()
     cli_args = MagicMock(names=[], input_options=['json'])
     source = MagicMock()
     with self.assertRaises(SystemExit):
         io_lib.df_from_input(cli_args, in_file=source)
Пример #2
0
 def test_df_from_input_no_input(self, pd_mock):
     """A ValueError raised by read_csv must surface as SystemExit."""
     # any call to the patched read_csv raises ValueError
     pd_mock.read_csv = MagicMock(side_effect=ValueError())
     cli_args = MagicMock(names=[], input_options=[])
     source = MagicMock()
     with self.assertRaises(SystemExit):
         io_lib.df_from_input(cli_args, in_file=source)
Пример #3
0
 def test_df_from_input_no_input(self, pd_mock):
     """A ValueError raised by read_csv must surface as SystemExit."""
     # any call to the patched read_csv raises ValueError
     pd_mock.read_csv = MagicMock(side_effect=ValueError())
     cli_args = MagicMock(names=[], input_options=[])
     source = MagicMock()
     with self.assertRaises(SystemExit):
         io_lib.df_from_input(cli_args, in_file=source)
Пример #4
0
 def test_df_from_input_json_bad(self, open_file, json):
     """A ValueError raised by json.loads must surface as SystemExit."""
     # json.loads is rigged to fail on every call
     json.loads = MagicMock(side_effect=ValueError())
     open_file.return_value = MagicMock()
     cli_args = MagicMock(names=[], input_options=['json'])
     source = MagicMock()
     with self.assertRaises(SystemExit):
         io_lib.df_from_input(cli_args, in_file=source)
Пример #5
0
def main():
    """Histogram dataframe columns, or emit bin counts when quiet.

    Arguments come from get_input_args() and the frame from
    io_lib.df_from_input().  When no columns are supplied the first column
    of the frame is used.  With args.quiet set, the bin centers and counts
    of the first selected column are written through io_lib.df_to_output();
    otherwise the histogram is drawn and shown through plot_lib.
    """
    args = get_input_args()
    df = io_lib.df_from_input(args)

    # extract parameters from arg parser
    nbins = args.nbins[0]
    range_tup = args.range
    layout_tup = args.layout
    alpha = args.alpha[0]
    do_density = args.density
    sharex = args.sharex
    sharey = args.sharey
    cols = args.cols if args.cols else [df.columns[0]]

    validate_args(args, cols, df)
    plot_lib.set_plot_styling(args)

    # no plotting if output requested
    if args.quiet:
        counts, edges = np.histogram(
            df[cols[0]], bins=nbins, range=range_tup, density=do_density)
        # bin centers are the left edges shifted by half of each bin width
        centers = edges[:-1] + 0.5 * np.diff(edges)
        df_out = pd.DataFrame({'bins': centers, 'counts': counts})
        io_lib.df_to_output(args, df_out)

    # otherwise do plotting
    else:
        # `density` is the supported spelling; matplotlib removed the old
        # `normed` keyword, so passing it makes the hist call fail.
        df.hist(cols, bins=nbins, range=range_tup,
                alpha=alpha, sharex=sharex, sharey=sharey, layout=layout_tup,
                density=do_density)

        plot_lib.refine_plot(args)
        plot_lib.show(args)
Пример #6
0
def main():
    """Feed every dataframe row to an OutStream built from --template.

    The input frame is read with io_lib.df_from_input(); each record's
    fields are passed to OutStream.write() as keyword arguments.
    """
    description = textwrap.dedent("""
        Create strings from a dataframe using python str.format() template.
        This tool is particularly useful for generating a list of commands
        that for piping into p.parallel.
        -----------------------------------------------------------------------
        Examples:

            * Create commands to touch a sequence of files in /tmp
                seq 10 | p.df --names n -i noheader\\
                | p.format -t 'touch /tmp/file{n:02d}.txt'
        -----------------------------------------------------------------------
        """)

    # build the command line interface
    cli = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    arg_lib.add_args(cli, 'io_in')
    cli.add_argument(
        '-t', '--template', nargs=1, required=True,
        help='A python template string')
    args = cli.parse_args()

    # load the input frame
    frame = io_lib.df_from_input(args)

    # one write per record, fields passed as keyword arguments
    template = args.template[0]
    writer = OutStream(template)
    for row in frame.to_dict('records'):
        writer.write(**row)
Пример #7
0
 def test_df_from_input_create_names(self, pd_mock):
     """With 'noheader' and no names the columns come back as c0, c1."""
     frame = pd.DataFrame(columns=[1, 2])
     # keep the real Index class available on the patched pandas module
     pd_mock.Index = pd.Index
     pd_mock.read_csv = MagicMock(return_value=frame)
     cli_args = MagicMock(names=[], input_options=['noheader'])
     result = io_lib.df_from_input(cli_args, in_file=None)
     self.assertEqual(['c0', 'c1'], list(result.columns))
Пример #8
0
def main():
    """Feed every dataframe row to an OutStream built from --template.

    The input frame is read with io_lib.df_from_input(); each record's
    fields are passed to OutStream.write() as keyword arguments.
    """
    description = textwrap.dedent("""
        Create strings from a dataframe using python str.format() template.
        This tool is particularly useful for generating a list of commands
        that for piping into p.parallel.
        -----------------------------------------------------------------------
        Examples:

            * Create commands to touch a sequence of files in /tmp
                seq 10 | p.df --names n -i noheader\\
                | p.format -t 'touch /tmp/file{n:02d}.txt'
        -----------------------------------------------------------------------
        """)

    # build the command line interface
    cli = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    arg_lib.add_args(cli, 'io_in')
    cli.add_argument(
        '-t', '--template', nargs=1, required=True,
        help='A python template string')
    args = cli.parse_args()

    # load the input frame
    frame = io_lib.df_from_input(args)

    # one write per record, fields passed as keyword arguments
    template = args.template[0]
    writer = OutStream(template)
    for row in frame.to_dict('records'):
        writer.write(**row)
Пример #9
0
 def test_df_from_input_create_names(self, pd_mock):
     """With 'noheader' and no names the columns come back as c0, c1."""
     frame = pd.DataFrame(columns=[1, 2])
     # keep the real Index class available on the patched pandas module
     pd_mock.Index = pd.Index
     pd_mock.read_csv = MagicMock(return_value=frame)
     cli_args = MagicMock(names=[], input_options=['noheader'])
     result = io_lib.df_from_input(cli_args, in_file=None)
     self.assertEqual(['c0', 'c1'], list(result.columns))
Пример #10
0
def main():
    """Run lomb_scargle_lib.lomb_scargle on the input frame and write it out.

    The time column, observation column, interpolation exponent and the
    frequency-ordering flag are all taken from the command line.
    """
    description = textwrap.dedent("""
        Computes a spectrogram using the lomb-scargle algorithm provided by
        the gatspy module.  The input time series need not have evenly spaced
        time-stamps.  The FFT-based algorithm has complexity O[N*log(N)].

        -----------------------------------------------------------------------
        Examples:

            * Plot the spectrum of a simple sine wave
                  p.linspace 0 10 100 \\
                  | p.df 'df["value"] = 7 * np.sin(2*np.pi*df.time / 1.5)'\\
                        --names time\\
                  | p.lomb_scargle -t time -y value --interp_exp 3\\
                  | p.plot -x period -y amp --xlim 0 3

            * Show the annual and 59-day peaks in the sealevel spectrum
                p.example_data -d sealevel\\
                | p.df 'df["day"] = 365.25 * df.year'\\
                        'df["day"] = df.day - df.day.iloc[0]'\\
                | p.lomb_scargle -t day -y sealevel_mm --interp_exp 3\\
                | p.df 'df[df.period < 720]'\\
                | p.plot -x period -y amp --xlim 1 400\\
                         --title 'Sea-surface height spectrum'\\
                         --xlabel 'period (days)'

        -----------------------------------------------------------------------
        """)

    # build the command line interface
    cli = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    arg_lib.add_args(cli, 'io_in', 'io_out')

    cli.add_argument(
        '-t', '--time_col', type=str, nargs=1, required=True,
        help='Time Column')
    cli.add_argument(
        '-y', '--observation_col', dest='val_col', type=str, nargs=1,
        required=True, help='Observation column')
    cli.add_argument(
        '--interp_exp', type=int, nargs=1, default=[1],
        help='Interpolate by this power of 2')
    cli.add_argument(
        '--freq_order', dest='freq_order', action='store_true', default=False,
        help='Order output by freqency instead of period')

    args = cli.parse_args()

    # load the frame, then unwrap the single-element option lists
    frame = io_lib.df_from_input(args)
    time_col = args.time_col[0]
    val_col = args.val_col[0]
    interp_exp = args.interp_exp[0]
    spectrum = lomb_scargle_lib.lomb_scargle(
        frame, time_col, val_col, interp_exp, args.freq_order)

    # hand the result to the output writer
    io_lib.df_to_output(args, spectrum)
Пример #11
0
 def test_df_from_input_json_names(self, open_file, json):
     """Parsed JSON records load into a frame with the single column 'a'."""
     records = [{'a': 1}, {'a': 2}]
     json.loads = MagicMock(return_value=records)
     open_file.return_value = MagicMock()
     cli_args = MagicMock(names=['a'], input_options=['json'])
     source = MagicMock()
     result = io_lib.df_from_input(cli_args, in_file=source)
     self.assertEqual(list(result.columns), ['a'])
     self.assertEqual(list(result.a), [1, 2])
Пример #12
0
 def test_df_from_input_json_names(self, open_file, json):
     """Parsed JSON records load into a frame with the single column 'a'."""
     records = [{'a': 1}, {'a': 2}]
     json.loads = MagicMock(return_value=records)
     open_file.return_value = MagicMock()
     cli_args = MagicMock(names=['a'], input_options=['json'])
     source = MagicMock()
     result = io_lib.df_from_input(cli_args, in_file=source)
     self.assertEqual(list(result.columns), ['a'])
     self.assertEqual(list(result.a), [1, 2])
Пример #13
0
def main():
    """Run smooth() over the selected columns and write the result out.

    args.y selects the columns (the frame's first column when empty);
    args.x, when given, supplies the x column passed to smooth().
    """
    args = get_input_args()
    df = io_lib.df_from_input(args)

    # x column is optional
    x_col = None
    if args.x:
        x_col = args.x[0]

    # default to the first column of the frame
    cols = args.y or [df.columns[0]]

    # the x column is validated together with the y columns when present
    if x_col:
        cols_to_check = cols + [x_col]
    else:
        cols_to_check = cols
    validate_args(args, cols_to_check, df)

    smoothed = smooth(df, cols, x_col)
    io_lib.df_to_output(args, smoothed)
Пример #14
0
def main():
    """Run smooth() over the selected columns and write the result out.

    args.y selects the columns (the frame's first column when empty);
    args.x, when given, supplies the x column passed to smooth().
    """
    args = get_input_args()
    df = io_lib.df_from_input(args)

    # x column is optional
    x_col = None
    if args.x:
        x_col = args.x[0]

    # default to the first column of the frame
    cols = args.y or [df.columns[0]]

    # the x column is validated together with the y columns when present
    if x_col:
        cols_to_check = cols + [x_col]
    else:
        cols_to_check = cols
    validate_args(args, cols_to_check, df)

    smoothed = smooth(df, cols, x_col)
    io_lib.df_to_output(args, smoothed)
Пример #15
0
def main():
    """Apply outlier_lib.sigma_edit_dataframe to the input and write it out.

    The sigma threshold, the columns to edit and the iteration cap are
    taken from the command line.
    """
    description = textwrap.dedent(
        """
        Remove outliers from DataFrame columns using a recursive sigma-edit
        algorithm.  The algorithm will recursively NaN out values greater than
        sigma_thresh standard deviations away from sample mean.

        -----------------------------------------------------------------------
        Examples:

            * Do a 2.5-sigma edit on a gamma distribution and show histogram
                p.rand -n 1000 -t gamma --alpha=3 --beta=.01\\
                | p.df 'df["c1"] = df.c0'\\
                | p.sig_edit -c c1 -t 2.5\\
                | p.df 'pd.melt(df)' --names raw edited\\
                | p.facet_grid --hue variable --map pl.hist\\
                   --args value --kwargs 'alpha=.2' 'range=[0, 1000]' 'bins=50'
        -----------------------------------------------------------------------
        """
    )

    # build the command line interface
    cli = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    arg_lib.add_args(cli, 'io_in', 'io_out', 'example')

    cli.add_argument(
        "-t", "--sigma_thresh", type=float, nargs=1, required=True,
        help="Sigma threshold")
    cli.add_argument(
        "-c", "--cols", nargs="+", required=True,
        help="Column(s) to sigma-edit")
    cli.add_argument(
        "--max_iter", type=int, nargs=1, default=[20],
        help="Max number of recursions")

    args = cli.parse_args()

    # load the frame, then unwrap the single-element option lists
    frame = io_lib.df_from_input(args)
    threshold = args.sigma_thresh[0]
    iteration_cap = args.max_iter[0]
    edited = outlier_lib.sigma_edit_dataframe(
        threshold, args.cols, frame, max_iter=iteration_cap)

    # hand the result to the output writer
    io_lib.df_to_output(args, edited)
Пример #16
0
def main():
    """Draw an xy plot of the input frame via plot_lib.draw_xy_plot.

    Adds an --alpha option on top of the shared 'io_in', 'xy_plotting'
    and 'decorating' argument groups.
    """
    description = textwrap.dedent(
        """
        Creates interactive xy plots.  Loosely based around matplotlib's
        pyplot.plot command.

        -----------------------------------------------------------------------
        Examples:

            * Really simple plot
                p.linspace 1 10 7 | p.plot -x c0 -y c0

            * Plot two traces
                p.linspace 0 6.28 100\\
                | p.df 'df["cos"]=np.cos(df.t)' 'df["sin"]=np.sin(df.t)'\\
                        --names t\\
                | p.plot -x t -y sin cos\\
                         --style '.-' 'o-' --alpha 1 .2 --legend best

            * Plot sea-level time series
                p.example_data -d sealevel\\
                | p.plot -x year -y sealevel_mm --style '.'\\
                --xlabel year --ylabel 'relative sea level (mm)'\\
                --title 'Sea Level Rise' --legend best --xlim 1995 2015
        -----------------------------------------------------------------------
        """
    )

    # build the command line interface
    cli = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    arg_lib.add_args(cli, 'io_in', 'xy_plotting', 'decorating')
    cli.add_argument(
        "-a", "--alpha", metavar='alpha', type=float, nargs='+',
        default=[1.], help="Set opacity level(s)")
    args = cli.parse_args()

    # load the frame, apply the theme, then draw
    frame = io_lib.df_from_input(args)
    plot_lib.set_plot_styling(args)
    plot_lib.draw_xy_plot(args, frame)
Пример #17
0
def main():
    """Draw an xy plot of the input frame via plot_lib.draw_xy_plot.

    Adds an --alpha option on top of the shared 'io_in', 'xy_plotting'
    and 'decorating' argument groups.
    """
    description = textwrap.dedent("""
        Creates interactive xy plots.  Loosely based around matplotlib's
        pyplot.plot command.

        -----------------------------------------------------------------------
        Examples:

            * Really simple plot
                p.linspace 1 10 7 | p.plot -x c0 -y c0

            * Plot two traces
                p.linspace 0 6.28 100\\
                | p.df 'df["cos"]=np.cos(df.t)' 'df["sin"]=np.sin(df.t)'\\
                        --names t\\
                | p.plot -x t -y sin cos\\
                         --style '.-' 'o-' --alpha 1 .2 --legend best

            * Plot sea-level time series
                p.example_data -d sealevel\\
                | p.plot -x year -y sealevel_mm --style '.'\\
                --xlabel year --ylabel 'relative sea level (mm)'\\
                --title 'Sea Level Rise' --legend best --xlim 1995 2015
        -----------------------------------------------------------------------
        """)

    # build the command line interface
    cli = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    arg_lib.add_args(cli, 'io_in', 'xy_plotting', 'decorating')
    cli.add_argument(
        "-a",
        "--alpha",
        metavar='alpha',
        type=float,
        nargs='+',
        default=[1.],
        help="Set opacity level(s)")
    args = cli.parse_args()

    # load the frame, apply the theme, then draw
    frame = io_lib.df_from_input(args)
    plot_lib.set_plot_styling(args)
    plot_lib.draw_xy_plot(args, frame)
Пример #18
0
def main():
    """Apply outlier_lib.sigma_edit_dataframe to the input and write it out.

    The sigma threshold, the columns to edit and the iteration cap are
    taken from the command line.
    """
    description = textwrap.dedent("""
        Remove outliers from DataFrame columns using a recursive sigma-edit
        algorithm.  The algorithm will recursively NaN out values greater than
        sigma_thresh standard deviations away from sample mean.

        -----------------------------------------------------------------------
        Examples:

            * Do a 2.5-sigma edit on a gamma distribution and show histogram
                p.rand -n 1000 -t gamma --alpha=3 --beta=.01\\
                | p.df 'df["c1"] = df.c0'\\
                | p.sig_edit -c c1 -t 2.5\\
                | p.df 'pd.melt(df)' --names raw edited\\
                | p.facet_grid --hue variable --map pl.hist\\
                   --args value --kwargs 'alpha=.2' 'range=[0, 1000]' 'bins=50'
        -----------------------------------------------------------------------
        """)

    # build the command line interface
    cli = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    arg_lib.add_args(cli, 'io_in', 'io_out', 'example')

    cli.add_argument(
        "-t", "--sigma_thresh",
        type=float, nargs=1, required=True,
        help="Sigma threshold")
    cli.add_argument(
        "-c", "--cols",
        nargs="+", required=True,
        help="Column(s) to sigma-edit")
    cli.add_argument(
        "--max_iter",
        type=int, nargs=1, default=[20],
        help="Max number of recursions")

    args = cli.parse_args()

    # load the frame, then unwrap the single-element option lists
    frame = io_lib.df_from_input(args)
    threshold = args.sigma_thresh[0]
    iteration_cap = args.max_iter[0]
    edited = outlier_lib.sigma_edit_dataframe(
        threshold, args.cols, frame, max_iter=iteration_cap)

    # hand the result to the output writer
    io_lib.df_to_output(args, edited)
Пример #19
0
def main():
    """Histogram dataframe columns, or emit bin counts when quiet.

    Arguments come from get_input_args() and the frame from
    io_lib.df_from_input().  When no columns are supplied the first column
    of the frame is used.  With args.quiet set, the bin centers and counts
    of the first selected column are written through io_lib.df_to_output().
    Otherwise the plotting library is checked for and loaded only at that
    point, and the histogram is drawn and shown.
    """
    args = get_input_args()
    df = io_lib.df_from_input(args)

    # extract parameters from arg parser
    nbins = args.nbins[0]
    range_tup = args.range
    layout_tup = args.layout
    alpha = args.alpha[0]
    do_density = args.density
    sharex = args.sharex
    sharey = args.sharey
    cols = args.cols if args.cols else [df.columns[0]]

    validate_args(args, cols, df)

    # no plotting if output requested
    if args.quiet:
        counts, edges = np.histogram(df[cols[0]],
                                     bins=nbins,
                                     range=range_tup,
                                     density=do_density)
        # bin centers are the left edges shifted by half of each bin width
        centers = edges[:-1] + 0.5 * np.diff(edges)
        df_out = pd.DataFrame({'bins': centers, 'counts': counts})
        io_lib.df_to_output(args, df_out)

    # otherwise do plotting
    else:
        # plotting modules are only required (and imported) on this branch
        module_checker_lib.check_for_modules(['matplotlib'])
        plot_lib = get_imports('pandashells.lib.plot_lib')
        plot_lib.set_plot_styling(args)
        # `density` is the supported spelling; matplotlib removed the old
        # `normed` keyword, so passing it makes the hist call fail.
        df.hist(cols,
                bins=nbins,
                range=range_tup,
                alpha=alpha,
                sharex=sharex,
                sharey=sharey,
                layout=layout_tup,
                density=do_density)

        plot_lib.refine_plot(args)
        plot_lib.show(args)
Пример #20
0
def main():
    """Merge the two files named on the command line with pd.merge.

    Join type, join keys and column suffixes come from the command line;
    both files are loaded through io_lib.df_from_input() and the merged
    frame is written with io_lib.df_to_output().
    """
    description = textwrap.dedent(
        """
        Tool to merge datasets.  Similar functionality to database
        joins. The arguments closely parallel those of the pandas merge
        command.  See the pandas merge documentation for more details.

        -----------------------------------------------------------------------
        Examples:

            * Merge election polls with electoral-college numbers
                p.merge <(p.example_data -d election) \\
                        <(p.example_data -d electoral_college) \\
                        --how left --on state \\
                | p.df -o table | head
        -----------------------------------------------------------------------
        """
    )

    # build the command line interface
    cli = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    arg_lib.add_args(cli, 'io_in', 'io_out', 'example')

    cli.add_argument(
        '--how', dest='how', nargs=1, default=['inner'],
        choices=['left', 'right', 'inner', 'outer'],
        help="Type of join.  Default='inner'")
    cli.add_argument(
        '--on', dest='on', nargs='+', type=str, metavar='col',
        help='List of of columns on which to join')
    cli.add_argument(
        '--left_on', dest='left_on', nargs='+', type=str, metavar='col',
        help='List of of columns from left file to join on. ')
    cli.add_argument(
        '--right_on', dest='right_on', nargs='+', type=str, metavar='col',
        help='List of of columns from right file to join on. ')
    cli.add_argument(
        '--suffixes', dest='suffixes', nargs=2, type=str, metavar='_x _y',
        default=['_x', '_y'],
        help='List of suffixes appended to identically ' 'named columns')
    cli.add_argument(
        "file", nargs=2, type=str, metavar='file', help="Files to join")

    args = cli.parse_args()
    validate_args(args)

    # empty key lists are passed to pandas as None
    merge_options = {
        'how': args.how[0],
        'on': args.on or None,
        'left_on': args.left_on or None,
        'right_on': args.right_on or None,
        'sort': True,
        'suffixes': args.suffixes,
    }

    # the two positional file names, left first
    left_name, right_name = args.file
    left_frame = io_lib.df_from_input(args, left_name)
    right_frame = io_lib.df_from_input(args, right_name)

    joined = pd.merge(left_frame, right_frame, **merge_options)

    # output the joined frame
    io_lib.df_to_output(args, joined)
Пример #21
0
def main():
    """Merge the two files named on the command line with pd.merge.

    Join type, join keys and column suffixes come from the command line;
    both files are loaded through io_lib.df_from_input() and the merged
    frame is written with io_lib.df_to_output().
    """
    description = textwrap.dedent("""
        Tool to merge datasets.  Similar functionality to database
        joins. The arguments closely parallel those of the pandas merge
        command.  See the pandas merge documentation for more details.

        -----------------------------------------------------------------------
        Examples:

            * Merge election polls with electoral-college numbers
                p.merge <(p.example_data -d election) \\
                        <(p.example_data -d electoral_college) \\
                        --how left --on state \\
                | p.df -o table | head
        -----------------------------------------------------------------------
        """)

    # build the command line interface
    cli = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    arg_lib.add_args(cli, 'io_in', 'io_out', 'example')

    cli.add_argument(
        '--how', dest='how', nargs=1, default=['inner'],
        choices=['left', 'right', 'inner', 'outer'],
        help="Type of join.  Default='inner'")

    # the three join-key options share everything but name and help text
    key_options = [
        ('on', 'List of of columns on which to join'),
        ('left_on', 'List of of columns from left file to join on. '),
        ('right_on', 'List of of columns from right file to join on. '),
    ]
    for key_name, key_help in key_options:
        cli.add_argument(
            '--' + key_name, dest=key_name, nargs='+', type=str,
            metavar='col', help=key_help)

    cli.add_argument(
        '--suffixes', dest='suffixes', nargs=2, type=str, metavar='_x _y',
        default=['_x', '_y'],
        help='List of suffixes appended to identically ' 'named columns')
    cli.add_argument(
        "file", nargs=2, type=str, metavar='file', help="Files to join")

    args = cli.parse_args()
    validate_args(args)

    # the two positional file names, left first
    left_name, right_name = args.file
    left_frame = io_lib.df_from_input(args, left_name)
    right_frame = io_lib.df_from_input(args, right_name)

    # empty key lists are passed to pandas as None
    joined = pd.merge(
        left_frame,
        right_frame,
        how=args.how[0],
        on=args.on or None,
        left_on=args.left_on or None,
        right_on=args.right_on or None,
        sort=True,
        suffixes=args.suffixes)

    # output the joined frame
    io_lib.df_to_output(args, joined)
Пример #22
0
def main():
    """Plot the ECDF of one column, or write its values out when --quiet.

    The ECDF is evaluated on an evenly spaced grid between the column's
    minimum and maximum.  The grid has --n_points entries, or twice the
    input length when that option is not given.
    """
    description = textwrap.dedent("""
        Plots the emperical cumulative distribution function (ECDF).

        -----------------------------------------------------------------------
        Examples:

            * Plot ECDF for 10k samples from the standard normal distribution.
                p.rand -t normal -n 10000 | p.cdf -c c0

            * Instead of plotting, send ECDF values to stdout
                p.rand -t normal -n 10000 | p.cdf -c c0 -q | head
        -----------------------------------------------------------------------
        """)

    # build the command line interface
    cli = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    cli.add_argument(
        "-c", "--col", nargs=1, required=True,
        help="Column to plot distribution")
    cli.add_argument(
        '-n', '--n_points', type=int, nargs=1,
        help='Number of output points (default is twice input len)')
    cli.add_argument(
        '-q', '--quiet', default=False, action='store_true',
        help='Quiet mean no plots. Send numeric output to stdout instead')

    # shared argument groups are registered after the tool-specific ones
    arg_lib.add_args(cli, 'decorating', 'io_in', 'io_out')
    args = cli.parse_args()

    # pull the requested column out of the input frame
    frame = io_lib.df_from_input(args)
    col_name = args.col[0]
    samples = frame[col_name].values

    # evaluation grid spanning the observed range
    if args.n_points is None:
        n_out = 2 * len(samples)
    else:
        n_out = args.n_points[0]
    grid = np.linspace(min(samples), max(samples), n_out)
    p_less = ECDF(samples)(grid)

    # quiet mode: emit the table and stop before any plotting
    if args.quiet:
        column_order = ['x', 'p_less', 'p_greater']
        table = pd.DataFrame(
            {'x': grid, 'p_less': p_less, 'p_greater': 1 - p_less})
        table = table[column_order]
        io_lib.df_to_output(args, table)
        return

    # apply the theme, then draw both curves
    plot_lib.set_plot_styling(args)
    less_label = 'P({} < x)'.format(col_name)
    greater_label = 'P({} > x)'.format(col_name)
    pl.plot(grid, p_less, label=less_label)
    pl.plot(grid, 1. - p_less, label=greater_label)
    pl.xlabel('x')
    pl.legend(loc='best')

    plot_lib.refine_plot(args)
    plot_lib.show(args)
Пример #23
0
def main():
    """Build a seaborn FacetGrid from the input frame and map a plot onto it.

    Row/col/hue variables, sizing, axis sharing and limits come from the
    command line.  The --map expression is evaluated in a namespace holding
    `pl` and `sns`; each --kwargs entry is evaluated as the argument list of
    a dict() call.  The finished grid is shown through plot_lib.show().
    """
    msg = textwrap.dedent(
        """
        Creates faceted plots using seaborn FacetGrid.

        With this tool, you can create a group of plots which show aspects
        of the same dataset broken down in different ways.  See the seaborn
        FacetGrid documentation for more detail.

        The --map argument to this function specifies a function to use
        for generating each of the plots.  The following modules are available
        in the namespace:
            pl = pylab
            sns = seaborn
        -----------------------------------------------------------------------
        Examples:

            * Scatterplot of tips vs bill for different combinations of sex,
              smoker, and day of the week:
                    p.example_data -d tips | \\
                    p.facet_grid --row smoker --col sex --hue day \\
                    --map pl.scatter \\
                    --args total_bill tip --kwargs 'alpha=.2' 's=100'

            * Histogram of tips broken down by sex, smoker and day
                    p.example_data -d tips | p.facet_grid --col day \\
                    --row sex --hue smoker  --sharex --sharey --aspect 1 \\
                    --map pl.hist --args tip \\
                    --kwargs 'alpha=.2' 'range=[0, 10]' 'bins=20'
        -----------------------------------------------------------------------
        """
    )

    #  read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    arg_lib.add_args(parser, 'io_in')

    msg = 'Different values of this variable in separate rows'
    parser.add_argument(
        '--row', nargs=1, type=str, dest='row', metavar='row', help=msg)

    msg = 'Different values of this variable in separate columns'
    parser.add_argument(
        '--col', nargs=1, type=str, dest='col', metavar='col', help=msg)

    msg = 'Different values of this variable in separate colors'
    parser.add_argument(
        '--hue', nargs=1, type=str, dest='hue', metavar='hue', help=msg)

    msg = 'The aspect ratio of each plot'
    parser.add_argument(
        '--aspect', nargs=1, type=float, dest='aspect', metavar='aspect',
        default=[2], help=msg)

    msg = 'The size of each plot (default=4)'
    parser.add_argument(
        '--size', nargs=1, type=float, dest='size', metavar='size',
        help=msg, default=[4])

    msg = 'The plotting function to use for each facet'
    parser.add_argument(
        '--map', nargs=1, type=str, dest='map', metavar='map', required=True,
        help=msg)

    msg = 'The args to pass to the plotting function'
    parser.add_argument(
        '--args', nargs='+', type=str, dest='args', metavar='args',
        required=True, help=msg)

    msg = 'Plotting function kwargs expressed as \'a=1\' \'b=2\' ... '
    parser.add_argument(
        '--kwargs', nargs='+', type=str, dest='kwargs',
        metavar='kwargs', help=msg)

    msg = 'Share x axis'
    parser.add_argument('--sharex', action='store_true', dest='sharex',
                        default=False, help=msg)

    msg = 'Share y axis'
    parser.add_argument('--sharey', action='store_true', dest='sharey',
                        default=False, help=msg)

    msg = 'x axis limits when sharex=True'
    parser.add_argument(
        '--xlim', nargs=2, type=float, dest='xlim', metavar='xlim', help=msg)

    # the y limits pair with --sharey, not --sharex
    msg = 'y axis limits when sharey=True'
    parser.add_argument(
        '--ylim', nargs=2, type=float, dest='ylim', metavar='ylim', help=msg)

    msg = "Save the figure to this file"
    parser.add_argument('--savefig', nargs=1, type=str, help=msg)

    # silence all warnings for the remainder of the run
    warnings.filterwarnings('ignore')
    # parse arguments
    args = parser.parse_args()

    # get the input dataframe
    df = io_lib.df_from_input(args)

    # single-value options arrive as one-element lists; unset ones are None
    # NOTE(review): newer seaborn releases appear to call the 'size'
    # parameter 'height' -- confirm against the seaborn version in use.
    facet_grid_kwargs = {
        'row': args.row[0] if args.row else None,
        'col': args.col[0] if args.col else None,
        'hue': args.hue[0] if args.hue else None,
        'aspect': args.aspect[0],
        'size': args.size[0],
        'sharex': args.sharex,
        'sharey': args.sharey,
        'xlim': args.xlim if args.xlim else None,
        'ylim': args.ylim if args.ylim else None,
    }
    grid = sns.FacetGrid(df, **facet_grid_kwargs)

    map_func_name = args.map[0]

    # SECURITY: the --map and --kwargs strings are executed as Python code.
    # This is the tool's intended interface, but it must never be fed
    # untrusted input.
    scope = {'pl': pl, 'sns': sns, 'map_func_name': map_func_name}
    exec('map_func = {}'.format(map_func_name), scope)
    map_func = scope['map_func']

    map_args = args.args

    map_kwargs = {}
    if args.kwargs:
        for kwarg in args.kwargs:
            # each entry such as 'alpha=.2' becomes dict(alpha=.2)
            exec('map_kwargs.update(dict({}))'.format(kwarg))

    grid.map(map_func, *map_args, **map_kwargs)  # noqa  defined in exec above
    grid.add_legend()
    plot_lib.show(args)
Пример #24
0
 def test_df_from_input_tsv(self, pd_mock):
     """With the 'tsv' option, read_csv gets in_file as its first argument."""
     reader = MagicMock(return_value=pd.DataFrame())
     pd_mock.read_csv = reader
     cli_args = MagicMock(names=[], input_options=['tsv'])
     in_file = MagicMock()
     io_lib.df_from_input(cli_args, in_file=in_file)
     # first recorded call -> positional args -> first positional
     first_call = pd_mock.read_csv.call_args_list[0]
     positional = first_call[0]
     self.assertEqual(positional[0], in_file)
Пример #25
0
 def test_df_from_input_with_infile(self, pd_mock):
     """With no input options, read_csv gets in_file as its first argument."""
     reader = MagicMock(return_value=pd.DataFrame())
     pd_mock.read_csv = reader
     cli_args = MagicMock(names=[], input_options=[])
     in_file = MagicMock()
     io_lib.df_from_input(cli_args, in_file=in_file)
     # first recorded call -> positional args -> first positional
     first_call = pd_mock.read_csv.call_args_list[0]
     positional = first_call[0]
     self.assertEqual(positional[0], in_file)
Пример #26
0
def main():  # pragma: no cover
    """Command-line entry point for p.df.

    Parses the command line, loads the input dataframe through
    io_lib.df_from_input, threads it through every positional statement
    via process_command (each call's return value replaces ``df``) and
    finally hands the result to io_lib.df_to_output.
    """
    # read command line arguments
    msg = textwrap.dedent(
        """
        Enables pandas dataframe processing at the unix command line.

        This is the real workhorse of the pandashells toolkit.  It reads data
        from stdin as a dataframe, which is passed through any number of pandas
        operations provided on the command line.  Output is always to stdout.

        Each operation assumes data is in a dataframe named df.  Operations
        performed on this dataframe will overwrite the df variable with
        the results of that operation.  Special consideration is taken for
        assignments such as df['a'] = df.b + df.c.  These are understood
        to augment the input dataframe with a new column. By way of example,
        this command:
            p.df 'df.groupby(by="a").b.count()' 'df.reset_index()'
        is equivalent to the python expressions:
            df = df.groupby(by="a").b.count()
            df = df.reset_index()

        In addition to providing access to pandas dataframes, a number of
        modules are loaded into the namespace so as to be accessible from the
        command line.  These modules are:
            pd = pandas
            np = numpy
            scp = scipy
            pl = pylab
            parse = dateutil.parser.parse
            datetime = datetime
            re = re

        ** Important **
        When creating chains of dataframe operations (see examples), it is
        important to express your chain of operations before any options. This
        is because some options can take multiple arguments and the parser
        won't be able to properly decode your meaning.
        For example:
            cat file.csv | p.df 'df["x"] = df.y + 1' -o table noheader  # GOOD
            cat file.csv | p.df -o table noheader 'df["x"] = df.y + 1'  # BAD

        Input can be read in different formats as specified by the -i switch.
        The most common formats are csv and table (white-space-delimited).  In
        either of these formats, p.df can accommodate input data that either
        does or does not have a header row.  When no header row is indicated,
        The columns of the Dataframe will be labeled as c0, c1, ..., cN.

        Plotting methods invoked on a Dataframe generate no output, but
        create an interactive plot instead.  There are a number of plot
        specific options available at the command line that govern the details
        of how these plots are rendered (e.g. --xlim, --legend, etc).

        -----------------------------------------------------------------------
        Examples:

            * Print a csv file in nice tabular format
                p.example_data -d tips | p.df -o table | head

            * Print a csv file to json
                p.example_data -d tips | head | p.df -o json

            * Transform csv to json then to table
                p.example_data -d tips | head | p.df -o json \\
                | p.df -i json -o table

            * Select by row
                p.example_data -d tips \\
                | p.df 'df[df.sex=="Female"]' 'df[df.smoker=="Yes"]' -o table

            * Extract columns
                p.example_data -d tips \\
                | p.df 'df[["total_bill", "tip"]].head()' -o table

            * Perform grouped aggregations
                p.example_data -d tips | p.df \\
                'df.groupby(by=["sex", "smoker"]).tip.sum()' -o table index

            * Use pandas plotting methods
                p.example_data -d tips | p.df \\
                'df.groupby(by="day").total_bill.sum().plot(kind="barh")'\\
                --xlabel 'Dollars' --title 'Total Bills by Day'

            * Convert between tabular and csv format with/without header rows
                seq 10 | awk '{print $1, 2*$1}'\\
                | p.df --names a b -i table noheader | p.df -o table noheader

        -----------------------------------------------------------------------
        """
    )
    # Imported here rather than at module level; kept as in the original.
    from pandashells.lib import arg_lib

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)
    arg_lib.add_args(parser, 'io_in', 'io_out', 'decorating')
    msg = (
        '(MUST come before any options) '
        '[statement ...] Statement(s) to execute. '
    )
    parser.add_argument(
        "statement", help=msg, nargs="*")
    args = parser.parse_args()

    # NOTE(review): presumably loads only the modules the statements need
    # before io_lib is imported -- confirm against get_modules_and_shortcuts.
    get_modules_and_shortcuts(args.statement)
    from pandashells.lib import io_lib

    # get the input dataframe
    df = io_lib.df_from_input(args)

    # execute the statements in order
    # plot commands are terminal statements so will call sys.exit()
    for cmd in args.statement:
        df = process_command(args, cmd, df)

    # write the output
    io_lib.df_to_output(args, df)
Пример #27
0
def main():  # pragma: no cover
    """Command-line entry point for p.df.

    Parses the command line, loads the input dataframe through
    io_lib.df_from_input, threads it through every positional statement
    via process_command (each call's return value replaces ``df``) and
    finally hands the result to io_lib.df_to_output.
    """
    # read command line arguments
    msg = textwrap.dedent(
        """
        Enables pandas dataframe processing at the unix command line.

        This is the real workhorse of the pandashells toolkit.  It reads data
        from stdin as a dataframe, which is passed through any number of pandas
        operations provided on the command line.  Output is always to stdout.

        Each operation assumes data is in a dataframe named df.  Operations
        performed on this dataframe will overwrite the df variable with
        the results of that operation.  Special consideration is taken for
        assignments such as df['a'] = df.b + df.c.  These are understood
        to augment the input dataframe with a new column. By way of example,
        this command:
            p.df 'df.groupby(by="a").b.count()' 'df.reset_index()'
        is equivalent to the python expressions:
            df = df.groupby(by="a").b.count()
            df = df.reset_index()

        In addition to providing access to pandas dataframes, a number of
        modules are loaded into the namespace so as to be accessible from the
        command line.  These modules are:
            pd = pandas
            np = numpy
            scp = scipy
            pl = pylab
            parse = dateutil.parser.parse
            datetime = datetime
            re = re

        ** Important **
        When creating chains of dataframe operations (see examples), it is
        important to express your chain of operations before any options. This
        is because some options can take multiple arguments and the parser
        won't be able to properly decode your meaning.
        For example:
            cat file.csv | p.df 'df["x"] = df.y + 1' -o table noheader  # GOOD
            cat file.csv | p.df -o table noheader 'df["x"] = df.y + 1'  # BAD

        Input can be read in different formats as specified by the -i switch.
        The most common formats are csv and table (white-space-delimited).  In
        either of these formats, p.df can accommodate input data that either
        does or does not have a header row.  When no header row is indicated,
        The columns of the Dataframe will be labeled as c0, c1, ..., cN.

        Plotting methods invoked on a Dataframe generate no output, but
        create an interactive plot instead.  There are a number of plot
        specific options available at the command line that govern the details
        of how these plots are rendered (e.g. --xlim, --legend, etc).

        -----------------------------------------------------------------------
        Examples:

            * Print a csv file in nice tabular format
                p.example_data -d tips | p.df -o table | head

            * Select by row
                p.example_data -d tips \\
                | p.df 'df[df.sex=="Female"]' 'df[df.smoker=="Yes"]' -o table

            * Extract columns
                p.example_data -d tips \\
                | p.df 'df[["total_bill", "tip"]].head()' -o table

            * Perform grouped aggregations
                p.example_data -d tips | p.df \\
                'df.groupby(by=["sex", "smoker"]).tip.sum()' -o table index

            * Use pandas plotting methods
                p.example_data -d tips | p.df \\
                'df.groupby(by="day").total_bill.sum().plot(kind="barh")'\\
                --xlabel 'Dollars' --title 'Total Bills by Day'

            * Convert between tabular and csv format with/without header rows
                seq 10 | awk '{print $1, 2*$1}'\\
                | p.df --names a b -i table noheader | p.df -o table noheader

        -----------------------------------------------------------------------
        """
    )

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)
    arg_lib.add_args(parser, 'io_in', 'io_out', 'decorating', 'example')
    msg = (
        '(MUST come before any options) '
        '[statement ...] Statement(s) to execute. '
    )
    parser.add_argument(
        "statement", help=msg, nargs="*")
    args = parser.parse_args()

    # get the input dataframe
    df = io_lib.df_from_input(args)

    # execute the statements in order
    # plot commands are terminal statements so will call sys.exit()
    for cmd in args.statement:
        df = process_command(args, cmd, df)

    # write the output
    io_lib.df_to_output(args, df)
Пример #28
0
def main():
    """Command-line entry point for p.lomb_scargle.

    Parses the command line, loads the input dataframe through
    io_lib.df_from_input, passes it to lomb_scargle_lib.lomb_scargle together
    with the time column, observation column, interpolation exponent and
    frequency-ordering flag, and writes the returned dataframe with
    io_lib.df_to_output.
    """
    msg = textwrap.dedent("""
        Computes a spectrogram using the lomb-scargle algorithm provided by
        the gatspy module.  The input time series need not have evenly spaced
        time-stamps.  The FFT-based algorithm has complexity O[N*log(N)].

        -----------------------------------------------------------------------
        Examples:

            * Plot the spectrum of a simple sine wave
                  p.linspace 0 10 100 \\
                  | p.df 'df["value"] = 7 * np.sin(2*np.pi*df.time / 1.5)'\\
                        --names time\\
                  | p.lomb_scargle -t time -y value --interp_exp 3\\
                  | p.plot -x period -y amp --xlim 0 3

            * Show the annual and 59-day peaks in the sealevel spectrum
                p.example_data -d sealevel\\
                | p.df 'df["day"] = 365.25 * df.year'\\
                        'df["day"] = df.day - df.day.iloc[0]'\\
                | p.lomb_scargle -t day -y sealevel_mm --interp_exp 3\\
                | p.df 'df[df.period < 720]'\\
                | p.plot -x period -y amp --xlim 1 400\\
                         --title 'Sea-surface height spectrum'\\
                         --xlabel 'period (days)'

        -----------------------------------------------------------------------
        """)

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    arg_lib.add_args(parser, 'io_in', 'io_out')

    # nargs=1 makes each of these options parse to a one-element list,
    # hence the [0] indexing when they are consumed below.
    parser.add_argument('-t',
                        '--time_col',
                        help='Time Column',
                        nargs=1,
                        required=True,
                        type=str)

    parser.add_argument('-y',
                        '--observation_col',
                        help='Observation column',
                        nargs=1,
                        dest='val_col',
                        required=True,
                        type=str)

    parser.add_argument('--interp_exp',
                        help='Interpolate by this power of 2',
                        nargs=1,
                        type=int,
                        default=[1])
    parser.add_argument('--freq_order',
                        action='store_true',
                        dest='freq_order',
                        default=False,
                        help='Order output by frequency instead of period')

    # parse arguments
    args = parser.parse_args()

    # get the input dataframe
    df = io_lib.df_from_input(args)
    df = lomb_scargle_lib.lomb_scargle(df, args.time_col[0], args.val_col[0],
                                       args.interp_exp[0], args.freq_order)

    # write dataframe to output
    io_lib.df_to_output(args, df)
Пример #29
0
def main():
    """Command-line entry point for p.facet_grid.

    Parses the command line, loads the input dataframe through
    io_lib.df_from_input, builds a seaborn FacetGrid from the row/col/hue and
    layout options, maps the function named by --map over the grid with the
    given --args / --kwargs, adds a legend and calls plot_lib.show.
    """
    msg = textwrap.dedent("""
        Creates faceted plots using seaborn FacetGrid.

        With this tool, you can create a group of plots which show aspects
        of the same dataset broken down in different ways.  See the seaborn
        FacetGrid documentation for more detail.

        The --map argument to this function specifies a function to use
        for generating each of the plots.  The following modules are available
        in the namespace:
            pl = pylab
            sns = seaborn
        -----------------------------------------------------------------------
        Examples:

            * Scatterplot of tips vs bill for different combinations of sex,
              smoker, and day of the week:
                    p.example_data -d tips | \\
                    p.facet_grid --row smoker --col sex --hue day \\
                    --map pl.scatter \\
                    --args total_bill tip --kwargs 'alpha=.2' 's=100'

            * Histogram of tips broken down by sex, smoker and day
                    p.example_data -d tips | p.facet_grid --col day \\
                    --row sex --hue smoker  --sharex --sharey --aspect 1 \\
                    --map pl.hist --args tip \\
                    --kwargs 'alpha=.2' 'range=[0, 10]' 'bins=20'
        -----------------------------------------------------------------------
        """)

    #  read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    arg_lib.add_args(parser, 'io_in')

    # Options declared with nargs=1 parse to one-element lists (or None when
    # absent and no default is given), hence the [0] indexing further down.
    msg = 'Different values of this variable in separate rows'
    parser.add_argument('--row',
                        nargs=1,
                        type=str,
                        dest='row',
                        metavar='row',
                        help=msg)

    msg = 'Different values of this variable in separate columns'
    parser.add_argument('--col',
                        nargs=1,
                        type=str,
                        dest='col',
                        metavar='col',
                        help=msg)

    msg = 'Different values of this variable in separate colors'
    parser.add_argument('--hue',
                        nargs=1,
                        type=str,
                        dest='hue',
                        metavar='hue',
                        help=msg)

    msg = 'The aspect ratio of each plot'
    parser.add_argument('--aspect',
                        nargs=1,
                        type=float,
                        dest='aspect',
                        metavar='aspect',
                        default=[2],
                        help=msg)

    msg = 'The size of each plot (default=4)'
    parser.add_argument('--size',
                        nargs=1,
                        type=float,
                        dest='size',
                        metavar='size',
                        help=msg,
                        default=[4])

    msg = 'The plotting function to use for each facet'
    parser.add_argument('--map',
                        nargs=1,
                        type=str,
                        dest='map',
                        metavar='map',
                        required=True,
                        help=msg)

    msg = 'The args to pass to the plotting function'
    parser.add_argument('--args',
                        nargs='+',
                        type=str,
                        dest='args',
                        metavar='args',
                        required=True,
                        help=msg)

    msg = 'Plotting function kwargs expressed as \'a=1\' \'b=2\' ... '
    parser.add_argument('--kwargs',
                        nargs='+',
                        type=str,
                        dest='kwargs',
                        metavar='kwargs',
                        help=msg)

    msg = 'Share x axis'
    parser.add_argument('--sharex',
                        action='store_true',
                        dest='sharex',
                        default=False,
                        help=msg)

    msg = 'Share y axis'
    parser.add_argument('--sharey',
                        action='store_true',
                        dest='sharey',
                        default=False,
                        help=msg)

    msg = 'x axis limits when sharex=True'
    parser.add_argument('--xlim',
                        nargs=2,
                        type=float,
                        dest='xlim',
                        metavar='xlim',
                        help=msg)

    # The original help text said "sharex" here, copied from --xlim.
    msg = 'y axis limits when sharey=True'
    parser.add_argument('--ylim',
                        nargs=2,
                        type=float,
                        dest='ylim',
                        metavar='ylim',
                        help=msg)

    msg = "Save the figure to this file"
    parser.add_argument('--savefig', nargs=1, type=str, help=msg)

    # Silences every Python warning for the rest of the process.
    warnings.filterwarnings('ignore')
    # parse arguments
    args = parser.parse_args()

    # get the input dataframe
    df = io_lib.df_from_input(args)

    # NOTE(review): newer seaborn releases appear to have renamed FacetGrid's
    # `size` parameter to `height` -- confirm against the supported version.
    facet_grid_kwargs = {
        'row': args.row[0] if args.row else None,
        'col': args.col[0] if args.col else None,
        'hue': args.hue[0] if args.hue else None,
        'aspect': args.aspect[0],
        'size': args.size[0],
        'sharex': args.sharex,
        'sharey': args.sharey,
        'xlim': args.xlim if args.xlim else None,
        'ylim': args.ylim if args.ylim else None,
    }
    grid = sns.FacetGrid(df, **facet_grid_kwargs)

    map_func_name = args.map[0]

    # SECURITY NOTE: --map and --kwargs are evaluated as Python code.  This is
    # the tool's documented interface, so never feed it untrusted input.
    # The --map expression is resolved in a namespace exposing pl and sns.
    scope = {'pl': pl, 'sns': sns, 'map_func_name': map_func_name}
    map_func = eval(map_func_name, scope)

    map_args = args.args

    # Each --kwargs item such as 'alpha=.2' becomes dict(alpha=.2).  The
    # expression is evaluated in this function's own namespace, as before.
    map_kwargs = {}
    if args.kwargs:
        for kwarg in args.kwargs:
            map_kwargs.update(eval('dict({})'.format(kwarg)))

    grid.map(map_func, *map_args, **map_kwargs)
    grid.add_legend()
    plot_lib.show(args)
Пример #30
0
def main():
    """Command-line entry point for p.regplot.

    Parses the command line, loads the input dataframe through
    io_lib.df_from_input, fits a polynomial of the requested order to the
    selected x/y columns with np.polyfit (its coefficients feed the legend
    label built by make_label), draws a seaborn regplot of the same order,
    then labels, refines and shows the plot via plot_lib.
    """
    msg = textwrap.dedent(
        """
        Create a single variable regression plot of specified order.

        -----------------------------------------------------------------------
        Examples:
            * Fit a line to synthetic data with bootstrap errors.
                p.linspace 0 10 20 \\
                | p.df 'df["y_true"] = .2 * df.x' \\
                       'df["noise"] = np.random.randn(20)' \\
                        'df["y"] = df.y_true + df.noise' --names x \\
                | p.regplot -x x -y y

            * Fit a quadratic to synthetic data with bootstrap errors.
                p.linspace 0 10 40 \\
                | p.df 'df["y_true"] = .5 * df.x  + .3 * df.x ** 2'\\
                       'df["noise"] = np.random.randn(40)' \\
                        'df["y"] = df.y_true + df.noise' --names x \\
                | p.regplot -x x -y y --order 2

            * Fit sealevel data with no bootstrap
                p.example_data -d sealevel\\
                | p.regplot -x year -y sealevel_mm --n_boot 1


        -----------------------------------------------------------------------
        """
    )

    #  read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    arg_lib.add_args(parser, 'io_in', 'io_out', 'decorating')

    # x is the regressor (independent variable), y the response (dependent
    # variable); the original help texts had the two descriptions swapped.
    msg = 'Column for independent variable'
    parser.add_argument('-x', nargs=1, type=str, dest='x', metavar='col',
                        help=msg, required=True)

    msg = 'Column for dependent variable'
    parser.add_argument('-y', nargs=1, type=str, dest='y',
                        metavar='col', help=msg, required=True)

    msg = 'The order of the polynomial to fit (default = 1)'
    parser.add_argument('--order', help=msg, nargs=1, default=[1], type=int)

    msg = 'Number of bootstrap samples for uncertainty region (default=1000)'
    parser.add_argument(
        '--n_boot', help=msg, nargs=1, default=[1000], type=int)

    parser.add_argument('-a', '--alpha', help='Set opacity',
                        nargs=1, default=[0.5], type=float)

    # parse arguments
    args = parser.parse_args()

    # get the input dataframe
    df = io_lib.df_from_input(args)

    # extract command line params (nargs=1 options parse to 1-element lists)
    x = df[args.x[0]].values
    y = df[args.y[0]].values

    # do a polyfit with the specified order
    coeffs = np.polyfit(x, y, args.order[0])

    label = make_label(coeffs, args.savefig)

    # Pass the data by keyword: recent seaborn releases no longer accept x and
    # y positionally, while keywords work on old and new versions alike.
    sns.regplot(
        x=x, y=y, order=args.order[0], n_boot=args.n_boot[0],
        line_kws={'label': label, 'color': CC[2], 'alpha': .5},
        scatter_kws={'alpha': args.alpha[0], 'color': CC[0]})

    pl.legend(loc='best')
    pl.xlabel(args.x[0])
    pl.ylabel(args.y[0])
    plot_lib.refine_plot(args)
    plot_lib.show(args)
Пример #31
0
def main():
    """Command-line entry point for p.regress.

    Fits the patsy formula given by --model to the input dataframe with
    sm.ols and stores the fitted values and residuals in the ``fit_`` and
    ``resid_`` columns.  With --fit the augmented dataframe is written via
    io_lib.df_to_output and nothing else happens; otherwise the fit summary
    is written to stdout and, with --plot, residual plots are shown.
    """
    description = textwrap.dedent("""
        Performs (multivariable) linear regression.  The fitting model
        is specified using the R-like, patsy syntax.  Input is from stdin
        and output is either fitting information or the input data
        with columns added for the fit and residuals.

        -----------------------------------------------------------------------
        Examples:
            * Fit a line to the sea-level data
                p.example_data -d sealevel | p.regress -m 'sealevel_mm ~ year'

            * Fit a trend plus annual cycle to sealevel data
                p.example_data -d sealevel \\
                | p.df 'df["sin"] =  np.sin(2 * np.pi * df.year)' \\
                | p.df 'df["cos"] = np.cos(2 * np.pi * df.year)' \\
                | p.regress -m 'sealevel_mm ~ year + cos + sin'

            * Examine residual ECDF of trend plus annual fit
                p.example_data -d sealevel \\
                | p.df 'df["sin"] =  np.sin(2 * np.pi * df.year)' \\
                | p.df 'df["cos"] = np.cos(2 * np.pi * df.year)' \\
                | p.regress -m 'sealevel_mm ~ year + cos + sin' --fit \\
                | p.cdf -c 'resid_' --title 'ECDF of trend + annual'

            * Detrend sealevel data to more clearly reveal oscillations
                p.example_data -d sealevel \\
                | p.regress -m 'sealevel_mm ~ year' --fit \\
                | p.plot -x year -y resid_ --ylabel 'Trend removed (mm)' \\
                         --title 'Global Sea Surface Height'

            * Set origin of sealevel data to 0 and regress with no intercept
                p.example_data -d sealevel\\
                | p.df 'df["year"] = df.year - df.year.iloc[0]'\\
                'df["sealevel_mm"] = df.sealevel_mm - df.sealevel_mm.iloc[0]'\\
                | p.regress -m 'sealevel_mm ~ year - 1' --fit\\
                | p.plot -x year -y sealevel_mm fit_ --style '.' '-'\\
                     --alpha .2 1 --legend best --title 'Force Zero Intercept'

        -----------------------------------------------------------------------
        """)

    # Build the command-line interface.
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    arg_lib.add_args(parser, 'io_in', 'io_out')

    # The regression model; nargs=1 means it parses to a one-element list.
    parser.add_argument(
        "-m", "--model",
        nargs=1,
        type=str,
        required=True,
        help="The model expressed in patsy syntax",
    )
    parser.add_argument(
        "--fit",
        dest='retfit',
        action="store_true",
        default=False,
        help="Return input with fit and residual appended",
    )
    parser.add_argument(
        "--plot",
        action="store_true",
        default=False,
        help="Make residual plots",
    )

    args = parser.parse_args()
    formula = args.model[0]

    # Load the data, fit the model and attach the fit/residual columns.
    df = io_lib.df_from_input(args)
    fit_result = sm.ols(formula=formula, data=df).fit()
    df['fit_'] = fit_result.fittedvalues
    df['resid_'] = fit_result.resid

    # --fit short-circuits: emit the augmented frame and stop.
    if args.retfit:
        io_lib.df_to_output(args, df)
        return

    # Otherwise report the fit summary on stdout.
    sys.stdout.write('\n{}\n'.format(fit_result.summary()))
    sys.stdout.flush()

    # Nothing more to do unless residual plots were requested.
    if not args.plot:
        return

    # Plotting modules are only looked up when a plot is actually wanted.
    module_checker_lib.check_for_modules(['matplotlib', 'seaborn'])
    plot_lib = get_module('pandashells.lib.plot_lib')
    mpl = get_module('matplotlib')
    pl = get_module('pylab')
    sns = get_module('seaborn')

    # Upper panel: residuals against fitted values.
    pl.subplot(211)
    pl.plot(df.fit_, df.resid_, '.', alpha=.5)
    pl.xlabel('Fit')
    pl.ylabel('Residual')
    pl.title(formula)

    # Lower panel: distribution of the residuals.
    pl.subplot(212)
    sns.distplot(df.resid_, bins=50)
    pl.xlabel('Residual with R^2 = {:0.4f}'.format(fit_result.rsquared))
    pl.ylabel('Counts')

    # The agg and macosx backends get set_tight_layout instead of
    # tight_layout (the original comment cites an osx backend issue).
    figure = pl.gcf()
    backend = mpl.get_backend().lower()
    if backend in ['agg', 'macosx']:
        figure.set_tight_layout(True)
    else:
        figure.tight_layout()

    plot_lib.show(args)
Пример #32
0
def main():
    """Command-line entry point for p.regress.

    Fits the patsy formula given by --model to the input dataframe with
    sm.ols and stores the fitted values and residuals in the ``fit_`` and
    ``resid_`` columns.  With --fit the augmented dataframe is written via
    io_lib.df_to_output and nothing else happens; otherwise the fit summary
    is written to stdout and, with --plot, residual plots are shown.
    """
    description = textwrap.dedent(
        """
        Performs (multivariable) linear regression.  The fitting model
        is specified using the R-like, patsy syntax.  Input is from stdin
        and output is either fitting information or the input data
        with columns added for the fit and residuals.

        -----------------------------------------------------------------------
        Examples:
            * Fit a line to the sea-level data
                p.example_data -d sealevel | p.regress -m 'sealevel_mm ~ year'

            * Fit a trend plus annual cycle to sealevel data
                p.example_data -d sealevel \\
                | p.df 'df["sin"] =  np.sin(2 * np.pi * df.year)' \\
                | p.df 'df["cos"] = np.cos(2 * np.pi * df.year)' \\
                | p.regress -m 'sealevel_mm ~ year + cos + sin'

            * Examine residual ECDF of trend plus annual fit
                p.example_data -d sealevel \\
                | p.df 'df["sin"] =  np.sin(2 * np.pi * df.year)' \\
                | p.df 'df["cos"] = np.cos(2 * np.pi * df.year)' \\
                | p.regress -m 'sealevel_mm ~ year + cos + sin' --fit \\
                | p.cdf -c 'resid_' --title 'ECDF of trend + annual'

            * Detrend sealevel data to more clearly reveal oscillations
                p.example_data -d sealevel \\
                | p.regress -m 'sealevel_mm ~ year' --fit \\
                | p.plot -x year -y resid_ --ylabel 'Trend removed (mm)' \\
                         --title 'Global Sea Surface Height'

            * Set origin of sealevel data to 0 and regress with no intercept
                p.example_data -d sealevel\\
                | p.df 'df["year"] = df.year - df.year.iloc[0]'\\
                'df["sealevel_mm"] = df.sealevel_mm - df.sealevel_mm.iloc[0]'\\
                | p.regress -m 'sealevel_mm ~ year - 1' --fit\\
                | p.plot -x year -y sealevel_mm fit_ --style '.' '-'\\
                     --alpha .2 1 --legend best --title 'Force Zero Intercept'

        -----------------------------------------------------------------------
        """
    )

    # Build the command-line interface.
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    arg_lib.add_args(parser, 'io_in', 'io_out', 'example')

    # The regression model; nargs=1 means it parses to a one-element list.
    parser.add_argument(
        "-m", "--model",
        nargs=1,
        type=str,
        required=True,
        help="The model expressed in patsy syntax",
    )
    parser.add_argument(
        "--fit",
        dest='retfit',
        action="store_true",
        default=False,
        help="Return input with fit and residual appended",
    )
    parser.add_argument(
        "--plot",
        action="store_true",
        default=False,
        help="Make residual plots",
    )

    args = parser.parse_args()
    formula = args.model[0]

    # Load the data, fit the model and attach the fit/residual columns.
    df = io_lib.df_from_input(args)
    fit_result = sm.ols(formula=formula, data=df).fit()
    df['fit_'] = fit_result.fittedvalues
    df['resid_'] = fit_result.resid

    # --fit short-circuits: emit the augmented frame and stop.
    if args.retfit:
        io_lib.df_to_output(args, df)
        return

    # Otherwise report the fit summary on stdout.
    sys.stdout.write('\n{}\n'.format(fit_result.summary()))
    sys.stdout.flush()

    # Nothing more to do unless residual plots were requested.
    if not args.plot:
        return

    # Upper panel: residuals against fitted values.
    pl.subplot(211)
    pl.plot(df.fit_, df.resid_, '.', alpha=.5)
    pl.xlabel('Fit')
    pl.ylabel('Residual')
    pl.title(formula)

    # Lower panel: distribution of the residuals.
    pl.subplot(212)
    sns.distplot(df.resid_, bins=50)
    pl.xlabel('Residual with R^2 = {:0.4f}'.format(fit_result.rsquared))
    pl.ylabel('Counts')

    # The agg and macosx backends get set_tight_layout instead of
    # tight_layout (the original comment cites an osx backend issue).
    figure = pl.gcf()
    backend = mpl.get_backend().lower()
    if backend in ['agg', 'macosx']:
        figure.set_tight_layout(True)
    else:
        figure.tight_layout()

    plot_lib.show(args)
Пример #33
0
def main():
    """Command-line entry point that draws a seaborn FacetGrid.

    Parses the command line, loads the input dataframe through
    ``io_lib.df_from_input``, builds a ``sns.FacetGrid`` from the
    row/col/hue/aspect/size/share/limit options, maps the requested
    plotting function over the grid, adds a legend and finally hands the
    parsed arguments to ``plot_lib.show``.

    The ``--map`` value and every ``--kwargs`` entry are evaluated as
    Python expressions, so they must only come from a trusted command line.
    """
    msg = textwrap.dedent(
        """
        Creates faceted plots using seaborn FacetGrid.

        With this tool, you can create a group of plots which show aspects
        of the same dataset broken down in different ways.  See the seaborn
        FacetGrid documentation for more detail.

        The --map argument to this function specifies a function to use
        for generating each of the plots.  The following modules are available
        in the namespace:
            pl = pylab
            sns = seaborn
        -----------------------------------------------------------------------
        Examples:

            * Scatterplot of tips vs bill for different combinations of sex,
              smoker, and day of the week:
                    p.example_data -d tips | \\
                    p.facet_grid --row smoker --col sex --hue day \\
                    --map pl.scatter \\
                    --args total_bill tip --kwargs 'alpha=.2' 's=100'

            * Histogram of tips broken down by sex, smoker and day
                    p.example_data -d tips | p.facet_grid --col day \\
                    --row sex --hue smoker  --sharex --sharey --aspect 1 \\
                    --map pl.hist --args tip \\
                    --kwargs 'alpha=.2' 'range=[0, 10]' 'bins=20'
        -----------------------------------------------------------------------
        """
    )

    #  read command line arguments
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    arg_lib.add_args(parser, "io_in")

    msg = "Different values of this variable in separate rows"
    parser.add_argument("--row", nargs=1, type=str, dest="row", metavar="row", help=msg)

    msg = "Different values of this variable in separate columns"
    parser.add_argument("--col", nargs=1, type=str, dest="col", metavar="col", help=msg)

    msg = "Different values of this variable in separate colors"
    parser.add_argument("--hue", nargs=1, type=str, dest="hue", metavar="hue", help=msg)

    msg = "The aspect ratio of each plot"
    parser.add_argument("--aspect", nargs=1, type=float, dest="aspect", metavar="aspect", default=[2], help=msg)

    msg = "The size of each plot (default=4)"
    parser.add_argument("--size", nargs=1, type=float, dest="size", metavar="size", help=msg, default=[4])

    msg = "The plotting function to use for each facet"
    parser.add_argument("--map", nargs=1, type=str, dest="map", metavar="map", required=True, help=msg)

    msg = "The args to pass to the plotting function"
    parser.add_argument("--args", nargs="+", type=str, dest="args", metavar="args", required=True, help=msg)

    msg = "Plotting function kwargs expressed as 'a=1' 'b=2' ... "
    parser.add_argument("--kwargs", nargs="+", type=str, dest="kwargs", metavar="kwargs", help=msg)

    msg = "Share x axis"
    parser.add_argument("--sharex", action="store_true", dest="sharex", default=False, help=msg)

    msg = "Share y axis"
    parser.add_argument("--sharey", action="store_true", dest="sharey", default=False, help=msg)

    msg = "x axis limits when sharex=True"
    parser.add_argument("--xlim", nargs=2, type=float, dest="xlim", metavar="xlim", help=msg)

    # The help text previously said "sharex" here, copied from --xlim.
    msg = "y axis limits when sharey=True"
    parser.add_argument("--ylim", nargs=2, type=float, dest="ylim", metavar="ylim", help=msg)

    msg = "Save the figure to this file"
    parser.add_argument("--savefig", nargs=1, type=str, help=msg)

    # parse arguments
    args = parser.parse_args()

    # get the input dataframe
    df = io_lib.df_from_input(args)

    # Options declared with nargs=1 arrive as one-element lists (or None when
    # omitted), hence the [0] unwrapping.  --xlim/--ylim are already either
    # None or a two-element list, so they pass straight through.
    facet_grid_kwargs = {
        "row": args.row[0] if args.row else None,
        "col": args.col[0] if args.col else None,
        "hue": args.hue[0] if args.hue else None,
        "aspect": args.aspect[0],
        "size": args.size[0],
        "sharex": args.sharex,
        "sharey": args.sharey,
        "xlim": args.xlim,
        "ylim": args.ylim,
    }
    grid = sns.FacetGrid(df, **facet_grid_kwargs)

    # SECURITY NOTE: --map is evaluated as a Python expression with `pl` and
    # `sns` as the named globals (builtins remain reachable).  This is not a
    # sandbox; never feed it untrusted input.
    scope = {"pl": pl, "sns": sns}
    map_func = eval(args.map[0], scope)

    map_args = args.args

    # Each --kwargs entry is text such as "alpha=.2"; wrapping it in dict(...)
    # lets Python parse both the keyword name and the value expression.
    # SECURITY NOTE: this is evaluated in the current namespace, exactly as
    # the text is given on the command line.
    map_kwargs = {}
    for kwarg in args.kwargs or []:
        map_kwargs.update(eval("dict({})".format(kwarg)))

    grid.map(map_func, *map_args, **map_kwargs)
    grid.add_legend()
    plot_lib.show(args)