Exemplo n.º 1
0
    def frequency(columns=None, buckets=10, output_format="plot", output_path=None):
        """
        Draw a frequency chart for each entry of the computed frequency data.

        :param columns: Columns to be charted; resolved through ``parse_columns``
        :param buckets: Number of buckets, forwarded to ``self.cols.frequency``
        :param output_format: Forwarded to ``plot_frequency`` as ``output``
        :param output_path: Path where the image is going to be saved; forwarded
            to ``plot_frequency`` as ``path``
        :return: None
        """
        # NOTE(review): ``self`` is not a parameter of this function; it is
        # presumably captured from an enclosing scope -- confirm at the
        # definition site.
        selected = parse_columns(self, columns)

        # Every entry is plotted on its own, wrapped in a single-item dict.
        for name, freq in self.cols.frequency(selected, buckets).items():
            plot_frequency({name: freq}, output=output_format, path=output_path)
Exemplo n.º 2
0
    def run(self,
            df,
            columns="*",
            buckets=MAX_BUCKETS,
            infer=False,
            relative_error=RELATIVE_ERROR,
            approx_count=True,
            mismatch=None):
        """
        Profile a dataframe, render the statistics as HTML and display them.

        The rendered HTML is stored in ``self.html`` and the raw profiling
        output in ``self.json`` so both can be written out later.

        :param df: Dataframe to be analyzed
        :param columns: Columns to be analyzed
        :param buckets: Number of buckets calculated to print the histogram
        :param infer: infer data type
        :param relative_error: Relative Error for quantile discretizer calculation
        :param approx_count: Use approx_count_distinct or countDistinct
        :param mismatch: Forwarded unchanged to ``self.dataset``
        :return: self
        """
        # (template variable, histogram key) pairs rendered for date columns,
        # in plotting order.
        date_parts = (
            ("hist_years", "years"),
            ("hist_months", "months"),
            ("hist_weekdays", "weekdays"),
            ("hist_hours", "hours"),
            ("hist_minutes", "minutes"),
        )
        single_hist_dtypes = ("int", "string", "decimal")

        columns = parse_columns(df, columns)
        columns, output = self.dataset(
            df, columns, buckets, infer, relative_error, approx_count,
            format="dict", mismatch=mismatch)

        # Set up the jinja environment pointing at the profiler templates.
        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader(
                searchpath=absolute_path("/profiler/templates/out")),
            autoescape=True)

        # The general info header comes first, then one fragment per column.
        fragments = [env.get_template("general_info.html").render(data=output)]
        column_template = env.get_template("one_column.html")

        for col_name in columns:
            col = output["columns"][col_name]
            hist_pic = None

            if "hist" in col["stats"]:
                hist_dict = col["stats"]["hist"]
                dtype = col["column_dtype"]

                if dtype == "date":
                    # Date columns get one histogram per date component.
                    hist_pic = {
                        key: plot_hist({col_name: hist_dict[part]},
                                       "base64", part)
                        for key, part in date_parts
                    }
                elif dtype in single_hist_dtypes:
                    hist_pic = {
                        "hist_numeric_string": plot_hist(
                            {col_name: hist_dict}, output="base64")
                    }

            freq_pic = (plot_frequency({col_name: col["frequency"]},
                                       output="base64")
                        if "frequency" in col else None)

            fragments.append(column_template.render(
                data=col, freq_pic=freq_pic, hist_pic=hist_pic))

        # Keep the HTML in case we want to output it to a file, then show it.
        self.html = "".join(fragments)
        print_html(self.html)

        # Keep the raw output in case we want to write it as JSON.
        self.json = output

        return self
Exemplo n.º 3
0
    def run(self,
            df,
            columns,
            buckets=40,
            infer=False,
            relative_error=1,
            approx_count=True):
        """
        Profile a dataframe, render the statistics as HTML and display them.

        Besides displaying the HTML, the profiling output is sent to the queue
        when ``self.queue_url`` is set, written as JSON to ``self.path``, and
        kept in ``self.json``; the rendered HTML is kept in ``self.html``.

        :param df: Dataframe to be analyzed
        :param columns: Columns to be analyzed
        :param buckets: Number of buckets calculated to print the histogram
        :param infer: infer data type
        :param relative_error: Relative Error for quantile discretizer calculation
        :param approx_count: Use approx_count_distinct or countDistinct
        :return: None
        """

        columns = parse_columns(df, columns)

        output = Profiler.to_json(df, columns, buckets, infer, relative_error,
                                  approx_count)

        # Load jinja templates from the "templates" folder next to this module
        path = os.path.dirname(os.path.abspath(__file__))
        template_loader = jinja2.FileSystemLoader(searchpath=path +
                                                  "//templates")
        template_env = jinja2.Environment(loader=template_loader,
                                          autoescape=True)

        # Render template
        # Create the profiler info header
        general_template = template_env.get_template("general_info.html")
        html = general_template.render(data=output)

        template = template_env.get_template("one_column.html")

        # Histogram keys plotted separately for date columns
        date_parts = ("years", "months", "weekdays", "hours", "minutes")

        # Create every column stats
        for col_name in columns:
            col = output["columns"][col_name]

            # Must be a mapping: it is unpacked with ** into render() below,
            # and a column without a "hist" entry would otherwise raise
            # "TypeError: argument after ** must be a mapping, not NoneType".
            hist_pic = {}
            if "hist" in col:
                if col["column_dtype"] == "date":
                    hist_pic = {
                        "hist_" + part: plot_hist(
                            {col_name: col["hist"][part]}, "base64", part)
                        for part in date_parts
                    }
                else:
                    hist = plot_hist({col_name: col["hist"]}, output="base64")
                    hist_pic = {"hist_pic": hist}

            if "frequency" in col:
                freq_pic = plot_frequency({col_name: col["frequency"]},
                                          output="base64")
            else:
                freq_pic = None

            html = html + template.render(
                data=col, freq_pic=freq_pic, **hist_pic)

        html = html + df.table_html(10)

        # Display HTML
        print_html(html)

        # Send to queue
        if self.queue_url is not None:
            self.to_queue(output)

        # JSON
        # Save in case we want to output to a json file
        self.json = output

        # Save file in json format
        write_json(output, self.path)

        # Save in case we want to output to a html file
        self.html = html