示例#1
0
 def post(self):
     """
     Return a JSON summary description of the session's dataset.

     The ``sessionId`` field of the JSON request body is used as the file
     name handed to ``file_service.read_file``; the loaded data's
     ``describe()`` result is serialised with ``to_json()``.
     """
     payload = request.get_json()
     frame = file_service.read_file(payload['sessionId'])
     summary = frame.describe()
     return summary.to_json()
示例#2
0
    def get(self, datetime, file_name, x, y, hue, reg):
        """
        Return a scatter plot for the two provided x and y numeric variables
        and an optional hue for a categorical variable, as a PNG image.

        :param datetime: the current datetime in integer format (not read here)
        :param file_name: the file name represented by session_id
        :param x: the x variable
        :param y: the y variable
        :param hue: the categorical hue for the plot (optional); passed
            unchanged to ``plot_service.get_scatter_plot``
        :param reg: whether to display as a regression plot
        :return: the PNG bytes wrapped by ``send_file``
        """
        data = file_service.read_file(file_name)

        # Cap the amount of plotted data by sampling rows at random.
        if len(data.index) > 15000:
            data = data.sample(n=15000)

        bytes_image = io.BytesIO()
        try:
            plot_service.get_scatter_plot(data, hue, x, y, reg)
            plt.tight_layout()
            plt.savefig(bytes_image, format='png')
        finally:
            # Release the current pyplot figure even when rendering fails,
            # so failed requests do not accumulate open figures.
            plt.close()
        bytes_image.seek(0)

        # NOTE(review): newer Flask releases renamed ``attachment_filename``
        # to ``download_name`` -- confirm against the pinned Flask version.
        return send_file(bytes_image,
                         attachment_filename="scatter.png",
                         mimetype='image/png')
示例#3
0
    def get(self, datetime, file_name, columns):
        """
        Return a Pearson correlation heatmap for the dataset as a PNG image.

        :param datetime: not read by this method
        :param file_name: the file name passed to ``file_service.read_file``
        :param columns: comma-separated column names to correlate; only the
            numeric ones among them are kept
        :return: the PNG bytes wrapped by ``send_file``
        """
        data = file_service.read_file(file_name)

        plt.subplots(figsize=(16, 9))
        bytes_image = io.BytesIO()
        try:
            correlation_df = data[columns.split(',')].select_dtypes(
                include=[np.number]).corr(method='pearson')

            cmap = sns.diverging_palette(220, 10, as_cmap=True)
            # Use the builtin ``bool``: the ``np.bool`` alias was removed in
            # NumPy 1.24 and raises AttributeError there.
            mask = np.zeros_like(correlation_df, dtype=bool)
            # Mask the upper triangle (diagonal included); it mirrors the
            # lower triangle of the symmetric correlation matrix.
            mask[np.triu_indices_from(mask)] = True

            sns.heatmap(correlation_df, mask=mask, annot=True,
                        cmap=cmap, vmax=.3, center=0, square=True, linewidths=.5,
                        cbar_kws={"shrink": .5})

            plt.tight_layout()
            plt.savefig(bytes_image, format='png')
        finally:
            # Release the figure even when the computation or rendering fails.
            plt.close()
        bytes_image.seek(0)

        # NOTE(review): newer Flask releases renamed ``attachment_filename``
        # to ``download_name`` -- confirm against the pinned Flask version.
        return send_file(bytes_image,
                         attachment_filename="correlation-plot.png",
                         mimetype='image/png')
示例#4
0
 def post(self):
     """
     Return the shape of the session's dataset as a JSON string.

     The result has a ``rows`` key (first entry of ``data.shape``) and a
     ``columns`` key (second entry).
     """
     session_file = request.get_json()['sessionId']
     dimensions = file_service.read_file(session_file).shape
     result = {'rows': dimensions[0], 'columns': dimensions[1]}
     return json.dumps(result)
示例#5
0
    def post(self):
        """
        Return a JSON summary description of the categorical variables.

        Columns of dtype ``object`` or ``bool`` are selected from the
        session's dataset and their ``describe()`` output is serialised.
        """
        session_file = request.get_json()['sessionId']
        frame = file_service.read_file(session_file)

        categorical_dtypes = ['object', 'bool']
        summary = frame.select_dtypes(include=categorical_dtypes).describe()
        return summary.to_json()
示例#6
0
 def post(self, column_name):
     """
     Return the sorted unique values of the given column as a JSON string.

     Missing values in the column are reported under the label 'Missing'.

     :param column_name: name of the column to inspect
     :return: JSON string of the form ``{"uniques": [...]}``
     """
     file_name = request.get_json()['sessionId']
     data = file_service.read_file(file_name)
     # Fill on a new Series rather than ``data[col].fillna(..., inplace=True)``:
     # the in-place call acts on a temporary selection and is not guaranteed
     # to update ``data`` (pandas ignores it under copy-on-write), which
     # would leave NaN in the values handed to ``np.sort``.
     column = data[column_name].fillna('Missing')
     uniques = pd.unique(column)
     return json.dumps({
         'uniques': np.sort(uniques).tolist()
     })
示例#7
0
    def get(self, datetime, file_name):
        """
        Return a CSV download of the dataset with all missing data imputed.

        Gaps in numeric columns are filled with the column median; any gaps
        that remain afterwards are filled with the string 'missing'.

        :param datetime: not read by this method
        :param file_name: the file name passed to ``file_service.read_file``
        :return: a response carrying the CSV text as an attachment named
            ``imputed_missing.csv``
        """
        data = file_service.read_file(file_name)
        # ``numeric_only=True`` is required on pandas >= 2.0, where median()
        # no longer skips non-numeric columns and raises TypeError instead.
        data.fillna(data.median(numeric_only=True), inplace=True)
        data.fillna(value='missing', inplace=True)

        resp = make_response(data.to_csv(index=False))
        resp.headers["Content-Disposition"] = "attachment; filename=imputed_missing.csv"
        resp.headers["Content-Type"] = "text/csv"
        return resp
示例#8
0
 def post(self):
     """
     Return the column labels of the session's dataset, grouped by kind.

     The JSON string has a ``categorical`` list (dtype ``object`` or
     ``bool``) and a ``numeric`` list (any ``np.number`` dtype).
     """
     session_file = request.get_json()['sessionId']
     frame = file_service.read_file(session_file)

     numeric_labels = list(
         frame.select_dtypes(include=[np.number]).columns.values)
     categorical_labels = list(
         frame.select_dtypes(include=['object', 'bool']).columns.values)

     labels = {
         'categorical': categorical_labels,
         'numeric': numeric_labels
     }
     return json.dumps(labels)
示例#9
0
    def post(self, x, y, show_as_percentages=False):
        """
        Return a cross tabulation (pivot table) of the x and y variables.

        :param x: column used for the rows of the cross tabulation
        :param y: column used for the columns of the cross tabulation
        :param show_as_percentages: when truthy, counts are normalised per
            row, rounded to four decimals and scaled by 100
        :return: dict holding the table (``crosstab``) and its transpose
            (``transposed``), each as a JSON string
        """
        session_file = request.get_json()['sessionId']
        frame = file_service.read_file(session_file)

        if not show_as_percentages:
            table = pd.crosstab(frame[x], frame[y])
        else:
            row_shares = pd.crosstab(frame[x], frame[y], normalize='index')
            table = row_shares.round(4) * 100

        flipped = table.transpose()
        return {
            'crosstab': table.to_json(),
            'transposed': flipped.to_json()
        }
示例#10
0
    def get(self, datetime, file_name, x, target_column, target_value, analysis_type, is_actuals):
        """
        Return a bar plot showing the percentages of each categorical feature
        in the chosen column against the target variable, as a PNG image.

        :param datetime: the current datetime in integer format (not read here)
        :param file_name: the file name represented by session_id
        :param x: the x variable
        :param target_column: the target variable of the analysis
        :param target_value: the target value of the analysis
        :param analysis_type: a flag of 'categorical' or 'continuous' analysis type
        :param is_actuals: a flag where 1 is actual counts and 0 is percentages
        :return: the PNG bytes wrapped by ``send_file``
        """
        data = file_service.read_file(file_name)
        data = influencers_service.bin_continuous_cols(data, target_column)

        plt.subplots(figsize=(15, 11))
        bytes_image = io.BytesIO()
        try:
            if analysis_type != 'continuous':
                target_value = plot_service.convert_continuous_target_dtype(
                    data, target_column, target_value
                )

            filtered_df = data[data[target_column] == target_value]

            plot = plot_service.get_influencers_plot(
                data, filtered_df, x, target_column, analysis_type, is_actuals)

            # A baseline is drawn only for the percentage view of a numeric
            # target; the actual-counts view gets no reference line.
            if is_actuals != 1 and is_numeric_dtype(data[target_column]):
                base_mean = data[target_column].mean()
                plot.axhline(base_mean, ls='--',
                             label=f'Average {target_column}')
                plt.legend()

            plt.tight_layout()
            plt.savefig(bytes_image, format='png')
        finally:
            # Release the figure even when plotting or rendering fails.
            plt.close()
        bytes_image.seek(0)

        # NOTE(review): newer Flask releases renamed ``attachment_filename``
        # to ``download_name`` -- confirm against the pinned Flask version.
        return send_file(bytes_image,
                         attachment_filename="influencers.png",
                         mimetype='image/png')
示例#11
0
    def get(self, datetime, file_name):
        """
        Return a standard seaborn pairplot for the dataset as a PNG image.

        :param datetime: not read by this method
        :param file_name: the file name passed to ``file_service.read_file``
        :return: the PNG bytes wrapped by ``send_file``
        """
        data = file_service.read_file(file_name)

        bytes_image = io.BytesIO()
        try:
            # ``sns.pairplot`` builds its own figure, so no figure is created
            # up front: a pre-made one would never be drawn on and would stay
            # open after the single ``plt.close()`` below.
            sns.pairplot(data)
            plt.tight_layout()
            plt.savefig(bytes_image, format='png')
        finally:
            # Release the pairplot figure even when rendering fails.
            plt.close()
        bytes_image.seek(0)

        # NOTE(review): newer Flask releases renamed ``attachment_filename``
        # to ``download_name`` -- confirm against the pinned Flask version.
        return send_file(bytes_image,
                         attachment_filename="pairplot.png",
                         mimetype='image/png')
示例#12
0
    def get(self, variable, file_name, datetime):
        """
        Return a distribution plot for numeric variables and a countplot
        for categorical variables, as a PNG image.

        :param variable: the column to plot; also used as the prefix of the
            attachment file name
        :param file_name: the file name passed to ``file_service.read_file``
        :param datetime: not read by this method
        :return: the PNG bytes wrapped by ``send_file``
        """
        data = file_service.read_file(file_name)

        plt.subplots(figsize=(11, 9))
        bytes_image = io.BytesIO()
        try:
            plot_service.get_distribution_plot(data, variable)
            plt.tight_layout()
            plt.savefig(bytes_image, format='png')
        finally:
            # Release the figure even when plotting or rendering fails.
            plt.close()
        bytes_image.seek(0)

        # NOTE(review): newer Flask releases renamed ``attachment_filename``
        # to ``download_name`` -- confirm against the pinned Flask version.
        return send_file(bytes_image,
                         attachment_filename=f"{variable}_distribution.png",
                         mimetype='image/png')
示例#13
0
    def get(self, datetime, file_name):
        """
        Return a missing data visualisation for the dataset as a PNG image.

        The image is a heatmap of ``data.isnull()``, i.e. one cell per value
        marking whether it is missing.

        :param datetime: not read by this method
        :param file_name: the file name passed to ``file_service.read_file``
        :return: the PNG bytes wrapped by ``send_file``
        """
        data = file_service.read_file(file_name)

        plt.subplots(figsize=(16, 9))
        bytes_image = io.BytesIO()
        try:
            sns.heatmap(data.isnull(), cbar=False, cmap="YlGnBu_r")
            plt.tight_layout()
            plt.savefig(bytes_image, format='png')
        finally:
            # Release the figure even when plotting or rendering fails.
            plt.close()
        bytes_image.seek(0)

        # NOTE(review): newer Flask releases renamed ``attachment_filename``
        # to ``download_name`` -- confirm against the pinned Flask version.
        return send_file(bytes_image,
                         attachment_filename="missing-data.png",
                         mimetype='image/png')
示例#14
0
    def post(self):
        """
        Return a JSON string with information (metadata) on the dataset.

        Missing-value and duplicate-row figures, memory usage, column type
        counts and warning messages all come from ``data_service``; the
        variable and observation counts come from ``data.shape``.
        """
        session_file = request.get_json()['sessionId']
        frame = file_service.read_file(session_file)

        missing_count, missing_percent = \
            data_service.get_missing_values_info(frame)
        duplicate_count, duplicate_percent = \
            data_service.get_duplicates_info(frame)

        metadata = {
            'totalMissingValues': int(missing_count),
            'totalMissingPercent': missing_percent,
            'totalKbInMemory': data_service.get_total_kilobytes_in_memory(frame),
            'variables': int(frame.shape[1]),
            'observations': int(frame.shape[0]),
            'totalDuplicatedRows': int(duplicate_count),
            'totalDuplicatedPercent': duplicate_percent,
            'columnsInfo': data_service.get_column_type_counts(frame),
            'warningMessages': data_service.populate_warning_messages(frame)
        }
        return json.dumps(metadata)
示例#15
0
    def post(self, analysis_type, target_column, target_value):
        """
        Return the key influencers for a target column as a JSON string.

        :param analysis_type: 'categorical' selects the classification
            method; any other value selects regression
        :param target_column: the column whose influencers are searched
        :param target_value: the target value; unless ``analysis_type`` is
            'continuous' it is cast to match the column dtype (float for
            float64, int for int64, str otherwise)
        :return: JSON string of the form ``{"influencers": ...}``
        """
        session_file = request.get_json()['sessionId']
        frame = file_service.read_file(session_file)

        column_dtype = frame[target_column].dtypes

        if analysis_type != 'continuous':
            # First matching dtype wins; anything else is treated as text.
            for dtype_name, cast in (('float64', float), ('int64', int)):
                if column_dtype == dtype_name:
                    target_value = cast(target_value)
                    break
            else:
                target_value = str(target_value)

        if analysis_type == 'categorical':
            method = 'classification'
        else:
            method = 'regression'

        found = influencers_service.find_key_influencers(
            target_column, target_value, df=frame, method=method)

        return json.dumps({'influencers': found})
示例#16
0
def test_read_file(client):
    """Check that reading FILE_NAME yields data of shape (891, 12)."""
    # act
    loaded = file_service.read_file(FILE_NAME)

    # assert
    expected_shape = (891, 12)
    assert loaded.shape == expected_shape