Пример #1
0
def EDA(source_df, reply_df):
    """Run exploratory data analysis on the source and reply dataframes.

    For each dataframe: render a full-width pandas-profiling report inline
    in the notebook, save it as an HTML file, then draw a correlation
    heatmap. Finally, convert both saved HTML reports to PDF.

    :param source_df: DataFrame with the source data.
    :param reply_df: DataFrame with the reply data.
    :return: None (side effects: notebook output plus files on disk).
    """

    def _profile(df, html_file):
        # The two report blocks were identical except for the dataframe
        # and output path, so the duplication is factored out here.
        report = ProfileReport(df,
                               title='Profile Report',
                               html={'style': {
                                   'full_width': True
                               }})
        report.to_notebook_iframe()
        report.to_file(output_file=html_file)

    _profile(source_df, "EDA_source_report.html")
    _profile(reply_df, "EDA_reply_report.html")

    correlation_heatmap(source_df)
    correlation_heatmap(reply_df)

    # pdfkit needs the external wkhtmltopdf binary; imported lazily so the
    # rest of the analysis works even when it is not installed.
    import pdfkit
    pdfkit.from_file('EDA_source_report.html', 'EDA_source_report.pdf')
    pdfkit.from_file('EDA_reply_report.html', 'EDA_reply_report.pdf')
Пример #2
0
def create_report(df, filename=None):
    """Build a pandas-profiling report for *df*.

    When *filename* is truthy the report is written to that path;
    otherwise it is rendered inline in the notebook.
    """
    report = ProfileReport(df, title='Pandas Profiling Report')
    if not filename:
        return report.to_notebook_iframe()
    report.to_file(output_file=filename)
Пример #3
0
def simple_report(data: pd.DataFrame):
    """Profile *data* with pandas-profiling.

    Rendered inline when running under Jupyter, otherwise written to
    'simple_report.html'.
    """
    from pandas_profiling import ProfileReport
    import sys

    # Jupyter kernels are launched with a *.json connection file as the
    # last command-line argument — a common (fragile) detection trick.
    running_in_jupyter = sys.argv[-1].endswith('json')
    report = ProfileReport(data,
                           title='Report',
                           html={'style': {
                               'full_width': True
                           }})
    if not running_in_jupyter:
        return report.to_file(output_file='simple_report.html')
    return report.to_notebook_iframe()
Пример #4
0
    def profile(self,
                title: str = 'Dataset profile report',
                html_path: str = None,
                show_report_in_notebook: bool = False):
        """Generates a pandas-profiling report of the dataset to be displayed in a jupyter notebook.
        Optionally saves the report as an html file

        :param title: Title of the generated report.
        :param html_path: If provided, the pandas-profiling report will be saved to disk
        :param show_report_in_notebook: Whether or not to show report in jupyter notebook
        :return: None
        """
        # Bug fix: the original called os.path.exists(html_path) first,
        # which raises TypeError when html_path is None (the default).
        if html_path is not None and os.path.exists(html_path):
            logger.info(
                f'A profiling report was already generated and will be loaded from ``{html_path}``'
            )
            display(IFrame(src=html_path, width=10**3, height=10**3))
            return

        logger.info('Generating the profiling report')
        profile_report = ProfileReport(self.data, title=title)
        if html_path is not None:
            profile_report.to_file(html_path)
            logger.info(
                f'Saved the pandas-profiling report to ``{html_path}``')
        # Bug fix: show_report_in_notebook was accepted but never consulted;
        # the inline report now only renders when the caller asks for it.
        if show_report_in_notebook:
            profile_report.to_notebook_iframe()
Пример #5
0
    alpha=0.7,
    palette=list(reversed(sns.color_palette("RdYlGn_r", 4))),
)
plt.legend(title="Conformidad con el salario")
plt.ylabel("Salario bruto")
plt.xlabel("Salario neto")
plt.title('Relación entre salario neto y salario bruto')
plt.show()
# -

# ## References
#
# - [seaborn documentation](http://seaborn.pydata.org/tutorial/color_palettes.html)
# - [Diverging Color Maps for Scientific Visualization - Kenneth Moreland](https://cfwebprod.sandia.gov/cfdocs/CompResearch/docs/ColorMapsExpanded.pdf)
# - [XKCD color survey](https://blog.xkcd.com/2010/05/03/color-survey-results/)
# - [Subtleties of colors series](https://earthobservatory.nasa.gov/blogs/elegantfigures/2013/08/05/subtleties-of-color-part-1-of-6/)
# - [matplotlib documentation](https://matplotlib.org/tutorials/colors/colormaps.html)

# # Pandas profiling

# +
from pandas_profiling import ProfileReport

# explorative=True enables the extended analyses; lazy=False computes the
# whole report up front instead of deferring work to first render.
report = ProfileReport(df,
                       title='Encuesta de sueldos sysarmy 2020.02',
                       explorative=True,
                       lazy=False)
# -

report.to_notebook_iframe()
# Silence library warnings and set a default figure size for every plot.
warnings.simplefilter("ignore")

plt.rcParams['figure.figsize'] = (12,8)

"""#Step 2: Exploratory Data Analysis
---
"""

# Importing the dataset

df = pd.read_csv('employee_data.csv')
df.head()

# Build the profiling report (standard width), render it inline,
# then persist it to disk as REPORT.html.
profile = ProfileReport(df, title = "Pandas Profiling Report", html = {"style":{'full_width':False}})

profile.to_notebook_iframe()

profile.to_file(output_file="REPORT.html")

"""#Step 3: Encode Categorical Features
---
"""

# Bar chart of attrition (quit) counts per salary bracket.
pd.crosstab(df.salary, df.quit).plot(kind='bar')
plt.title('Turnover Frequency on Salary Bracket')
plt.xlabel('Salary')
plt.ylabel('Frequency of Turnover')
plt.show()

# Bar chart of attrition (quit) counts per department.
pd.crosstab(df.department,df.quit).plot(kind='bar')
plt.title('Turnover Frequency for Department')
Пример #7
0
# Mean of the accumulated daily cases column (message text is Portuguese).
print("Media de Contaminados:{:.2f} ".format(df['C_Acu_Dia'].mean()))

plt.hist('C_Acu_Dia',data = df,bins = 8,density = False)
plt.show()

df.plot.scatter(x='C_Acu_Dia', y = 'O_Acu_Dia',s=60)
plt.show()

df.boxplot(['C_Acu_Dia','O_Acu_Dia'])

# Correlation between the two accumulated series, shown as a heatmap.
corr = df[['C_Acu_Dia','O_Acu_Dia']].corr()
display(corr)

sns.heatmap(corr,cmap='RdBu', fmt = '.2f', square = True,linecolor='white', annot=True)

# IPython shell magic: install the development build of pandas-profiling.
!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip -q

import pandas as pd                                       # pandas library
from pandas_profiling import ProfileReport                # library for the profile report
report = ProfileReport(df)                              # build the profile report
report.to_notebook_iframe()                             # render a data overview inline

! pip install sweetviz -q # installs sweetviz

import sweetviz
relatorio_sweet = sweetviz.analyze(df)

relatorio_sweet.show_html('Teste.html')

Пример #8
0
class HelloDataset():
    '''
    Stores methods for dataset operations on a pandas DataFrame held in
    ``self.df``: loading, profiling, cleaning, feature engineering and
    missing-value imputation.
    '''

    # Dataloading
    def load(self, path_obj):
        '''
        Loads a .csv into a class attribute.
        param path_obj: Path object pointing to dataset stored as .csv
        '''
        print('Loading data...')
        self.df = pd.read_csv(path_obj)
        print('Data loaded.')

    def explore(self):
        '''
        Renders a minimal pandas-profiling report of the dataset inline
        in a notebook, caching it on ``self.profile``.
        '''
        self.profile = ProfileReport(self.df,
                                     title='Explore the Dataset',
                                     minimal=True,
                                     progress_bar=False)
        return self.profile.to_notebook_iframe()

    # Data cleaning
    def drop_feature(self, feature_list):
        '''
        Removes the given column(s).
        param feature_list: Column name or list of column names to drop.
        '''
        self.df = self.df.drop(feature_list, axis=1)
        print(f'Feature(s) {feature_list} removed.')

    def convert_to_numerical(self, feature_list):
        '''
        Converts the given column(s) to dummies.
        param feature_list: List of categorical column names to one-hot
            encode (first level dropped to avoid collinearity).
        '''
        print(
            f'Converting features {feature_list} to numerical by creating dummy variables...'
        )
        dummy_cols = []
        for col in feature_list:
            dummies = pd.get_dummies(self.df[col], drop_first=True, prefix=col)
            dummy_cols.extend(dummies.columns)
            self.df = pd.concat([self.df, dummies], axis=1)
            self.df = self.df.drop(col, axis=1)
        print(f'Features removed: {feature_list}.')
        print(f'Dummy features added: {list(dummy_cols)}')

    # Feature engineering
    def sum_feature(self, new_feat, feature_list):
        '''
        Sum columns.
        param feature_list: List of column names (as strings) to sum with each other.
        param new_feat: The name of the new column.
        '''
        self.df[new_feat] = self.df[feature_list].sum(axis=1)
        print(
            f'Created new feature "{new_feat}" by summing features {feature_list}.'
        )

    def subtract_feature(self, new_feat, feat_a, feat_b):
        '''
        Subtract one column from another.
        param feat_a: The column from which to subtract.
        param feat_b: The column to subtract from feat_a.
        param new_feat: The name of the new column.
        '''
        self.df[new_feat] = self.df[feat_a] - self.df[feat_b]
        # Bug fix: the original message referenced undefined names
        # (feat_num/feat_denom, copy-pasted from divide_feature) which
        # raised NameError, and wrongly described a division.
        print(
            f'Created new feature "{new_feat}" by subtracting {feat_b} from {feat_a}.'
        )

    def divide_feature(self, new_feat, feat_num, feat_denom):
        '''
        Divide one column by another.
        param feat_num: The column containing the feature in the numerator.
        param feat_denom: The column containing the feature in the denominator.
        param new_feat: The name of the new column.
        '''
        self.df[new_feat] = self.df[feat_num] / self.df[feat_denom]
        print(
            f'Created new feature "{new_feat}" by dividing {feat_num} by {feat_denom}.'
        )

    def multiply_feature(self, new_feat, feature_list):
        '''
        Multiply one column by another.
        param feature_list: List of column names (as strings) to multiply by each other.
        param new_feat: The name of the new column.
        '''
        self.df[new_feat] = self.df[feature_list].prod(axis=1,
                                                       numeric_only=True)
        print(
            f'Created new feature "{new_feat}" by multiplying features {feature_list}.'
        )

    def set_target_feature(self, target_feature, scale=True):
        '''
        param target_feature: name of column to predict.
        All other columns will be normalised and used for prediction.
        param scale: If True, robust-scale the numeric predictor columns.
        '''
        self.X = self.df.drop(target_feature, axis=1)
        self.Y = self.df[target_feature]

        print(f'Target feature set to {target_feature}.')
        print('Features used for prediction:')
        for col in list(self.X.columns):
            print(f'- {col}')

        if scale:
            scaler = RobustScaler()
            # Bug fix: the original fed the raw ndarray from fit_transform
            # into DataFrame.update; its integer column labels do not align
            # with self.X's named columns, so the scaling was silently
            # dropped. Assigning back through the numeric column labels
            # applies it for real.
            num_cols = self.X.select_dtypes(include=[np.number]).columns
            self.X[num_cols] = scaler.fit_transform(self.X[num_cols])

    def impute_missing_values(self, feature_list, imputation_type='auto'):
        '''
        Impute missing values in the selected columns of the dataset
        param feature_list: List of column names (as strings) to impute.
        param imputation_type: What kind of imputation to use (auto, mode, or mean)
        '''
        # Fail fast on an invalid strategy instead of only raising when a
        # column with nulls happens to be reached.
        if imputation_type not in ('auto', 'mode', 'mean'):
            raise AssertionError(
                'Error: Incorrect imputation type. Please enter one of the options: "auto", "mean", or "mode" only.'
            )
        for feature in feature_list:
            num_na = sum(self.df[feature].isna())
            if num_na == 0:
                print(
                    f'No null values detected in column "{feature}"... skipping to next feature'
                )
                continue
            is_categorical = (self.df[feature].dtype == 'O'
                              or self.df[feature].dtype == 'str')
            if imputation_type == 'mean' and is_categorical:
                print(
                    f'Error: Cannot impute mean of a non-numerical feature. Skipping imputation of feature "{feature}"...'
                )
                continue
            if imputation_type == 'mode' or (imputation_type == 'auto'
                                             and is_categorical):
                print(
                    f'{num_na} null values detected in column "{feature}"... replacing with the mode'
                )
                fill_value = self.df[feature].mode()[0]
            else:
                print(
                    f'{num_na} null values detected in column "{feature}"... replacing with mean'
                )
                # Bug fix: Series.mean() returns a scalar, so the original
                # ``.mean()[0]`` raised TypeError on every mean imputation.
                fill_value = self.df[feature].mean()
            self.df[feature] = self.df[feature].fillna(fill_value)
Пример #9
0
def do_a_pandas_profile(df, name):
    """Render a full-width pandas-profiling report of *df* twice:
    once as interactive ipywidgets and once as an inline HTML iframe.

    :param df: DataFrame to profile.
    :param name: Title for the report.
    :return: None (output is rendered in the notebook).
    """
    from pandas_profiling import ProfileReport
    profile = ProfileReport(df, title=name, html={'style': {'full_width': True}})
    # Fix: the original joined the two calls into a throwaway tuple
    # expression; plain statements make the intent explicit.
    profile.to_widgets()
    profile.to_notebook_iframe()