예제 #1
0
def explore_06_heat_weeks_days(df_to_explore):
    """Heat-map counts of adjudicated towns for 2019, week of year vs. week day."""
    # Week-day number (Monday == 0) -> short week-day label for the plot axis.
    day_labels = {0: 'Mo', 1: 'Tu', 2: 'We', 3: 'Th', 4: 'Fr', 5: 'Sa', 6: 'Su'}

    # Keep only 'TOTAL MARK' rows for 2019, then pivot weeks x week days,
    # counting non-zero marks in each cell (missing cells stay NaN).
    select_rows = ((df_to_explore['criteria_tidy'] == 'TOTAL MARK')
                   & (df_to_explore['year'] == 2019))
    output_df = (df_to_explore[select_rows]
                 .pivot_table(index=['date_week'],
                              columns=['date_dnn'],
                              values='mark',
                              aggfunc=np.count_nonzero,
                              fill_value=np.nan)
                 .rename(columns=day_labels))

    # View the first 5 rows after transformation
    print('\n6. View the first 5 rows after transformation\n')
    print(print_pretty_table(output_df.head()))

    # Make a plot, and write it to a .png file
    output_file_name = 'output/cleaner_marks_df_2014_06_heatmap_weeks_days.png'
    output_plot = {
        'title': 'Adjudicated towns by weeks / week days for 2019',
        'x_label': 'Week Day',
        'y_label': 'Week of Year'
    }
    heatmap_df_to_png(output_df, output_file_name, output_plot, (6.6, 3))
예제 #2
0
def explore_07_heat_criteria_categories(df_to_explore):
    """Heat-map mean marks for 2019 by criteria (rows) vs. category (columns)."""
    # Select every non-'TOTAL MARK' criteria row for 2019.
    is_total = df_to_explore['criteria_tidy'] == 'TOTAL MARK'
    is_2019 = df_to_explore['year'] == 2019
    subset = df_to_explore[~is_total & is_2019]

    # Pivot: mean mark per (criteria, category); empty cells become 0.
    output_df = subset.pivot_table(index=['criteria_tidy'],
                                   columns=['category'],
                                   values='mark',
                                   aggfunc=np.mean,
                                   fill_value=0)

    # View the first 5 rows after transformation
    print('\n7. View the first 5 rows after transformation\n')
    print(print_pretty_table(output_df.head()))

    # Make a plot, and write it to a .png file
    output_file_name = 'output/cleaner_marks_df_2014_07_heat_criteria_categories.png'
    output_plot = {
        'title': 'Mean marks by criteria / categories / 2019',
        'x_label': 'Category',
        'y_label': 'Criteria'
    }
    heatmap_df_to_png(output_df, output_file_name, output_plot, (6.6, 3))
예제 #3
0
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

if __name__ == '__main__':
    # Read the cleaned marks dataset produced by the cleaner stage.
    # NOTE(review): `print_pretty_table` is used below but not visible in the
    # imports above — confirm it is imported elsewhere in this module.
    df = pd.read_csv('../cleaner/output/cleaner_marks_df_2014.csv')

    # View the first 5 rows
    print('\nView the first 5 rows\n')
    print(print_pretty_table(df.head()))

    # Transform the dataset for predicting
    # Model 1: Predict 'TOTAL MARK' using only 'TOTAL MARK' dynamics by years for non-nan rows
    # Keep only 'TOTAL MARK' rows, then pivot so each year becomes a column of
    # summed marks per (town, county, criteria) combination.
    find_total_mark = df['criteria_tidy'] == 'TOTAL MARK'
    choose_cols = ['town_tidy', 'county_l1', 'criteria_tidy']
    df = df[find_total_mark].pivot_table(index=choose_cols,
                                         columns=['year'],
                                         values='mark',
                                         aggfunc=np.sum,
                                         fill_value=np.nan).reset_index()
    # Keep only rows that have a mark for every year (drop any with NaN).
    df = df.dropna()

    # View the first 5 rows after transformation
    print('\nView the first 5 rows after transformation\n')
    print(print_pretty_table(df.head()))
예제 #4
0
from functions import write_df_to_csv, print_pretty_table, show_extended_info
import pandas as pd
import numpy as np

if __name__ == '__main__':
    # Read the 'dirty' dataset from a .csv file
    # Parse 'date' as datetime; force the numeric columns to float64 so that
    # missing values can be represented as NaN.
    df = pd.read_csv('../parser/output/parser_marks_df.csv',
                     parse_dates=['date'],
                     dtype={'mark': np.float64, 'max_mark': np.float64, 'year': np.float64})

    # Show dataframe info
    print('\nShow dataframe info\n')
    print(print_pretty_table(show_extended_info(df)))

    # Clean 'category' column

    # > Step 1: Read the mapping of population categories to National Awards categories from a .csv file
    categories_df = pd.read_csv('input/categories.csv')

    # > Step 2: Merge the dataframes - enrich the dataset with a 'category_tidy'
    # column via a left join on 'category' (unmatched rows get NaN).
    df = df.merge(categories_df, how='left', on=['category'])

    # Clean 'county' column
    # Keep only the first whitespace-separated word of 'county' as 'county_l1'.
    df['county_l1'] = df['county'].apply(lambda x: x.split(' ')[0])

    # Check & clean 'town' column

    # > Step 1: Add a column 'pdf_path_county' with a county extracted from 'pdf_path'
    # (the last path component of the pdf path).
    df['pdf_path_county'] = df['pdf_path'].apply(lambda x: x.split('/')[-1])

    # > Step 2: Process 'Ballingarry' to differentiate Ballingarry (North Tipperary) -- a civil parish
예제 #5
0
from functions import write_df_to_csv, print_pretty_table, show_extended_info, boxplot_df_to_png
import pandas as pd
import numpy as np

if __name__ == '__main__':
    # Year range covered by the crawl (inclusive of both endpoints).
    year_start = 1996
    year_end = 2019
    year_range = range(year_start, year_end + 1)

    # Read the report with the results of crawling from a .csv file
    df = pd.read_csv('output/crawler_pdfs_df.csv')

    # Show dataframe info
    print('\nShow dataframe info\n')
    print(print_pretty_table(show_extended_info(df)))

    # View the first 5 rows
    print('\nView the first 5 rows\n')
    print(print_pretty_table(df.head()))

    # 1. Check crawler stats

    # > Step 1: Check the number of pdfs crawled
    # output_pt is a pretty-table as a list of rows; each stat row stores the
    # dataframe's (rows, cols) shape tuple.
    output_pt = ([['dataframe', 'shape'], ['pdfs crawled', df.shape]])

    # > Step 2: Check the number of pdfs downloaded / available for download
    # 'pdf_success' == 1 marks pdfs that were downloaded successfully.
    find_pdf_success = df['pdf_success'] == 1
    output_df = df[find_pdf_success]
    output_pt.append(['pdfs downloaded', output_df.shape])

    # > Step 3: Check the number of pdfs to parse
예제 #6
0
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

if __name__ == '__main__':
    # Fixed random seed for reproducible model fitting / splits.
    SEED = 42
    # Prefix used to name this predictor's output artifacts.
    predictor_name = 'predictor_02'

    # Read the clean dataset from a .csv file
    # NOTE(review): `pd`, `np` and `print_pretty_table` are used below but not
    # visible in the imports above — confirm they are imported elsewhere.
    df = pd.read_csv('../cleaner/output/cleaner_marks_df_2014.csv')

    # View the first 5 rows
    print('\nView the first 5 rows\n')
    print(print_pretty_table(df.head()))

    # Approach 2: Predict non-'TOTAL MARK' using non-'TOTAL MARK' dynamics by years

    # Transform the dataset for predicting
    # Keep only the individual criteria rows (exclude the 'TOTAL MARK' total),
    # then pivot so each year becomes a column of summed marks per
    # (category, county, town, criteria) combination.
    find_criteria_total = df['criteria_tidy'] == 'TOTAL MARK'
    choose_rows = ~find_criteria_total
    choose_cols = ['category_tidy', 'county_l1', 'town_tidy', 'criteria_tidy']
    df_Xy = df[choose_rows].pivot_table(index=choose_cols,
                                        columns=['year'],
                                        values='mark',
                                        aggfunc=np.sum,
                                        fill_value=np.nan)
    # Keep only rows that have a mark for every year (drop any with NaN).
    df_Xy = df_Xy.reset_index().dropna()

    # Show dataframe info after transformation
예제 #7
0
import pandas as pd
import numpy as np

if __name__ == '__main__':
    # Read the 'dirty' dataset from a .csv file
    # Parse 'date' as datetime; force the numeric columns to float64 so that
    # missing values can be represented as NaN.
    df = pd.read_csv('output/parser_marks_df.csv',
                     parse_dates=['date'],
                     dtype={
                         'mark': np.float64,
                         'max_mark': np.float64,
                         'year': np.float64
                     })

    # Show dataframe info
    # NOTE(review): `show_extended_info`, `print_pretty_table` and
    # `print_pretty_list` are used below but not visible in the imports above —
    # confirm they are imported elsewhere in this module.
    print('\nShow dataframe info\n')
    print(print_pretty_table(show_extended_info(df)))

    # View the first 5 rows (split into two tables: columns 0-7, then the rest)
    print('\nView the first 5 rows\n')
    print(print_pretty_table(df.iloc[:, 0:8].head()))
    print(print_pretty_table(df.iloc[:, 8:].head()))

    # Check unique values in 'category', 'county', 'criteria'
    # Build a pretty-table listing the sorted unique values of each column.
    choose_cols = ['category', 'county', 'criteria']
    output_pt = [['column', 'unique values']]
    for col in choose_cols:
        show_col_unique = df[col].unique()
        show_col_unique = sorted(show_col_unique)
        show_col_unique = print_pretty_list(show_col_unique)
        output_pt.append([col, show_col_unique])
예제 #8
0
    heatmap_df_to_png(output_df, output_file_name, output_plot, (6.6, 3))


if __name__ == '__main__':
    # Read the clean dataset from a .csv file
    # Parse 'date' as datetime; force the numeric columns to float64 so that
    # missing values can be represented as NaN.
    df = pd.read_csv('../cleaner/output/cleaner_marks_df_2014.csv',
                     parse_dates=['date'],
                     dtype={
                         'mark': np.float64,
                         'max_mark': np.float64,
                         'year': np.float64
                     })

    # Show dataframe info
    print('\nShow dataframe info\n')
    print(print_pretty_table(show_extended_info(df)))

    # View the first 5 rows
    print('\nView the first 5 rows\n')
    print(print_pretty_table(df.head()))

    # 1. Explore 'TOTAL MARK' distribution dynamics by years
    explore_01_box_years_marks(df)

    # 2. Explore 'TOTAL MARK' distribution for 2019 by counties
    explore_02_box_counties_marks(df)
    explore_02_bar_counties_count(df)

    # Add a column 'date_dnn' with a week day number extracted from 'date'
    # (pandas convention: Monday == 0 ... Sunday == 6).
    df['date_dnn'] = df['date'].dt.weekday