def reusing_code_random_forest_on_iris() -> Dict:
    """
    Again I will run a regression on the iris dataset, but reusing
    the existing code from assignment1. I am also including the species column as a one_hot_encoded
    value for the prediction. Use this to check how different the results are (score and
    predictions).
    """
    df = read_dataset(Path('..', '..', 'iris.csv'))
    for c in list(df.columns):
        df = fix_outliers(df, c)
        df = fix_nans(df, c)
        df[c] = normalize_column(df[c])

    ohe = generate_one_hot_encoder(df['species'])
    df = replace_with_one_hot_encoder(df, 'species', ohe,
                                      list(ohe.get_feature_names()))
    '''
    !!!My Results!!!
    When comparing the two regression functions, the raw data typically scores slightly higher than the
    processed data, but the results are essentially the same.
    I believe this is for the same reasons I mentioned in the classification file: the data is essentially the same.
    The only preprocessing done here fixed the outliers and removed the missing values, but this
    dataset does not actually have many outliers or nans, so we are left with a very similar dataset.
    The normalization and one hot encoding change the dataset in a way that may make it more efficient
    for a model to process, but they do not actually change the meaning of the data!
    So, although the datasets are essentially the same, I would still choose the preprocessed data, as the
    normalization and one hot encoding may let the model process the data more efficiently compared to the
    raw data.
    '''
    X, y = df.iloc[:, 1:], df.iloc[:, 0]
    model = simple_random_forest_regressor(X, y)
    print(model)
    return model
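
# For reference, since simple_random_forest_regressor comes from assignment1 and is not shown in
# this listing: a minimal sketch of what it could look like. The dict keys and the train/test
# split are assumptions based on how the helper is used here, not the actual assignment code.
from typing import Dict

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


def simple_random_forest_regressor_sketch(X: pd.DataFrame, y: pd.Series) -> Dict:
    # hold out a test set so the reported score reflects unseen data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    # score() is R^2 for regressors; the real helper may report a different metric
    return dict(model=model, score=model.score(X_test, y_test), test_prediction=model.predict(X_test))
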
def reusing_code_random_forest_on_iris() -> Dict:
    """
    Again I will run a classification on the iris dataset, but reusing
    the existing code from assignment1. Use this to check how different the results are (score and
    predictions).
    """
    df = read_dataset(Path('..', '..', 'iris.csv'))
    for c in list(df.columns):
        # Notice that I am now passing through all columns.
        # If your code does not handle normalizing categorical columns, do so now (just return the unchanged column)
        df = fix_outliers(df, c)
        df = fix_nans(df, c)
        df[c] = normalize_column(df[c])

    X, y = df.iloc[:, :4], df.iloc[:, 4]
    le = generate_label_encoder(y)

    # Be careful to return a copy of the input with the changes, instead of changing inplace the inputs here!
    y_encoded = replace_with_label_encoder(y.to_frame(), column='species', le=le)
    rf = simple_random_forest_classifier(X, y_encoded['species'])

    '''
    !!Explanation!!
    Both the classifier in this function and the one in the previous function yield about the same score on average.
    I believe this is because the two datasets are essentially the same at this point:
    they both have label encoded classes.
    The only differences are that this function removed nans and outliers, which the dataset does not possess
    many of anyway, and normalized the data, which, from my understanding, does not change the values
    relative to one another. The normalization may just make the model in this function more efficient!
    Due to this potential efficiency boost from normalization, I would choose this function's model over the previous one.
    '''
    print(rf['accuracy'])
    return rf
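
# The comment inside the loop above asks normalize_column to pass categorical columns through
# unchanged. A minimal min-max sketch under that assumption (the actual assignment1
# implementation may differ):
import pandas as pd


def normalize_column_sketch(column: pd.Series) -> pd.Series:
    # categorical/text columns are returned unchanged, as required above
    if not pd.api.types.is_numeric_dtype(column):
        return column
    # min-max scaling squashes values into [0, 1] without changing their ordering,
    # which is why the comments above say it should not change the meaning of the data
    return (column - column.min()) / (column.max() - column.min())
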
Example #3
def process_iris_dataset_again() -> pd.DataFrame:
    """
    Consider the example above and once again perform a preprocessing and cleaning of the iris dataset.
    This time, use normalization for the numeric columns and use label_encoder for the categorical column.
    Also, for this example, consider that all petal_widths should be between 0.0 and 1.0; replace the wrong_values
    of that column with the mean of that column. Also include a new (binary) column called "large_sepal_length"
    saying whether the row's sepal_length is larger (true) or not (false) than 5.0
    :return: A dataframe with the above conditions.
    """
    print("loading dataset...")
    df = read_dataset(Path('..', '..', 'iris.csv'))

    """
    thought a comment here may be useful as the ordering of this answer differs from that of the question
    I chose to implement the large_sepal_length first, because we need to look at sepal lengths
    before they are normalized
    this is because we need to analyse their true length values
    rather than values that are between 0 and 1
    """
    print("generating large sepal length column...")
    # a vectorized comparison is equivalent to looping over the rows here
    df["large_sepal_length"] = df["sepal_length"] > 5.0

    """
    I then chose to replace petal_widths missing values with their means before normalization
    as after normalization, all values will be between 0 and 1 regardless
    Once again, I think it's important to take a look at the true values here
    I also take the mean of the petal_widths are setting values > 1 or < 0 to np.nan
    this ensures the true values to be between 0 and 1, those which were np.nan will now be the mean
    """
    print("fixing wrong and missing petal_width values (replacing them w/ mean)...")
    df = fix_numeric_wrong_values(df, "petal_width", WrongValueNumericRule.MUST_BE_LESS_THAN, 1)
    df = fix_numeric_wrong_values(df, "petal_width", WrongValueNumericRule.MUST_BE_GREATER_THAN, 0)
    mean = get_column_mean(df, "petal_width")
    missing = df["petal_width"].isnull()
    df.at[missing, "petal_width"] = mean

    # normalization and encoding now goes along as normal
    numeric_columns = get_numeric_columns(df)
    categorical_columns = get_text_categorical_columns(df)

    print("normalizing numeric columns...")
    for nc in numeric_columns:
        df.loc[:, nc] = normalize_column(df.loc[:, nc])

    print("label encoding categorical columns...")
    for cc in categorical_columns:
        le = generate_label_encoder(df.loc[:, cc])
        df = replace_with_label_encoder(df, cc, le)

    print("done :)")
    print(df)
    return df
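
# The three-step petal_width fix above (two fix_numeric_wrong_values calls, then a mean fill)
# can be expressed directly in pandas. This hypothetical helper is equivalent under the
# assumption that fix_numeric_wrong_values sets rule-violating values to np.nan:
import pandas as pd


def clamp_to_range_with_mean(s: pd.Series, low: float, high: float) -> pd.Series:
    # out-of-range values become NaN, then every NaN is filled with the in-range mean
    s = s.where(s.between(low, high))
    return s.fillna(s.mean())
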
Example #4
def process_amazon_video_game_dataset():
    """
    Now use the rating_Video_Games dataset following these rules:
    1. The rating has to be between 1.0 and 5.0
    2. Time should be converted from milliseconds to datetime.datetime format
    3. For the future use of this data, I don't care about who voted what, I only want the average rating per product,
        therefore replace the user column by counting how many ratings each product had (which should be a column called count),
        and the average rating (as the "review" column).
    :return: A dataframe with the above conditions. The columns at the end should be: asin,review,time,count
    """
    print("loading dataset...")
    df = read_dataset(Path('..', '..', 'ratings_Video_Games.csv'))

    print("fixing wrong values in review columns (replacing them with mean)...")
    # used the same logic as making sure the petal_widths were between 0.0 and 1.0
    df = fix_numeric_wrong_values(df, "review", WrongValueNumericRule.MUST_BE_LESS_THAN, 5.0)
    df = fix_numeric_wrong_values(df, "review", WrongValueNumericRule.MUST_BE_GREATER_THAN, 1.0)
    mean = get_column_mean(df, "review")
    missing = df["review"].isnull()
    df.at[missing, "review"] = mean

    """"
    CITATION
    this CMSDK post from July 2019 helped with this method
    I fully acknowledge and give credit to them for the code found from this article
    The article can be found at the link below
    https://cmsdk.com/python/pandas-converting-row-with-unix-timestamp-in-milliseconds-to-datetime.html
    """
    print("converting milliseconds to datetime in time column...")
    df['time'] = pd.to_datetime(df['time'], unit='ms')

    """"
    CITATION
    this Stack Overflow post helped with this method
    the user's name was 'unutbu'
    I fully acknowledge and give credit to them for the code found in their section of the article
    The article can be found at the link below
    https://stackoverflow.com/questions/17709270/create-column-of-value-counts-in-pandas-dataframe
    """
    print("replacing user count with review count column...")
    df['user'] = df.groupby(['asin'])['user'].transform('count')
    df.rename(columns={'user': 'count'}, inplace=True)

    print("replacing review with average review per product...")
    # here I assume the average rating is the sum of the product's ratings divided by the rating count
    df['review'] = df.groupby(['asin'])['review'].transform('sum') / df.groupby(['asin'])['review'].transform('count')

    print("done :)")
    #print(df)
    # for the new df, I figured I would not combine rows w/ the same asin value because the time was unique

    return df
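
# A quick check of the groupby/transform pattern used above, on a tiny frame; note that
# transform('mean') is equivalent to the sum/count division used in the function:
import pandas as pd

toy = pd.DataFrame({'asin': ['a', 'a', 'b'], 'review': [4.0, 2.0, 5.0]})
toy['count'] = toy.groupby('asin')['review'].transform('count')  # -> 2, 2, 1
toy['review'] = toy.groupby('asin')['review'].transform('mean')  # -> 3.0, 3.0, 5.0
print(toy)
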
Example #5
def process_amazon_video_game_dataset_again():
    """
    Now use the rating_Video_Games dataset following these rules (the third rule changed, and is more open-ended):
    1. The rating has to be between 1.0 and 5.0, drop any rows not following this rule
    2. Time should be converted from milliseconds to datetime.datetime format
    3. For the future use of this data, I just want to know more about the users, therefore show me how many reviews each user has,
        and a statistical analysis of each user (average, median, std, etc..., each as its own row)
    :return: A dataframe with the above conditions.
    """
    print("loading dataset...")
    df = read_dataset(Path('..', '..', 'ratings_Video_Games.csv'))

    print("dropping 'wrong value' rows...")
    # used the same logic as making sure the petal_widths were between 0.0 and 1.0
    df = fix_numeric_wrong_values(df, "review", WrongValueNumericRule.MUST_BE_LESS_THAN, 5.0)
    df = fix_numeric_wrong_values(df, "review", WrongValueNumericRule.MUST_BE_GREATER_THAN, 1.0)
    df = df[df['review'].notnull()]

    """"
    CITATION
    this CMSDK post from July 2019 helped with this method
    I fully acknowledge and give credit to them for the code found from this article
    The article can be found at the link below
    https://cmsdk.com/python/pandas-converting-row-with-unix-timestamp-in-milliseconds-to-datetime.html
    """

    print("converting milliseconds to datetime in time column...")
    df['time'] = pd.to_datetime(df['time'], unit='ms')

    """"
    CITATION
    this Pandas Documentation for the groupby function really helped
    I'm not quite sure if I need to cite documentation but I thought I would to be safe -
    as this specific page helped me a lot
    I fully acknowledge and give credit to them for the code found from this article
    The documentation can be found at the link below
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.core.groupby.DataFrameGroupBy.transform.html?highlight=transform#pandas.core.groupby.DataFrameGroupBy.transform
    """
    print("creating columns for statistical analysis...")
    # the number of reviews the user has made
    df['user_count'] = df.groupby(['user'])['review'].transform('count')
    # the user's average review score
    df['user_average'] = df.groupby(['user'])['review'].transform('sum') / df.groupby(['user'])['review'].transform('count')
    # the user's median review score
    df['user_median'] = df.groupby(['user'])['review'].transform('median')
    # the std dev of the user's review scores
    df['user_std'] = df.groupby(['user'])['review'].transform('std')

    print("done :)")
    print(df)
    return df
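
# Rule 3's "each as its own row" is ambiguous; an alternative reading is one row per user with
# one column per statistic, which a single aggregation produces. A sketch of that variant:
import pandas as pd


def user_review_stats_sketch(df: pd.DataFrame) -> pd.DataFrame:
    # count, average, median and std of the review scores, one row per user
    return df.groupby('user')['review'].agg(['count', 'mean', 'median', 'std'])
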
Example #6
def reusing_code_random_forest_on_iris() -> Dict:
    """
    Again I will run a regression on the iris dataset, but reusing
    the existing code from assignment1. I am also including the species column as a one_hot_encoded
    value for the prediction. Use this to check how different the results are (score and
    predictions).
    """
    df = read_dataset(Path('..', '..', 'iris.csv'))
    for c in list(df.columns):
        df = fix_outliers(df, c)
        df = fix_nans(df, c)
        df[c] = normalize_column(df[c])

    ohe = generate_one_hot_encoder(df['species'])
    df = replace_with_one_hot_encoder(df, 'species', ohe, list(ohe.get_feature_names()))

    X, y = df.iloc[:, 1:], df.iloc[:, 0]
    return simple_random_forest_regressor(X, y)
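
# generate_one_hot_encoder and replace_with_one_hot_encoder also come from assignment1. A sketch
# consistent with how they are called above; the get_feature_names call suggests an older
# scikit-learn (newer releases renamed it to get_feature_names_out and sparse to sparse_output):
from typing import List

import pandas as pd
from sklearn.preprocessing import OneHotEncoder


def generate_one_hot_encoder_sketch(column: pd.Series) -> OneHotEncoder:
    # the encoder is fitted on a single-column frame so it learns that column's categories
    return OneHotEncoder(sparse=False).fit(column.to_frame())


def replace_with_one_hot_encoder_sketch(df: pd.DataFrame, column: str, ohe: OneHotEncoder,
                                        ohe_column_names: List[str]) -> pd.DataFrame:
    # return a copy where the original column is replaced by one binary column per category
    encoded = pd.DataFrame(ohe.transform(df[[column]]), columns=ohe_column_names, index=df.index)
    return pd.concat([df.drop(columns=[column]), encoded], axis=1)
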
Example #7
def process_iris_dataset() -> pd.DataFrame:
    """
    In this example, I call the methods you should have implemented in the other files
    to read and preprocess the iris dataset. This dataset is simple, and only has 4 columns:
    three numeric and one categorical. Depending on what I want to do in the future, I may want
    to transform these columns in other things (for example, I could transform a numeric column
    into a categorical one by splitting the number into bins, similar to how a histogram creates bins
    to be shown as a bar chart).

    In my case, what I want to do is to *remove missing numbers*, replacing them with valid ones,
    and *delete outlier rows* altogether (I could have decided to do something else, and this decision
    will be on you depending on what you'll do with the data afterwards, e.g. what machine learning
    algorithm you'll use). I will also standardize the numeric columns, create a new column with the average
    distance between the numeric columns and convert the categorical column to a onehot-encoding format.

    :return: A dataframe with no missing values, no outliers and onehotencoded categorical columns
    """
    print("loading datasets from csv files...")
    df = read_dataset(Path('..', '..', 'iris.csv'))
    numeric_columns = get_numeric_columns(df)
    categorical_columns = get_text_categorical_columns(df)
    print("fixing missing values, outliers, and standardizing data...")
    for nc in numeric_columns:
        df = fix_outliers(df, nc)
        df = fix_nans(df, nc)
        df.loc[:, nc] = standardize_column(df.loc[:, nc])
    print("calculating numeric distances...")
    distances = pd.DataFrame()
    for nc_combination in list(itertools.combinations(numeric_columns, 2)):
        distances[str(nc_combination)] = calculate_numeric_distance(df.loc[:, nc_combination[0]],
                                                                    df.loc[:, nc_combination[1]],
                                                                    DistanceMetric.EUCLIDEAN).values
    print("calculating mean...")
    df['numeric_mean'] = distances.mean(axis=1)
    print("one hot encoding categorical columns...")
    for cc in categorical_columns:
        ohe = generate_one_hot_encoder(df.loc[:, cc])
        df = replace_with_one_hot_encoder(df, cc, ohe, list(ohe.get_feature_names()))
    print("done :)")
    print(df)
    return df
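
# calculate_numeric_distance is assumed here to return a per-row distance between two paired
# columns; for the Euclidean metric on scalar pairs that reduces to the absolute difference.
# A sketch under that assumption:
import numpy as np
import pandas as pd


def calculate_numeric_distance_sketch(a: pd.Series, b: pd.Series) -> pd.Series:
    # per-row Euclidean distance between paired scalars: sqrt((a - b)^2) == |a - b|
    return np.sqrt((a - b) ** 2)
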
Example #8
def reusing_code_random_forest_on_iris() -> Dict:
    """
    Again I will run a classification on the iris dataset, but reusing
    the existing code from assignment1. Use this to check how different the results are (score and
    predictions).
    """
    df = read_dataset(Path('..', '..', 'iris.csv'))
    for c in list(df.columns):
        # Notice that I am now passing through all columns.
        # If your code does not handle normalizing categorical columns, do so now (just return the unchanged column)
        df = fix_outliers(df, c)
        df = fix_nans(df, c)
        df[c] = normalize_column(df[c])

    X, y = df.iloc[:, :4], df.iloc[:, 4]
    le = generate_label_encoder(y)

    # Be careful to return a copy of the input with the changes, instead of changing inplace the inputs here!
    y_encoded = replace_with_label_encoder(y.to_frame(),
                                           column='species',
                                           le=le)
    return simple_random_forest_classifier(X, y_encoded['species'])
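
# Likewise, a sketch of the label-encoder helpers, honouring the "return a copy" warning in the
# comment above (names and behaviour are assumptions based on how they are called):
import pandas as pd
from sklearn.preprocessing import LabelEncoder


def generate_label_encoder_sketch(column: pd.Series) -> LabelEncoder:
    return LabelEncoder().fit(column)


def replace_with_label_encoder_sketch(df: pd.DataFrame, column: str, le: LabelEncoder) -> pd.DataFrame:
    # copy first so the caller's dataframe is left untouched
    df = df.copy()
    df[column] = le.transform(df[column])
    return df
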
Example #9
def dash_task():
    """
    There is only one task to do, a web app with:
    1. Some nice title
    2. One visualization placeholder for dataset visualization
        a. A dropdown to allow me to select which dataset I want to see (iris, video_game and life_expectancy)
        b. Two other dropdowns for me to choose what column to put in x and what column to put in y of the visualization
        c. Another dropdown for me to choose what type of graph I want (see examples in file a_) (at least 3 choices of graphs)
        d. Feel free to change the structure of the dataset if you prefer (e.g. change life_expectancy so that
            there is one column of "year", one for "country" and one for "value")
    3. A https://dash-bootstrap-components.opensource.faculty.ai/docs/components/card/ with the number of rows being shown on the above graph
    4. Another visualization with:
        a. It will contain the figure created in the tasks in a_, b_ or c_ related to plotly's figures
        b. Add a dropdown for me to choose among 3 (or more if you wish) different graphs from a_, b_ or c_ (choose the ones you like)
        c. In this visualization, if I select data in the visualization, update some text in the page (can be a new bootstrap card with text inside)
            with the number of values selected. (see https://dash.plotly.com/interactive-graphing for examples)
    """
    '''
    My Dash app is coded below!
    The dropdowns allow you to choose the dataset, x, and y to be displayed in a Line, Bar, or Scatter plot.
    The number of rows is then displayed in a card.
    The next section allows for the selection of a Map, Treemap, or Data Table from file b_.
    The type of visualization is then displayed in another card.

    The loading times are very dependent on the preprocessing of the data, so that holds everything up a little bit.
    The map, treemap, and data table also all take a little while to load, so you'll have to wait around 7
    seconds after selecting a new visualization.
    '''
    '''
    The App
    '''
    app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
    '''
    The Data
    I also gather all of the column names here for the dropdowns.
    '''
    iris_data = read_dataset(Path('..', '..', 'iris.csv'))
    cols = get_numeric_columns(iris_data)
    iris_data = iris_data[cols]

    iris_choices = [{'label': col, 'value': col} for col in iris_data.columns]

    video_game_data = process_amazon_video_game_dataset()
    # drop duplicates for efficiency reasons
    video_game_data = video_game_data.drop_duplicates(subset=['asin'])
    cols = get_numeric_columns(video_game_data)
    video_game_data = video_game_data[cols]

    video_game_choices = [{'label': col, 'value': col} for col in video_game_data.columns]

    expect_data = process_life_expectancy_dataset()
    cols = get_numeric_columns(expect_data)
    expect_data = expect_data[cols]

    expect_choices = [{'label': col, 'value': col} for col in expect_data.columns]
    '''
    The HTML 
    '''

    app.layout = dbc.Container([
        # LAY OUT FOR PART 1
        html.H1(children='What a nice title'),
        html.Div(
            children='The following are the containers required for this task.'
        ),
        html.Hr(),
        dbc.FormGroup([
            dbc.Label('Choose Dataset'),
            dcc.Dropdown(id='choose_dataset',
                         value='iris',
                         options=[{
                             'label': 'Iris',
                             'value': 'iris'
                         }, {
                             'label': 'Video Game',
                             'value': 'video_game'
                         }, {
                             'label': 'Expectancy',
                             'value': 'expect'
                         }],
                         clearable=False,
                         searchable=False),
        ]),
        dbc.FormGroup([
            dbc.Label('Choose X'),
            dcc.Dropdown(id='choose_x',
                         value='sepal_length',
                         options=[{
                             'label': 'Hold',
                             'value': 1
                         }],
                         clearable=False,
                         searchable=False),
        ]),
        dbc.FormGroup([
            dbc.Label('Choose Y'),
            dcc.Dropdown(id='choose_y',
                         value='sepal_width',
                         options=[{
                             'label': 'Hold',
                             'value': 1
                         }],
                         clearable=False,
                         searchable=False),
        ]),
        dbc.FormGroup([
            dbc.Label('Choose Graph'),
            dcc.Dropdown(id='choose_graph',
                         value='scatter',
                         options=[{
                             'label': 'Line',
                             'value': 'line'
                         }, {
                             'label': 'Bar',
                             'value': 'bar'
                         }, {
                             'label': 'Scatter',
                             'value': 'scatter'
                         }],
                         clearable=False,
                         searchable=False),
        ]),
        dbc.Row([
            dbc.Col(dcc.Graph(id='first_graph')),
        ]),
        # LAYOUT FOR PART 2
        html.Div([
            dbc.Card(
                dbc.CardBody([
                    html.H4("Number of Rows"),
                    html.P(id='number_of_rows', children={}),
                ]), ),
        ]),

        # LAYOUT FOR PART 3
        dbc.FormGroup([
            dbc.Label('Choose Figure'),
            dcc.Dropdown(id='choose_second_graph',
                         value='map',
                         options=[{
                             'label': 'Plotly Map from b_',
                             'value': 'map'
                         }, {
                             'label': 'Plotly Treemap from b_',
                             'value': 'treemap'
                         }, {
                             'label': 'Plotly Table from b_',
                             'value': 'table'
                         }],
                         clearable=False,
                         searchable=False),
        ]),
        dbc.Row([
            dbc.Col(dcc.Graph(id='second_graph')),
        ]),
        html.Div([
            dbc.Card(
                dbc.CardBody([
                    html.H4("Number of Values Selected"),
                    html.P(id='number_of_values', children={}),
                ]), ),
        ]),
    ])
    '''
    The Callback Functions
    '''
    '''
    First Callback Function
    Depending on the chosen dataset, the x and y selection dropdowns fill up with columns from that dataset.
    '''
    @app.callback([
        Output(component_id='choose_x', component_property='options'),
        Output(component_id='choose_y', component_property='options'),
        Output(component_id='choose_x', component_property='value'),
        Output(component_id='choose_y', component_property='value')
    ], [Input(component_id='choose_dataset', component_property='value')])
    def x_y(dataset_choice):

        if dataset_choice == 'iris':
            x = iris_choices
            y = iris_choices
            val_x = iris_choices[0]['value']
            val_y = iris_choices[1]['value']
            return x, y, val_x, val_y
        elif dataset_choice == 'video_game':
            x = video_game_choices
            y = video_game_choices
            val_x = video_game_choices[0]['value']
            val_y = video_game_choices[1]['value']
            return x, y, val_x, val_y
        elif dataset_choice == 'expect':
            x = expect_choices
            y = expect_choices
            val_x = expect_choices[0]['value']
            val_y = expect_choices[1]['value']
            return x, y, val_x, val_y

    '''
    Second Callback Function
    Outputs the figure when you select the x and y values as well as the graph type
    Will also update the card showing the number of rows
    '''

    @app.callback([
        Output(component_id='first_graph', component_property='figure'),
        Output(component_id='number_of_rows', component_property='children')
    ], [
        Input(component_id='choose_dataset', component_property='value'),
        Input(component_id='choose_x', component_property='value'),
        Input(component_id='choose_y', component_property='value'),
        Input(component_id='choose_graph', component_property='value')
    ])
    def update_graph(dataset_choice, x_choice, y_choice, graph_choice):
        if dataset_choice == 'iris':
            if graph_choice == 'bar':
                return px.bar(iris_data, x=x_choice,
                              y=y_choice), 'Rows: ' + str(iris_data.shape[0])
            elif graph_choice == 'line':
                return px.line(iris_data, x=x_choice,
                               y=y_choice), 'Rows: ' + str(iris_data.shape[0])
            elif graph_choice == 'scatter':
                return px.scatter(
                    iris_data, x=x_choice,
                    y=y_choice), 'Rows: ' + str(iris_data.shape[0])
        elif dataset_choice == 'video_game':
            if graph_choice == 'bar':
                return px.bar(
                    video_game_data, x=x_choice,
                    y=y_choice), 'Rows: ' + str(video_game_data.shape[0])
            elif graph_choice == 'line':
                return px.line(
                    video_game_data, x=x_choice,
                    y=y_choice), 'Rows: ' + str(video_game_data.shape[0])
            elif graph_choice == 'scatter':
                return px.scatter(
                    video_game_data, x=x_choice,
                    y=y_choice), 'Rows: ' + str(video_game_data.shape[0])
        elif dataset_choice == 'expect':
            if graph_choice == 'bar':
                return px.bar(expect_data, x=x_choice,
                              y=y_choice), 'Rows: ' + str(expect_data.shape[0])
            elif graph_choice == 'line':
                return px.line(
                    expect_data, x=x_choice,
                    y=y_choice), 'Rows: ' + str(expect_data.shape[0])
            elif graph_choice == 'scatter':
                return px.scatter(
                    expect_data, x=x_choice,
                    y=y_choice), 'Rows: ' + str(expect_data.shape[0])
        else:
            return px.bar(iris_data, x='sepal_length',
                          y='sepal_width'), 'Rows: ' + str(iris_data.shape[0])

    '''
    Third Callback Function
    Calls the function from file b_ depending on the selected visualization and outputs it as a figure
    Will also update the card showing which type of visualization was selected
    '''

    @app.callback([
        Output(component_id='second_graph', component_property='figure'),
        Output(component_id='number_of_values', component_property='children')
    ], Input(component_id='choose_second_graph', component_property='value'))
    def update_second(choose_graph):
        if choose_graph == 'map':
            return plotly_map(), 'Map from b_ using life_expectancy data set.'
        elif choose_graph == 'treemap':
            return plotly_tree_map(
            ), 'Tree map from b_ using life_expectancy data set.'
        elif choose_graph == 'table':
            return plotly_table(
            ), 'Data Table from b_ using life_expectancy data set.'

    return app
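
# dash_task returns the app without starting it; a typical way to serve it in the Dash versions
# this code targets (run_server was the standard entry point; newer Dash also offers app.run):
if __name__ == "__main__":
    dash_app = dash_task()
    dash_app.run_server(debug=True)  # serves on http://127.0.0.1:8050 by default
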
Example #10
    string_df = df[df.columns[query]]
    # print("Categorical Data Types (in df): " + str(cols))
    return list(string_df)


def get_correlation_between_columns(df: pd.DataFrame, col1: str,
                                    col2: str) -> float:
    """
    Calculate and return the pearson correlation between two columns
    """
    # print("Correlation between " + col1 + " and " + col2 + ": " + str(df[col1].corr(df[col2])))
    cor = df[col1].corr(df[col2])
    print(cor)
    return cor


if __name__ == "__main__":
    df = read_dataset(Path('..', '..', 'iris.csv'))
    #a = pandas_profile(df)
    #assert get_column_max(df, df.columns[0]) is not None
    #assert get_column_min(df, df.columns[0]) is not None
    #assert get_column_mean(df, df.columns[0]) is not None
    #assert get_column_count_of_nan(df, df.columns[0]) is not None
    #assert get_column_number_of_duplicates(df, df.columns[0]) is not None
    #assert get_numeric_columns(df) is not None
    #assert get_binary_columns(df) is not None
    #assert get_text_categorical_columns(df) is not None
    assert get_correlation_between_columns(df, df.columns[0],
                                           df.columns[1]) is not None
    print("ok")
Example #11
def dash_task():
    """
    There is only one task to do, a web app with:
    1. Some nice title
    2. One visualization placeholder for dataset visualization
        a. A dropdown to allow me to select which dataset I want to see (iris, video_game and life_expectancy)
        b. Two other dropdowns for me to choose what column to put in x and what column to put in y of the visualization
        c. Another dropdown for me to choose what type of graph I want (see examples in file a_) (at least 3 choices of graphs)
        d. Feel free to change the structure of the dataset if you prefer (e.g. change life_expectancy so that
            there is one column of "year", one for "country" and one for "value")
    3. A https://dash-bootstrap-components.opensource.faculty.ai/docs/components/card/ with the number of rows being shown on the above graph
    4. Another visualization with:
        a. It will contain the figure created in the tasks in a_, b_ or c_ related to plotly's figures
        b. Add a dropdown for me to choose among 3 (or more if you wish) different graphs from a_, b_ or c_ (choose the ones you like)
        c. In this visualization, if I select data in the visualization, update some text in the page (can be a new bootstrap card with text inside)
            with the number of values selected. (see https://dash.plotly.com/interactive-graphing for examples)
    """

    # Step-0 : Creating a new Dash Application Object with BootStrap properties
    app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
    iris_df = read_dataset(Path("..", "..", "iris.csv"))
    iris_columns = list(iris_df.columns)

    video_game_df = read_dataset(Path("..", "..", "ratings_Video_Games.csv"))
    video_game_columns = list(video_game_df.columns)

    life_expectancy_df = process_life_expectancy_dataset()
    life_expectancy_columns = list(life_expectancy_df.columns)

    # Step-1 : Giving a Nice Title
    app.layout = dbc.Container([
        html.Center([
            html.H1(children='Dash Plots', style={'color': 'brown'}),
            html.Hr(),
        ]),
        dbc.Row([
            dbc.Col(
                dbc.FormGroup([
                    # Creating a dropdown for selecting the required dataset
                    dbc.Label("Choose Your Dataset"),
                    dcc.Dropdown(id='dataset_selector_dropdown',
                                 options=[{
                                     'label': 'Iris Dataset',
                                     'value': 'iris'
                                 }, {
                                     'label': 'Video Game Dataset',
                                     'value': 'video_game'
                                 }, {
                                     'label': 'Life Expectancy Dataset',
                                     'value': 'life_expectancy'
                                 }],
                                 value='iris',
                                 style={'width': '250px'})
                ])),
            # Creating a dropdown for selecting X axis. Initially, it will be blank.
            # On Selecting a dataset from the aforementioned dropdown, the options will be
            # seen through the callback function : "setXandY()" declared below
            dbc.Col(
                dbc.FormGroup([
                    dbc.Label("Select Column for X axis"),
                    dcc.Dropdown(id='x_selector_dropdown',
                                 style={'width': '160px'})
                ])),
            # Creating a dropdown for selecting Y axis. Initially, it will be blank.
            # On Selecting a dataset from the aforementioned dropdown, the options will be
            # seen through the callback function : "setXandY()" declared below
            dbc.Col(
                dbc.FormGroup([
                    dbc.Label("Select Column for Y axis"),
                    dcc.Dropdown(id='y_selector_dropdown',
                                 style={'width': '160px'})
                ])),

            # Creating a dropdown for selecting the type of graph to be plotted
            dbc.Col(
                dbc.FormGroup([
                    dbc.Label("Choose Graph Type"),
                    dcc.Dropdown(id='graph_type_selector_dropdown',
                                 options=[{
                                     'label': 'Scatter Plot',
                                     'value': 'scatter'
                                 }, {
                                     'label': 'Histogram Plot',
                                     'value': 'histogram'
                                 }, {
                                     'label': 'Polar Plot',
                                     'value': 'polar'
                                 }],
                                 value='scatter',
                                 style={'width': '160px'})
                ])),
        ]),
        # Creating a button on click of which the Dash program will display the required plot
        dbc.Row(
            dbc.Button('Generate Plot',
                       id='button_for_generating_plot',
                       style={'width': '100%'},
                       color='primary')),

        # Creating the first graph component
        html.Center([dbc.Col(dcc.Graph(id='first_plot'))]),

        # Creating the card component for counting the scanned rows
        html.Center([
            dbc.Col(
                dbc.Card([
                    dbc.CardBody([
                        html.H4(children=['Number of rows read : 0'],
                                id='card_text')
                    ])
                ],
                         style={"width": "18rem"}))
        ]),
        html.Br(),
        # Creating a dropdown for selecting one visualization out of 3 from a_, b_ or c_ files
        dbc.FormGroup([
            dbc.Label("Choose a Visualization"),
            dcc.Dropdown(id='second_plot_selector_dropdown',
                         options=[{
                             'label': 'Pie Chart From File A',
                             'value': 'a_pie'
                         }, {
                             'label': 'Polar Scatter Plot From File B',
                             'value': 'b_polar'
                         }, {
                             'label': 'Cluster Plot From File C',
                             'value': 'c_clusters'
                         }],
                         value='c_clusters',
                         style={'width': '100%'})
        ]),
        html.Br(),
        html.Center([dcc.Graph(id='second_plot')]),

        # Creating another Card for viewing the selected data from the second plot
        html.Center([
            dbc.Col(
                dbc.Card([
                    dbc.CardBody([
                        html.H4(children=['No Data Selected'],
                                id='second_card_text')
                    ])
                ],
                         style={"width": "18rem"}))
        ])
    ])

    # The callback function below sets the X and Y dropdown options on selection of dataset selection
    @app.callback([
        Output('x_selector_dropdown', 'options'),
        Output('x_selector_dropdown', 'value'),
        Output('y_selector_dropdown', 'options'),
        Output('y_selector_dropdown', 'value')
    ], [Input('dataset_selector_dropdown', 'value')])
    def setXandY(dropdownValue):

        if dropdownValue == 'iris':
            required_options = [{'label': c, 'value': c} for c in iris_columns]
            default_value = iris_columns[0]
        elif dropdownValue == 'video_game':
            required_options = [{'label': c, 'value': c} for c in video_game_columns]
            default_value = video_game_columns[0]
        elif dropdownValue == 'life_expectancy':
            required_options = [{'label': c, 'value': c} for c in life_expectancy_columns]
            default_value = life_expectancy_columns[0]
        else:
            required_options, default_value = [], ''

        return required_options, default_value, required_options, default_value

    # Callback function for plotting the first plot and updating the first card
    @app.callback(
        [Output('first_plot', 'figure'),
         Output('card_text', 'children')], [
             Input('button_for_generating_plot', 'n_clicks'),
         ], [
             State('dataset_selector_dropdown', 'value'),
             State('x_selector_dropdown', 'value'),
             State('y_selector_dropdown', 'value'),
             State('graph_type_selector_dropdown', 'value')
         ])
    def plot_first_graph(n_clicks, dataset_name, x_column, y_column,
                         graph_type):

        if n_clicks is None:
            return go.Figure(), 'Rows Read : 0'

        if dataset_name == 'iris':
            df = iris_df
            rows_count = len(iris_df)
        elif dataset_name == 'video_game':
            df = video_game_df
            rows_count = len(video_game_df)
        elif dataset_name == 'life_expectancy':
            df = life_expectancy_df
            rows_count = len(life_expectancy_df)
        else:
            # unknown dataset choice: fall back to an empty figure
            return go.Figure(), 'Rows Read : 0'

        categorical_cols = get_text_categorical_columns(df)

        if x_column in categorical_cols:
            le = generate_label_encoder(df[x_column])
            df = replace_with_label_encoder(df, x_column, le)

        if y_column in categorical_cols:
            le = generate_label_encoder(df[y_column])
            df = replace_with_label_encoder(df, y_column, le)

        if graph_type == 'scatter':
            first_figure = px.scatter(df, x=x_column, y=y_column)

        elif graph_type == 'histogram':
            first_figure = px.histogram(df, x=x_column, color=y_column)

        elif graph_type == 'polar':
            first_figure = px.scatter_polar(df, r=x_column, theta=y_column)

        else:
            first_figure = None

        final_rows_call = 'Rows Read : ' + str(rows_count)

        return first_figure, final_rows_call

    # Callback function for plotting the second plot
    @app.callback(Output('second_plot', 'figure'),
                  [Input('second_plot_selector_dropdown', 'value')])
    def generate_second_plot(plot_name):

        if plot_name == 'a_pie':
            x = np.random.rand(50) * np.random.randint(-10, 10)
            y = np.random.rand(50) * np.random.randint(-10, 10)
            df = pd.DataFrame(dict(x=x, y=y, z=x + y))
            fig = plotly_pie_chart(df)

        elif plot_name == 'b_polar':
            fig = plotly_polar_scatterplot_chart()

        elif plot_name == 'c_clusters':
            fig = plotly_interactivity()

        else:
            fig = None

        return fig

    # Callback function for updating the second card. Select the 'Lasso Select' tool, then drag to make a selection
    @app.callback(Output('second_card_text', 'children'),
                  Input('second_plot', 'selectedData'))
    def act_on_selecting_data(selectedData):
        print(json.dumps(selectedData, indent=2))
        print(type(selectedData))
        return json.dumps(selectedData, indent=2)

    return app
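
# The last callback above dumps the raw selectedData JSON; to show just the number of selected
# values, as the task asks, one could count the 'points' list instead (that key is the structure
# Dash documents for selectedData at https://dash.plotly.com/interactive-graphing):
def count_selected_points(selectedData) -> str:
    # selectedData is None until a selection is made; 'points' holds one entry per selected point
    if not selectedData:
        return 'No Data Selected'
    return 'Number of values selected: %d' % len(selectedData.get('points', []))
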
Example #12
def process_life_expectancy_dataset():
    """
    Now use the life_expectancy_years and geography datasets following these rules:
    1. The life expectancy dataset has missing values and outliers. Fix them.
    2. The geography dataset has problems with unicode letters. Make sure your code is handling it properly.
    3. Change the format of life expectancy, so that instead of one row with all 28 years, the data has 28 rows, one for each year,
        and with a column "year" with the year and a column "value" with the original value
    4. Merge (or more specifically, join) the two datasets with the common column being the country name (be careful with wrong values here)
    5. Drop all columns except country, continent, year, value and latitude (in this hypothetical example, we wish to analyse differences
        between southern and northern hemisphere)
    6. Change the latitude column from numerical to categorical (north vs south) and pass it through a label_encoder
    7. Change the continent column to a one_hot_encoder version of it
    :return: A dataframe with the above conditions.
    """
    print("loading datasets from csv files...")
    expectancy = read_dataset(Path('..', '..', 'life_expectancy_years.csv'))
    geography = read_dataset(Path('..', '..', 'geography.csv'))
    # for each column, I've decided to set outliers and missing values to the mean
    num_cols = get_numeric_columns(expectancy)

    print("fixing missing values and outliers in expectancy...")
    for i in range(len(num_cols)):
        # set outliers to nan
        expectancy = fix_outliers(expectancy, num_cols[i])
        # anything less than 5 is treated as a wrong value and set to nan, to be filled with the mean
        expectancy = fix_numeric_wrong_values(expectancy, num_cols[i], WrongValueNumericRule.MUST_BE_GREATER_THAN, 5)
        mean = get_column_mean(expectancy, num_cols[i])
        missing = expectancy[num_cols[i]].isnull()
        expectancy.loc[missing, num_cols[i]] = mean  # .loc (not .at) is required for boolean-mask assignment

    # note: instead of only 28 years, I just use the whole dataset
    # this may take longer to process, but it may give us a better idea of the data as a whole
    # I think it's good to keep all the data as we do not have a specific future goal yet
    # (a vectorized alternative to this loop, using pd.melt, is sketched after this function)
    print("reformatting expectancy...")
    countries = []
    years = []
    expectancies = []
    for i in range(len(expectancy['country'])):
        for n in range(len(num_cols)):
            countries.append(expectancy['country'][i])
            years.append(num_cols[n])
            expectancies.append(expectancy.loc[i, num_cols[n]])
    new_expectancy = pd.DataFrame()
    new_expectancy['name'] = countries  # call the column 'name' so it will match the geography dataset
    new_expectancy['year'] = years
    new_expectancy['expectancy'] = expectancies

    print("merging new expectancy and geography...")
    merged = pd.merge(new_expectancy, geography, how="outer", on='name')

    """"
    CITATION
    this Stack Overflow post helped me with this method
    the user's name is 'cs95'
    I fully acknowledge and give credit to them for the code found in their answer
    The post can be found at the link below
    https://stackoverflow.com/questions/16616141/keep-certain-columns-in-a-pandas-dataframe-deleting-everything-else#16616454
    """

    print("dropping all columns except country, continent, year, value, and latitude...")
    keep = ['name', 'four_regions', 'year', 'expectancy', 'Latitude', 'Longitude']
    drop = merged.columns.difference(keep)
    merged = merged.drop(drop, axis=1)

    # I'm assuming negative latitude means south, as it will be south of the equator
    # and positive means north, as it will be north of the equator
    # (rows with no geography match have NaN latitude, which falls into "South" below)
    print("changing latitude to categorical and putting it through label encoder...")
    cat_lat = []
    for i in range(len(merged['Latitude'])):
        if merged.loc[i, 'Latitude'] > 0:
            cat_lat.append("North")
        else:
            cat_lat.append("South")
    merged = merged.drop('Latitude', axis=1)
    merged['latitude'] = cat_lat

    le = generate_label_encoder(merged.loc[:, 'latitude'])
    merged = replace_with_label_encoder(merged, 'latitude', le)

    # print("one hot encoding continent...")
    # ohe = generate_one_hot_encoder(merged['four_regions'])
    # merged = replace_with_one_hot_encoder(merged, 'four_regions', ohe, ohe.get_feature_names())

    print("done :)")
    #print(merged)
    return merged
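
# The nested reshaping loop in process_life_expectancy_dataset is O(countries x years); pandas
# can do the same wide-to-long reshape in one call. A sketch of the equivalent melt (row order
# differs from the loop version, which does not matter for the merge):
from typing import List

import pandas as pd


def reshape_expectancy_sketch(expectancy: pd.DataFrame, year_columns: List[str]) -> pd.DataFrame:
    # one row per (country, year) pair, matching the loop-built new_expectancy above
    long_df = expectancy.melt(id_vars=['country'], value_vars=year_columns,
                              var_name='year', value_name='expectancy')
    # the column is called 'name' so it matches the geography dataset for the merge
    return long_df.rename(columns={'country': 'name'})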