def reusing_code_random_forest_on_iris() -> Dict:
    """
    Again I will run a classification on the iris dataset, but reusing
    the existing code from assignment1. Use this to check how different the results are (score and
    predictions).
    """
    df = read_dataset(Path('..', '..', 'iris.csv'))
    for c in list(df.columns):
        # Notice that I am now passing through all columns.
        # If your code does not handle normalizing categorical columns, do so now (just return the unchanged column)
        df = fix_outliers(df, c)
        df = fix_nans(df, c)
        df[c] = normalize_column(df[c])

    X, y = df.iloc[:, :4], df.iloc[:, 4]
    le = generate_label_encoder(y)

    # Be careful to return a copy of the input with the changes, instead of changing the inputs in place here!
    y_encoded = replace_with_label_encoder(y.to_frame(), column='species', le=le)
    rf = simple_random_forest_classifier(X, y_encoded['species'])

    '''
    !!Explanation!!
    Both the classifier in this function and the one in the previous function yield roughly the same score on average.
    I believe this is because the two datasets are essentially identical at this point: both have label-encoded classes.
    The only differences are that this function removes NaNs and outliers (of which the iris dataset has very few)
    and normalizes the columns, which, to my understanding, does not change the values relative to one another.
    Normalization may simply make the model in this function more efficient, so I would choose this function's model
    over the previous one. (An illustrative sketch of this normalization appears after this function.)
    '''
    print(rf['accuracy'])
    return rf
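
# A minimal sketch (separate from the assignment helpers) illustrating the normalization point above.
# It assumes normalize_column behaves like min-max scaling; the real helper may differ.
# Rescaling a column to [0, 1] preserves the ordering and relative spacing of its values,
# which is why a tree-based model such as the random forest is largely unaffected by it.
import pandas as pd


def minmax_normalize_sketch(column: pd.Series) -> pd.Series:
    # Categorical columns are returned unchanged, as the comment in the loop above suggests.
    if not pd.api.types.is_numeric_dtype(column):
        return column
    span = column.max() - column.min()
    if span == 0:
        return column
    return (column - column.min()) / span
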
Example #2
    def plot_first_graph(n_clicks, dataset_name, x_column, y_column,
                         graph_type):

        if n_clicks is None:
            return go.Figure(), 'Rows Scanned : 0'

        if dataset_name == 'iris':
            df = iris_df
            rows_count = len(iris_df)
        elif dataset_name == 'video_game':
            df = video_game_df
            rows_count = len(video_game_df)
        elif dataset_name == 'life_expectancy':
            df = life_expectancy_df
            rows_count = len(life_expectancy_df)
        else:
            # Unknown dataset name: return early instead of continuing with df undefined.
            return go.Figure(), 'Rows Read : 0'

        categorical_cols = get_text_categorical_columns(df)

        if x_column in categorical_cols:
            le = generate_label_encoder(df[x_column])
            df = replace_with_label_encoder(df, x_column, le)

        if y_column in categorical_cols:
            le = generate_label_encoder(df[y_column])
            df = replace_with_label_encoder(df, y_column, le)

        if graph_type == 'scatter':
            first_figure = px.scatter(df, x=x_column, y=y_column)

        elif graph_type == 'histogram':
            first_figure = px.histogram(df, x=x_column, color=y_column)

        elif graph_type == 'polar':
            first_figure = px.scatter_polar(df, r=x_column, theta=y_column)

        else:
            # Fall back to an empty figure for an unrecognised graph type.
            first_figure = go.Figure()

        final_rows_call = 'Rows Read : ' + str(rows_count)

        return first_figure, final_rows_call
Example #3
def reusing_code_random_forest_on_iris() -> Dict:
    """
    Again I will run a classification on the iris dataset, but reusing
    the existing code from assignment1. Use this to check how different the results are (score and
    predictions).
    """
    df = read_dataset(Path('..', '..', 'iris.csv'))
    for c in list(df.columns):
        # Notice that I am now passing through all columns.
        # If your code does not handle normalizing categorical columns, do so now (just return the unchanged column)
        df = fix_outliers(df, c)
        df = fix_nans(df, c)
        df[c] = normalize_column(df[c])

    X, y = df.iloc[:, :4], df.iloc[:, 4]
    le = generate_label_encoder(y)

    # Be careful to return a copy of the input with the changes, instead of changing the inputs in place here!
    y_encoded = replace_with_label_encoder(y.to_frame(),
                                           column='species',
                                           le=le)
    return simple_random_forest_classifier(X, y_encoded['species'])
Example #4
def iris_clusters() -> Dict:
    """
    Let's use the iris dataset and clusterise it:
    """
    df = pd.read_csv(Path('..', '..', 'iris.csv'))
    for c in list(df.columns):
        df = fix_outliers(df, c)
        df = fix_nans(df, c)
        df[c] = normalize_column(df[c])

    # Let's generate the clusters considering only the numeric columns first
    no_species_column = simple_k_means(df.iloc[:, :4])

    ohe = generate_one_hot_encoder(df['species'])
    df_ohe = replace_with_one_hot_encoder(df, 'species', ohe,
                                          list(ohe.get_feature_names()))

    # Notice that here I have binary columns, but I am using euclidean distance to do the clustering AND score evaluation
    # This is pretty bad
    no_binary_distance_clusters = simple_k_means(df_ohe)

    # Finally, let's use just a label encoder for the species.
    # It is still bad to map the labels to numbers directly, because the distances between them do not make sense
    # (see the sketch after this function).
    le = generate_label_encoder(df['species'])
    df_le = replace_with_label_encoder(df, 'species', le)
    labeled_encoded_clusters = simple_k_means(df_le)

    # See the result for yourself:
    print(no_species_column['score'], no_binary_distance_clusters['score'],
          labeled_encoded_clusters['score'])
    ret = no_species_column
    if no_binary_distance_clusters['score'] > ret['score']:
        print('no binary distance')
        ret = no_binary_distance_clusters
    if labeled_encoded_clusters['score'] > ret['score']:
        print('labeled encoded')
        ret = labeled_encoded_clusters
    return ret
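
# A minimal sketch (separate from the assignment helpers) of why euclidean distance on a
# label-encoded species column is misleading, as noted in iris_clusters above.
# sklearn's LabelEncoder sorts classes alphabetically, so the spacing below is an artifact
# of label order rather than of any real similarity between species.
from sklearn.preprocessing import LabelEncoder


def label_distance_sketch() -> None:
    le = LabelEncoder().fit(['setosa', 'versicolor', 'virginica'])
    setosa, versicolor, virginica = le.transform(['setosa', 'versicolor', 'virginica'])
    print(abs(setosa - versicolor))  # 1 -> "setosa is closer to versicolor"
    print(abs(setosa - virginica))   # 2 -> "...and twice as far from virginica", which is meaningless
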
def train_life_expectancy() -> Dict:
    """
    Do the same as the previous task with the result of the life expectancy task of e_experimentation.
    The label column is the value column. Remember to drop columns you think are useless for
    the machine learning (say why you think so) and convert the remaining categorical columns with one_hot_encoding.
    Feel free to change your e_experimentation code (changes there will not be considered for grading
    purposes) to optimise the model (e.g. score, parameters, etc).
    """

    df = process_life_expectancy_dataset()
    '''
    !!!Explanation!!!
    The code below fixes the several NaN values that existed in the expectancy column.
    I originally intended to replace each missing expectancy with the column mean, as I did in the classification
    file, BUT that caused a large number of instances to share the same target value, which produced regressors with
    very low scores. Instead, I drop every row with a NaN expectancy and simply work with the data we have.
    You'll see that this resulted in a model with a very high score, which I discuss further in the results section.

    This also drops the instances whose year was NaN; I did not think a missing year could sensibly be imputed from
    the other values in the Year column, so I would have dropped those rows anyway.
    '''
    df['expectancy'].fillna(value=0, inplace=True)
    df = df[df['expectancy'] != 0]
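    # A brief aside: an equivalent approach would be df.dropna(subset=['expectancy']), assuming
    # no country legitimately has an expectancy of exactly 0 in the processed data.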
    print(df)

    le = generate_label_encoder(df['name'])
    df = replace_with_label_encoder(df, 'name', le)
    print(df)

    X = df
    X = X.drop(['expectancy'], axis=1)
    y = df['expectancy']

    rf = simple_random_forest_regressor(X=X, y=y)
    print(rf)

    dt = decision_tree_regressor(X=X, y=y)
    print(dt)
    '''
    !!!My Results!!!
    The models perform very well, BUT I think this may be because they are overfit: there is a real risk of train/test
    contamination in this dataset. Many instances are essentially identical except for the year value, so near-copies
    of the same row can end up in both the training and the testing split. The model then trains on one instance only
    to be tested on a nearly identical one. It is as if the model is cheating, and I do not expect it would do as well
    on new, real-life data. (A sketch of a country-grouped split that avoids this leakage appears after this function.)

    To reduce the risk of overfitting without removing any more instances, I would collect more data specific to each
    year and country. With more information about a country's demographics and conditions, the model could find
    genuine patterns for predicting life expectancy.

    To conclude my earlier observation about dropping rows: I do not think the missing expectancy values should have
    been set to the column mean, because expectancy is exactly what is being predicted, and fabricating the target
    that way would make the model misrepresent the real data. To include the dropped rows in future experiments, we
    would have to find the actual values, for example in government reports or statistical analyses for the affected
    countries; only with a complete data profile for each country would I feel comfortable adding those rows back
    into the dataset.
    '''

    if rf['score'] > dt['score']:
        print('random forest wins!')
        return rf
    else:
        print('decision tree wins!')
        return dt
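
# A minimal sketch (not one of the assignment helpers) of the leakage-aware split mentioned in the
# results above: grouping by country keeps all of a country's rows on one side of the split, so the
# model is never tested on a near-copy of a row it trained on. It assumes the processed dataframe
# still has the (label-encoded) 'name' column and the 'expectancy' target used above.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupShuffleSplit


def grouped_country_split_score(df) -> float:
    X = df.drop(columns=['expectancy'])
    y = df['expectancy']
    splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
    train_idx, test_idx = next(splitter.split(X, y, groups=df['name']))
    model = RandomForestRegressor(random_state=0)
    model.fit(X.iloc[train_idx], y.iloc[train_idx])
    return model.score(X.iloc[test_idx], y.iloc[test_idx])
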
def train_amazon_video_game() -> Dict:
    """
    Run the result of the amazon dataset task of e_experimentation using the
    decision tree regressor AND random_forest regressor. Return the one with lowest R^2.
    The Label column is the count column
    Discuss (1 sentence) what you found different between the results.
    In one sentence, why is the score different (or the same) compared to the iris score?
    Feel free to change your e_experimentation code (changes there will not be considered for grading
    purposes) to optimise the model (e.g. score, parameters, etc).
    """
    df = process_amazon_video_game_dataset()
    print(df)
    '''
    !!!Explanation!!!
    I used the same logic in the classification file: a one-hot encoding of all of the product (asin) values would
    need far too many columns, so I just label encode them instead; the encoded column still represents the same data.
    '''
    le = generate_label_encoder(df['asin'])
    df = replace_with_label_encoder(df, 'asin', le)
    '''
    !!!Explanation!!!
    Note: this is the same explanation I gave for dropping the time column in the classification file, restated here
    just in case.
    I decided to drop the time column because I do not think it correlates with the target label.
    The time only indicates how recently a user was active, and it is updated as soon as the user reviews again.
    My theory is that the model might learn to rely on when a user was last active, which could overfit the model if
    user activity is somewhat random. For example, if a user reviewed a video game that came out today, after not
    reviewing anything for 10 years, the model may fail on that user because it is biased towards the activity dates.
    Sometimes sequels come out after a very long time, as any video game fan knows, and a player might want to review
    the newest entry in a series they used to review.
    I believe the model should be able to make its prediction from the other features describing a user's rating
    behaviour, independently of time, as there are no set rules for when a user might review.
    '''
    df = df.drop('time', axis=1)

    X, y = df.iloc[:, 1:], df.iloc[:, 0]

    rf = simple_random_forest_regressor(X, y)
    print(rf)

    dt = decision_tree_regressor(X, y)
    print(dt)
    '''
    !!!My Results!!!
    Using the original data with little processing other than encoding, we get models that score around
    0.9999 on average. This may look very good, but I do not think it is realistic.
    I think there are two big issues with this dataset, both of which I discuss in the classification file,
    but they seem even more apparent here, so I will reiterate my analysis.
    Issue #1: Unique labels.
        - A unique label is a label held by exactly one instance in the dataset. This is a problem for both regression
        and classification: if that instance ends up in the test set, its label is never seen during training and the
        prediction is essentially guaranteed to be wrong.
        - A solution would be to choose better labels, ones that describe the data in a more general way. The count
        feature is problematic because it is somewhat trivial (you can just count the number of reviews for each user),
        and without knowledge of that counting pattern a prediction is largely a guess.
    Issue #2: Data duplication.
        - Although this may seem to contradict issue #1, a balanced dataset should contain a healthy mix of distinct
        instances that share the same label.
        - The issue here is that many instances are extremely similar to one another, apart from the video game being
        reviewed, and these near-duplicates can appear in both the training set AND the test set, contaminating the
        split. As I have noted throughout the assignment, the model learns an instance in training and is then tested
        on essentially the same instance, so the score misrepresents its ability to predict genuinely new data.
        - This is not as easy to fix as issue #1, but one solution is more data collection: features that are unique
        not just to the class but to the individual instances. That diversity should reduce the contamination, and
        thus the overfitting. (A small diagnostic sketch for both issues appears after this function.)
    '''
    '''
    !!!Versus the Iris Data Set!!!
    The big difference between these results should be obvious from my responses to the iris function and this one:
    there is evident overfitting on this dataset, and not much on the iris set.
    If we were to somehow standardise the two datasets relative to each other, the biggest difference is that the iris
    dataset is far more balanced and offers much less potential for train/test contamination. If the balance and
    duplication issues listed above were fixed, I believe a model could learn the Amazon dataset as well as it learns
    the iris dataset. Although the Amazon models had better scores, I believe that is just a symptom of overfitting,
    and the more balanced iris dataset actually produced the better models.
    '''

    if rf['score'] < dt['score']:
        print('random forest wins!')
        return rf
    else:
        print('decision tree wins!')
        return dt
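
# A minimal sketch (separate from the assignment helpers) of the diagnostic mentioned in the results
# above: it counts how many target values occur only once (issue #1) and how many rows become exact
# duplicates once the reviewed item is ignored (issue #2). The column names 'count' and 'asin' are
# assumptions about the processed Amazon dataframe used above.
import pandas as pd


def duplication_report(df: pd.DataFrame) -> None:
    singleton_labels = int((df['count'].value_counts() == 1).sum())
    print(f'target values that appear only once: {singleton_labels}')
    near_duplicates = int(df.drop(columns=['asin']).duplicated().sum())
    print(f'rows identical except for the reviewed item: {near_duplicates}')
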
def your_choice() -> Dict:
    """
    Now choose one of the datasets included in assignment1 (the raw one, before anything was done to it)
    and decide for yourself a set of instructions to be done (similar to the e_experimentation tasks).
    Specify your goal (e.g. analyse the reviews of the amazon dataset), say what you did to try to achieve the goal
    and use one (or both) of the models above to help you answer it. Remember that these models are classification
    models, therefore they are useful only for categorical labels.
    We will not grade your result itself, but your decision-making and suppositions given the goal you decided.
    Use this as a small exercise of what you will do in the project.
    """
    '''
    !!!My Goal!!!
    I will be using the "Geography" dataset.
    With this dataset, I want to find out whether we can fit a model to predict the World Bank income group of a
    country from some geographical and World Bank related features.
    To do this, I will preprocess the data in the following ways:
        - Fix any missing data in the columns mentioned below
        - Extract and label encode the World Bank income group column into the labels vector
        - Extract and one hot encode the World bank region column into the features vector
        - Extract latitude into the features vector
        - Extract longitude into the features vector
    I will train both a Decision Tree and a Random Forest, and return the model with the greater accuracy.
    '''
    df = pd.read_csv(Path('..', '..', 'geography.csv'))

    '''
    !!!Explanation!!!
    The only NaNs in the columns used here belonged to the Vatican, so I replaced them with the values for Italy.
    I know the two are technically separate, but until the dataset can be completed we will simply treat them the same.
    '''
    df['World bank region'].fillna(value='Europe & Central Asia', inplace=True)
    df['World bank, 4 income groups 2017'].fillna('High Income', inplace=True)

    le = generate_label_encoder(df_column=df['World bank, 4 income groups 2017'])
    df = replace_with_label_encoder(df=df, column='World bank, 4 income groups 2017', le=le)

    ohe = generate_one_hot_encoder(df_column=df['World bank region'])
    df = replace_with_one_hot_encoder(df=df, column='World bank region', ohe=ohe,
                                      ohe_column_names=ohe.get_feature_names())

    columns = ['Latitude', 'Longitude', 'x0_East Asia & Pacific', 'x0_Europe & Central Asia',
               'x0_Latin America & Caribbean', 'x0_Middle East & North Africa', 'x0_North America',
               'x0_South Asia', 'x0_Sub-Saharan Africa']
    X = df[columns]
    y = df['World bank, 4 income groups 2017']

    dt = decision_tree_classifier(X=X, y=y)
    #print(dt)
    rf = simple_random_forest_classifier(X=X, y=y)
    #print(rf)
    '''
    !!!My Results!!!
    Once again, on average the Decision Tree and Random Forest yield similar results.
    Their accuracies are quite low, ranging from around 50 to nearly 70 percent.
    I do not think much overfitting is occurring here, as the dataset is well balanced and properly split into
    training and testing. (A cross-validation sketch that averages out this run-to-run variance appears after this
    function.)
    The dataset lacks columns relating to the economy, wealth, or demographics of each country, so I believe more
    data could help the model fit a mapping between a country's demographic and wealth data and its income group
    (the target label). Additional columns could include average income, employment rate, tax information, and more.
    Although this model is just a start, it could be useful to those drafting economic policies or tax plans:
    with enough relevant data and training, a model like this could help evaluate plans intended to benefit a
    country's economy.
    '''
    if rf['accuracy'] > dt['accuracy']:
        #print('random forest wins')
        return rf
    else:
        #print('decision tree wins')
        return dt
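
# A minimal sketch (not part of the graded code) of the cross-validation mentioned in the results
# above: averaging accuracy over several splits gives a more stable estimate than the single random
# train/test split behind the 50-70% range reported there.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


def cross_validated_accuracy(X, y) -> float:
    scores = cross_val_score(RandomForestClassifier(random_state=0), X, y, cv=5)
    return float(scores.mean())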