示例#1
0
def test_naive_bayes_classifier_predict():
    """Check MyNaiveBayesClassifier.predict against three hand-worked datasets.

    Every section fits a fresh classifier on attribute-only instances (the
    class label is passed separately) and compares the prediction for one
    unseen instance with the class that maximises
    prior * product of per-attribute conditional probabilities.
    """
    # Small two-attribute dataset
    train = [[1, 5], [2, 6], [1, 5], [1, 5], [1, 6], [2, 6], [1, 5], [1, 6]]
    y = ["yes", "yes", "no", "no", "yes", "no", "yes", "yes"]

    nb = MyNaiveBayesClassifier()
    nb.fit(train, y)

    pred = nb.predict([[1, 5]])

    # yes: 5/8 * 4/5 * 2/5 = 0.2000  >  no: 3/8 * 2/3 * 2/3 = 0.1667
    assert pred == ["yes"]

    # RQ5 (fake) iPhone purchases dataset
    iphone_col_names = [
        "standing", "job_status", "credit_rating", "buys_iphone"
    ]
    iphone_table = [[1, 3, "fair", "no"], [1, 3, "excellent", "no"],
                    [2, 3, "fair", "yes"], [2, 2, "fair", "yes"],
                    [2, 1, "fair", "yes"], [2, 1, "excellent", "no"],
                    [2, 1, "excellent", "yes"], [1, 2, "fair", "no"],
                    [1, 1, "fair", "yes"], [2, 2, "fair", "yes"],
                    [1, 2, "excellent", "yes"], [2, 2, "excellent", "yes"],
                    [2, 3, "fair", "yes"], [2, 2, "excellent", "no"],
                    [2, 3, "fair", "yes"]]
    mypy = MyPyTable(iphone_col_names, iphone_table)
    y2 = myutils.get_mypycol(mypy, "buys_iphone")
    # The label is the last column; keep it out of the training attributes so
    # the instances have the same width as the one passed to predict().
    iphone_X = [row[:-1] for row in iphone_table]
    nb2 = MyNaiveBayesClassifier()
    nb2.fit(iphone_X, y2)
    pred2 = nb2.predict([[1, 2, "fair"]])

    # yes: 10/15 * 2/10 * 4/10 * 7/10 = 0.0373  >  no: 5/15 * 3/5 * 2/5 * 2/5 = 0.0320
    assert pred2 == ["yes"]

    # Bramer 3.2 train dataset
    train_col_names = ["day", "season", "wind", "rain", "class"]
    train_table = [["weekday", "spring", "none", "none", "on time"],
                   ["weekday", "winter", "none", "slight", "on time"],
                   ["weekday", "winter", "none", "slight", "on time"],
                   ["weekday", "winter", "high", "heavy", "late"],
                   ["saturday", "summer", "normal", "none", "on time"],
                   ["weekday", "autumn", "normal", "none", "very late"],
                   ["holiday", "summer", "high", "slight", "on time"],
                   ["sunday", "summer", "normal", "none", "on time"],
                   ["weekday", "winter", "high", "heavy", "very late"],
                   ["weekday", "summer", "none", "slight", "on time"],
                   ["saturday", "spring", "high", "heavy", "cancelled"],
                   ["weekday", "summer", "high", "slight", "on time"],
                   ["saturday", "winter", "normal", "none", "late"],
                   ["weekday", "summer", "high", "none", "on time"],
                   ["weekday", "winter", "normal", "heavy", "very late"],
                   ["saturday", "autumn", "high", "slight", "on time"],
                   ["weekday", "autumn", "none", "heavy", "on time"],
                   ["holiday", "spring", "normal", "slight", "on time"],
                   ["weekday", "spring", "normal", "none", "on time"],
                   ["weekday", "spring", "normal", "slight", "on time"]]
    mypy2 = MyPyTable(train_col_names, train_table)
    y3 = myutils.get_mypycol(mypy2, "class")
    train_X = [row[:-1] for row in train_table]
    nb3 = MyNaiveBayesClassifier()
    nb3.fit(train_X, y3)
    pred3 = nb3.predict([["weekday", "winter", "high", "heavy"]])

    # Posteriors for (weekday, winter, high, heavy):
    #   on time   14/20 * 9/14 * 2/14 * 4/14 * 1/14 = 0.0013
    #   late       2/20 * 1/2  * 2/2  * 1/2  * 1/2  = 0.0125
    #   very late  3/20 * 3/3  * 2/3  * 1/3  * 2/3  = 0.0222
    #   cancelled  1/20 * 0/1  * ...                = 0.0000
    # "cancelled" can never win because its only instance is a saturday.
    assert pred3 == ["very late"]
示例#2
0
def confusionCategorical(yTrue, yTest, header, categories):
    """Build a confusion-matrix table for categorical labels.

    Args:
        yTrue(list): actual labels; each one is looked up in categories
        yTest(list): predicted labels, parallel to yTrue; each one is looked
            up in header to find its column
        header(list): column names of the resulting table
        categories(list): row labels; one row is produced per category

    Returns:
        MyPyTable: table with one row per category. Index 0 holds the
            category, index len(categories) + 1 holds the row total and the
            last column holds the recognition percentage (left at 0 when the
            row total is 0).
    """
    matrix = MyPyTable()
    matrix.column_names = header
    matrix.data = []

    # One zero-filled row per category, labelled in its first cell.
    for label in categories:
        matrix.data.append([label] + [0] * (len(header) - 1))

    # Tally every (actual, predicted) pair.
    for pos, actual in enumerate(yTrue):
        matrix.data[categories.index(actual)][header.index(yTest[pos])] += 1

    total_col = len(categories) + 1
    for row in matrix.data:
        row[total_col] = sum(row[1:total_col])

    # The diagonal cell of row k sits at column k + 1.
    last_col = len(header) - 1
    for k, row in enumerate(matrix.data):
        row_total = row[total_col]
        if row_total != 0:
            row[last_col] = round(100 * (row[k + 1] / row_total), 2)
    return matrix
示例#3
0
def scatter_plot(table, x_column_name, y_column_name):
    """Show a scatter plot of two columns together with a fitted line.

    Args:
        table(MyPyTable): table to read the columns from
        x_column_name(string): name of the column plotted on the x axis
        y_column_name(string): name of the column plotted on the y axis

    Returns:
        coeficient(float): value returned by utils.correlation_coeficient
        cov(float): value returned by utils.covariance
    """
    ys = MyPyTable.get_column(table, y_column_name, False)
    xs = MyPyTable.get_column(table, x_column_name, False)

    corr = utils.correlation_coeficient(xs, ys)
    cov = utils.covariance(xs, ys)
    slope, intercept = utils.compute_slope_intercept(xs, ys)

    plt.scatter(xs, ys)

    # The line is drawn between the smallest and largest x value only.
    line_x = [min(xs), max(xs)]
    line_y = [slope * min(xs) + intercept, slope * max(xs) + intercept]
    plt.plot(line_x, line_y, c="r", label=f"corr: {corr!s}, cov: {cov!s}")
    plt.legend()
    plt.plot()
    plt.show()

    return corr, cov
示例#4
0
def get_sea_frequencies(MyPyTable, col_name):
    """Drop the rows flagged "FALSE" in a column, then count the rest per month.

    Args:
        MyPyTable(MyPyTable): table instance to work on (the parameter name
            shadows the class name inside this function)
        col_name(str): name of the column whose "FALSE" rows are dropped

    Returns:
        months, yes_col (list of str, list of int): the twelve three-letter
            month abbreviations and, for each, how many values of the 'DATE'
            column contain that abbreviation after the drop
    """
    flags = MyPyTable.get_column(col_name)

    # drop_rows receives the row objects themselves, not their indexes.
    rows_to_drop = [
        row for row, flag in zip(MyPyTable.data, flags) if flag == "FALSE"
    ]
    MyPyTable.drop_rows(rows_to_drop)

    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    date_col = MyPyTable.get_column('DATE')
    yes_col = [sum(1 for date in date_col if month in date) for month in months]

    return months, yes_col
示例#5
0
def get_ratings_genre(table, genre, rating):
    """Collect the ratings of every row whose genre field contains a genre.

    Args:
        table(MyPyTable): given object of MyPyTable
        genre(string): genre to search for inside each 'Genres' value
        rating(string): name of the rating column to read

    Returns:
        list: ratings of the matching rows in table order, with empty-string
            ratings left out. For the 'Rotten Tomatoes' column, values that
            contain '%' are returned as floats with the '%' stripped.
    """
    genre_col = MyPyTable.get_column(table, 'Genres', True)
    rating_col = MyPyTable.get_column(table, rating, True)

    ratings = []
    for genres, value in zip(genre_col, rating_col):
        if genre not in genres:
            continue
        if rating == 'Rotten Tomatoes' and '%' in value:
            value = float(value.strip('%'))
        # Missing ratings are stored as '' and must not reach the caller.
        if value != '':
            ratings.append(value)

    return ratings
示例#6
0
def test_My_Random_Forest_Classifier_predict():
    """Fit a seeded random forest on the interview dataset and check two predictions."""
    # Constructor arguments: N = 3, M = 2, F = 2, seed = 1
    forest = MyRandomForestClassifier(3, 2, 2, 1)

    interview = MyPyTable()
    interview.data = interview_table
    interview.column_names = interview_header

    # The last element of every instance is its class label.
    y_train = [instance[-1] for instance in interview_table]
    X_train = [instance[:-1] for instance in interview_table]

    X_test = [
        ["Junior", "Java", "yes", "no"],
        ["Junior", "Java", "yes", "yes"],
    ]

    # The classifier is given the attribute names only (label name removed).
    forest.header = interview_header[:-1]
    forest.fit(X_train, y_train)
    y_predicted = forest.predict(X_test)

    print("y_predicted:", y_predicted)

    # Expected labels come from a hand trace of the seeded forest.
    assert y_predicted == ['True', 'False']
示例#7
0
def get_aus_frequencies(MyPyTable, col_name):
    """Gets the frequency and count of a column by name

    Args:
        MyPyTable(MyPyTable): self of MyPyTable
        col_name(str): name of the column

    Returns:
        values, counts (string, int): name of value and its frequency"""

    rain_col = MyPyTable.get_column("RainToday")
    row_index_to_drop = []
    for i in range(len(rain_col)):
        if rain_col[i] == "No":
            row_to_drop.append(i)
    row_to_drop = []
    for i in range(len(MyPyTable.data)):
        if i in row_index_to_drop:
            row_to_drop.append(i)

    table = MyPyTable.drop_rows(rows_to_drop)
    table.pretty_print()
    return table
    values = []
    counts = []
    '''for value in col:
def test_random_forest_fit():
    """Fit a random forest on a bootstrapped interview sample and compare predictions.

    Prints the accuracy and error rate measured on the held-out split, then
    asserts that every prediction equals the corresponding test label.
    """
    interview_header = ["level", "lang", "tweets", "phd", "interviewed_well"]
    interview_table = [
        ["Senior", "Java", "no", "no", "False"],
        ["Senior", "Java", "no", "yes", "False"],
        ["Mid", "Python", "no", "no", "True"],
        ["Junior", "Python", "no", "no", "True"],
        ["Junior", "R", "yes", "no", "True"],
        ["Junior", "R", "yes", "yes", "False"],
        ["Mid", "R", "yes", "yes", "True"],
        ["Senior", "Python", "no", "no", "False"],
        ["Senior", "R", "yes", "no", "True"],
        ["Junior", "Python", "yes", "no", "True"],
        ["Senior", "Python", "yes", "yes", "True"],
        ["Mid", "Python", "no", "yes", "True"],
        ["Mid", "Java", "yes", "no", "True"],
        ["Junior", "Python", "no", "yes", "False"],
    ]
    myutils.prepend_attribute_label(interview_table, interview_header)

    interview_pytable = MyPyTable(column_names=interview_header,
                                  data=interview_table)
    labels = interview_pytable.get_column("interviewed_well", False)
    attributes = interview_pytable.drop_col("interviewed_well")

    many_trees = MyRandomForestClassifier()
    X_sample, y_sample = myutils.compute_bootstrapped_sample(attributes, labels)
    X_train, X_test, y_train, y_test = myutils.train_test_split(
        X_sample, y_sample, .33)
    many_trees.fit(X_train, y_train, X_test, y_test)
    y_predicted = many_trees.predict(X_test)

    # Every test label is either matched or missed by its prediction.
    hits = 0
    for pos, actual in enumerate(y_test):
        if y_predicted[pos] == actual:
            hits += 1
    misses = len(y_test) - hits

    accuracy = np.round(hits / (hits + misses), 3)
    error_rate = np.round(misses / (hits + misses), 3)

    print("-----------------------------------------------------------")
    print("Accuracy and Error Rate")
    print("-----------------------------------------------------------")
    print()
    print("Random Forest: accuracy = {}, error rate = {}".format(
        accuracy, error_rate))
    print()
    print(
        "Because of the random aspect of this classifier, this will not always pass the tests"
    )
    print()
    print("Predicted table: " + str(y_predicted))
    print("Testing set:     " + str(y_test))

    for pos, actual in enumerate(y_test):
        assert y_predicted[pos] == actual
示例#9
0
def bagging(X, Y, N, M, F):
    """Train N bootstrapped decision trees, keep the best M and vote on a test split.

    Args:
        X(list of list): instances
        Y(list): labels parallel to X
        N(int): number of trees to train
        M(int): number of trees to keep, passed to best_M
        F(int): forwarded as the fourth argument of each tree's fit call

    Returns:
        best_trees(list): the trees selected by best_M
        voted_predictions(list): majority-vote prediction per test instance
        forest_accuracy: get_accuracy of the voted predictions on the test split
    """
    # Step 1: split off a test set; the rest is the "remainder" set.
    x_remainder, x_test, y_r, y_test = myevaluation.train_test_split(X, Y)

    # Step 2: one tree per bootstrap sample of the remainder set. Remainder
    # instances that do not occur in the sample form that tree's validation set.
    forest = []
    accuracies = {}
    for tree_no in range(N):
        x_train, y_train = compute_bootstrapped_sample(x_remainder, y_r)
        tree = my_class.MyDecisionTreeClassifier()
        tree.fit(x_train, y_train, True, F)

        x_v, y_v = [], []
        for pos, instance in enumerate(x_remainder):
            if instance in x_train:
                continue
            x_v.append(instance)
            y_v.append(y_r[pos])

        # Keys are the tree's position in `forest`, stored as a string.
        accuracies[str(tree_no)] = get_accuracy(y_v, tree.predict(x_v))
        forest.append(tree)

    # Step 3: keep the M trees that best_M selects from the validation scores.
    best_trees = [forest[int(key)] for key in best_M(M, accuracies)]

    # Step 4: each kept tree predicts the whole test set (one row per tree),
    # then every column of that table is reduced to its most frequent value.
    all_predictions = [tree.predict(x_test) for tree in best_trees]
    pred_mypy = MyPyTable(build_header(all_predictions), all_predictions)

    voted_predictions = []
    for column_no in range(len(all_predictions[0])):
        vals, counts = get_freq_str(pred_mypy.get_column(column_no))
        voted_predictions.append(vals[counts.index(max(counts))])

    forest_accuracy = get_accuracy(y_test, voted_predictions)
    return best_trees, voted_predictions, forest_accuracy
示例#10
0
def test_random_forest_classifier_fit():
    """Fit a random forest on the interview data and check it keeps M attribute sets."""
    interview = MyPyTable(interview_header, interview_table)

    # Labels come from the class column, instances from the four attributes.
    labels = interview.get_column('interviewed_well')
    attribute_names = ["level", "lang", "tweets", "phd"]
    instances = interview.get_rows(attribute_names)

    forest = MyRandomForestClassifier(N=4, M=2, F=4)
    forest.fit(instances, labels)

    assert len(forest.M_attr_sets) == forest.M
示例#11
0
def unique_genres(table):
    """Get list of unique genres within a table

    Each value of the 'Genres' column is treated as a comma-separated list
    of genres.

    Args:
        table(MyPyTable): given object of MyPyTable

    Returns:
        values(list): distinct non-empty genres in first-seen order"""
    # Only the distinct 'Genres' values are needed; their counts are not.
    vals, _ = get_frequencies(table, 'Genres')

    values = []
    for entry in vals:
        for genre in entry.split(','):
            if genre != '' and genre not in values:
                values.append(genre)
    return values
示例#12
0
def test_My_Random_Forest_Classifier_fit():
    """Fit a seeded random forest on the interview dataset.

    The visible body makes no assertion; it only requires that fit() runs
    and that the classifier exposes a `trees` attribute afterwards.
    """
    # Constructor arguments: N = 3, M = 2, F = 2, seed = 0
    forest = MyRandomForestClassifier(3, 2, 2, 0)

    interview = MyPyTable()
    interview.data = interview_table
    interview.column_names = interview_header

    # The full rows (label column included) are used as the instances.
    instances = interview_table
    labels = interview.get_column("interviewed_well")

    forest.header = interview_header
    forest.fit(instances, labels)

    trees = forest.trees
示例#13
0
def table_setUp(file_name):
    """Load a table from a file located in the "input_data" directory.

    Args:
        file_name(str): name of the file inside "input_data"

    Returns:
        the result of MyPyTable().load_from_file for that path
    """
    return MyPyTable().load_from_file(os.path.join("input_data", file_name))
示例#14
0
def random_forest_predict(X_test, trees):
    """Predict one label per test instance by majority vote over the trees.

    Args:
        X_test(list of list): instances to classify
        trees(list): fitted classifiers exposing predict(X_test)

    Returns:
        voted_predictions(list): for each instance, the value that
            get_freq_str reports with the highest count (the first such
            value when several share the maximum)
    """
    # One row of predictions per tree; voting therefore works column-wise.
    all_predictions = [tree.predict(X_test) for tree in trees]
    pred_mypy = MyPyTable(build_header(all_predictions), all_predictions)

    voted_predictions = []
    for column_no in range(len(all_predictions[0])):
        vals, counts = get_freq_str(pred_mypy.get_column(column_no))
        voted_predictions.append(vals[counts.index(max(counts))])

    return voted_predictions
示例#15
0
def test_random_forest_classifier_predict():
    """Fit a random forest on the interview data and check three "True" predictions.

    Note that the test instances carry five values each while the forest is
    trained on the four named attributes.
    """
    expected = ["True", "True", "True"]
    unseen = [
        ["Mid", "Python", "no", "no", "True"],
        ["Mid", "R", "yes", "yes", "True"],
        ["Mid", "Python", "no", "yes", "True"],
    ]

    interview = MyPyTable(interview_header, interview_table)
    labels = interview.get_column('interviewed_well')
    attribute_names = ["level", "lang", "tweets", "phd"]
    instances = interview.get_rows(attribute_names)

    forest = MyRandomForestClassifier(N=4, M=2, F=4)
    forest.fit(instances, labels)
    predictions = forest.predict(unseen)

    for pos, predicted in enumerate(predictions):
        assert predicted == expected[pos]
示例#16
0
def get_freq_str(col):
    """Count how often each value occurs in a column, reporting values as strings.

    Args:
        col(list): values to count

    Returns:
        values, counts (list of str, list of int): the distinct values
            converted with str(), in the order ordered_col yields them, and
            the number of occurrences of each
    """
    header = ["y"]
    col_mypy = MyPyTable(header, col)

    # presumably ordered_col returns the column sorted - TODO confirm; the
    # counting below no longer depends on it.
    ordered = col_mypy.ordered_col(header)

    values = []
    counts = []
    for value in ordered:
        # `values` stores strings, so membership must be tested on the string
        # form; testing the raw value never matches for non-string input.
        label = str(value)
        if label in values:
            counts[values.index(label)] += 1
        else:
            values.append(label)
            counts.append(1)

    return values, counts
示例#17
0
def pie_chart_dataPrep(table, cols_to_plot):
    """Total each requested column of a table.

    Args:
        table(MyPyTable): table to read the columns from
        cols_to_plot(list): column names; each is converted with str()

    Returns:
        totals_list(list): one total per column, every value converted
            with float() before being added
    """
    totals_list = []
    for name in cols_to_plot:
        running = 0
        for raw in MyPyTable.get_column(table, str(name)):
            running += float(raw)
        totals_list.append(running)
    return totals_list
示例#18
0
def hist_graph(table, column_name):
    """Show a ten-bin histogram of one column.

    Args:
        table(MyPyTable): table to read the column from
        column_name(string): name of the column to plot

    """
    plt.hist(MyPyTable.get_column(table, column_name, False), bins=10)
    plt.show()
示例#19
0
def get_year_counts(table, platform):
    """Count the rows per year for one platform.

    Args:
        table(MyPyTable): given object of MyPyTable
        platform(string): platform to search for in the 'Platform' column

    Returns:
        values, counts (list, list): the distinct 'Year' values of the
            matching rows in ascending order and the number of rows for
            each; rows whose year is 'N/A' are ignored
    """
    plat_col = MyPyTable.get_column(table, 'Platform', True)
    year_col = MyPyTable.get_column(table, "Year", True)

    years = sorted(
        year for plat, year in zip(plat_col, year_col)
        if plat == platform and year != 'N/A'
    )

    # dicts keep insertion order, so the keys stay in ascending order.
    tally = {}
    for year in years:
        tally[year] = tally.get(year, 0) + 1

    return list(tally), list(tally.values())
示例#20
0
def convert_attributes(table):
    """Rescale the IMDb column and turn the Rotten Tomatoes column into floats.

    Args:
        table(MyPyTable): given object of MyPyTable

    Returns:
        imbd_col(list): every 'IMDb' value multiplied by 10
        rotten_col(list): every 'Rotten Tomatoes' value with its last
            character removed and the remainder converted with float()"""
    imdb_raw = MyPyTable.get_column(table, 'IMDb', False)
    rotten_col = MyPyTable.get_column(table, 'Rotten Tomatoes', False)

    imbd_col = [score * 10 for score in imdb_raw]

    # Converted in place: the same list object that get_column returned.
    rotten_col[:] = [float(entry[:-1]) for entry in rotten_col]

    return imbd_col, rotten_col
示例#21
0
def get_mpg_frequencies(MyPyTable, col_name):
    """Bin a numeric column into ratings 1-10 and count each rating.

    Args:
        MyPyTable(MyPyTable): table instance (the parameter name shadows the
            class name inside this function)
        col_name(str): name of the column to bin

    Returns:
        values, counts (list of int, list of int): the ratings that occurred
            in ascending order and the count recorded for each
    """
    col = MyPyTable.get_column(col_name)

    values = []
    counts = []

    for value in col:
        # NOTE(review): `values` holds ratings, so this guard compares a raw
        # column value against ratings; kept as in the original - confirm intent.
        if value in values:
            continue

        # Values below 13 and strictly between 44 and 45 fall in no bin.
        if 13 <= value < 14:
            rating = 1
        elif value == 14:
            rating = 2
        elif 14 < value <= 16:
            rating = 3
        elif 16 < value <= 19:
            rating = 4
        elif 19 < value <= 23:
            rating = 5
        elif 23 < value <= 26:
            rating = 6
        elif 26 < value <= 30:
            rating = 7
        elif 30 < value <= 36:
            rating = 8
        elif 36 < value <= 44:
            rating = 9
        elif value >= 45:
            rating = 10
        else:
            continue

        # presumably mpg_val_check records `rating` in values/counts - TODO confirm
        values, counts = mpg_val_check(rating, values, counts, value)

    # Sort ratings and counts together. Writing counts[rating - 1] into a
    # list that is only as long as the number of distinct ratings raises
    # IndexError whenever a rating is missing (e.g. only 3 and 5 occur).
    ordered = sorted(zip(values, counts))
    values = [rating for rating, _ in ordered]
    counts = [count for _, count in ordered]

    return values, counts
示例#22
0
def compute_entropy(instances, available_attributes, index):
    """Compute the weighted entropy of splitting the instances on one attribute.

    Args:
        instances(list of list): rows whose last column is the class label
        available_attributes(list): column names used to build the table
        index: column identifier of the attribute, passed to get_column

    Returns:
        the sum, over the distinct values of the attribute, of that value's
        class entropy weighted by the fraction of instances having it
    """
    table = MyPyTable(available_attributes, instances)
    class_col = table.get_column(-1)
    attr_col = table.get_column(index)
    distinct_values = set(attr_col)

    # presumably the second result holds one list of attribute values per
    # class - TODO confirm against group_by
    _, per_class = group_by(attr_col, class_col)

    weighted = []
    for attr_value in distinct_values:
        occurrences = attr_col.count(attr_value)
        entropy = 0
        for partition in per_class:
            hits = partition.count(attr_value)
            # A class that never has this value contributes nothing.
            if hits > 0:
                entropy -= hits / occurrences * math.log2(hits / occurrences)
        entropy *= occurrences / len(attr_col)
        weighted.append(entropy)
    return sum(weighted)
示例#23
0
def percent_hist_graph(table, column_name):
    """Show a ten-bin histogram of a column whose values end in a percent sign.

    Every value has its last character removed and is converted with
    float() before plotting.

    Args:
        table(MyPyTable): table to read the column from
        column_name(string): name of the column to plot

    """
    col = MyPyTable.get_column(table, column_name, False)

    # Converted in place: the same list object that get_column returned.
    col[:] = [float(entry[:-1]) for entry in col]

    plt.hist(col, bins=10)
    plt.show()
示例#24
0
def combine_two_columns(column_names, col1, col2):
    """Build a two-column MyPyTable from two parallel lists.

    Args:
        column_names(list): column names for the new table
        col1(list): values of the first column; its length sets the row count
        col2(list): values of the second column

    Returns:
        table(MyPyTable): table whose rows are [col1[i], col2[i]]"""
    rows = [[first, col2[pos]] for pos, first in enumerate(col1)]
    return MyPyTable(column_names, rows)
示例#25
0
def percentages_columns(table, column_names):
    """Express each column's occurrence count as a percentage of the row count.

    Args:
        table(MyPyTable): given object of MyPyTable
        column_names(list): names of the columns to evaluate

    Returns:
        percentages(list): for each column, its count from
            get_occurences_given_columns divided by the length of the first
            column, times 100, rounded to 0 decimals"""
    counts = get_occurences_given_columns(table, column_names)

    # The first column's length stands in for the number of rows.
    length = len(MyPyTable.get_column(table, column_names[0], False))

    return [round((count / length) * 100, 0) for count in counts]
示例#26
0
def get_occurences_given_columns(table, column_names):
    """Count, for each listed column, how many of its values equal 1.0.

    Args:
        table(MyPyTable): given object of MyPyTable
        column_names(list): names of the columns to count

    Returns:
        count(list): one count per column, in the order of column_names"""
    columns = [MyPyTable.get_column(table, name, False) for name in column_names]
    return [sum(1 for entry in column if entry == 1.0) for column in columns]
示例#27
0
def get_frequencies(MyPyTable, col_name):
    """Count the occurrences of every distinct value in a column.

    Args:
        MyPyTable(MyPyTable): table instance (the parameter name shadows the
            class name inside this function)
        col_name(str): name of the column

    Returns:
        values, counts (list, list): distinct values in first-seen order and
            the number of times each occurs"""
    seen = []
    tallies = []

    for item in MyPyTable.get_column(col_name):
        if item in seen:
            tallies[seen.index(item)] += 1
        else:
            # first occurrence of this value
            seen.append(item)
            tallies.append(1)

    return seen, tallies
示例#28
0
    track_data = []
    popularity = track["track"]["popularity"]
    if popularity == 0:
        continue #skip any track with 0 popularity, because I'm unsure if this is a default value 
    else:
        #name = track["track"]["name"]
        #track_data.append(name) # will be ignored but could but each track_data_obj should be identifiable
        features_dict = sp.audio_features(track["track"]["id"]) #this  returns a features dictonary
        for key in features_dict[0]: #loop through and add only the attributes we want
            if key != "type" and key != "id" and key != "uri" and key != "track_href" and key != "analysis_url" and key != "time_signature" and key != "mode" and key != "key" and key != "loudness":
                val = features_dict[0][key]
                if key != "tempo" and key != "duration_ms":
                    val = myutils.percent_to_rating(val)
                track_data.append(val)
                # if first == True:
                #     header.append(key)
        # first = False
        pop_class = myutils.pop_rating(popularity)
        track_data.append(pop_class) # popularity will be the y_train
        track_data_objs.append(track_data)
# header.append("popularity")
# now we can turn this into an xtrain and ytrain or keep it stitched together 
# when dealing with the data we can delete the first col, which is the name identifier

print(len(track_data_objs))

header = ['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'popularity']

tracks_mypy = MyPyTable(header, track_data_objs)
tracks_mypy.save_to_file("tracks_data.txt")
  
import pickle  # standard python library
from mysklearn.mypytable import MyPyTable
from mysklearn.myclassifiers import MyDecisionTreeClassifier, MyNaiveBayesClassifier
import mysklearn.myevaluation as myevaluation
import mysklearn.myutils as myutils
import os

# "pickle" an object (AKA object serialization)
# save a Python object to a binary file

# "unpickle" an object (AKA object de-serialization)
# load a Python object from a binary file (back into memory)

# Load the red-wine quality dataset from input_files/winequality-red.csv.
table = MyPyTable().load_from_file(
    os.path.join("input_files", "winequality-red.csv"))
# "quality" is used as the class label; the result of drop_col("quality")
# is used as the attribute data.
y_col = table.get_column("quality", False)
x_cols = table.drop_col("quality")

# Naive Bayes classifier instance (created here, not used in the visible lines).
testcase = MyNaiveBayesClassifier()

# Per the original note, the cross-validation call returns INDEXES rather
# than instances; getInstances is then given those folds plus the data and
# labels, and its four results replace them.
X_train, X_test = myevaluation.stratified_kfold_cross_validation(x_cols,
                                                                 y_col,
                                                                 n_splits=10)
X_train, X_test, y_train, y_test = myutils.getInstances(
    X_train, X_test, x_cols, y_col)

# Normalize each fold's train/test pair; `fold` itself is unused because the
# folds are addressed by index. `train` and `test` are overwritten on every
# iteration in the visible code.
for i, fold in enumerate(X_train):
    train, test = myutils.normalize_values(X_train[i], X_test[i])
示例#30
0
from mysklearn.myclassifiers import MyNaiveBayesClassifier
import os
from mysklearn.mypytable import MyPyTable
import mysklearn.myevaluation as myevaluation
import mysklearn.myutils as myutils
import pickle

# Load the collisions dataset from input_data/collisions.csv.
fname = os.path.join("input_data", "collisions.csv")
collisions_data = MyPyTable().load_from_file(fname)

# Attribute columns used to describe each collision.
weather = collisions_data.get_column('WEATHER')
road_condition = collisions_data.get_column('ROADCOND')
light_condition = collisions_data.get_column('LIGHTCOND')
junction_type = collisions_data.get_column('JUNCTIONTYPE')
severity = collisions_data.get_column('SEVERITYDESC')

# One instance per row, built from the five parallel columns.
X_train = [[
    weather[i], road_condition[i], light_condition[i], junction_type[i],
    severity[i]
] for i in range(len(weather))]
y_train = collisions_data.get_column('COLLISIONTYPE')

# Drop every instance whose collision type is 'Unknown'. The lists are
# rebuilt rather than edited with `del` inside an enumerate() loop: deleting
# while iterating skips the element that follows each deletion, so runs of
# consecutive 'Unknown' labels would survive.
labelled = [(instance, label)
            for instance, label in zip(X_train, y_train)
            if label != 'Unknown']
X_train = [instance for instance, _ in labelled]
y_train = [label for _, label in labelled]

# Stratified 10-fold split, then materialise the folds into instances/labels.
strattrain_folds, strattest_folds = myevaluation.stratified_kfold_cross_validation(
    X_train, y_train, 10)
strat_xtrain, strat_ytrain, strat_xtest, strat_ytest = myutils.get_from_folds(
    X_train, y_train, strattrain_folds, strattest_folds)