def normalize_attributes(test_set, training_set):
    '''
        A function to normalize training and test data column by column.
        Every attribute is assumed to be numeric and continuous; each test column
        is scaled with the min and max of the corresponding training column.
        Param test_set: A table of testing data
        Param training_set: A table of training data
        Returns: tables with every attribute normalized
                normalized_test_data - normalized test data [0,1]
                normalized_training_data - normalized training data [0,1]
    '''
    normalized_training_data = []
    normalized_test_data = []

    for index in range(len(training_set[0])):  # go by column
        training_column = utils.get_column(training_set, index)
        test_column = utils.get_column(test_set, index)
        min_val = min(training_column)
        max_val = max(training_column)
        training_column = normalize(min_val, max_val, training_column)
        test_column = normalize(min_val, max_val, test_column)
        normalized_training_data.append(training_column)
        normalized_test_data.append(test_column)

    # we appended columns to normalized data tables, so transpose them
    normalized_test_data = [
        utils.convert_to_numeric(list(i))
        for i in np.array(normalized_test_data).T
    ]
    normalized_training_data = [
        utils.convert_to_numeric(list(i))
        for i in np.array(normalized_training_data).T
    ]

    return normalized_test_data, normalized_training_data
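The example above leans on two helpers that are not shown. A minimal sketch of what they might look like follows: get_column as a plain column extractor and normalize as ordinary min-max scaling (both are assumptions, not the project's actual utils module). Note that a test value outside the training min/max would land outside [0, 1].

# Hypothetical stand-ins for the utils.get_column and normalize helpers used above.
def get_column(table, index):
    """Return the values at position `index` from every row of `table`."""
    return [row[index] for row in table]


def normalize(min_val, max_val, column):
    """Min-max scale a column using a previously computed min and max."""
    span = max_val - min_val
    if span == 0:
        return [0.0 for _ in column]
    return [(value - min_val) / span for value in column]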
Example #2
def transform_rows(input_file, num_tests):
    random.seed()
    train_results = []
    input_results = []
    output_results = []
    i = 0
    with open(input_file) as f:
        while i < num_tests:
            i += 1
            line = f.readline()
            path = utils.get_column(utils.Columns.path, line)
            if len(path) == 0:
                continue
            # FOR DEBUGGING
            #print line

            end = random.randint(0, len(path))
            input_row = ",".join(line.split(",", 8)[:8]) + ","
            input_row += utils.path_to_csv(path[:end])
            input_results.append(input_row)
            output_row = "\"T" + str(
                utils.get_column(utils.Columns.trip_id, line)) + "\""
            output_row += "," + str(path[-1][0]) + "," + str(path[-1][1])
            output_results.append(output_row)

        for line in f:
            train_results.append(line)
    print(len(train_results))
    return ("".join(train_results), "\n".join(input_results),
            "\n".join(output_results))
Example #3
def get_training_infos(training_data):
    training_infos = []
    i = 0
    for line in training_data:
        if i % 100000 == 0:
            print(i)
        i += 1
        id = utils.get_column(utils.Columns.trip_id, line)
        path = utils.get_column(utils.Columns.path, line)
        # Drop empty paths.
        if not path:
            continue
        training_infos.append((id, utils.path_to_array(path)))
    return training_infos
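Both taxi examples above call a line-oriented utils.get_column(column, line) that is not shown. A hedged sketch follows; the column layout (trip_id in column 0, the path as a JSON-encoded list of [lon, lat] pairs in column 8) and the Columns constants are assumptions about the raw CSV, not taken from the source.

# Hypothetical sketch of a line-based get_column for the taxi CSV examples.
import csv
import json


class Columns:
    trip_id = 0
    path = 8


def get_column(column, line):
    """Extract one field from a raw CSV line, decoding the path column from JSON."""
    fields = next(csv.reader([line]))
    if column >= len(fields):
        return None
    value = fields[column]
    if column == Columns.path:
        return json.loads(value) if value else []
    return value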
Example #4
def main():
    table = u.read_csv('ebay_data_dirty.csv')

    for row in table:
        del row[0]
        del row[0]
        del row[9]
        del row[-1]

    u.write_to_file(table, 'ebay_data_dirty_temp.csv')
    remove_modelnums_not_applicable('ebay_data_dirty_temp.csv')
    table_w_modelnums = u.read_csv('ebay_data_clean.csv')

    case_col = u.get_column(table_w_modelnums, 5)
    clean_case = clean_case_material(case_col)

    count = 0
    for row in table_w_modelnums:
        del row[5]
        row.insert(5,str(clean_case[count]))
        count = count + 1

    movement_col = u.get_column(table_w_modelnums, 4)
    clean_movement = clean_watch_movement(movement_col)

    count = 0
    for row in table_w_modelnums:
        del row[4]
        row.insert(4,str(clean_movement[count]))
        count = count + 1

    band_col = u.get_column(table_w_modelnums, 6)
    clean_band = clean_band_material(band_col)

    count = 0
    for row in table_w_modelnums:
        del row[6]
        row.insert(6,str(clean_band[count]))
        count = count + 1

    mod_col = u.get_column(table_w_modelnums, 7)
    clean_mod = clean_watch_model(mod_col)

    count = 0
    for row in table_w_modelnums:
        del row[7]
        row.insert(7,str(clean_mod[count]))
        count = count + 1
    u.write_to_file(table_w_modelnums, "ebay_data_clean.csv")
Example #5
def load_variants(filename, extra=""):
    """Return a dict with the variants in a filename."""
    extra_columns = []
    if extra:
        for column in extra.split(","):
            extra_columns.append(utils.get_column(filename, column))

    with open2(filename, encoding="utf-8", errors="replace") as f1:
        variants = {"header": f1.readline().rstrip().split("\t")}

        for line in f1:
            variant = line.split("\t")  # Some lines end in lots of empty cols,
                                        #  so we keep every one of them
            variant[-1] = variant[-1].rstrip()

            key = variant[:5]
            for column in extra_columns:
                if variant[column] in ["."]:
                    # Columns that hold only "." are forced to be empty
                    key.append("")
                else:
                    try:
                        key.append(round(float(variant[column]), 2))
                    except ValueError:
                        # If the column cannot be cast to a float, keep it
                        #  as a string
                        key.append(variant[column])

            variants[tuple(key)] = variant

    return variants
Example #6
def compute_naive_bayes_accuracy(train, test, header, attributes_list,
                                 class_label):
    '''
    for a training/test set pair, computes the accuracy of the naive bayes classification
    parameter train is the training set of data (a table) used to build the classifier
    parameter test is the test set of data (a table) to classify
    parameter header is a list of the attribute names in order
    parameter attributes_list is the list of attributes to use for classification
    parameter class_label is the name (string) of the class attribute
    returns the accuracy
    '''
    predictions = []

    # for each instance in the test set, compute the naive bayes classification
    for instance in test:
        predictions.append(
            classify_using_naive_bayes(train, header, instance, class_label,
                                       attributes_list))

    # get actual values
    actuals = utils.get_column(test, header.index(class_label))

    # compute accuracy using predicted values and actuals
    con_mat = create_confusion_matrix(predictions, actuals)
    # print_confusion_matrix(con_mat[0], con_mat[1], con_mat[2], con_mat[3])

    return acc(con_mat[0], con_mat[1], con_mat[2], con_mat[3])
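create_confusion_matrix and acc are assumed helpers here. A minimal sketch of the accuracy step, assuming the four values are the counts of true positives, false positives, false negatives, and true negatives of a binary confusion matrix (the actual ordering used by the source utilities is not shown):

def acc(tp, fp, fn, tn):
    """Accuracy = correct predictions / all predictions."""
    total = tp + fp + fn + tn
    return (tp + tn) / total if total else 0.0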
Example #7
def classify_using_naive_bayes(table, header, test_instance, class_label,
                               attr_list):
    '''
    uses Naive Bayes to classify an unseen instance given a training set
    parameter table is the training data to use
    parameter header is a list of the attributes in table
    parameter test_instance is an instance of the test set to be classified
    parameter class_label is the name (string) of the class
    parameter attr_list is a list of the attributes to use for classification
    returns the predicted class label for the test instance
    '''
    # get list of all possible class values
    classes = utils.get_unique_items(
        utils.get_column(table, header.index(class_label)))
    # initialize list of class probabilities
    class_probabilities = [0 for _ in classes]  # parallel list to classes

    # for each class, compute posteriors then multiply to get P(class | test_instance)
    for class_value in classes:
        current_class_index = classes.index(class_value)
        calculate_posterior(table, header, class_label, class_value, test_instance, \
            attr_list, current_class_index, class_probabilities)

    # returns the class with the largest proportional probability given Naive Bayes classification
    max_class_index = class_probabilities.index(max(class_probabilities))
    return classes[max_class_index]
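The per-class score that calculate_posterior accumulates is, in standard Naive Bayes, the prior P(C) multiplied by the conditional probabilities P(attr = value | C). A hedged, standalone sketch of that quantity (assuming attr_list holds attribute names that appear in header) is:

def naive_bayes_score(table, header, class_label, class_value, instance, attr_list):
    """Prior times the product of per-attribute conditionals, estimated by counting."""
    class_col = header.index(class_label)
    in_class = [row for row in table if row[class_col] == class_value]
    if not table or not in_class:
        return 0.0
    score = len(in_class) / len(table)  # prior P(C)
    for attr in attr_list:
        attr_col = header.index(attr)
        matches = [row for row in in_class if row[attr_col] == instance[attr_col]]
        score *= len(matches) / len(in_class)  # P(attr = value | C)
    return score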
Example #8
    def encrypt(self):
        print('[+] Starting encryption.')
        scrambled = self.image.matrix

        pbar = tqdm(total=2 * (len(self.r_vector) * len(self.c_vector)) + (len(self.r_vector) + len(self.c_vector)))

        for i in range(len(self.r_vector)):
            ith_row_sum = utils.sum_row(i, scrambled)

            if bool(ith_row_sum % 2):
                scrambled[i] = utils.shift(scrambled[i], self.r_vector[i])
            else:
                scrambled[i] = utils.shift(scrambled[i], -self.r_vector[i])
            pbar.update(1)

        for j in range(len(self.c_vector)):
            # jth_column_sum = utils.sum_column(j, scrambled)
            col = utils.get_column(scrambled, j)
            shifted = utils.shift(col, -self.c_vector[j])
            scrambled = utils.set_column(scrambled, j, shifted)
            pbar.update(1)

        scrambled = self.xor_scrambled(scrambled, pbar)

        pbar.close()
        self.image.create_image(self.encrypted, scrambled)

        # save the key vectors alongside the encrypted image
        with open(self.image.key_path, 'w') as f:
            f.write(str(self.r_vector) + '\n')
            f.write(str(self.c_vector))
Example #9
def choose_columns(data, indexes):
    new_table = [[] for _ in range(len(data))]
    for i in indexes:
        column = utils.get_column(data, i)
        for row, item in zip(new_table, column):
            row.append(item)
    return new_table
Example #10
    def xor_scrambled(self, scrambled, pbar):
        for i in range(len(scrambled)):
            if not bool(i % 2):
                reverse = self.c_vector[::-1]  # reversed copy; avoid mutating c_vector in place
                scrambled[i] = [utils.xor(scrambled[i][j], reverse[j], pbar) for j in range(len(self.c_vector))]
            else:
                scrambled[i] = [utils.xor(scrambled[i][j], self.c_vector[j], pbar) for j in range(len(self.c_vector))]
            pbar.update(1)

        for j in range(len(scrambled[0])):
            col = utils.get_column(scrambled, j)
            if not bool(j % 2):
                reverse = self.r_vector[::-1]  # reversed copy; avoid mutating r_vector in place
                scrambled = utils.set_column(scrambled, j,
                                             [utils.xor(col[i], reverse[i], pbar)
                                              for i in range(len(self.r_vector))])
            else:
                scrambled = utils.set_column(scrambled, j,
                                             [utils.xor(col[i], self.r_vector[i], pbar)
                                              for i in range(len(self.r_vector))])
            pbar.update(1)

        return scrambled
Example #11
    def build_target_map():

        target_map = {}
        wks = worksheets["SDG Targets"]

        target_col = utils.get_column(wks, 2)
        for cell in target_col:
            strsplit = cell.split()
            target_map[strsplit[0].strip()] = cell

        return target_map
Example #12
def predict_destinations(training_data, tests, answers):
    training_infos = get_training_infos(training_data)
    nearest_neighbors = create_nearest_neighbors_predictor(training_infos)

    for line in tests:
        nearest_paths = []
        test_id = utils.get_column(utils.Columns.trip_id, line)
        test_path = utils.get_column(utils.Columns.path, line)
        if not test_path:
            continue
        for index in nearest_neighbors.kneighbors([test_path[0]],
                                                  return_distance=False)[0]:
            nearest_paths.append(training_infos[index])
        predicted_destination = frechet_predictor(test_id, test_path,
                                                  nearest_paths)
        print(test_id, "|", predicted_destination, end=" ")
        if answers is not None:
            real_destination = get_real_destination(answers)
            print("|", utils.distance(real_destination, predicted_destination))
        else:
            print("")
Example #13
def normalize_combined():
    combined_table, header = utils.read_table("combined_data.csv", True)
    columns = []
    new_header = []
    for x in range(len(header)):
        if x not in [2, 6, 7, 8, 9, 10, 11, 12, 13, 16]:
            new_header.append(header[x])
            columns.append(utils.get_column(combined_table, x))
    columns.append([
        round(columns[6][i] * 12 * 100 / columns[3][i], 1)
        for i in range(len(columns[0]))
    ])
    new_header.append("Pct_Income_as_Rent")

    columns[2] = normalize_data(columns[2])  # Poverty
    columns[3] = normalize_data(columns[3])  # Median Income
    columns[4] = discretize_data(columns[4], 5)  # Crime Rate
    columns[5] = normalize_data(columns[5])  # Population
    columns[6] = normalize_data(columns[6])  # Rent
    columns[7] = normalize_data(columns[7])  # Rent as percent of income.

    new_table = []
    for x in range(len(columns[0])):
        buffer = []
        for column in columns:
            buffer.append(column[x])
        new_table.append(buffer)

    new_table.insert(0, new_header)
    utils.write_table("combined_data_normalized.csv", new_table)

    columns[2] = discretize_data(columns[2], 3)  # Poverty
    columns[3] = discretize_data(columns[3], 3)  # Median Income
    #columns[4] = discretize_data(columns[4], 3) # Crime Rate
    columns[5] = discretize_data(columns[5], 5)  # Population
    columns[6] = discretize_data(columns[6], 3)  # Rent
    columns[7] = discretize_data(columns[7], 5)  # Rent as percent of income.

    new_table = []
    for x in range(len(columns[0])):
        buffer = []
        for column in columns:
            buffer.append(column[x])
        new_table.append(buffer)

    new_table.insert(0, new_header)
    utils.write_table("combined_data_discretized.csv", new_table)
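normalize_data and discretize_data are assumed helpers above. A hedged sketch of discretize_data, taking it to be simple equal-width binning into integer labels 1..bins (the source implementation may use different cut points or labels):

def discretize_data(column, bins):
    """Map numeric values to equal-width bin labels from 1 to bins."""
    low, high = min(column), max(column)
    width = (high - low) / bins or 1
    labels = []
    for value in column:
        label = int((value - low) / width) + 1
        labels.append(min(label, bins))  # the top edge falls into the last bin
    return labels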
Example #14
def main():
    table = u.read_csv('rolex_prices_data.csv')

    price_col = u.get_column(table, 1)

    clean_prices = []
    for price in price_col:
        price = price.replace(',', '')
        price = price.replace('$', '')

        clean_prices.append(price)

    clean_table = []
    count = 0
    for row in table:
        clean_table.append([row[0], clean_prices[count]])
        count = count + 1

    u.write_to_file(clean_table, 'rolex_prices_data_clean.csv')
Example #15
def main():
    data = []
    f = open(utils.kInputFile, "r")
    for line in f:
        path = utils.get_column(utils.Columns.path, line)
        if not path:
            continue
        data.append(utils.distance(path[0], path[-1]))

    binwidth = 1
    bins = range(int(math.floor(min(data))),
                 int(math.ceil(max(data) + binwidth)), binwidth)
    # the histogram of the data
    plt.hist(data, bins=bins, facecolor='green', alpha=0.75)
    plt.xlabel('Trip Distance(km)')
    plt.ylabel('Number of trips')
    plt.title(r'Proportion of trips')
    plt.axis([0, 20, 0, 1000000])
    plt.grid(True)
    plt.show()
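utils.distance is assumed to return kilometres between two [lon, lat] points; a hedged great-circle (haversine) sketch is below, though the source helper may compute it differently.

import math


def distance(a, b):
    """Haversine distance in km between two [lon, lat] points."""
    lon1, lat1 = math.radians(a[0]), math.radians(a[1])
    lon2, lat2 = math.radians(b[0]), math.radians(b[1])
    h = (math.sin((lat2 - lat1) / 2) ** 2
         + math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2)
    return 2 * 6371.0 * math.asin(math.sqrt(h))  # mean Earth radius ~6371 km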
Example #16
def normalize_table(table, predictors):
    '''
    Reads a table and adds normalized columns.
    Parameter table: The table to be tested.
    Parameter predictors: List of predictors used to estimate.
    Returns: Normalized table.
    '''
    new_table = copy.deepcopy(table)
    normalized = []  # one normalized column per predictor, filled below
    for x in predictors:
        column = utils.get_column(new_table, x)
        if all(isinstance(item, int) for item in column) or all(
                isinstance(item, float) for item in column):
            normalized.append(normalize(
                column))  # Normalizes and adds each predictor column.
        else:
            normalized.append([])

    for i in range(len(new_table)):
        for j in range(len(predictors)):
            if normalized[j] != []:
                new_table[i][predictors[j]] = normalized[j][i]

    return new_table
Example #17
    def find_similar_text(wks, args, rd, title_count):
        """ Uses similarity.py module to find similar text
            * We remove all duplicates first
            * We store row location of all indicator text
            * We apply NLP similarity on deduped list
            * In fast mode, we use sorted list and start comparing from the location
              of the first string.
            * In deep mode, we search every sentence with other
        """
        col = utils.get_column(wks, 5)
        qacol = utils.get_column(wks, 10)
        col_line_dict = defaultdict(list)
        for n,x in enumerate(col):
            col_line_dict[x].append(n+1+title_count)

        colunique = list(set(col))
        colunique.sort()

        all_similar_lines = []
        header_written = False

        for n1, val1 in enumerate(colunique):
            if val1 == '':
                continue
            for n2, val2 in enumerate(colunique[n1:]):
                if val2 == '':
                    continue
                if val1 == val2:
                    continue
                similar_val = similarity.cosine_sim(val1, val2)
                if similar_val > 0.7:
                    similar_lines = [col_line_dict[val1], val1, col_line_dict[val2], val2]

                    # Filter by column "Indicator QA Status". If a value exists remove it from
                    # the list

                    qacoltest = []
                    qacoltest.extend(col_line_dict[val1])
                    qacoltest.extend(col_line_dict[val2])
                    #for x in qacoltest:
                    #    print(qacol[x-1-title_count])
                    allqacol_filled = all([qacol[x-1-title_count] in ('Complete', 'Needs Review') for x in qacoltest])
                    #print(allqacol_filled)

                    # If this flag is set, do not filter by QA column
                    if args.all_similar:
                        allqacol_filled = False

                    if not(allqacol_filled):
                        all_similar_lines.append(similar_lines)
                        #print(qacoltest)
                        #print("ZZZZ===")
                        #print(similar_lines)
                        #print("=====ZZZZ")

                    if len(all_similar_lines) > 0 and not header_written:
                        rd["sheet"].write(rd["row"], 0, "Test for similar indicators values")
                        rd["sheet"].write(rd["row"], 1, "Failed", rd["red"])
                        rd["row"]+=1
                        rd["sheet"].write(rd["row"], 0, None)
                        rd["row"]+=1
                        rd["sheet"].write_row(rd["row"], 0, tuple(["Rows 1", "Similar Text 1",
                                        "Rows 2", "Similar Text 2", "Similarity Score"]), rd["bold"])
                        rd["row"]+=1
                        header_written = True
                    if not(allqacol_filled):
                        rd["sheet"].write_row(rd["row"], 0, tuple([
                                ','.join((str(s) for s in similar_lines[0])), "'{}'".format(similar_lines[1]),
                                ','.join((str(s) for s in similar_lines[2])), "'{}'".format(similar_lines[3]),
                                '{:.3f}'.format(similar_val)]))
                        rd["row"]+=1

                        print("\n Row: {}\n'{}'\n----\n Row: {}\n'{}'\nSimilarity Score = {:.3f}\n\n\n ====".format(
                            similar_lines[0], similar_lines[1],
                            similar_lines[2], similar_lines[3], similar_val))
                else:
                    if not args.deeply_similar:
                        break

        # after scanning every pair, report success if nothing similar was found
        if len(all_similar_lines) == 0:
            print("\n\n Test for similar indicators values: Passed. Good Job!")
            rd["sheet"].write(rd["row"], 0, "Test for similar indicators values")
            rd["sheet"].write(rd["row"], 1, "Passed. Good Job!", rd["green"])
            rd["row"]+=1
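similarity.cosine_sim is not shown; a dependency-free sketch of a bag-of-words cosine similarity that would plug into the 0.7 threshold above is given here (the project's similarity.py may instead use TF-IDF weighting or a different tokenizer).

import math
import re
from collections import Counter


def cosine_sim(text1, text2):
    """Cosine of the angle between simple term-count vectors of the two texts."""
    vec1 = Counter(re.findall(r"\w+", text1.lower()))
    vec2 = Counter(re.findall(r"\w+", text2.lower()))
    dot = sum(vec1[t] * vec2[t] for t in set(vec1) & set(vec2))
    norm = math.sqrt(sum(c * c for c in vec1.values())) * math.sqrt(sum(c * c for c in vec2.values()))
    return dot / norm if norm else 0.0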
Example #18
def stats():
    data, header = utils.read_table("combined_data.csv", True)
    for row in data:
        if row[6] == min([x for x in utils.get_column(data, 6) if x != 0]):
            print(row)
Example #19
def main():
    '''audio_data = []
    utils.read_file_to_table("small_audio_data.csv", audio_data)
    print("done reading")
    headers = ["Acousticness", "Danceability", "Duration", "Energy", "Instrumentalness", "Key",
               "Liveness", "Loudness", "Mode", "Speechiness", "Tempo", "Time Signature", "Valence"]
    for i in range(0, 13):
        filename = headers[i] + ".pdf"
        create_scatter_plot(utils.get_column(audio_data, i), utils.get_column(
            audio_data, 13), filename, headers[i])'''

    # kNN classifier to predict popularity (index 13)
    # using: acousticness (0), danceability (1), duration (2), energy (3), instrumentalness (4),
    # liveness (6), loudness (7), speechiness (9), tempo (10), valence (12)
    trimmed_data = []
    utils.read_file_to_table("small_audio_data.csv", trimmed_data,
                             [0, 1, 2, 3, 4, 6, 7, 9, 10, 12, 13])

    # normalize duration (new index = 2), loudness (new index = 6), tempo (new index = 8)
    duration = utils.get_column(trimmed_data, 2)
    normalized_duration = utils.normalize(duration)
    loudness = utils.get_column(trimmed_data, 6)
    normalized_loudness = utils.normalize(loudness)
    tempo = utils.get_column(trimmed_data, 8)
    normalized_tempo = utils.normalize(tempo)

    # update table with normalized values
    # and discretize popularity
    for i in range(len(trimmed_data)):
        trimmed_data[i][2] = normalized_duration[i]
        trimmed_data[i][6] = normalized_loudness[i]
        trimmed_data[i][8] = normalized_tempo[i]
        trimmed_data[i][-1] = utils.discretize_popularity(trimmed_data[i][-1])

    # decision trees
    # values for this tree are already between 0 and 1
    col_names = [
        "acousticness", "danceability", "duration", "energy",
        "instrumentalness", "liveness", "loudness", "speechiness", "tempo",
        "valence", "popularity"
    ]
    labels = {
        "acousticness": "Acousticness",
        "danceability": "Danceability",
        "duration": "Duration",
        "energy": "Energy",
        "instrumentalness": "Instrumentalness",
        "liveness": "Liveness",
        "loudness": "Loudness",
        "speechiness": "Speechiness",
        "tempo": "Tempo",
        "valence": "Valence",
        "popularity": "Popularity"
    }
    att_domains = {
        0: [">=0.25", ">=0.50", ">=0.75", ">=1.0"],
        1: [">=0.25", ">=0.50", ">=0.75", ">=1.0"],
        2: [">=0.25", ">=0.50", ">=0.75", ">=1.0"],
        3: [">=0.25", ">=0.50", ">=0.75", ">=1.0"],
        4: [">=0.25", ">=0.50", ">=0.75", ">=1.0"],
        5: [">=0.25", ">=0.50", ">=0.75", ">=1.0"],
        6: [">=0.25", ">=0.50", ">=0.75", ">=1.0"],
        7: [">=0.25", ">=0.50", ">=0.75", ">=1.0"],
        8: [">=0.25", ">=0.50", ">=0.75", ">=1.0"],
        9: [">=0.25", ">=0.50", ">=0.75", ">=1.0"],
        10: [">=25", ">=50", ">=75", ">=100"]
    }
    class_index = len(col_names) - 1
    # att_indexes is a list of attributes to use for building the tree
    att_indexes = list(range(len(col_names) - 1))
    #spotify_tree = tree_utils.tdidt(tree_data, att_indexes, att_domains, class_index, col_names)
    #tree_utils.create_dot_tree(spotify_tree, labels, "spotify_tree")
    folds = utils.stratified_cross_folds(trimmed_data, 10)
    num_correct = 0
    for i in range(0, 10):  # range had to change
        train, test = utils.set_up_train_test(i, folds)
        actual_popularities = [x[-1] for x in test]
        att_indexes = list(range(len(col_names) - 1))
        predicted_popularities = tree_utils.tree_classifier(
            train, test, att_indexes, att_domains, class_index, col_names)
        for i in range(len(test)):
            if actual_popularities[i] == predicted_popularities[i]:
                num_correct += 1
    accuracy = num_correct / len(trimmed_data)
    print("Accuracy Decision Tree: " + str(round(accuracy * 100, 2)) + "%")

    # generate 10 stratified cross folds
    folds = utils.stratified_cross_folds(trimmed_data, 10)
    num_correct = 0
    for i in range(0, 10):
        train, test = utils.set_up_train_test(i, folds)
        actual_popularities = [x[-1] for x in test]
        predicted_popularities = utils.knn_classifier(train, test)
        for i in range(len(test)):
            if actual_popularities[i] == predicted_popularities[i]:
                num_correct += 1
        print(num_correct)
    accuracy = num_correct / len(trimmed_data)
    print("Accuracy kNN: " + str(round(accuracy * 100, 2)) + "%")

    # naive bayes
    num_correct_bayes = 0
    for i in range(0, 10):
        train, test = utils.set_up_train_test(i, folds)
        priors = utils.compute_probabilities(train)
        actual_popularities_bayes = [x[-1] for x in test]
        predicted_popularities_bayes = []
        for instance in test:
            predicted_popularity_bayes = utils.naive_bayes_classifier(
                priors, instance, train)
            predicted_popularities_bayes.append(predicted_popularity_bayes)
        for i in range(len(test)):
            if actual_popularities_bayes[i] == predicted_popularities_bayes[i]:
                num_correct_bayes += 1
        print(num_correct_bayes)
    accuracy_bayes = num_correct_bayes / len(trimmed_data)
    print("Accuracy Naive Bayes: " + str(round(accuracy_bayes * 100, 2)) + "%")

    # ensemble classifier (kNN)
    # generate five weak learners, each using a different subset of attributes
    num_correct_ensemble = 0
    for i in range(10):
        train, test = utils.set_up_train_test(i, folds)
        actual_popularities = [x[-1] for x in test]
        predicted_popularities = []
        for instance in test:
            predictions = []
            for j in range(6):
                training_subset = train[j:j + 4]
                prediction = utils.compute_class_knn(instance, training_subset)
                predictions.append(prediction)
            # use simple majority voting
            np_arr = np.array(predictions)
            majority_vote = np.bincount(np_arr).argmax()
            predicted_popularities.append(majority_vote)
        for i in range(len(test)):
            if predicted_popularities[i] == actual_popularities[i]:
                num_correct_ensemble += 1
    accuracy_ensemble = num_correct_ensemble / len(trimmed_data)
    print("Accuracy ensemble kNN: " + str(round(accuracy_ensemble * 100, 2)) +
          "%")

    # compare with scikit-learn kNN
    df = pd.DataFrame(trimmed_data)
    X = np.array(df.loc[:, 0:9])  # features
    y = np.array(df.loc[:, 10])  # class label (popularity)

    # split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    knn = KNeighborsClassifier(n_neighbors=8)
    knn.fit(X_train, y_train)
    prediction = knn.predict(X_test)
    print("Scikit-learn accuracy (kNN): " +
          str(round(accuracy_score(y_test, prediction) * 100, 2)) + "%")
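utils.stratified_cross_folds drives all four evaluations above. A hedged sketch of what it might do: group rows by the class label in the last column, then deal each group round-robin into k folds so every fold keeps roughly the same class distribution (the source helper may shuffle first or split differently).

from collections import defaultdict


def stratified_cross_folds(table, k):
    """Split a table into k folds with approximately equal class proportions."""
    by_class = defaultdict(list)
    for row in table:
        by_class[row[-1]].append(row)
    folds = [[] for _ in range(k)]
    i = 0
    for rows in by_class.values():
        for row in rows:
            folds[i % k].append(row)
            i += 1
    return folds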
Example #20
def tdidt(instances, att_indexes, all_att_indexes, att_domains, class_index,
          header, tree):
    '''
    Uses the tdidt algorithm to build a decision tree based on a given set of data
    '''
    print("Current Tree: ", tree)
    print("att_indexes = ", att_indexes)
    if att_indexes == []:
        return
    att_index = entropy(instances, header, att_domains, att_indexes)
    att_indexes.remove(att_index)
    partition = partition_instances(instances, att_index,
                                    att_domains[att_index])
    partition_keys = partition.keys()

    tree.append("Attribute")
    tree.append(header[att_index])
    count = 0
    for i in range(len(att_domains[att_index])):
        print(i)
        tree.append(["Value", att_domains[att_index][count]])
        col = utils.get_column(partition.get(att_domains[att_index][i]),
                               len(header) - 1)
        items_in_col = []
        for item in col:
            if item not in items_in_col:
                items_in_col.append(item)
        if len(items_in_col) == 1:
            tree[2 + count].append([
                "Leaves",
                has_same_class_label(instances, header, att_index, class_index,
                                     col, items_in_col[0])
            ])
        elif len(att_indexes) == 0 and len(col) > 0:
            majority_class = compute_partition_stats(col)
            tree[2 + count].append([
                "Leaves",
                has_same_class_label(instances, header, att_index, class_index,
                                     col, majority_class)
            ])
        elif col == []:
            del tree[2 + count]
            return []
        else:
            tree[2 + count].append([])
            new_branch = [
                tdidt(partition.get(att_domains[att_index][i]), att_indexes,
                      all_att_indexes, att_domains, class_index, header,
                      tree[2 + count][2])
            ]
            if new_branch == [[]]:
                majority_class = compute_partition_stats(col)
                tree[2][2] = [
                    "Leaves",
                    has_same_class_label(instances, header, att_index,
                                         class_index, col, majority_class)
                ]
            else:
                tree[2][2] = new_branch
        count += 1
    return tree
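The entropy call that picks the split attribute is not shown. A hedged sketch, assuming it returns the index (from att_indexes) whose partitions have the smallest weighted average entropy over the class column in the last position (the source version may handle ties or empty partitions differently):

import math


def entropy(instances, header, att_domains, att_indexes):
    """Pick the attribute index that minimizes the weighted entropy of its partitions."""
    best_index, best_enew = None, float("inf")
    class_index = len(header) - 1
    for att_index in att_indexes:
        enew = 0.0
        for value in att_domains[att_index]:
            partition = [row for row in instances if row[att_index] == value]
            if not partition:
                continue
            labels = [row[class_index] for row in partition]
            part_entropy = -sum((labels.count(label) / len(labels))
                                * math.log2(labels.count(label) / len(labels))
                                for label in set(labels))
            enew += (len(partition) / len(instances)) * part_entropy
        if enew < best_enew:
            best_index, best_enew = att_index, enew
    return best_index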
Example #21
def compute_domains(indexes,data):
    domains = {}
    for i in indexes:
        domains[i] = list(set(utils.get_column(data, i)))
    
    return domains
Example #22
        def missing_concept_code_from_target_sheet(rd):

            concept_code = set(utils.get_column(wks, 0))
            concept_code_from_target_sheet = set(utils.get_column(wks_target_mapping, 0))

            missing_in_sheet_1 = concept_code_from_target_sheet - concept_code
            missing_in_sheet_2 = concept_code - concept_code_from_target_sheet


            if len(missing_in_sheet_1)  ==  0:
                print("\n Test for Concept Code missing from 'BIA to SDG Target Mapping' worksheet: Passed. Good Job!")
                rd["sheet"].write(rd["row"], 0, "Test for Concept Code missing from 'BIA to SDG Target Mapping' worksheet")
                rd["sheet"].write(rd["row"], 1, "Passed. Good Job!", rd["green"])
                rd["row"]+=1
                rd["sheet"].write(rd["row"], 0, None)
                rd["row"]+=1
            else:
                print("\nTest for Concept Code missing from 'BIA to SDG Target Mapping' worksheet: Failed")
                rd["sheet"].write(rd["row"], 0, "Test for Concept Code missing from 'BIA to SDG Target Mapping' worksheet")
                rd["sheet"].write(rd["row"], 1, "Failed", rd["red"])
                rd["row"]+=1
                rd["sheet"].write(rd["row"], 0, None)
                rd["row"]+=1

                table = PrettyTable(["Concept Codes"])
                table.border = True
                rd["sheet"].write_row(rd["row"], 0, tuple(["Concept Code"]), rd["bold"])
                rd["row"]+=1

                for x in missing_in_sheet_1:
                    # Fix Row count by adding number of rows used for title
                    table.add_row([x])
                    rd["sheet"].write_row(rd["row"], 0, tuple([x]))
                    rd["row"]+=1
                rd["sheet"].write(rd["row"], 0, None)
                rd["row"]+=1
                rd["sheet"].write(rd["row"], 0, None)
                rd["row"]+=1
                print(table)


            if len(missing_in_sheet_2)  ==  0:
                print("\n Test for Concept Code from this sheet but missing in 'BIA to SDG Target Mapping' worksheet: Passed. Good Job!")
                rd["sheet"].write(rd["row"], 0, "Test for Concept Code from this sheet but missing in 'BIA to SDG Target Mapping' worksheet")
                rd["sheet"].write(rd["row"], 1, "Passed. Good Job!", rd["green"])
                rd["row"]+=1
                rd["sheet"].write(rd["row"], 0, None)
                rd["row"]+=1
            else:
                print("\nTest for Concept Code from this sheet but missing in 'BIA to SDG Target Mapping' worksheet: Failed")
                rd["sheet"].write(rd["row"], 0, "Test for Concept Code from this sheet but missing in 'BIA to SDG Target Mapping' worksheet")
                rd["sheet"].write(rd["row"], 1, "Failed", rd["red"])
                rd["row"]+=1
                rd["sheet"].write(rd["row"], 0, None)
                rd["row"]+=1

                table = PrettyTable(["Concept Codes"])
                table.border = True
                rd["sheet"].write_row(rd["row"], 0, tuple(["Concept Code"]), rd["bold"])
                rd["row"]+=1

                for x in missing_in_sheet_2:
                    # Fix Row count by adding number of rows used for title
                    table.add_row([x])
                    rd["sheet"].write_row(rd["row"], 0, tuple([x]))
                    rd["row"]+=1
                rd["sheet"].write(rd["row"], 0, None)
                rd["row"]+=1
                rd["sheet"].write(rd["row"], 0, None)
                rd["row"]+=1
                print(table)
Example #23
def make_kNN_prediction(test_instance, training_set, k):
    closest = get_k_closest(test_instance, training_set, k)
    class_labels = utils.get_column(closest, -1)
    return np.median(
        class_labels
    )  # the most common of the two class labels will be the median
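get_k_closest is assumed to rank training rows by plain Euclidean distance over the feature columns (everything except the trailing class label); a hedged sketch:

import math


def get_k_closest(test_instance, training_set, k):
    """Return the k training rows nearest to test_instance by Euclidean distance."""
    def dist(row):
        # zip stops at the shorter sequence, so only feature columns are compared
        return math.sqrt(sum((a - b) ** 2 for a, b in zip(row[:-1], test_instance)))

    return sorted(training_set, key=dist)[:k]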
Example #24
import utils
from random import randint

f = open("train.csv", "r")
out = open(utils.kRegressionTrainFile, "w")

for line in f:
    origin_call = utils.get_column(2, line)
    if origin_call is None:
        origin_call = 0
    origin_stand = utils.get_column(3, line)
    if origin_stand is None:
        origin_stand = 0
    taxi_id = utils.get_column(4, line)
    timestamp = utils.get_column(5, line)
    missing_data = utils.get_column(7, line)
    path = utils.get_column(8, line)
    if len(path) < 2 or missing_data:
        continue
    point = path[randint(1, len(path) - 1)]
    out.write("%s,%s,%s,%s,%f,%f,%f,%f,%f,%f\n" %
              (origin_call, origin_stand, taxi_id, timestamp, path[0][0],
               path[0][1], point[0], point[1], path[-1][0], path[-1][1]))
f = open("test.csv", "r")
out = open(utils.kRegressionTestFile, "w")
for line in f:
    id = utils.get_column(0, line)
    origin_call = utils.get_column(2, line)
    if origin_call == "NA":
        origin_call = 0
    origin_stand = utils.get_column(3, line)
Example #25
def data_vis():
    data, header = utils.read_table("combined_data.csv", True)
    x_data, y_data = utils.get_column(data, 3), utils.get_column(data, 5)
    dv.scatter_plot(x_data, y_data, "Poverty Levels v. Crime Rate",
                    "Poverty Levels (%)", "Crime Rate per 100,000 people", 10,
                    "Poverty_v_Crime_graphed.png", 100, True)