Example No. 1
def one_classification_prediction(directory, log_file, user_id, class_counts,
                                  verbose):
    data_directory = bias_util.data_directory
    data_file_name = bias_util.data_file_name

    all_logs, attr_logs, item_logs, help_logs, cat_logs = bias_util.recreate_logs(
        directory, log_file)
    dataset, attr_map = bias_util.read_data(data_directory, data_file_name)
    classification, decisions_labels, decisions_cat, decisions_help = bias_util.get_classifications_and_decisions(
        all_logs, dataset)

    X = []
    Y = []

    # decisions_labels of the form (index, user classification, actual classification, player id)
    prev = -1
    cur = -1
    for i in range(0, len(decisions_labels)):
        prev = cur
        cur = decisions_labels[i][1]

        if (cur not in class_counts):
            class_counts[cur] = 1
        else:
            class_counts[cur] += 1

        if (prev != -1 and
                cur != -1):  # and prev != cur): # create a training instance
            # comment prev != cur condition to allow repetitions
            X.append([bias_util.pos_to_num_map[prev]])
            Y.append(bias_util.pos_to_num_map[cur])

    return X, Y, class_counts
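
The function above simply converts the ordered sequence of user classifications into (previous label, next label) training pairs and tallies class counts. A self-contained sketch of that idea, using a made-up label map in place of bias_util.pos_to_num_map (the real project defines its own mapping):

# Hypothetical stand-in for bias_util.pos_to_num_map.
pos_to_num_map = {'PG': 0, 'SG': 1, 'SF': 2, 'PF': 3, 'C': 4}

def label_transitions(labels, label_map):
    """Build (previous label -> next label) training pairs from an ordered label sequence."""
    X, Y, counts = [], [], {}
    prev = None
    for cur in labels:
        counts[cur] = counts.get(cur, 0) + 1
        if prev is not None:
            X.append([label_map[prev]])  # feature: the previous classification
            Y.append(label_map[cur])     # target: the current classification
        prev = cur
    return X, Y, counts

X, Y, counts = label_transitions(['PG', 'PG', 'SG', 'C'], pos_to_num_map)
print(X, Y, counts)  # [[0], [0], [1]] [0, 1, 4] {'PG': 2, 'SG': 1, 'C': 1}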
Example No. 2
def get_id_confusion_matrix(logs, dataset):
    id_confusion = np.zeros((7, 7))
    all_data = dict()
    classification, decisions_labels, decisions_cat, decisions_help = bias_util.get_classifications_and_decisions(
        logs, dataset)

    for i in range(0, len(dataset)):
        cur_data = dataset[i].get_full_map()
        cur_id = cur_data['Name'].replace('Player ', '')
        actual_pos = cur_data['Position']
        if (cur_id in classification):
            user_pos = classification[cur_id]
            id_confusion[bias_util.pos_to_num_map[user_pos],
                         bias_util.pos_to_num_map[actual_pos]] += 1
            key = 'user:' + user_pos + ',actual:' + actual_pos
            if key in all_data:
                all_data[key].append(cur_data)
            else:
                all_data[key] = [cur_data]

    return id_confusion, bias_util.pos_to_num_map, all_data
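
The matrix above is indexed [user position, actual position]. The same bookkeeping can be written with plain numpy; the label map below is illustrative only, not the project's pos_to_num_map:

import numpy as np

# Illustrative label map; the project uses bias_util.pos_to_num_map (and a 7x7 matrix above).
label_to_idx = {'PG': 0, 'SG': 1, 'SF': 2, 'PF': 3, 'C': 4}

def confusion_matrix(pairs, label_to_idx):
    """pairs: iterable of (user_label, actual_label) tuples; rows = user, columns = actual."""
    n = len(label_to_idx)
    cm = np.zeros((n, n), dtype=int)
    for user_label, actual_label in pairs:
        cm[label_to_idx[user_label], label_to_idx[actual_label]] += 1
    return cm

print(confusion_matrix([('PG', 'PG'), ('PG', 'SG'), ('C', 'C')], label_to_idx))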
Example No. 3
def write_svm_results(directory, file_name, log_file, to_plot, fig_num,
                      verbose):
    if (verbose):
        print('Writing and Plotting SVM Data: ', file_name)

    data_directory = bias_util.data_directory
    data_file_name = bias_util.data_file_name

    all_logs, attr_logs, item_logs, help_logs, cat_logs = bias_util.recreate_logs(
        directory, log_file)
    logs = item_logs
    dataset, attr_map = bias_util.read_data(data_directory, data_file_name)
    classification, decisions_labels, decisions_cat, decisions_help = bias_util.get_classifications_and_decisions(
        logs, dataset)
    all_data = dict()
    all_data['classifications'] = classification

    x_data = []
    y_data = []
    data = []

    features = list(bias_util.get_bball_player(
        dataset,
        list(classification.keys())[0]).get_map().keys())
    features.remove('Name')
    features = sorted(features)

    for key in classification.keys():
        cur_player = bias_util.get_bball_player(dataset, key)
        cur_map = cur_player.get_map()
        cur_map['*Classification'] = classification[key]
        data.append(cur_map)

        cur_x = []
        for i in range(0, len(features)):
            cur_x.append(cur_map[features[i]])
        cur_x = [float(x) for x in cur_x]
        x_data.append(cur_x)
        y_data.append(bias_util.pos_to_num_map[classification[key]])

    svm_weights, svm_classes = get_svm_weights(x_data, y_data)
    weights_map = dict()
    i = 0
    for j in range(0, len(svm_classes)):
        for k in range(j + 1, len(svm_classes)):
            key = bias_util.num_to_pos_map[
                j] + ' - ' + bias_util.num_to_pos_map[k]
            value = svm_weights[i]
            weights_map[key] = value
            i += 1

    all_data['features'] = features
    all_data['weights'] = weights_map
    all_data['classifications'] = data

    if not os.path.exists(directory):
        os.makedirs(directory)
    f_out = open(directory + file_name, 'w+')
    f_out.write('{')
    f_out.write('"features":' + json.dumps(all_data['features']) + ',')
    f_out.write('"weights":' + json.dumps(all_data['weights']) + ',')
    f_out.write('"classifications":' + json.dumps(all_data['classifications']))
    f_out.write('}')
    f_out.close()

    if (to_plot):
        for key in weights_map.keys():
            plot_svm(
                features, weights_map[key], 'SVM Feature Weights: ' + key,
                'Feature', 'Weight', directory.replace('/logs/', '/plots/'),
                file_name.replace('.json',
                                  '.png').replace('svm', 'svm_' + key),
                fig_num, verbose)
            fig_num += 1

    return svm_weights
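
get_svm_weights is not shown on this page. A plausible sketch using scikit-learn's linear SVC is below; with a linear kernel, coef_ holds one weight vector per one-vs-one class pair, in the same 0-vs-1, 0-vs-2, ..., 1-vs-2 order that the j/k loop above assumes. Treat this as an assumption about the helper, not its actual implementation:

import numpy as np
from sklearn.svm import SVC

def get_svm_weights(x_data, y_data):
    """Fit a linear one-vs-one SVM; return (per-pair weight vectors, sorted class labels)."""
    clf = SVC(kernel='linear')  # multiclass SVC trains one-vs-one classifiers internally
    clf.fit(np.array(x_data, dtype=float), np.array(y_data))
    # coef_ has shape (n_classes * (n_classes - 1) / 2, n_features)
    return clf.coef_, clf.classes_

# Tiny synthetic check: 2 features, 3 classes -> 3 pairwise weight vectors
weights, classes = get_svm_weights(
    [[0, 0], [1, 0], [0, 1], [1, 1], [2, 2], [2, 3]], [0, 0, 1, 1, 2, 2])
print(weights.shape, classes)  # (3, 2) [0 1 2]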
Example No. 4
def write_classification_accuracy(directory, file_name, log_file, fig_num,
                                  verbose):
    print('Writing and Plotting Accuracy Over Time: ', file_name)

    data_directory = bias_util.data_directory
    data_file_name = bias_util.data_file_name

    all_logs, attr_logs, item_logs, help_logs, cat_logs = bias_util.recreate_logs(
        directory, log_file)
    logs = item_logs
    dataset, attr_map = bias_util.read_data(data_directory, data_file_name)
    classification, decisions_labels, decisions_cat, decisions_help = bias_util.get_classifications_and_decisions(
        logs, dataset)

    total_labeled = 0
    total_correct = 0
    decision_points = np.arange(1, len(all_logs) + 1)
    accuracy = [-1] * len(all_logs)
    current_labels = dict()
    correct_labels = dict()

    for i in range(0, len(decisions_labels)):
        cur = decisions_labels[i]
        cur_id = cur[3]
        correct_labels[cur_id] = cur[2]

        if ((cur_id not in current_labels and cur[1] != 'Un-Assign') or
            (cur_id in current_labels and current_labels[cur_id] == 'Un-Assign'
             and cur[1] != 'Un-Assign')):
            total_labeled += 1
        elif (cur_id in current_labels and cur[1] == 'Un-Assign'
              and current_labels[cur_id] != 'Un-Assign'):
            total_labeled -= 1

        if (cur_id not in current_labels and cur[1] == correct_labels[cur_id]):
            total_correct += 1
        elif (cur_id in current_labels
              and current_labels[cur_id] != correct_labels[cur_id]
              and cur[1] == correct_labels[cur_id]):
            total_correct += 1

        if (cur_id in current_labels
                and current_labels[cur_id] == correct_labels[cur_id]
                and cur[1] != correct_labels[cur_id]):
            total_correct -= 1

        if (total_labeled != 0):
            accuracy[cur[0]] = total_correct / float(total_labeled)
        else:
            accuracy[cur[0]] = 0
        current_labels[cur_id] = cur[1]
    if (len(decisions_labels) < 1):
        first_decision = -1
    else:
        first_decision = decisions_labels[0][0]
    accuracy = bias_util.remove_defaults(accuracy, first_decision)
    accuracy = bias_util.forward_fill(accuracy)

    if not os.path.exists(directory):
        os.makedirs(directory)
    f_out = open(directory + file_name, 'w+')
    f_out.write('[')
    for i in range(0, len(decisions_labels)):
        f_out.write('{')
        f_out.write('"interaction_number":"' + str(decisions_labels[i][0]) +
                    '",')
        f_out.write('"data_point":"' + str(decisions_labels[i][3]) + '",')
        f_out.write('"actual_class":"' + str(decisions_labels[i][2]) + '",')
        f_out.write('"user_class":"' + str(decisions_labels[i][1]) + '",')
        f_out.write('"current_accuracy":"' +
                    str(accuracy[decisions_labels[i][0]]) + '"')
        f_out.write('}')
        if (i != len(decisions_labels) - 1):
            f_out.write(',')
    f_out.write(']')
    f_out.close()

    plot_classification_accuracy(decision_points, accuracy,
                                 'Accuracy Over Time', 'Interactions',
                                 'Accuracy',
                                 directory.replace('/logs/', '/plots/'),
                                 file_name.replace('.json', '.png'),
                                 decisions_labels, fig_num, verbose)
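
bias_util.remove_defaults and bias_util.forward_fill are not shown here; the intent is to blank out the -1 placeholders before the first decision and then carry the last computed accuracy forward across interactions with no labeling decision. A minimal sketch of the forward-fill idea, assuming -1 marks 'no value yet' (the real helper may differ):

def forward_fill(values, default=0.0):
    """Replace -1 placeholders with the most recent real value (or a default before the first one)."""
    filled, last = [], default
    for v in values:
        if v != -1:
            last = v
        filled.append(last)
    return filled

print(forward_fill([-1, -1, 0.5, -1, 0.67, -1]))  # [0.0, 0.0, 0.5, 0.5, 0.67, 0.67]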
Example No. 5
    def simulate_bias_computation(self, plot_directory, time, interaction_types, num_quantiles, min_weight, max_weight, window_method, rolling_dist, marks, fig_num, verbose):
        dpc_metric = [-1] * len(self.all_logs)
        dpd_metric = [-1] * len(self.all_logs)
        ac_metric = dict()
        ad_metric = dict()
        awc_metric = dict()
        awd_metric = dict()
        for key in self.attr_value_map.keys(): 
            ac_metric[key] = [-1] * len(self.all_logs)
            ad_metric[key] = [-1] * len(self.all_logs)
            awc_metric[key] = [-1] * len(self.all_logs)
            awd_metric[key] = [-1] * len(self.all_logs)
        
        # classifications is a map from (id, final classification)
        # decisions_labels is a list of tuples (log index, user classification, actual classification)
        
        classifications, decisions_labels, decisions_cat, decisions_help = bias_util.get_classifications_and_decisions(self.all_logs, self.dataset)
        label_indices = [int(tup[0]) for tup in decisions_labels]
        if (len(label_indices) > 0 and label_indices[-1] != len(self.all_logs) - 1):
            label_indices.append(len(self.all_logs) - 1)
        cat_indices = [int(tup[0]) for tup in decisions_cat]
        if (len(cat_indices) > 0 and cat_indices[-1] != len(self.all_logs) - 1):
            cat_indices.append(len(self.all_logs) - 1)

        if not os.path.exists(self.directory):
            os.makedirs(self.directory)
        f_out = open(self.directory + self.out_file_name, 'w+')
        f_out.write('[')
        last_line = None
        last_written_line = None
        
        if (window_method not in bias_util.window_methods):
            print('**Error: Invalid window method.')
            sys.exit(0)
        elif (window_method == 'fixed' and rolling_dist > 0):
            start_iter = rolling_dist
        elif (window_method == 'classification_v1' and len(decisions_labels) > 0):
            start_iter = decisions_labels[0][0]
        elif (window_method == 'classification_v2' and len(decisions_labels) > 0):
            start_iter = 0
        else: 
            start_iter = 1
            
        prev_decision = 0
            
            
        # iterate through the logs
        for i in range(start_iter, len(self.all_logs)):
            if (verbose):
                print('Interaction', i)
            if (last_line is not None):
                f_out.write(last_line + ',')
                last_written_line = last_line
                last_line = None
            line_contents = []

            # figure out which set of logs to use for this iteration based on the window method
            attr_log_set, item_log_set, prev_decision = bias_util.get_logs_by_window_method(window_method, self.all_logs, self.item_logs, self.attr_logs, i, rolling_dist, label_indices, cat_indices, prev_decision)
            if (window_method == 'classification_v1' and i not in label_indices): 
                continue
            if (window_method == 'category_v1' and i not in cat_indices):
                continue
            
            # compute the metrics that use data item logs
            if (len(item_log_set) > 0):
                cur_dpc = self.compute_data_point_coverage(item_log_set, time, interaction_types, verbose)
                self.dpc_logs.append(cur_dpc)
                if (cur_dpc is None):
                    dpc_metric[i] = -1
                else:
                    dpc_metric[i] = cur_dpc['metric_level']
                line_contents.append({'metric': 'data_point_coverage', 'log': cur_dpc})
                
                cur_dpd = self.compute_data_point_distribution(item_log_set, time, interaction_types, verbose)
                self.dpd_logs.append(cur_dpd)
                if (cur_dpd is None):
                    dpd_metric[i] = -1
                else:
                    dpd_metric[i] = cur_dpd['metric_level']
                line_contents.append({'metric': 'data_point_distribution', 'log': cur_dpd})
                
                cur_ac = self.compute_attribute_coverage(item_log_set, time, interaction_types, num_quantiles, verbose)
                self.ac_logs.append(cur_ac)
                if (cur_ac is not None):
                    for attribute in cur_ac['info']['attribute_vector'].keys():
                        cur_metric = cur_ac['info']['attribute_vector'][attribute]['metric_level']
                        ac_metric[attribute][i] = cur_metric
                line_contents.append({'metric': 'attribute_coverage', 'log': cur_ac})
                
                cur_ad = self.compute_attribute_distribution(item_log_set, time, interaction_types, verbose)
                self.ad_logs.append(cur_ad)
                if (cur_ad is not None):
                    for attribute in cur_ad['info']['attribute_vector'].keys():
                        cur_metric = cur_ad['info']['attribute_vector'][attribute]['metric_level']
                        ad_metric[attribute][i] = cur_metric
                line_contents.append({'metric': 'attribute_distribution', 'log': cur_ad})
                    
            # compute the metrics that use attribute logs
            if (len(attr_log_set) > 0):
                cur_awc = self.compute_attribute_weight_coverage(attr_log_set, time, interaction_types, num_quantiles, min_weight, max_weight, verbose)
                self.awc_logs.append(cur_awc)
                if (cur_awc is not None):
                    for attribute in cur_awc['info']['attribute_vector'].keys():
                        cur_metric = cur_awc['info']['attribute_vector'][attribute]['metric_level']
                        awc_metric[attribute][i] = cur_metric
                line_contents.append({'metric': 'attribute_weight_coverage', 'log': cur_awc})
                
                cur_awd = self.compute_attribute_weight_distribution(attr_log_set, time, interaction_types, min_weight, max_weight, verbose)
                self.awd_logs.append(cur_awd)
                if (cur_awd is not None):
                    for attribute in cur_awd['info']['attribute_vector'].keys():
                        cur_metric = cur_awd['info']['attribute_vector'][attribute]['metric_level']
                        awd_metric[attribute][i] = cur_metric
                line_contents.append({'metric': 'attribute_weight_distribution', 'log': cur_awd})
                    
            if (len(line_contents) > 0):
                line = ''
                for j in range(0, len(line_contents)):
                    cur_log_info = line_contents[j]
                    line = line + json.dumps(cur_log_info['metric']) + ':' + json.dumps(cur_log_info['log'])
                    if (j != len(line_contents) - 1):
                        line = line + ','
                        
                num_interactions = i
                if (window_method == 'fixed' and rolling_dist > -1): 
                    num_interactions = rolling_dist
                elif (window_method == 'classification_v1' and label_indices.index(i) > 0):
                    num_interactions = i - label_indices[label_indices.index(i) - 1]
                elif (window_method == 'classification_v2'):
                    num_interactions = i - prev_decision + 1
                last_line = '{"computing_at_interaction":"' + str(i) + '/' + str(len(self.all_logs)) + '","num_interactions":' + str(num_interactions) + ',"window_method":"' + window_method + '",'
                if (window_method == 'fixed'):
                    last_line += '"rolling_distance":' + str(rolling_dist) + ','
                if (window_method == 'classification_v1' or window_method == 'classification_v2'):
                    last_line += '"label_indices":' + str(label_indices) + ','
                last_line += '"bias_metrics":{' + line + '}}'
        if (last_line is None):
            # the last entry was already written with a trailing comma;
            # strip that comma so the output stays valid JSON
            if (last_written_line is not None):
                f_out.close()
                f_out = open(self.directory + self.out_file_name, 'rb+')
                f_out.seek(-1, os.SEEK_END)
                f_out.truncate()
                f_out.close()
                f_out = open(self.directory + self.out_file_name, 'a')
        else:
            f_out.write(last_line)
        f_out.write(']')
        f_out.close()
        
        # get the average computed value for each metric
        avg_values = dict()
        avg_values['DPC'] = np.mean(np.array([x for x in dpc_metric if x > -1]).astype(float))
        avg_values['DPD'] = np.mean(np.array([x for x in dpd_metric if x > -1]).astype(float))
        for key in self.attr_value_map.keys():
            avg_values['AC_' + key.replace(' ', '')] = np.mean(np.array([x for x in ac_metric[key] if x > -1]).astype(float))
            avg_values['AD_' + key.replace(' ', '')] = np.mean(np.array([x for x in ad_metric[key] if x > -1]).astype(float))
            avg_values['AWC_' + key.replace(' ', '')] = np.mean(np.array([x for x in awc_metric[key] if x > -1]).astype(float))
            avg_values['AWD_' + key.replace(' ', '')] = np.mean(np.array([x for x in awd_metric[key] if x > -1]).astype(float))
        
#        if (verbose):
#            print '**** For interaction logs: ', self.in_file_name
#            print '> Average values: ', avg_values
        
        # fill in the -1 default values in the metric arrays
        first_decision_point = next(idx for idx, x in enumerate(dpc_metric) if x != -1)
        dpc_metric = bias_util.remove_defaults(dpc_metric, first_decision_point)
        dpd_metric = bias_util.remove_defaults(dpd_metric, first_decision_point)
        for attribute in self.attr_value_map.keys():
            ac_metric[attribute] = bias_util.remove_defaults(ac_metric[attribute], first_decision_point)
            ad_metric[attribute] = bias_util.remove_defaults(ad_metric[attribute], first_decision_point)
            awc_metric[attribute] = bias_util.remove_defaults(awc_metric[attribute], first_decision_point)
            awd_metric[attribute] = bias_util.remove_defaults(awd_metric[attribute], first_decision_point)
        
        # plot all of the results
        x_label = 'Interactions'
        y_label = 'Metric Value'
        x_values = np.arange(1, len(self.all_logs) + 1)
        user_id = self.in_file_name.replace('interactions_', '').replace('.json', '')
        marker_decisions = []
        if (marks == 'classifications'):
            marker_decisions = decisions_labels
        elif (marks == 'categories'):
            marker_decisions = decisions_cat
        
        # plot the data point metrics
        bias_util.plot_metric(x_values, dpc_metric, 'DPC - Pilot', x_label, y_label, plot_directory, user_id + '_dpc.png', marker_decisions, marks, fig_num, verbose)
        bias_util.plot_metric(x_values, dpd_metric, 'DPD - Pilot', x_label, y_label, plot_directory, user_id + '_dpd.png', marker_decisions, marks, fig_num + 1, verbose)
        
        # plot one series of subplots plots for each attribute and attribute weight metric
        attributes = ac_metric.keys()
        attr_x_values = []
        attr_y_values = []
        titles = []
        for attribute in attributes:
            attr_x_values.append(x_values)
            attr_y_values.append(ac_metric[attribute])
            titles.append('AC (' + attribute + ') - Pilot')
        bias_util.plot_metric_with_subplot(attr_x_values, attr_y_values, titles, x_label, y_label, plot_directory, user_id + '_ac' + '.png', marker_decisions, marks, fig_num + 2, verbose)
            
        attr_y_values = []
        titles = []
        for attribute in attributes:
            attr_y_values.append(ad_metric[attribute])
            titles.append('AD (' + attribute + ') - Pilot')
        bias_util.plot_metric_with_subplot(attr_x_values, attr_y_values, titles, x_label, y_label, plot_directory, user_id + '_ad' + '.png', marker_decisions, marks, fig_num + 3, verbose)
            
        attr_y_values = []
        titles = []
        for attribute in attributes:
            attr_y_values.append(awc_metric[attribute])
            titles.append('AWC (' + attribute + ') - Pilot')
        bias_util.plot_metric_with_subplot(attr_x_values, attr_y_values, titles, x_label, y_label, plot_directory, user_id + '_awc' + '.png', marker_decisions, marks, fig_num + 4, verbose)
            
        attr_y_values = []
        titles = []
        for attribute in attributes:
            attr_y_values.append(awd_metric[attribute])
            titles.append('AWD (' + attribute + ') - Pilot')
        bias_util.plot_metric_with_subplot(attr_x_values, attr_y_values, titles, x_label, y_label, plot_directory, user_id + '_awd' + '.png', marker_decisions, marks, fig_num + 5, verbose)
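
The avg_values block in the middle of this method averages each metric series while skipping the -1 placeholders. A tiny standalone equivalent of that step:

import numpy as np

def mean_ignoring_defaults(series, default=-1):
    """Average a metric series, skipping the placeholder values used before a metric is computed."""
    vals = np.array([x for x in series if x != default], dtype=float)
    return float(np.mean(vals)) if vals.size else float('nan')

print(mean_ignoring_defaults([-1, 0.2, -1, 0.4]))  # 0.3 (up to float rounding)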