Example #1
# These helpers are excerpted from a larger module: the project-level names
# (data, load_ambiguous_annotations_labeled, CountPrinter, get_agreement,
# get_accuracy_progression, PassiveLearner,
# UncertaintySamplingLeastConfidenceActiveLearner, plot_curves) and the
# filename constants are assumed to be defined elsewhere in the project.
import numpy as np
from copy import deepcopy
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.externals import joblib


def get_classifier_agreement_increase_table(target_weight_list,
                                            n_simulations=1000):
    agreement_before = np.zeros(n_simulations)
    agreement_after = np.zeros(n_simulations)
    annotations, labels = load_ambiguous_annotations_labeled(
        annotations_labeled_filename)
    result = ""

    for weight in target_weight_list:
        for i in xrange(n_simulations):
            classifier = joblib.load(classifier_pickle_filename)

            pool_annotations, test_annotations, pool_labels, test_labels = train_test_split(
                annotations, labels, test_size=0.33)

            # validate the initial state of the classifier
            agreement_before[i] = get_agreement(
                classifier, (test_annotations, test_labels))

            # test: target train on the entire pool, validate again
            classifier.target_weight = weight
            classifier.train_target_online(pool_annotations, pool_labels)
            agreement_after[i] = get_agreement(classifier,
                                               (test_annotations, test_labels))

        # append this weight's mean agreement gain to the report
        result += "%s %s\n" % (
            weight, np.mean(agreement_after - agreement_before))
    return result
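

# Usage sketch (an illustration, not from the original source; the weight list
# and the reduced simulation count are placeholder values):
# print(get_classifier_agreement_increase_table([10, 100, 1000],
#                                               n_simulations=50))

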
def get_mturk_pickled_classifier_agreement(classifier_pickle_file,
                                           mturk_vote_file_path,
                                           classifier_class, **kwargs):
    # classifier_class and **kwargs are unused here: the classifier is
    # restored from the pickle instead of being trained
    classifier = joblib.load(classifier_pickle_file)
    mturk_labeled_data = data.load_ambiguous_annotations_labeled(
        mturk_vote_file_path)
    return get_agreement(classifier, mturk_labeled_data)
def crossvalidation(mturk_vote_file_path,
                    classifier_class,
                    n_folds=2,
                    verbose=False,
                    **kwargs):
    # train a classifier on ambiguous annotations
    ambig_annotations, labels = data.load_ambiguous_annotations_labeled(
        mturk_vote_file_path)
    ambig_annotations = np.array(ambig_annotations)
    labels = np.array(labels)

    folds = cross_validation.KFold(len(ambig_annotations),
                                   n_folds=n_folds,
                                   indices=True)

    counter = CountPrinter(n_folds)
    fold_agreements = []

    for train_indices, test_indices in folds:
        if verbose: counter.count()
        classifier = classifier_class(**kwargs)
        classifier.train(ambig_annotations[train_indices],
                         labels[train_indices])
        predicted_group_numbers = classifier.predict(
            ambig_annotations[test_indices])
        # compare against the voted labels of the held-out fold only
        voted_group_numbers = [
            data.Annotation.GROUP_MAPPING[label]
            for label in labels[test_indices]
        ]
        agreement = [
            int(predicted == voted) for predicted, voted in zip(
                predicted_group_numbers, voted_group_numbers)
        ]
        fold_agreements.append(np.mean(agreement))

    return np.mean(fold_agreements)
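

# Usage sketch (an illustration, not from the original source; SomeClassifier
# stands in for whatever classifier class the project provides):
# mean_agreement = crossvalidation('../vote_results_thr0.75-new6.csv',
#                                  SomeClassifier, n_folds=5, verbose=True)

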
def get_mturk_classifier_agreement(ssc_file_path, mturk_vote_file_path,
                                   classifier_class, **kwargs):
    # train a classifier on unambiguous annotations
    unambig_annotations = data.load_unambiguous_annotations(ssc_file_path)
    classifier = classifier_class(**kwargs)
    classifier.train(unambig_annotations)

    # read mturk annotations
    mturk_labeled_data = data.load_ambiguous_annotations_labeled(
        mturk_vote_file_path)

    return get_agreement(classifier, mturk_labeled_data)
def plot_learning_curves(classifier_pickle_filename,
                         target_weight=1000,
                         n_simulations=100,
                         test_size=0.33):

    classifier_loaded = joblib.load(CLASSIFIER_PICKLE_FOLDER +
                                    classifier_pickle_filename)
    annotations_loaded, labels_loaded = load_ambiguous_annotations_labeled(
        ANNOTATIONS_LABELED_FILENAME)

    pool, _, _, _ = train_test_split(annotations_loaded,
                                     labels_loaded,
                                     test_size=test_size)
    n_iterations = len(pool) + 1

    passive_accuracy = np.zeros((n_simulations, n_iterations))
    active_accuracy = np.zeros((n_simulations, n_iterations))

    counter = CountPrinter(n_simulations)

    for run_number in range(n_simulations):
        # work on fresh copies so each simulation run stays independent
        classifier = deepcopy(classifier_loaded)
        annotations = deepcopy(annotations_loaded)
        labels = deepcopy(labels_loaded)

        train_test_set = train_test_split(annotations,
                                          labels,
                                          test_size=test_size)

        passive_accuracy[run_number] = get_accuracy_progression(
            train_test_set, classifier, annotations, labels, target_weight,
            PassiveLearner)
        active_accuracy[run_number] = get_accuracy_progression(
            train_test_set, classifier, annotations, labels, target_weight,
            UncertaintySamplingLeastConfidenceActiveLearner)

        counter.count()

    passive_avg_accuracy_progression = np.mean(passive_accuracy, axis=0)
    active_avg_accuracy_progression = np.mean(active_accuracy, axis=0)

    plot_filename = PLOT_FOLDER + classifier_pickle_filename + '_weight' + str(
        target_weight)

    plot_curves.plot_curves(
        plot_filename,
        title="Average iteration accuracy for %s simulations" % n_simulations,
        PassiveLearner=passive_avg_accuracy_progression,
        ActiveLearner=active_avg_accuracy_progression)
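

# Usage sketch (an illustration, not from the original source; the pickle name
# is a placeholder, resolved against CLASSIFIER_PICKLE_FOLDER):
# plot_learning_curves('some_classifier.pkl', target_weight=1000,
#                      n_simulations=10)

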
def get_mturk_classifier_agreement_label(mturk_vote_file_path,
                                         classifier_class, **kwargs):
    # train a classifier on ambiguous annotations
    ambig_annotations, labels = data.load_ambiguous_annotations_labeled(
        mturk_vote_file_path)

    classifier = classifier_class(**kwargs)
    classifier.train(ambig_annotations, labels)

    # classify the same annotations the classifier was trained on and
    # output the (resubstitution) agreement
    predicted_group_numbers = classifier.predict(ambig_annotations)
    voted_group_numbers = [
        data.Annotation.GROUP_MAPPING[label] for label in labels
    ]
    agreement = [
        int(predicted == voted) for predicted, voted in zip(
            predicted_group_numbers, voted_group_numbers)
    ]

    return np.mean(agreement)
Example #15
# This excerpt assumes the project-level names (load_ambiguous_annotations_labeled,
# get_accuracy_progression, PassiveLearner,
# UncertaintySamplingLeastConfidenceActiveLearner and the filename globals)
# are defined elsewhere in the project.
import itertools

import numpy as np
from sklearn.externals import joblib


def diff_iter(seq):
  # generator of consecutive differences: seq[i + 1] - seq[i]
  return (y - x for x, y in itertools.izip(
      itertools.islice(seq, 0, len(seq) - 1),
      itertools.islice(seq, 1, len(seq))))

def format_float_list(seq, sep=" "):
  result = ""
  for item in seq:
    result += "%.2f" % item
    result += sep
  return result
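
# A quick self-contained check of the two helpers above (not from the original
# source): the consecutive differences of [1, 3, 6, 10] are 2, 3, 4; note that
# format_float_list leaves a trailing separator.
print(format_float_list(diff_iter([1, 3, 6, 10])))  # "2.00 3.00 4.00 "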

classifier = joblib.load(classifier_pickle_filename)

annotations, labels = load_ambiguous_annotations_labeled(annotations_labeled_filename)

N_SIMULATIONS = 100
accuracy_diffs = np.zeros((2, N_SIMULATIONS))
accuracy_diff_gains = np.zeros(N_SIMULATIONS)


for i in range(N_SIMULATIONS):
  accuracy_progression_passive = get_accuracy_progression(classifier, annotations, labels, 1000, PassiveLearner)
  accuracy_diff_passive = accuracy_progression_passive[-1] - accuracy_progression_passive[0]
  
  accuracy_progression_active = get_accuracy_progression(classifier, annotations, labels, 1000, UncertaintySamplingLeastConfidenceActiveLearner)
  accuracy_diff_active = accuracy_progression_active[-1] - accuracy_progression_active[0]

  accuracy_diff_gains[i] = accuracy_diff_active - accuracy_diff_passive
Example #16
#!/usr/bin/env python
from data import load_ambiguous_annotations_labeled
from mturk_classifier_agreement import get_agreement
from sklearn.externals.joblib import load, Parallel, delayed
from train_and_serialize import train_and_serialize
from copy import deepcopy
from sklearn.cross_validation import train_test_split
import numpy as np

MTURK_VOTE_FILE = '../vote_results_thr0.75-new6.csv'
annotations, labels = load_ambiguous_annotations_labeled(MTURK_VOTE_FILE)

def get_accuracy_gain(loaded_classifier):
  '''Measure agreement on a RANDOM train/test split of the data before and
  after target training; returns (accuracy_after, accuracy_before).
  '''
  classifier = deepcopy(loaded_classifier)
  pool_annotations, test_annotations, pool_labels, test_labels = train_test_split(
        annotations, labels, test_size = 0.33)

  accuracy_before = get_agreement(classifier, (test_annotations, test_labels))
  classifier.train_target_online(pool_annotations, pool_labels)
  accuracy_after = get_agreement(classifier, (test_annotations, test_labels))
  
  return (accuracy_after, accuracy_before)
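
# Usage sketch (an illustration, not from the original source; the pickle
# path is a placeholder):
# accuracy_after, accuracy_before = get_accuracy_gain(load('classifier.pkl'))
# print(accuracy_after - accuracy_before)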

def get_mean_accuracy_gain(classifier_pickle_file, n_runs, **kwargs):
  loaded_classifier = load(classifier_pickle_file)
  # override classifier attributes (e.g. target_weight) with the given kwargs
  for k, v in kwargs.items():
    loaded_classifier.__dict__[k] = v

  accuracies_before = np.zeros(n_runs)