def main():
    """Plot the experiment2b results, running the experiment first if its
    results collection does not exist yet.

    Saves three plots under plots/: machine performance, labelling accuracy
    and number of training examples, each against the labelled fraction.
    """
    db = getMACResultsDB()
    experiment_name = 'experiment2b'
    accuracies = []
    labelling_accuracies = []
    number_training_examples = []
    errors = []
    fractions = []
    # Explicit membership test instead of the original try/assert/except:
    # `assert` statements are stripped under `python -O`, which would have
    # silently broken the cache check.
    if experiment_name in db.collection_names():
        # Cached results exist: read every stored document.
        for doc in db[experiment_name].find():
            print(doc)
            accuracies.append(doc['accuracy'])
            labelling_accuracies.append(doc['labelling accuracy'])
            errors.append(doc['error'])
            fractions.append(doc['fraction'])
            number_training_examples.append(doc['m'])
    else:
        # No cached results: run the experiment now.
        # NOTE(review): this path leaves labelling_accuracies and
        # number_training_examples empty, so the corresponding plots show
        # only the benchmark lines — preserved from the original behaviour.
        fractions, results = experiment2b()
        for result in results:
            accuracies.append(result[0])
            errors.append(result[1])

    plotMachineAccuracy(accuracies, errors, fractions,
                        'plots/experiment2b_machine_performance')

    plotLabellingAccuracy(labelling_accuracies, fractions,
                          'plots/experiment2b_labelling_accuracy')

    plotNumberTrainingExamples(number_training_examples, fractions,
                               'plots/experiment2b_number_examples')
# --- Example 2 ---
def experiment2d():
    """Experiment 2d: run experiment 2a's labelling under injected noise.

    The heavy lifting (trials, DB writes, plots) happens inside
    experiment2aNoise, which writes to the 'experiment2d' collection itself.
    """

    db = getMACResultsDB()
    # NOTE(review): `collection` is fetched but never used in this snippet —
    # presumably a leftover; experiment2aNoise opens the collection itself.
    collection = db['experiment2d']

    experiment2aNoise('experiment2d_2a_noise')
    """
def plotNumberTrainingExamples(number_training_examples, fractions, filename, \
  experiment='experiment2a'):
    """Scatter the training-set sizes against the majority-class cluster
    proportion, draw the full-training-set size as a benchmark line, and
    save the figure as both PDF and PNG."""

    db = getMACResultsDB()
    # Benchmark: the 'm' field of the stored gold-benchmark document.
    for doc in db[experiment].find({'name': 'gold benchmark'}):
        gold_benchmark = doc['m']

    fig, ax = plt.subplots()

    benchmark_x = np.arange(-0.02, 1.03, 0.01)
    ax.plot(benchmark_x, np.ones((105,)) * gold_benchmark,
            color='#726DA8', label='total training examples')

    for idx, frac in enumerate(fractions):
        ax.plot(frac, number_training_examples[idx], 'o', color='#B8336A')

    ax.set_ylim(0, 10200)
    ax.set_xlim(-0.02, 1.02)
    ax.set_xlabel('majority class cluster proportion')
    ax.set_ylabel('number of training examples')
    plt.legend(loc='lower left')
    plt.savefig(filename + '.pdf')
    plt.savefig(filename + '.png')
def plotLabellingAccuracy(accuracies, fractions, filename, \
  experiment='experiment2a'):
    """Scatter labelling accuracy against the majority-class cluster
    proportion, with gold and majority-class benchmark lines, and save the
    figure as both PDF and PNG."""

    db = getMACResultsDB()

    # Benchmark levels pulled from the stored experiment documents.
    for doc in db[experiment].find({'name': 'gold benchmark'}):
        gold_benchmark = doc['labelling accuracy']
    for doc in db[experiment].find({'name': 'majority class label assignment'}):
        majority_class_benchmark = doc['labelling accuracy']

    fig, ax = plt.subplots()

    benchmark_x = np.arange(-0.02, 1.03, 0.01)
    level = np.ones((105,))
    ax.plot(benchmark_x, level * gold_benchmark,
            color='#726DA8', label='gold benchmark')
    ax.plot(benchmark_x, level * majority_class_benchmark,
            color='#A0D2DB', label='majority class benchmark')

    for idx, frac in enumerate(fractions):
        ax.plot(frac, accuracies[idx], 'o', color='#B8336A')

    ax.set_ylim(0, 102)
    ax.set_xlim(-0.02, 1.02)
    ax.set_xlabel('majority class cluster proportion')
    ax.set_ylabel('labelling accuracy')
    plt.legend(loc='lower right')
    plt.savefig(filename + '.pdf')
    plt.savefig(filename + '.png')
# --- Example 5 ---
def main():
    """Plot the experiment2c results, running the experiment first if its
    results collection does not exist yet.

    Saves machine-performance, labelling-accuracy and majority-class-recovery
    plots under plots/.
    """
    db = getMACResultsDB()
    experiment_name = 'experiment2c'
    accuracies = []
    labelling_accuracies = []
    recoveries = []
    errors = []
    fractions = []
    # Explicit membership test instead of the original try/assert/except:
    # `assert` statements are stripped under `python -O`, which would have
    # silently broken the cache check.
    if experiment_name in db.collection_names():
        # Cached results exist: read every stored document.
        for doc in db[experiment_name].find():
            print(doc)
            accuracies.append(doc['accuracy'])
            labelling_accuracies.append(doc['labelling accuracy'])
            errors.append(doc['error'])
            fractions.append(doc['fraction sampled'])
            recoveries.append(doc['cluster majority class recovery rate'])
    else:
        # No cached results: run the experiment now.
        # NOTE(review): this path leaves labelling_accuracies empty, so the
        # labelling-accuracy plot shows only the benchmark lines — preserved
        # from the original behaviour.
        fractions, results, recovery = experiment2c()
        for i, result in enumerate(results):
            accuracies.append(result[0])
            errors.append(result[1])
            recoveries.append(recovery[i])

    plotMachineAccuracy(accuracies, errors, fractions,
                        'plots/experiment2c_machine_performance')

    plotLabellingAccuracy(labelling_accuracies, fractions,
                          'plots/experiment2c_labelling_accuracy')

    plotMajorityClassRecovery(fractions, recoveries,
                              'plots/experiment2c_majority_class_recovery')
# --- Example 6 ---
def experiment2c():
    """Experiment 2c: label MNIST clusters from a per-cluster subsample,
    train at several subsample fractions, and store each run's results in
    the 'experiment2c' collection.

    Returns:
        (fractions, results, recovery): results[i] is the runTrials tuple
        for fractions[i]; recovery[i] is the fraction of cluster majority
        classes recovered by the subsample (in [0, 1]).
    """
    image_dim = 28    # unused in this function
    n_classes = 10
    n_trials = 5
    epochs = 20
    batch_size = 500  # unused in this function

    db = getMACResultsDB()
    collection = db['experiment2c']

    x_train, y_train, x_train_flattened, x_test, y_test = loadMNIST()

    y_train = np_utils.to_categorical(y_train, n_classes)
    y_test = np_utils.to_categorical(y_test, n_classes)

    clustering = clusterData(x_train_flattened)

    fractions = [0.00, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0]

    recovery = []
    results = []
    for fraction in fractions:
        x_labelled, labels, labelled_indices, unlabelled_indices, \
        sampled_dominant_cluster_classes, dominant_cluster_classes = \
          experiment2cLabelleingMethod(x_train, y_train, clustering,
                                       n_classes, fraction)

        result = \
          runTrials(x_labelled, labels, x_test, y_test, n_trials, n_classes, \
                    data='mnist_experiment2c_fraction%.2lf'%(fraction), \
                    epochs=epochs)

        results.append(result)

        # Computed once and reused both in `recovery` and in the stored
        # document (the original computed this identical quantity twice).
        recovery_rate = calculateLabellingAccuracy(
            np_utils.to_categorical(dominant_cluster_classes),
            np_utils.to_categorical(sampled_dominant_cluster_classes)) / 100.0
        recovery.append(recovery_rate)

        doc = {
              'name': 'majority class recovery by cluster subsample',
              'm': labels.shape[0],
              'fraction sampled': fraction,
              'accuracy': result[0],
              'error': result[1],
              'trials accuracy': result[2],
              'labelling accuracy': \
                calculateLabellingAccuracy(y_train[labelled_indices], labels),
              'training set class distribution': \
                calculateClassDistribution(labels).tolist(),
              'cluster majority class recovery rate': recovery_rate
            }

        collection.insert_one(doc)

    return fractions, results, recovery
# --- Example 7 ---
def main():
    """Print cached experiment2a results, or run the experiment if absent."""
    db = getMACResultsDB()
    experiment_name = 'experiment2a'
    # Explicit membership test instead of the original try/assert/except:
    # `assert` statements are stripped under `python -O`, which would have
    # silently broken the cache check.
    if experiment_name in db.collection_names():
        for doc in db[experiment_name].find():
            print(doc)
    else:
        experiment2a()
def experiment2b():
    """Experiment 2b: majority-class label assignment over a range of
    cluster fractions on MNIST; stores each run in the 'experiment2b'
    collection.

    Returns:
        (fractions, results): results[i] is the runTrials tuple for
        fractions[i].
    """

    image_dim = 28
    n_classes = 10
    n_trials = 5
    epochs = 20
    batch_size = 500

    db = getMACResultsDB()
    collection = db['experiment2b']

    x_train, y_train, x_train_flattened, x_test, y_test = loadMNIST()

    # One-hot encode the labels.
    y_train = np_utils.to_categorical(y_train, n_classes)
    y_test = np_utils.to_categorical(y_test, n_classes)

    clustering = clusterData(x_train_flattened)

    fractions = [0.11, 0.2, 0.25, 0.3, 0.4, 0.5,
                 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 1.0]

    results = []
    for fraction in fractions:
        x_labelled, labels, labelled_indices, unlabelled_indices = \
            experiment2bLabelleingMethod(x_train, y_train, clustering,
                                         n_classes, fraction)

        print(labels.shape[0])

        result = runTrials(
            x_labelled, labels, x_test, y_test, n_trials, n_classes,
            data='mnist_experiment2b_fraction%.2lf'%(fraction),
            epochs=epochs)
        results.append(result)

        # Persist this run's summary.
        collection.insert_one({
              'name': 'majority class label assignment by fraction',
              'm': labels.shape[0],
              'fraction': fraction,
              'accuracy': result[0],
              'error': result[1],
              'trials accuracy': result[2],
              'labelling accuracy':
                calculateLabellingAccuracy(y_train[labelled_indices], labels),
              'training set class distribution':
                calculateClassDistribution(labels).tolist()
            })

    return fractions, results
def plotMachineAccuracy(accuracies, errors, fractions, filename, \
  experiment='experiment2a'):
    """Plot machine test-set accuracy (with error bars) against the
    majority-class cluster proportion, draw gold and majority-class
    benchmark bands, and save the figure as both PDF and PNG."""

    db = getMACResultsDB()

    # Benchmark levels and errors from the stored experiment documents.
    for doc in db[experiment].find({'name': 'gold benchmark'}):
        gold_benchmark = doc['accuracy']
        gold_benchmark_error = doc['error']
    for doc in db[experiment].find({'name': 'majority class label assignment'}):
        majority_class_benchmark = doc['accuracy']
        majority_class_error = doc['error']

    fig, ax = plt.subplots()

    benchmark_x = np.arange(-0.02, 1.03, 0.01)
    level = np.ones((105,))

    # Gold benchmark line with a shaded +/- error band.
    ax.plot(benchmark_x, level * gold_benchmark,
            color='#726DA8', label='gold benchmark')
    ax.axhspan(gold_benchmark - gold_benchmark_error,
               gold_benchmark + gold_benchmark_error,
               facecolor='#726DA8', alpha=0.5)

    # Majority-class benchmark line with its error band.
    ax.plot(benchmark_x, level * majority_class_benchmark,
            color='#A0D2DB', label='majority class benchmark')
    ax.axhspan(majority_class_benchmark - majority_class_error,
               majority_class_benchmark + majority_class_error,
               facecolor='#A0D2DB', alpha=0.5)

    for idx, frac in enumerate(fractions):
        ax.errorbar(frac, accuracies[idx], yerr=errors[idx],
                    fmt='o', color='#B8336A')

    ax.set_ylim(0, 100)
    ax.set_xlim(-0.02, 1.02)
    ax.set_xlabel('majority class cluster proportion')
    ax.set_ylabel('machine test set accuracy')
    plt.legend(loc='lower right')
    plt.savefig(filename + '.pdf')
    plt.savefig(filename + '.png')
# --- Example 10 ---
def _cachedOrRunNoiseTrial(db, collection, collection_name, experiment_name,
                           x_train, y_train, x_test, y_test, clustering,
                           n_classes, noise, trial, data_tag,
                           labelling_kwargs):
    """Return (labelling accuracy, machine accuracy) for one noise trial.

    Looks the trial up in the results collection first; if absent, runs the
    labelling + training trial, stores it, and returns its accuracies.
    The random-noise and class-weighted variants differ only in
    `experiment_name`, `data_tag` and `labelling_kwargs`.
    """
    if collection_name in db.collection_names():
        try:
            doc = db[collection_name].find({'name': experiment_name})[0]
            return doc['labelling accuracy'], doc['machine accuracy']
        except IndexError:
            pass  # collection exists but this trial has not been run yet

    x_labelled, labels, labelled_indices, unlabelled_indices = \
      experiment2aNoiseLabelling(x_train, y_train, clustering,
                                 n_classes, **labelling_kwargs)
    # Computed once and reused in both the return value and the stored doc
    # (the original computed it twice).
    labelling_accuracy = \
      calculateLabellingAccuracy(y_train[labelled_indices], labels)
    r = runTrials(x_labelled, labels, x_test, y_test, 1, n_classes,
                  data=data_tag)
    collection.insert_one({
        'name': experiment_name,
        'm': labels.shape[0],
        'noise': noise,
        'trial': trial,
        'labelling accuracy': labelling_accuracy,
        'machine accuracy': r[0],
        'training set class distribution':
          calculateClassDistribution(labels).tolist()
    })
    return labelling_accuracy, r[0]


def experiment2aNoise(filename):
    """Experiment 2d: repeat experiment 2a's majority-class labelling while
    injecting label noise — both uniform random noise and class-weighted
    ('intelligent') noise — then plot labelling accuracy and machine
    accuracy against the noise level.

    Args:
        filename: prefix for the saved plot files.
    """
    image_dim = 28    # unused in this function
    n_classes = 10
    n_trials = 5
    epochs = 20       # unused in this function
    batch_size = 500  # unused in this function

    db = getMACResultsDB()

    x_train, y_train, x_train_flattened, x_test, y_test = loadMNIST()

    # One-hot encode the labels.
    y_train = np_utils.to_categorical(y_train, n_classes)
    y_test = np_utils.to_categorical(y_test, n_classes)

    clustering = clusterData(x_train_flattened)

    # Per-noise-level means and standard deviations over trials:
    # *_2a   = random noise, *_2a_2 = class-weighted noise.
    labelling_accuracies_2a = []
    labelling_accuracies_2a_errors = []
    labelling_accuracies_2a_2 = []
    labelling_accuracies_2a_2_errors = []
    machine_accuracies_2a = []
    machine_accuracies_2a_errors = []
    machine_accuracies_2a_2 = []
    machine_accuracies_2a_2_errors = []

    noise_levels = np.arange(0, 1.1, 0.1)

    collection_name = 'experiment2d'
    collection = db[collection_name]

    for noise in noise_levels:
        l_results = []
        l_results_2 = []
        m_results = []
        m_results_2 = []
        for i in range(n_trials):
            # Experiment 2a with uniform random label noise.
            experiment_name = \
              'experiment 2d - experiment2a random noise %.2lf trial %d' % (noise, i)
            l, m = _cachedOrRunNoiseTrial(
                db, collection, collection_name, experiment_name,
                x_train, y_train, x_test, y_test, clustering, n_classes,
                noise, i,
                data_tag='mnist_experiment2d_noise%.2lf_trial%d' % (noise, i),
                labelling_kwargs={'noise': noise})
            l_results.append(l)
            m_results.append(m)

            # Experiment 2a with class-weighted label noise.
            experiment_name = \
              'experiment 2d - experiment2a class weighted noise %.2lf trial %d' \
                % (noise, i)
            l, m = _cachedOrRunNoiseTrial(
                db, collection, collection_name, experiment_name,
                x_train, y_train, x_test, y_test, clustering, n_classes,
                noise, i,
                data_tag='mnist_experiment2d_class_weighted_noise%.2lf_trial%d'
                         % (noise, i),
                labelling_kwargs={'intelligent_noise': noise})
            l_results_2.append(l)
            m_results_2.append(m)

        labelling_accuracies_2a.append(np.mean(l_results))
        labelling_accuracies_2a_errors.append(np.std(l_results))
        labelling_accuracies_2a_2.append(np.mean(l_results_2))
        labelling_accuracies_2a_2_errors.append(np.std(l_results_2))

        machine_accuracies_2a.append(np.mean(m_results))
        machine_accuracies_2a_errors.append(np.std(m_results))
        machine_accuracies_2a_2.append(np.mean(m_results_2))
        machine_accuracies_2a_2_errors.append(np.std(m_results_2))

    # ---- labelling accuracy vs noise level ----
    cursor = db['experiment2a'].find(
        {'name': 'majority class label assignment'})
    for doc in cursor:
        majority_class_benchmark = doc['labelling accuracy']

    fig, ax = plt.subplots()
    ax.plot(np.arange(-0.02, 1.03, 0.01),
            np.ones((105))*majority_class_benchmark, 'k--')

    ax.errorbar(noise_levels, labelling_accuracies_2a,
        yerr=labelling_accuracies_2a_errors, fmt='o', mfc='None',
        color='#B8336A', label='majority class - random noise')

    ax.errorbar(noise_levels, labelling_accuracies_2a_2,
        yerr=labelling_accuracies_2a_2_errors, fmt='o', mfc='None',
        color='#726DA8', label='majority class - class weighted noise', zorder=100)

    ax.set_xlabel('labelling noise')
    ax.set_ylabel('labelling accuracy')
    ax.set_ylim(-2, 100)
    ax.set_xlim(-0.02, 1.03)
    plt.legend(loc='lower left')
    # NOTE(review): no separator before the suffix ('...labelling_noise.pdf'),
    # unlike '_machine_accuracy' below — preserved from the original so
    # existing output filenames do not change.
    plt.savefig(filename + 'labelling_noise.pdf')
    plt.savefig(filename + 'labelling_noise.png')

    # ---- machine accuracy vs noise level ----
    cursor = db['experiment2a'].find({'name': 'gold benchmark'})
    for doc in cursor:
        gold_benchmark = doc['accuracy']
        gold_benchmark_error = doc['error']

    cursor = db['experiment2a'].find(
        {'name': 'majority class label assignment'})
    for doc in cursor:
        majority_class_benchmark = doc['accuracy']
        majority_class_error = doc['error']

    fig, ax = plt.subplots()

    ax.plot(np.arange(-0.02, 1.03, 0.01), np.ones((105,))*gold_benchmark,
            color='#726DA8', label='gold benchmark')
    ax.axhspan(gold_benchmark - gold_benchmark_error,
               gold_benchmark + gold_benchmark_error,
               facecolor='#726DA8', alpha=0.5)

    ax.plot(np.arange(-0.02, 1.03, 0.01),
            np.ones((105,))*majority_class_benchmark,
            color='#A0D2DB', label='majority class benchmark')
    ax.axhspan(majority_class_benchmark - majority_class_error,
               majority_class_benchmark + majority_class_error,
               facecolor='#A0D2DB', alpha=0.5)

    ax.errorbar(noise_levels, machine_accuracies_2a,
        yerr=machine_accuracies_2a_errors, fmt='o', mfc='None',
        color='#B8336A', label='majority class - random noise')

    ax.errorbar(noise_levels, machine_accuracies_2a_2,
        yerr=machine_accuracies_2a_2_errors, fmt='o', mfc='None',
        color='#726DA8', label='majority class - class weighted noise', zorder=100)

    ax.set_xlabel('labelling noise')
    ax.set_ylabel('machine accuracy')
    ax.set_ylim(-2, 100)
    ax.set_xlim(-0.02, 1.03)
    plt.legend(loc='lower left')
    plt.savefig(filename + '_machine_accuracy.pdf')
    plt.savefig(filename + '_machine_accuracy.png')
# --- Example 11 ---
def experiment2a():
    """Experiment 2a: compare a gold-labelled MNIST benchmark against
    majority-class cluster label assignment; stores both runs in the
    'experiment2a' collection.
    """

    image_dim = 28    # unused in this snippet
    n_classes = 10
    n_trials = 5
    epochs = 20
    batch_size = 500  # unused in this snippet

    db = getMACResultsDB()
    collection = db['experiment2a']

    x_train, y_train, x_train_flattened, x_test, y_test = loadMNIST()

    # calculate performance on gold labels
    y_train = np_utils.to_categorical(y_train, n_classes)
    y_test = np_utils.to_categorical(y_test, n_classes)

    gold_benchmark = \
      runTrials(x_train, y_train, x_test, y_test, n_trials, n_classes, \
                data='mnist_gold', epochs=epochs)

    print(gold_benchmark)

    # Gold benchmark: labelling accuracy is trivially 100% (labels vs
    # themselves); stored for plotting alongside the other experiments.
    doc = {
            'name': 'gold benchmark',
            'm': y_train.shape[0],
            'accuracy': gold_benchmark[0],
            'error': gold_benchmark[1],
            'trials accuracy': gold_benchmark[2],
            'labelling accuracy': calculateLabellingAccuracy(y_train, y_train),
            'training set class distribution': \
              calculateClassDistribution(y_train).tolist()
          }

    collection.insert_one(doc)

    clustering = clusterData(x_train_flattened)

    # Label each cluster with its majority class and train on those labels.
    x_labelled, labels, labelled_indices, unlabelled_indices = \
     experiment2aLabelleingMethod(x_train, y_train, \
                                  clustering, n_classes)

    result = \
      runTrials(x_labelled, labels, x_test, y_test, n_trials, n_classes, \
                data='mnist_experiment2a', epochs=epochs)

    print(result)

    print(calculateLabellingAccuracy(y_train[labelled_indices], labels))
    doc = {
            'name': 'majority class label assignment',
            'm': labels.shape[0],
            'accuracy': result[0],
            'error': result[1],
            'trials accuracy': result[2],
            'labelling accuracy': \
              calculateLabellingAccuracy(y_train[labelled_indices], labels),
            'training set class distribution': \
              calculateClassDistribution(labels).tolist()
          }

    collection.insert_one(doc)