def main():
    """Report experiment 2b: majority-class label assignment by fraction.

    If the 'experiment2b' collection already holds results, plot directly from
    the stored documents; otherwise run the experiment first (experiment2b()
    inserts one document per fraction) and then read the results back.

    Reading from the DB in both cases fixes the original fresh-run branch,
    which left labelling_accuracies and number_training_examples empty while
    fractions was populated, crashing the plotting helpers with IndexError.

    NOTE(review): this file defines main() more than once; only the last
    definition is bound at import time -- confirm which entry point is used.
    """
    db = getMACResultsDB()
    experiment_name = 'experiment2b'

    # Run the experiment only when no cached results exist; afterwards the
    # collection can be read exactly like the cached case.
    if experiment_name not in db.collection_names():
        experiment2b()

    accuracies = []
    labelling_accuracies = []
    number_training_examples = []
    errors = []
    fractions = []
    for doc in db[experiment_name].find():
        print(doc)
        accuracies.append(doc['accuracy'])
        labelling_accuracies.append(doc['labelling accuracy'])
        errors.append(doc['error'])
        fractions.append(doc['fraction'])
        number_training_examples.append(doc['m'])

    plotMachineAccuracy(accuracies, errors, fractions,
                        'plots/experiment2b_machine_performance')
    plotLabellingAccuracy(labelling_accuracies, fractions,
                          'plots/experiment2b_labelling_accuracy')
    plotNumberTrainingExamples(number_training_examples, fractions,
                               'plots/experiment2b_number_examples')
def experiment2d():
    """Run experiment 2d: experiment 2a's labelling method under injected
    noise, delegating to experiment2aNoise() with a 2d-specific file prefix.
    """
    db = getMACResultsDB()
    # NOTE(review): `collection` is assigned but never used in this function;
    # experiment2aNoise() opens the 'experiment2d' collection itself -- confirm
    # whether this line is dead code before removing it.
    collection = db['experiment2d']
    experiment2aNoise('experiment2d_2a_noise')

# NOTE(review): the triple-quote below follows this function in the source and
# appears to open a string that is not terminated within this view -- it likely
# comments out code continuing beyond this chunk. Confirm against the full file
# before touching it.
"""
def plotNumberTrainingExamples(number_training_examples, fractions, filename,
                               experiment='experiment2a'):
    """Scatter the training-set size obtained at each cluster proportion
    against a horizontal line at the gold benchmark's total example count,
    saving the figure as both PDF and PNG.
    """
    db = getMACResultsDB()
    for doc in db[experiment].find({'name': 'gold benchmark'}):
        gold_benchmark = doc['m']

    fig, ax = plt.subplots()
    grid = np.arange(-0.02, 1.03, 0.01)
    ax.plot(grid, np.ones((105,)) * gold_benchmark,
            color='#726DA8', label='total training examples')
    for idx, frac in enumerate(fractions):
        ax.plot(frac, number_training_examples[idx], 'o', color='#B8336A')
    ax.set_ylim(0, 10200)
    ax.set_xlim(-0.02, 1.02)
    ax.set_xlabel('majority class cluster proportion')
    ax.set_ylabel('number of training examples')
    plt.legend(loc='lower left')
    plt.savefig(filename + '.pdf')
    plt.savefig(filename + '.png')
def plotLabellingAccuracy(accuracies, fractions, filename,
                          experiment='experiment2a'):
    """Scatter labelling accuracy at each cluster proportion against the gold
    and majority-class benchmark lines read from the given experiment's
    collection; saves the figure as both PDF and PNG.
    """
    db = getMACResultsDB()
    for doc in db[experiment].find({'name': 'gold benchmark'}):
        gold_benchmark = doc['labelling accuracy']
    for doc in db[experiment].find({'name': 'majority class label assignment'}):
        majority_class_benchmark = doc['labelling accuracy']

    fig, ax = plt.subplots()
    grid = np.arange(-0.02, 1.03, 0.01)
    ax.plot(grid, np.ones((105,)) * gold_benchmark,
            color='#726DA8', label='gold benchmark')
    ax.plot(grid, np.ones((105,)) * majority_class_benchmark,
            color='#A0D2DB', label='majority class benchmark')
    for idx, frac in enumerate(fractions):
        ax.plot(frac, accuracies[idx], 'o', color='#B8336A')
    ax.set_ylim(0, 102)
    ax.set_xlim(-0.02, 1.02)
    ax.set_xlabel('majority class cluster proportion')
    ax.set_ylabel('labelling accuracy')
    plt.legend(loc='lower right')
    plt.savefig(filename + '.pdf')
    plt.savefig(filename + '.png')
def main():
    """Report experiment 2c: majority-class labels from cluster subsamples.

    If the 'experiment2c' collection already holds results, plot directly from
    the stored documents; otherwise run the experiment first (experiment2c()
    inserts one document per sampled fraction) and then read the results back.

    Reading from the DB in both cases fixes the original fresh-run branch,
    which left labelling_accuracies empty while fractions was populated,
    crashing plotLabellingAccuracy with IndexError.

    NOTE(review): this file defines main() more than once; only the last
    definition is bound at import time -- confirm which entry point is used.
    """
    db = getMACResultsDB()
    experiment_name = 'experiment2c'

    # Run the experiment only when no cached results exist; afterwards the
    # collection can be read exactly like the cached case.
    if experiment_name not in db.collection_names():
        experiment2c()

    accuracies = []
    labelling_accuracies = []
    recoveries = []
    errors = []
    fractions = []
    for doc in db[experiment_name].find():
        print(doc)
        accuracies.append(doc['accuracy'])
        labelling_accuracies.append(doc['labelling accuracy'])
        errors.append(doc['error'])
        fractions.append(doc['fraction sampled'])
        recoveries.append(doc['cluster majority class recovery rate'])

    plotMachineAccuracy(accuracies, errors, fractions,
                        'plots/experiment2c_machine_performance')
    plotLabellingAccuracy(labelling_accuracies, fractions,
                          'plots/experiment2c_labelling_accuracy')
    plotMajorityClassRecovery(fractions, recoveries,
                              'plots/experiment2c_majority_class_recovery')
def experiment2c():
    """Experiment 2c: assign cluster-majority labels using only a subsample of
    each cluster, over a range of subsample fractions.

    For every fraction: label the training data via
    experiment2cLabelleingMethod, train/evaluate with runTrials, and store one
    document in the 'experiment2c' collection.

    Returns:
        (fractions, results, recovery): the fractions evaluated, the per-
        fraction runTrials tuples, and the per-fraction cluster majority-class
        recovery rates (0..1).
    """
    n_classes = 10
    n_trials = 5
    epochs = 20

    db = getMACResultsDB()
    collection = db['experiment2c']

    x_train, y_train, x_train_flattened, x_test, y_test = loadMNIST()
    y_train = np_utils.to_categorical(y_train, n_classes)
    y_test = np_utils.to_categorical(y_test, n_classes)

    clustering = clusterData(x_train_flattened)

    fractions = [0.00, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0]
    recovery = []
    results = []
    for fraction in fractions:
        x_labelled, labels, labelled_indices, unlabelled_indices, \
            sampled_dominant_cluster_classes, dominant_cluster_classes = \
            experiment2cLabelleingMethod(x_train, y_train, clustering,
                                         n_classes, fraction)
        result = runTrials(x_labelled, labels, x_test, y_test, n_trials,
                           n_classes,
                           data='mnist_experiment2c_fraction%.2lf' % (fraction),
                           epochs=epochs)
        results.append(result)

        # Fraction of clusters whose subsample-derived majority class matches
        # the true dominant class.  Computed once and reused both in the
        # returned list and the stored document (the original evaluated this
        # expensive expression twice).
        recovery_rate = calculateLabellingAccuracy(
            np_utils.to_categorical(dominant_cluster_classes),
            np_utils.to_categorical(sampled_dominant_cluster_classes)) / 100.0
        recovery.append(recovery_rate)

        doc = {
            'name': 'majority class recovery by cluster subsample',
            'm': labels.shape[0],
            'fraction sampled': fraction,
            'accuracy': result[0],
            'error': result[1],
            'trials accuracy': result[2],
            'labelling accuracy':
                calculateLabellingAccuracy(y_train[labelled_indices], labels),
            'training set class distribution':
                calculateClassDistribution(labels).tolist(),
            'cluster majority class recovery rate': recovery_rate
        }
        collection.insert_one(doc)
    return fractions, results, recovery
def main():
    """Print cached experiment 2a documents, running the experiment first if
    no 'experiment2a' collection exists yet.
    """
    db = getMACResultsDB()
    name = 'experiment2a'
    try:
        assert name in db.collection_names()
        for doc in db[name].find():
            print(doc)
    except AssertionError:
        # No cached results: run the experiment from scratch.
        experiment2a()
def experiment2b():
    """Experiment 2b: majority-class label assignment at a range of fractions.

    For every fraction: build a labelled subset with
    experiment2bLabelleingMethod, train/evaluate with runTrials, and store one
    document per fraction in the 'experiment2b' collection.

    Returns:
        (fractions, results): the fractions evaluated and the per-fraction
        runTrials tuples.
    """
    image_dim = 28
    n_classes = 10
    n_trials = 5
    epochs = 20
    batch_size = 500

    db = getMACResultsDB()
    collection = db['experiment2b']

    x_train, y_train, x_train_flattened, x_test, y_test = loadMNIST()
    y_train = np_utils.to_categorical(y_train, n_classes)
    y_test = np_utils.to_categorical(y_test, n_classes)

    clustering = clusterData(x_train_flattened)

    fractions = [0.11, 0.2, 0.25, 0.3, 0.4, 0.5,
                 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 1.0]
    results = []
    for fraction in fractions:
        x_labelled, labels, labelled_indices, unlabelled_indices = \
            experiment2bLabelleingMethod(x_train, y_train, clustering,
                                         n_classes, fraction)
        print(labels.shape[0])
        outcome = runTrials(x_labelled, labels, x_test, y_test, n_trials,
                            n_classes,
                            data='mnist_experiment2b_fraction%.2lf' % (fraction),
                            epochs=epochs)
        results.append(outcome)
        collection.insert_one({
            'name': 'majority class label assignment by fraction',
            'm': labels.shape[0],
            'fraction': fraction,
            'accuracy': outcome[0],
            'error': outcome[1],
            'trials accuracy': outcome[2],
            'labelling accuracy':
                calculateLabellingAccuracy(y_train[labelled_indices], labels),
            'training set class distribution':
                calculateClassDistribution(labels).tolist()
        })
    return fractions, results
def plotMachineAccuracy(accuracies, errors, fractions, filename,
                        experiment='experiment2a'):
    """Plot machine test-set accuracy (with error bars) at each cluster
    proportion against shaded gold and majority-class benchmark bands read
    from the given experiment's collection; saves PDF and PNG copies.
    """
    db = getMACResultsDB()
    for doc in db[experiment].find({'name': 'gold benchmark'}):
        gold_benchmark = doc['accuracy']
        gold_benchmark_error = doc['error']
    for doc in db[experiment].find({'name': 'majority class label assignment'}):
        majority_class_benchmark = doc['accuracy']
        majority_class_error = doc['error']

    fig, ax = plt.subplots()
    grid = np.arange(-0.02, 1.03, 0.01)
    ax.plot(grid, np.ones((105,)) * gold_benchmark,
            color='#726DA8', label='gold benchmark')
    ax.axhspan(gold_benchmark - gold_benchmark_error,
               gold_benchmark + gold_benchmark_error,
               facecolor='#726DA8', alpha=0.5)
    ax.plot(grid, np.ones((105,)) * majority_class_benchmark,
            color='#A0D2DB', label='majority class benchmark')
    ax.axhspan(majority_class_benchmark - majority_class_error,
               majority_class_benchmark + majority_class_error,
               facecolor='#A0D2DB', alpha=0.5)
    for idx, frac in enumerate(fractions):
        ax.errorbar(frac, accuracies[idx], yerr=errors[idx],
                    fmt='o', color='#B8336A')
    ax.set_ylim(0, 100)
    ax.set_xlim(-0.02, 1.02)
    ax.set_xlabel('majority class cluster proportion')
    ax.set_ylabel('machine test set accuracy')
    plt.legend(loc='lower right')
    plt.savefig(filename + '.pdf')
    plt.savefig(filename + '.png')
def experiment2aNoise(filename):
    """Experiment 2d: run experiment 2a's majority-class labelling under
    increasing label noise (random and class-weighted), averaging labelling
    and machine accuracy over n_trials per noise level, then plot both.

    Per-trial results are cached in the 'experiment2d' collection and reused
    on later runs; only missing documents trigger a fresh trial.

    Parameters:
        filename: prefix for the four saved figure files (PDF + PNG for the
            labelling-accuracy and machine-accuracy plots).
    """
    image_dim = 28
    n_classes = 10
    n_trials = 5
    epochs = 20
    batch_size = 500
    db = getMACResultsDB()
    x_train, y_train, x_train_flattened, x_test, y_test = loadMNIST()
    # calculate performance on gold labels
    y_train = np_utils.to_categorical(y_train, n_classes)
    y_test = np_utils.to_categorical(y_test, n_classes)
    clustering = clusterData(x_train_flattened)
    # experiment 2a labelling accuracy (the *_2 lists hold the class-weighted
    # noise variant, the plain lists the random-noise variant)
    labelling_accuracies_2a = []
    labelling_accuracies_2a_errors = []
    labelling_accuracies_2a_2 = []
    labelling_accuracies_2a_2_errors = []
    # experiment 2a machine accuracy
    machine_accuracies_2a = []
    machine_accuracies_2a_errors = []
    machine_accuracies_2a_2 = []
    machine_accuracies_2a_2_errors = []
    noise_levels = np.arange(0, 1.1, 0.1)
    collection_name = 'experiment2d'
    collection = db[collection_name]
    for noise in noise_levels:
        # per-noise-level accumulators across trials
        l_results = []
        l_results_2 = []
        m_results = []
        m_results_2 = []
        for i in range(n_trials):
            # experiment 2a with random noise
            experiment_name = \
                'experiment 2d - experiment2a random noise %.2lf trial %d' \
                % (noise, i)
            # Cache check: reuse the stored trial document if present;
            # AssertionError (no collection) or IndexError (no matching doc)
            # falls through to running the trial.
            try:
                assert collection_name in db.collection_names()
                doc = db[collection_name].find({'name': experiment_name})[0]
                l_results.append(doc['labelling accuracy'])
                m_results.append(doc['machine accuracy'])
            except (AssertionError, IndexError):
                x_labelled, labels, labelled_indices, unlabelled_indices = \
                    experiment2aNoiseLabelling(x_train, y_train, clustering, \
                        n_classes, noise=noise)
                l_results.append(
                    calculateLabellingAccuracy(y_train[labelled_indices],
                                               labels))
                # single trial per call here; the outer loop provides the
                # n_trials repetitions
                r = runTrials(x_labelled, labels, x_test, y_test, 1,
                              n_classes, \
                    data='mnist_experiment2d_noise%.2lf_trial%d' % (noise, i))
                m_results.append(r[0])
                doc = {
                    'name': experiment_name,
                    'm': labels.shape[0],
                    'noise': noise,
                    'trial': i,
                    'labelling accuracy': \
                        calculateLabellingAccuracy(y_train[labelled_indices],
                                                   labels),
                    'machine accuracy': r[0],
                    'training set class distribution': \
                        calculateClassDistribution(labels).tolist()
                }
                collection.insert_one(doc)
            # experiment 2a with weighted noise
            experiment_name = \
                'experiment 2d - experiment2a class weighted noise %.2lf trial %d' \
                % (noise, i)
            try:
                assert collection_name in db.collection_names()
                doc = db[collection_name].find({'name': experiment_name})[0]
                l_results_2.append(doc['labelling accuracy'])
                m_results_2.append(doc['machine accuracy'])
            except (AssertionError, IndexError):
                # same flow as above, but noise weighted by class via the
                # intelligent_noise keyword
                x_labelled, labels, labelled_indices, unlabelled_indices = \
                    experiment2aNoiseLabelling(x_train, y_train, clustering, \
                        n_classes, intelligent_noise=noise)
                l_results_2.append(
                    calculateLabellingAccuracy(y_train[labelled_indices],
                                               labels))
                r = runTrials(x_labelled, labels, x_test, y_test, 1,
                              n_classes, \
                    data='mnist_experiment2d_class_weighted_noise%.2lf_trial%d' % (noise, i))
                m_results_2.append(r[0])
                doc = {
                    'name': experiment_name,
                    'm': labels.shape[0],
                    'noise': noise,
                    'trial': i,
                    'labelling accuracy': \
                        calculateLabellingAccuracy(y_train[labelled_indices],
                                                   labels),
                    'machine accuracy': r[0],
                    'training set class distribution': \
                        calculateClassDistribution(labels).tolist()
                }
                collection.insert_one(doc)
        # mean/std over trials at this noise level
        labelling_accuracies_2a.append(np.mean(l_results))
        labelling_accuracies_2a_errors.append(np.std(l_results))
        labelling_accuracies_2a_2.append(np.mean(l_results_2))
        labelling_accuracies_2a_2_errors.append(np.std(l_results_2))
        machine_accuracies_2a.append(np.mean(m_results))
        machine_accuracies_2a_errors.append(np.std(m_results))
        machine_accuracies_2a_2.append(np.mean(m_results_2))
        machine_accuracies_2a_2_errors.append(np.std(m_results_2))
    # ---- plot 1: labelling accuracy vs noise, with the experiment 2a
    # majority-class labelling accuracy as a dashed reference line
    cursor = db['experiment2a'].find(
        {'name': 'majority class label assignment'})
    for doc in cursor:
        majority_class_benchmark = doc['labelling accuracy']
    fig, ax = plt.subplots()
    ax.plot(np.arange(-0.02, 1.03, 0.01), \
        np.ones((105)) * majority_class_benchmark, 'k--')
    ax.errorbar(noise_levels, labelling_accuracies_2a, \
        yerr=labelling_accuracies_2a_errors, fmt='o', mfc='None', \
        color='#B8336A', label='majority class - random noise')
    ax.errorbar(noise_levels, labelling_accuracies_2a_2, \
        yerr=labelling_accuracies_2a_2_errors, fmt='o', mfc='None', \
        color='#726DA8', label='majority class - class weighted noise',
        zorder=100)
    ax.set_xlabel('labelling noise')
    ax.set_ylabel('labelling accuracy')
    ax.set_ylim(-2, 100)
    ax.set_xlim(-0.02, 1.03)
    plt.legend(loc='lower left')
    #plt.show()
    # NOTE(review): no separator before 'labelling_noise' here, but the
    # machine-accuracy files below use '_machine_accuracy' -- confirm whether
    # the missing underscore/dash is intentional.
    plt.savefig(filename + 'labelling_noise.pdf')
    plt.savefig(filename + 'labelling_noise.png')
    # ---- plot 2: machine accuracy vs noise, with shaded gold and
    # majority-class benchmark bands from experiment 2a
    cursor = db['experiment2a'].find({'name': 'gold benchmark'})
    for doc in cursor:
        gold_benchmark = doc['accuracy']
        gold_benchmark_error = doc['error']
    cursor = db['experiment2a'].find(
        {'name': 'majority class label assignment'})
    for doc in cursor:
        majority_class_benchmark = doc['accuracy']
        majority_class_error = doc['error']
    fig, ax = plt.subplots()
    ax.plot(np.arange(-0.02, 1.03, 0.01), np.ones((105,)) * gold_benchmark, \
        color='#726DA8', label='gold benchmark')
    ax.axhspan(gold_benchmark - gold_benchmark_error, \
        gold_benchmark + gold_benchmark_error, \
        facecolor='#726DA8', alpha=0.5)
    ax.plot(np.arange(-0.02, 1.03, 0.01),
        np.ones((105,)) * majority_class_benchmark, \
        color='#A0D2DB', label='majority class benchmark')
    ax.axhspan(majority_class_benchmark - majority_class_error, \
        majority_class_benchmark + majority_class_error, \
        facecolor='#A0D2DB', alpha=0.5)
    ax.errorbar(noise_levels, machine_accuracies_2a, \
        yerr=machine_accuracies_2a_errors, fmt='o', mfc='None', \
        color='#B8336A', label='majority class - random noise')
    ax.errorbar(noise_levels, machine_accuracies_2a_2, \
        yerr=machine_accuracies_2a_2_errors, fmt='o', mfc='None', \
        color='#726DA8', label='majority class - class weighted noise',
        zorder=100)
    ax.set_xlabel('labelling noise')
    ax.set_ylabel('machine accuracy')
    ax.set_ylim(-2, 100)
    ax.set_xlim(-0.02, 1.03)
    plt.legend(loc='lower left')
    #plt.show()
    plt.savefig(filename + '_machine_accuracy.pdf')
    plt.savefig(filename + '_machine_accuracy.png')
def experiment2a():
    """Experiment 2a: gold-label benchmark plus cluster majority-class label
    assignment on MNIST.

    Trains on the full gold-labelled set, then on a set labelled via
    experiment2aLabelleingMethod; stores one result document for each in the
    'experiment2a' collection.
    """
    image_dim = 28
    n_classes = 10
    n_trials = 5
    epochs = 20
    batch_size = 500

    db = getMACResultsDB()
    collection = db['experiment2a']

    x_train, y_train, x_train_flattened, x_test, y_test = loadMNIST()
    # calculate performance on gold labels
    y_train = np_utils.to_categorical(y_train, n_classes)
    y_test = np_utils.to_categorical(y_test, n_classes)

    gold = runTrials(x_train, y_train, x_test, y_test, n_trials, n_classes,
                     data='mnist_gold', epochs=epochs)
    print(gold)
    collection.insert_one({
        'name': 'gold benchmark',
        'm': y_train.shape[0],
        'accuracy': gold[0],
        'error': gold[1],
        'trials accuracy': gold[2],
        'labelling accuracy': calculateLabellingAccuracy(y_train, y_train),
        'training set class distribution':
            calculateClassDistribution(y_train).tolist()
    })

    # Majority-class labelling over clusters of the flattened images.
    clustering = clusterData(x_train_flattened)
    x_labelled, labels, labelled_indices, unlabelled_indices = \
        experiment2aLabelleingMethod(x_train, y_train, clustering, n_classes)
    outcome = runTrials(x_labelled, labels, x_test, y_test, n_trials,
                        n_classes, data='mnist_experiment2a', epochs=epochs)
    print(outcome)
    print(calculateLabellingAccuracy(y_train[labelled_indices], labels))
    collection.insert_one({
        'name': 'majority class label assignment',
        'm': labels.shape[0],
        'accuracy': outcome[0],
        'error': outcome[1],
        'trials accuracy': outcome[2],
        'labelling accuracy':
            calculateLabellingAccuracy(y_train[labelled_indices], labels),
        'training set class distribution':
            calculateClassDistribution(labels).tolist()
    })