def simple_test(expt):
    """Sanity-check experiment: train a classifier whose means exactly match
    the generator's targets and measure the resulting error rate.

    expt supplies num_phonemes, num_features, var_diag_interval,
    var_offdiag_interval, and num_test_frames.  Results are printed, not
    returned.
    """
    # Build a generator and a classifier that are perfectly matched
    # with respect to means and see what sort of error rate we get for
    # various variance values in the generator.
    gen = DataGenerator(expt.num_phonemes, expt.num_features,
                        expt.var_diag_interval, expt.var_offdiag_interval)
    test_data = gen.generate_simulated_data(expt.num_test_frames)

    # Make perfect "training data" in the form of two points for each
    # class whose mean is exactly the mean for that class.  Training
    # on this will give a correct mean for the model, but with some
    # non-zero variance.
    labels = gen.get_labels()
    # NOTE(review): reaches into the generator's private _targets attribute.
    means = [array(target) for target in gen._targets]

    # Construct a list of (label, point) pairs with two points for each label,
    # one at mean+delta and one at mean-delta.
    delta = [0.1] * expt.num_features
    assert len(labels) == len(means)
    # list(...) makes the concatenation explicit (zip already returns a list
    # in Python 2; this form also survives a Python 3 port).
    data = (list(zip(labels, (m + delta for m in means))) +
            list(zip(labels, (m - delta for m in means))))
    # print dump_data(data)

    c = SimpleClassifier(labels, gen.num_features)
    c.train_all(data)
    (rate, results) = measureAccuracy(c, test_data)
    summary = make_summary_string("Simple test", rate, results, c, test_data, gen)
    print(summary)
# NOTE(review): duplicate definition of simple_test — this later def shadows
# the earlier one at import time; consider deleting one of the two.
def simple_test(expt):
    """Train a classifier with means identical to the generator's targets and
    report the error rate obtained for the generator's variance settings."""
    gen = DataGenerator(expt.num_phonemes, expt.num_features,
                        expt.var_diag_interval, expt.var_offdiag_interval)
    test_data = gen.generate_simulated_data(expt.num_test_frames)

    # Perfect "training data": two points per class, symmetric about the true
    # class mean, so the trained model has the right mean but non-zero variance.
    labels = gen.get_labels()
    means = [array(t) for t in gen._targets]
    assert len(labels) == len(means)

    offset = [0.1] * expt.num_features
    above = [(lbl, mu + offset) for lbl, mu in zip(labels, means)]
    below = [(lbl, mu - offset) for lbl, mu in zip(labels, means)]
    data = above + below
    # print dump_data(data)

    clf = SimpleClassifier(labels, gen.num_features)
    clf.train_all(data)
    rate, results = measureAccuracy(clf, test_data)
    print(make_summary_string("Simple test", rate, results, clf, test_data, gen))
def do_ddt_runs(expt):
    """Run the allele-based error-detection experiment.

    For each run: train a classifier, then for 5 frames known to be errorful
    and 5 known to be correct, build a SimpleAllele trained without that frame
    and score it on the practice data for the frame's label.

    Returns a dict with keys 'Error' and 'Correct', each a list (one entry per
    run) of per-frame result lists.
    """
    gen = DataGenerator(expt.num_phonemes, expt.num_features,
                        expt.var_diag_interval, expt.var_offdiag_interval)
    perfect_practice_data = gen.generate_simulated_data(expt.num_practice_frames)
    practice_data, num_practice_errors = gen.add_errors_to_data(
        perfect_practice_data, expt.practice_error_rate)
    practice_data_dict = partition_data(practice_data)
    # We got some practice data for every point, right?
    # BUG FIX: the original read len(keys() == expt.num_phonemes), i.e. len()
    # of a boolean, which raises TypeError; the intent is to count the keys.
    assert len(practice_data_dict.keys()) == expt.num_phonemes

    test_data = gen.generate_simulated_data(expt.num_test_frames)

    n = expt.num_training_frames
    assert n * expt.training_error_rate >= 5        # number of errorful points
    assert n * (1 - expt.training_error_rate) > 5   # number of correct points
    # make_training_data puts the errorful frames first, so the first 5 frames
    # have errors and the last 5 are correct — presumably; TODO confirm.
    error_training_frame_indices = range(0, 5)
    correct_training_frame_indices = range(n - 5, n)

    all_results = {}
    all_results['Error'] = []
    all_results['Correct'] = []
    for run_idx in range(0, expt.num_runs):
        training_data, num_errors = make_training_data(gen, expt)
        c = SimpleClassifier(gen.get_labels(), gen.num_features)
        c.train_all(training_data)

        def run_some_frames(frame_indices):
            # Score an allele for each frame index, training the allele's
            # variant on the training data with that frame removed.
            frame_results = []
            for i in frame_indices:
                label = training_data[i][0]
                a = SimpleAllele(c, [label])
                # subtract (label, frame) from training_data for active phoneme
                alt_data = training_data[:i] + training_data[i + 1:]
                # train alternate model in allele on alternate data
                a.train_variants(alt_data)
                # print a.make_details_string()
                # Construct a subset of the practice data with only the points
                # which are labelled with the active label of the allele
                # (see comments below).
                data = [(label, point) for point in practice_data_dict[label]]
                results = measurePrimaryAndVariantAccuracy(a, data)
                # KJB - here's the original version, in which we just
                # used all the practice data.  This essentially means we
                # aren't using the practice data labels at all, which
                # might be an interesting variation, but isn't the
                # original intention.
                # results = measurePrimaryAndVariantAccuracy(a, practice_data)
                frame_results.append(results)
            return frame_results

        error_results = run_some_frames(error_training_frame_indices)
        all_results['Error'].append(error_results)
        correct_results = run_some_frames(correct_training_frame_indices)
        all_results['Correct'].append(correct_results)
    return all_results
def do_baseline_runs(expt):
    """Run baseline classification experiments and print a summary.

    Each run trains a SimpleClassifier on a mixture of errorless per-phoneme
    data, errorful secondary data, and errorful practice data, then measures
    accuracy on fresh test data.  Prints a summary over all runs.
    """
    gen = DataGenerator(expt.num_phonemes, expt.num_features,
                        expt.var_diag_interval, expt.var_offdiag_interval)
    all_results = []
    for run_idx in range(expt.num_runs):
        test_data = gen.generate_simulated_data(expt.num_test_frames)

        # There's a problem here if there's only one data point, since
        # then we end up with a variance of 0.  We currently hack
        # around this problem by guaranteeing more than one point.  We
        # could change the models to allow zero variance but this will
        # mean not being able to make samples from the models without
        # some extra work.  Note that we don't care at all about order
        # of training data in these experiments, so we just build our
        # training data in two parts and cat them together.  If you
        # hit either of these asserts, you're asking for an error rate
        # that's too high and/or a training data size that's too low.
        # We need two correct samples per phoneme.
        num_secondary_frames = expt.num_training_frames - expt.num_phonemes * 2
        num_errorful_frames = expt.num_training_frames * expt.training_error_rate
        assert expt.num_training_frames >= expt.num_phonemes * 2
        assert num_secondary_frames > num_errorful_frames

        errorless_training_data = gen.generate_simulated_data_per_phoneme(2)
        secondary_training_data = gen.generate_simulated_data(num_secondary_frames)
        # Slight trickiness to get a correct error rate for this subset of the data
        subset_error_rate = float(num_errorful_frames) / num_secondary_frames
        errorful_training_data, num_errors = gen.add_errors_to_data(
            secondary_training_data, subset_error_rate)

        practice_data = gen.generate_simulated_data(expt.num_practice_frames)
        errorful_practice_data, num_errors = gen.add_errors_to_data(
            practice_data, expt.practice_error_rate)

        training_data = (errorless_training_data + errorful_training_data +
                         errorful_practice_data)
        c = SimpleClassifier(gen.get_labels(), gen.num_features)
        c.train_all(training_data)

        (rate, results) = measureAccuracy(c, test_data)
        name = "Baseline 0.%d" % (run_idx,)
        # summary is only consumed by the commented-out print below; kept for
        # easy re-enabling of the per-run dump.
        summary = make_summary_string(name, rate, results, c, test_data, gen)
        all_results.append((name, rate))
        # print "Classifier:\n"
        # print c.to_string()
        # print summary
    print("\n--------------------------Summary-----------------------")
    print(make_all_runs_summary_string(expt, all_results))
# NOTE(review): duplicate definition of do_baseline_runs — this later def
# shadows the earlier one at import time; consider deleting one of the two.
def do_baseline_runs(expt):
    """Train baseline classifiers over several runs and print an overall
    accuracy summary."""
    gen = DataGenerator(expt.num_phonemes, expt.num_features,
                        expt.var_diag_interval, expt.var_offdiag_interval)
    all_results = []
    for run_idx in range(expt.num_runs):
        test_data = gen.generate_simulated_data(expt.num_test_frames)

        # There's a problem here if there's only one data point, since
        # then we end up with a variance of 0.  We currently hack
        # around this problem by guaranteeing more than one point.  We
        # could change the models to allow zero variance but this will
        # mean not being able to make samples from the models without
        # some extra work.  Note that we don't care at all about order
        # of training data in these experiments, so we just build our
        # training data in two parts and cat them together.  If you
        # hit either of these asserts, you're asking for an error rate
        # that's too hig and/or a training data size that's too low.
        # We need two correct samples per phoneme.
        n_secondary = expt.num_training_frames - expt.num_phonemes * 2
        n_errorful = expt.num_training_frames * expt.training_error_rate
        assert expt.num_training_frames >= expt.num_phonemes * 2
        assert n_secondary > n_errorful

        clean_part = gen.generate_simulated_data_per_phoneme(2)
        secondary_part = gen.generate_simulated_data(n_secondary)
        # Slight trickiness to get a correct error rate for this subset of the data
        subset_rate = float(n_errorful) / n_secondary
        noisy_part, num_errors = gen.add_errors_to_data(secondary_part, subset_rate)
        practice_part = gen.generate_simulated_data(expt.num_practice_frames)
        noisy_practice, num_errors = gen.add_errors_to_data(
            practice_part, expt.practice_error_rate)

        clf = SimpleClassifier(gen.get_labels(), gen.num_features)
        clf.train_all(clean_part + noisy_part + noisy_practice)

        rate, results = measureAccuracy(clf, test_data)
        name = "Baseline 0.%d" % (run_idx,)
        summary = make_summary_string(name, rate, results, clf, test_data, gen)
        all_results.append((name, rate))
        # print "Classifier:\n"
        # print clf.to_string()
        # print summary
    print("\n--------------------------Summary-----------------------")
    print(make_all_runs_summary_string(expt, all_results))
def do_simple_allele_test(expt):
    """For each run, probe 10 sample training frames (5 expected-errorful,
    5 expected-correct) with a SimpleAllele trained on the data minus that
    frame, and print the primary/variant accuracy results.
    """
    gen = DataGenerator(expt.num_phonemes, expt.num_features,
                        expt.var_diag_interval, expt.var_offdiag_interval)
    test_data = gen.generate_simulated_data(expt.num_test_frames)
    for run_idx in range(0, expt.num_runs):
        training_data, num_errors = make_training_data(gen, expt)

        # select training data frames to be tested, put into sample_training_frames
        # sample_training_frames is a subset of the training data consisting of some
        # errorful frames and some correct frames - we hope to identify the
        # incorrect frames.
        # For now, use first 5 frames and last 5.  The former will have errors
        # and the latter will be correct.
        n = len(training_data)
        assert n * expt.training_error_rate > 5        # number of errorful points
        assert n * (1 - expt.training_error_rate) > 5  # number of correct points
        # list(...) makes the concatenation explicit (range already returns a
        # list in Python 2; this form also survives a Python 3 port).
        sample_training_frame_indices = list(range(0, 5)) + list(range(n - 5, n))

        c = SimpleClassifier(gen.get_labels(), gen.num_features)
        c.train_all(training_data)

        all_results = []
        for i in sample_training_frame_indices:
            label = training_data[i][0]
            a = SimpleAllele(c, [label])
            # subtract (label, frame) from training_data for active phoneme
            alt_data = training_data[:i] + training_data[i + 1:]
            # train alternate model in allele on alternate data
            a.train_variants(alt_data)
            # print a.make_details_string()
            results = measurePrimaryAndVariantAccuracy(a, test_data)
            print(results)
            all_results.append(results)
        print('End run %d \n' % (run_idx,))
# NOTE(review): duplicate definition of do_simple_allele_test — this later def
# shadows the earlier one at import time; consider deleting one of the two.
def do_simple_allele_test(expt):
    """Probe a handful of training frames per run with leave-one-out alleles
    and print each frame's primary/variant accuracy."""
    gen = DataGenerator(expt.num_phonemes, expt.num_features,
                        expt.var_diag_interval, expt.var_offdiag_interval)
    test_data = gen.generate_simulated_data(expt.num_test_frames)
    for run_idx in range(0, expt.num_runs):
        training_data, num_errors = make_training_data(gen, expt)

        # Choose which training frames to test: the first 5 (these carry the
        # injected errors) and the last 5 (these are correct) - we hope to
        # identify the incorrect frames.
        n = len(training_data)
        assert n * expt.training_error_rate > 5        # enough errorful points
        assert n * (1 - expt.training_error_rate) > 5  # enough correct points
        probe_indices = range(0, 5) + range(n - 5, n)

        clf = SimpleClassifier(gen.get_labels(), gen.num_features)
        clf.train_all(training_data)

        all_results = []
        for idx in probe_indices:
            frame_label = training_data[idx][0]
            allele = SimpleAllele(clf, [frame_label])
            # Leave the probed (label, frame) pair out of the variant's data.
            leave_one_out = training_data[:idx] + training_data[idx + 1:]
            allele.train_variants(leave_one_out)
            # print allele.make_details_string()
            outcome = measurePrimaryAndVariantAccuracy(allele, test_data)
            print(outcome)
            all_results.append(outcome)
        print('End run %d \n' % (run_idx,))
# NOTE(review): duplicate definition of do_ddt_runs — this later def shadows
# the earlier one at import time; consider deleting one of the two.
def do_ddt_runs(expt):
    """Run the allele-based error-detection experiment.

    Returns a dict with keys 'Error' and 'Correct', each a list (one entry per
    run) of per-frame result lists.
    """
    gen = DataGenerator(expt.num_phonemes, expt.num_features,
                        expt.var_diag_interval, expt.var_offdiag_interval)
    perfect_practice_data = gen.generate_simulated_data(expt.num_practice_frames)
    practice_data, num_practice_errors = gen.add_errors_to_data(
        perfect_practice_data, expt.practice_error_rate)
    practice_data_dict = partition_data(practice_data)
    # We got some practice data for every point, right?
    # BUG FIX: the original read len(keys() == expt.num_phonemes), i.e. len()
    # of a boolean, which raises TypeError; the intent is to count the keys.
    assert len(practice_data_dict.keys()) == expt.num_phonemes

    test_data = gen.generate_simulated_data(expt.num_test_frames)

    n = expt.num_training_frames
    assert n * expt.training_error_rate >= 5        # number of errorful points
    assert n * (1 - expt.training_error_rate) > 5   # number of correct points
    error_training_frame_indices = range(0, 5)
    correct_training_frame_indices = range(n - 5, n)

    all_results = {}
    all_results['Error'] = []
    all_results['Correct'] = []
    for run_idx in range(0, expt.num_runs):
        training_data, num_errors = make_training_data(gen, expt)
        c = SimpleClassifier(gen.get_labels(), gen.num_features)
        c.train_all(training_data)

        def run_some_frames(frame_indices):
            # Score an allele for each frame index, training the allele's
            # variant on the training data with that frame removed.
            frame_results = []
            for i in frame_indices:
                label = training_data[i][0]
                a = SimpleAllele(c, [label])
                # subtract (label, frame) from training_data for active phoneme
                alt_data = training_data[:i] + training_data[i + 1:]
                # train alternate model in allele on alternate data
                a.train_variants(alt_data)
                # print a.make_details_string()
                # Construct a subset of the practice data with only the points
                # which are labelled with the active label of the allele
                # (see comments below).
                data = [(label, point) for point in practice_data_dict[label]]
                results = measurePrimaryAndVariantAccuracy(a, data)
                # KJB - here's the original version, in which we just
                # used all the practice data.  This essentially means we
                # aren't using the practice data labels at all, which
                # might be an interesting variation, but isn't the
                # original intention.
                # results = measurePrimaryAndVariantAccuracy(a, practice_data)
                frame_results.append(results)
            return frame_results

        error_results = run_some_frames(error_training_frame_indices)
        all_results['Error'].append(error_results)
        correct_results = run_some_frames(correct_training_frame_indices)
        all_results['Correct'].append(correct_results)
    return all_results