Example No. 1
def simple_test(expt):
    # Build a generator and a classifier that are perfectly matched
    # with respect to means and see what sort of error rate we get for
    # various variance values in the generator.
    gen = DataGenerator(expt.num_phonemes, expt.num_features,
                        expt.var_diag_interval, expt.var_offdiag_interval)
    test_data = gen.generate_simulated_data(expt.num_test_frames)

    # Make perfect "training data" in the form of two points for each
    # class whose mean is exactly the mean for that class.  Training
    # on this will give a correct mean for the model, but with some
    # non-zero variance.

    labels = gen.get_labels()
    means = [array(target) for target in gen._targets]

    # Construct a list of (label, point) pairs with two points for each label
    delta = [0.1] * expt.num_features
    assert len(labels) == len(means)
    data = (zip(labels, (m + delta for m in means)) +
            zip(labels, (m - delta for m in means)))
    # print dump_data(data)

    c = SimpleClassifier(labels, gen.num_features)
    c.train_all(data)

    (rate, results) = measureAccuracy(c, test_data)
    summary = make_summary_string("Simple test", rate, results, c, test_data, gen)
    print summary
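
The two-points-per-class trick works because the sample mean of m + delta and m - delta is exactly m, while the population variance along each dimension is delta squared, so the trained model gets the true class mean with a small non-zero variance. A quick self-contained check (plain numpy only; no project code assumed):

import numpy

m = numpy.array([1.0, 2.0, 3.0])
delta = numpy.array([0.1, 0.1, 0.1])
pair = numpy.vstack([m + delta, m - delta])

print pair.mean(axis=0)  # exactly m: [ 1.  2.  3.]
print pair.var(axis=0)   # delta ** 2 everywhere: [ 0.01  0.01  0.01]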
Example No. 2
def do_ddt_runs(expt):
    gen = DataGenerator(expt.num_phonemes, expt.num_features,
                        expt.var_diag_interval, expt.var_offdiag_interval)

    perfect_practice_data = gen.generate_simulated_data(
        expt.num_practice_frames)
    practice_data, num_practice_errors = gen.add_errors_to_data(
        perfect_practice_data, expt.practice_error_rate)
    practice_data_dict = partition_data(practice_data)
    # We got some practice data for every phoneme, right?
    assert len(practice_data_dict.keys()) == expt.num_phonemes

    test_data = gen.generate_simulated_data(expt.num_test_frames)

    n = expt.num_training_frames
    assert n * expt.training_error_rate >= 5       # number of errorful points
    assert n * (1 - expt.training_error_rate) > 5  # number of correct points
    error_training_frame_indices = range(0, 5)
    correct_training_frame_indices = range(n - 5, n)

    all_results = {}
    all_results['Error'] = []
    all_results['Correct'] = []
    for run_idx in range(0, expt.num_runs):
        training_data, num_errors = make_training_data(gen, expt)
        c = SimpleClassifier(gen.get_labels(), gen.num_features)
        c.train_all(training_data)

        def run_some_frames(frame_indices):
            frame_results = []
            for i in frame_indices:
                label = training_data[i][0]
                a = SimpleAllele(c, [label])
            
                # subtract (label, frame) from training_data for active phoneme
                alt_data = training_data[:i] + training_data[i+1:]
            
                # train alternate model in allele on alternate data
                a.train_variants(alt_data)
                # print a.make_details_string()

                # Construct a subset of the practice data with only the points
                # which are labelled with the active label of the allele (see comments below).
                data = [(label, point) for point in practice_data_dict[label]]
                results = measurePrimaryAndVariantAccuracy(a, data)

                # KJB - here's the original version, in which we just
                # used all the practice data.  This essentially means we
                # aren't using the practice data labels at all, which
                # might be an interesting variation, but isn't the
                # original intention.
                #results = measurePrimaryAndVariantAccuracy(a, practice_data)

                frame_results.append(results)
            return frame_results

        error_results = run_some_frames(error_training_frame_indices)
        all_results['Error'].append(error_results)
        correct_results = run_some_frames(correct_training_frame_indices)
        all_results['Correct'].append(correct_results)
    return all_results
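
partition_data is used above as a grouping helper: it must bucket the (label, point) practice pairs by label so that points can be looked up per phoneme. A minimal sketch of that assumed behavior (not necessarily the project's actual implementation):

from collections import defaultdict

def partition_data(data):
    # Bucket (label, point) pairs into a dict mapping label -> list of points.
    partitioned = defaultdict(list)
    for label, point in data:
        partitioned[label].append(point)
    return dict(partitioned)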
Example No. 3
def do_baseline_runs(expt):
    gen = DataGenerator(expt.num_phonemes, expt.num_features,
                        expt.var_diag_interval, expt.var_offdiag_interval)

    all_results = []
    for run_idx in range(expt.num_runs):
        test_data = gen.generate_simulated_data(expt.num_test_frames)

        # There's a problem here if there's only one data point, since
        # then we end up with a variance of 0.  We currently hack
        # around this problem by guaranteeing more than one point.  We
        # could change the models to allow zero variance but this will
        # mean not being able to make samples from the models without
        # some extra work.  Note that we don't care at all about order
        # of training data in these experiments, so we just build our
        # training data in two parts and cat them together.  If you
        # hit either of these asserts, you're asking for an error rate
        # that's too high and/or a training data size that's too low.
        # We need two correct samples per phoneme.
        num_secondary_frames = expt.num_training_frames - expt.num_phonemes * 2
        num_errorful_frames = expt.num_training_frames * expt.training_error_rate
        assert expt.num_training_frames >= expt.num_phonemes * 2
        assert num_secondary_frames > num_errorful_frames
        errorless_training_data = gen.generate_simulated_data_per_phoneme(2)
        secondary_training_data = gen.generate_simulated_data(
            num_secondary_frames)

        # Slight trickiness to get a correct error rate for this subset of the data
        subset_error_rate = float(num_errorful_frames) / num_secondary_frames
        errorful_training_data, num_errors = gen.add_errors_to_data(
            secondary_training_data, subset_error_rate)

        practice_data = gen.generate_simulated_data(expt.num_practice_frames)
        errorful_practice_data, num_errors = gen.add_errors_to_data(
            practice_data, expt.practice_error_rate)

        training_data = errorless_training_data + errorful_training_data + errorful_practice_data

        c = SimpleClassifier(gen.get_labels(), gen.num_features)
        c.train_all(training_data)

        (rate, results) = measureAccuracy(c, test_data)
        name = "Baseline 0.%d" % (run_idx, )
        summary = make_summary_string(name, rate, results, c, test_data, gen)
        all_results.append((name, rate))

        # print "Classifier:\n"
        # print c.to_string()
        # print summary
    print "\n--------------------------Summary-----------------------"
    print make_all_runs_summary_string(expt, all_results)
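
The subset_error_rate rescaling above keeps the total number of errorful frames at the requested overall rate even though errors are only injected into the secondary portion of the training data. A worked check with hypothetical sizes (numbers chosen only for illustration):

num_training_frames = 100
num_phonemes = 10
training_error_rate = 0.1

num_secondary_frames = num_training_frames - num_phonemes * 2    # 80
num_errorful_frames = num_training_frames * training_error_rate  # 10.0
subset_error_rate = float(num_errorful_frames) / num_secondary_frames

print subset_error_rate                         # 0.125
print subset_error_rate * num_secondary_frames  # 10.0 errorful frames in total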
Example No. 4
def do_simple_allele_test(expt):
    gen = DataGenerator(expt.num_phonemes, expt.num_features,
                        expt.var_diag_interval, expt.var_offdiag_interval)
    test_data = gen.generate_simulated_data(expt.num_test_frames)

    for run_idx in range(0, expt.num_runs):
        training_data, num_errors = make_training_data(gen, expt)
        # Select training data frames to be tested; their indices go into
        # sample_training_frame_indices.  This is a subset of the training
        # data consisting of some errorful frames and some correct frames -
        # we hope to identify the incorrect frames.

        # For now, use the first 5 frames and the last 5.  The former will
        # have errors and the latter will be correct.
        n = len(training_data)
        assert n * expt.training_error_rate > 5        # number of errorful points
        assert n * (1 - expt.training_error_rate) > 5  # number of correct points
        sample_training_frame_indices = range(0, 5) + range(n - 5, n)

        c = SimpleClassifier(gen.get_labels(), gen.num_features)
        c.train_all(training_data)

        all_results = []
        for i in sample_training_frame_indices:
            label = training_data[i][0]
            a = SimpleAllele(c, [label])

            # subtract (label, frame) from training_data for active phoneme
            alt_data = training_data[:i] + training_data[i + 1:]

            # train alternate model in allele on alternate data
            a.train_variants(alt_data)
            # print a.make_details_string()

            results = measurePrimaryAndVariantAccuracy(a, test_data)
            print results
            all_results.append(results)
        print 'End run %d\n' % (run_idx,)
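
The slice training_data[:i] + training_data[i + 1:] is the leave-one-out step: it rebuilds the training list with frame i removed while leaving the original list untouched, so the primary classifier's training set stays intact. The same idiom as a standalone helper (hypothetical name, for illustration only):

def leave_one_out(data, i):
    # Return a copy of data with the i-th item removed; the original
    # list is not modified.
    return data[:i] + data[i + 1:]

frames = [('a', 1), ('b', 2), ('c', 3)]
print leave_one_out(frames, 1)  # [('a', 1), ('c', 3)]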