def __init__(self, data, gene_length, genome_length):
        """"""
        BinaryClassifier.__init__(self, data, gene_length, genome_length)

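        # Genome encoding: a genome_length-long sequence of '0'/'1' symbols;
        # "duplicates": True presumably allows the same symbol to appear in
        # more than one position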
        self.representation = Representation(
            {"length": genome_length, "type": "enum", "values": ["0", "1"], "duplicates": True}
        )

        self.population_size = 100
        self.generations = 250
        self.selection_func = tournament
        self.tournament_size = 10
        self.genome_lengths = list()
        self.mutation_prob = 0.01
        self.mutation_func = self.mutation_func_variable_length
        self.crossover_prob = 0.2
        self.elite_count = 6
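
The mutation_func_variable_length operator assigned above is not included in this
snippet. Below is a minimal sketch of one way such a method could work, assuming
the framework passes in the flat list of '0'/'1' gene symbols and expects the
mutated list back; the signature, probabilities and grow/shrink scheme are
illustrative assumptions, not the project's actual implementation.

import random

def mutation_func_variable_length(self, genes):
    """Sketch only: per-symbol flips plus occasional whole-gene growth or shrinkage."""
    genes = list(genes)
    # Flip individual symbols with the configured per-symbol probability
    for i in xrange(len(genes)):
        if random.random() < self.mutation_prob:
            genes[i] = '1' if genes[i] == '0' else '0'
    # Occasionally add or drop one whole gene (gene_length symbols: condition
    # bits plus class) so genome length can vary between individuals
    if random.random() < self.mutation_prob:
        if random.random() < 0.5:
            genes.extend(random.choice('01') for _ in xrange(self.gene_length))
        elif len(genes) > self.gene_length:
            del genes[-self.gene_length:]
    return genes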
Example #2
import random
import sys

# Population, Gene, Representation, tournament, chunker and the *Classifier
# classes are assumed to be importable from the surrounding project.


def main():
    """
    Run the classification GA on the data file given on the command line.
    """
    if len(sys.argv) != 2:
        sys.exit('usage: classifier.py datafile')

    data_file = sys.argv[1]

    with open(data_file, 'r') as f:
        # Read the first (informational) line
        info_line = f.readline().split()

        # Gene length = number of condition variables, plus one for the class
        gene_length = int(info_line[3]) + 1
        # Derive the length of an individual
        genome_length = (int(info_line[0]) * gene_length)
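        # Worked example (assumed header layout): if the header's first field
        # (presumably the number of rules per individual) is 32 and its fourth
        # field (the number of condition variables) is 5, then
        # gene_length = 5 + 1 = 6 and genome_length = 32 * 6 = 192, which is
        # the data1.txt case handled below.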

        # Binary data (data1.txt)
        if genome_length == 192:
            data = [list(line.rstrip().replace(' ', '')) for line in f]
            classifier = BinaryClassifier(data, gene_length,
                                          genome_length)

        # Binary data (data2.txt)
        elif genome_length == 448:
            data = [list(line.rstrip().replace(' ', '')) for line in f]
            classifier = VariableLengthBinaryClassifier(data, gene_length,
                                                        genome_length)

        # Real-valued data
        elif genome_length == 14000:
            data = [map(float, line.rstrip().split()) for line in f]
            # 2 floats (upper, lower bound) per condition variable; the class
            # label is stored separately on each Gene below, hence the "- 2"
            gene_length = (gene_length * 2) - 2
            genome_length = (int(info_line[0]) * gene_length)
            classifier = RealValueClassifier(data, gene_length,
                                             genome_length)

        else:
            raise IOError('unknown data file format')

        print '[i] loaded data file:', data_file

    #---------------------------------------------------------------------------
    # Generate the initial population
    #---------------------------------------------------------------------------
    generations = classifier.generations

    p = Population(representation=classifier.representation,
                   size=classifier.population_size,
                   fitness_func=classifier.fitness_func,
                   selection_func=classifier.selection_func,
                   crossover_func=classifier.crossover_func,
                   mutation_func=classifier.mutation_func,
                   natural_fitness=True,
                   crossover_probability=classifier.crossover_prob,
                   mutation_probability=classifier.mutation_prob,
                   elite_count=classifier.elite_count,
                   tournament_size=classifier.tournament_size)
    p.gen_population()

    #---------------------------------------------------------------------------
    # Fiddle the population (ugly hack alert)
    #---------------------------------------------------------------------------
    step = classifier.gene_length

    if isinstance(classifier, VariableLengthBinaryClassifier):
        for individual in p:
            # Fix a 0 or 1 in the class position
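            # (the condition alphabet presumably includes a '#' wildcard, which
            #  is not a valid class label, hence the replacement here)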
            for i in xrange(step - 1, len(individual.genes), step):
                if individual.genes[i] == '#':
                    individual.genes[i] = '1' if random.random() < 0.5 else '0'

            classifier.genome_lengths.append(len(individual.genes))

    if isinstance(classifier, RealValueClassifier):
        for i, individual in enumerate(p.population):

            new_genes = list()
            average_sigmas = list()
            individual.average_sigmas = list()

            for genes in classifier.batch_gen(individual.genes,
                                              classifier.gene_length):
                g = Gene(genes)
                g.class_label = 1 if random.random() < 0.5 else 0
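                # Per-dimension mutation step sizes (sigmas), initialised to
                # 0.05; presumably used for self-adaptive Gaussian mutation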
                g.mutation_step_sizes = [0.05 for _
                                         in xrange(classifier.gene_length)]
                new_genes.append(g)

                # Update info for plotter
                average_sigmas.append(sum(g.mutation_step_sizes)
                                      / len(g.mutation_step_sizes))

            individual.genes = new_genes

            individual.average_sigmas.append(sum(average_sigmas)
                                             / len(average_sigmas))
            # Add strategy parameters
            individual.strategy_params = {'mutation_step_size': 0.05}

    print '[i] fiddled population'

    # if hasattr(classifier, 'genome_lengths'):
    #     p.add_to_plot([len(i) for i in p], 'avg genome length')

    #---------------------------------------------------------------------------
    # Run the GA
    #---------------------------------------------------------------------------
    p.run(generations)

    #---------------------------------------------------------------------------
    # Validate the population
    #---------------------------------------------------------------------------
    print
    avg = 0
    for individual in p:
        avg += classifier.fitness_func(individual.genes, validate=True)
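    # Fitness is printed as score out of set size on the (t)raining and
    # (v)alidation sets; presumably the number of correctly classified samples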
    print 'min individual: %d/%dt, %d/%dv (len=%d, num genes=%d) %s' % \
          (classifier.fitness_func(p.min_individual().genes),
           len(classifier.training_set),
           classifier.fitness_func(p.min_individual().genes, validate=True),
           len(classifier.validation_set),
           len(p.min_individual()),
           len(p.min_individual()) / classifier.gene_length,
           p.min_individual())
    print 'mean validation fitness:', float(avg) / len(p)
    print 'max individual: %d/%dt, %d/%dv (len=%d, num genes=%d) %s' % \
          (classifier.fitness_func(p.max_individual().genes),
           len(classifier.training_set),
           classifier.fitness_func(p.max_individual().genes, validate=True),
           len(classifier.validation_set),
           len(p.max_individual()),
           len(p.max_individual()) / classifier.gene_length,
           p.max_individual())

    # TODO: plot amount of generalisation

    if isinstance(classifier, VariableLengthBinaryClassifier):
        data = list()
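        # genome_lengths presumably ends up holding one entry per individual
        # per generation, so averaging over chunks of population size gives a
        # per-generation average genome length for the plot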
        for chunk in classifier.chunker(classifier.genome_lengths, p.size):
            data.append(float(sum(chunk)) / len(chunk))

        p.add_to_plot(data, 'avg genome length')

        for gene in classifier.chunker(p.max_individual().genes,
                                         classifier.gene_length):
            print gene[:-1], gene[-1]

    elif isinstance(classifier, RealValueClassifier):
        data = list()
        for chunk in chunker(p.average_sigmas, p.size):
            data.append(sum(chunk) / len(chunk))

        p.add_to_plot(data, 'average sigma')

        for gene in p.max_individual().genes:
            for pair in chunker(gene.alleles, 2):
                print pair

            print gene.class_label
            print

    p.show_plot()
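
The chunking helpers used above (classifier.chunker, classifier.batch_gen and
the module-level chunker) are not included in this snippet. Judging only from
their call sites, they appear to yield consecutive fixed-size slices of a
sequence; a minimal sketch under that assumption, not the project's actual code:

def chunker(seq, size):
    """Yield consecutive slices of seq, each size items long (the last may be shorter)."""
    for i in xrange(0, len(seq), size):
        yield seq[i:i + size]

# e.g. list(chunker([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]
# batch_gen is assumed to behave the same way over an individual's flat gene list.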