Exemplo n.º 1
0
 def __init__(self, base_path, modifier, data_dir):
     """Store the experiment inputs and create the summary object.

     The summary is a ``Config`` constructed from the path of
     ``summary.txt`` inside *data_dir*.
     """
     summary_file = os.path.join(data_dir, 'summary.txt')
     self.modifier = modifier
     self.data_dir = data_dir
     self.base_path = base_path
     self.summary = Config(summary_file)
Exemplo n.º 2
0
class Experiment:

    def __init__(self, base_path, modifier, data_dir):
        self.base_path = base_path
        self.data_dir = data_dir
        self.modifier = modifier
        self.summary = Config(os.path.join(data_dir, 'summary.txt'))

    def run(self, end=1):
        self.summary.set('overall', 'base_path', self.base_path)
        self.summary.set('overall', 'modifier_name', self.modifier.name)
        print 'Running experiment'
        start = self.summary.getint('overall', 'trial_count', 0) + 1
        end = start + end
        for trial in xrange(start, end):
            print '    performing trial %d' % (trial, )
            self.summary.set('overall', 'trial_count', trial)
            self.generate()
            self.test()
        self.print_summary()

    def generate(self):
        print '        generating new data'
        self.data_path = os.path.join(
            self.data_dir,
            'data%s' % os.path.splitext(self.base_path)[1]
        )
        with open(self.base_path, 'rb') as fd:
            base_data = fd.read()
        with open(self.data_path, 'wb') as fd:
            start = True
            for modified in self.modifier.modify(base_data):
                if not start:
                    fd.write(utilities.RECORD_SEPARATOR)
                else:
                    start = False
                fd.write(modified)

    def __run_test(self, name, module, *args):
        print '           ', name
        compressed_data_path = module.encode(self.data_path, *args)
        self.update_averages(name, compressed_data_path)

    def test(self):
        print '        running encoding tests'
        iframe_options = [0, 1, 2, 5, 10]
        tests = [
            # (name, module, options)
            ('raw', raw, None),
            ('bz2', bzip2, None),
            ('gzip', gz, None),
            ('zip', _zip, None),
            ('{o} order optimal',
             optimal,
             xrange(0, 10)),
            ('bsdiff, iframe @ {o}',
             bsdiff,
             iframe_options),
            ('diffe, iframe @ {o}',
             diffe,
             iframe_options),
            ('diffe_gz, iframe @ {o}',
             diffe_gz,
             iframe_options),
            ('vcdiff, iframe @ {o}',
             vcdiff,
             iframe_options)
        ]
        for (name, module, options) in tests:
            if options is None:
                self.__run_test(name, module)
            else:
                for o in options:
                    new_name = name.format(o=o)
                    self.__run_test(new_name, module, o)
        self.summary.save()

    def update_averages(self, name, data_path):
        trial_count = self.summary.getint('overall', 'trial_count', 1)

        size = os.stat(data_path).st_size
        mean = self.summary.getfloat(name, 'size_mean', 0.0)
        m2 = self.summary.getfloat(name, 'size_m2', 0.0)

        delta = size - mean
        mean = mean + delta / trial_count
        m2 = m2 + delta * (size - mean)

        if trial_count == 1:
            variance = 0
        else:
            variance = m2 / (trial_count - 1)

        self.summary.set(name, 'size_mean', mean)
        self.summary.set(name, 'size_m2', m2)
        self.summary.set(name, 'size_variance', variance)

    def print_summary(self):
        print ''
        print 'Summary'
        print '-------'
        print 'Base data:', self.summary.get('overall', 'base_path')
        print 'Modifier:', self.summary.get('overall', 'modifier_name')
        print 'Trial Count:', self.summary.get('overall', 'trial_count')
        print ''
        print 'Results:'
        data = []
        raw_mean = self.summary.getfloat('raw', 'size_mean')
        for name in filter(lambda n: n != 'overall', self.summary.sections()):
            mean = self.summary.getfloat(name, 'size_mean')
            variance = self.summary.getfloat(name, 'size_variance')
            data.append([name, mean, variance])
        data = sorted(data, key=itemgetter(1))
        for row in data:
            row.append((100.0 * row[1]) / raw_mean)
            row.append((100.0 * row[1]) / data[0][1])
        headers = [
            'encoding',
            'mean size',
            'size variance',
            '% of raw',
            '% of best']
        print tabulate(data, headers, floatfmt=".2f")
        print ''