Example #1
def main(sival_dir, outputfile):
    # Assumes the original file's module-level imports: glob, os,
    # numpy as np, scipy.io's savemat, and the project's own parse_c45
    # and ProgressMonitor.
    names_files = glob.glob(os.path.join(sival_dir, '*.names'))
    # Strip the 6-character '.names' extension to recover class names.
    classes = sorted([os.path.basename(nf[:-6]) for nf in names_files])

    mat = {}
    mat['class_names'] = np.array(classes)

    data = None
    reverse_index = {}
    progress = ProgressMonitor(total=len(classes), msg='Getting class labels')
    for i, clazz in enumerate(classes, 1):
        exset = parse_c45(clazz, sival_dir)
        if data is None:
            # First class seen: cache the feature matrix (dropping the
            # two id columns and the trailing label column) and build a
            # (bag id, instance id) -> row index lookup.
            data = np.array(exset.to_float())[:, 2:-1]
            inst_classes = np.zeros(len(exset))
            index = [(ex[0], ex[1]) for ex in exset]
            for j, key in enumerate(index):
                reverse_index[key] = j

        # ex[-1] is a 0/1 label, so a positive instance accumulates the
        # 1-based index i of its class; purely negative instances stay 0.
        for ex in exset:
            inst_classes[reverse_index[(ex[0], ex[1])]] += i*ex[-1]
        progress.increment()

    mat['instance_ids'] = np.array(index)
    mat['X'] = data
    mat['y'] = inst_classes

    savemat(outputfile, mat)
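
Every example on this page drives the same small interface: a ProgressMonitor constructed with a total count, an optional print_interval, and a msg label, plus one increment() call per completed unit of work. The class itself never appears on the page; here is a minimal sketch assuming only that interface (the printing behavior and the default print_interval are guesses, not the projects' actual implementation):

import sys

class ProgressMonitor(object):
    """Minimal stand-in for the ProgressMonitor these examples use."""

    def __init__(self, total, print_interval=10, msg='Progress'):
        self.total = total
        self.print_interval = print_interval
        self.msg = msg
        self.count = 0

    def increment(self):
        # One call per completed unit of work.
        self.count += 1
        if self.count % self.print_interval == 0 or self.count >= self.total:
            pct = 100.0 * self.count / max(self.total, 1)
            sys.stdout.write('\r%s: %6.2f%%' % (self.msg, pct))
            if self.count >= self.total:
                sys.stdout.write('\n')
            sys.stdout.flush()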
Example #2
def _prog(plist):
    progress = ProgressMonitor(total=len(plist),
                               print_interval=1,
                               msg='Constructing Kernel')
    for p in plist:
        yield p
        # increment() executes when the consumer resumes the generator,
        # so the count reflects items already handed downstream.
        progress.increment()
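
Because _prog re-yields each element as it counts it, it can wrap the iterable of an existing loop or comprehension without restructuring the code. A hypothetical usage sketch (X and the kernel function k are made-up stand-ins; ProgressMonitor as sketched above):

import numpy as np

X = np.random.rand(100, 5)      # hypothetical data: 100 points in R^5

def k(x, y):
    # Hypothetical kernel function.
    return np.dot(x, y)

# Progress advances once per row of the Gram matrix.
K = np.array([[k(x, y) for y in X] for x in _prog(X)])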
Example #3
def make_weights(X):
    n = len(X)
    prog = ProgressMonitor(total=n,
                           print_interval=1,
                           msg='Constructing Kernel')
    ws = []
    # 'k' and 'median_weight' are free variables bound in the enclosing
    # scope of the original source.
    for x in X:
        prog.increment()
        ws.append(median_weight(k, x))
    return ws
Example #4
def compute_andor(configuration_file, kerneldir):
    print 'Loading configuration...'
    with open(configuration_file, 'r') as f:
        configuration = yaml.load(f)

    kernels = dict()
    for experiment in configuration['experiments']:
        dataset = experiment['dataset']
        epsilon = experiment['epsilon']
        delta = experiment['delta']
        seed = experiment['seed']

        n = get_dset_size(dataset)
        # Allocated but never written in this excerpt; results go
        # through andortask.store_results() below instead.
        mantissa = np.zeros((n, n))
        exponent = np.zeros((n, n))
        time = np.zeros((n, n))

        # One task per (i, j) pair of the upper triangle, diagonal
        # included: n*(n+1)/2 in total.
        prog = ProgressMonitor(total=(n*(n+1)/2),
                               msg='%s,andor,%f,%f,%d'
                                   % (dataset, epsilon, delta, seed))
        alldone = True
        for i in range(n):
            for j in range(i, n):
                prog.increment()
                andorkey = (dataset, 'andor', epsilon, delta, seed, i, j)
                andortask = Task(*andorkey)
                andortask.ground(kerneldir)
                if andortask.finished:
                    continue

                andkey = (dataset, 'and', epsilon, delta, seed, i, j)
                andtask = Task(*andkey)
                andtask.ground(kerneldir)
                if not andtask.finished:
                    alldone = False
                    continue

                orkey = (dataset, 'or', epsilon, delta, seed, i, j)
                ortask = Task(*orkey)
                ortask.ground(kerneldir)
                if not ortask.finished:
                    alldone = False
                    continue

                andtime = andtask.runtime()
                ortime = ortask.runtime()
                andman, andexp = andtask.value()
                orman, orexp = ortask.value()
                # Values are stored as (mantissa, exponent) pairs, so
                # the and/or quotient is a mantissa ratio and an
                # exponent difference.
                submission = {
                    'mantissa' : (andman / orman),
                    'exponent' : (andexp - orexp),
                    'time'     : (andtime + ortime),
                }
                andortask.store_results(submission)

        if not alldone:
            print 'Unfinished: %s, %f, %f, %d' % (dataset, epsilon, delta, seed)
Example #5
def main(outputfile):
    progress = ProgressMonitor(total=len(DATASETS), msg='Extracting statistics')
    with open(outputfile, 'w+') as f:
        stats = ','.join(stat for stat, _ in STATISTICS)
        f.write('#%s\n' % stats)
        for dataset in DATASETS:
            dset = get_dataset(dataset)
            dset.name = dataset
            stats = ','.join(map(str, (f(dset) for _, f in STATISTICS)))
            f.write('%s\n' % stats)
            progress.increment()
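
STATISTICS is evidently a sequence of (column name, function) pairs applied to each dataset object. A hypothetical instance consistent with the loops above, where every attribute except dset.name (set explicitly in main) is made up:

STATISTICS = [
    ('name',      lambda dset: dset.name),           # set in main() above
    ('instances', lambda dset: len(dset.instances)), # hypothetical attribute
    ('bags',      lambda dset: len(dset.bags)),      # hypothetical attribute
]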
Example #6
    def learn(self, X_labeled, y_labeled, X_pool, y_pool, X_test):
        # Initial Predictions
        self.classifier.fit(X_labeled, y_labeled)
        predictions = [self.classifier.decision_function(X_test)]

        if self.verbose:
            progress = ProgressMonitor(total=self.queries,
                                       msg='Active Learning')
        for q in range(self.queries):
            if len(X_pool) <= 0:
                # Pool exhausted: repeat the last predictions for this query.
                if self.verbose: print 'Warning: skipping query %d...' % q
                predictions.append(predictions[-1])
            else:
                next_labeled = self.select(X_pool)
                X_labeled.append(X_pool.pop(next_labeled))
                y_labeled.append(y_pool.pop(next_labeled))
                self.classifier.fit(X_labeled, y_labeled)
                predictions.append(self.classifier.decision_function(X_test))
            if self.verbose: progress.increment()

        return predictions
Example #7
def main(dataset, folddir, outputdir, reps=0):
    data_dict = data.get_dataset(dataset)
    folds = data.get_folds(folddir, dataset)

    all_bag_ids = set(data_dict.keys())

    progress = ProgressMonitor(total=reps * len(folds),
                               msg='Generating Replicates')

    for f in range(len(folds)):
        test = data.get_fold(folddir, dataset, f)
        bag_ids = np.array(list(all_bag_ids - set(test)))
        n = len(bag_ids)

        for r in range(1, reps + 1):
            rep_path = os.path.join(outputdir,
                                    '%s_%04d_%06d.rep' % (dataset, f, r))
            if not os.path.exists(rep_path):
                # Bootstrap replicate: sample n bag ids with replacement.
                sample = np.random.randint(n, size=n)
                sampled_bags = bag_ids[sample]
                with open(rep_path, 'w+') as ofile:
                    ofile.write('\n'.join([bid for bid in sampled_bags.flat]))
            progress.increment()
Example #8
def main(configfile, folddir, resultsdir):
    with open(configfile, 'r') as f:
        configuration = yaml.load(f)

    # Generate tasks from experiment list
    total = 0
    actual = 0
    prog = ProgressMonitor(total=len(configuration['experiments']),
                           msg='Computing noise')
    for experiment in configuration['experiments']:
        technique = experiment['technique']
        classifier = experiment['classifier']
        dataset = experiment['dataset']

        ids, _, y = data.get_dataset(dataset)
        y_dict = {}
        for (bid, iid), yi in zip(ids, y):
            y_dict[bid, iid] = yi

        folds = data.get_folds(folddir, dataset)
        for f in range(len(folds)):
            for r in range(experiment['reps']):
                for i in experiment['initial']:
                    for s in experiment['shuffled']:
                        labeled = setup_rep(technique, experiment['noise'],
                                            dataset, f, r, i, s, folddir,
                                            resultsdir)
                        pos_shuffled = get_positive_shuffled(labeled, i, s)
                        total += len(pos_shuffled)
                        actual += count_actual_positive(pos_shuffled, y_dict)

        prog.increment()
        if total > 0:
            # Running estimate of the label-noise rate: the fraction of
            # shuffled 'positive' instances that are not truly positive.
            print 1 - (float(actual) / total)

    if total > 0:
        print 1 - (float(actual) / total)
Example #9
File: server.py Project: garydoranjr/smile
def main(configfile, folddir, resultsdir):
    with open(configfile, 'r') as f:
        configuration = yaml.load(f)

    # Count total experiments for progress monitor
    exps = 0
    for experiment in configuration['experiments']:
        dataset = experiment['dataset']
        folds = get_folds(folddir, dataset)
        for f in range(len(folds)):
            for r in range(experiment['reps']):
                for n in experiment['noise']:
                    for s in experiment['shuffled']:
                        exps += 1

    prog = ProgressMonitor(total=exps, msg='Generating Shuffled Bags')

    # Generate tasks from experiment list
    tasks = {}
    for experiment in configuration['experiments']:
        technique = experiment['technique']
        classifier = experiment['classifier']
        dataset = experiment['dataset']
        folds = get_folds(folddir, dataset)
        for f in range(len(folds)):
            for r in range(experiment['reps']):
                for n in experiment['noise']:
                    for s in experiment['shuffled']:
                        key = (technique, classifier,
                               dataset,
                               experiment['kernel'],
                               f, r, n, s)
                        kwargs = {}
                        kwargs['params'] = experiment['params']
                        kwargs['shuffled_bags'] = setup_rep(technique, dataset,
                                                            f, r, n, s,
                                                            folddir, resultsdir)
                        task = Task(*key, **kwargs)
                        tasks[key] = task
                        prog.increment()

    # Mark finished tasks
    for task in tasks.values():
        predfile = os.path.join(resultsdir, task.filebase('preds'))
        if os.path.exists(predfile):
            task.finish()

    def handle(key, task, submission):
        if 'stats' in submission:
            sfile = os.path.join(resultsdir, task.filebase('stats'))
            with open(sfile, 'w+') as f:
                f.write(yaml.dump(submission['stats'], default_flow_style=False))

        pfile = os.path.join(resultsdir, task.filebase('preds'))
        with open(pfile, 'w+') as f:
            f.write(yaml.dump(submission['preds'], default_flow_style=False))

    server = ExperimentServer(tasks, render, handle)
    cherrypy.config.update({'server.socket_port': PORT,
                            'server.socket_host': '0.0.0.0'})
    cherrypy.quickstart(server)
Example #10
def load_config(configuration_file, results_root_dir):
    tasks = {}
    parameter_dict = {}

    print 'Loading configuration...'
    with open(configuration_file, 'r') as f:
        configuration = yaml.load(f)

    experiment_key = configuration['experiment_key']
    experiment_name = configuration['experiment_name']

    if experiment_name == 'mi_kernels':
        from resampling import NullResamplingConfiguration
        def constructor_from_experiment(experiment):
            return lambda dset: NullResamplingConfiguration(dset)
    else:
        raise ValueError('Unknown experiment name "%s"' % experiment_name)

    for experiment in configuration['experiments']:
        try:
            experiment_id = tuple(experiment[k] for k in experiment_key)
        except KeyError:
            raise KeyError('Experiment missing identifier "%s"'
                            % experiment_key)

        def _missing(pretty_name):
            raise KeyError('%s not specified for experiment "%s"'
                            % (pretty_name, str(experiment_id)))

        def _resolve(field_name, pretty_name):
            # Per-experiment settings take precedence over top-level
            # configuration defaults; a missing field is fatal.
            field = experiment.get(field_name,
                        configuration.get(field_name, None))
            if field is None: _missing(pretty_name)
            return field

        print 'Setting up experiment "%s"...' % str(experiment_id)

        try:
            dataset = experiment['dataset']
        except KeyError: _missing('Dataset')

        experiment_format = _resolve('experiment_key_format',
                                     'Experiment key format')

        parameter_key = _resolve('parameter_key', 'Parameter key')
        parameter_format = _resolve('parameter_key_format',
                                    'Parameter key format')
        parameters = _resolve('parameters', 'Parameters')
        param_config = ParameterConfiguration(results_root_dir,
                        experiment_name, experiment_id,
                        experiment_format, parameter_key,
                        parameter_format, parameters)
        parameter_dict[experiment_id] = param_config

        folds = _resolve('folds', 'Folds')
        fold_config = FoldConfiguration(dataset, *folds)

        resampling_constructor = constructor_from_experiment(experiment)

        priority = experiment.get('priority', 0)

        experiment_config = ExperimentConfiguration(
                                experiment_name, experiment_id,
                                fold_config, param_config,
                                resampling_constructor)
        settings = experiment_config.get_settings()
        prog = ProgressMonitor(total=len(settings), print_interval=10,
                               msg='\tGetting tasks')
        for setting in settings:
            key = experiment_config.get_key(**setting)
            task = experiment_config.get_task(**setting)
            task.priority_adjustment = priority
            task.ground(results_root_dir,
                experiment_format, parameter_format)
            tasks[key] = task
            prog.increment()

    return tasks, parameter_dict
Example #11
def copy_kernel(srck, dstk, dst, t):
    # Re-key each source row for the destination database: the first two
    # columns (dataset and kernel-type ids) are replaced with the
    # destination's own ids, and the remaining columns are copied as-is.
    dst_did = dstk.get_dataset_id(dst)
    dst_tid = dstk.get_ktype_id(t)
    srccon = srck.get_connection()
    cursor = srccon.cursor()
    cursor.execute('SELECT * FROM kernel')
    new_entries = [
        (dst_did, dst_tid, epsilon, delta, seed, i, j,
         mantissa, exponent, time)
        for _, _, epsilon, delta, seed, i, j, mantissa, exponent, time
        in cursor.fetchall()
    ]

    dstcon = dstk.get_connection()
    with dstcon:
        dstcon.executemany(
            'INSERT INTO kernel '
            'VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', new_entries
        )

if __name__ == '__main__':

    prog = ProgressMonitor(total=len(DST)*len(TYPES), msg='Copying kernels')
    for dst in DST:
        for t in TYPES:
            srcfile = filename(SRC, t)
            dstfile = filename(dst, t)
            srck = KernelManager(srcfile)
            dstk = KernelManager(dstfile)
            copy_kernel(srck, dstk, dst, t)
            prog.increment()
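
For reference, a kernel-table schema consistent with the column handling above (hypothetical; the real DDL lives wherever KernelManager creates its databases). The ten placeholders in the INSERT match the ten columns unpacked from each source row:

KERNEL_SCHEMA = '''
CREATE TABLE IF NOT EXISTS kernel (
    dataset_id INTEGER,  -- rewritten to the destination's id
    ktype_id   INTEGER,  -- rewritten to the destination's id
    epsilon    REAL,
    delta      REAL,
    seed       INTEGER,
    i          INTEGER,
    j          INTEGER,
    mantissa   REAL,
    exponent   INTEGER,
    time       REAL
)
'''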