    def handle(self, *args, **options):
        database_name = options['database_name']
        load_from = options['load_from']
        tmp_dir = options['tmp_dir']
        format = options['format']
        window_len = options['window_len']

        extractor = extractors[format]

        if not load_from.lower().endswith('.zip'):
            load_from += '.zip'

        if not os.path.isdir(tmp_dir):
            mkdirp(tmp_dir)

        variables = read_variables(load_from)
        variables['tmp_dir'] = tmp_dir
        variables['extractor'] = extractor
        variables['is_log_psd'] = format.startswith('log_')
        variables['database_name'] = database_name
        variables['window_len'] = window_len

        factory = NDS2SAEFactory()
        factory.set_output(load_from)
        factory.learning_rate = None
        factory.learning_rate_func = None
        encoder = factory.build()
        session = encoder.recreate_session()

        showcase_segmentation(variables, encoder, session)
        session.close()
Example #2
    def create_segmenter(self, variables) -> Segmenter:
        load_from = variables['load_from']
        factory = NDS2SAEFactory()
        factory.set_output(load_from)
        factory.learning_rate = None
        factory.learning_rate_func = None
        self.encoder = factory.build()
        self.session = self.encoder.recreate_session()
        return SeqAutoEncoderSegmenter(variables, self.encoder, self.session)
Example #3
def train(variables, save_to):
    sids_for_training = variables['sids_for_training']
    sids_for_testing = variables['sids_for_testing']
    n_train = len(sids_for_training)
    n_test = len(sids_for_testing)
    topology = variables['topology']
    batch_size = variables['batch_size']
    n_iterations = variables['n_iterations']
    keep_prob = variables['keep_prob']
    profiles = variables['profiles']

    batch_index_limits = dict(train=n_train, test=n_test)
    sids_collections = dict(train=sids_for_training, test=sids_for_testing)

    spects = {}
    windows_masked = {}

    # @profile  # noqa F821
    def get_batch(this_batch_size=10, data_type='train'):
        batch_index_limit = batch_index_limits[data_type]
        sids_collection = sids_collections[data_type]
        if this_batch_size is None:
            this_batch_size = batch_index_limit

        current_batch_index = variables['current_batch_index'][data_type]
        next_batch_index = current_batch_index + this_batch_size

        if current_batch_index == 0:
            np.random.shuffle(sids_collection)

        if next_batch_index >= batch_index_limit:
            next_batch_index = batch_index_limit
            variables['current_batch_index'][data_type] = 0
            final_batch = True
        else:
            variables['current_batch_index'][data_type] = next_batch_index
            final_batch = False

        # slice ids from the collection selected for this data_type (train or test)
        batch_ids = sids_collection[current_batch_index:next_batch_index]

        input_spects = []
        output_masks = []

        for sid in batch_ids:
            filepath, beg, end, window_masked = profiles[sid]
            if sid in windows_masked:
                window_masked = windows_masked[sid]
            else:
                window_masked = np.array(window_masked, dtype=np.float32)
                windows_masked[sid] = window_masked

            if filepath in spects:
                file_spect = spects[filepath]
            else:
                with open(filepath, 'rb') as f:
                    file_spect = pickle.load(f).transpose(1, 0)
                spects[filepath] = file_spect

            windowed_spect = file_spect[beg:end, :]
            input_spects.append(windowed_spect)
            output_masks.append(window_masked)

        return input_spects, output_masks, final_batch

    def train_batch_gen(batch_size):
        return get_batch(batch_size, 'train')

    def test_batch_gen(batch_size):
        return get_batch(batch_size, 'test')

    factory = NDS2SAEFactory()
    factory.set_output(save_to)
    factory.lrtype = variables['lrtype']
    factory.lrargs = variables['lrargs']
    factory.input_dim = variables['input_dims']
    factory.output_dim = variables['output_dims']
    factory.keep_prob = keep_prob
    factory.stop_pad_length = 0
    factory.go_token = -1
    factory.layer_sizes = infer_topology(topology, variables['input_dims'])
    encoder = factory.build()
    encoder.train(train_batch_gen, test_batch_gen, batch_size=batch_size, n_iterations=n_iterations, display_step=100,
                  save_step=500)
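
A minimal, self-contained sketch (toy helper and data, not part of the project code) of the batch-cycling pattern that get_batch implements above: the id list is reshuffled at the start of each epoch, handed out slice by slice, and a final_batch flag tells the caller when the epoch has ended.

import numpy as np

def make_batch_gen(ids):
    ids = np.array(ids)
    state = {'cursor': 0}

    def get_batch(batch_size):
        if state['cursor'] == 0:
            np.random.shuffle(ids)              # reshuffle at the start of every epoch
        beg = state['cursor']
        end = min(beg + batch_size, len(ids))
        final_batch = end >= len(ids)
        state['cursor'] = 0 if final_batch else end
        return ids[beg:end], final_batch        # this batch's ids + end-of-epoch flag

    return get_batch

# Cycle through 10 toy ids in batches of 4 until one full epoch is done.
gen = make_batch_gen(range(10))
final = False
while not final:
    batch, final = gen(4)
    print(batch, final)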
Example #4
def train(variables, save_to):
    sids_for_training = variables['sids_for_training']
    sids_for_testing = variables['sids_for_testing']
    n_train = len(sids_for_training)
    n_test = len(sids_for_testing)
    spect_dir = variables['spect_dir']
    format = variables['format']
    topology = variables['topology']
    batch_size = variables['batch_size']
    n_iterations = variables['n_iterations']
    keep_prob = variables['keep_prob']

    batch_index_limits = dict(train=n_train, test=n_test)
    sids_collections = dict(train=sids_for_training, test=sids_for_testing)

    def get_batch(this_batch_size=10, data_type='train'):
        batch_index_limit = batch_index_limits[data_type]
        sids_collection = sids_collections[data_type]
        if this_batch_size is None:
            this_batch_size = batch_index_limit

        current_batch_index = variables['current_batch_index'][data_type]
        next_batch_index = current_batch_index + this_batch_size

        if current_batch_index == 0:
            np.random.shuffle(sids_collection)

        if next_batch_index >= batch_index_limit:
            next_batch_index = batch_index_limit
            variables['current_batch_index'][data_type] = 0
            final_batch = True
        else:
            variables['current_batch_index'][data_type] = next_batch_index
            final_batch = False

        # slice ids from the collection selected for this data_type (train or test)
        batch_ids = sids_collection[current_batch_index:next_batch_index]

        spects = []

        for i, sid in enumerate(batch_ids):
            spect_path = os.path.join(spect_dir, '{}.{}'.format(sid, format))
            with open(spect_path, 'rb') as f:
                spect = pickle.load(f)
                spects.append(spect.transpose(1, 0))

        return spects, spects, final_batch

    def train_batch_gen(batch_size):
        return get_batch(batch_size, 'train')

    def test_batch_gen(batch_size):
        return get_batch(batch_size, 'test')

    factory = NDS2SAEFactory()
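    # if a save file already exists, load it first (presumably to resume training);
    # the settings assigned below are then (re)applied on top of it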
    if os.path.isfile(save_to):
        factory.load(save_to)
    factory.lrtype = variables['lrtype']
    factory.lrargs = variables['lrargs']
    factory.input_dim = variables['dims']
    factory.output_dim = variables['dims']
    factory.keep_prob = keep_prob
    factory.stop_pad_length = 5
    factory.stop_pad_token = 0
    factory.pad_token = -2
    factory.go_token = -3
    factory.layer_sizes = infer_topology(topology, variables['dims'])
    encoder = factory.build(save_to)
    encoder.train(train_batch_gen,
                  test_batch_gen,
                  batch_size=batch_size,
                  n_iterations=n_iterations,
                  display_step=100,
                  save_step=200)
Example #5
    def handle(self, *args, **options):
        mode = options['mode']
        database_name = options['database_name']
        database_only = options['database_only']
        kernel_only = options['kernel_only']
        load_from = options['load_from']
        tmp_dir = options['tmp_dir']
        dm_name = options['dm_name']
        format = options['format']
        with_duration = options['with_duration']
        min_max_loc = options['min_max_loc']
        denormalised = options['denormalised']

        extractor = extractors[format]

        if database_name is None and database_only:
            raise Exception(
                '--database-name must be provided when --database-only is set')

        if denormalised and min_max_loc is None:
            raise Exception(
                'If data is denormalised, --min-max-loc must be provided')

        if mode not in ['showcase', 'dm']:
            raise Exception('--mode can only be "showcase" or "dm"')

        if mode == 'showcase':
            if dm_name is not None:
                raise Exception(
                    'Can\'t accept --dm-name argument in showcase mode')

        else:
            if dm_name is None:
                raise Exception('Must provide --dm-name argument in dm mode')
            if database_name is None:
                raise Exception('database-name is required in dm mode')

        if not load_from.lower().endswith('.zip'):
            load_from += '.zip'

        if not os.path.isdir(tmp_dir):
            mkdirp(tmp_dir)

        variables = read_variables(load_from)
        variables['tmp_dir'] = tmp_dir
        variables['dm_name'] = dm_name
        variables['extractor'] = extractor
        variables['with_duration'] = with_duration
        variables['denormalised'] = denormalised

        if denormalised:
            global_min, global_max = load_global_min_max(min_max_loc)
            variables['global_min'] = global_min
            variables['global_max'] = global_max

        variables['is_log_psd'] = format.startswith('log_')

        factory = NDS2SAEFactory()
        factory.load(load_from)
        factory.learning_rate = None
        factory.learning_rate_func = None
        encoder = factory.build(load_from)
        session = encoder.recreate_session()

        if mode == 'showcase':
            showcase_reconstruct(variables, encoder, session, database_name,
                                 database_only)
        else:
            encode_into_datamatrix(variables, encoder, session, database_name,
                                   kernel_only)

        session.close()
Example #6
    def handle(self, *args, **options):
        clsf_type = options['clsf_type']
        database_name = options['database_name']
        annotator_name = options['annotator_name']
        label_level = options['label_level']
        min_occur = options['min_occur']
        ipc = options['ipc']
        ratio_ = options['ratio']
        profile = options['profile']

        load_from = options['load_from']
        format = options['format']
        min_max_loc = options['min_max_loc']
        denormalised = options['denormalised']
        kernel_only = options['kernel_only']

        extractor = extractors[format]

        tsv_file = profile + '.tsv'
        trials_file = profile + '.trials'
        if ipc is not None:
            assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
            ipc_min = ipc
            ipc_max = ipc
        else:
            ipc_min = min_occur
            ipc_max = int(np.floor(min_occur * 1.5))
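        # e.g. (hypothetical numbers) min_occur = 20 with no --ipc gives ipc_min = 20, ipc_max = 30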

        train_ratio, valid_ratio, test_ratio = get_ratios(ratio_)

        open_mode = 'w'

        assert clsf_type in classifiers.keys(), 'Unknown classifier type: {}'.format(
            clsf_type)
        classifier = classifiers[clsf_type]

        annotator = get_or_error(User, dict(username__iexact=annotator_name))

        if not load_from.lower().endswith('.zip'):
            load_from += '.zip'

        variables = read_variables(load_from)
        variables['extractor'] = extractor
        variables['denormalised'] = denormalised

        if denormalised:
            global_min, global_max = load_global_min_max(min_max_loc)
            variables['global_min'] = global_min
            variables['global_max'] = global_max

        variables['is_log_psd'] = format.startswith('log_')

        factory = NDS2SAEFactory()
        factory.set_output(load_from)
        factory.learning_rate = None
        factory.learning_rate_func = None
        encoder = factory.build()
        session = encoder.recreate_session()

        _sids, full_data = encode_into_data(variables, encoder, session,
                                            database_name, kernel_only)

        labels, no_label_ids = get_labels_by_sids(_sids, label_level,
                                                  annotator, min_occur)

        if len(no_label_ids) > 0:
            sids, _, labels = exclude_no_labels(_sids, None, labels,
                                                no_label_ids)
            lookup_ids_rows = np.searchsorted(_sids, sids)
            full_data = full_data[lookup_ids_rows, :]

        full_data = zscore(full_data)
        full_data[np.where(np.isnan(full_data))] = 0
        full_data[np.where(np.isinf(full_data))] = 0

        ndims = full_data.shape[1]

        unique_labels = np.unique(labels)
        nlabels = len(unique_labels)

        dp = EnumDataProvider(full_data, labels, balanced=True)
        trainvalidset, testset = dp.split(test_ratio,
                                          limits=(ipc_min, ipc_max))

        v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
        nfolds = int(np.floor(1. / v2t_ratio + 0.01))
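        # e.g. an 80:10:10 train:valid:test split gives v2t_ratio = 10/90 ≈ 0.111 and nfolds = 9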

        params_names = []
        params_converters = []
        params_count = 0

        def loss(params):
            classifier_args = {}
            for i in range(params_count):
                param_name = params_names[i]
                param_converter = params_converters[i]
                param_value = params[i]
                classifier_args[param_name] = param_converter(param_value)

            print(classifier_args)
            score = perform_k_fold(classifier, trainvalidset, nfolds,
                                   v2t_ratio, nlabels, **classifier_args)
            return 1. - score

        n_estimators_choices = hp.uniform('n_estimators', 40, 100)
        min_samples_split_choices = hp.uniform('min_samples_split', 2, 21)
        min_samples_leaf_choices = hp.uniform('min_samples_leaf', 1, 20)

        n_features = full_data.shape[1]
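        # search gamma within one decade either side of the sklearn-style 'auto' value, 1 / n_features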
        auto_gamma = 1 / n_features
        gamma_choices = hp.uniform('gamma', auto_gamma / 10, auto_gamma * 10)
        c_choices = hp.uniform('C', -1, 2)
        hidden_layer_size_choices = hp.uniform('hidden_layer_sizes', 100, 5000)
        n_neighbors_choices = hp.uniform('n_neighbors', 1, 10)

        choices = {
            'rf': {
                'n_estimators':
                (lambda x: int(np.round(x)), n_estimators_choices),
                'min_samples_split':
                (lambda x: int(np.round(x)), min_samples_split_choices),
                'min_samples_leaf':
                (lambda x: int(np.round(x)), min_samples_leaf_choices),
            },
            'svm_rbf': {
                'gamma': (float, gamma_choices),
                'C': (lambda x: 10**x, c_choices),
            },
            'svm_linear': {
                'C': (lambda x: 10**x, c_choices),
            },
            'nnet': {
                'hidden_layer_sizes':
                (lambda x: (int(np.round(x)), ), hidden_layer_size_choices)
            },
            'knn': {
                'n_neighbors':
                (lambda x: int(np.round(x)), n_neighbors_choices)
            }
        }

        space = []
        for arg_name, (converter, arg_values) in choices[clsf_type].items():
            space.append(arg_values)
            params_names.append(arg_name)
            params_converters.append(converter)
            params_count += 1

        trials = Trials()
        max_evals = params_count * 10
        best = fmin(fn=loss,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=max_evals,
                    trials=trials)
        print(best)

        with open(trials_file, 'wb') as f:
            pickle.dump(trials, f)

        best_trial = trials.best_trial
        best_trial_args_values_ = best_trial['misc']['vals']
        best_trial_args_values = {}
        for arg_name, arg_values in best_trial_args_values_.items():
            converter = choices[clsf_type][arg_name][0]
            arg_value = converter(arg_values[0])
            best_trial_args_values[arg_name] = arg_value

        model_args = ['id'] + list(
            best_trial_args_values.keys()) + ['accuracy']

        model_args_values = {x: [] for x in model_args}
        for idx, trial in enumerate(trials.trials):
            if trial == best_trial:
                idx = 'Best'
            trial_args_values = trial['misc']['vals']
            for arg_name in model_args:
                if arg_name == 'id':
                    model_args_values['id'].append(idx)
                elif arg_name == 'accuracy':
                    trial_accuracy = 1. - trial['result']['loss']
                    model_args_values['accuracy'].append(trial_accuracy)
                else:
                    # choice = choices[clsf_type][arg_name]
                    converter = choices[clsf_type][arg_name][0]
                    val = converter(trial_args_values[arg_name][0])
                    # val = choice[choice_idx]
                    model_args_values[arg_name].append(val)

        # Perform classification on the test set
        train_x = np.array(trainvalidset.data)
        train_y = np.array(trainvalidset.labels, dtype=np.int32)
        test_x = np.array(testset.data)
        test_y = np.array(testset.labels, dtype=np.int32)

        score, label_hits, label_misses, cfmat, importances =\
            classifier(train_x, train_y, test_x, test_y, nlabels, True, **best_trial_args_values)
        lb_hitrates = label_hits / (label_hits + label_misses).astype(float)

        with open(tsv_file, open_mode, encoding='utf-8') as f:
            for arg in model_args:
                values = model_args_values[arg]
                f.write('{}\t'.format(arg))
                f.write('\t'.join(map(str, values)))
                f.write('\n')

            f.write('Results using best-model\'s parameters on testset\n')
            f.write(
                'Feature group\tNdims\tLabel prediction score\t{}\n'.format(
                    '\t'.join(unique_labels)))
            f.write('{}\t{}\t{}\t{}\n'.format('s2senc', ndims, score,
                                              '\t'.join(map(str,
                                                            lb_hitrates))))

            f.write('\n')
            open_mode = 'a'